Commit aa30272

Always store texts in license and rules #3067
- Ensure that all tests pass.
- Convert legalese to a mapping rather than a set

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent ec50bd2 commit aa30272

28 files changed: +4793 additions, -4690 deletions
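
The central change named in the commit message is that the legalese vocabulary is now passed around as a sorted {token: id} mapping instead of a bare set of token strings. A minimal sketch of the equivalence, using made-up sample tokens (the real vocabulary comes from the licensedcode data files):

# Before: callers passed a set and the index sorted it and assigned ids itself.
legalese = {'copyright', 'gpl', 'license'}
old_dictionary = {ts: tid for tid, ts in enumerate(sorted(legalese))}

# After: callers pass the sorted {token: id} mapping and the index copies it
# verbatim (see the dict(_legalese) hunk in src/licensedcode/index.py below).
legalese_mapping = {ts: tid for tid, ts in enumerate(sorted(legalese))}
new_dictionary = dict(legalese_mapping)

assert old_dictionary == new_dictionary == {'copyright': 0, 'gpl': 1, 'license': 2}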

src/licensedcode/__init__.py

Lines changed: 0 additions & 3 deletions
@@ -7,18 +7,15 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-
 # FIXME: we should consider the length of two rules and two matches when considering MAX_DIST
 # eventually this should be skipped early right during the matching too
 # maximum distance between two matches to merge
 MAX_DIST = 50
 
-
 # minimum number of tokens a match should have to be considered as worthy keeping
 MIN_MATCH_LENGTH = 4
 MIN_MATCH_HIGH_LENGTH = 3
 
-
 # rules smaller than this are treated as "small rules"
 SMALL_RULE = 15

src/licensedcode/cache.py

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ def build_index(
     spdx_tokens = set(get_all_spdx_key_tokens(licenses_db))
     license_tokens = set(get_license_tokens())
 
-    # only skip licenses to be indexed
+    # only skip licenses to be indexed
     if not index_all_languages:
         rules = (r for r in rules if r.language == 'en')
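
The index_all_languages flag above controls whether non-English rules are kept: when it is false, only rules whose language is 'en' are indexed. A tiny illustration of that filter with stand-in rule objects (hypothetical data, not the real Rule class):

from types import SimpleNamespace

rules = [
    SimpleNamespace(identifier='mit_12.RULE', language='en'),
    SimpleNamespace(identifier='mit_de_3.RULE', language='de'),
]
index_all_languages = False
if not index_all_languages:
    rules = (r for r in rules if r.language == 'en')

print([r.identifier for r in rules])  # ['mit_12.RULE']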

src/licensedcode/index.py

Lines changed: 30 additions & 20 deletions
@@ -163,9 +163,11 @@ def __init__(
     ):
         """
         Initialize the index with an iterable of Rule objects.
-        ``_legalese`` is a set of common license-specific words aka. legalese
+        ``_legalese`` is a sorted mapping of common license-specific words aka. legalese as {token: id}
         ``_spdx_tokens`` is a set of tokens used in SPDX license identifiers
-        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
+        ``_license_tokens`` is a set of "license" tokens used as start or end of a rule
+        If ``_all_languages`` is True, use all spoken languages license and rules.
+        Otherwise, use only English rules and licenses.
         """
         # total number of unique known tokens
         self.len_tokens = 0
@@ -267,9 +269,9 @@ def _add_rules(
         Add a list of Rule objects to the index and constructs optimized and
         immutable index structures.
 
-        `_legalese` is a set of common license-specific words aka. legalese
-        `_spdx_tokens` is a set of token strings used in SPDX license identifiers
-        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
+        ``_legalese`` is a sorted mapping of common license-specific words aka. legalese as {token: id}
+        ``_spdx_tokens`` is a set of token strings used in SPDX license identifiers
+        ``_license_tokens`` is a set of "license" tokens used as start or end of a rule
         """
         if self.optimized:
             raise Exception('Index has been optimized and cannot be updated.')
@@ -281,10 +283,7 @@ def _add_rules(
         # valid "unichr" values, making it easier downstream when used in
         # automatons
 
-        self.dictionary = dictionary = {
-            ts: tid for tid, ts in enumerate(sorted(_legalese))
-        }
-
+        self.dictionary = dictionary = dict(_legalese)
         dictionary_get = dictionary.get
 
         self.len_legalese = len_legalese = len(dictionary)
@@ -385,7 +384,7 @@ def _add_rules(
 
             # A rule is weak if it does not contain at least one legalese word:
             # we consider all rules to be weak until proven otherwise below.
-            # "weak" rules can only be matched with an automaton.
+            # "weak" rules can only be matched with an automaton exactly.
            is_weak = True
 
            for rts in rule.tokens():
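
For context on the comment tweak above: legalese tokens occupy the low token ids (0 through len_legalese - 1, since the dictionary starts from the sorted legalese mapping), so the weakness test in the next hunk is a plain id comparison. A sketch of that convention with illustrative ids:

len_legalese = 2                   # ids 0 and 1 are legalese tokens
rule_token_ids = [5, 7, 1, 9]      # this rule contains legalese token id 1

is_weak = not any(tid < len_legalese for tid in rule_token_ids)
assert is_weak is False            # one legalese token is enough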
@@ -400,7 +399,10 @@ def _add_rules(
                 if is_weak and rtid < len_legalese:
                     is_weak = False
 
-                rule_token_ids_append(rtid)
+                try:
+                    rule_token_ids_append(rtid)
+                except Exception as e:
+                    raise Exception(rtid, rts, rule) from e
 
             rule_length = rule.length
             is_tiny = rule_length < TINY_RULE
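
The try/except added above changes nothing on the success path; it only re-raises with the offending token id, token string and rule attached, which makes indexing failures easier to trace. If the ids are stored in a fixed-width numeric array (an assumption; the container type is not shown in this diff), the guarded failure looks like this:

from array import array

rule_token_ids = array('h')  # signed short: ids above 32767 overflow
try:
    rule_token_ids.append(70000)
except OverflowError as e:
    print('would re-raise with context:', 70000, '<token>', '<rule>', e)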
@@ -564,21 +566,29 @@ def _add_rules(
         msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
         assert len_tokens <= MAX_TOKENS, msg
 
-        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
-        if dupe_rules:
-            dupe_rule_paths = []
-            for rules in dupe_rules:
-                drp = [rule.identifier for rule in rules]
-                drp.sort()
-                dupe_rule_paths.append('\n'.join(drp))
+        dupe_rule_paths = []
+        for rules in dupe_rules_by_hash.values():
+            if len(rules) == 1:
+                continue
+            drp = [rule.identifier for rule in rules]
+            drp.sort()
+            dupe_rule_paths.append('\n'.join(drp))
 
+        if dupe_rule_paths:
             msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
             raise DuplicateRuleError(msg)
 
         self.optimized = True
 
-    def debug_matches(self, matches, message, location=None, query_string=None,
-                      with_text=False, qry=None):
+    def debug_matches(
+        self,
+        matches,
+        message,
+        location=None,
+        query_string=None,
+        with_text=False,
+        qry=None,
+    ):
         """
         Log debug-level data for a list of `matches`.
         """

src/licensedcode/languages.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 # Copyright (c) Django Software Foundation and individual contributors.
 # SPDX-License-Identifier: BSD-3-Clause
 # Originally from https://raw.githubusercontent.com/django/django/ce586ed6931092d3a5f06df9031cdeb891793ddb/django/conf/locale/__init__.py
-#
+#
 # See https://github.com/nexB/scancode-toolkit for support or download.
 # See https://aboutcode.org for more information about nexB OSS projects.
 #

src/licensedcode/legal.py

Lines changed: 0 additions & 1 deletion
@@ -38,7 +38,6 @@
     'COPYLEFT',
 )
 
-
 special_names_lower = tuple(x.lower() for x in special_names)