@@ -163,9 +163,11 @@ def __init__(
     ):
         """
         Initialize the index with an iterable of Rule objects.
-        ``_legalese`` is a set of common license-specific words aka. legalese
+        ``_legalese`` is a sorted mapping of common license-specific words aka. legalese as {token: id}
         ``_spdx_tokens`` is a set of tokens used in SPDX license identifiers
-        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
+        ``_license_tokens`` is a set of "license" tokens used as start or end of a rule
+        If ``_all_languages`` is True, use licenses and rules in all spoken languages.
+        Otherwise, use only English rules and licenses.
         """
         # total number of unique known tokens
         self.len_tokens = 0
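
Side note on the new ``_legalese`` shape: the docstring above now describes it as a sorted mapping of ``{token: id}`` rather than a set. The sketch below only illustrates what such a mapping could look like when built from a plain set of words; the variable names and sample words are invented, not taken from this codebase.

```python
# Hypothetical example: turning a set of legalese words into the sorted
# {token: id} mapping shape described in the docstring. Sorting before
# numbering keeps ids stable across runs, and legalese tokens end up with
# the smallest ids in the index dictionary.
legalese_words = {"license", "copyright", "warranty", "redistribution"}
legalese = {token: tid for tid, token in enumerate(sorted(legalese_words))}

print(legalese)
# {'copyright': 0, 'license': 1, 'redistribution': 2, 'warranty': 3}
```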
@@ -267,9 +269,9 @@ def _add_rules(
         Add a list of Rule objects to the index and constructs optimized and
         immutable index structures.

-        `_legalese` is a set of common license-specific words aka. legalese
-        `_spdx_tokens` is a set of token strings used in SPDX license identifiers
-        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
+        ``_legalese`` is a sorted mapping of common license-specific words aka. legalese as {token: id}
+        ``_spdx_tokens`` is a set of token strings used in SPDX license identifiers
+        ``_license_tokens`` is a set of "license" tokens used as start or end of a rule
         """
         if self.optimized:
             raise Exception('Index has been optimized and cannot be updated.')
@@ -281,10 +283,7 @@ def _add_rules(
         # valid "unichr" values, making it easier downstream when used in
         # automatons

-        self.dictionary = dictionary = {
-            ts: tid for tid, ts in enumerate(sorted(_legalese))
-        }
-
+        self.dictionary = dictionary = dict(_legalese)
         dictionary_get = dictionary.get

         self.len_legalese = len_legalese = len(dictionary)
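
For context on the ``dict(_legalese)`` change: the index no longer derives token ids from a sorted set here, it copies a mapping the caller has already built. A minimal sketch of the equivalence, assuming the caller builds the mapping the same way the old comprehension did (sample data is invented):

```python
# Assumed caller-side construction of _legalese as a sorted {token: id} mapping.
_legalese_words = {"warranty", "copyright", "license"}
_legalese = {ts: tid for tid, ts in enumerate(sorted(_legalese_words))}

# Old behavior: build the dictionary from a sorted set inside _add_rules.
old_dictionary = {ts: tid for tid, ts in enumerate(sorted(_legalese_words))}

# New behavior: copy the mapping that was passed in.
new_dictionary = dict(_legalese)

assert old_dictionary == new_dictionary
len_legalese = len(new_dictionary)  # token ids below this value are legalese
```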
@@ -385,7 +384,7 @@ def _add_rules(

             # A rule is weak if it does not contain at least one legalese word:
             # we consider all rules to be weak until proven otherwise below.
-            # "weak" rules can only be matched with an automaton.
+            # "weak" rules can only be matched exactly with an automaton.
             is_weak = True

             for rts in rule.tokens():
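
To illustrate the reworded comment: a rule stays "weak" when none of its tokens is a legalese token, that is, none of its token ids falls below ``len_legalese``, and such rules can then only be matched exactly through the automaton. A toy sketch with invented data; only the id ordering and the ``len_legalese`` threshold are taken from the surrounding code:

```python
# Toy illustration of the weak-rule check: legalese tokens hold the smallest
# ids, so "contains legalese" reduces to "some token id < len_legalese".
len_legalese = 3
dictionary = {"copyright": 0, "license": 1, "warranty": 2, "hello": 3, "world": 4}

def is_weak_rule(tokens):
    # Weak means: no token id falls in the legalese range.
    return not any(dictionary[t] < len_legalese for t in tokens)

print(is_weak_rule(["hello", "world"]))    # True: automaton-only, exact matching
print(is_weak_rule(["hello", "license"]))  # False: has a legalese token
```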
@@ -400,7 +399,10 @@ def _add_rules(
                 if is_weak and rtid < len_legalese:
                     is_weak = False

-                rule_token_ids_append(rtid)
+                try:
+                    rule_token_ids_append(rtid)
+                except Exception as e:
+                    raise Exception(rtid, rts, rule) from e

             rule_length = rule.length
             is_tiny = rule_length < TINY_RULE
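
On the new ``try``/``except`` around ``rule_token_ids_append``: the re-raise attaches the token id, token string, and rule so a failure is traceable to its source. One plausible failure mode, assuming (an assumption, not something stated in the diff) that the per-rule token ids live in a compact ``array.array``, is an id that does not fit the array's item type:

```python
from array import array

# Assumed container: a signed 16-bit array, which caps token ids at 32767.
rule_token_ids = array('h')
rtid, rts, rule = 40000, "sometoken", "example_rule.RULE"  # invented sample values

try:
    try:
        rule_token_ids.append(rtid)  # OverflowError: id exceeds the 'h' range
    except Exception as e:
        # Same pattern as the diff: re-raise with enough context to find the rule.
        raise Exception(rtid, rts, rule) from e
except Exception as enriched:
    print("enriched error:", enriched.args)    # (40000, 'sometoken', 'example_rule.RULE')
    print("original cause:", enriched.__cause__)
```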
@@ -564,21 +566,29 @@ def _add_rules(
         msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
         assert len_tokens <= MAX_TOKENS, msg

-        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
-        if dupe_rules:
-            dupe_rule_paths = []
-            for rules in dupe_rules:
-                drp = [rule.identifier for rule in rules]
-                drp.sort()
-                dupe_rule_paths.append('\n'.join(drp))
+        dupe_rule_paths = []
+        for rules in dupe_rules_by_hash.values():
+            if len(rules) == 1:
+                continue
+            drp = [rule.identifier for rule in rules]
+            drp.sort()
+            dupe_rule_paths.append('\n'.join(drp))

+        if dupe_rule_paths:
             msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
             raise DuplicateRuleError(msg)

         self.optimized = True

-    def debug_matches(self, matches, message, location=None, query_string=None,
-                      with_text=False, qry=None):
+    def debug_matches(
+        self,
+        matches,
+        message,
+        location=None,
+        query_string=None,
+        with_text=False,
+        qry=None,
+    ):
         """
         Log debug-level data for a list of `matches`.
         """