Commit aa30272

Always store texts in license and rules #3067
- Ensure that all tests pass.
- Convert legalese to a mapping rather than a set

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent ec50bd2 commit aa30272

28 files changed: +4793 additions, -4690 deletions
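
The central change named in the commit message is that the legalese vocabulary is now passed around as a sorted {token: id} mapping instead of a bare set of token strings. A minimal sketch of the equivalence, using made-up sample tokens (the real vocabulary comes from the licensedcode data files):

# Before: callers passed a set and the index sorted it and assigned ids itself.
legalese = {'copyright', 'gpl', 'license'}
old_dictionary = {ts: tid for tid, ts in enumerate(sorted(legalese))}

# After: callers pass the sorted {token: id} mapping and the index copies it
# verbatim (see the dict(_legalese) hunk in src/licensedcode/index.py below).
legalese_mapping = {ts: tid for tid, ts in enumerate(sorted(legalese))}
new_dictionary = dict(legalese_mapping)

assert old_dictionary == new_dictionary == {'copyright': 0, 'gpl': 1, 'license': 2}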

src/licensedcode/__init__.py

Lines changed: 0 additions & 3 deletions
@@ -7,18 +7,15 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-
 # FIXME: we should consider the length of two rules and two matches when considering MAX_DIST
 # eventually this should be skipped early right during the matching too
 # maximum distance between two matches to merge
 MAX_DIST = 50
 
-
 # minimum number of tokens a match should have to be considered as worthy keeping
 MIN_MATCH_LENGTH = 4
 MIN_MATCH_HIGH_LENGTH = 3
 
-
 # rules smaller than this are treated as "small rules"
 SMALL_RULE = 15

src/licensedcode/cache.py

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ def build_index(
     spdx_tokens = set(get_all_spdx_key_tokens(licenses_db))
     license_tokens = set(get_license_tokens())
 
-    # only skip licenses to be indexed
+    # only skip licenses to be indexed
     if not index_all_languages:
         rules = (r for r in rules if r.language == 'en')
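
The index_all_languages flag above controls whether non-English rules are kept: when it is false, only rules whose language is 'en' are indexed. A tiny illustration of that filter with stand-in rule objects (hypothetical data, not the real Rule class):

from types import SimpleNamespace

rules = [
    SimpleNamespace(identifier='mit_12.RULE', language='en'),
    SimpleNamespace(identifier='mit_de_3.RULE', language='de'),
]
index_all_languages = False
if not index_all_languages:
    rules = (r for r in rules if r.language == 'en')

print([r.identifier for r in rules])  # ['mit_12.RULE']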

src/licensedcode/index.py

Lines changed: 30 additions & 20 deletions
@@ -163,9 +163,11 @@ def __init__(
     ):
         """
         Initialize the index with an iterable of Rule objects.
-        ``_legalese`` is a set of common license-specific words aka. legalese
+        ``_legalese`` is a sorted mapping of common license-specific words aka. legalese as {token: id}
         ``_spdx_tokens`` is a set of tokens used in SPDX license identifiers
-        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
+        ``_license_tokens`` is a set of "license" tokens used as start or end of a rule
+        If ``_all_languages`` is True, use all spoken languages license and rules.
+        Otherwise, use only English rules and licenses.
         """
         # total number of unique known tokens
         self.len_tokens = 0
@@ -267,9 +269,9 @@ def _add_rules(
         Add a list of Rule objects to the index and constructs optimized and
         immutable index structures.
 
-        `_legalese` is a set of common license-specific words aka. legalese
-        `_spdx_tokens` is a set of token strings used in SPDX license identifiers
-        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
+        ``_legalese`` is a sorted mapping of common license-specific words aka. legalese as {token: id}
+        ``_spdx_tokens`` is a set of token strings used in SPDX license identifiers
+        ``_license_tokens`` is a set of "license" tokens used as start or end of a rule
         """
         if self.optimized:
             raise Exception('Index has been optimized and cannot be updated.')
@@ -281,10 +283,7 @@ def _add_rules(
         # valid "unichr" values, making it easier downstream when used in
         # automatons
 
-        self.dictionary = dictionary = {
-            ts: tid for tid, ts in enumerate(sorted(_legalese))
-        }
-
+        self.dictionary = dictionary = dict(_legalese)
         dictionary_get = dictionary.get
 
         self.len_legalese = len_legalese = len(dictionary)
@@ -385,7 +384,7 @@ def _add_rules(
 
             # A rule is weak if it does not contain at least one legalese word:
             # we consider all rules to be weak until proven otherwise below.
-            # "weak" rules can only be matched with an automaton.
+            # "weak" rules can only be matched with an automaton exactly.
            is_weak = True
 
            for rts in rule.tokens():
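
For context on the comment tweak above: legalese tokens occupy the low token ids (0 through len_legalese - 1, since the dictionary starts from the sorted legalese mapping), so the weakness test in the next hunk is a plain id comparison. A sketch of that convention with illustrative ids:

len_legalese = 2                   # ids 0 and 1 are legalese tokens
rule_token_ids = [5, 7, 1, 9]      # this rule contains legalese token id 1

is_weak = not any(tid < len_legalese for tid in rule_token_ids)
assert is_weak is False            # one legalese token is enough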
@@ -400,7 +399,10 @@ def _add_rules(
                 if is_weak and rtid < len_legalese:
                     is_weak = False
 
-                rule_token_ids_append(rtid)
+                try:
+                    rule_token_ids_append(rtid)
+                except Exception as e:
+                    raise Exception(rtid, rts, rule) from e
 
             rule_length = rule.length
             is_tiny = rule_length < TINY_RULE
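
The try/except added above changes nothing on the success path; it only re-raises with the offending token id, token string and rule attached, which makes indexing failures easier to trace. If the ids are stored in a fixed-width numeric array (an assumption; the container type is not shown in this diff), the guarded failure looks like this:

from array import array

rule_token_ids = array('h')  # signed short: ids above 32767 overflow
try:
    rule_token_ids.append(70000)
except OverflowError as e:
    print('would re-raise with context:', 70000, '<token>', '<rule>', e)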
@@ -564,21 +566,29 @@ def _add_rules(
         msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
         assert len_tokens <= MAX_TOKENS, msg
 
-        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
-        if dupe_rules:
-            dupe_rule_paths = []
-            for rules in dupe_rules:
-                drp = [rule.identifier for rule in rules]
-                drp.sort()
-                dupe_rule_paths.append('\n'.join(drp))
+        dupe_rule_paths = []
+        for rules in dupe_rules_by_hash.values():
+            if len(rules) == 1:
+                continue
+            drp = [rule.identifier for rule in rules]
+            drp.sort()
+            dupe_rule_paths.append('\n'.join(drp))
 
+        if dupe_rule_paths:
             msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
             raise DuplicateRuleError(msg)
 
         self.optimized = True
 
-    def debug_matches(self, matches, message, location=None, query_string=None,
-                      with_text=False, qry=None):
+    def debug_matches(
+        self,
+        matches,
+        message,
+        location=None,
+        query_string=None,
+        with_text=False,
+        qry=None,
+    ):
         """
         Log debug-level data for a list of `matches`.
         """

src/licensedcode/languages.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 # Copyright (c) Django Software Foundation and individual contributors.
 # SPDX-License-Identifier: BSD-3-Clause
 # Originally from https://raw.githubusercontent.com/django/django/ce586ed6931092d3a5f06df9031cdeb891793ddb/django/conf/locale/__init__.py
-#
+#
 # See https://github.com/nexB/scancode-toolkit for support or download.
 # See https://aboutcode.org for more information about nexB OSS projects.
 #

src/licensedcode/legal.py

Lines changed: 0 additions & 1 deletion
@@ -38,7 +38,6 @@
     'COPYLEFT',
 )
 
-
 special_names_lower = tuple(x.lower() for x in special_names)