aboutcode-org
diff --git a/etc/scripts/licenses/buildrules.py b/etc/scripts/licenses/buildrules.py
Lines changed: 5 additions & 13 deletions
diff --git a/etc/scripts/licenses/report_license_rules.py b/etc/scripts/licenses/report_license_rules.py
Lines changed: 3 additions & 3 deletions
diff --git a/etc/scripts/licenses/synclic.py b/etc/scripts/licenses/synclic.py
Lines changed: 1 addition & 3 deletions
diff --git a/src/licensedcode/index.py b/src/licensedcode/index.py
Lines changed: 12 additions & 10 deletions
diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/licensedcode/match_aho.py b/src/licensedcode/match_aho.py
Lines changed: 8 additions & 2 deletions
diff --git a/src/licensedcode/match_spdx_lid.py b/src/licensedcode/match_spdx_lid.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/licensedcode/match_unknown.py b/src/licensedcode/match_unknown.py
Lines changed: 1 addition & 1 deletion
@@ -156,8 +156,8 @@ def all_rule_by_tokens():
         try:
             rule_tokens[tuple(rule.tokens())] = rule.identifier
         except Exception as e:
-            df = f"  file://{rule.data_file}"
-            tf = f"  file://{rule.text_file}"
+            df = f"  file://{rule.data_file()}"
+            tf = f"  file://{rule.text_file()}"
             raise Exception(
                 f"Failed to to get tokens from rule:: {rule.identifier}\n" f"{df}\n{tf}"
             ) from e
@@ -211,7 +211,7 @@ def cli(licenses_file):
         rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)
 
         rl = models.BasicRule(**rdata.data)
-        rl.stored_text = rdata.text
+        rl.text = rdata.text
         skinny_rules.append(rl)
 
     models.validate_rules(skinny_rules, licenses_by_key, with_text=True)
@@ -226,7 +226,7 @@ def cli(licenses_file):
         else:
             base_name = rule.license_expression
 
-        text = rule.text()
+        text = rule.text
 
         existing_rule = rule_exists(text)
         skinny_text = " ".join(text[:80].split()).replace("{", " ").replace("}", " ")
@@ -244,7 +244,7 @@ def cli(licenses_file):
         base_loc = find_rule_base_loc(base_name)
 
         rd = rule.to_dict()
-        rd["stored_text"] = rule.stored_text
+        rd["text"] = rule.text
         rd["has_stored_relevance"] = rule.has_stored_relevance
         rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage
 
@@ -253,9 +253,6 @@ def cli(licenses_file):
         # force recomputing relevance to remove junk stored relevance for long rules
         rulerec.set_relevance()
 
-        rulerec.data_file = base_loc + ".yml"
-        rulerec.text_file = base_loc + ".RULE"
-
         rule_tokens = tuple(rulerec.tokens())
 
         existing_rule = rule_by_tokens.get(rule_tokens)
@@ -264,11 +261,6 @@ def cli(licenses_file):
             continue
         else:
             print(f"Adding new rule: {base_name}")
-            print("  file://" + rulerec.data_file)
-            print(
-                "  file://" + rulerec.text_file,
-            )
-            rulerec.dump()
             models.update_ignorables(rulerec, verbose=False)
             rulerec.dump()
 
 
@@ -184,7 +184,7 @@ def cli(licenses, rules, category, license_key, with_text):
             if with_text:
                 license_data["text"] = lic.text[:200]
             license_data["is_unknown"] = lic.is_unknown
-            license_data["words_count"] = len(lic.text)
+            license_data["length"] = len(lic.text)
             license_data["reference_url"] = SCANCODE_LICENSEDB_URL.format(lic.key)
             licenses_output.append(license_data)
 
@@ -210,9 +210,9 @@ def cli(licenses, rules, category, license_key, with_text):
             rule_data["identifier"] = rule.identifier
             rule_data["referenced_filenames"] = rule.referenced_filenames
             if with_text:
-                rule_data["text"] = rule.text()[:200]
+                rule_data["text"] = rule.text[:200]
             rule_data["has_unknown"] = rule.has_unknown
-            rule_data["words_count"] = len(rule.text())
+            rule_data["length"] = len(rule.text)
             try:
                 rule_data["category"] = licenses_data[rule_data["license_expression"]].category
             except KeyError:
 
@@ -147,9 +147,7 @@ def get_licenses(
                 start = time.time()
 
             try:
-                with io.open(lic.text_file, "w", encoding="utf-8") as tf:
-                    tf.write(text)
-                lic.dump()
+                lic.dump(licenses_data_dir=self.original_dir)
                 licenses.append(lic)
             except:
                 if TRACE:
 
@@ -49,6 +49,7 @@
 TRACE_APPROX = False
 TRACE_APPROX_CANDIDATES = False
 TRACE_APPROX_MATCHES = False
+TRACE_INDEXING = False or os.environ.get('SCANCODE_DEBUG_LICENSE_INDEX', False)
 TRACE_INDEXING_PERF = False
 TRACE_TOKEN_DOC_FREQ = False
 TRACE_SPDX_LID = False
@@ -63,6 +64,7 @@ def logger_debug(*args):
     or TRACE_APPROX
     or TRACE_APPROX_CANDIDATES
     or TRACE_APPROX_MATCHES
+    or TRACE_INDEXING
     or TRACE_INDEXING_PERF
     or TRACE_SPDX_LID
 ):
@@ -304,6 +306,10 @@ def _add_rules(
                 dictionary[sts] = stid
 
         self.rules_by_rid = rules_by_rid = list(rules)
+        if TRACE_INDEXING:
+            for _rid, _rule in enumerate(rules_by_rid):
+                logger_debug('rules_by_rid:', _rid, _rule)
+
         # ensure that rules are sorted
         rules_by_rid.sort()
         len_rules = len(rules_by_rid)
@@ -560,16 +566,12 @@ def _add_rules(
 
         dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
         if dupe_rules:
-            dupe_rule_paths = [
-                '\n'.join(
-                    sorted([
-                        ('file://' + rule.text_file)
-                        if rule.text_file
-                        else ('text: ' + rule.stored_text)
-                            for rule in rules])
-                    )
-                for rules in dupe_rules
-            ]
+            dupe_rule_paths = []
+            for rules in dupe_rules:
+                drp = [rule.identifier for rule in rules]
+                drp.sort()
+                dupe_rule_paths.append('\n'.join(drp))
+
             msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
             raise DuplicateRuleError(msg)
 
 
@@ -1783,7 +1783,7 @@ def filter_invalid_matches_to_single_word_gibberish(
                 highlight=False,
             ).strip()
 
-            rule_text = rule.text().strip()
+            rule_text = rule.prepare_text()
 
             if trace:
                 logger_debug(
 
@@ -20,9 +20,9 @@
 """
 
 # Set to True to enable debug tracing
-TRACE = False
+TRACE = True
 TRACE_FRAG = False
-TRACE_DEEP = False
+TRACE_DEEP = True
 
 if TRACE or TRACE_FRAG:
     import logging
@@ -93,7 +93,13 @@ def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
     qbegin = query_run.start
 
     matched_positions = get_matched_positions(query_run.tokens, qbegin, automaton)
+    if TRACE:
+        matched_positions = list(matched_positions)
+        logger_debug(' ##exact_AHO: matched_positions', matched_positions)
     matched_spans = get_matched_spans(matched_positions, query_run.matchables)
+    if TRACE:
+        matched_spans = list(matched_spans)
+        logger_debug(' ##exact_AHO: matched_spans', matched_spans)
 
     len_legalese = idx.len_legalese
     rules_by_rid = idx.rules_by_rid
 
@@ -88,7 +88,7 @@ def spdx_id_match(idx, query_run, text, expression_symbols=None):
         # Alternatively we could use the expression string, padded with
         # spdx-license-identifier: this may be wrong too, if the line was
         # not padded originally with this tag
-        stored_text=text,
+        text=text,
         length=match_len,
     )
 
 
@@ -198,7 +198,7 @@ def get_tokens(_toks):
         print('match_unknowns: text', text)
 
     # ... and use this in a synthetic UnknownRule
-    rule = UnknownRule(stored_text=text, length=match_len)
+    rule = UnknownRule(text=text, length=match_len)
 
     # finally craft a LicenseMatch and return
     len_legalese = idx.len_legalese
Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ def spdx_id_match(idx, query_run, text, expression_symbols=None):`
`88`	`88`	`# Alternatively we could use the expression string, padded with`
`89`	`89`	`# spdx-license-identifier: this may be wrong too, if the line was`
`90`	`90`	`# not padded originally with this tag`
`91`		`- stored_text=text,`
	`91`	`+ text=text,`
`92`	`92`	`length=match_len,`
`93`	`93`	`)`
`94`	`94`