Skip to content

Commit 2b4e561

Browse files
committed
Always store texts in license and rules #3067
This way we do not have paths stored at all. This requires a fairly significant code change:
- License/Rule data_file and text_file are now methods.
- License/Rule stored_text is gone, and text is now a field and not a property.
- Loading and dumping require a location.

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 4f619e5 commit 2b4e561

34 files changed

+745
-875
lines changed

etc/scripts/licenses/buildrules.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,8 @@ def all_rule_by_tokens():
156156
try:
157157
rule_tokens[tuple(rule.tokens())] = rule.identifier
158158
except Exception as e:
159-
df = f" file://{rule.data_file}"
160-
tf = f" file://{rule.text_file}"
159+
df = f" file://{rule.data_file()}"
160+
tf = f" file://{rule.text_file()}"
161161
raise Exception(
162162
f"Failed to to get tokens from rule:: {rule.identifier}\n" f"{df}\n{tf}"
163163
) from e
@@ -211,7 +211,7 @@ def cli(licenses_file):
211211
rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)
212212

213213
rl = models.BasicRule(**rdata.data)
214-
rl.stored_text = rdata.text
214+
rl.text = rdata.text
215215
skinny_rules.append(rl)
216216

217217
models.validate_rules(skinny_rules, licenses_by_key, with_text=True)
@@ -226,7 +226,7 @@ def cli(licenses_file):
226226
else:
227227
base_name = rule.license_expression
228228

229-
text = rule.text()
229+
text = rule.text
230230

231231
existing_rule = rule_exists(text)
232232
skinny_text = " ".join(text[:80].split()).replace("{", " ").replace("}", " ")
@@ -244,7 +244,7 @@ def cli(licenses_file):
244244
base_loc = find_rule_base_loc(base_name)
245245

246246
rd = rule.to_dict()
247-
rd["stored_text"] = rule.stored_text
247+
rd["text"] = rule.text
248248
rd["has_stored_relevance"] = rule.has_stored_relevance
249249
rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage
250250

@@ -253,9 +253,6 @@ def cli(licenses_file):
253253
# force recomputing relevance to remove junk stored relevance for long rules
254254
rulerec.set_relevance()
255255

256-
rulerec.data_file = base_loc + ".yml"
257-
rulerec.text_file = base_loc + ".RULE"
258-
259256
rule_tokens = tuple(rulerec.tokens())
260257

261258
existing_rule = rule_by_tokens.get(rule_tokens)
@@ -264,11 +261,6 @@ def cli(licenses_file):
264261
continue
265262
else:
266263
print(f"Adding new rule: {base_name}")
267-
print(" file://" + rulerec.data_file)
268-
print(
269-
" file://" + rulerec.text_file,
270-
)
271-
rulerec.dump()
272264
models.update_ignorables(rulerec, verbose=False)
273265
rulerec.dump()
274266

etc/scripts/licenses/report_license_rules.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def cli(licenses, rules, category, license_key, with_text):
184184
if with_text:
185185
license_data["text"] = lic.text[:200]
186186
license_data["is_unknown"] = lic.is_unknown
187-
license_data["words_count"] = len(lic.text)
187+
license_data["length"] = len(lic.text)
188188
license_data["reference_url"] = SCANCODE_LICENSEDB_URL.format(lic.key)
189189
licenses_output.append(license_data)
190190

@@ -210,9 +210,9 @@ def cli(licenses, rules, category, license_key, with_text):
210210
rule_data["identifier"] = rule.identifier
211211
rule_data["referenced_filenames"] = rule.referenced_filenames
212212
if with_text:
213-
rule_data["text"] = rule.text()[:200]
213+
rule_data["text"] = rule.text[:200]
214214
rule_data["has_unknown"] = rule.has_unknown
215-
rule_data["words_count"] = len(rule.text())
215+
rule_data["length"] = len(rule.text)
216216
try:
217217
rule_data["category"] = licenses_data[rule_data["license_expression"]].category
218218
except KeyError:

etc/scripts/licenses/synclic.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,9 +147,7 @@ def get_licenses(
147147
start = time.time()
148148

149149
try:
150-
with io.open(lic.text_file, "w", encoding="utf-8") as tf:
151-
tf.write(text)
152-
lic.dump()
150+
lic.dump(licenses_data_dir=self.original_dir)
153151
licenses.append(lic)
154152
except:
155153
if TRACE:

src/licensedcode/index.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
TRACE_APPROX = False
5050
TRACE_APPROX_CANDIDATES = False
5151
TRACE_APPROX_MATCHES = False
52+
TRACE_INDEXING = False or os.environ.get('SCANCODE_DEBUG_LICENSE_INDEX', False)
5253
TRACE_INDEXING_PERF = False
5354
TRACE_TOKEN_DOC_FREQ = False
5455
TRACE_SPDX_LID = False
@@ -63,6 +64,7 @@ def logger_debug(*args):
6364
or TRACE_APPROX
6465
or TRACE_APPROX_CANDIDATES
6566
or TRACE_APPROX_MATCHES
67+
or TRACE_INDEXING
6668
or TRACE_INDEXING_PERF
6769
or TRACE_SPDX_LID
6870
):
@@ -304,6 +306,10 @@ def _add_rules(
304306
dictionary[sts] = stid
305307

306308
self.rules_by_rid = rules_by_rid = list(rules)
309+
if TRACE_INDEXING:
310+
for _rid, _rule in enumerate(rules_by_rid):
311+
logger_debug('rules_by_rid:', _rid, _rule)
312+
307313
# ensure that rules are sorted
308314
rules_by_rid.sort()
309315
len_rules = len(rules_by_rid)
@@ -560,16 +566,12 @@ def _add_rules(
560566

561567
dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
562568
if dupe_rules:
563-
dupe_rule_paths = [
564-
'\n'.join(
565-
sorted([
566-
('file://' + rule.text_file)
567-
if rule.text_file
568-
else ('text: ' + rule.stored_text)
569-
for rule in rules])
570-
)
571-
for rules in dupe_rules
572-
]
569+
dupe_rule_paths = []
570+
for rules in dupe_rules:
571+
drp = [rule.identifier for rule in rules]
572+
drp.sort()
573+
dupe_rule_paths.append('\n'.join(drp))
574+
573575
msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
574576
raise DuplicateRuleError(msg)
575577

src/licensedcode/match.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1783,7 +1783,7 @@ def filter_invalid_matches_to_single_word_gibberish(
17831783
highlight=False,
17841784
).strip()
17851785

1786-
rule_text = rule.text().strip()
1786+
rule_text = rule.prepare_text()
17871787

17881788
if trace:
17891789
logger_debug(

src/licensedcode/match_aho.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@
2020
"""
2121

2222
# Set to True to enable debug tracing
23-
TRACE = False
23+
TRACE = True
2424
TRACE_FRAG = False
25-
TRACE_DEEP = False
25+
TRACE_DEEP = True
2626

2727
if TRACE or TRACE_FRAG:
2828
import logging
@@ -93,7 +93,13 @@ def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
9393
qbegin = query_run.start
9494

9595
matched_positions = get_matched_positions(query_run.tokens, qbegin, automaton)
96+
if TRACE:
97+
matched_positions = list(matched_positions)
98+
logger_debug(' ##exact_AHO: matched_positions', matched_positions)
9699
matched_spans = get_matched_spans(matched_positions, query_run.matchables)
100+
if TRACE:
101+
matched_spans = list(matched_spans)
102+
logger_debug(' ##exact_AHO: matched_spans', matched_spans)
97103

98104
len_legalese = idx.len_legalese
99105
rules_by_rid = idx.rules_by_rid

src/licensedcode/match_spdx_lid.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def spdx_id_match(idx, query_run, text, expression_symbols=None):
8888
# Alternatively we could use the expression string, padded with
8989
# spdx-license-identifier: this may be wrong too, if the line was
9090
# not padded originally with this tag
91-
stored_text=text,
91+
text=text,
9292
length=match_len,
9393
)
9494

src/licensedcode/match_unknown.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def get_tokens(_toks):
198198
print('match_unknowns: text', text)
199199

200200
# ... and use this in a synthetic UnknownRule
201-
rule = UnknownRule(stored_text=text, length=match_len)
201+
rule = UnknownRule(text=text, length=match_len)
202202

203203
# finally craft a LicenseMatch and return
204204
len_legalese = idx.len_legalese

0 commit comments

Comments
 (0)