Skip to content

Commit 890b297

Browse files
Refactor license detection to use rehydrated objects
* Because license references are now the default, reference data is no longer inlined, so we need to get this data from the cache and rehydrate it into objects in order to post-process license-related information. * Use license objects instead of mappings wherever possible. Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 9a7e4a6 commit 890b297

File tree

11 files changed

+295
-174
lines changed

11 files changed

+295
-174
lines changed

src/cluecode/plugin_filter_clues.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,8 @@ def process_codebase(self, codebase, **kwargs):
6363

6464
from licensedcode.cache import get_index
6565

66-
rules_by_id = {r.identifier: r for r in get_index().rules_by_rid}
67-
6866
for resource in codebase.walk():
69-
filtered = filter_ignorable_resource_clues(resource, rules_by_id)
67+
filtered = filter_ignorable_resource_clues(resource, get_index().rules_by_id)
7068
if filtered:
7169
filtered.save(codebase)
7270

src/licensedcode/detection.py

Lines changed: 129 additions & 73 deletions
Large diffs are not rendered by default.

src/licensedcode/index.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ class LicenseIndex(object):
131131
'digit_only_tids',
132132
'tokens_by_tid',
133133

134+
'rules_by_id',
134135
'rules_by_rid',
135136
'tids_by_rid',
136137

@@ -190,6 +191,9 @@ def __init__(
190191
# Note: all the following are mappings-like (using lists) of
191192
# rid-> data are lists of data where the index is the rule id.
192193

194+
# mapping of rule identifiers -> rule objects
195+
self.rules_by_id = {}
196+
193197
# mapping-like of rule_id -> rule objects proper
194198
self.rules_by_rid = []
195199

@@ -305,6 +309,7 @@ def _add_rules(
305309
dictionary[sts] = stid
306310

307311
self.rules_by_rid = rules_by_rid = list(rules)
312+
self.rules_by_id = {r.identifier: r for r in self.rules_by_rid}
308313
if TRACE_INDEXING:
309314
for _rid, _rule in enumerate(rules_by_rid):
310315
logger_debug('rules_by_rid:', _rid, _rule)

src/licensedcode/match.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@
2222
from licensedcode.tokenize import index_tokenizer
2323
from licensedcode.tokenize import matched_query_text_tokenizer
2424

25+
26+
from scancode.api import SPDX_LICENSE_URL
27+
from scancode.api import SCANCODE_LICENSEDB_URL
28+
from scancode.api import SCANCODE_LICENSE_URL
29+
from scancode.api import SCANCODE_LICENSE_RULE_URL
30+
from scancode.api import SCANCODE_RULE_URL
31+
2532
"""
2633
LicenseMatch data structure and processing.
2734
A key feature is merging and filtering of matches.
@@ -755,8 +762,8 @@ def matched_text(
755762

756763
def get_mapping(
757764
self,
758-
license_url_template,
759-
spdx_license_url,
765+
license_url_template=SCANCODE_LICENSEDB_URL,
766+
spdx_license_url=SPDX_LICENSE_URL,
760767
include_text=False,
761768
license_text_diagnostics=False,
762769
whole_lines=True,
@@ -777,11 +784,6 @@ def get_mapping(
777784
else:
778785
matched_text = self.matched_text(whole_lines=False, highlight=False)
779786

780-
SCANCODE_DATA_BASE_URL = 'https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data'
781-
SCANCODE_LICENSE_URL = SCANCODE_DATA_BASE_URL + '/licenses/{}.LICENSE'
782-
SCANCODE_LICENSE_RULE_URL = SCANCODE_DATA_BASE_URL + '/licenses/{}'
783-
SCANCODE_RULE_URL = SCANCODE_DATA_BASE_URL + '/rules/{}'
784-
785787
result = {}
786788

787789
# Detection Level Information

src/licensedcode/models.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
from licensedcode.tokenize import key_phrase_tokenizer
4646
from licensedcode.tokenize import KEY_PHRASE_OPEN
4747
from licensedcode.tokenize import KEY_PHRASE_CLOSE
48+
from scancode.api import SCANCODE_LICENSE_RULE_URL
49+
from scancode.api import SCANCODE_RULE_URL
4850

4951
"""
5052
Reference License and license Rule structures persisted as a combo of a YAML
@@ -1770,6 +1772,31 @@ def get_min_high_matched_length(self, unique=False):
17701772
return (self.min_high_matched_length_unique if unique
17711773
else self.min_high_matched_length)
17721774

1775+
def get_reference_data(self, matcher=None):
1776+
1777+
data = {}
1778+
1779+
data['license_expression'] = self.license_expression
1780+
data['rule_identifier'] = self.identifier
1781+
if matcher:
1782+
if matcher == "1-spdx-id":
1783+
data['rule_url'] = None
1784+
elif self.is_from_license:
1785+
data['rule_url'] = SCANCODE_LICENSE_RULE_URL.format(self.identifier)
1786+
else:
1787+
data['rule_url'] = SCANCODE_RULE_URL.format(self.identifier)
1788+
1789+
data['referenced_filenames'] = self.referenced_filenames
1790+
data['is_license_text'] = self.is_license_text
1791+
data['is_license_notice'] = self.is_license_notice
1792+
data['is_license_reference'] = self.is_license_reference
1793+
data['is_license_tag'] = self.is_license_tag
1794+
data['is_license_intro'] = self.is_license_intro
1795+
data['rule_length'] = self.length
1796+
data['rule_relevance'] = self.relevance
1797+
1798+
return data
1799+
17731800
def to_dict(self, include_text=False):
17741801
"""
17751802
Return an ordered mapping of self, excluding texts unless

src/licensedcode/plugin_license.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,15 @@
2424
from licensedcode.detection import get_detected_license_expression
2525
from licensedcode.detection import get_matches_from_detection_mappings
2626
from licensedcode.detection import get_referenced_filenames
27-
from licensedcode.detection import SCANCODE_LICENSEDB_URL
2827
from licensedcode.detection import LicenseDetection
2928
from licensedcode.detection import group_matches
3029
from licensedcode.detection import process_detections
3130
from licensedcode.detection import DetectionCategory
3231
from licensedcode.detection import detections_from_license_detection_mappings
3332
from licensedcode.detection import matches_from_license_match_mappings
3433
from licensedcode.detection import UniqueDetection
34+
from licensedcode.detection import LicenseDetectionFromResult
35+
from licensedcode.licenses_reference import populate_license_references
3536
from packagedcode.utils import combine_expressions
3637
from scancode.api import SCANCODE_LICENSEDB_URL
3738

@@ -64,11 +65,13 @@ class LicenseScanner(ScanPlugin):
6465
('license_detections', attr.ib(default=attr.Factory(list))),
6566
('license_clues', attr.ib(default=attr.Factory(list))),
6667
('percentage_of_license_text', attr.ib(default=0)),
67-
('for_licenses', attr.ib(default=attr.Factory(list))),
68+
('for_license_detections', attr.ib(default=attr.Factory(list))),
6869
])
6970

7071
codebase_attributes = dict(
71-
licenses=attr.ib(default=attr.Factory(list)),
72+
license_detections=attr.ib(default=attr.Factory(list)),
73+
license_references=attr.ib(default=attr.Factory(list)),
74+
license_rule_references=attr.ib(default=attr.Factory(list))
7275
)
7376

7477
sort_order = 2
@@ -209,28 +212,30 @@ def process_codebase(self, codebase, **kwargs):
209212
f'after : {license_expressions_after}'
210213
)
211214

212-
populate_for_licenses_in_resources(
215+
populate_for_license_detections_in_resources(
213216
codebase=codebase,
214217
detections=unique_license_detections,
215218
)
216-
codebase.attributes.licenses.extend([
219+
codebase.attributes.license_detections.extend([
217220
unique_detection.to_dict()
218221
for unique_detection in unique_license_detections
219222
])
220223

224+
populate_license_references(codebase)
221225

222-
def populate_for_licenses_in_resources(codebase, detections):
226+
227+
def populate_for_license_detections_in_resources(codebase, detections):
223228

224229
for detection in detections:
225230
if TRACE:
226231
logger_debug(
227-
f'populate_for_licenses_in_resources:',
232+
f'populate_for_license_detections_in_resources:',
228233
f'for detection: {detection.license_expression}\n',
229234
f'file paths: {detection.files}',
230235
)
231236
for file_region in detection.files:
232237
resource = codebase.get_resource(path=file_region.path)
233-
resource.for_licenses.append(detection.identifier)
238+
resource.for_license_detections.append(detection.identifier)
234239

235240

236241
def collect_license_detections(codebase):
@@ -243,7 +248,7 @@ def collect_license_detections(codebase):
243248

244249
if hasattr(codebase.root, 'license_detections'):
245250
has_licenses = True
246-
251+
247252
all_license_detections = []
248253

249254
for resource in codebase.walk():
@@ -346,16 +351,22 @@ def add_referenced_license_matches_for_detections(resource, codebase):
346351
if not resource.is_file:
347352
return
348353

349-
license_detections = resource.license_detections
350-
if not license_detections:
354+
license_detection_mappings = resource.license_detections
355+
if not license_detection_mappings:
351356
return
352357

353358
modified = False
354359

355-
for detection in license_detections:
360+
for license_detection_mapping in license_detection_mappings:
361+
362+
license_detection_object = LicenseDetectionFromResult.from_license_detection_mapping(
363+
license_detection_mapping=license_detection_mapping,
364+
file_path=resource.path,
365+
)
356366
detection_modified = False
357-
matches = detection["matches"]
358-
referenced_filenames = get_referenced_filenames(matches)
367+
license_match_mappings = license_detection_mapping["matches"]
368+
referenced_filenames = get_referenced_filenames(license_detection_object.matches)
369+
359370
if not referenced_filenames:
360371
continue
361372

@@ -369,7 +380,7 @@ def add_referenced_license_matches_for_detections(resource, codebase):
369380
if referenced_resource and referenced_resource.license_detections:
370381
modified = True
371382
detection_modified = True
372-
matches.extend(
383+
license_match_mappings.extend(
373384
get_matches_from_detection_mappings(
374385
license_detections=referenced_resource.license_detections
375386
)
@@ -379,12 +390,12 @@ def add_referenced_license_matches_for_detections(resource, codebase):
379390
continue
380391

381392
detection_log, license_expression = get_detected_license_expression(
382-
matches=matches,
393+
license_match_mappings=license_match_mappings,
383394
analysis=DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value,
384395
post_scan=True,
385396
)
386-
detection["license_expression"] = str(license_expression)
387-
detection["detection_log"] = detection_log
397+
license_detection_mapping["license_expression"] = str(license_expression)
398+
license_detection_mapping["detection_log"] = detection_log
388399

389400
if modified:
390401
license_expressions = [

0 commit comments

Comments
 (0)