Skip to content

Commit f70bbb7

Browse files
Merge pull request #3620 from nexB/update-license-detections
Update license detections
2 parents 572f4fb + f016ef9 commit f70bbb7

File tree

831 files changed

+42378
-18247
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

831 files changed

+42378
-18247
lines changed

CHANGELOG.rst

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,57 @@ v33.0.0 (next next, roadmap)
1717
v32.1.0 (next, roadmap)
1818
----------------------------
1919

20+
Major API/other changes:
21+
22+
- Output Format Version updated to 3.1.0 (minor version bump)
23+
- Drops Python 3.7 and adopts Python 3.12
24+
- New license match attributes:
25+
- ``from_file``
26+
- ``matched_text_diagnostics`` is added for ``--license-text-diagnostics``
27+
- In codebase-level ``license_detections`` we have a new attribute
28+
``reference_matches``
29+
- SPDX license expressions everywhere side-by-side with ScanCode
30+
license expressions.
31+
- All rule attribute level data provided in codebase level ``todo`` items.
32+
33+
Changes in Output Data Structure:
34+
35+
- The data structure of the JSON output has changed for
36+
licenses at file level, and license detections at top-level.
37+
But note that all the changes are additions to the JSON output,
38+
so we have a minor version bump ``3.0.0`` to ``3.1.0``:
39+
40+
- There is a new attribute ``from_file`` in ``matches`` which is in
41+
``license_detections`` in:
42+
* File level ``license_detections``
43+
* Codebase level ``license_detections``
44+
* ``license_detections`` and ``other_license_detections`` in
45+
file-level ``package_data``
46+
* ``license_detections`` and ``other_license_detections`` in
47+
codebase level ``packages``
48+
49+
- On using the CLI option ``--license-text-diagnostics`` there is
50+
now a new license match attribute ``matched_text_diagnostics``
51+
with the matched text and highlighted diagnostics, instead of
52+
having this replace the plain ``matched_text``.
53+
54+
- A new ``reference_matches`` attribute is added to codebase-level
55+
``license_detections`` which is the same as the ``matches`` attribute
56+
in other license detections.
57+
58+
- We now have SPDX license expressions everywhere we have
59+
ScanCode license expressions for ease of use and adopting
60+
SPDX everywhere. A new attribute ``license_expression_spdx``
61+
is added to:
62+
- ``license_detections`` in file and codebase level
63+
- in package ``license_detections`` and ``other_license_detections``
64+
- ``matches`` for ``license_detections`` everywhere
65+
66+
- Adds all rule attribute level info in codebase level ``todo``
67+
data, to assist in review. This includes length, text, notes,
68+
referenced_filenames, and the boolean attributes (like
69+
is_license_notice, is_license_intro, etc., as applicable).
70+
2071
- A new field in packages with the license category for the
2172
detected license expression and also an API function to
2273
compute license categories from license expressions.

src/licensedcode/cache.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,11 +545,12 @@ def validate_spdx_license_keys(license_expression, licensing):
545545
try:
546546
parsed.render(template='{symbol.wrapped.spdx_license_key}')
547547
except AttributeError:
548+
msg = f"Error rendering SPDX license key for: {key}"
548549
messages.append(msg)
549550
pass
550551

551552
if messages:
552-
raise InvalidLicenseKeyError(messages)
553+
raise InvalidLicenseKeyError(f"ERROR in parsing license_expression: {license_expression}: type: {type(license_expression)} :{messages}")
553554

554555

555556
class InvalidLicenseKeyError(Exception):

src/licensedcode/detection.py

Lines changed: 92 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,15 @@ class LicenseDetection:
165165
license_expression = attr.ib(
166166
default=None,
167167
metadata=dict(
168-
help='Full license expression string '
169-
'using the SPDX license expression syntax and ScanCode license keys.')
168+
help='A license expression string using the SPDX license expression'
169+
' syntax and ScanCode license keys, the effective license expression'
170+
' for this license detection.')
171+
)
172+
173+
license_expression_spdx = attr.ib(
174+
default=None,
175+
metadata=dict(
176+
help='SPDX license expression string with SPDX ids.')
170177
)
171178

172179
matches = attr.ib(
@@ -248,8 +255,17 @@ def from_matches(
248255
detection_log=detection_log,
249256
)
250257
detection.identifier = detection.identifier_with_expression
258+
detection.license_expression_spdx = detection.spdx_license_expression()
251259
return detection
252260

261+
def spdx_license_expression(self):
262+
from licensedcode.cache import build_spdx_license_expression
263+
from licensedcode.cache import get_cache
264+
return str(build_spdx_license_expression(
265+
license_expression=self.license_expression,
266+
licensing=get_cache().licensing,
267+
))
268+
253269
def __eq__(self, other):
254270
return (
255271
isinstance(other, LicenseDetection)
@@ -515,6 +531,7 @@ def from_license_detection_mapping(
515531

516532
detection = cls(
517533
license_expression=license_detection_mapping["license_expression"],
534+
license_expression_spdx=license_detection_mapping["license_expression_spdx"],
518535
detection_log=license_detection_mapping.get("detection_log", []) or None,
519536
identifier=license_detection_mapping["identifier"],
520537
matches=matches,
@@ -590,6 +607,12 @@ class LicenseMatchFromResult(LicenseMatch):
590607
help='Text which was matched')
591608
)
592609

610+
matched_text_diagnostics = attr.ib(
611+
default=None,
612+
metadata=dict(
613+
help='Text which was matched, with extra diagnostics information.')
614+
)
615+
593616
def score(self):
594617
return self.match_score
595618

@@ -615,15 +638,18 @@ def from_dict(cls, license_match_mapping):
615638
"""
616639
rule = Rule.from_match_data(license_match_mapping)
617640
matched_text = license_match_mapping.get("matched_text") or None
641+
matched_text_diagnostics = license_match_mapping.get("matched_text_diagnostics") or None
618642

619643
return cls(
644+
from_file=license_match_mapping["from_file"],
620645
start_line=license_match_mapping["start_line"],
621646
end_line=license_match_mapping["end_line"],
622647
match_score=license_match_mapping["score"],
623648
matched_length=license_match_mapping["matched_length"],
624649
match_coverage=license_match_mapping["match_coverage"],
625650
matcher=license_match_mapping["matcher"],
626651
text=matched_text,
652+
matched_text_diagnostics=matched_text_diagnostics,
627653
rule=rule,
628654
qspan=None,
629655
ispan=None,
@@ -642,35 +668,57 @@ def to_dict(
642668
include_text=False,
643669
license_text_diagnostics=False,
644670
whole_lines=True,
671+
rule_details=False,
645672
):
646673
"""
647674
Return a "result" scan data built from a LicenseMatch object.
648675
"""
649-
matched_text = None
650-
if include_text:
651-
matched_text = self.matched_text
652-
653676
result = {}
654677

655-
# Detection Level Information
656-
result['score'] = self.score()
678+
result['license_expression'] = self.rule.license_expression
679+
result['license_expression_spdx'] = self.rule.spdx_license_expression()
680+
result['from_file'] = self.from_file
657681
result['start_line'] = self.start_line
658682
result['end_line'] = self.end_line
683+
if rule_details:
684+
result.update(self.rule.get_flags_mapping())
685+
result['matcher'] = self.matcher
686+
result['score'] = self.score()
659687
result['matched_length'] = self.len()
688+
if rule_details:
689+
result["rule_length"] = self.rule.length
660690
result['match_coverage'] = self.coverage()
661-
result['matcher'] = self.matcher
662-
663-
# LicenseDB Level Information (Rule that was matched)
664-
result['license_expression'] = self.rule.license_expression
665-
result['rule_identifier'] = self.rule.identifier
666691
result['rule_relevance'] = self.rule.relevance
692+
result['rule_identifier'] = self.rule.identifier
667693
result['rule_url'] = self.rule.rule_url
694+
if rule_details:
695+
result["rule_notes"] = self.rule.notes
696+
result["referenced_filenames"] = self.rule.referenced_filenames
697+
if include_text and self.matched_text:
698+
result['matched_text'] = self.matched_text
699+
if license_text_diagnostics and self.matched_text_diagnostics:
700+
result['matched_text_diagnostics'] = self.matched_text_diagnostics
701+
if rule_details:
702+
result["rule_text"] = self.rule.text
668703

669-
if include_text:
670-
result['matched_text'] = matched_text
671704
return result
672705

673706

707+
def populate_matches_with_path(matches, path):
708+
"""
709+
Given `matches` list of LicenseMatch objects, populate the `from_file`
710+
attribute in them with `path` which is the path for the origin file for
711+
that license match.
712+
"""
713+
for match in matches:
714+
# Here if we have the `from_file` attribute populated already,
715+
# they are from other files, and if it's empty, they are from
716+
# the original resource, so we populate the files with the resource
717+
# path for the original resource of their origin
718+
if not match["from_file"]:
719+
match["from_file"] = path
720+
721+
674722
def collect_license_detections(codebase, include_license_clues=True):
675723
"""
676724
Return a list of LicenseDetectionFromResult object rehydrated from
@@ -680,7 +728,10 @@ def collect_license_detections(codebase, include_license_clues=True):
680728
according to their license detections. This is required because package fields
681729
are populated in package plugin, which runs before the license plugin, and thus
682730
the license plugin step where unknown references to other files are dereferenced
683-
does not show up automatically in package attributes.
731+
does not show up automatically in package attributes.
732+
733+
Also populate from_file attributes with resource paths for matches which have
734+
origin in the same file.
684735
"""
685736
has_packages = hasattr(codebase.root, 'package_data')
686737
has_licenses = hasattr(codebase.root, 'license_detections')
@@ -692,7 +743,11 @@ def collect_license_detections(codebase, include_license_clues=True):
692743
resource_license_detections = []
693744
if has_licenses:
694745
license_detections = getattr(resource, 'license_detections', []) or []
746+
for detection in license_detections:
747+
populate_matches_with_path(matches=detection["matches"], path=resource.path)
695748
license_clues = getattr(resource, 'license_clues', []) or []
749+
populate_matches_with_path(matches=license_clues, path=resource.path)
750+
codebase.save_resource(resource)
696751

697752
if license_detections:
698753
license_detection_objects = detections_from_license_detection_mappings(
@@ -729,6 +784,9 @@ def collect_license_detections(codebase, include_license_clues=True):
729784

730785
package_license_detections = package["license_detections"]
731786
if package_license_detections:
787+
for detection in package_license_detections:
788+
populate_matches_with_path(matches=detection["matches"], path=resource.path)
789+
modified = True
732790
package_license_detection_mappings.extend(package_license_detections)
733791
detection_is_same, license_expression = verify_package_license_expression(
734792
license_detection_mappings=package_license_detections,
@@ -828,6 +886,7 @@ class UniqueDetection:
828886
"""
829887
identifier = attr.ib(default=None)
830888
license_expression = attr.ib(default=None)
889+
license_expression_spdx = attr.ib(default=None)
831890
detection_count = attr.ib(default=None)
832891
matches = attr.ib(default=attr.Factory(list))
833892
detection_log = attr.ib(default=attr.Factory(list))
@@ -860,12 +919,14 @@ def get_unique_detections(cls, license_detections):
860919
for match in detection.matches
861920
]
862921
))
922+
detection.license_expression_spdx = detection.spdx_license_expression()
863923
detection.identifier = detection.identifier_with_expression
864924

865925
unique_license_detections.append(
866926
cls(
867927
identifier=detection.identifier,
868928
license_expression=detection.license_expression,
929+
license_expression_spdx=detection.license_expression_spdx,
869930
detection_log=detection_log or [],
870931
matches=detection.matches,
871932
detection_count=len(file_regions),
@@ -875,7 +936,11 @@ def get_unique_detections(cls, license_detections):
875936

876937
return unique_license_detections
877938

878-
def to_dict(self, license_diagnostics):
939+
def to_dict(self,
940+
include_text=False,
941+
license_text_diagnostics=False,
942+
license_diagnostics=False,
943+
):
879944

880945
def dict_fields(attr, value):
881946

@@ -890,11 +955,20 @@ def dict_fields(attr, value):
890955

891956
return True
892957

893-
return attr.asdict(self, filter=dict_fields)
958+
detection_mapping = attr.asdict(self, filter=dict_fields)
959+
detection_mapping["reference_matches"] = [
960+
match.to_dict(
961+
include_text=include_text,
962+
license_text_diagnostics=license_text_diagnostics,
963+
)
964+
for match in self.matches
965+
]
966+
return detection_mapping
894967

895968
def get_license_detection_object(self):
896969
return LicenseDetection(
897970
license_expression=self.license_expression,
971+
license_expression_spdx=self.license_expression_spdx,
898972
detection_log=self.detection_log,
899973
matches=self.matches,
900974
identifier=self.identifier,

src/licensedcode/licenses_reference.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ def process_codebase(self, codebase, **kwargs):
6969
Collect the ``license_references`` and ``rule_references``
7070
list of data mappings and add to the ``codebase``.
7171
"""
72-
include_files = 'license' in kwargs
73-
include_packages = 'package' in kwargs
72+
include_files = hasattr(codebase.attributes, 'license_detections')
73+
include_packages = hasattr(codebase.attributes, 'packages')
7474

7575
license_references, rule_references = collect_license_and_rule_references(
7676
codebase=codebase,
@@ -86,17 +86,25 @@ def collect_license_and_rule_references(codebase, include_packages=True, include
8686
Return a two-tuple of (``license_references``, ``license_rule_references``)
8787
sorted lists of unique mappings collected from a ``codebase``.
8888
"""
89+
if TRACE:
90+
logger_debug(f'include_packages: {include_packages}, include_files: {include_files}')
8991

9092
license_keys = set()
9193
rules_by_identifier = {}
9294

9395
if include_packages:
9496
pks, prules = collect_references_from_packages(codebase)
97+
if TRACE:
98+
logger_debug(f'collect_references_from_packages: license keys: {pks}')
99+
logger_debug(f'collect_references_from_packages: rules by id: {prules}')
95100
license_keys.update(pks)
96101
rules_by_identifier.update(prules)
97102

98103
if include_files:
99104
pks, prules = collect_references_from_files(codebase)
105+
if TRACE:
106+
logger_debug(f'collect_references_from_files: license keys: {pks}')
107+
logger_debug(f'collect_references_from_files: rules by id: {prules}')
100108
license_keys.update(pks)
101109
rules_by_identifier.update(prules)
102110

@@ -140,10 +148,6 @@ def collect_references_from_packages(codebase):
140148
if expression:
141149
license_keys.update(licensing.license_keys(expression))
142150

143-
detections = getattr(resource, 'license_detections', []) or []
144-
rules_by_id = build_rules_from_detection_data(detections)
145-
rules_by_identifier.update(rules_by_id)
146-
147151
for rule in rules_by_identifier.values():
148152
# TODO: consider using the expression object directly instead
149153
expo = rule.license_expression

0 commit comments

Comments
 (0)