Skip to content

Commit 9a7e4a6

Browse files
Make licenses reference default in license detection
* Removes the `--licenses-reference` CLI option and plugin.
* The `license_references` and `license_rule_references` attributes are now added by default with the `--license` option, and the same data has been removed from the match-level data to avoid duplication.
* Tests are reorganized and files renamed.
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 4825830 commit 9a7e4a6

16 files changed

+2340
-3942
lines changed

setup-mini.cfg

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,6 @@ scancode_post_scan =
189189
is-license-text = licensedcode.plugin_license_text:IsLicenseText
190190
filter-clues = cluecode.plugin_filter_clues:RedundantCluesFilter
191191
consolidate = summarycode.plugin_consolidate:Consolidator
192-
licenses-reference = licensedcode.plugin_licenses_reference:LicensesReference
193192

194193

195194
# scancode_output_filter is the entry point for filter plugins executed after

setup.cfg

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,6 @@ scancode_post_scan =
190190
is-license-text = licensedcode.plugin_license_text:IsLicenseText
191191
filter-clues = cluecode.plugin_filter_clues:RedundantCluesFilter
192192
consolidate = summarycode.plugin_consolidate:Consolidator
193-
licenses-reference = licensedcode.plugin_licenses_reference:LicensesReference
194193

195194

196195
# scancode_output_filter is the entry point for filter plugins executed after

src/formattedcode/output_debian.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from plugincode.output import output_impl
1818
from plugincode.output import OutputPlugin
1919
from licensedcode.detection import get_matches_from_detection_mappings
20-
from licensedcode.plugin_licenses_reference import get_matched_text_from_reference_data
20+
from licensedcode.licenses_reference import get_matched_text_from_reference_data
2121
from scancode import notice
2222

2323
"""

src/formattedcode/output_spdx.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from commoncode.text import python_safe_name
3232
from formattedcode import FileOptionType
3333
from licensedcode.detection import get_matches_from_detection_mappings
34-
from licensedcode.plugin_licenses_reference import get_matched_text_from_reference_data
34+
from licensedcode.licenses_reference import get_matched_text_from_reference_data
3535
from plugincode.output import output_impl
3636
from plugincode.output import OutputPlugin
3737
import scancode_config
Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# ScanCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/scancode-toolkit for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import os
11+
import logging
12+
from license_expression import Licensing
13+
14+
15+
# Tracing flags: tracing is enabled when the corresponding environment
# variable is set to any non-empty value.
TRACE_REFERENCE = os.environ.get('SCANCODE_DEBUG_LICENSE_REFERENCE', False)
TRACE_EXTRACT = os.environ.get('SCANCODE_DEBUG_LICENSE_REFERENCE_EXTRACT', False)

logger = logging.getLogger(__name__)


def logger_debug(*args):
    """
    Do nothing and return None: this is the default when tracing is disabled.
    """
    return None


if TRACE_EXTRACT or TRACE_REFERENCE:
    import sys

    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log the space-joined ``args`` at the DEBUG level.
        """
        parts = []
        for arg in args:
            # Non-empty strings are logged verbatim; anything else
            # (including an empty string) is logged using its repr().
            if isinstance(arg, str) and arg:
                parts.append(arg)
            else:
                parts.append(repr(arg))
        return logger.debug(' '.join(parts))
31+
32+
33+
def populate_license_references(codebase):
    """
    Collect unique License and Rule reference data from all the license
    detections of a ``codebase`` and add them to the codebase-level
    ``license_references`` and ``license_rule_references`` attributes, so that
    package-level and resource-level matches only refer to them by
    ``rule_identifier``.

    Do nothing if the codebase attributes have no ``license_detections``.

    As a side effect, the reference attributes are removed from every license
    match mapping processed through extract_license_rules_reference_data() and
    every resource is saved back to the codebase.
    """
    if not hasattr(codebase.attributes, 'license_detections'):
        return

    licexps = []
    rules_data = []
    has_packages = hasattr(codebase.attributes, 'packages')

    if has_packages:
        for pkg in codebase.attributes.packages:
            if TRACE_REFERENCE:
                logger_debug(
                    'populate_license_references: codebase.packages',
                    f'extract_license_rules_reference_data from: {pkg["purl"]}\n',
                )

            rules_data.extend(
                extract_license_rules_reference_data(
                    license_detections=pkg['license_detections'],
                )
            )
            licexps.append(pkg['declared_license_expression'])

    # This license rules reference data is a duplicate, as the codebase-level
    # `license_detections` is a top-level summary of all unique license
    # detections. The function is still called for its side effect: removing
    # the reference attributes from these license matches. The returned data
    # is deliberately ignored.
    if TRACE_REFERENCE:
        identifiers = [
            detection["identifier"]
            for detection in codebase.attributes.license_detections
        ]
        logger_debug(
            'populate_license_references: codebase.license_detections',
            f'extract_license_rules_reference_data from: {identifiers}\n',
        )
    extract_license_rules_reference_data(codebase.attributes.license_detections)

    for resource in codebase.walk():

        # Get license expressions from both package and license detections.
        # Use a default, like the other resource attribute lookups below, so
        # that a resource without this attribute does not abort the whole run.
        license_licexp = getattr(resource, 'detected_license_expression', None)
        if license_licexp:
            licexps.append(license_licexp)

        if has_packages:
            package_data = getattr(resource, 'package_data', []) or []
            licexps.extend(
                pkg['declared_license_expression'] for pkg in package_data
            )

            # Get license matches from package detections
            package_license_detections = []
            for pkg in package_data:
                if pkg['license_detections']:
                    package_license_detections.extend(pkg['license_detections'])

            rules_data.extend(
                extract_license_rules_reference_data(
                    license_detections=package_license_detections,
                )
            )

        # Get license matches from resource license detections and clues
        license_detections = getattr(resource, 'license_detections', []) or []
        license_clues = getattr(resource, 'license_clues', []) or []

        rules_data.extend(
            extract_license_rules_reference_data(
                license_detections=license_detections,
                license_matches=license_clues,
            )
        )

        # Persist the matches that were stripped of their reference attributes
        codebase.save_resource(resource)

    license_references = get_license_references(license_expressions=licexps)
    codebase.attributes.license_references.extend(license_references)

    rule_references = get_unique_rule_references(rules_data=rules_data)
    codebase.attributes.license_rule_references.extend(rule_references)

    # NOTE: a leftover debugging `raise Exception()` used to follow this trace
    # and aborted the function; it has been removed so that enabling tracing
    # only logs.
    if TRACE_REFERENCE:
        logger_debug(
            'populate_license_references: codebase.license_references',
            f'license_expressions: {licexps}\n',
            f'license_references: {license_references}\n',
            f'rules_data: {rules_data}\n',
            f'rule_references: {rule_references}\n',
        )
136+
137+
138+
def add_detection_to_license_references(codebase, license_detection_mappings):
    """
    Add the License and Rule reference data for the detections in
    ``license_detection_mappings`` to the ``codebase`` level references,
    skipping the references already present there.

    The reference data is extracted with
    extract_license_rules_reference_data(), which also strips it from the
    matches of these detections.
    """
    expressions = []
    for detection_mapping in license_detection_mappings:
        expressions.append(detection_mapping["license_expression"])

    new_license_references = get_license_references(license_expressions=expressions)

    rules_data = extract_license_rules_reference_data(
        license_detections=license_detection_mappings,
    )
    new_rule_references = get_unique_rule_references(rules_data=rules_data)

    add_license_references_to_codebase(
        codebase,
        new_license_references,
        new_rule_references,
    )
150+
151+
152+
def add_license_references_to_codebase(codebase, license_references, rule_references):
    """
    Extend the ``codebase`` level ``license_references`` and
    ``license_rule_references`` lists with the mappings from
    ``license_references`` and ``rule_references`` whose "key" or
    "rule_identifier" is not already present in the codebase lists.

    Only the entries present in the codebase before the call are checked:
    duplicates within the provided lists themselves are not removed.
    """
    known_license_keys = {
        reference["key"]
        for reference in codebase.attributes.license_references
    }
    known_rule_identifiers = {
        reference["rule_identifier"]
        for reference in codebase.attributes.license_rule_references
    }

    missing_license_references = [
        reference
        for reference in license_references
        if reference["key"] not in known_license_keys
    ]
    missing_rule_references = [
        reference
        for reference in rule_references
        if reference["rule_identifier"] not in known_rule_identifiers
    ]

    codebase.attributes.license_references.extend(missing_license_references)
    codebase.attributes.license_rule_references.extend(missing_rule_references)
176+
177+
178+
def get_matched_text_from_reference_data(codebase, rule_identifier):
    """
    Return the "matched_text" of the first rule reference mapping with this
    ``rule_identifier`` in the ``codebase`` level ``license_rule_references``.

    Return None if there is no such rule reference, or if the rule reference
    has no "matched_text" or an empty one.
    """
    for rule_reference_data in codebase.attributes.license_rule_references:
        if rule_reference_data["rule_identifier"] == rule_identifier:
            # The rule reference is a mapping (it is subscripted just above):
            # the value must be looked up by key. The previous getattr() call
            # looked for an attribute and therefore always returned None.
            return rule_reference_data.get("matched_text") or None

    return None
183+
184+
185+
def get_license_references(license_expressions, licensing=Licensing()):
    """
    Return a list of License data mappings, sorted by license key, with one
    mapping for each unique license key found in the ``license_expressions``
    list of license expression strings. Empty expressions are skipped.
    """
    from licensedcode.cache import get_licenses_db

    unique_license_keys = set()
    for licexp in filter(None, license_expressions):
        unique_license_keys.update(licensing.license_keys(licexp))

    licenses_by_key = get_licenses_db()
    return [
        licenses_by_key[license_key].to_dict(
            include_ignorables=False,
            include_text=True,
        )
        for license_key in sorted(unique_license_keys)
    ]
205+
206+
207+
def get_unique_rule_references(rules_data):
    """
    Return a list of unique Rule data mappings from a ``rules_data`` list of
    Rule data mappings.

    Uniqueness is based on the "rule_identifier" key: the first mapping seen
    for an identifier is kept and the input order is preserved.
    """
    seen_rule_identifiers = set()
    rules_references = []

    for rule_data in rules_data:
        rule_identifier = rule_data['rule_identifier']
        if rule_identifier in seen_rule_identifiers:
            continue

        # Use add() and not update(): update() with a string adds each of its
        # characters to the set, so the identifier itself would never be found
        # and duplicates would never be skipped.
        seen_rule_identifiers.add(rule_identifier)
        rules_references.append(rule_data)

    return rules_references
222+
223+
224+
def extract_license_rules_reference_data(license_detections=None, license_matches=None):
    """
    Return a list of Rule reference data mappings, unique by
    "rule_identifier", collected from the matches of a ``license_detections``
    list of license detection mappings and from a ``license_matches`` list of
    license match mappings.

    Also remove this data from each processed match mapping, apart from the
    "rule_identifier", as this data is referenced at the top-level by this
    attribute (see get_reference_data).

    Empty detections are skipped. Detection matches that no longer have a
    "referenced_filenames" key are treated as already processed and are
    skipped too.
    """
    seen_rule_identifiers = set()
    rules_reference_data = []

    for detection in license_detections or []:
        if not detection:
            continue

        for match in detection['matches']:
            rule_identifier = match['rule_identifier']

            # A match without reference attributes has nothing left to
            # extract. Skip it rather than appending an unbound or stale
            # `ref_data` left over from a previous match.
            if 'referenced_filenames' not in match:
                continue

            # Always called, even for an already seen rule, because it strips
            # the reference attributes from the match.
            ref_data = get_reference_data(match)

            if rule_identifier not in seen_rule_identifiers:
                # add() and not update(): update() with a string would add
                # each of its characters instead of the identifier.
                seen_rule_identifiers.add(rule_identifier)
                rules_reference_data.append(ref_data)

            if TRACE_EXTRACT:
                logger_debug(
                    f'extract_license_rules_reference_data:',
                    f'rule_identifier: {rule_identifier}\n',
                    f'ref_data: {ref_data}\n',
                    f'match: {match}\n',
                    f'rules_reference_data: {rules_reference_data}\n',
                )

    # NOTE(review): standalone matches are assumed to still carry their
    # reference attributes; get_reference_data() raises a KeyError otherwise.
    for match in license_matches or []:
        rule_identifier = match['rule_identifier']
        ref_data = get_reference_data(match)

        if rule_identifier not in seen_rule_identifiers:
            seen_rule_identifiers.add(rule_identifier)
            rules_reference_data.append(ref_data)

    return rules_reference_data
272+
273+
274+
def get_reference_data(match):
    """
    Return a mapping of Rule reference data built from a ``match`` license
    match mapping.

    The "license_expression" and "rule_identifier" are copied and stay in the
    ``match``. All the other reference attributes are moved out of the
    ``match``, and its "licenses" are removed and discarded. Raise a KeyError
    if any of these keys is missing from the ``match``.
    """
    ref_data = {
        'license_expression': match['license_expression'],
        'rule_identifier': match['rule_identifier'],
    }

    moved_keys = (
        'referenced_filenames',
        'is_license_text',
        'is_license_notice',
        'is_license_reference',
        'is_license_tag',
        'is_license_intro',
        'rule_length',
        'rule_relevance',
    )
    for key in moved_keys:
        ref_data[key] = match.pop(key)

    # The license data is available in the top-level license references
    match.pop('licenses')

    return ref_data

0 commit comments

Comments
 (0)