Skip to content

Commit 6c15ebf

Browse files
Merge pull request #3681 from nexB/update-referenced-files
Refine referenced filenames #3547
2 parents bfd88b6 + 5f28d5c commit 6c15ebf

File tree

9 files changed

+482
-35
lines changed

9 files changed

+482
-35
lines changed

src/licensedcode/detection.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,25 @@ def identifier_with_expression(self):
334334
id_safe_expression = python_safe_name(s=str(self.license_expression))
335335
return "{}-{}".format(id_safe_expression, self._identifier)
336336

337+
@property
def is_unknown(self):
    """
    Return True if there are unknown license keys in the license expression
    for this detection, return False otherwise.

    A detection that has no license expression (None or empty) is not
    considered unknown and yields False.
    """
    license_expression = self.license_expression
    if not license_expression:
        # Without this guard a membership test against None raises a
        # TypeError instead of answering the question.
        return False

    unknown_license_keys = (
        "unknown-license-reference",
        "unknown-spdx",
        "unknown",
        "free-unknown",
    )

    # NOTE: this is a membership test of each key against the expression
    # itself (a substring test when the expression is a string), not a
    # comparison against the parsed license keys of the expression.
    return any(
        license_key in license_expression
        for license_key in unknown_license_keys
    )
355+
337356
def get_start_end_line(self):
338357
"""
339358
Return start and end line for a license detection issue, from the
@@ -1356,6 +1375,61 @@ def has_references_to_local_files(license_matches):
13561375
)
13571376

13581377

1378+
def use_referenced_license_expression(referenced_license_expression, license_detection, licensing=None):
    """
    Return True if the ``license_detection`` LicenseDetection should include
    the matches represented by the ``referenced_license_expression`` string.
    Return False otherwise.

    Used when we have a ``license_detection`` with a match to a license rule like
    "See license in COPYING" and where the ``referenced_license_expression`` is the
    expression found in the "COPYING" file, which is the combined expression from
    all license detections found in "COPYING" (or multiple referenced files).

    ``licensing`` is the object used to extract license keys from both
    expressions through its ``license_keys(expression=...)`` method. When it
    is None, a ``Licensing()`` instance is created on demand, and only if the
    keys actually need to be compared.

    Reference: https://github.com/nexB/scancode-toolkit/issues/3547
    """
    # TODO: Also determine if referenced matches could be added but
    # resulting license expression should not be modified.

    # Above this many license keys in the referenced expression, we do not
    # follow the reference (see the last check below).
    max_referenced_license_keys = 5

    if not referenced_license_expression or not license_detection:
        return False

    # We should always include referenced license matches to resolve an unknown
    # license reference
    if license_detection.is_unknown:
        return True

    # We should always include referenced license matches when the license
    # expression from the referenced license matches match the license
    # expression for the detection
    detected_license_expression = license_detection.license_expression
    if referenced_license_expression == detected_license_expression:
        return True

    # Created lazily rather than as a default argument value, which would be
    # evaluated once at import time and shared by every call.
    if licensing is None:
        licensing = Licensing()

    license_keys = set(
        licensing.license_keys(expression=detected_license_expression)
    )
    referenced_license_keys = set(
        licensing.license_keys(expression=referenced_license_expression)
    )

    # From here on the two expressions are known to be different (identical
    # expressions returned True above). If we have the same license keys but
    # not the same license expression then the reference could merely be
    # pointing to notices, combining which produces a different expression,
    # and the original detection is correct
    if license_keys == referenced_license_keys:
        return False

    # when there are many license keys in an expression, and there are no
    # unknown or other cases, we cannot safely conclude that we should
    # follow the license in the referenced filenames. This is likely
    # a case where we have larger notices and several combined expressions,
    if len(referenced_license_keys) > max_referenced_license_keys:
        return False

    return True
1431+
1432+
13591433
def get_detected_license_expression(
13601434
analysis,
13611435
license_matches=None,

src/licensedcode/plugin_license.py

Lines changed: 55 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from commoncode.cliutils import PluggableCommandLineOption
1616
from commoncode.cliutils import SCAN_GROUP
1717
from commoncode.cliutils import SCAN_OPTIONS_GROUP
18+
from license_expression import combine_expressions
1819
from plugincode.scan import ScanPlugin
1920
from plugincode.scan import scan_impl
2021

@@ -30,10 +31,12 @@
3031
from licensedcode.detection import LicenseDetectionFromResult
3132
from licensedcode.detection import sort_unique_detections
3233
from licensedcode.detection import UniqueDetection
34+
from licensedcode.detection import use_referenced_license_expression
3335
from packagedcode.utils import combine_expressions
3436
from scancode.api import SCANCODE_LICENSEDB_URL
3537

3638
TRACE = os.environ.get('SCANCODE_DEBUG_PLUGIN_LICENSE', False)
39+
TRACE_REFERENCE = os.environ.get('SCANCODE_DEBUG_PLUGIN_LICENSE_REFERENCE', False)
3740

3841

3942
def logger_debug(*args):
@@ -42,7 +45,7 @@ def logger_debug(*args):
4245

4346
logger = logging.getLogger(__name__)
4447

45-
if TRACE:
48+
if TRACE or TRACE_REFERENCE:
4649
import sys
4750
logging.basicConfig(stream=sys.stdout)
4851
logger.setLevel(logging.DEBUG)
@@ -214,6 +217,8 @@ def process_codebase(self, codebase, license_text=False, license_diagnostics=Fal
214217
f'before: {license_expressions_before}\n'
215218
f'after : {license_expressions_after}'
216219
)
220+
221+
#raise Exception()
217222

218223
license_detections = collect_license_detections(
219224
codebase=codebase,
@@ -259,20 +264,28 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):
259264

260265
modified = False
261266

267+
if TRACE_REFERENCE:
268+
logger_debug(
269+
f'add_referenced_license_matches: resource_path: {resource.path}',
270+
)
271+
262272
for license_detection_mapping in license_detection_mappings:
263273

264274
license_detection = LicenseDetectionFromResult.from_license_detection_mapping(
265275
license_detection_mapping=license_detection_mapping,
266276
file_path=resource.path,
267277
)
268-
detection_modified = False
269-
detections_added = []
270278
license_match_mappings = license_detection_mapping["matches"]
271279
referenced_filenames = get_referenced_filenames(license_detection.matches)
272280

273281
if not referenced_filenames:
282+
if TRACE_REFERENCE:
283+
logger_debug(
284+
f'No references at license detection with expression: {license_detection.license_expression}',
285+
)
274286
continue
275287

288+
referenced_detections = []
276289
for referenced_filename in referenced_filenames:
277290
referenced_resource = find_referenced_resource(
278291
referenced_filename=referenced_filename,
@@ -281,26 +294,53 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):
281294
)
282295

283296
if referenced_resource and referenced_resource.license_detections:
284-
modified = True
285-
detection_modified = True
286-
detections_added.extend(referenced_resource.license_detections)
287-
matches_to_extend = get_matches_from_detection_mappings(
288-
license_detections=referenced_resource.license_detections
297+
referenced_detections.extend(
298+
referenced_resource.license_detections
289299
)
290-
populate_matches_with_path(
291-
matches=matches_to_extend,
292-
path=referenced_resource.path
293-
)
294-
license_match_mappings.extend(matches_to_extend)
295300

296-
if not detection_modified:
301+
for detection in referenced_resource.license_detections:
302+
populate_matches_with_path(
303+
matches=detection["matches"],
304+
path=referenced_resource.path
305+
)
306+
307+
referenced_license_expression = combine_expressions(
308+
expressions=[
309+
detection["license_expression"]
310+
for detection in referenced_detections
311+
],
312+
)
313+
if not use_referenced_license_expression(
314+
referenced_license_expression=referenced_license_expression,
315+
license_detection=license_detection,
316+
):
317+
if TRACE_REFERENCE:
318+
logger_debug(
319+
f'use_referenced_license_expression: False for '
320+
f'resource: {referenced_resource.path} and '
321+
f'license_expression: {referenced_license_expression}',
322+
)
297323
continue
298324

325+
if TRACE_REFERENCE:
326+
logger_debug(
327+
f'use_referenced_license_expression: True for '
328+
f'resource: {referenced_resource.path} and '
329+
f'license_expression: {referenced_license_expression}',
330+
)
331+
332+
modified = True
333+
matches_to_extend = get_matches_from_detection_mappings(
334+
license_detections=referenced_detections
335+
)
336+
license_match_mappings.extend(matches_to_extend)
337+
299338
detection_log, license_expression = get_detected_license_expression(
300339
license_match_mappings=license_match_mappings,
301340
analysis=DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value,
302341
post_scan=True,
303342
)
343+
304344
license_expression_spdx = build_spdx_license_expression(
305345
license_expression=str(license_expression),
306346
licensing=get_cache().licensing,
@@ -310,7 +350,7 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):
310350
license_detection_mapping["detection_log"] = detection_log
311351
license_detection_mapping["identifier"] = get_new_identifier_from_detections(
312352
initial_detection=license_detection_mapping,
313-
detections_added=detections_added,
353+
detections_added=referenced_detections,
314354
license_expression=license_expression,
315355
)
316356

src/packagedcode/licensing.py

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from licensedcode.detection import detect_licenses
2727
from licensedcode.detection import LicenseDetectionFromResult
2828
from licensedcode.detection import populate_matches_with_path
29+
from licensedcode.detection import use_referenced_license_expression
2930
from licensedcode.spans import Span
3031
from licensedcode import query
3132

@@ -93,41 +94,52 @@ def add_referenced_license_matches_for_package(resource, codebase):
9394
file_path=resource.path,
9495
)
9596

96-
detection_modified = False
97+
detections_added = []
9798
license_match_mappings = license_detection_mapping["matches"]
9899
referenced_filenames = get_referenced_filenames(license_detection_object.matches)
99100
if not referenced_filenames:
100101
continue
101102

103+
referenced_detections = []
102104
for referenced_filename in referenced_filenames:
103105
referenced_resource = find_referenced_resource(
104106
referenced_filename=referenced_filename,
105107
resource=resource,
106108
codebase=codebase,
107109
)
108110

109-
if not referenced_resource:
110-
continue
111-
112-
referenced_license_detections = referenced_resource.license_detections
113-
114-
if referenced_license_detections:
115-
modified = True
116-
detection_modified = True
117-
matches_to_extend = get_matches_from_detection_mappings(
118-
license_detections=referenced_license_detections
111+
if referenced_resource and referenced_resource.license_detections:
112+
referenced_detections.extend(
113+
referenced_resource.license_detections
119114
)
115+
120116
# For LicenseMatches with different resources as origin, add the
121117
# resource path to these matches as origin info
122-
populate_matches_with_path(
123-
matches=matches_to_extend,
124-
path=referenced_resource.path
125-
)
126-
license_match_mappings.extend(matches_to_extend)
127-
128-
if not detection_modified:
118+
for detection in referenced_resource.license_detections:
119+
populate_matches_with_path(
120+
matches=detection["matches"],
121+
path=referenced_resource.path
122+
)
123+
124+
referenced_license_expression = combine_expressions(
125+
expressions=[
126+
detection["license_expression"]
127+
for detection in referenced_detections
128+
],
129+
)
130+
if not use_referenced_license_expression(
131+
referenced_license_expression=referenced_license_expression,
132+
license_detection=license_detection_object,
133+
):
129134
continue
130135

136+
modified = True
137+
detections_added.extend(referenced_resource.license_detections)
138+
matches_to_extend = get_matches_from_detection_mappings(
139+
license_detections=referenced_resource.license_detections,
140+
)
141+
license_match_mappings.extend(matches_to_extend)
142+
131143
detection_log, license_expression = get_detected_license_expression(
132144
license_match_mappings=license_match_mappings,
133145
analysis=DetectionCategory.PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL.value,
@@ -142,7 +154,7 @@ def add_referenced_license_matches_for_package(resource, codebase):
142154
license_detection_mapping["detection_log"] = detection_log
143155
license_detection_mapping["identifier"] = get_new_identifier_from_detections(
144156
initial_detection=license_detection_mapping,
145-
detections_added=referenced_license_detections,
157+
detections_added=detections_added,
146158
license_expression=license_expression,
147159
)
148160

@@ -223,7 +235,20 @@ def add_referenced_license_detection_from_package(resource, codebase):
223235
f'sibling_license_detections: {sibling_license_detections}'
224236
)
225237

238+
referenced_license_expression = combine_expressions(
239+
expressions=[
240+
detection["license_expression"]
241+
for detection in sibling_license_detections
242+
],
243+
)
244+
if not use_referenced_license_expression(
245+
referenced_license_expression=referenced_license_expression,
246+
license_detection=license_detection_object,
247+
):
248+
continue
249+
226250
for sibling_detection in sibling_license_detections:
251+
227252
modified = True
228253
detection_modified = True
229254
license_match_mappings.extend(sibling_detection["matches"])
@@ -239,6 +264,21 @@ def add_referenced_license_detection_from_package(resource, codebase):
239264
break
240265

241266
pkg_detections = codebase_package["license_detections"]
267+
if not pkg_detections:
268+
continue
269+
270+
referenced_license_expression = combine_expressions(
271+
expressions=[
272+
detection["license_expression"]
273+
for detection in pkg_detections
274+
],
275+
)
276+
if not use_referenced_license_expression(
277+
referenced_license_expression=referenced_license_expression,
278+
license_detection=license_detection_object,
279+
):
280+
continue
281+
242282
for pkg_detection in pkg_detections:
243283
modified = True
244284
detection_modified = True

0 commit comments

Comments
 (0)