Skip to content

Commit 8721bdf

Browse files
Improve unknown reference to package dereferencing #2965 #1379
* In case unknown references are present without any top-level package detected, dereference them using the license detections in legalese/readme files at the codebase root. * Add example cases from samba/samba, sugarlabs/physics, debian fusiondirectory, and paddlenlp as tests. Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 7d5c647 commit 8721bdf

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+43221
-30
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This file is distributed under the same license as the
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
license_expression: free-unknown
2+
is_license_reference: yes
3+
is_continuous: yes
4+
relevance: 100
5+
minimum_coverage: 100
6+
referenced_filenames:
7+
- package

src/licensedcode/detection.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141

4242
TRACE = os.environ.get('SCANCODE_DEBUG_LICENSE_DETECTION', False)
4343

44+
TRACE_ANALYSIS = False
45+
TRACE_IS_FUNCTIONS = False
4446

4547
def logger_debug(*args):
4648
pass
@@ -49,7 +51,11 @@ def logger_debug(*args):
4951
logger = logging.getLogger(__name__)
5052

5153

52-
if TRACE:
54+
if (
55+
TRACE
56+
or TRACE_ANALYSIS
57+
or TRACE_IS_FUNCTIONS
58+
):
5359
import sys
5460

5561
logging.basicConfig(stream=sys.stdout)
@@ -82,6 +88,7 @@ class DetectionCategory(Enum):
8288
UNKNOWN_INTRO_BEFORE_DETECTION = 'unknown-intro-before-detection'
8389
UNKNOWN_FILE_REFERENCE_LOCAL = 'unknown-file-reference-local'
8490
UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE = 'unknown-reference-in-file-to-package'
91+
UNKNOWN_REFERENCE_IN_FILE_TO_NONEXISTENT_PACKAGE = 'unknown-reference-in-file-to-nonexistent-package'
8592
PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL = 'package-unknown-file-reference-local'
8693
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
8794
PACKAGE_ADD_FROM_FILE = 'from-package-file'
@@ -101,6 +108,7 @@ class DetectionRule(Enum):
101108
UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'unknown-reference-to-local-file'
102109
UNKNOWN_INTRO_FOLLOWED_BY_MATCH = 'unknown-intro-followed-by-match'
103110
UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE = 'unknown-reference-in-file-to-package'
111+
UNKNOWN_REFERENCE_IN_FILE_TO_NONEXISTENT_PACKAGE = 'unknown-reference-in-file-to-nonexistent-package'
104112
CONTAINED_SAME_LICENSE = 'contained-with-same-license'
105113
NOTICE_FOLLOWED_BY_TEXT = 'notice-followed-by-text'
106114
CONTIGUOUS_SAME_LICENSE = 'contiguous-with-same-license'
@@ -700,55 +708,84 @@ def get_detected_license_expression(matches, analysis, post_scan=False):
700708
Return a tuple of (reasons, combined_expression) by combining a `matches` list of
701709
LicenseMatch objects using an `analysis` code string.
702710
"""
703-
if TRACE:
711+
if TRACE or TRACE_ANALYSIS:
704712
logger_debug(f'license_matches {matches}', f'package_license {analysis}', f'post_scan: {post_scan}')
705713

706714
matches_for_expression = None
707715
combined_expression = None
708716
reasons = []
709717

710718
if analysis == DetectionCategory.FALSE_POSITVE.value:
719+
if TRACE_ANALYSIS:
720+
logger_debug(f'analysis {DetectionRule.FALSE_POSITIVE.value}')
711721
reasons.append(DetectionRule.FALSE_POSITIVE.value)
712722
return reasons, combined_expression
713723

714724
elif analysis == DetectionCategory.UNDETECTED_LICENSE.value:
725+
if TRACE_ANALYSIS:
726+
logger_debug(f'analysis {DetectionCategory.UNDETECTED_LICENSE.value}')
715727
matches_for_expression = matches
716728
reasons.append(DetectionRule.UNDETECTED_LICENSE.value)
717729

718730
elif analysis == DetectionCategory.UNKNOWN_INTRO_BEFORE_DETECTION.value:
731+
if TRACE_ANALYSIS:
732+
logger_debug(f'analysis {DetectionCategory.UNKNOWN_INTRO_BEFORE_DETECTION.value}')
719733
matches_for_expression = filter_license_intros(matches)
720734
reasons.append(DetectionRule.UNKNOWN_INTRO_FOLLOWED_BY_MATCH.value)
721735

722736
elif post_scan:
723737
if analysis == DetectionCategory.UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE.value:
738+
if TRACE_ANALYSIS:
739+
logger_debug(f'analysis {DetectionCategory.UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE.value}')
724740
matches_for_expression = filter_license_references(matches)
725741
reasons.append(DetectionRule.UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE.value)
726742

743+
elif analysis == DetectionCategory.UNKNOWN_REFERENCE_IN_FILE_TO_NONEXISTENT_PACKAGE.value:
744+
if TRACE_ANALYSIS:
745+
logger_debug(f'analysis {DetectionCategory.UNKNOWN_REFERENCE_IN_FILE_TO_NONEXISTENT_PACKAGE.value}')
746+
matches_for_expression = filter_license_references(matches)
747+
reasons.append(DetectionRule.UNKNOWN_REFERENCE_IN_FILE_TO_NONEXISTENT_PACKAGE.value)
748+
727749
elif analysis == DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value:
750+
if TRACE_ANALYSIS:
751+
logger_debug(f'analysis {DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value}')
728752
matches_for_expression = filter_license_references(matches)
729753
reasons.append(DetectionRule.UNKNOWN_REFERENCE_TO_LOCAL_FILE.value)
730754

731755
elif analysis == DetectionCategory.PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL.value:
756+
if TRACE_ANALYSIS:
757+
logger_debug(f'analysis {DetectionCategory.PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL.value}')
732758
matches_for_expression = filter_license_references(matches)
733759
reasons.append(DetectionRule.PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE.value)
734760

735761
elif analysis == DetectionCategory.PACKAGE_ADD_FROM_SIBLING_FILE.value:
762+
if TRACE_ANALYSIS:
763+
logger_debug(f'analysis {DetectionCategory.PACKAGE_ADD_FROM_SIBLING_FILE.value}')
736764
matches_for_expression = filter_license_references(matches)
737765
reasons.append(DetectionRule.PACKAGE_ADD_FROM_SIBLING_FILE.value)
738766

739767
elif analysis == DetectionCategory.PACKAGE_ADD_FROM_FILE.value:
768+
if TRACE_ANALYSIS:
769+
logger_debug(f'analysis {DetectionCategory.PACKAGE_ADD_FROM_FILE.value}')
740770
matches_for_expression = filter_license_references(matches)
741771
reasons.append(DetectionRule.PACKAGE_ADD_FROM_FILE.value)
742772

743773
elif analysis == DetectionCategory.UNKNOWN_MATCH.value:
774+
if TRACE_ANALYSIS:
775+
logger_debug(f'analysis {DetectionCategory.UNKNOWN_MATCH.value}')
776+
matches_for_expression = matches
744777
reasons.append(DetectionRule.UNKNOWN_MATCH.value)
745778
return reasons, combined_expression
746779

747780
elif analysis == DetectionCategory.LICENSE_CLUES.value:
781+
if TRACE_ANALYSIS:
782+
logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
748783
reasons.append(DetectionRule.LICENSE_CLUES.value)
749784
return reasons, combined_expression
750785

751786
else:
787+
if TRACE_ANALYSIS:
788+
logger_debug(f'analysis {DetectionRule.NOT_COMBINED.value}')
752789
matches_for_expression = matches
753790
reasons.append(DetectionRule.NOT_COMBINED.value)
754791

@@ -764,7 +801,7 @@ def get_detected_license_expression(matches, analysis, post_scan=False):
764801
expressions=[match.rule.license_expression for match in matches_for_expression]
765802
)
766803

767-
if TRACE:
804+
if TRACE or TRACE_ANALYSIS:
768805
logger_debug(f'combined_expression {combined_expression}')
769806

770807
return reasons, combined_expression

src/packagedcode/licensing.py

Lines changed: 75 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,13 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
143143
yield resource
144144

145145

146-
def add_referenced_license_detection_from_package(resource, codebase):
146+
def add_referenced_license_detection_from_package(resource, codebase, no_licenses):
147147
"""
148148
Return an updated ``resource`` saving it in place, after adding new license
149149
matches (licenses and license_expressions) following their Rule
150-
``referenced_filenames`` if it is pointing to a package.
150+
``referenced_filenames`` if it is pointing to a package. If there are no
151+
top level packages, check for License/Readme files at codebase root and
152+
add licenses from there.
151153
"""
152154
if TRACE:
153155
logger_debug(f'packagedcode.licensing: add_referenced_license_matches_from_package: resource: {resource.path}')
@@ -178,25 +180,45 @@ def add_referenced_license_detection_from_package(resource, codebase):
178180
if not has_reference_to_package:
179181
continue
180182

181-
for_packages = resource.for_packages
182-
for package_uid in for_packages:
183-
184-
for codebase_package in codebase_packages:
185-
if codebase_package["package_uid"] == package_uid:
186-
break
183+
if not codebase_packages:
184+
root_path = codebase.root.path
185+
root_resource = codebase.get_resource(path=root_path)
186+
sibling_license_detections, _le = get_license_detections_from_sibling_file(
187+
root_resource, codebase, no_licenses
188+
)
189+
if TRACE:
190+
logger_debug(
191+
f'packagedcode.licensing: add_referenced_license_matches_from_package: root_path: {root_path}'
192+
f'sibling_license_detections: {sibling_license_detections}'
193+
)
187194

188-
pkg_detections = codebase_package["license_detections"]
189-
for pkg_detection in pkg_detections:
195+
for sibling_detection in sibling_license_detections:
190196
modified = True
191197
detection_modified = True
192-
matches.extend(pkg_detection["matches"])
193-
198+
matches.extend(sibling_detection["matches"])
199+
analysis = DetectionCategory.UNKNOWN_REFERENCE_IN_FILE_TO_NONEXISTENT_PACKAGE.value
200+
201+
else:
202+
for_packages = resource.for_packages
203+
for package_uid in for_packages:
204+
205+
for codebase_package in codebase_packages:
206+
if codebase_package["package_uid"] == package_uid:
207+
break
208+
209+
pkg_detections = codebase_package["license_detections"]
210+
for pkg_detection in pkg_detections:
211+
modified = True
212+
detection_modified = True
213+
matches.extend(pkg_detection["matches"])
214+
analysis=DetectionCategory.UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE.value
215+
194216
if not detection_modified:
195217
continue
196218

197219
reasons, license_expression = get_detected_license_expression(
198220
matches=matches,
199-
analysis=DetectionCategory.UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE.value,
221+
analysis=analysis,
200222
post_scan=True,
201223
)
202224
detection["license_expression"] = str(license_expression)
@@ -249,24 +271,35 @@ def add_license_from_sibling_file(resource, codebase, no_licenses):
249271
package["license_detections"] = license_detections
250272
package["declared_license_expression"] = license_expression
251273
package["declared_license_expression_spdx"] = str(build_spdx_license_expression(
252-
license_expression=pkg["declared_license_expression"],
274+
license_expression=package["declared_license_expression"],
253275
licensing=get_cache().licensing,
254276
))
255277

256278
codebase.save_resource(resource)
257279
return package
258280

259281

282+
def is_legal_or_readme(resource):
283+
is_legal = check_resource_name_start_and_end(resource=resource, STARTS_ENDS=LEGAL_STARTS_ENDS)
284+
is_readme = check_resource_name_start_and_end(resource=resource, STARTS_ENDS=README_STARTS_ENDS)
285+
if is_legal or is_readme:
286+
return True
287+
288+
return False
289+
290+
260291
def get_license_detections_from_sibling_file(resource, codebase, no_licenses):
261292

262293
siblings = []
263294

264295
if resource.has_parent():
265296
for sibling in resource.siblings(codebase):
266-
is_legal = check_resource_name_start_and_end(resource=sibling, STARTS_ENDS=LEGAL_STARTS_ENDS)
267-
is_readme = check_resource_name_start_and_end(resource=sibling, STARTS_ENDS=README_STARTS_ENDS)
268-
if is_legal or is_readme:
297+
if is_legal_or_readme(resource=sibling):
269298
siblings.append(sibling)
299+
elif resource.has_children:
300+
for child in resource.children(codebase):
301+
if is_legal_or_readme(resource=child):
302+
siblings.append(child)
270303

271304
if not siblings:
272305
return [], None
@@ -287,7 +320,10 @@ def get_license_detections_from_sibling_file(resource, codebase, no_licenses):
287320
if not license_detections:
288321
return [], None
289322

290-
license_expression = get_license_expression_from_detection_mappings(license_detections)
323+
license_expression = get_license_expression_from_detection_mappings(
324+
detections=license_detections,
325+
valid_expression=False,
326+
)
291327
return license_detections, license_expression
292328

293329

@@ -383,9 +419,14 @@ def get_license_expression_from_matches(license_matches, relation='AND', unique=
383419
if not license_matches:
384420
return
385421

386-
license_expressions = [
387-
match.rule.license_expression for match in license_matches
388-
]
422+
if type(license_matches[0]) == dict:
423+
license_expressions = [
424+
match['license_expression'] for match in license_matches
425+
]
426+
else:
427+
license_expressions = [
428+
match.rule.license_expression for match in license_matches
429+
]
389430

390431
if len(license_expressions) == 1:
391432
license_expression = str(license_expressions[0])
@@ -397,10 +438,22 @@ def get_license_expression_from_matches(license_matches, relation='AND', unique=
397438
return license_expression
398439

399440

400-
def get_license_expression_from_detection_mappings(detections, relation='AND', unique=True):
441+
def get_license_expression_from_detection_mappings(
442+
detections,
443+
relation='AND',
444+
unique=True,
445+
valid_expression=False,
446+
):
401447

402448
expressions = []
403449
for detection in detections:
450+
if valid_expression:
451+
if not detection["license_expression"]:
452+
expressions.append(
453+
get_license_expression_from_matches(detection["matches"])
454+
)
455+
continue
456+
404457
expressions.append(detection["license_expression"])
405458

406459
return str(

src/packagedcode/plugin_package.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):
198198
for resource in codebase.walk(topdown=False):
199199
# If there is an unknown reference to a package we add the license
200200
# from the package license detection
201-
modified = list(add_referenced_license_detection_from_package(resource, codebase))
201+
modified = list(add_referenced_license_detection_from_package(resource, codebase, no_licenses))
202202
if TRACE and modified:
203203
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_from_package: modified: {modified}')
204204

@@ -243,7 +243,10 @@ def add_license_from_file(resource, codebase, no_licenses):
243243
else:
244244
detection["detection_rules"].append(DetectionRule.PACKAGE_ADD_FROM_FILE.value)
245245

246-
license_expression = get_license_expression_from_detection_mappings(license_detections_file)
246+
license_expression = get_license_expression_from_detection_mappings(
247+
detections=license_detections_file,
248+
valid_expression=True
249+
)
247250
pkg["declared_license_expression"] = license_expression
248251
pkg["declared_license_expression_spdx"] = str(build_spdx_license_expression(
249252
license_expression=license_expression,

0 commit comments

Comments
 (0)