Skip to content

Commit f0213ec

Browse files
Update license clues heuristics
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent b1c999f commit f0213ec

File tree

22 files changed

+5165
-2960
lines changed

22 files changed

+5165
-2960
lines changed

src/licensedcode/detection.py

Lines changed: 68 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import attr
1717
from license_expression import combine_expressions
18+
from license_expression import Licensing
1819

1920
from commoncode.resource import clean_path
2021
from licensedcode.cache import get_index
@@ -96,15 +97,16 @@ class DetectionCategory(Enum):
9697
UNKNOWN_MATCH = 'unknown-match'
9798
LICENSE_CLUES = 'license-clues'
9899
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
99-
FALSE_POSITVE = 'false-positive'
100+
FALSE_POSITVE = 'possible-false-positive'
100101
UNDETECTED_LICENSE = 'undetected-license'
101102

102103

103104
class DetectionRule(Enum):
104105
NOT_COMBINED = 'not-combined'
105106
UNKNOWN_MATCH = 'unknown-match'
106107
LICENSE_CLUES = 'license-clues'
107-
FALSE_POSITIVE = 'false-positive'
108+
FALSE_POSITIVE = 'possible-false-positive'
109+
NOT_LICENSE_CLUES = 'not-license-clues-as-more-detections-present'
108110
UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'unknown-reference-to-local-file'
109111
UNKNOWN_INTRO_FOLLOWED_BY_MATCH = 'unknown-intro-followed-by-match'
110112
UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE = 'unknown-reference-in-file-to-package'
@@ -204,7 +206,10 @@ def from_matches(
204206
)
205207

206208
if license_expression == None:
207-
return cls(matches=matches)
209+
return cls(
210+
matches=matches,
211+
detection_log=detection_log,
212+
)
208213

209214
return cls(
210215
matches=matches,
@@ -440,10 +445,10 @@ def get_detections_from_mappings(detection_mappings):
440445

441446

442447
def is_undetected_license_matches(license_matches):
443-
448+
444449
if len(license_matches) != 1:
445450
return False
446-
451+
447452
if license_matches[0].matcher == MATCHER_UNDETECTED:
448453
return True
449454

@@ -453,11 +458,15 @@ def is_correct_detection(license_matches):
453458
Return True if all the matches in `license_matches` List of LicenseMatch
454459
are correct license detections.
455460
"""
456-
#TODO: Add matches with full match coverage
457461
matchers = (license_match.matcher for license_match in license_matches)
462+
is_match_coverage_perfect = [
463+
license_match.coverage() == 100
464+
for license_match in license_matches
465+
]
466+
458467
return (
459-
all(matcher in ("1-hash", "1-spdx-id") for matcher in matchers)
460-
and not has_unknown_matches(license_matches)
468+
all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers)
469+
and all(is_match_coverage_perfect) and not has_unknown_matches(license_matches)
461470
)
462471

463472

@@ -952,15 +961,15 @@ def analyze_detection(license_matches, package_license=False):
952961
elif has_unknown_references_to_local_files(license_matches):
953962
return DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value
954963

955-
# Case where all matches have `matcher` as `1-hash` or `4-spdx-id`
956-
elif is_correct_detection(license_matches):
957-
return DetectionCategory.PERFECT_DETECTION.value
958-
959964
# Case where the match is a false positive
960965
# In package license detection this is turned off
961966
elif not package_license and is_false_positive(license_matches, package_license):
962967
return DetectionCategory.FALSE_POSITVE.value
963968

969+
# Case where all matches have `matcher` as `1-hash`, `1-spdx-id` or `2-aho`, with perfect match coverage and no unknown matches
970+
elif is_correct_detection(license_matches):
971+
return DetectionCategory.PERFECT_DETECTION.value
972+
964973
elif is_license_clues(license_matches):
965974
return DetectionCategory.LICENSE_CLUES.value
966975

@@ -1156,6 +1165,43 @@ def find_referenced_resource(referenced_filename, resource, codebase, **kwargs):
11561165
return resource
11571166

11581167

1168+
def process_detections(detections, licensing=None):
    """
    Yield LicenseDetection objects given a list of LicenseDetection objects
    after postprocessing to include license clues as detections if there are
    other proper detections with the same license keys.

    A detection whose ``license_expression`` is None is treated as a license
    clue. Such a clue is promoted to a detection only when its matches carry
    at least one license key and every one of those keys is also present in
    the license expression of some other detection in ``detections``. A
    promoted clue gets a ``license_expression`` combined from the rule
    expressions of its matches and the ``NOT_LICENSE_CLUES`` rule value
    appended to its ``detection_log``.

    ``licensing`` is the object used to extract license keys from expressions
    and to combine expressions; when None a new ``Licensing()`` is created.

    A list with a single detection is yielded unchanged.
    """
    # Build the default lazily rather than once at definition time, so that
    # no instance is shared implicitly between calls.
    if licensing is None:
        licensing = Licensing()

    if len(detections) == 1:
        yield detections[0]
        return

    # Collect the license keys of all proper detections, i.e. the ones that
    # do have a license expression.
    detected_license_keys = set()
    for detection in detections:
        if detection.license_expression is not None:
            detected_license_keys.update(
                licensing.license_keys(detection.license_expression)
            )

    for detection in detections:
        if detection.license_expression is None:
            # A clue has no expression of its own: its license keys must be
            # collected from the rule expressions of its matches. Asking for
            # the keys of the (None) detection expression would always yield
            # nothing and every clue would be promoted unconditionally.
            clue_expressions = [
                match.rule.license_expression
                for match in detection.matches
            ]
            clue_license_keys = set()
            for clue_expression in clue_expressions:
                clue_license_keys.update(
                    licensing.license_keys(clue_expression)
                )

            # Require at least one key: a clue without any license key has
            # nothing in common with the proper detections.
            if clue_license_keys and clue_license_keys <= detected_license_keys:
                detection.license_expression = str(combine_expressions(
                    expressions=clue_expressions,
                    unique=True,
                    licensing=licensing,
                ))
                detection.detection_log.append(
                    DetectionRule.NOT_LICENSE_CLUES.value
                )

        yield detection
1203+
1204+
11591205
def detect_licenses(
11601206
index=None,
11611207
location=None,
@@ -1204,10 +1250,15 @@ def detect_licenses(
12041250
if TRACE:
12051251
logger_debug(f"detection: detect_licenses: location: {location}: query_string: {query_string}")
12061252

1253+
detections = []
12071254
for group_of_matches in group_matches(matches):
1208-
yield LicenseDetection.from_matches(
1209-
matches=group_of_matches,
1210-
analysis=analysis,
1211-
post_scan=post_scan,
1212-
package_license=package_license,
1255+
detections.append(
1256+
LicenseDetection.from_matches(
1257+
matches=group_of_matches,
1258+
analysis=analysis,
1259+
post_scan=post_scan,
1260+
package_license=package_license,
1261+
)
12131262
)
1263+
1264+
yield from process_detections(detections)

src/packagedcode/debian_copyright.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -950,12 +950,13 @@ def get_license_detections_mapping(self):
950950
if not license_matches:
951951
continue
952952

953-
detection = LicenseDetection.from_matches(
954-
license_matches
953+
detection_objects.append(
954+
LicenseDetection.from_matches(
955+
matches=license_matches,
956+
package_license=True,
957+
)
955958
)
956959

957-
detection_objects.append(detection)
958-
959960
detections_mapping, _expression = get_mapping_and_expression_from_detections(
960961
license_detections=detection_objects,
961962
whole_lines=False,

tests/licensedcode/data/plugin_license/scan/ffmpeg-license.expected.json

Lines changed: 49 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
{
44
"path": "ffmpeg-LICENSE.md",
55
"type": "file",
6-
"detected_license_expression": "(lgpl-2.1-plus AND other-permissive AND gpl-2.0-plus) AND (lgpl-3.0 AND lgpl-3.0-plus AND (lgpl-3.0 AND gpl-3.0)) AND (ijg AND mit) AND gpl-1.0-plus AND (gpl-2.0 AND apache-2.0 AND lgpl-3.0-plus) AND (gpl-2.0 AND lgpl-2.0-plus AND proprietary-license)",
7-
"detected_license_expression_spdx": "(LGPL-2.1-or-later AND LicenseRef-scancode-other-permissive AND GPL-2.0-or-later) AND (LGPL-3.0-only AND LGPL-3.0-or-later AND (LGPL-3.0-only AND GPL-3.0-only)) AND (IJG AND MIT) AND GPL-1.0-or-later AND (GPL-2.0-only AND Apache-2.0 AND LGPL-3.0-or-later) AND (GPL-2.0-only AND LGPL-2.0-or-later AND LicenseRef-scancode-proprietary-license)",
6+
"detected_license_expression": "(lgpl-2.1-plus AND other-permissive AND gpl-2.0-plus) AND gpl-1.0-plus AND (lgpl-3.0 AND lgpl-3.0-plus AND (lgpl-3.0 AND gpl-3.0)) AND (ijg AND mit) AND (gpl-2.0 AND apache-2.0 AND lgpl-3.0-plus) AND (gpl-2.0 AND lgpl-2.0-plus AND proprietary-license)",
7+
"detected_license_expression_spdx": "(LGPL-2.1-or-later AND LicenseRef-scancode-other-permissive AND GPL-2.0-or-later) AND GPL-1.0-or-later AND (LGPL-3.0-only AND LGPL-3.0-or-later AND (LGPL-3.0-only AND GPL-3.0-only)) AND (IJG AND MIT) AND (GPL-2.0-only AND Apache-2.0 AND LGPL-3.0-or-later) AND (GPL-2.0-only AND LGPL-2.0-or-later AND LicenseRef-scancode-proprietary-license)",
88
"license_detections": [
99
{
1010
"license_expression": "lgpl-2.1-plus AND other-permissive AND gpl-2.0-plus",
@@ -86,6 +86,52 @@
8686
}
8787
]
8888
},
89+
{
90+
"license_expression": "gpl-1.0-plus",
91+
"detection_log": [
92+
"possible-false-positive",
93+
"not-license-clues-as-more-detections-present"
94+
],
95+
"matches": [
96+
{
97+
"score": 50.0,
98+
"start_line": 18,
99+
"end_line": 18,
100+
"matched_length": 1,
101+
"match_coverage": 100.0,
102+
"matcher": "2-aho",
103+
"license_expression": "gpl-1.0-plus",
104+
"rule_identifier": "gpl_bare_word_only.RULE",
105+
"referenced_filenames": [],
106+
"is_license_text": false,
107+
"is_license_notice": false,
108+
"is_license_reference": true,
109+
"is_license_tag": false,
110+
"is_license_intro": false,
111+
"rule_length": 1,
112+
"rule_relevance": 50,
113+
"matched_text": " libavcodec/x86/flac_dsp_gpl.asm",
114+
"licenses": [
115+
{
116+
"key": "gpl-1.0-plus",
117+
"name": "GNU General Public License 1.0 or later",
118+
"short_name": "GPL 1.0 or later",
119+
"category": "Copyleft",
120+
"is_exception": false,
121+
"is_unknown": false,
122+
"owner": "Free Software Foundation (FSF)",
123+
"homepage_url": "http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html",
124+
"text_url": "http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html",
125+
"reference_url": "https://scancode-licensedb.aboutcode.org/gpl-1.0-plus",
126+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/gpl-1.0-plus.LICENSE",
127+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/gpl-1.0-plus.yml",
128+
"spdx_license_key": "GPL-1.0-or-later",
129+
"spdx_url": "https://spdx.org/licenses/GPL-1.0-or-later"
130+
}
131+
]
132+
}
133+
]
134+
},
89135
{
90136
"license_expression": "lgpl-3.0 AND lgpl-3.0-plus AND (lgpl-3.0 AND gpl-3.0)",
91137
"detection_log": [
@@ -664,45 +710,7 @@
664710
]
665711
}
666712
],
667-
"license_clues": [
668-
{
669-
"score": 50.0,
670-
"start_line": 18,
671-
"end_line": 18,
672-
"matched_length": 1,
673-
"match_coverage": 100.0,
674-
"matcher": "2-aho",
675-
"license_expression": "gpl-1.0-plus",
676-
"rule_identifier": "gpl_bare_word_only.RULE",
677-
"referenced_filenames": [],
678-
"is_license_text": false,
679-
"is_license_notice": false,
680-
"is_license_reference": true,
681-
"is_license_tag": false,
682-
"is_license_intro": false,
683-
"rule_length": 1,
684-
"rule_relevance": 50,
685-
"matched_text": " libavcodec/x86/flac_dsp_gpl.asm",
686-
"licenses": [
687-
{
688-
"key": "gpl-1.0-plus",
689-
"name": "GNU General Public License 1.0 or later",
690-
"short_name": "GPL 1.0 or later",
691-
"category": "Copyleft",
692-
"is_exception": false,
693-
"is_unknown": false,
694-
"owner": "Free Software Foundation (FSF)",
695-
"homepage_url": "http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html",
696-
"text_url": "http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html",
697-
"reference_url": "https://scancode-licensedb.aboutcode.org/gpl-1.0-plus",
698-
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/gpl-1.0-plus.LICENSE",
699-
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/gpl-1.0-plus.yml",
700-
"spdx_license_key": "GPL-1.0-or-later",
701-
"spdx_url": "https://spdx.org/licenses/GPL-1.0-or-later"
702-
}
703-
]
704-
}
705-
],
713+
"license_clues": [],
706714
"percentage_of_license_text": 34.96,
707715
"scan_errors": []
708716
}

0 commit comments

Comments
 (0)