Skip to content

Commit 02ab56c

Browse files
Update false positives and unknown intro heuristics
* Update false positive detection heuristics
* Update unknown intro detection heuristics
* Update test expectations
* Add ffmpeg License.md as a test for complex licenses

#3113

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent a5c52fa commit 02ab56c

25 files changed

+1327
-367
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Eclipse Public License 2.0. A copy of the license is contained
2+
in the file LICENSE.md and is also available at https://www.eclipse.org/legal/epl-2.0/
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
license_expression: epl-2.0
2+
is_license_notice: yes
3+
referenced_filenames:
4+
- License.md
5+
ignorable_urls:
6+
- https://www.eclipse.org/legal/epl-2.0/
7+

src/licensedcode/detection.py

Lines changed: 147 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ class DetectionCategory(Enum):
9494

9595
class DetectionRule(Enum):
9696
NOT_COMBINED = 'not-combined'
97+
UNKNOWN_MATCH = 'unknown-match'
98+
LICENSE_CLUES = 'license-clues'
99+
FALSE_POSITIVE = 'false-positive'
97100
UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'unknown-reference-to-local-file'
98101
UNKNOWN_INTRO_FOLLOWED_BY_MATCH = 'unknown-intro-followed-by-match'
99102
CONTAINED_SAME_LICENSE = 'contained-with-same-license'
@@ -165,15 +168,24 @@ class LicenseDetection:
165168
)
166169

167170
@classmethod
168-
def from_matches(cls, matches, analysis=None, post_scan=False):
171+
def from_matches(
172+
cls,
173+
matches,
174+
analysis=None,
175+
post_scan=False,
176+
package_license=False,
177+
):
169178
"""
170179
Return a LicenseDetection created out of `matches` list of LicenseMatch.
171180
"""
172181
if not matches:
173182
return
174-
183+
175184
if analysis is None:
176-
analysis=analyze_detection(matches)
185+
analysis = analyze_detection(
186+
license_matches=matches,
187+
package_license=package_license
188+
)
177189

178190
reasons, license_expression = get_detected_license_expression(
179191
matches=matches,
@@ -438,11 +450,17 @@ def is_correct_detection(license_matches):
438450
)
439451

440452

441-
def is_match_coverage_less_than_threshold(license_matches, threshold):
453+
def is_match_coverage_less_than_threshold(license_matches, threshold, any_matches=True):
442454
"""
443455
Return True if any of the matches in `license_matches` List of LicenseMatch
444456
has a `match_coverage` value below the threshold (a value between 0-100).
445457
"""
458+
if not any_matches:
459+
return not any(
460+
license_match.coverage() > threshold
461+
for license_match in license_matches
462+
)
463+
446464
return any(
447465
license_match.coverage() < threshold
448466
for license_match in license_matches
@@ -480,35 +498,60 @@ def has_extra_words(license_matches):
480498
)
481499

482500

483-
def is_false_positive(license_matches, package_license=False):
    """
    Return True if all of the matches in `license_matches` List of LicenseMatch
    are false positives.

    A false positive occurs when other text/code is falsely matched to a
    license rule.

    When `package_license` is true, this check is turned off and False is
    always returned.

    Otherwise the matches are treated as false positives when any one of these
    holds:

    - there is a single match and its rule identifier contains 'gpl_bare'
    - every rule identifier contains 'gpl' and every rule has a length of 1
    - the smallest start line of the matches is above
      FALSE_POSITIVE_START_LINE_THRESHOLD and at least one rule has a length
      at or below FALSE_POSITIVE_RULE_LENGTH_THRESHOLD
    - every rule is a license tag and every rule has a length of 1
    """
    if package_license:
        return False

    # Computed first, as min() is what rejects an empty list of matches.
    first_start_line = min(match.start_line for match in license_matches)

    rules = [match.rule for match in license_matches]
    rule_lengths = [rule.length for rule in rules]
    every_rule_has_length_one = all(length == 1 for length in rule_lengths)

    # A lone match whose rule identifier contains 'gpl_bare'.
    if len(license_matches) == 1 and all(
        'gpl_bare' in rule.identifier for rule in rules
    ):
        return True

    # Only one-length rules whose identifiers all contain 'gpl'.
    if every_rule_has_length_one and all(
        'gpl' in rule.identifier for rule in rules
    ):
        return True

    # Matches starting late in the file where at least one rule is short.
    if first_start_line > FALSE_POSITIVE_START_LINE_THRESHOLD and any(
        length <= FALSE_POSITIVE_RULE_LENGTH_THRESHOLD
        for length in rule_lengths
    ):
        return True

    # Only one-length license tag rules.
    return every_rule_has_length_one and all(
        rule.is_license_tag for rule in rules
    )
512555

513556

514557
def has_unknown_matches(license_matches):
@@ -531,18 +574,29 @@ def is_unknown_intro(license_match):
531574

532575
def is_license_clues(license_matches):
    """
    Return True if the `license_matches` are not part of a correct license
    detection and are mere license clues.

    This holds when the matches are not a correct detection and either
    has_unknown_matches() is true for them, or
    is_match_coverage_less_than_threshold() is true for them when called with
    CLUES_MATCH_COVERAGE_THR as threshold and `any_matches=False`.
    """
    # A correct detection is never reported as clues.
    if is_correct_detection(license_matches):
        return False

    return (
        has_unknown_matches(license_matches)
        or is_match_coverage_less_than_threshold(
            license_matches=license_matches,
            threshold=CLUES_MATCH_COVERAGE_THR,
            any_matches=False,
        )
    )
542588

543589

544590
def has_unknown_intro_before_detection(license_matches):
545591

592+
if len(license_matches) == 1:
593+
return False
594+
595+
if all([
596+
is_unknown_intro(match) for match in license_matches
597+
]):
598+
return False
599+
546600
has_unknown_intro = False
547601
has_unknown_intro_before_detection = False
548602

@@ -552,7 +606,21 @@ def has_unknown_intro_before_detection(license_matches):
552606
continue
553607

554608
if has_unknown_intro:
555-
has_unknown_intro_before_detection = True
609+
if not is_match_coverage_less_than_threshold(
610+
[match], IMPERFECT_MATCH_COVERAGE_THR
611+
) and not has_unknown_matches([match]):
612+
has_unknown_intro_before_detection = True
613+
return has_unknown_intro_before_detection
614+
615+
if has_unknown_intro:
616+
filtered_matches = filter_license_intros(license_matches)
617+
if license_matches != filtered_matches:
618+
if is_match_coverage_less_than_threshold(
619+
license_matches=filtered_matches,
620+
threshold=IMPERFECT_MATCH_COVERAGE_THR,
621+
any_matches=False,
622+
):
623+
has_unknown_intro_before_detection = True
556624

557625
return has_unknown_intro_before_detection
558626

@@ -568,7 +636,11 @@ def filter_license_intros(license_matches):
568636
license notice. In these cases, the license introduction can be discarded as
569637
this is for the license match that follows it.
570638
"""
571-
return [match for match in license_matches if not is_license_intro(match)]
639+
filtered_matches = [match for match in license_matches if not is_license_intro(match)]
640+
if not filtered_matches:
641+
return license_matches
642+
else:
643+
return filtered_matches
572644

573645

574646
def is_license_intro(license_match):
@@ -592,15 +664,26 @@ def is_license_reference_local_file(license_match):
592664
Return True if `license_match` LicenseMatch dict has a non-empty `referenced_filename`,
593665
i.e. contains a license reference to a local file.
594666
"""
595-
return bool(license_match['referenced_filenames'])
667+
if type(license_match) == dict:
668+
return bool(license_match['referenced_filenames'])
669+
else:
670+
return bool(license_match.rule.referenced_filenames)
596671

597672

598673
def filter_license_references(license_matches):
    """
    Return a filtered ``license_matches`` list of LicenseMatch objects removing
    references to local files with licenses.

    If every match is such a reference, the filtered list would be empty: in
    this case return ``license_matches`` as-is instead.
    """
    filtered_matches = []
    for license_match in license_matches:
        if is_license_reference_local_file(license_match):
            continue
        filtered_matches.append(license_match)

    if TRACE:
        logger_debug(f"detection: filter_license_references: license_matches: {license_matches}: filtered_matches: {filtered_matches}")

    # Fall back to the unfiltered matches rather than returning nothing.
    return filtered_matches or license_matches
604687

605688

606689
def has_unknown_references_to_local_files(license_matches):
@@ -615,44 +698,57 @@ def get_detected_license_expression(matches, analysis, post_scan=False):
615698
Return a tuple of (reasons, combined_expression) by combining a `matches` list of
616699
LicenseMatch objects using an `analysis` code string.
617700
"""
701+
if TRACE:
702+
logger_debug(f'license_matches {matches}', f'package_license {analysis}', f'post_scan: {post_scan}')
703+
618704
matches_for_expression = None
619705
combined_expression = None
620706
reasons = []
621707

622-
if analysis == DetectionCategory.UNDETECTED_LICENSE.value:
708+
if analysis == DetectionCategory.FALSE_POSITVE.value:
709+
reasons.append(DetectionRule.FALSE_POSITIVE.value)
710+
return reasons, combined_expression
711+
712+
elif analysis == DetectionCategory.UNDETECTED_LICENSE.value:
623713
matches_for_expression = matches
624714
reasons.append(DetectionRule.UNDETECTED_LICENSE.value)
625715

626716
elif analysis == DetectionCategory.UNKNOWN_INTRO_BEFORE_DETECTION.value:
627717
matches_for_expression = filter_license_intros(matches)
628718
reasons.append(DetectionRule.UNKNOWN_INTRO_FOLLOWED_BY_MATCH.value)
629719

630-
elif analysis == DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value and post_scan:
631-
matches_for_expression = filter_license_references(matches)
632-
reasons.append(DetectionRule.UNKNOWN_REFERENCE_TO_LOCAL_FILE.value)
720+
elif post_scan:
721+
if analysis == DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value:
722+
matches_for_expression = filter_license_references(matches)
723+
reasons.append(DetectionRule.UNKNOWN_REFERENCE_TO_LOCAL_FILE.value)
633724

634-
elif analysis == DetectionCategory.PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL.value and post_scan:
635-
matches_for_expression = filter_license_references(matches)
636-
reasons.append(DetectionRule.PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE.value)
725+
elif analysis == DetectionCategory.PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL.value:
726+
matches_for_expression = filter_license_references(matches)
727+
reasons.append(DetectionRule.PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE.value)
637728

638-
elif analysis == DetectionCategory.PACKAGE_ADD_FROM_SIBLING_FILE and post_scan:
639-
matches_for_expression = filter_license_references(matches)
640-
reasons.append(DetectionRule.PACKAGE_ADD_FROM_SIBLING_FILE.value)
729+
elif analysis == DetectionCategory.PACKAGE_ADD_FROM_SIBLING_FILE.value:
730+
matches_for_expression = filter_license_references(matches)
731+
reasons.append(DetectionRule.PACKAGE_ADD_FROM_SIBLING_FILE.value)
641732

642-
elif analysis == DetectionCategory.PACKAGE_ADD_FROM_FILE.value and post_scan:
643-
matches_for_expression = filter_license_references(matches)
644-
reasons.append(DetectionRule.PACKAGE_ADD_FROM_FILE.value)
733+
elif analysis == DetectionCategory.PACKAGE_ADD_FROM_FILE.value:
734+
matches_for_expression = filter_license_references(matches)
735+
reasons.append(DetectionRule.PACKAGE_ADD_FROM_FILE.value)
645736

646-
elif (
647-
analysis == DetectionCategory.UNKNOWN_MATCH.value or
648-
analysis == DetectionCategory.LICENSE_CLUES.value
649-
):
737+
elif analysis == DetectionCategory.UNKNOWN_MATCH.value:
738+
reasons.append(DetectionRule.UNKNOWN_MATCH.value)
739+
return reasons, combined_expression
740+
741+
elif analysis == DetectionCategory.LICENSE_CLUES.value:
742+
reasons.append(DetectionRule.LICENSE_CLUES.value)
650743
return reasons, combined_expression
651744

652745
else:
653746
matches_for_expression = matches
654747
reasons.append(DetectionRule.NOT_COMBINED.value)
655748

749+
if TRACE:
750+
logger_debug(f'matches_for_expression: {matches_for_expression}', f'reasons: {reasons}')
751+
656752
if isinstance(matches[0], dict):
657753
combined_expression = combine_expressions(
658754
expressions=[match['license_expression'] for match in matches_for_expression]
@@ -662,6 +758,9 @@ def get_detected_license_expression(matches, analysis, post_scan=False):
662758
expressions=[match.rule.license_expression for match in matches_for_expression]
663759
)
664760

761+
if TRACE:
762+
logger_debug(f'combined_expression {combined_expression}')
763+
665764
return reasons, combined_expression
666765

667766

@@ -792,12 +891,15 @@ def get_license_keys_from_detections(license_detections):
792891
return list(license_keys)
793892

794893

795-
def analyze_detection(license_matches):
894+
def analyze_detection(license_matches, package_license=False):
796895
"""
797896
Analyse a list of LicenseMatch objects, and determine if the license detection
798897
is correct or it is wrong/partially-correct/false-positive/has extra words or
799898
some other detection case.
800899
"""
900+
if TRACE:
901+
logger_debug(f'license_matches {license_matches}', f'package_license {package_license}')
902+
801903
if is_undetected_license_matches(license_matches):
802904
return DetectionCategory.UNDETECTED_LICENSE.value
803905

@@ -811,9 +913,12 @@ def analyze_detection(license_matches):
811913
elif is_correct_detection(license_matches):
812914
return DetectionCategory.PERFECT_DETECTION.value
813915

814-
elif is_match_coverage_less_than_threshold(
815-
license_matches, CLUES_MATCH_COVERAGE_THR
816-
):
916+
# Case where the match is a false positive
917+
# In package license detection this is turned off
918+
elif not package_license and is_false_positive(license_matches, package_license):
919+
return DetectionCategory.FALSE_POSITVE.value
920+
921+
elif is_license_clues(license_matches):
817922
return DetectionCategory.LICENSE_CLUES.value
818923

819924
# Case where at least one of the matches have `match_coverage`
@@ -832,10 +937,6 @@ def analyze_detection(license_matches):
832937
elif has_unknown_matches(license_matches):
833938
return DetectionCategory.UNKNOWN_MATCH.value
834939

835-
# Case where the match is a false positive
836-
elif is_false_positive(license_matches):
837-
return DetectionCategory.FALSE_POSITVE.value
838-
839940
# Cases where Match Coverage is a perfect 100 for all matches
840941
else:
841942
return DetectionCategory.PERFECT_DETECTION.value
@@ -1021,6 +1122,7 @@ def detect_licenses(
10211122
min_score=0,
10221123
deadline=sys.maxsize,
10231124
as_expression=False,
1125+
package_license=False,
10241126
**kwargs
10251127
):
10261128
"""
@@ -1063,5 +1165,6 @@ def detect_licenses(
10631165
yield LicenseDetection.from_matches(
10641166
matches=group_of_matches,
10651167
analysis=analysis,
1066-
post_scan=post_scan
1168+
post_scan=post_scan,
1169+
package_license=package_license,
10671170
)

0 commit comments

Comments
 (0)