@@ -94,6 +94,9 @@ class DetectionCategory(Enum):
94
94
95
95
class DetectionRule (Enum ):
96
96
NOT_COMBINED = 'not-combined'
97
+ UNKNOWN_MATCH = 'unknown-match'
98
+ LICENSE_CLUES = 'license-clues'
99
+ FALSE_POSITIVE = 'false-positive'
97
100
UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'unknown-reference-to-local-file'
98
101
UNKNOWN_INTRO_FOLLOWED_BY_MATCH = 'unknown-intro-followed-by-match'
99
102
CONTAINED_SAME_LICENSE = 'contained-with-same-license'
@@ -165,15 +168,24 @@ class LicenseDetection:
165
168
)
166
169
167
170
@classmethod
168
- def from_matches (cls , matches , analysis = None , post_scan = False ):
171
+ def from_matches (
172
+ cls ,
173
+ matches ,
174
+ analysis = None ,
175
+ post_scan = False ,
176
+ package_license = False ,
177
+ ):
169
178
"""
170
179
Return a LicenseDetection created out of `matches` list of LicenseMatch.
171
180
"""
172
181
if not matches :
173
182
return
174
-
183
+
175
184
if analysis is None :
176
- analysis = analyze_detection (matches )
185
+ analysis = analyze_detection (
186
+ license_matches = matches ,
187
+ package_license = package_license
188
+ )
177
189
178
190
reasons , license_expression = get_detected_license_expression (
179
191
matches = matches ,
@@ -438,11 +450,17 @@ def is_correct_detection(license_matches):
438
450
)
439
451
440
452
441
- def is_match_coverage_less_than_threshold (license_matches , threshold ):
453
+ def is_match_coverage_less_than_threshold (license_matches , threshold , any_matches = True ):
442
454
"""
443
455
Return True if any of the matches in `license_matches` List of LicenseMatch
444
456
has a `match_coverage` value below the threshold (a value between 0-100).
445
457
"""
458
+ if not any_matches :
459
+ return not any (
460
+ license_match .coverage () > threshold
461
+ for license_match in license_matches
462
+ )
463
+
446
464
return any (
447
465
license_match .coverage () < threshold
448
466
for license_match in license_matches
@@ -480,35 +498,60 @@ def has_extra_words(license_matches):
480
498
)
481
499
482
500
483
def is_false_positive(license_matches, package_license=False):
    """
    Return True if all of the matches in `license_matches` List of LicenseMatch
    are false positives.

    A false positive occurs when other text/code is falsely matched to a
    license rule.

    Return False when `package_license` is True (the false positive check is
    turned off in that case) or when `license_matches` is empty.
    """
    if package_license:
        return False

    # With no matches there is nothing to flag. This also avoids min() raising
    # ValueError on an empty sequence and all() being vacuously True below.
    if not license_matches:
        return False

    rules = [license_match.rule for license_match in license_matches]
    all_match_rule_length_one = all(rule.length == 1 for rule in rules)

    # A single match whose rule identifier contains 'gpl_bare'.
    if len(license_matches) == 1 and 'gpl_bare' in rules[0].identifier:
        return True

    # Every rule identifier contains 'gpl' and every rule has a length of 1.
    if all_match_rule_length_one and all(
        'gpl' in rule.identifier for rule in rules
    ):
        return True

    # Matches that all start past the start line threshold, with at least
    # one rule whose length is at or under the rule length threshold.
    start_line_region = min(
        license_match.start_line for license_match in license_matches
    )
    if start_line_region > FALSE_POSITIVE_START_LINE_THRESHOLD and any(
        rule.length <= FALSE_POSITIVE_RULE_LENGTH_THRESHOLD for rule in rules
    ):
        return True

    # Every rule is flagged as a license tag and has a length of 1.
    return all_match_rule_length_one and all(
        rule.is_license_tag for rule in rules
    )
512
555
513
556
514
557
def has_unknown_matches (license_matches ):
@@ -531,18 +574,29 @@ def is_unknown_intro(license_match):
531
574
532
575
def is_license_clues(license_matches):
    """
    Return True if the `license_matches` List of LicenseMatch is only a set
    of license clues rather than a correct license detection.

    This is the case when the matches are not a correct detection and either
    contain unknown matches or have no match with a coverage above
    CLUES_MATCH_COVERAGE_THR.
    """
    # A correct detection is never reported as clues.
    if is_correct_detection(license_matches):
        return False

    return has_unknown_matches(license_matches) or (
        is_match_coverage_less_than_threshold(
            license_matches=license_matches,
            threshold=CLUES_MATCH_COVERAGE_THR,
            any_matches=False,
        )
    )
542
588
543
589
544
590
def has_unknown_intro_before_detection (license_matches ):
545
591
592
+ if len (license_matches ) == 1 :
593
+ return False
594
+
595
+ if all ([
596
+ is_unknown_intro (match ) for match in license_matches
597
+ ]):
598
+ return False
599
+
546
600
has_unknown_intro = False
547
601
has_unknown_intro_before_detection = False
548
602
@@ -552,7 +606,21 @@ def has_unknown_intro_before_detection(license_matches):
552
606
continue
553
607
554
608
if has_unknown_intro :
555
- has_unknown_intro_before_detection = True
609
+ if not is_match_coverage_less_than_threshold (
610
+ [match ], IMPERFECT_MATCH_COVERAGE_THR
611
+ ) and not has_unknown_matches ([match ]):
612
+ has_unknown_intro_before_detection = True
613
+ return has_unknown_intro_before_detection
614
+
615
+ if has_unknown_intro :
616
+ filtered_matches = filter_license_intros (license_matches )
617
+ if license_matches != filtered_matches :
618
+ if is_match_coverage_less_than_threshold (
619
+ license_matches = filtered_matches ,
620
+ threshold = IMPERFECT_MATCH_COVERAGE_THR ,
621
+ any_matches = False ,
622
+ ):
623
+ has_unknown_intro_before_detection = True
556
624
557
625
return has_unknown_intro_before_detection
558
626
@@ -568,7 +636,11 @@ def filter_license_intros(license_matches):
568
636
license notice. In these cases, the license introduction can be discarded as
569
637
this is for the license match that follows it.
570
638
"""
571
- return [match for match in license_matches if not is_license_intro (match )]
639
+ filtered_matches = [match for match in license_matches if not is_license_intro (match )]
640
+ if not filtered_matches :
641
+ return license_matches
642
+ else :
643
+ return filtered_matches
572
644
573
645
574
646
def is_license_intro (license_match ):
@@ -592,15 +664,26 @@ def is_license_reference_local_file(license_match):
592
664
Return True if `license_match` LicenseMatch dict has a non-empty `referenced_filename`,
593
665
i.e. contains a license reference to a local file.
594
666
"""
595
- return bool (license_match ['referenced_filenames' ])
667
+ if type (license_match ) == dict :
668
+ return bool (license_match ['referenced_filenames' ])
669
+ else :
670
+ return bool (license_match .rule .referenced_filenames )
596
671
597
672
598
673
def filter_license_references(license_matches):
    """
    Return a list of LicenseMatch objects from ``license_matches`` without the
    matches that are references to local files with licenses.

    If every match is such a reference, return ``license_matches`` unchanged
    instead of an empty list.
    """
    filtered_matches = []
    for match in license_matches:
        if is_license_reference_local_file(match):
            continue
        filtered_matches.append(match)

    if TRACE:
        logger_debug(f"detection: filter_license_references: license_matches: { license_matches } : filtered_matches: { filtered_matches } ")

    # Fall back to the unfiltered matches when nothing survives the filter.
    return filtered_matches or license_matches
604
687
605
688
606
689
def has_unknown_references_to_local_files (license_matches ):
@@ -615,44 +698,57 @@ def get_detected_license_expression(matches, analysis, post_scan=False):
615
698
Return a tuple of (reasons, combined_expression) by combining a `matches` list of
616
699
LicenseMatch objects using an `analysis` code string.
617
700
"""
701
+ if TRACE :
702
+ logger_debug (f'license_matches { matches } ' , f'package_license { analysis } ' , f'post_scan: { post_scan } ' )
703
+
618
704
matches_for_expression = None
619
705
combined_expression = None
620
706
reasons = []
621
707
622
- if analysis == DetectionCategory .UNDETECTED_LICENSE .value :
708
+ if analysis == DetectionCategory .FALSE_POSITVE .value :
709
+ reasons .append (DetectionRule .FALSE_POSITIVE .value )
710
+ return reasons , combined_expression
711
+
712
+ elif analysis == DetectionCategory .UNDETECTED_LICENSE .value :
623
713
matches_for_expression = matches
624
714
reasons .append (DetectionRule .UNDETECTED_LICENSE .value )
625
715
626
716
elif analysis == DetectionCategory .UNKNOWN_INTRO_BEFORE_DETECTION .value :
627
717
matches_for_expression = filter_license_intros (matches )
628
718
reasons .append (DetectionRule .UNKNOWN_INTRO_FOLLOWED_BY_MATCH .value )
629
719
630
- elif analysis == DetectionCategory .UNKNOWN_FILE_REFERENCE_LOCAL .value and post_scan :
631
- matches_for_expression = filter_license_references (matches )
632
- reasons .append (DetectionRule .UNKNOWN_REFERENCE_TO_LOCAL_FILE .value )
720
+ elif post_scan :
721
+ if analysis == DetectionCategory .UNKNOWN_FILE_REFERENCE_LOCAL .value :
722
+ matches_for_expression = filter_license_references (matches )
723
+ reasons .append (DetectionRule .UNKNOWN_REFERENCE_TO_LOCAL_FILE .value )
633
724
634
- elif analysis == DetectionCategory .PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL .value and post_scan :
635
- matches_for_expression = filter_license_references (matches )
636
- reasons .append (DetectionRule .PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE .value )
725
+ elif analysis == DetectionCategory .PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL .value :
726
+ matches_for_expression = filter_license_references (matches )
727
+ reasons .append (DetectionRule .PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE .value )
637
728
638
- elif analysis == DetectionCategory .PACKAGE_ADD_FROM_SIBLING_FILE and post_scan :
639
- matches_for_expression = filter_license_references (matches )
640
- reasons .append (DetectionRule .PACKAGE_ADD_FROM_SIBLING_FILE .value )
729
+ elif analysis == DetectionCategory .PACKAGE_ADD_FROM_SIBLING_FILE . value :
730
+ matches_for_expression = filter_license_references (matches )
731
+ reasons .append (DetectionRule .PACKAGE_ADD_FROM_SIBLING_FILE .value )
641
732
642
- elif analysis == DetectionCategory .PACKAGE_ADD_FROM_FILE .value and post_scan :
643
- matches_for_expression = filter_license_references (matches )
644
- reasons .append (DetectionRule .PACKAGE_ADD_FROM_FILE .value )
733
+ elif analysis == DetectionCategory .PACKAGE_ADD_FROM_FILE .value :
734
+ matches_for_expression = filter_license_references (matches )
735
+ reasons .append (DetectionRule .PACKAGE_ADD_FROM_FILE .value )
645
736
646
- elif (
647
- analysis == DetectionCategory .UNKNOWN_MATCH .value or
648
- analysis == DetectionCategory .LICENSE_CLUES .value
649
- ):
737
+ elif analysis == DetectionCategory .UNKNOWN_MATCH .value :
738
+ reasons .append (DetectionRule .UNKNOWN_MATCH .value )
739
+ return reasons , combined_expression
740
+
741
+ elif analysis == DetectionCategory .LICENSE_CLUES .value :
742
+ reasons .append (DetectionRule .LICENSE_CLUES .value )
650
743
return reasons , combined_expression
651
744
652
745
else :
653
746
matches_for_expression = matches
654
747
reasons .append (DetectionRule .NOT_COMBINED .value )
655
748
749
+ if TRACE :
750
+ logger_debug (f'matches_for_expression: { matches_for_expression } ' , f'reasons: { reasons } ' )
751
+
656
752
if isinstance (matches [0 ], dict ):
657
753
combined_expression = combine_expressions (
658
754
expressions = [match ['license_expression' ] for match in matches_for_expression ]
@@ -662,6 +758,9 @@ def get_detected_license_expression(matches, analysis, post_scan=False):
662
758
expressions = [match .rule .license_expression for match in matches_for_expression ]
663
759
)
664
760
761
+ if TRACE :
762
+ logger_debug (f'combined_expression { combined_expression } ' )
763
+
665
764
return reasons , combined_expression
666
765
667
766
@@ -792,12 +891,15 @@ def get_license_keys_from_detections(license_detections):
792
891
return list (license_keys )
793
892
794
893
795
- def analyze_detection (license_matches ):
894
+ def analyze_detection (license_matches , package_license = False ):
796
895
"""
797
896
Analyse a list of LicenseMatch objects, and determine if the license detection
798
897
is correct or it is wrong/partially-correct/false-positive/has extra words or
799
898
some other detection case.
800
899
"""
900
+ if TRACE :
901
+ logger_debug (f'license_matches { license_matches } ' , f'package_license { package_license } ' )
902
+
801
903
if is_undetected_license_matches (license_matches ):
802
904
return DetectionCategory .UNDETECTED_LICENSE .value
803
905
@@ -811,9 +913,12 @@ def analyze_detection(license_matches):
811
913
elif is_correct_detection (license_matches ):
812
914
return DetectionCategory .PERFECT_DETECTION .value
813
915
814
- elif is_match_coverage_less_than_threshold (
815
- license_matches , CLUES_MATCH_COVERAGE_THR
816
- ):
916
+ # Case where the match is a false positive
917
+ # In package license detection this is turned off
918
+ elif not package_license and is_false_positive (license_matches , package_license ):
919
+ return DetectionCategory .FALSE_POSITVE .value
920
+
921
+ elif is_license_clues (license_matches ):
817
922
return DetectionCategory .LICENSE_CLUES .value
818
923
819
924
# Case where at least one of the matches have `match_coverage`
@@ -832,10 +937,6 @@ def analyze_detection(license_matches):
832
937
elif has_unknown_matches (license_matches ):
833
938
return DetectionCategory .UNKNOWN_MATCH .value
834
939
835
- # Case where the match is a false positive
836
- elif is_false_positive (license_matches ):
837
- return DetectionCategory .FALSE_POSITVE .value
838
-
839
940
# Cases where Match Coverage is a perfect 100 for all matches
840
941
else :
841
942
return DetectionCategory .PERFECT_DETECTION .value
@@ -1021,6 +1122,7 @@ def detect_licenses(
1021
1122
min_score = 0 ,
1022
1123
deadline = sys .maxsize ,
1023
1124
as_expression = False ,
1125
+ package_license = False ,
1024
1126
** kwargs
1025
1127
):
1026
1128
"""
@@ -1063,5 +1165,6 @@ def detect_licenses(
1063
1165
yield LicenseDetection .from_matches (
1064
1166
matches = group_of_matches ,
1065
1167
analysis = analysis ,
1066
- post_scan = post_scan
1168
+ post_scan = post_scan ,
1169
+ package_license = package_license ,
1067
1170
)
0 commit comments