-
-
Notifications
You must be signed in to change notification settings - Fork 598
Improve score by supporting extra_phrase
for extra words in rules
#4432
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
685a16f
760eba3
b784806
88196da
f80963c
6ad990e
5f090e3
bf3417d
61284f4
490a081
060fd63
68ab63d
5b933c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,7 @@ | |
from licensedcode.cache import get_licensing | ||
from licensedcode.match import LicenseMatch | ||
from licensedcode.match import set_matched_lines | ||
from licensedcode.match import is_extra_words_position_valid | ||
from licensedcode.models import compute_relevance | ||
from licensedcode.models import Rule | ||
from licensedcode.models import UnDetectedRule | ||
|
@@ -110,6 +111,7 @@ class DetectionCategory(Enum): | |
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file' | ||
PACKAGE_ADD_FROM_FILE = 'from-package-file' | ||
EXTRA_WORDS = 'extra-words' | ||
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule' | ||
UNKNOWN_MATCH = 'unknown-match' | ||
LICENSE_CLUES = 'license-clues' | ||
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches' | ||
|
@@ -129,6 +131,7 @@ class DetectionRule(Enum): | |
""" | ||
UNKNOWN_MATCH = 'unknown-match' | ||
EXTRA_WORDS = 'extra-words' | ||
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule' | ||
LICENSE_CLUES = 'license-clues' | ||
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches' | ||
IMPERFECT_COVERAGE = 'imperfect-match-coverage' | ||
|
@@ -405,8 +408,12 @@ def score(self): | |
by the length of a match to the overall detection length. | ||
""" | ||
length = self.length | ||
weighted_scores = (m.score() * (m.len() / length) for m in self.matches) | ||
return min([round(sum(weighted_scores), 2), 100]) | ||
for m in self.matches: | ||
# Check whether extra words in the matched text appear in allowed positions, | ||
# and do not exceed the maximum allowed word count at those positions. | ||
score = 100 if is_extra_words_position_valid(m) else m.score() | ||
weighted_scores += score * (m.len() / length) | ||
return min([round(weighted_scores, 2), 100]) | ||
|
||
def append( | ||
self, | ||
|
@@ -1072,6 +1079,7 @@ def is_correct_detection_non_unknown(license_matches): | |
is_correct_detection(license_matches) | ||
and not has_unknown_matches(license_matches) | ||
and not has_extra_words(license_matches) | ||
and not is_extra_words_at_valid_positions(license_matches) | ||
) | ||
|
||
|
||
|
@@ -1087,7 +1095,7 @@ def is_correct_detection(license_matches): | |
] | ||
|
||
return ( | ||
all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers) | ||
all(matcher in ("1-hash", "1-spdx-id", "2-aho", "3-seq") for matcher in matchers) | ||
and all(is_match_coverage_perfect) | ||
) | ||
|
||
|
@@ -1159,6 +1167,21 @@ def has_low_rule_relevance(license_matches): | |
) | ||
|
||
|
||
def is_extra_words_at_valid_positions(license_matches): | ||
""" | ||
Return True if all the matches in `license_matches List of LicenseMatch | ||
has extra words are in the correct place. | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to check both a bit explicitly:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And add a test accordingly There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
where I should add a test and like how I implement for all There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also i added test for |
||
for match in license_matches: | ||
# check when we have `extra-words` detection | ||
# if `query_coverage_coefficient` is positive number then 'extra-words` exit | ||
if calculate_query_coverage_coefficient(match) > 0: | ||
if not is_extra_words_position_valid(match): | ||
return False | ||
|
||
# at the end return True if all matches have no extra-wors or this extra-words are in the right place | ||
return True | ||
|
||
def is_false_positive(license_matches, package_license=False): | ||
""" | ||
Return True if all of the matches in ``license_matches`` List of LicenseMatch | ||
|
@@ -1570,6 +1593,12 @@ def get_detected_license_expression( | |
detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value) | ||
return detection_log, combined_expression | ||
|
||
elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value: | ||
if TRACE_ANALYSIS: | ||
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}') | ||
matches_for_expression = license_matches | ||
detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value) | ||
|
||
elif analysis == DetectionCategory.EXTRA_WORDS.value: | ||
if TRACE_ANALYSIS: | ||
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}') | ||
|
@@ -1810,7 +1839,11 @@ def analyze_detection(license_matches, package_license=False): | |
|
||
# Case where at least one of the match have extra words | ||
elif has_extra_words(license_matches=license_matches): | ||
return DetectionCategory.EXTRA_WORDS.value | ||
# Case where `extra-words` are in the right place | ||
if is_extra_words_at_valid_positions(license_matches=license_matches): | ||
return DetectionCategory.EXTRA_WORDS_PERMITTED.value | ||
else: | ||
return DetectionCategory.EXTRA_WORDS.value | ||
|
||
# Cases where Match Coverage is a perfect 100 for all matches | ||
else: | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i add
3-seq
because ofextra-phrase
is removed from rules while loading, after that when we scan same rules, now matcher is3-seq
not any one of these"1-hash", "1-spdx-id", "2-aho"
this is due toextra-phrase
marker that are present in rules. That's why this gives3-seq
. We already loaded the rules whereextra-phrase
is removed, but not in the actual rule file.