Improve score by supporting extra_phrase for extra words in rules #4432

Open · alok1304 wants to merge 13 commits into base: develop
2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/bsd-new_158.RULE
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

Neither the name of nor the names of its
Neither the name of [[6]] nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/bsd-new_newlib3.RULE
@@ -11,7 +11,7 @@ are permitted provided that the following conditions are met:
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University nor the names of its contributors
* Neither the name of the [[3]] University nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

41 changes: 37 additions & 4 deletions src/licensedcode/detection.py
@@ -30,6 +30,7 @@
from licensedcode.cache import get_licensing
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.match import is_extra_words_position_valid
from licensedcode.models import compute_relevance
from licensedcode.models import Rule
from licensedcode.models import UnDetectedRule
@@ -110,6 +111,7 @@ class DetectionCategory(Enum):
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
PACKAGE_ADD_FROM_FILE = 'from-package-file'
EXTRA_WORDS = 'extra-words'
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
UNKNOWN_MATCH = 'unknown-match'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
@@ -129,6 +131,7 @@ class DetectionRule(Enum):
"""
UNKNOWN_MATCH = 'unknown-match'
EXTRA_WORDS = 'extra-words'
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -405,8 +408,12 @@ def score(self):
by the length of a match to the overall detection length.
"""
length = self.length
weighted_scores = (m.score() * (m.len() / length) for m in self.matches)
return min([round(sum(weighted_scores), 2), 100])
weighted_scores = 0
for m in self.matches:
# Check whether extra words in the matched text appear at allowed positions
# and do not exceed the maximum allowed word count at those positions; such
# matches count as a full score of 100.
score = 100 if is_extra_words_position_valid(m) else m.score()
weighted_scores += score * (m.len() / length)
return min([round(weighted_scores, 2), 100])
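As a small worked example with hypothetical numbers: a detection made of two matches of 150 and 50 matched tokens, where the first has only permitted extra words (so it counts as 100) and the second scores 99.53, would be weighted like this:

```python
length = 200  # total matched length of the detection
weighted = 100 * (150 / length) + 99.53 * (50 / length)
round(weighted, 2)  # 99.88
```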

def append(
self,
@@ -1072,6 +1079,7 @@ def is_correct_detection_non_unknown(license_matches):
is_correct_detection(license_matches)
and not has_unknown_matches(license_matches)
and not has_extra_words(license_matches)
and not is_extra_words_at_valid_positions(license_matches)
)


@@ -1087,7 +1095,7 @@ def is_correct_detection(license_matches):
]

return (
all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers)
all(matcher in ("1-hash", "1-spdx-id", "2-aho", "3-seq") for matcher in matchers)
@alok1304 (Collaborator, Author) commented on Jul 2, 2025:

I added "3-seq" because the extra-phrase markers are removed from rule texts while loading, but they remain in the rule files on disk. When we scan those same rule files, the matcher is now "3-seq" rather than one of "1-hash", "1-spdx-id", or "2-aho", because the on-disk text still contains the extra-phrase markers while the loaded, indexed rule text does not.
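As a rough illustration of that difference (hypothetical rule text; `extra_phrase_removal_pattern` is the regex added to tokenize.py in this PR), the loaded and indexed rule text has the marker stripped while the on-disk rule file keeps it:

```python
from licensedcode.tokenize import extra_phrase_removal_pattern

# text as stored in the rule file on disk (keeps the [[3]] marker)
on_disk = 'Neither the name of the [[3]] University nor the names of its contributors'

# text as tokenized for the index (marker stripped); scanning the on-disk rule
# file therefore no longer yields an exact 1-hash/2-aho match and falls back to 3-seq
indexed = extra_phrase_removal_pattern.sub('', on_disk)
assert '[[3]]' not in indexed
```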

and all(is_match_coverage_perfect)
)

@@ -1159,6 +1167,21 @@ def has_low_rule_relevance(license_matches):
)


def is_extra_words_at_valid_positions(license_matches):
"""
Return True if all the matches in the ``license_matches`` list of LicenseMatch
that have extra words have them at valid, allowed positions.
"""
Member commented:

We need to check both cases a bit more explicitly:

  1. For all the matches which have extra words, the extra words are at the correct locations.
  2. For all the matches which do not have extra words, they are correct detections.
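A rough sketch (not the code in this PR) of how those two checks could be expressed together, reusing helpers already present in detection.py; the exact conditions are assumptions:

```python
def is_extra_words_at_valid_positions(license_matches):
    for match in license_matches:
        if calculate_query_coverage_coefficient(match) > 0:
            # 1. the match has extra words: they must sit at an allowed [[n]] position
            if not is_extra_words_position_valid(match):
                return False
        elif not is_correct_detection([match]):
            # 2. the match has no extra words: it must be a correct detection on its own
            return False
    return True
```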

Member commented:

And add a test accordingly

@alok1304 (Collaborator, Author) replied:

> And add a test accordingly

Where should I add a test, and how should I implement it for all license_matches?
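One possible starting point is a direct unit test of `get_extra_phrase_spans`, alongside a detection-level expectation file like the one updated in this PR; the test names below are only a sketch, not part of this change:

```python
from licensedcode.spans import Span
from licensedcode.tokenize import get_extra_phrase_spans


def test_get_extra_phrase_spans_returns_span_and_allowed_count():
    text = 'Neither the name of [[6]] nor the names of its contributors'
    # token positions: neither=0 the=1 name=2 of=3 [[6]]=4 nor=5 ...
    assert get_extra_phrase_spans(text) == [(Span([4]), 6)]


def test_get_extra_phrase_spans_ignores_non_digit_markers():
    # a non-numeric value inside [[...]] should not produce a span
    assert get_extra_phrase_spans('name of [[foo]] nor') == []
```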


for match in license_matches:
# check when we have an `extra-words` detection:
# if `query_coverage_coefficient` is a positive number, then `extra-words` exist
if calculate_query_coverage_coefficient(match) > 0:
if not is_extra_words_position_valid(match):
return False

# return True if all matches either have no extra words or have them at allowed positions
return True

def is_false_positive(license_matches, package_license=False):
"""
Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1570,6 +1593,12 @@ def get_detected_license_expression(
detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
return detection_log, combined_expression

elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}')
matches_for_expression = license_matches
detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value)

elif analysis == DetectionCategory.EXTRA_WORDS.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
@@ -1810,7 +1839,11 @@ def analyze_detection(license_matches, package_license=False):

# Case where at least one of the matches has extra words
elif has_extra_words(license_matches=license_matches):
return DetectionCategory.EXTRA_WORDS.value
# Case where `extra-words` are in the right place
if is_extra_words_at_valid_positions(license_matches=license_matches):
return DetectionCategory.EXTRA_WORDS_PERMITTED.value
else:
return DetectionCategory.EXTRA_WORDS.value

# Cases where Match Coverage is a perfect 100 for all matches
else:
75 changes: 74 additions & 1 deletion src/licensedcode/match.py
@@ -826,7 +826,11 @@ def to_dict(
result['start_line'] = self.start_line
result['end_line'] = self.end_line
result['matcher'] = self.matcher
result['score'] = self.score()
# report a full score when the `extra-words` are at allowed positions
if is_extra_words_position_valid(match=self):
result['score'] = 100
else:
result['score'] = self.score()
result['matched_length'] = self.len()
result['match_coverage'] = self.coverage()
result['rule_relevance'] = self.rule.relevance
@@ -1071,6 +1075,75 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
# early from the loops: trying to check containment on wildly separated matches
# does not make sense

def is_extra_words_position_valid(match):
"""
Return True if the extra words appear in valid positions and
do not exceed the maximum allowed word count at those positions.
Otherwise, return False.
"""
# Compute the `query_coverage_coefficient` to determine whether the match has `extra-words`
score_coverage_relevance = (
match.coverage() * match.rule.relevance
) / 100

# Calculate the query coverage coefficient
query_coverage_coefficient = score_coverage_relevance - match.score()

# Return False if the match has no extra words
if query_coverage_coefficient == 0:
return False

matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
rule_tokens = list(index_tokenizer(match.rule.text))
extra_phrase_spans = match.rule.extra_phrase_spans

if not extra_phrase_spans:
return False

# Count of `extra-words` tokens, i.e. tokens inserted into `matched_tokens`
matched_count = 0

# Count of extra phrase markers
extra_phrase_count = 0

rule_index = 0
matched_index = 0

for span, allowed_extra_word in extra_phrase_spans:
rule_index = span.start

matched_index = span.start + matched_count - extra_phrase_count
extra_words_count = 0

# return False if the token just before the `extra-words` in `matched_tokens` differs
# from the token just before the `extra-phrase` marker in `rule_tokens`
if matched_tokens[matched_index - 1] != rule_tokens[rule_index - 1]:
return False

# Count how many tokens in `matched_text` do not match the next rule token
while (matched_index < len(matched_tokens) and
matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
matched_index += 1
matched_count += 1
extra_words_count += 1

if extra_words_count > allowed_extra_word:
return False

extra_phrase_count += 1

rule_index += 1

# walk the remaining tokens after the last marker; any leftover `extra-words`
# here are not at a marked position, so the match is rejected below
while (matched_index < len(matched_tokens) and
matched_tokens[matched_index] == rule_tokens[rule_index]):
matched_index += 1
rule_index += 1

if matched_index != len(matched_tokens):
return False

return True
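A standalone, purely illustrative trace of the walk above on plain token lists (hypothetical texts loosely based on the bsd-new_newlib3 rule changed in this PR); it does not build a real LicenseMatch and simplifies the span positions:

```python
rule_tokens = 'neither the name of the university nor the names'.split()
matched_tokens = 'neither the name of the acme example university nor the names'.split()

# the [[3]] marker sits right before "university": up to 3 extra words are allowed there
marker_pos, allowed = 5, 3

extra_words = 0
i = marker_pos
# walk the matched tokens until they realign with the rule token at the marker position
while matched_tokens[i] != rule_tokens[marker_pos]:
    i += 1
    extra_words += 1

assert extra_words == 2 and extra_words <= allowed  # "acme example" is permitted
```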


def filter_contained_matches(
matches,
26 changes: 26 additions & 0 deletions src/licensedcode/models.py
@@ -43,6 +43,7 @@
from licensedcode.tokenize import index_tokenizer
from licensedcode.tokenize import index_tokenizer_with_stopwords
from licensedcode.tokenize import query_lines
from licensedcode.tokenize import get_extra_phrase_spans
from scancode.api import SCANCODE_LICENSEDB_URL
from scancode.api import SCANCODE_LICENSE_URL
from scancode.api import SCANCODE_RULE_URL
@@ -1683,6 +1684,17 @@ class BasicRule:
)
)

extra_phrase_spans = attr.ib(
default=attr.Factory(list),
repr=False,
metadata=dict(
help='List of tuples `(Span, int)` representing extra phrases for this rule. '
'Each tuple contains a Span of token positions in the rule text and an integer '
'indicating the maximum number of extra tokens allowed at that position. '
'Extra phrases are enclosed in [[double square brackets]] in the rule text.'
)
)

source = attr.ib(
default=None,
repr=False,
@@ -2317,6 +2329,9 @@ def tokens(self):
"is_continuous", "minimum_coverage" and "stopword_by_pos" are
recomputed as a side effect.
"""

# identify and capture the spans of extra phrases specified within the rule
self.extra_phrase_spans = list(self.extra_phrases())

text = self.text
# We tag this rule as being a bare URL if it starts with a scheme and is
@@ -2353,6 +2368,17 @@ def _set_continuous(self):
):
self.is_continuous = True

def extra_phrases(self):
"""
Return an iterable of `(Span, int)` tuples marking the positions of extra phrases in the rule text.

Each tuple consists of:
- a `Span` object representing the position in the tokenized rule text, and
- an integer `n` indicating how many extra tokens are allowed at that position.
"""
if self.text:
yield from get_extra_phrase_spans(self.text)

def build_required_phrase_spans(self):
"""
Return a list of Spans marking required phrases token positions of that must
72 changes: 72 additions & 0 deletions src/licensedcode/tokenize.py
@@ -81,12 +81,82 @@ def query_lines(
required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall


extra_phrase_pattern = '(?:' + query_pattern + r'|\[\[|\]\])'
extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall


# pattern to match and remove extra phrases like [[1]], [[4]]..etc from the text
extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]')

REQUIRED_PHRASE_OPEN = '{{'
REQUIRED_PHRASE_CLOSE = '}}'

EXTRA_PHRASE_OPEN = '[['
EXTRA_PHRASE_CLOSE = ']]'

# FIXME: this should be folded in a single pass tokenization with the index_tokenizer


def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
"""
Yield tokens from a rule ``text``, including extra phrase [[n]] markers.
Here `n` denotes the maximum number of extra words that are valid at that position.
This otherwise behaves the same as ``required_phrase_tokenizer``.
"""
if not text:
return
if not preserve_case:
text = text.lower()

for token in extra_phrase_splitter(text):
if token and token not in stopwords:
yield token


def get_extra_phrase_spans(text):
"""
Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase found in ``text``.
Here, `n` should always be a digit token inside the extra phrase brackets.

Example:
>>> text = 'Neither the name [[3]] of nor the names of its'
>>> # 0 1 2 3 4 5 6 7 8 9
>>> x = get_extra_phrase_spans(text)
>>> assert x == [(Span([3]), 3)], x
"""
ipos = 0
in_extra_phrase = False
current_phrase_value = []
extra_phrase_spans = []

for token in extra_phrase_tokenizer(text):
if token == EXTRA_PHRASE_OPEN:
in_extra_phrase = True
current_phrase_value = []
continue

elif token == EXTRA_PHRASE_CLOSE:
if in_extra_phrase:
# the token must be a digit enclosed in double square brackets ``[[token]]``,
# and there must be exactly one token between the brackets
if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit():
extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))

in_extra_phrase = False
current_phrase_value = []
continue

if in_extra_phrase:
# keep only the first token found after the opening double square bracket ``[[``
if len(current_phrase_value) == 0:
current_phrase_value.append(token)

ipos += 1

return extra_phrase_spans


def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
"""
Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
@@ -282,6 +352,8 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS):
"""
if not text:
return [], {}

text = extra_phrase_removal_pattern.sub('', text)

tokens = []
tokens_append = tokens.append
@@ -6,7 +6,7 @@
"license_expression_spdx": "BSD-3-Clause",
"detection_count": 1,
"detection_log": [
"extra-words"
"extra-words-permitted-in-rule"
],
"reference_matches": [
{
@@ -16,7 +16,7 @@
"start_line": 4,
"end_line": 27,
"matcher": "2-aho",
"score": 99.53,
"score": 100,
"matched_length": 210,
"match_coverage": 100.0,
"rule_relevance": 100,
@@ -46,7 +46,7 @@
"start_line": 4,
"end_line": 27,
"matcher": "2-aho",
"score": 99.53,
"score": 100,
"matched_length": 210,
"match_coverage": 100.0,
"rule_relevance": 100,
@@ -57,7 +57,7 @@
}
],
"detection_log": [
"extra-words"
"extra-words-permitted-in-rule"
],
"identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50"
}