Improve score by supporting extra_phrase for extra words in rules #4432

Open · alok1304 wants to merge 13 commits into base: develop
2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/bsd-new_158.RULE
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

Neither the name of nor the names of its
Neither the name of [[6]] nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/bsd-new_newlib3.RULE
@@ -11,7 +11,7 @@ are permitted provided that the following conditions are met:
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University nor the names of its contributors
* Neither the name of the [[3]] University nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

41 changes: 37 additions & 4 deletions src/licensedcode/detection.py
@@ -30,6 +30,7 @@
from licensedcode.cache import get_licensing
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.match import is_extra_words_position_valid
from licensedcode.models import compute_relevance
from licensedcode.models import Rule
from licensedcode.models import UnDetectedRule
@@ -110,6 +111,7 @@ class DetectionCategory(Enum):
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
PACKAGE_ADD_FROM_FILE = 'from-package-file'
EXTRA_WORDS = 'extra-words'
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
UNKNOWN_MATCH = 'unknown-match'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
@@ -129,6 +131,7 @@ class DetectionRule(Enum):
"""
UNKNOWN_MATCH = 'unknown-match'
EXTRA_WORDS = 'extra-words'
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -405,8 +408,12 @@ def score(self):
by the length of a match to the overall detection length.
"""
length = self.length
weighted_scores = (m.score() * (m.len() / length) for m in self.matches)
return min([round(sum(weighted_scores), 2), 100])
weighted_scores = 0
for m in self.matches:
# Check whether extra words in the matched text appear at allowed positions
# and do not exceed the maximum allowed word count at those positions; such
# matches count as a full score of 100.
score = 100 if is_extra_words_position_valid(m) else m.score()
weighted_scores += score * (m.len() / length)
return min([round(weighted_scores, 2), 100])
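As a small worked example with hypothetical numbers: a detection made of two matches of 150 and 50 matched tokens, where the first has only permitted extra words (so it counts as 100) and the second scores 99.53, would be weighted like this:

```python
length = 200  # total matched length of the detection
weighted = 100 * (150 / length) + 99.53 * (50 / length)
round(weighted, 2)  # 99.88
```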

def append(
self,
@@ -1072,6 +1079,7 @@ def is_correct_detection_non_unknown(license_matches):
is_correct_detection(license_matches)
and not has_unknown_matches(license_matches)
and not has_extra_words(license_matches)
and not is_extra_words_at_valid_positions(license_matches)
)


@@ -1087,7 +1095,7 @@ def is_correct_detection(license_matches):
]

return (
all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers)
all(matcher in ("1-hash", "1-spdx-id", "2-aho", "3-seq") for matcher in matchers)
@alok1304 (Collaborator, Author) commented on Jul 2, 2025:

I added "3-seq" because the extra-phrase markers are removed from rule texts while loading, but they remain in the rule files on disk. When we scan those same rule files, the matcher is now "3-seq" rather than one of "1-hash", "1-spdx-id", or "2-aho", because the on-disk text still contains the extra-phrase markers while the loaded, indexed rule text does not.
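As a rough illustration of that difference (hypothetical rule text; `extra_phrase_removal_pattern` is the regex added to tokenize.py in this PR), the loaded and indexed rule text has the marker stripped while the on-disk rule file keeps it:

```python
from licensedcode.tokenize import extra_phrase_removal_pattern

# text as stored in the rule file on disk (keeps the [[3]] marker)
on_disk = 'Neither the name of the [[3]] University nor the names of its contributors'

# text as tokenized for the index (marker stripped); scanning the on-disk rule
# file therefore no longer yields an exact 1-hash/2-aho match and falls back to 3-seq
indexed = extra_phrase_removal_pattern.sub('', on_disk)
assert '[[3]]' not in indexed
```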

and all(is_match_coverage_perfect)
)

@@ -1159,6 +1167,21 @@ def has_low_rule_relevance(license_matches):
)


def is_extra_words_at_valid_positions(license_matches):
"""
Return True if all the matches in the ``license_matches`` list of LicenseMatch
that have extra words have them at valid, allowed positions.
"""
Member commented:

We need to check both cases a bit more explicitly:

  1. For all the matches which have extra words, the extra words are at the correct locations.
  2. For all the matches which do not have extra words, they are correct detections.
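A rough sketch (not the code in this PR) of how those two checks could be expressed together, reusing helpers already present in detection.py; the exact conditions are assumptions:

```python
def is_extra_words_at_valid_positions(license_matches):
    for match in license_matches:
        if calculate_query_coverage_coefficient(match) > 0:
            # 1. the match has extra words: they must sit at an allowed [[n]] position
            if not is_extra_words_position_valid(match):
                return False
        elif not is_correct_detection([match]):
            # 2. the match has no extra words: it must be a correct detection on its own
            return False
    return True
```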

Member commented:

And add a test accordingly

@alok1304 (Collaborator, Author) replied:

> And add a test accordingly

Where should I add a test, and how should I implement it for all license_matches?
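One possible starting point is a direct unit test of `get_extra_phrase_spans`, alongside a detection-level expectation file like the one updated in this PR; the test names below are only a sketch, not part of this change:

```python
from licensedcode.spans import Span
from licensedcode.tokenize import get_extra_phrase_spans


def test_get_extra_phrase_spans_returns_span_and_allowed_count():
    text = 'Neither the name of [[6]] nor the names of its contributors'
    # token positions: neither=0 the=1 name=2 of=3 [[6]]=4 nor=5 ...
    assert get_extra_phrase_spans(text) == [(Span([4]), 6)]


def test_get_extra_phrase_spans_ignores_non_digit_markers():
    # a non-numeric value inside [[...]] should not produce a span
    assert get_extra_phrase_spans('name of [[foo]] nor') == []
```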


for match in license_matches:
# check when we have an `extra-words` detection:
# if `query_coverage_coefficient` is a positive number, then `extra-words` exist
if calculate_query_coverage_coefficient(match) > 0:
if not is_extra_words_position_valid(match):
return False

# return True if all matches either have no extra words or have them at allowed positions
return True

def is_false_positive(license_matches, package_license=False):
"""
Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1570,6 +1593,12 @@ def get_detected_license_expression(
detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
return detection_log, combined_expression

elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}')
matches_for_expression = license_matches
detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value)

elif analysis == DetectionCategory.EXTRA_WORDS.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
@@ -1810,7 +1839,11 @@ def analyze_detection(license_matches, package_license=False):

# Case where at least one of the matches has extra words
elif has_extra_words(license_matches=license_matches):
return DetectionCategory.EXTRA_WORDS.value
# Case where `extra-words` are in the right place
if is_extra_words_at_valid_positions(license_matches=license_matches):
return DetectionCategory.EXTRA_WORDS_PERMITTED.value
else:
return DetectionCategory.EXTRA_WORDS.value

# Cases where Match Coverage is a perfect 100 for all matches
else:
75 changes: 74 additions & 1 deletion src/licensedcode/match.py
@@ -826,7 +826,11 @@ def to_dict(
result['start_line'] = self.start_line
result['end_line'] = self.end_line
result['matcher'] = self.matcher
result['score'] = self.score()
# report a full score when the `extra-words` are at allowed positions
if is_extra_words_position_valid(match=self):
result['score'] = 100
else:
result['score'] = self.score()
result['matched_length'] = self.len()
result['match_coverage'] = self.coverage()
result['rule_relevance'] = self.rule.relevance
@@ -1071,6 +1075,75 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
# early from the loops: trying to check containment on wildly separated matches
# does not make sense

def is_extra_words_position_valid(match):
"""
Return True if the extra words appear in valid positions and
do not exceed the maximum allowed word count at those positions.
Otherwise, return False.
"""
# Compute the `query_coverage_coefficient` to determine whether the match has `extra-words`
score_coverage_relevance = (
match.coverage() * match.rule.relevance
) / 100

# Calculate the query coverage coefficient
query_coverage_coefficient = score_coverage_relevance - match.score()

# Return False if the match has no extra words
if query_coverage_coefficient == 0:
return False

matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
rule_tokens = list(index_tokenizer(match.rule.text))
extra_phrase_spans = match.rule.extra_phrase_spans

if not extra_phrase_spans:
return False

# Count of `extra-words` tokens, i.e. tokens inserted into `matched_tokens`
matched_count = 0

# Count of extra phrase markers
extra_phrase_count = 0

rule_index = 0
matched_index = 0

for span, allowed_extra_word in extra_phrase_spans:
rule_index = span.start

matched_index = span.start + matched_count - extra_phrase_count
extra_words_count = 0

# return False if the token just before the `extra-words` in `matched_tokens` differs
# from the token just before the `extra-phrase` marker in `rule_tokens`
if matched_tokens[matched_index - 1] != rule_tokens[rule_index - 1]:
return False

# Count how many tokens in `matched_text` do not match the next rule token
while (matched_index < len(matched_tokens) and
matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
matched_index += 1
matched_count += 1
extra_words_count += 1

if extra_words_count > allowed_extra_word:
return False

extra_phrase_count += 1

rule_index += 1

# walk the remaining tokens after the last marker; any leftover `extra-words`
# here are not at a marked position, so the match is rejected below
while (matched_index < len(matched_tokens) and
matched_tokens[matched_index] == rule_tokens[rule_index]):
matched_index += 1
rule_index += 1

if matched_index != len(matched_tokens):
return False

return True
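A standalone, purely illustrative trace of the walk above on plain token lists (hypothetical texts loosely based on the bsd-new_newlib3 rule changed in this PR); it does not build a real LicenseMatch and simplifies the span positions:

```python
rule_tokens = 'neither the name of the university nor the names'.split()
matched_tokens = 'neither the name of the acme example university nor the names'.split()

# the [[3]] marker sits right before "university": up to 3 extra words are allowed there
marker_pos, allowed = 5, 3

extra_words = 0
i = marker_pos
# walk the matched tokens until they realign with the rule token at the marker position
while matched_tokens[i] != rule_tokens[marker_pos]:
    i += 1
    extra_words += 1

assert extra_words == 2 and extra_words <= allowed  # "acme example" is permitted
```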


def filter_contained_matches(
matches,
26 changes: 26 additions & 0 deletions src/licensedcode/models.py
@@ -43,6 +43,7 @@
from licensedcode.tokenize import index_tokenizer
from licensedcode.tokenize import index_tokenizer_with_stopwords
from licensedcode.tokenize import query_lines
from licensedcode.tokenize import get_extra_phrase_spans
from scancode.api import SCANCODE_LICENSEDB_URL
from scancode.api import SCANCODE_LICENSE_URL
from scancode.api import SCANCODE_RULE_URL
@@ -1683,6 +1684,17 @@ class BasicRule:
)
)

extra_phrase_spans = attr.ib(
default=attr.Factory(list),
repr=False,
metadata=dict(
help='List of tuples `(Span, int)` representing extra phrases for this rule. '
'Each tuple contains a Span of token positions in the rule text and an integer '
'indicating the maximum number of extra tokens allowed at that position. '
'Extra phrases are enclosed in [[double square brackets]] in the rule text.'
)
)

source = attr.ib(
default=None,
repr=False,
@@ -2317,6 +2329,9 @@ def tokens(self):
"is_continuous", "minimum_coverage" and "stopword_by_pos" are
recomputed as a side effect.
"""

# identify and capture the spans of extra phrases specified within the rule
self.extra_phrase_spans = list(self.extra_phrases())

text = self.text
# We tag this rule as being a bare URL if it starts with a scheme and is
@@ -2353,6 +2368,17 @@ def _set_continuous(self):
):
self.is_continuous = True

def extra_phrases(self):
"""
Return an iterable of `(Span, int)` tuples marking the positions of extra phrases in the rule text.

Each tuple consists of:
- a `Span` object representing the position in the tokenized rule text, and
- an integer `n` indicating how many extra tokens are allowed at that position.
"""
if self.text:
yield from get_extra_phrase_spans(self.text)

def build_required_phrase_spans(self):
"""
Return a list of Spans marking required phrases token positions of that must
72 changes: 72 additions & 0 deletions src/licensedcode/tokenize.py
@@ -81,12 +81,82 @@ def query_lines(
required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall


extra_phrase_pattern = '(?:' + query_pattern + r'|\[\[|\]\])'
extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall


# pattern to match and remove extra phrases like [[1]], [[4]]..etc from the text
extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]')

REQUIRED_PHRASE_OPEN = '{{'
REQUIRED_PHRASE_CLOSE = '}}'

EXTRA_PHRASE_OPEN = '[['
EXTRA_PHRASE_CLOSE = ']]'

# FIXME: this should be folded in a single pass tokenization with the index_tokenizer


def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
"""
Yield tokens from a rule ``text``, including extra phrase [[n]] markers.
Here `n` denotes the maximum number of extra words that are valid at that position.
This otherwise behaves the same as ``required_phrase_tokenizer``.
"""
if not text:
return
if not preserve_case:
text = text.lower()

for token in extra_phrase_splitter(text):
if token and token not in stopwords:
yield token


def get_extra_phrase_spans(text):
"""
Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase found in ``text``.
Here, `n` should always be a digit token inside the extra phrase brackets.

Example:
>>> text = 'Neither the name [[3]] of nor the names of its'
>>> # 0 1 2 3 4 5 6 7 8 9
>>> x = get_extra_phrase_spans(text)
>>> assert x == [(Span([3]), 3)], x
"""
ipos = 0
in_extra_phrase = False
current_phrase_value = []
extra_phrase_spans = []

for token in extra_phrase_tokenizer(text):
if token == EXTRA_PHRASE_OPEN:
in_extra_phrase = True
current_phrase_value = []
continue

elif token == EXTRA_PHRASE_CLOSE:
if in_extra_phrase:
# the token must be a digit enclosed in double square brackets ``[[token]]``,
# and there must be exactly one token between the brackets
if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit():
extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))

in_extra_phrase = False
current_phrase_value = []
continue

if in_extra_phrase:
# keep only the first token found after the opening double square bracket ``[[``
if len(current_phrase_value) == 0:
current_phrase_value.append(token)

ipos += 1

return extra_phrase_spans


def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
"""
Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
@@ -282,6 +352,8 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS):
"""
if not text:
return [], {}

text = extra_phrase_removal_pattern.sub('', text)

tokens = []
tokens_append = tokens.append
@@ -6,7 +6,7 @@
"license_expression_spdx": "BSD-3-Clause",
"detection_count": 1,
"detection_log": [
"extra-words"
"extra-words-permitted-in-rule"
],
"reference_matches": [
{
@@ -16,7 +16,7 @@
"start_line": 4,
"end_line": 27,
"matcher": "2-aho",
"score": 99.53,
"score": 100,
"matched_length": 210,
"match_coverage": 100.0,
"rule_relevance": 100,
@@ -46,7 +46,7 @@
"start_line": 4,
"end_line": 27,
"matcher": "2-aho",
"score": 99.53,
"score": 100,
"matched_length": 210,
"match_coverage": 100.0,
"rule_relevance": 100,
@@ -57,7 +57,7 @@
}
],
"detection_log": [
"extra-words"
"extra-words-permitted-in-rule"
],
"identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50"
}