Generate fewer required phrase rules

pombredanne · pombredanne · commit 78b8a6583a64 · 2025-04-15T12:21:55.000+02:00
Do not generate rules for "license key". Skip short rules that would contain stopwords, as they cannot be matched accurately. Validate short required phrase rules for stopwords. Add tests that highlight the stopword issue. Reference: #4238. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py
@@ -38,6 +38,7 @@
 from licensedcode.frontmatter import dumps_frontmatter
 from licensedcode.frontmatter import load_frontmatter
 from licensedcode.languages import LANG_INFO as known_languages
+from licensedcode.stopwords import STOPWORDS
 from licensedcode.tokenize import get_existing_required_phrase_spans
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords
@@ -1691,7 +1692,6 @@ class BasicRule:
         )
     )
 
-
     # These thresholds attributes are computed upon text loading or calling the
     # thresholds function explicitly
     ###########################################################################
@@ -1960,7 +1960,7 @@ def validate(self, licensing=None, thorough=False):
         if not is_false_positive:
             if self.relevance == 0 and not self.is_deprecated:
                 yield 'Invalid stored relevance. Should be more than 0 for non-deprecated rule'
-    
+
             if not (0 <= self.minimum_coverage <= 100):
                 yield 'Invalid rule minimum_coverage. Should be between 0 and 100.'
 
@@ -1994,6 +1994,12 @@ def validate(self, licensing=None, thorough=False):
                     if self.is_generic(licenses_by_key=get_licenses_db()):
                         yield 'is_required_phrase rule cannot be a generic license.'
 
+                    # no stopwords in short rules! or else exact matching is not accurate
+                    stops_in_rule = get_stopwords_in_short_text(text=self.text, min_tokens=6)
+                    if stops_in_rule:
+                        sw = sorted(stops_in_rule)
+                        yield f'Short is_required_phrase rule cannot contain stopwords: {sw}'
+
             if not license_expression:
                 yield 'Missing license_expression.'
             else:
@@ -2024,7 +2030,6 @@ def validate(self, licensing=None, thorough=False):
             if self.is_deprecated and not self.replaced_by and not self.relevance == 0:
                 yield 'Invalid replaced_by: must be provided with is_deprecated_flag unless relevance is 0'
 
-
         if thorough:
             text = self.text
             data = {"text": text}
@@ -2206,6 +2211,18 @@ def to_dict(self, include_text=False):
         return data
 
 
+def get_stopwords_in_short_text(text, min_tokens=4):
+    """
+    Return the set of STOPWORDS found in ``text`` when ``text`` has fewer than ``min_tokens``
+    tokens (this set may be empty). Return None when ``text`` has ``min_tokens`` tokens or more.
+    Stopwords in short texts may make exact matching inaccurate.
+    """
+    words = list(index_tokenizer(text, stopwords=frozenset(), preserve_case=False))
+    if len(words) >= min_tokens:
+        return None
+    return set(words).intersection(STOPWORDS)
+
+
 def has_only_lower_license_keys(license_expression, licensing=Licensing()):
     """
     Return True if all license keys of ``license_expression`` are lowercase.
@@ -2377,7 +2394,6 @@ def compute_thresholds(self, small_rule=SMALL_RULE, tiny_rule=TINY_RULE):
         self.is_small = self.length < small_rule
         self.is_tiny = self.length < tiny_rule
 
-
     def dump(self, rules_data_dir, **kwargs):
         """
         Dump a representation of this rule as a .RULE file stored in ``rules_data_dir`` as a UTF-8
diff --git a/src/licensedcode/required_phrases.py b/src/licensedcode/required_phrases.py
@@ -26,6 +26,7 @@
 from licensedcode.models import get_normalized_ignorables
 from licensedcode.models import get_rules_by_expression
 from licensedcode.models import get_rules_by_identifier
+from licensedcode.models import get_stopwords_in_short_text
 from licensedcode.models import load_rules
 from licensedcode.models import rules_data_dir
 from licensedcode.models import Rule
@@ -900,7 +901,6 @@ def generate_new_required_phrase_rules(
                 lic.name,
                 lic.short_name,
                 lic.spdx_license_key,
-                lic.key,
             ] + list(lic.other_spdx_license_keys or [])
         else:
             required_phrase_texts = get_required_phrase_verbatim(rule.text)
@@ -1024,6 +1024,7 @@ def is_good(self, rule, min_tokens, min_single_token_len):
         """
         Return True if this phrase is a minimally suitable to use as a required phrase.
         Use the original rule to ensure we skip when referenced_filenames could be damaged.
+        Also skip short rules that would contain stopwords as they could not be detected correctly.
         """
         # long enough in words and length if one word
         text = self.normalized_text
@@ -1040,6 +1041,11 @@ def is_good(self, rule, min_tokens, min_single_token_len):
         if text in to_ignore:
             return False
 
+        # short rules cannot contain stopwords or else matching will be inaccurate
+        stops_in_rule = get_stopwords_in_short_text(text=text)
+        if stops_in_rule:
+            return False
+
         return True
 
     @classmethod
diff --git a/tests/licensedcode/test_detect.py b/tests/licensedcode/test_detect.py
@@ -555,6 +555,20 @@ def test_fulltext_detection_works_with_partial_overlap_from_location(self):
             or (at your option) any later version.'''
         assert ' '.join(qtext.split()) == ' '.join(expected.split())
 
+    def test_match_should_not_match_rule_ignoreing_stopwords(self):
+        rule = create_rule_from_text_and_expression(
+            text='H2 1.0',
+            license_expression='h2-1.0',
+            is_required_phrase=True,
+        )
+        idx = MiniLicenseIndex([rule])
+        matches = idx.match(query_string='Manifest-Version: 1.0')
+        # We should have NO matches, but "h2" is a stopword and is ignored when matching.
+        # Report this known issue (#4238) as an expected failure rather than swallowing it.
+        if matches != []:
+            import pytest
+            pytest.xfail('h2 is a stopword and is ignored: see #4238')
+
 
 class TestIndexPartialMatch(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR
diff --git a/tests/licensedcode/test_query.py b/tests/licensedcode/test_query.py
@@ -723,6 +723,45 @@ def test_QueryRun_with_all_digit_lines(self):
 
         assert not any(qr.is_matchable() for qr in qry.query_runs)
 
+    def test_Query_tokens_with_words_with_stopwords_is_munged(self):
+        # the 'h2' word of the query text is expected to be missing from the query tokens
+        text = 'H2 1.0'
+        h2_rule = create_rule_from_text_and_expression(text=text, license_expression='h2-1.0')
+        known_legalese = build_dictionary_from_iterable(['version'])
+        idx = index.LicenseIndex([h2_rule], _legalese=known_legalese)
+
+        query = Query(query_string=text, idx=idx)
+        # convert each token id to its token string
+        tid_to_token = idx.tokens_by_tid
+        token_strings = [tid_to_token[tid] for tid in query.tokens]
+
+        expected = ['1', '0']
+        assert token_strings == expected
+
+    def test_Query_tokens_by_line_with_stopwords_is_munged(self):
+        # h1 to h5 are stopwords because of HTML, yet h2-1.0 is also a license name
+        text = 'H2 1.0'
+        h2_rule = create_rule_from_text_and_expression(text=text, license_expression='h2-1.0')
+        known_legalese = build_dictionary_from_iterable(['version'])
+        idx = index.LicenseIndex([h2_rule], _legalese=known_legalese)
+
+        query = Query(query_string=text, idx=idx, _test_mode=True)
+        lines_of_tids = list(query.tokens_by_line())
+
+        # convert each token id to its token string, keeping None entries as None
+        # NOTE: this uses the approximate data, test may fail when legalese is updated!
+        tid_to_token = idx.tokens_by_tid
+        lines_of_tokens = [
+            [None if tid is None else tid_to_token[tid] for tid in tids]
+            for tids in lines_of_tids
+        ]
+
+        # a single line is expected, without the 'h2' word
+        expected = [
+            ['1', '0'],
+        ]
+        assert lines_of_tokens == expected
+
 
 class TestQueryWithFullIndex(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR