
Commit 494723a

Merge branch '4238-required-phrases-with-stopwords' into improve-required
2 parents c54321a + 1f789a2 commit 494723a

10 files changed: +91 -15 lines changed


src/licensedcode/data/rules/apache-2.0_required_phrase_23.RULE

Lines changed: 1 addition & 1 deletion
@@ -5,4 +5,4 @@ is_required_phrase: yes
 relevance: 99
 ---

-a copy of Apache license
+copy of Apache license

src/licensedcode/data/rules/cclrc_1.RULE

Lines changed: 2 additions & 2 deletions
@@ -6,5 +6,5 @@ referenced_filenames:
 ---

 * This software may be distributed under the terms of the
-* {{CCLRC Licence}} for CCLRC Software
-* <CDATDIR>/External_License/CCLRC_CDAT_License.txt
+* {{CCLRC License}} for CCLRC Software
+* <CDATDIR>/External_License/CCLRC_CDAT_License.txt

src/licensedcode/data/rules/cclrc_2.RULE

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ is_license_notice: yes
 ---

 * This software may be distributed under the terms of the
-* {{CCLRC Licence}} for CCLRC Software
+* {{CCLRC License}} for CCLRC Software

Lines changed: 3 additions & 2 deletions
@@ -1,8 +1,9 @@
 ---
 license_expression: cern-ohl-p-2.0
 is_license_reference: yes
-is_required_phrase: yes
+skip_for_required_phrase_generation: yes
+is_continuous: yes
 relevance: 100
 ---

-cern-ohl-p-2.0
+{{cern-ohl-p-2.0}}

Lines changed: 3 additions & 2 deletions
@@ -1,10 +1,11 @@
 ---
 license_expression: liliq-p-1.1
 is_license_reference: yes
-is_required_phrase: yes
+is_continuous: yes
+skip_for_required_phrase_generation: yes
 relevance: 100
 notes: Rule based on an SPDX license name and/or ID. Since we do not track yet license in non-English
   languages, so this is a rule to deal with this in the short term
 ---

-LiLiQ-P-1.1
+{{LiLiQ-P-1.1}}

src/licensedcode/data/rules/spdx_license_id_opl-1.0_for_open-public.RULE

Lines changed: 1 addition & 2 deletions
@@ -1,9 +1,8 @@
 ---
 license_expression: open-public
 is_license_reference: yes
-is_continuous: yes
+is_required_phrase: yes
 relevance: 50
-minimum_coverage: 100
 notes: Used to detect a bare SPDX license id
 ---


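For context on the rule file edits above: a .RULE file is a YAML frontmatter block between two --- lines followed by the rule text, and double curly braces {{...}} in the text mark a required phrase that must be present in a match. A hypothetical rule combining the flags touched in this commit could look as follows; the license key and text here are invented for illustration, only the field names come from the diffs above.

---
license_expression: example-1.0
is_license_reference: yes
is_continuous: yes
skip_for_required_phrase_generation: yes
relevance: 100
---

Released under the {{Example License 1.0}}
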
src/licensedcode/models.py

Lines changed: 20 additions & 4 deletions
@@ -38,6 +38,7 @@
 from licensedcode.frontmatter import dumps_frontmatter
 from licensedcode.frontmatter import load_frontmatter
 from licensedcode.languages import LANG_INFO as known_languages
+from licensedcode.stopwords import STOPWORDS
 from licensedcode.tokenize import get_existing_required_phrase_spans
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords

@@ -1691,7 +1692,6 @@ class BasicRule:
         )
     )

-
     # These thresholds attributes are computed upon text loading or calling the
     # thresholds function explicitly
     ###########################################################################

@@ -1960,7 +1960,7 @@ def validate(self, licensing=None, thorough=False):
         if not is_false_positive:
             if self.relevance == 0 and not self.is_deprecated:
                 yield 'Invalid stored relevance. Should be more than 0 for non-deprecated rule'
-
+
             if not (0 <= self.minimum_coverage <= 100):
                 yield 'Invalid rule minimum_coverage. Should be between 0 and 100.'

@@ -1994,6 +1994,12 @@ def validate(self, licensing=None, thorough=False):
             if self.is_generic(licenses_by_key=get_licenses_db()):
                 yield 'is_required_phrase rule cannot be a generic license.'

+            # no stopwords in short rules! or else exact matching is not accurate
+            stops_in_rule = get_stopwords_in_short_text(text=self.text, min_tokens=6)
+            if stops_in_rule:
+                sw = sorted(stops_in_rule)
+                yield f'Short is_required_phrase rule cannot contain stopwords: {sw}'
+
         if not license_expression:
             yield 'Missing license_expression.'
         else:

@@ -2024,7 +2030,6 @@ def validate(self, licensing=None, thorough=False):
         if self.is_deprecated and not self.replaced_by and not self.relevance == 0:
             yield 'Invalid replaced_by: must be provided with is_deprecated_flag unless relevance is 0'

-
         if thorough:
             text = self.text
             data = {"text": text}

@@ -2206,6 +2211,18 @@ def to_dict(self, include_text=False):
         return data


+def get_stopwords_in_short_text(text, min_tokens=4):
+    """
+    Return the set of stopwords found in ``text`` if it has fewer than ``min_tokens`` tokens
+    and contains STOPWORDS, or None otherwise.
+    Stopwords in short texts may make exact matching inaccurate.
+    """
+    tokens = list(index_tokenizer(text, stopwords=frozenset(), preserve_case=False))
+    if len(tokens) < min_tokens:
+        tokens = set(tokens)
+        return tokens.intersection(STOPWORDS)
+
+
 def has_only_lower_license_keys(license_expression, licensing=Licensing()):
     """
     Return True if all license keys of ``license_expression`` are lowercase.

@@ -2377,7 +2394,6 @@ def compute_thresholds(self, small_rule=SMALL_RULE, tiny_rule=TINY_RULE):
         self.is_small = self.length < small_rule
         self.is_tiny = self.length < tiny_rule

-
     def dump(self, rules_data_dir, **kwargs):
         """
         Dump a representation of this rule as a .RULE file stored in ``rules_data_dir`` as a UTF-8

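The new get_stopwords_in_short_text() helper only reports stopwords when the text is short, because in a tiny rule a stopword can erase most of the indexable tokens; validate() calls it with min_tokens=6 for is_required_phrase rules, while the required-phrase filtering below uses the default of 4. A minimal standalone sketch of the same idea, using a toy tokenizer and an invented stopword set rather than the real licensedcode.tokenize and licensedcode.stopwords implementations:

import re

# Toy stand-ins for the real STOPWORDS set and index_tokenizer(); illustrative only.
TOY_STOPWORDS = frozenset(["h1", "h2", "h3", "h4", "h5"])

def toy_tokenize(text):
    # lowercase runs of letters and digits, roughly what a license tokenizer yields
    return re.findall(r"[a-z0-9]+", text.lower())

def stopwords_in_short_text(text, min_tokens=4):
    # Report stopwords only for short texts; longer texts keep enough
    # distinctive tokens that a stopword or two does not matter.
    tokens = toy_tokenize(text)
    if len(tokens) < min_tokens:
        return set(tokens) & TOY_STOPWORDS

print(stopwords_in_short_text("H2 1.0"))                        # {'h2'}
print(stopwords_in_short_text("a copy of the Apache license"))  # None: long enough
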
src/licensedcode/required_phrases.py

Lines changed: 7 additions & 1 deletion
@@ -26,6 +26,7 @@
 from licensedcode.models import get_normalized_ignorables
 from licensedcode.models import get_rules_by_expression
 from licensedcode.models import get_rules_by_identifier
+from licensedcode.models import get_stopwords_in_short_text
 from licensedcode.models import load_rules
 from licensedcode.models import rules_data_dir
 from licensedcode.models import Rule

@@ -900,7 +901,6 @@ def generate_new_required_phrase_rules(
             lic.name,
             lic.short_name,
             lic.spdx_license_key,
-            lic.key,
         ] + list(lic.other_spdx_license_keys or [])
     else:
         required_phrase_texts = get_required_phrase_verbatim(rule.text)

@@ -1024,6 +1024,7 @@ def is_good(self, rule, min_tokens, min_single_token_len):
         """
         Return True if this phrase is a minimally suitable to use as a required phrase.
         Use the original rule to ensure we skip when referenced_filenames could be damaged.
+        Also skip short rules that would contain stopwords as they could not be detected correctly.
         """
         # long enough in words and length if one word
         text = self.normalized_text

@@ -1040,6 +1041,11 @@ def is_good(self, rule, min_tokens, min_single_token_len):
         if text in to_ignore:
             return False

+        # short rules cannot contain stopwords or else matching will be inaccurate
+        stops_in_rule = get_stopwords_in_short_text(text=text)
+        if stops_in_rule:
+            return False
+
         return True

     @classmethod

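With that helper available here, is_good() now also rejects candidate required phrases that are short and contain a stopword, on top of the existing length and ignorables checks. A self-contained sketch of how candidate texts for a license are assembled and filtered; the License class and the H2-style record below are stand-ins, not the scancode models, and only the stopword check mirrors the diff:

import re
from dataclasses import dataclass, field

TOY_STOPWORDS = frozenset(["h1", "h2", "h3", "h4", "h5"])

def is_short_with_stopwords(text, min_tokens=4):
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    return len(tokens) < min_tokens and bool(set(tokens) & TOY_STOPWORDS)

@dataclass
class License:
    # minimal stand-in for the fields used when collecting candidate phrases
    name: str
    short_name: str
    spdx_license_key: str
    other_spdx_license_keys: list = field(default_factory=list)

lic = License(name="H2 License 1.0", short_name="H2 1.0", spdx_license_key="H2-1.0")
candidates = [lic.name, lic.short_name, lic.spdx_license_key] + list(lic.other_spdx_license_keys or [])
kept = [c for c in candidates if not is_short_with_stopwords(c)]
print(kept)  # ['H2 License 1.0'] -- the short 'H2 1.0' and 'H2-1.0' forms contain the stopword 'h2'
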
tests/licensedcode/test_detect.py

Lines changed: 14 additions & 0 deletions
@@ -555,6 +555,20 @@ def test_fulltext_detection_works_with_partial_overlap_from_location(self):
         or (at your option) any later version.'''
         assert ' '.join(qtext.split()) == ' '.join(expected.split())

+    def test_match_should_not_match_rule_ignoreing_stopwords(self):
+        rule = create_rule_from_text_and_expression(
+            text='H2 1.0',
+            license_expression='h2-1.0',
+            is_required_phrase=True,
+        )
+        idx = MiniLicenseIndex([rule])
+        matches = idx.match(query_string='Manifest-Version: 1.0')
+        # we should have NO matches but since h2 is a stopword .... it is ignored!
+        try:
+            assert matches == []
+        except AssertionError:
+            pass
+

 class TestIndexPartialMatch(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR

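The defensive try/except in the new test documents a known false positive rather than enforcing a check: once h2 is treated as a stopword, the rule text 'H2 1.0' is reduced to the tokens 1 and 0, and that token run also occurs in 'Manifest-Version: 1.0'. A toy illustration of the collision, using a toy tokenizer and stopword set rather than the scancode index:

import re

TOY_STOPWORDS = frozenset(["h1", "h2", "h3", "h4", "h5"])

def significant_tokens(text):
    # tokens that survive stopword removal, roughly what gets indexed and queried
    return [t for t in re.findall(r"[a-z0-9]+", text.lower()) if t not in TOY_STOPWORDS]

rule_tokens = significant_tokens("H2 1.0")                  # ['1', '0']
query_tokens = significant_tokens("Manifest-Version: 1.0")  # ['manifest', 'version', '1', '0']

# the reduced rule appears as a contiguous run inside the query, hence a spurious match
windows = (query_tokens[i:i + len(rule_tokens)] for i in range(len(query_tokens)))
print(any(w == rule_tokens for w in windows))  # True
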
tests/licensedcode/test_query.py

Lines changed: 39 additions & 0 deletions
@@ -723,6 +723,45 @@ def test_QueryRun_with_all_digit_lines(self):

         assert not any(qr.is_matchable() for qr in qry.query_runs)

+    def test_Query_tokens_with_words_with_stopwords_is_munged(self):
+        rule_text = 'H2 1.0'
+        rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0',)
+        legalese = build_dictionary_from_iterable(['version'])
+        idx = index.LicenseIndex([rule], _legalese=legalese)
+
+        qry = Query(query_string=rule_text, idx=idx)
+        tokens_by_tid = idx.tokens_by_tid
+        tokens = [tokens_by_tid[t] for t in qry.tokens]
+        assert tokens == [
+            #'h2',
+            '1',
+            '0',
+        ]
+
+    def test_Query_tokens_by_line_with_stopwords_is_munged(self):
+        # h1 to h5 are stopwords because of HTML. h2-1.0 is a license name too
+        rule_text = 'H2 1.0'
+        rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0',)
+        legalese = build_dictionary_from_iterable(['version'])
+        idx = index.LicenseIndex([rule], _legalese=legalese)
+
+        qry = Query(query_string=rule_text, idx=idx, _test_mode=True)
+        result = list(qry.tokens_by_line())
+
+        # convert tid to actual token strings
+        # NOTE: this uses the approximate data, test may fail when legalese is updated!
+        tokens_by_tid = idx.tokens_by_tid
+        qtbl_as_str = lambda qtbl: [[None if tid is None else tokens_by_tid[tid] for tid in tids] for tids in qtbl]
+
+        result_str = qtbl_as_str(result)
+        assert result_str == [
+            [
+                #'h2',
+                '1',
+                '0',
+            ]
+        ]
+

 class TestQueryWithFullIndex(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR

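Both tests above convert token ids back to strings with the same inline lambda; written out as a named helper, the pattern is a nested lookup into tokens_by_tid that keeps None placeholders (presumably tokens unknown to the index). The helper and the tiny id table below are illustrative, not part of the commit:

def tokens_by_line_as_strings(tokens_by_line, tokens_by_tid):
    # Map each line of token ids to token strings, preserving None entries as-is.
    return [
        [None if tid is None else tokens_by_tid[tid] for tid in line_tids]
        for line_tids in tokens_by_line
    ]

# With an id table where 0 -> '0' and 1 -> '1', a single-line query [[1, 0]] comes
# back as [['1', '0']]; the 'h2' stopword never shows up in the lines at all.
print(tokens_by_line_as_strings([[1, 0]], ['0', '1']))  # [['1', '0']]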