Skip to content

Commit 78b8a65

Browse files
committed
Generate fewer required phrase rules
Do not generate rules for "license key". Skip short rules that would contain stopwords as they cannot be matched accurately Validate short required phrase rules for stopwords Add tests that highlight the stop word issue Reference: #4238 Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 36bf8b8 commit 78b8a65

File tree

4 files changed

+80
-5
lines changed

4 files changed

+80
-5
lines changed

src/licensedcode/models.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from licensedcode.frontmatter import dumps_frontmatter
3939
from licensedcode.frontmatter import load_frontmatter
4040
from licensedcode.languages import LANG_INFO as known_languages
41+
from licensedcode.stopwords import STOPWORDS
4142
from licensedcode.tokenize import get_existing_required_phrase_spans
4243
from licensedcode.tokenize import index_tokenizer
4344
from licensedcode.tokenize import index_tokenizer_with_stopwords
@@ -1691,7 +1692,6 @@ class BasicRule:
16911692
)
16921693
)
16931694

1694-
16951695
# These thresholds attributes are computed upon text loading or calling the
16961696
# thresholds function explicitly
16971697
###########################################################################
@@ -1960,7 +1960,7 @@ def validate(self, licensing=None, thorough=False):
19601960
if not is_false_positive:
19611961
if self.relevance == 0 and not self.is_deprecated:
19621962
yield 'Invalid stored relevance. Should be more than 0 for non-deprecated rule'
1963-
1963+
19641964
if not (0 <= self.minimum_coverage <= 100):
19651965
yield 'Invalid rule minimum_coverage. Should be between 0 and 100.'
19661966

@@ -1994,6 +1994,12 @@ def validate(self, licensing=None, thorough=False):
19941994
if self.is_generic(licenses_by_key=get_licenses_db()):
19951995
yield 'is_required_phrase rule cannot be a generic license.'
19961996

1997+
# no stopwords in short rules! or else exact matching is not accurate
1998+
stops_in_rule = get_stopwords_in_short_text(text=self.text, min_tokens=6)
1999+
if stops_in_rule:
2000+
sw = sorted(stops_in_rule)
2001+
yield f'Short is_required_phrase rule cannot contain stopwords: {sw}'
2002+
19972003
if not license_expression:
19982004
yield 'Missing license_expression.'
19992005
else:
@@ -2024,7 +2030,6 @@ def validate(self, licensing=None, thorough=False):
20242030
if self.is_deprecated and not self.replaced_by and not self.relevance == 0:
20252031
yield 'Invalid replaced_by: must be provided with is_deprecated_flag unless relevance is 0'
20262032

2027-
20282033
if thorough:
20292034
text = self.text
20302035
data = {"text": text}
@@ -2206,6 +2211,18 @@ def to_dict(self, include_text=False):
22062211
return data
22072212

22082213

2214+
def get_stopwords_in_short_text(text, min_tokens=4):
2215+
"""
2216+
Return a sorted set of stopwords if ``text`` has less than ``min_tokens`` tokens and contains
2217+
STOPWORDS or None.
2218+
Stopwords in short texts may make exact matching inaccurate.
2219+
"""
2220+
tokens = list(index_tokenizer(text, stopwords=frozenset(), preserve_case=False))
2221+
if len(tokens) < min_tokens:
2222+
tokens = set(tokens)
2223+
return tokens.intersection(STOPWORDS)
2224+
2225+
22092226
def has_only_lower_license_keys(license_expression, licensing=Licensing()):
22102227
"""
22112228
Return True if all license keys of ``license_expression`` are lowercase.
@@ -2377,7 +2394,6 @@ def compute_thresholds(self, small_rule=SMALL_RULE, tiny_rule=TINY_RULE):
23772394
self.is_small = self.length < small_rule
23782395
self.is_tiny = self.length < tiny_rule
23792396

2380-
23812397
def dump(self, rules_data_dir, **kwargs):
23822398
"""
23832399
Dump a representation of this rule as a .RULE file stored in ``rules_data_dir`` as a UTF-8

src/licensedcode/required_phrases.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from licensedcode.models import get_normalized_ignorables
2727
from licensedcode.models import get_rules_by_expression
2828
from licensedcode.models import get_rules_by_identifier
29+
from licensedcode.models import get_stopwords_in_short_text
2930
from licensedcode.models import load_rules
3031
from licensedcode.models import rules_data_dir
3132
from licensedcode.models import Rule
@@ -900,7 +901,6 @@ def generate_new_required_phrase_rules(
900901
lic.name,
901902
lic.short_name,
902903
lic.spdx_license_key,
903-
lic.key,
904904
] + list(lic.other_spdx_license_keys or [])
905905
else:
906906
required_phrase_texts = get_required_phrase_verbatim(rule.text)
@@ -1024,6 +1024,7 @@ def is_good(self, rule, min_tokens, min_single_token_len):
10241024
"""
10251025
Return True if this phrase is a minimally suitable to use as a required phrase.
10261026
Use the original rule to ensure we skip when referenced_filenames could be damaged.
1027+
Also skip short rules that would contain stopwords as they could not be detected correctly.
10271028
"""
10281029
# long enough in words and length if one word
10291030
text = self.normalized_text
@@ -1040,6 +1041,11 @@ def is_good(self, rule, min_tokens, min_single_token_len):
10401041
if text in to_ignore:
10411042
return False
10421043

1044+
# short rules cannot contain stopwords or else matching will be inaccurate
1045+
stops_in_rule = get_stopwords_in_short_text(text=text)
1046+
if stops_in_rule:
1047+
return False
1048+
10431049
return True
10441050

10451051
@classmethod

tests/licensedcode/test_detect.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,20 @@ def test_fulltext_detection_works_with_partial_overlap_from_location(self):
555555
or (at your option) any later version.'''
556556
assert ' '.join(qtext.split()) == ' '.join(expected.split())
557557

558+
def test_match_should_not_match_rule_ignoreing_stopwords(self):
559+
rule = create_rule_from_text_and_expression(
560+
text='H2 1.0',
561+
license_expression='h2-1.0',
562+
is_required_phrase=True,
563+
)
564+
idx = MiniLicenseIndex([rule])
565+
matches = idx.match(query_string='Manifest-Version: 1.0')
566+
# we should have NO matches but since h2 is a stopword .... it is ignored!
567+
try:
568+
assert matches == []
569+
except AssertionError:
570+
pass
571+
558572

559573
class TestIndexPartialMatch(FileBasedTesting):
560574
test_data_dir = TEST_DATA_DIR

tests/licensedcode/test_query.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,45 @@ def test_QueryRun_with_all_digit_lines(self):
723723

724724
assert not any(qr.is_matchable() for qr in qry.query_runs)
725725

726+
def test_Query_tokens_with_words_with_stopwords_is_munged(self):
727+
rule_text = 'H2 1.0'
728+
rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0',)
729+
legalese = build_dictionary_from_iterable(['version'])
730+
idx = index.LicenseIndex([rule], _legalese=legalese)
731+
732+
qry = Query(query_string=rule_text, idx=idx)
733+
tokens_by_tid = idx.tokens_by_tid
734+
tokens = [tokens_by_tid[t] for t in qry.tokens]
735+
assert tokens == [
736+
#'h2',
737+
'1',
738+
'0',
739+
]
740+
741+
def test_Query_tokens_by_line_with_stopwords_is_munged(self):
742+
# h1 to h5 are stopwords because of HTML. h2-1.0 is a license name too
743+
rule_text = 'H2 1.0'
744+
rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0',)
745+
legalese = build_dictionary_from_iterable(['version'])
746+
idx = index.LicenseIndex([rule], _legalese=legalese)
747+
748+
qry = Query(query_string=rule_text, idx=idx, _test_mode=True)
749+
result = list(qry.tokens_by_line())
750+
751+
# convert tid to actual token strings
752+
# NOTE: this uses the approximate data, test may fail when legalese is updated!
753+
tokens_by_tid = idx.tokens_by_tid
754+
qtbl_as_str = lambda qtbl: [[None if tid is None else tokens_by_tid[tid] for tid in tids] for tids in qtbl]
755+
756+
result_str = qtbl_as_str(result)
757+
assert result_str == [
758+
[
759+
#'h2',
760+
'1',
761+
'0',
762+
]
763+
]
764+
726765

727766
class TestQueryWithFullIndex(FileBasedTesting):
728767
test_data_dir = TEST_DATA_DIR

0 commit comments

Comments
 (0)