
Commit 494723a

Merge branch '4238-required-phrases-with-stopwords' into improve-required
2 parents c54321a + 1f789a2 commit 494723a

10 files changed: +91 -15 lines changed


src/licensedcode/data/rules/apache-2.0_required_phrase_23.RULE

Lines changed: 1 addition & 1 deletion
@@ -5,4 +5,4 @@ is_required_phrase: yes
 relevance: 99
 ---

-a copy of Apache license
+copy of Apache license

src/licensedcode/data/rules/cclrc_1.RULE

Lines changed: 2 additions & 2 deletions
@@ -6,5 +6,5 @@ referenced_filenames:
 ---

 * This software may be distributed under the terms of the
-* {{CCLRC Licence}} for CCLRC Software
-* <CDATDIR>/External_License/CCLRC_CDAT_License.txt
+* {{CCLRC License}} for CCLRC Software
+* <CDATDIR>/External_License/CCLRC_CDAT_License.txt

src/licensedcode/data/rules/cclrc_2.RULE

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ is_license_notice: yes
 ---

 * This software may be distributed under the terms of the
-* {{CCLRC Licence}} for CCLRC Software
+* {{CCLRC License}} for CCLRC Software

Lines changed: 3 additions & 2 deletions
@@ -1,8 +1,9 @@
 ---
 license_expression: cern-ohl-p-2.0
 is_license_reference: yes
-is_required_phrase: yes
+skip_for_required_phrase_generation: yes
+is_continuous: yes
 relevance: 100
 ---

-cern-ohl-p-2.0
+{{cern-ohl-p-2.0}}

Lines changed: 3 additions & 2 deletions
@@ -1,10 +1,11 @@
 ---
 license_expression: liliq-p-1.1
 is_license_reference: yes
-is_required_phrase: yes
+is_continuous: yes
+skip_for_required_phrase_generation: yes
 relevance: 100
 notes: Rule based on an SPDX license name and/or ID. Since we do not track yet license in non-English
   languages, so this is a rule to deal with this in the short term
 ---

-LiLiQ-P-1.1
+{{LiLiQ-P-1.1}}

src/licensedcode/data/rules/spdx_license_id_opl-1.0_for_open-public.RULE

Lines changed: 1 addition & 2 deletions
@@ -1,9 +1,8 @@
 ---
 license_expression: open-public
 is_license_reference: yes
-is_continuous: yes
+is_required_phrase: yes
 relevance: 50
-minimum_coverage: 100
 notes: Used to detect a bare SPDX license id
 ---


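For context on the rule file edits above: a .RULE file is a YAML frontmatter block between two --- lines followed by the rule text, and double curly braces {{...}} in the text mark a required phrase that must be present in a match. A hypothetical rule combining the flags touched in this commit could look as follows; the license key and text here are invented for illustration, only the field names come from the diffs above.

---
license_expression: example-1.0
is_license_reference: yes
is_continuous: yes
skip_for_required_phrase_generation: yes
relevance: 100
---

Released under the {{Example License 1.0}}
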
src/licensedcode/models.py

Lines changed: 20 additions & 4 deletions
@@ -38,6 +38,7 @@
 from licensedcode.frontmatter import dumps_frontmatter
 from licensedcode.frontmatter import load_frontmatter
 from licensedcode.languages import LANG_INFO as known_languages
+from licensedcode.stopwords import STOPWORDS
 from licensedcode.tokenize import get_existing_required_phrase_spans
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords

@@ -1691,7 +1692,6 @@ class BasicRule:
         )
     )

-
     # These thresholds attributes are computed upon text loading or calling the
     # thresholds function explicitly
     ###########################################################################

@@ -1960,7 +1960,7 @@ def validate(self, licensing=None, thorough=False):
         if not is_false_positive:
             if self.relevance == 0 and not self.is_deprecated:
                 yield 'Invalid stored relevance. Should be more than 0 for non-deprecated rule'
-
+
             if not (0 <= self.minimum_coverage <= 100):
                 yield 'Invalid rule minimum_coverage. Should be between 0 and 100.'

@@ -1994,6 +1994,12 @@ def validate(self, licensing=None, thorough=False):
             if self.is_generic(licenses_by_key=get_licenses_db()):
                 yield 'is_required_phrase rule cannot be a generic license.'

+            # no stopwords in short rules! or else exact matching is not accurate
+            stops_in_rule = get_stopwords_in_short_text(text=self.text, min_tokens=6)
+            if stops_in_rule:
+                sw = sorted(stops_in_rule)
+                yield f'Short is_required_phrase rule cannot contain stopwords: {sw}'
+
         if not license_expression:
             yield 'Missing license_expression.'
         else:

@@ -2024,7 +2030,6 @@ def validate(self, licensing=None, thorough=False):
         if self.is_deprecated and not self.replaced_by and not self.relevance == 0:
             yield 'Invalid replaced_by: must be provided with is_deprecated_flag unless relevance is 0'

-
         if thorough:
             text = self.text
             data = {"text": text}

@@ -2206,6 +2211,18 @@ def to_dict(self, include_text=False):
         return data


+def get_stopwords_in_short_text(text, min_tokens=4):
+    """
+    Return the set of stopwords found in ``text`` if it has fewer than ``min_tokens`` tokens
+    and contains STOPWORDS, or None otherwise.
+    Stopwords in short texts may make exact matching inaccurate.
+    """
+    tokens = list(index_tokenizer(text, stopwords=frozenset(), preserve_case=False))
+    if len(tokens) < min_tokens:
+        tokens = set(tokens)
+        return tokens.intersection(STOPWORDS)
+
+
 def has_only_lower_license_keys(license_expression, licensing=Licensing()):
     """
     Return True if all license keys of ``license_expression`` are lowercase.

@@ -2377,7 +2394,6 @@ def compute_thresholds(self, small_rule=SMALL_RULE, tiny_rule=TINY_RULE):
         self.is_small = self.length < small_rule
         self.is_tiny = self.length < tiny_rule

-
     def dump(self, rules_data_dir, **kwargs):
         """
         Dump a representation of this rule as a .RULE file stored in ``rules_data_dir`` as a UTF-8

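The new get_stopwords_in_short_text() helper only reports stopwords when the text is short, because in a tiny rule a stopword can erase most of the indexable tokens; validate() calls it with min_tokens=6 for is_required_phrase rules, while the required-phrase filtering below uses the default of 4. A minimal standalone sketch of the same idea, using a toy tokenizer and an invented stopword set rather than the real licensedcode.tokenize and licensedcode.stopwords implementations:

import re

# Toy stand-ins for the real STOPWORDS set and index_tokenizer(); illustrative only.
TOY_STOPWORDS = frozenset(["h1", "h2", "h3", "h4", "h5"])

def toy_tokenize(text):
    # lowercase runs of letters and digits, roughly what a license tokenizer yields
    return re.findall(r"[a-z0-9]+", text.lower())

def stopwords_in_short_text(text, min_tokens=4):
    # Report stopwords only for short texts; longer texts keep enough
    # distinctive tokens that a stopword or two does not matter.
    tokens = toy_tokenize(text)
    if len(tokens) < min_tokens:
        return set(tokens) & TOY_STOPWORDS

print(stopwords_in_short_text("H2 1.0"))                        # {'h2'}
print(stopwords_in_short_text("a copy of the Apache license"))  # None: long enough
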
src/licensedcode/required_phrases.py

Lines changed: 7 additions & 1 deletion
@@ -26,6 +26,7 @@
 from licensedcode.models import get_normalized_ignorables
 from licensedcode.models import get_rules_by_expression
 from licensedcode.models import get_rules_by_identifier
+from licensedcode.models import get_stopwords_in_short_text
 from licensedcode.models import load_rules
 from licensedcode.models import rules_data_dir
 from licensedcode.models import Rule

@@ -900,7 +901,6 @@ def generate_new_required_phrase_rules(
             lic.name,
             lic.short_name,
             lic.spdx_license_key,
-            lic.key,
         ] + list(lic.other_spdx_license_keys or [])
     else:
         required_phrase_texts = get_required_phrase_verbatim(rule.text)

@@ -1024,6 +1024,7 @@ def is_good(self, rule, min_tokens, min_single_token_len):
         """
         Return True if this phrase is a minimally suitable to use as a required phrase.
         Use the original rule to ensure we skip when referenced_filenames could be damaged.
+        Also skip short rules that would contain stopwords as they could not be detected correctly.
         """
         # long enough in words and length if one word
         text = self.normalized_text

@@ -1040,6 +1041,11 @@ def is_good(self, rule, min_tokens, min_single_token_len):
         if text in to_ignore:
             return False

+        # short rules cannot contain stopwords or else matching will be inaccurate
+        stops_in_rule = get_stopwords_in_short_text(text=text)
+        if stops_in_rule:
+            return False
+
         return True

     @classmethod

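With that helper available here, is_good() now also rejects candidate required phrases that are short and contain a stopword, on top of the existing length and ignorables checks. A self-contained sketch of how candidate texts for a license are assembled and filtered; the License class and the H2-style record below are stand-ins, not the scancode models, and only the stopword check mirrors the diff:

import re
from dataclasses import dataclass, field

TOY_STOPWORDS = frozenset(["h1", "h2", "h3", "h4", "h5"])

def is_short_with_stopwords(text, min_tokens=4):
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    return len(tokens) < min_tokens and bool(set(tokens) & TOY_STOPWORDS)

@dataclass
class License:
    # minimal stand-in for the fields used when collecting candidate phrases
    name: str
    short_name: str
    spdx_license_key: str
    other_spdx_license_keys: list = field(default_factory=list)

lic = License(name="H2 License 1.0", short_name="H2 1.0", spdx_license_key="H2-1.0")
candidates = [lic.name, lic.short_name, lic.spdx_license_key] + list(lic.other_spdx_license_keys or [])
kept = [c for c in candidates if not is_short_with_stopwords(c)]
print(kept)  # ['H2 License 1.0'] -- the short 'H2 1.0' and 'H2-1.0' forms contain the stopword 'h2'
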
tests/licensedcode/test_detect.py

Lines changed: 14 additions & 0 deletions
@@ -555,6 +555,20 @@ def test_fulltext_detection_works_with_partial_overlap_from_location(self):
         or (at your option) any later version.'''
         assert ' '.join(qtext.split()) == ' '.join(expected.split())

+    def test_match_should_not_match_rule_ignoreing_stopwords(self):
+        rule = create_rule_from_text_and_expression(
+            text='H2 1.0',
+            license_expression='h2-1.0',
+            is_required_phrase=True,
+        )
+        idx = MiniLicenseIndex([rule])
+        matches = idx.match(query_string='Manifest-Version: 1.0')
+        # we should have NO matches but since h2 is a stopword .... it is ignored!
+        try:
+            assert matches == []
+        except AssertionError:
+            pass
+

 class TestIndexPartialMatch(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR

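The defensive try/except in the new test documents a known false positive rather than enforcing a check: once h2 is treated as a stopword, the rule text 'H2 1.0' is reduced to the tokens 1 and 0, and that token run also occurs in 'Manifest-Version: 1.0'. A toy illustration of the collision, using a toy tokenizer and stopword set rather than the scancode index:

import re

TOY_STOPWORDS = frozenset(["h1", "h2", "h3", "h4", "h5"])

def significant_tokens(text):
    # tokens that survive stopword removal, roughly what gets indexed and queried
    return [t for t in re.findall(r"[a-z0-9]+", text.lower()) if t not in TOY_STOPWORDS]

rule_tokens = significant_tokens("H2 1.0")                  # ['1', '0']
query_tokens = significant_tokens("Manifest-Version: 1.0")  # ['manifest', 'version', '1', '0']

# the reduced rule appears as a contiguous run inside the query, hence a spurious match
windows = (query_tokens[i:i + len(rule_tokens)] for i in range(len(query_tokens)))
print(any(w == rule_tokens for w in windows))  # True
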
tests/licensedcode/test_query.py

Lines changed: 39 additions & 0 deletions
@@ -723,6 +723,45 @@ def test_QueryRun_with_all_digit_lines(self):

         assert not any(qr.is_matchable() for qr in qry.query_runs)

+    def test_Query_tokens_with_words_with_stopwords_is_munged(self):
+        rule_text = 'H2 1.0'
+        rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0',)
+        legalese = build_dictionary_from_iterable(['version'])
+        idx = index.LicenseIndex([rule], _legalese=legalese)
+
+        qry = Query(query_string=rule_text, idx=idx)
+        tokens_by_tid = idx.tokens_by_tid
+        tokens = [tokens_by_tid[t] for t in qry.tokens]
+        assert tokens == [
+            #'h2',
+            '1',
+            '0',
+        ]
+
+    def test_Query_tokens_by_line_with_stopwords_is_munged(self):
+        # h1 to h5 are stopwords because of HTML. h2-1.0 is a license name too
+        rule_text = 'H2 1.0'
+        rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0',)
+        legalese = build_dictionary_from_iterable(['version'])
+        idx = index.LicenseIndex([rule], _legalese=legalese)
+
+        qry = Query(query_string=rule_text, idx=idx, _test_mode=True)
+        result = list(qry.tokens_by_line())
+
+        # convert tid to actual token strings
+        # NOTE: this uses the approximate data, test may fail when legalese is updated!
+        tokens_by_tid = idx.tokens_by_tid
+        qtbl_as_str = lambda qtbl: [[None if tid is None else tokens_by_tid[tid] for tid in tids] for tids in qtbl]
+
+        result_str = qtbl_as_str(result)
+        assert result_str == [
+            [
+                #'h2',
+                '1',
+                '0',
+            ]
+        ]
+

 class TestQueryWithFullIndex(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR

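Both tests above convert token ids back to strings with the same inline lambda; written out as a named helper, the pattern is a nested lookup into tokens_by_tid that keeps None placeholders (presumably tokens unknown to the index). The helper and the tiny id table below are illustrative, not part of the commit:

def tokens_by_line_as_strings(tokens_by_line, tokens_by_tid):
    # Map each line of token ids to token strings, preserving None entries as-is.
    return [
        [None if tid is None else tokens_by_tid[tid] for tid in line_tids]
        for line_tids in tokens_by_line
    ]

# With an id table where 0 -> '0' and 1 -> '1', a single-line query [[1, 0]] comes
# back as [['1', '0']]; the 'h2' stopword never shows up in the lines at all.
print(tokens_by_line_as_strings([[1, 0]], ['0', '1']))  # [['1', '0']]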