From 685a16f5a484c8814311140c6ead6332fbe8dc13 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Thu, 19 Jun 2025 14:00:40 +0530 Subject: [PATCH 01/14] add test for correct position of `extra-words` and enhance `detection_log` Add a test that checks whether `extra-words` are at the correct position according to the `extra-phrases` present in the rules. If the `extra-words` are in the right place, we set the score to `100`. Also record in `detection_log` why we are increasing the score, to keep track of this. Signed-off-by: Alok Kumar From 760eba34d7c55ca48aa7205e89d45c12c04a0b47 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Tue, 10 Jun 2025 01:46:41 +0530 Subject: [PATCH 02/14] Add new phrase for `extra-words` in rules Add a new kind of phrase, `extra_phrase`, which is specific to extra-words. This phrase is represented in the format [[n]], where n indicates the maximum number of extra-words allowed at that position in the rule. If extra-words appear at the correct position and their count does not exceed the allowed limit `n`, then the score is increased to `100`. 
Signed-off-by: Alok Kumar --- src/licensedcode/data/rules/bsd-new_158.RULE | 2 +- src/licensedcode/data/rules/bsd-new_578.RULE | 6 +- src/licensedcode/detection.py | 24 ++++ src/licensedcode/match.py | 57 ++++++++ src/licensedcode/models.py | 41 ++++++ src/licensedcode/tokenize.py | 66 ++++++++++ ...an-extra-words-2-aho-license.expected.json | 12 +- tests/licensedcode/test_license_models.py | 8 ++ tests/licensedcode/test_match.py | 124 ++++++++++++++++++ tests/licensedcode/test_tokenize.py | 58 ++++++++ 10 files changed, 389 insertions(+), 9 deletions(-) diff --git a/src/licensedcode/data/rules/bsd-new_158.RULE b/src/licensedcode/data/rules/bsd-new_158.RULE index 90af8ee4a67..b0835457774 100644 --- a/src/licensedcode/data/rules/bsd-new_158.RULE +++ b/src/licensedcode/data/rules/bsd-new_158.RULE @@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -Neither the name of nor the names of its +Neither the name of [[6]] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/src/licensedcode/data/rules/bsd-new_578.RULE b/src/licensedcode/data/rules/bsd-new_578.RULE index 99f1aad110e..ede4c9400b4 100644 --- a/src/licensedcode/data/rules/bsd-new_578.RULE +++ b/src/licensedcode/data/rules/bsd-new_578.RULE @@ -6,7 +6,9 @@ minimum_coverage: 99 Software License Agreement (BSD License) -Redistribution and use in source and binary forms, with or without +[[15]] + +Redistribution and use [[4]] in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -16,7 +18,7 @@ are met: copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- * Neither the name of nor the names of its + * Neither the name of [[6]] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py index 34cbe582e63..47410aba095 100644 --- a/src/licensedcode/detection.py +++ b/src/licensedcode/detection.py @@ -30,6 +30,7 @@ from licensedcode.cache import get_licensing from licensedcode.match import LicenseMatch from licensedcode.match import set_matched_lines +from licensedcode.match import is_extra_words_position_valid from licensedcode.models import compute_relevance from licensedcode.models import Rule from licensedcode.models import UnDetectedRule @@ -110,6 +111,7 @@ class DetectionCategory(Enum): PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file' PACKAGE_ADD_FROM_FILE = 'from-package-file' EXTRA_WORDS = 'extra-words' + EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule' UNKNOWN_MATCH = 'unknown-match' LICENSE_CLUES = 'license-clues' LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches' @@ -129,6 +131,7 @@ class DetectionRule(Enum): """ UNKNOWN_MATCH = 'unknown-match' EXTRA_WORDS = 'extra-words' + EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule' LICENSE_CLUES = 'license-clues' LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches' IMPERFECT_COVERAGE = 'imperfect-match-coverage' @@ -1072,6 +1075,7 @@ def is_correct_detection_non_unknown(license_matches): is_correct_detection(license_matches) and not has_unknown_matches(license_matches) and not has_extra_words(license_matches) + and not is_extra_words_at_valid_positions(license_matches) ) @@ -1159,6 +1163,16 @@ def has_low_rule_relevance(license_matches): ) +def is_extra_words_at_valid_positions(license_matches): + """ + Return True if any of the matches in ``license_matches`` List of LicenseMatch + has extra words are in the correct place. 
+ """ + return any( + is_extra_words_position_valid(license_match) + for license_match in license_matches + ) + def is_false_positive(license_matches, package_license=False): """ Return True if all of the matches in ``license_matches`` List of LicenseMatch @@ -1570,6 +1584,12 @@ def get_detected_license_expression( detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value) return detection_log, combined_expression + elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value: + if TRACE_ANALYSIS: + logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}') + matches_for_expression = license_matches + detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value) + elif analysis == DetectionCategory.EXTRA_WORDS.value: if TRACE_ANALYSIS: logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}') @@ -1807,6 +1827,10 @@ def analyze_detection(license_matches, package_license=False): threshold=IMPERFECT_MATCH_COVERAGE_THR, ): return DetectionCategory.IMPERFECT_COVERAGE.value + + # Case where `extra-words` are in the right place + elif is_extra_words_at_valid_positions(license_matches=license_matches): + return DetectionCategory.EXTRA_WORDS_PERMITTED.value # Case where at least one of the match have extra words elif has_extra_words(license_matches=license_matches): diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 90eba30d55e..2e2b27bdde9 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -598,6 +598,12 @@ def score(self): in the matched range (including unknowns and unmatched) and the matched rule relevance. """ + + # Check whether extra words in the matched text appear in allowed positions, + # and do not exceed the maximum allowed word count at those positions. + if is_extra_words_position_valid(match=self): + return 100 + # relevance is a number between 0 and 100. 
Divide by 100 relevance = self.rule.relevance / 100 if not relevance: @@ -1071,6 +1077,57 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE): # early from the loops: trying to check containment on wildly separated matches # does not make sense +def is_extra_words_position_valid(match): + """ + Return True if the extra words appear in valid positions and + do not exceed the maximum allowed word count at those positions. + Otherwise, return False. + """ + + rule_spans = match.ispan.subspans() + + # If there are multiple subspans, it means not all required tokens are contiguous. + if len(rule_spans) > 1: + return False + + matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False))) + rule_tokens = list(index_tokenizer(match.rule.text)) + extra_phrase_spans = match.rule.extra_phrase_spans + + if not extra_phrase_spans: + return False + + # count of `extra-words` tokens i.e inserted in `matched_tokens` + matched_count = 0 + + # Count of extra phrase markers + extra_phrase_count = 0 + + for span, allowed_extra_words in extra_phrase_spans: + rule_index = span.start - extra_phrase_count - 1 + allowed_extra_words = allowed_extra_words + + matched_index = span.start + matched_count - extra_phrase_count + extra_words_count = 0 + + # return false if token before `extra-words` in `matched_token` is not same as token before `extra-phrases` in `rule_tokens` + if(matched_tokens[matched_index-1] != rule_tokens[rule_index]): + return False + + # Count how many tokens in `matched_text` do not match the next rule token + while (matched_index < len(matched_tokens) and + matched_tokens[matched_index] != rule_tokens[rule_index + 1]): + matched_index += 1 + matched_count += 1 + extra_words_count += 1 + + if extra_words_count > allowed_extra_words: + return False + + extra_phrase_count += 1 + + return True + def filter_contained_matches( matches, diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index 
354d93f52d3..47f0238660e 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import re import os import sys import traceback @@ -43,6 +44,7 @@ from licensedcode.tokenize import index_tokenizer from licensedcode.tokenize import index_tokenizer_with_stopwords from licensedcode.tokenize import query_lines +from licensedcode.tokenize import get_extra_phrase_spans from scancode.api import SCANCODE_LICENSEDB_URL from scancode.api import SCANCODE_LICENSE_URL from scancode.api import SCANCODE_RULE_URL @@ -1683,6 +1685,17 @@ class BasicRule: ) ) + extra_phrase_spans = attr.ib( + default=attr.Factory(list), + repr=False, + metadata=dict( + help='List of tuples `(Span, int)` representing extra phrases for this rule.' + 'Each tuple contains a Span of token positions in the rule text and an integer' + 'indicating the maximum number of extra tokens allowed at that position.' + 'extra phrases are enclosed in [[double square brackets]] in the rule text.' + ) + ) + source = attr.ib( default=None, repr=False, @@ -2306,6 +2319,9 @@ def load_data(self, rule_file): except Exception: trace = traceback.format_exc() raise InvalidRule(f'While loading: file://{rule_file}\n{trace}') + + # remove extra_phrase marker from rules + self.text = remove_extra_phrase(self.text) return self @@ -2317,8 +2333,15 @@ def tokens(self): "is_continuous", "minimum_coverage" and "stopword_by_pos" are recomputed as a side effect. 
""" + + # identify and capture the spans of extra phrases specified within the rule + self.extra_phrase_spans = list(self.extra_phrases()) + + # remove extra_phrase marker from rules + self.text = remove_extra_phrase(self.text) text = self.text + # We tag this rule as being a bare URL if it starts with a scheme and is # on one line: this is used to determine a matching approach @@ -2353,6 +2376,17 @@ def _set_continuous(self): ): self.is_continuous = True + def extra_phrases(self): + """ + Return an iterable of `(Span, int)` tuples marking the positions of extra phrases in the rule text. + + Each tuple consists of: + - a `Span` object representing the position in the tokenized rule text, and + - an integer `n` indicating how many extra tokens are allowed at that position. + """ + if self.text: + yield from get_extra_phrase_spans(self.text) + def build_required_phrase_spans(self): """ Return a list of Spans marking required phrases token positions of that must @@ -2570,6 +2604,13 @@ def from_match_data(license_match_mapping): return get_index().rules_by_id[rule_identifier] +def remove_extra_phrase(text): + """ + Remove extra phrase markers like [[n]], where the n is a digit. + """ + pattern = r'\[\[\d+\]\]' + return re.sub(pattern, '', text) + def compute_relevance(length): """ Return a computed ``relevance`` given a ``length`` and a threshold. 
diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py index bea07dd5a21..dd7ad3356b1 100644 --- a/src/licensedcode/tokenize.py +++ b/src/licensedcode/tokenize.py @@ -81,12 +81,78 @@ def query_lines( required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})' required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall + +extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])' +extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall + REQUIRED_PHRASE_OPEN = '{{' REQUIRED_PHRASE_CLOSE = '}}' +EXTRA_PHRASE_OPEN ='[[' +EXTRA_PHRASE_CLOSE =']]' + # FIXME: this should be folded in a single pass tokenization with the index_tokenizer +def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False): + """ + Yield tokens from a rule ``text`` including extra phrases [[n]] markers. + This n denotes maximum number of extra-words i.e valide at that position. + This is same as ``required_phrase_tokenizer``. + """ + if not text: + return + if not preserve_case: + text = text.lower() + + for token in extra_phrase_splitter(text): + if token and token not in stopwords: + yield token + + +def get_extra_phrase_spans(text): + """ + Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase found in ``text``. + Here, `n` should always be a digit token inside the extra phrase brackets. 
+ + Example: + >>> text = 'Neither the name [[3]] of nor the names of its' + >>> # 0 1 2 3 4 5 6 7 8 9 + >>> x = get_extra_phrase_spans(text) + >>> assert x == [(Span([3]), 3)], x + """ + ipos = 0 + in_extra_phrase = False + current_phrase_value = [] + extra_phrase_spans = [] + + for token in extra_phrase_tokenizer(text): + if token == EXTRA_PHRASE_OPEN: + in_extra_phrase = True + current_phrase_value = [] + continue + + elif token == EXTRA_PHRASE_CLOSE: + if in_extra_phrase: + # token must be digit and token must be present in double square bracket ``[[token]]`` + # and between extra phrases there must only one token exist + if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit(): + extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0]))) + + in_extra_phrase = False + current_phrase_value = [] + continue + + if in_extra_phrase: + # consider one token after double open square bracket ``[[`` + if len(current_phrase_value) == 0: + current_phrase_value.append(token) + + ipos += 1 + + return extra_phrase_spans + + def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False): """ Yield tokens from a rule ``text`` including required phrases {{brace}} markers. 
diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json index fcdb8639ddb..469b7340eee 100644 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json @@ -1,12 +1,12 @@ { "license_detections": [ { - "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50", + "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e", "license_expression": "bsd-new", "license_expression_spdx": "BSD-3-Clause", "detection_count": 1, "detection_log": [ - "extra-words" + "extra-words-permitted-in-rule" ], "reference_matches": [ { @@ -16,7 +16,7 @@ "start_line": 4, "end_line": 27, "matcher": "2-aho", - "score": 99.53, + "score": 100, "matched_length": 210, "match_coverage": 100.0, "rule_relevance": 100, @@ -46,7 +46,7 @@ "start_line": 4, "end_line": 27, "matcher": "2-aho", - "score": 99.53, + "score": 100, "matched_length": 210, "match_coverage": 100.0, "rule_relevance": 100, @@ -57,9 +57,9 @@ } ], "detection_log": [ - "extra-words" + "extra-words-permitted-in-rule" ], - "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50" + "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e" } ], "license_clues": [], diff --git a/tests/licensedcode/test_license_models.py b/tests/licensedcode/test_license_models.py index 6c47d92a594..fec3dd7fdd5 100644 --- a/tests/licensedcode/test_license_models.py +++ b/tests/licensedcode/test_license_models.py @@ -591,6 +591,14 @@ def test_key_phrases_yields_spans(self): key_phrase_spans = list(rule.build_required_phrase_spans()) assert key_phrase_spans == [Span(4), Span(7, 9)] + def test_extra_phrases_yields_spans(self): + rule_text = ( + 'Neither the name of [[3]] nor the names of its' + ) + rule = models.Rule(license_expression='bsd-new', 
text=rule_text) + extra_phrase_spans = list(rule.extra_phrases()) + assert extra_phrase_spans == [(Span(4),3)] + def test_key_phrases_raises_exception_when_markup_is_not_closed(self): rule_text = ( 'This released software is {{released}} by under {{the MIT license. ' diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py index 0afab2a7fd8..3302e1a57bc 100644 --- a/tests/licensedcode/test_match.py +++ b/tests/licensedcode/test_match.py @@ -20,6 +20,7 @@ from licensedcode.match import filter_overlapping_matches from licensedcode.match import get_full_matched_text from licensedcode.match import get_matching_regions +from licensedcode.match import is_extra_words_position_valid from licensedcode.match import LicenseMatch from licensedcode.match import merge_matches from licensedcode.match import reportable_tokens @@ -1321,6 +1322,106 @@ def test_get_matching_regions_3_lines_enough(self): assert matches[5].qspan in regions[1] +class TestExtraWordsPosition(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_valid_extra_words_within_limit(self): + rule_text = """ + Redistribution and use [[4]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is True + + def test_invalid_extra_words_exceed_limit(self): + rule_text = """ + Redistribution and use [[2]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. 
+ """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_no_extra_words_allowed(self): + rule_text = """ + Redistribution and use in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of software in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_multiple_extra_spans_valid(self): + rule_text = """ + Redistribution [[2]] and use [[1]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution of content and use again in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is True + + def test_extra_words_at_wrong_position(self): + rule_text = """ + Redistribution and use [[2]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and amazing use in great source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_exact_match_without_extra_markers(self): + rule_text = """ + Redistribution and use in source and binary forms are permitted. 
+ """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + class TestLicenseMatchScore(FileBasedTesting): test_data_dir = TEST_DATA_DIR @@ -1381,6 +1482,29 @@ def test_LicenseMatch_score_100_non_contiguous(self): m1 = LicenseMatch(rule=r1, qspan=Span(0, 19) | Span(30, 51), ispan=Span(0, 41)) assert m1.score() == 80.77 + def test_LicenseMatch_matches_score_100_for_extra_words_within_limit(self): + rule_text = 'Neither the name of [[3]] nor the names of its' + rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text) + idx = index.LicenseIndex([rule]) + + query = 'Neither the name of XXX YYY ZZZ nor the names of its' + matches = idx.match(query_string=query, _skip_hash_match=True) + match = matches[0] + score = match.score() + assert score == 100 + + def test_LicenseMatch_matches_score_not_100_for_extra_words_exceed_limit(self): + rule_text = 'Neither the name of [[3]] nor the names of its' + rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text) + idx = index.LicenseIndex([rule]) + + # The query includes 4 extra words instead of the allowed 3. 
+ query = 'Neither the name of XXX YYY ZZZ AAA nor the names of its' + matches = idx.match(query_string=query, _skip_hash_match=True) + match = matches[0] + score = match.score() + assert score != 100 + def test_LicenseMatch_stopwords_are_treated_as_unknown_2484(self): rules_dir = self.get_test_loc('stopwords/index/rules') lics_dir = self.get_test_loc('stopwords/index/licenses') diff --git a/tests/licensedcode/test_tokenize.py b/tests/licensedcode/test_tokenize.py index 950a94cf764..2639105094f 100644 --- a/tests/licensedcode/test_tokenize.py +++ b/tests/licensedcode/test_tokenize.py @@ -19,6 +19,7 @@ from licensedcode.spans import Span from licensedcode.tokenize import get_existing_required_phrase_spans +from licensedcode.tokenize import get_extra_phrase_spans from licensedcode.tokenize import index_tokenizer from licensedcode.tokenize import InvalidRuleRequiredPhrase from licensedcode.tokenize import matched_query_text_tokenizer @@ -26,6 +27,7 @@ from licensedcode.tokenize import query_lines from licensedcode.tokenize import query_tokenizer from licensedcode.tokenize import required_phrase_tokenizer +from licensedcode.tokenize import extra_phrase_tokenizer from licensedcode.tokenize import select_ngrams from licensedcode.tokenize import tokens_and_non_tokens from licensedcode.tokenize import word_splitter @@ -585,6 +587,62 @@ def test_get_existing_required_phrase_spans_with_markup(self): assert get_existing_required_phrase_spans(text=text) == [Span(18, 19)] +class TestExtraPhraseTokenizer(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_extra_phrase_tokenizer_handles_empty_string(self): + text = '' + result = list(extra_phrase_tokenizer(text)) + assert result == [] + + def test_extra_phrase_tokenizer_handles_blank_lines(self): + text = u' \n\n\t ' + result = list(extra_phrase_tokenizer(text)) + assert result == [] + + def test_extra_phrase_tokenizer_handles_only_brackets(self): + text = '[[3]]' + assert list(extra_phrase_tokenizer(text)) == ['[[', 
'3', ']]'] + + def test_extra_phrase_tokenizer_parses_text_with_extra_phrase_marker(self): + text = 'Neither the name of [[3]] nor the names of its' + assert list(extra_phrase_tokenizer(text)) == [ + 'neither', 'the', 'name', 'of', '[[', '3', ']]', 'nor', 'the', 'names', 'of', 'its' + ] + + def test_get_extra_phrase_spans_simple(self): + text = 'This is [[2]] an example.' + spans = get_extra_phrase_spans(text) + assert spans == [(Span([2]), 2)] + + def test_get_extra_phrase_spans_multiple(self): + text = 'Some [[4]] text [[6]] with multiple markers.' + spans = get_extra_phrase_spans(text) + assert spans == [(Span([1]), 4), (Span([3]), 6)] + + def test_get_extra_phrase_spans_returns_nothing_if_none_found(self): + text = 'Just some normal text.' + assert get_extra_phrase_spans(text) == [] + + def test_get_extra_phrase_spans_ignores_non_numeric_values(self): + text = 'Just some [[normal]] text.' + assert get_extra_phrase_spans(text) == [] + + def test_extra_phrase_tokenizer_returns_same_word_tokens_as_index_tokenizer(self): + text = 'This [[1]] is a test.' 
+ ep_tokens = [t for t in extra_phrase_tokenizer(text) if t not in ('[[', ']]')] + idx_tokens = list(index_tokenizer(text)) + assert ep_tokens == idx_tokens + + def test_get_extra_phrase_spans_ignores_unclosed_opening_bracket(self): + text = 'Neither the name of [[3 nor the names of its' + assert get_extra_phrase_spans(text) == [] + + def test_get_extra_phrase_spans_ignores_unopened_closing_bracket(self): + text = 'Neither the name of 3]] nor the names of its' + assert get_extra_phrase_spans(text) == [] + + class TestNgrams(FileBasedTesting): test_data_dir = TEST_DATA_DIR From b784806f2a8118de4c8cf318fa1e6499fe9f3e63 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Thu, 19 Jun 2025 17:33:44 +0530 Subject: [PATCH 03/14] fix test failure Signed-off-by: Alok Kumar --- src/licensedcode/models.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index 47f0238660e..c6e77fbbf01 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -2319,9 +2319,6 @@ def load_data(self, rule_file): except Exception: trace = traceback.format_exc() raise InvalidRule(f'While loading: file://{rule_file}\n{trace}') - - # remove extra_phrase marker from rules - self.text = remove_extra_phrase(self.text) return self From 88196da55378d18d7075c58cb79d2b13c5c55efc Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Thu, 19 Jun 2025 22:37:59 +0530 Subject: [PATCH 04/14] remove extra_phrase marker from rules due to `extra_phrase` in rules, this shows that rules containing `extra-words` Signed-off-by: Alok Kumar --- src/licensedcode/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index c6e77fbbf01..062dc6c4060 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -2447,7 +2447,9 @@ def dump(self, rules_data_dir, **kwargs): # other rule metadata, like debugging collection of required phrases if kwargs: 
metadata.update(kwargs) - content = self.text + + # remove extra_phrase marker + content = remove_extra_phrase(self.text) output = dumps_frontmatter(content=content, metadata=metadata) with open(rule_file, 'w') as of: of.write(output) From f80963c73f32f6bafe3ac71c9d3e3cfea0249068 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Sun, 22 Jun 2025 16:46:55 +0530 Subject: [PATCH 05/14] fix minor issues Signed-off-by: Alok Kumar --- src/licensedcode/models.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index 062dc6c4060..f78492b33cf 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -2338,7 +2338,6 @@ def tokens(self): self.text = remove_extra_phrase(self.text) text = self.text - # We tag this rule as being a bare URL if it starts with a scheme and is # on one line: this is used to determine a matching approach @@ -2447,9 +2446,7 @@ def dump(self, rules_data_dir, **kwargs): # other rule metadata, like debugging collection of required phrases if kwargs: metadata.update(kwargs) - - # remove extra_phrase marker - content = remove_extra_phrase(self.text) + content = self.text output = dumps_frontmatter(content=content, metadata=metadata) with open(rule_file, 'w') as of: of.write(output) From 6ad990e604db8001dbb9377bad21b6947f30a709 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Mon, 23 Jun 2025 19:59:52 +0530 Subject: [PATCH 06/14] add test for `3-seq` Signed-off-by: Alok Kumar --- ...an-extra-words-3-seq-license.expected.json | 138 +++++++++--------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json index c078b0be9f0..685252b79d3 100644 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json +++ 
b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json @@ -1,70 +1,70 @@ -{ - "license_detections": [ - { - "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e", - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "detection_count": 1, - "detection_log": [ - "extra-words" - ], - "reference_matches": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "from_file": "scan-extra-words-3-seq-license/LICENSE", - "start_line": 1, - "end_line": 31, - "matcher": "3-seq", - "score": 92.67, - "matched_length": 215, - "match_coverage": 100.0, - "rule_relevance": 100, - "rule_identifier": "bsd-new_578.RULE", - "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", - "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", - "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." - } - ] - } - ], - "files": [ - { - "path": "LICENSE", - "type": "file", - "detected_license_expression": "bsd-new", - "detected_license_expression_spdx": "BSD-3-Clause", - "license_detections": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "matches": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "from_file": "scan-extra-words-3-seq-license/LICENSE", - "start_line": 1, - "end_line": 31, - "matcher": "3-seq", - "score": 92.67, - "matched_length": 215, - "match_coverage": 100.0, - "rule_relevance": 100, - "rule_identifier": "bsd-new_578.RULE", - "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", - "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to 
endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", - "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE 
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." - } - ], - "detection_log": [ - "extra-words" - ], - "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e" - } - ], - "license_clues": [], - "percentage_of_license_text": 92.67, - "scan_errors": [] - } - ] +{ + "license_detections": [ + { + "identifier": "bsd_new-f327c6f9-6086-8bd5-22c6-3aee3b99acf2", + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "detection_count": 1, + "detection_log": [ + "extra-words-permitted-in-rule" + ], + "reference_matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "scan-extra-words-3-seq-license/LICENSE", + "start_line": 1, + "end_line": 31, + "matcher": "3-seq", + "score": 100, + "matched_length": 215, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_578.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", + "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list 
of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS 
PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ] + } + ], + "files": [ + { + "path": "LICENSE", + "type": "file", + "detected_license_expression": "bsd-new", + "detected_license_expression_spdx": "BSD-3-Clause", + "license_detections": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "scan-extra-words-3-seq-license/LICENSE", + "start_line": 1, + "end_line": 31, + "matcher": "3-seq", + "score": 100, + "matched_length": 215, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_578.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", + "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, 
this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS 
SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ], + "detection_log": [ + "extra-words-permitted-in-rule" + ], + "identifier": "bsd_new-f327c6f9-6086-8bd5-22c6-3aee3b99acf2" + } + ], + "license_clues": [], + "percentage_of_license_text": 92.67, + "scan_errors": [] + } + ] } \ No newline at end of file From 5f090e340c519166b2fd692bc50092c184dc6fbd Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Tue, 24 Jun 2025 11:56:21 +0530 Subject: [PATCH 07/14] improve detection and handling of `extra-words` and add test for `3-seq` add a new `extra-phrase` for a rule i.e bsd-new Signed-off-by: Alok Kumar --- src/licensedcode/data/rules/bsd-new_578.RULE | 6 +- .../data/rules/bsd-new_newlib3.RULE | 2 +- src/licensedcode/detection.py | 33 +++++---- src/licensedcode/match.py | 14 ++-- ...an-extra-words-2-aho-license.expected.json | 4 +- ...an-extra-words-3-seq-license.expected.json | 70 ------------------- .../{ => with-copyrights}/LICENSE | 60 ++++++++-------- .../without-copyrights/LICENSE | 32 +++++++++ ...ords-with-copyrights-license.expected.json | 70 +++++++++++++++++++ ...s-without-copyrights-license.expected.json | 70 +++++++++++++++++++ .../test_plugin_license_detection.py | 22 +++++- 11 files changed, 254 insertions(+), 129 deletions(-) delete mode 100644 
tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json rename tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/{ => with-copyrights}/LICENSE (98%) create mode 100644 tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights/LICENSE create mode 100644 tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json create mode 100644 tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json diff --git a/src/licensedcode/data/rules/bsd-new_578.RULE b/src/licensedcode/data/rules/bsd-new_578.RULE index ede4c9400b4..baa5f6d9fdc 100644 --- a/src/licensedcode/data/rules/bsd-new_578.RULE +++ b/src/licensedcode/data/rules/bsd-new_578.RULE @@ -6,9 +6,7 @@ minimum_coverage: 99 Software License Agreement (BSD License) -[[15]] - -Redistribution and use [[4]] in source and binary forms, with or without +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -18,7 +16,7 @@ are met: copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of [[6]] nor the names of its + * Neither the name of nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
diff --git a/src/licensedcode/data/rules/bsd-new_newlib3.RULE b/src/licensedcode/data/rules/bsd-new_newlib3.RULE index 997f5537439..1de2ceccdcf 100644 --- a/src/licensedcode/data/rules/bsd-new_newlib3.RULE +++ b/src/licensedcode/data/rules/bsd-new_newlib3.RULE @@ -11,7 +11,7 @@ are permitted provided that the following conditions are met: * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of the University nor the names of its contributors + * Neither the name of the [[3]] University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py index 47410aba095..36b42735de3 100644 --- a/src/licensedcode/detection.py +++ b/src/licensedcode/detection.py @@ -408,8 +408,12 @@ def score(self): by the length of a match to the overall detection length. """ length = self.length - weighted_scores = (m.score() * (m.len() / length) for m in self.matches) - return min([round(sum(weighted_scores), 2), 100]) + for m in self.matches: + # Check whether extra words in the matched text appear in allowed positions, + # and do not exceed the maximum allowed word count at those positions. + score = 100 if is_extra_words_position_valid(m) else m.score() + weighted_scores += score * (m.len() / length) + return min([round(weighted_scores, 2), 100]) def append( self, @@ -1165,13 +1169,18 @@ def has_low_rule_relevance(license_matches): def is_extra_words_at_valid_positions(license_matches): """ - Return True if any of the matches in ``license_matches`` List of LicenseMatch + Return True if all the matches in `license_matches List of LicenseMatch has extra words are in the correct place. 
""" - return any( - is_extra_words_position_valid(license_match) - for license_match in license_matches - ) + for match in license_matches: + # check when we have `extra-words` detection + # if `query_coverage_coefficient` is positive number then 'extra-words` exit + if calculate_query_coverage_coefficient(match) > 0: + if not is_extra_words_position_valid(match): + return False + + # at the end return True if all matches have no extra-wors or this extra-words are in the right place + return True def is_false_positive(license_matches, package_license=False): """ @@ -1827,14 +1836,14 @@ def analyze_detection(license_matches, package_license=False): threshold=IMPERFECT_MATCH_COVERAGE_THR, ): return DetectionCategory.IMPERFECT_COVERAGE.value - - # Case where `extra-words` are in the right place - elif is_extra_words_at_valid_positions(license_matches=license_matches): - return DetectionCategory.EXTRA_WORDS_PERMITTED.value # Case where at least one of the match have extra words elif has_extra_words(license_matches=license_matches): - return DetectionCategory.EXTRA_WORDS.value + # Case where `extra-words` are in the right place + if is_extra_words_at_valid_positions(license_matches=license_matches): + return DetectionCategory.EXTRA_WORDS_PERMITTED.value + else: + return DetectionCategory.EXTRA_WORDS.value # Cases where Match Coverage is a perfect 100 for all matches else: diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 2e2b27bdde9..5058a7c45ae 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -597,13 +597,7 @@ def score(self): computed from the number of matched tokens, the number of query tokens in the matched range (including unknowns and unmatched) and the matched rule relevance. - """ - - # Check whether extra words in the matched text appear in allowed positions, - # and do not exceed the maximum allowed word count at those positions. 
- if is_extra_words_position_valid(match=self): - return 100 - + """ # relevance is a number between 0 and 100. Divide by 100 relevance = self.rule.relevance / 100 if not relevance: @@ -832,7 +826,11 @@ def to_dict( result['start_line'] = self.start_line result['end_line'] = self.end_line result['matcher'] = self.matcher - result['score'] = self.score() + # update score if `extra-words` are in right place + if(is_extra_words_position_valid(match=self)): + result['score'] = 100 + else: + result['score'] = self.score() result['matched_length'] = self.len() result['match_coverage'] = self.coverage() result['rule_relevance'] = self.rule.relevance diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json index 469b7340eee..5a4bab7dc4b 100644 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json @@ -1,7 +1,7 @@ { "license_detections": [ { - "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e", + "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50", "license_expression": "bsd-new", "license_expression_spdx": "BSD-3-Clause", "detection_count": 1, @@ -59,7 +59,7 @@ "detection_log": [ "extra-words-permitted-in-rule" ], - "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e" + "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50" } ], "license_clues": [], diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json deleted file mode 100644 index 685252b79d3..00000000000 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - 
"license_detections": [ - { - "identifier": "bsd_new-f327c6f9-6086-8bd5-22c6-3aee3b99acf2", - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "detection_count": 1, - "detection_log": [ - "extra-words-permitted-in-rule" - ], - "reference_matches": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "from_file": "scan-extra-words-3-seq-license/LICENSE", - "start_line": 1, - "end_line": 31, - "matcher": "3-seq", - "score": 100, - "matched_length": 215, - "match_coverage": 100.0, - "rule_relevance": 100, - "rule_identifier": "bsd-new_578.RULE", - "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", - "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", - "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." - } - ] - } - ], - "files": [ - { - "path": "LICENSE", - "type": "file", - "detected_license_expression": "bsd-new", - "detected_license_expression_spdx": "BSD-3-Clause", - "license_detections": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "matches": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "from_file": "scan-extra-words-3-seq-license/LICENSE", - "start_line": 1, - "end_line": 31, - "matcher": "3-seq", - "score": 100, - "matched_length": 215, - "match_coverage": 100.0, - "rule_relevance": 100, - "rule_identifier": "bsd-new_578.RULE", - "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", - "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to 
endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", - "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE 
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." - } - ], - "detection_log": [ - "extra-words-permitted-in-rule" - ], - "identifier": "bsd_new-f327c6f9-6086-8bd5-22c6-3aee3b99acf2" - } - ], - "license_clues": [], - "percentage_of_license_text": 92.67, - "scan_errors": [] - } - ] -} \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/LICENSE b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/with-copyrights/LICENSE similarity index 98% rename from tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/LICENSE rename to tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/with-copyrights/LICENSE index 4e7146ed78a..61c97f0f827 100644 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/LICENSE +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/with-copyrights/LICENSE @@ -1,31 +1,31 @@ -Software License Agreement (BSD License) - -Copyright (c) 2009-2015, Kevin Decker - -All rights reserved. - -Redistribution and use of this software in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above - copyright notice, this list of conditions and the - following disclaimer. 
- -* Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the - following disclaimer in the documentation and/or other - materials provided with the distribution. - -* Neither the name of Kevin Decker nor the names of its - contributors may be used to endorse or promote products - derived from this software without specific prior - written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR -IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND -FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +Software License Agreement (BSD License) + +Copyright (c) 2009-2015, Kevin Decker + +All rights reserved. + +Redistribution and use of this software in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the + following disclaimer in the documentation and/or other + materials provided with the distribution. + +* Neither the name of Kevin Decker nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior + written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights/LICENSE b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights/LICENSE new file mode 100644 index 00000000000..2a99b023de8 --- /dev/null +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights/LICENSE @@ -0,0 +1,32 @@ +Copyright (c) 2007, 2008, 2009 Mutsuo Saito, Makoto Matsumoto +and Hiroshima University. +Copyright (c) 2011, 2002 Mutsuo Saito, Makoto Matsumoto, Hiroshima +University and The University of Tokyo. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ * Neither the name of the Hiroshima University nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json new file mode 100644 index 00000000000..a3d49377824 --- /dev/null +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json @@ -0,0 +1,70 @@ +{ + "license_detections": [ + { + "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e", + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "detection_count": 1, + "detection_log": [ + "extra-words" + ], + "reference_matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "with-copyrights/LICENSE", + "start_line": 1, + "end_line": 31, + "matcher": "3-seq", + "score": 92.67, + "matched_length": 215, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": 
"bsd-new_578.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", + "matched_text": "Software License Agreement (BSD License)\r\n\r\nCopyright (c) 2009-2015, Kevin Decker \r\n\r\nAll rights reserved.\r\n\r\nRedistribution and use of this software in source and binary forms, with or without modification,\r\nare permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer in the documentation and/or other\r\n materials provided with the distribution.\r\n\r\n* Neither the name of Kevin Decker nor the names of its\r\n contributors may be used to endorse or promote products\r\n derived from this software without specific prior\r\n written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\r\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\r\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\r\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\r\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Software License Agreement (BSD License)\r\n\r\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\r\n\r\n[All] [rights] [reserved].\r\n\r\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\r\nare permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer in the documentation and/or other\r\n materials provided with the distribution.\r\n\r\n* Neither the name of [Kevin] [Decker] nor the names of its\r\n contributors may be used to endorse or promote products\r\n derived from this software without specific prior\r\n written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\r\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\r\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\r\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\r\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ] + } + ], + "files": [ + { + "path": "LICENSE", + "type": "file", + "detected_license_expression": "bsd-new", + "detected_license_expression_spdx": "BSD-3-Clause", + "license_detections": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "with-copyrights/LICENSE", + "start_line": 1, + "end_line": 31, + "matcher": "3-seq", + "score": 92.67, + "matched_length": 215, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_578.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", + "matched_text": "Software License Agreement (BSD License)\r\n\r\nCopyright (c) 2009-2015, Kevin Decker \r\n\r\nAll rights reserved.\r\n\r\nRedistribution and use of this software in source and binary forms, with or without modification,\r\nare permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer in the documentation and/or other\r\n materials provided with the distribution.\r\n\r\n* Neither the name of Kevin Decker nor the names of its\r\n 
contributors may be used to endorse or promote products\r\n derived from this software without specific prior\r\n written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\r\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\r\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\r\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\r\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Software License Agreement (BSD License)\r\n\r\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\r\n\r\n[All] [rights] [reserved].\r\n\r\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\r\nare permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer in the documentation and/or other\r\n materials provided with the distribution.\r\n\r\n* Neither the name of [Kevin] [Decker] nor the names of its\r\n contributors may be used to endorse or promote products\r\n derived from this software without specific prior\r\n written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\r\nIMPLIED WARRANTIES, INCLUDING, BUT 
NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\r\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\r\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\r\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ], + "detection_log": [ + "extra-words" + ], + "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e" + } + ], + "license_clues": [], + "percentage_of_license_text": 92.67, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json new file mode 100644 index 00000000000..062fe08931a --- /dev/null +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json @@ -0,0 +1,70 @@ +{ + "license_detections": [ + { + "identifier": "bsd_new-4b08a4bf-cc63-bee9-d78c-bec80b3f58f4", + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "detection_count": 1, + "detection_log": [ + "extra-words-permitted-in-rule" + ], + "reference_matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "without-copyrights/LICENSE", + "start_line": 7, + "end_line": 32, + "matcher": "3-seq", + "score": 100, + "matched_length": 212, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_newlib3.RULE", + "rule_url": 
"https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_newlib3.RULE", + "matched_text": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are\r\nmet:\r\n\r\n * Redistributions of source code must retain the above copyright\r\n notice, this list of conditions and the following disclaimer.\r\n * Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the following\r\n disclaimer in the documentation and/or other materials provided\r\n with the distribution.\r\n * Neither the name of the Hiroshima University nor the names of\r\n its contributors may be used to endorse or promote products\r\n derived from this software without specific prior written\r\n permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\r\nOWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r\nLIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r\nTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are\r\nmet:\r\n\r\n * Redistributions of source code must retain the above copyright\r\n notice, this list of conditions and the following disclaimer.\r\n * Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the following\r\n disclaimer in the documentation and/or other materials provided\r\n with the distribution.\r\n * Neither the name of the [Hiroshima] University nor the names of\r\n its contributors may be used to endorse or promote products\r\n derived from this software without specific prior written\r\n permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\r\nOWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r\nLIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r\nTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ] + } + ], + "files": [ + { + "path": "LICENSE", + "type": "file", + "detected_license_expression": "bsd-new", + "detected_license_expression_spdx": "BSD-3-Clause", + "license_detections": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "without-copyrights/LICENSE", + "start_line": 7, + "end_line": 32, + "matcher": "3-seq", + "score": 100, + "matched_length": 212, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_newlib3.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_newlib3.RULE", + "matched_text": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are\r\nmet:\r\n\r\n * Redistributions of source code must retain the above copyright\r\n notice, this list of conditions and the following disclaimer.\r\n * Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the following\r\n disclaimer in the documentation and/or other materials provided\r\n with the distribution.\r\n * Neither the name of the Hiroshima University nor the names of\r\n its contributors may be used to endorse or promote products\r\n derived from this software without specific prior 
written\r\n permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r\nA PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r\nOWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r\nLIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r\nTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are\r\nmet:\r\n\r\n * Redistributions of source code must retain the above copyright\r\n notice, this list of conditions and the following disclaimer.\r\n * Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the following\r\n disclaimer in the documentation and/or other materials provided\r\n with the distribution.\r\n * Neither the name of the [Hiroshima] University nor the names of\r\n its contributors may be used to endorse or promote products\r\n derived from this software without specific prior written\r\n permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\r\nOWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r\nLIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r\nTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ], + "detection_log": [ + "extra-words-permitted-in-rule" + ], + "identifier": "bsd_new-4b08a4bf-cc63-bee9-d78c-bec80b3f58f4" + } + ], + "license_clues": [], + "percentage_of_license_text": 87.24, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/licensedcode/test_plugin_license_detection.py b/tests/licensedcode/test_plugin_license_detection.py index ff249bc9c58..da98f5b8da6 100644 --- a/tests/licensedcode/test_plugin_license_detection.py +++ b/tests/licensedcode/test_plugin_license_detection.py @@ -113,7 +113,7 @@ def test_license_match_unknown_clues_is_not_in_expression(): def test_license_match_extra_words_3_seq(): - test_dir = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-3-seq-license/') + test_dir = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-3-seq-license/with-copyrights') result_file = test_env.get_temp_file('json') args = [ '--license', @@ -126,7 +126,25 @@ def test_license_match_extra_words_3_seq(): test_dir, ] run_scan_click(args) - test_loc = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json') + test_loc = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json') + check_json_scan(test_loc, result_file, regen=REGEN_TEST_FIXTURES) + + +def test_license_match_extra_words_3_seq_without_copyrights(): + test_dir = 
test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights') + result_file = test_env.get_temp_file('json') + args = [ + '--license', + '--license-text', + '--license-text-diagnostics', + '--license-diagnostics', + '--strip-root', + '--verbose', + '--json', result_file, + test_dir, + ] + run_scan_click(args) + test_loc = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json') check_json_scan(test_loc, result_file, regen=REGEN_TEST_FIXTURES) From bf3417d5533c6c6c5bb255ebb1087dc170cd31f7 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Tue, 24 Jun 2025 13:13:27 +0530 Subject: [PATCH 08/14] fix test failure Signed-off-by: Alok Kumar --- src/licensedcode/match.py | 13 +++++++++---- tests/licensedcode/test_match.py | 23 ----------------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 5058a7c45ae..55b4aff1c88 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -1081,11 +1081,16 @@ def is_extra_words_position_valid(match): do not exceed the maximum allowed word count at those positions. Otherwise, return False. """ - - rule_spans = match.ispan.subspans() + # Find `query_coverage_coefficient` such that match have `extra-words` or not + score_coverage_relevance = ( + match.coverage() * match.rule.relevance + ) / 100 + + # Calculate the query coverage coefficient + query_coverage_coefficient = score_coverage_relevance - match.score() - # If there are multiple subspans, it means not all required tokens are contiguous. 
- if len(rule_spans) > 1: + # Return False if the match has no extra words + if query_coverage_coefficient == 0: return False matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False))) diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py index 3302e1a57bc..a1f8c306145 100644 --- a/tests/licensedcode/test_match.py +++ b/tests/licensedcode/test_match.py @@ -1482,29 +1482,6 @@ def test_LicenseMatch_score_100_non_contiguous(self): m1 = LicenseMatch(rule=r1, qspan=Span(0, 19) | Span(30, 51), ispan=Span(0, 41)) assert m1.score() == 80.77 - def test_LicenseMatch_matches_score_100_for_extra_words_within_limit(self): - rule_text = 'Neither the name of [[3]] nor the names of its' - rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text) - idx = index.LicenseIndex([rule]) - - query = 'Neither the name of XXX YYY ZZZ nor the names of its' - matches = idx.match(query_string=query, _skip_hash_match=True) - match = matches[0] - score = match.score() - assert score == 100 - - def test_LicenseMatch_matches_score_not_100_for_extra_words_exceed_limit(self): - rule_text = 'Neither the name of [[3]] nor the names of its' - rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text) - idx = index.LicenseIndex([rule]) - - # The query includes 4 extra words instead of the allowed 3. 
- query = 'Neither the name of XXX YYY ZZZ AAA nor the names of its' - matches = idx.match(query_string=query, _skip_hash_match=True) - match = matches[0] - score = match.score() - assert score != 100 - def test_LicenseMatch_stopwords_are_treated_as_unknown_2484(self): rules_dir = self.get_test_loc('stopwords/index/rules') lics_dir = self.get_test_loc('stopwords/index/licenses') From 61284f4bc7aca9c28a08679f3b659a4990e501a6 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Tue, 1 Jul 2025 17:00:24 +0530 Subject: [PATCH 09/14] remove `extra-phrase` marker from `index_tokenizer_with_stopwords` Signed-off-by: Alok Kumar --- src/licensedcode/data/rules/bsd-new_578.RULE | 2 +- src/licensedcode/match.py | 2 +- src/licensedcode/tokenize.py | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/licensedcode/data/rules/bsd-new_578.RULE b/src/licensedcode/data/rules/bsd-new_578.RULE index baa5f6d9fdc..99f1aad110e 100644 --- a/src/licensedcode/data/rules/bsd-new_578.RULE +++ b/src/licensedcode/data/rules/bsd-new_578.RULE @@ -16,7 +16,7 @@ are met: copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of nor the names of its + * Neither the name of nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 55b4aff1c88..5191da9a3da 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -597,7 +597,7 @@ def score(self): computed from the number of matched tokens, the number of query tokens in the matched range (including unknowns and unmatched) and the matched rule relevance. - """ + """ # relevance is a number between 0 and 100. 
Divide by 100 relevance = self.rule.relevance / 100 if not relevance: diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py index dd7ad3356b1..918dad79132 100644 --- a/src/licensedcode/tokenize.py +++ b/src/licensedcode/tokenize.py @@ -85,6 +85,9 @@ def query_lines( extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])' extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall + +extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]') + REQUIRED_PHRASE_OPEN = '{{' REQUIRED_PHRASE_CLOSE = '}}' @@ -348,6 +351,8 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS): """ if not text: return [], {} + + text = extra_phrase_removal_pattern.sub('', text) tokens = [] tokens_append = tokens.append From 490a0816b60e41aba99fb0ce3d4c1c261d4eec46 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Tue, 1 Jul 2025 19:09:19 +0530 Subject: [PATCH 10/14] get only `extra-phrase-spans` Signed-off-by: Alok Kumar --- src/licensedcode/match.py | 9 ++++----- src/licensedcode/models.py | 11 ----------- src/licensedcode/tokenize.py | 4 ---- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 5191da9a3da..6f1613c1522 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -1106,15 +1106,14 @@ def is_extra_words_position_valid(match): # Count of extra phrase markers extra_phrase_count = 0 - for span, allowed_extra_words in extra_phrase_spans: - rule_index = span.start - extra_phrase_count - 1 - allowed_extra_words = allowed_extra_words + for span, allowed_extra_word in extra_phrase_spans: + rule_index = span.start matched_index = span.start + matched_count - extra_phrase_count extra_words_count = 0 # return false if token before `extra-words` in `matched_token` is not same as token before `extra-phrases` in `rule_tokens` - if(matched_tokens[matched_index-1] != rule_tokens[rule_index]): + if(matched_tokens[matched_index-1] != 
rule_tokens[rule_index-1]): return False # Count how many tokens in `matched_text` do not match the next rule token @@ -1124,7 +1123,7 @@ def is_extra_words_position_valid(match): matched_count += 1 extra_words_count += 1 - if extra_words_count > allowed_extra_words: + if extra_words_count > allowed_extra_word: return False extra_phrase_count += 1 diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index f78492b33cf..3153037f382 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -7,7 +7,6 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -import re import os import sys import traceback @@ -2333,9 +2332,6 @@ def tokens(self): # identify and capture the spans of extra phrases specified within the rule self.extra_phrase_spans = list(self.extra_phrases()) - - # remove extra_phrase marker from rules - self.text = remove_extra_phrase(self.text) text = self.text # We tag this rule as being a bare URL if it starts with a scheme and is @@ -2600,13 +2596,6 @@ def from_match_data(license_match_mapping): return get_index().rules_by_id[rule_identifier] -def remove_extra_phrase(text): - """ - Remove extra phrase markers like [[n]], where the n is a digit. - """ - pattern = r'\[\[\d+\]\]' - return re.sub(pattern, '', text) - def compute_relevance(length): """ Return a computed ``relevance`` given a ``length`` and a threshold. 
diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py index 918dad79132..69dc6364638 100644 --- a/src/licensedcode/tokenize.py +++ b/src/licensedcode/tokenize.py @@ -86,8 +86,6 @@ def query_lines( extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall -extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]') - REQUIRED_PHRASE_OPEN = '{{' REQUIRED_PHRASE_CLOSE = '}}' @@ -351,8 +349,6 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS): """ if not text: return [], {} - - text = extra_phrase_removal_pattern.sub('', text) tokens = [] tokens_append = tokens.append From 060fd63892ad65f1d5cc46a7027125db0c662dea Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Wed, 2 Jul 2025 15:39:23 +0530 Subject: [PATCH 11/14] add tests for `extra-phrase` removal in `index_tokenizer_with_stopwords` Signed-off-by: Alok Kumar --- src/licensedcode/tokenize.py | 7 ++++- tests/licensedcode/test_tokenize.py | 45 ++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py index 69dc6364638..1bc7c592cf3 100644 --- a/src/licensedcode/tokenize.py +++ b/src/licensedcode/tokenize.py @@ -82,10 +82,13 @@ def query_lines( required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall -extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])' +extra_phrase_pattern = '(?:' + query_pattern + r'|\[\[|\]\])' extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall +# pattern to match and remove extra phrases like [[1]], [[4]]..etc from the text +extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]') + REQUIRED_PHRASE_OPEN = '{{' REQUIRED_PHRASE_CLOSE = '}}' @@ -349,6 +352,8 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS): """ if not text: return [], {} + + text = extra_phrase_removal_pattern.sub('', text) tokens = [] tokens_append = tokens.append diff --git a/tests/licensedcode/test_tokenize.py 
b/tests/licensedcode/test_tokenize.py index 2639105094f..06cb446e7f2 100644 --- a/tests/licensedcode/test_tokenize.py +++ b/tests/licensedcode/test_tokenize.py @@ -21,6 +21,7 @@ from licensedcode.tokenize import get_existing_required_phrase_spans from licensedcode.tokenize import get_extra_phrase_spans from licensedcode.tokenize import index_tokenizer +from licensedcode.tokenize import index_tokenizer_with_stopwords from licensedcode.tokenize import InvalidRuleRequiredPhrase from licensedcode.tokenize import matched_query_text_tokenizer from licensedcode.tokenize import ngrams @@ -640,7 +641,49 @@ def test_get_extra_phrase_spans_ignores_unclosed_opening_bracket(self): def test_get_extra_phrase_spans_ignores_unopened_closing_bracket(self): text = 'Neither the name of 3]] nor the names of its' - assert get_extra_phrase_spans(text) == [] + assert get_extra_phrase_spans(text) == [] + + +class TestIndexTokenizerWithStopwords(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_index_tokenizer_with_stopwords_empty_input(self): + toks, stops = index_tokenizer_with_stopwords('') + assert toks == [] + assert stops == {} + + def test_index_tokenizer_with_stopwords_removes_extra_phrase(self): + text = 'Neither the name of [[3]] nor the names of its' + toks, stops = index_tokenizer_with_stopwords(text) + assert toks == ['neither', 'the', 'name', 'of', 'nor', 'the', 'names', 'of', 'its'] + assert stops == {} + + def test_index_tokenizer_with_stopwords_removes_curly_phrase(self): + text = '{{Hi}}some {{}}Text with{{junk}}spAces!' 
+ toks, stops = index_tokenizer_with_stopwords(text) + assert toks == ['hi', 'some', 'text', 'with', 'junk', 'spaces'] + assert stops == {} + + def test_index_tokenizer_with_custom_stopwords(self): + stops_set = set(['is', 'a']) + text = 'This is a test' + toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set) + assert toks == ['this', 'test'] + assert stops == {0: 2} + + def test_index_tokenizer_with_leading_stopwords(self): + stops_set = set(['is', 'a', 'the']) + text = 'The is a test with result' + toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set) + assert toks == ['test', 'with', 'result'] + assert stops == {-1: 3} + + def test_index_tokenizer_with_embedded_stopwords_after_position(self): + stops_set = set(['markup', 'lt', 'gt', 'quot']) + text = 'some "< markup >"' + toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set) + assert toks == ['some'] + assert stops == {0: 5} class TestNgrams(FileBasedTesting): From 68ab63d9ccc1a59dbed4c74e8cdcaddfe6c6e685 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Wed, 2 Jul 2025 21:38:43 +0530 Subject: [PATCH 12/14] add test for `3-seq` and `extra-words-spans` Signed-off-by: Alok Kumar --- src/licensedcode/detection.py | 13 ++++++++++++- tests/licensedcode/test_detection_validate.py | 3 ++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py index 36b42735de3..af95860ee55 100644 --- a/src/licensedcode/detection.py +++ b/src/licensedcode/detection.py @@ -1095,7 +1095,7 @@ def is_correct_detection(license_matches): ] return ( - all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers) + all(matcher in ("1-hash", "1-spdx-id", "2-aho", "3-seq") for matcher in matchers) and all(is_match_coverage_perfect) ) @@ -1156,6 +1156,17 @@ def has_extra_words(license_matches): ) +def has_extra_words_spans(license_matches): + """ + Return True if all of the matches rules in ``license_matches`` (a 
list of LicenseMatch) + has `extra_phrase` marker and also have matcher `3-seq`. + """ + return all( + match.matcher == '3-seq' and match.rule.extra_phrase_spans + for match in license_matches + ) + + def has_low_rule_relevance(license_matches): """ Return True if all on the matches in ``license_matches`` List of LicenseMatch diff --git a/tests/licensedcode/test_detection_validate.py b/tests/licensedcode/test_detection_validate.py index 5b393f9b4dc..4849967daa9 100644 --- a/tests/licensedcode/test_detection_validate.py +++ b/tests/licensedcode/test_detection_validate.py @@ -18,6 +18,7 @@ from licensedcode import cache from licensedcode import models from licensedcode.detection import is_correct_detection +from licensedcode.detection import has_extra_words_spans from licensedcode.models import licenses_data_dir from licensedcode.models import rules_data_dir from licensedcode.models import License @@ -99,7 +100,7 @@ def check_rule_or_license_can_be_detected_exactly(licensish): assert results == expected icm = is_correct_detection(matches) - if not icm: + if not icm and not has_extra_words_spans(matches): expected.append(f'file://{licensish.rule_file()}') assert results == expected From 5b933c0144afe424bc0e3d2b0401883089da59c4 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Wed, 2 Jul 2025 23:02:23 +0530 Subject: [PATCH 13/14] improve and add more test for `is_extra_words_position_valid` Signed-off-by: Alok Kumar --- src/licensedcode/detection.py | 11 ----------- src/licensedcode/match.py | 14 ++++++++++++++ tests/licensedcode/test_detection_validate.py | 3 +-- tests/licensedcode/test_match.py | 16 ++++++++++++++++ 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py index af95860ee55..3949f87fd74 100644 --- a/src/licensedcode/detection.py +++ b/src/licensedcode/detection.py @@ -1156,17 +1156,6 @@ def has_extra_words(license_matches): ) -def has_extra_words_spans(license_matches): - """ - Return 
True if all of the matches rules in ``license_matches`` (a list of LicenseMatch) - has `extra_phrase` marker and also have matcher `3-seq`. - """ - return all( - match.matcher == '3-seq' and match.rule.extra_phrase_spans - for match in license_matches - ) - - def has_low_rule_relevance(license_matches): """ Return True if all on the matches in ``license_matches`` List of LicenseMatch diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 6f1613c1522..0997e334773 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -1106,6 +1106,9 @@ def is_extra_words_position_valid(match): # Count of extra phrase markers extra_phrase_count = 0 + rule_index = 0 + matched_index = 0 + for span, allowed_extra_word in extra_phrase_spans: rule_index = span.start @@ -1128,6 +1131,17 @@ def is_extra_words_position_valid(match): extra_phrase_count += 1 + rule_index+=1 + + # check if any `extra-words` is present and return False because this `extra-words` are not at marked place + while (matched_index < len(matched_tokens) and + matched_tokens[matched_index] == rule_tokens[rule_index]): + matched_index+=1 + rule_index+=1 + + if matched_index != len(matched_tokens): + return False + return True diff --git a/tests/licensedcode/test_detection_validate.py b/tests/licensedcode/test_detection_validate.py index 4849967daa9..5b393f9b4dc 100644 --- a/tests/licensedcode/test_detection_validate.py +++ b/tests/licensedcode/test_detection_validate.py @@ -18,7 +18,6 @@ from licensedcode import cache from licensedcode import models from licensedcode.detection import is_correct_detection -from licensedcode.detection import has_extra_words_spans from licensedcode.models import licenses_data_dir from licensedcode.models import rules_data_dir from licensedcode.models import License @@ -100,7 +99,7 @@ def check_rule_or_license_can_be_detected_exactly(licensish): assert results == expected icm = is_correct_detection(matches) - if not icm and not 
has_extra_words_spans(matches): + if not icm: expected.append(f'file://{licensish.rule_file()}') assert results == expected diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py index a1f8c306145..fdf32086464 100644 --- a/tests/licensedcode/test_match.py +++ b/tests/licensedcode/test_match.py @@ -1421,6 +1421,22 @@ def test_exact_match_without_extra_markers(self): match = idx.match(query_string=query, _skip_hash_match=True)[0] assert is_extra_words_position_valid(match) is False + def test_extra_words_one_at_right_place_and_one_at_not_right_place(self): + rule_text = """ + Redistribution and use [[3]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of this software in source and binary extra-words forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + class TestLicenseMatchScore(FileBasedTesting): test_data_dir = TEST_DATA_DIR From 16d336463e80e652e575175ed727f0d191aa82a1 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Sun, 6 Jul 2025 18:42:50 +0530 Subject: [PATCH 14/14] add more tests for `is_extra_words_at_valid_positions` and improve detection Signed-off-by: Alok Kumar --- src/licensedcode/detection.py | 16 +--- src/licensedcode/match.py | 13 ++- tests/licensedcode/test_match.py | 139 ++++++++++++++++++++++++++++++- 3 files changed, 150 insertions(+), 18 deletions(-) diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py index 3949f87fd74..2bd5ca5f999 100644 --- a/src/licensedcode/detection.py +++ b/src/licensedcode/detection.py @@ -31,6 +31,7 @@ from licensedcode.match import LicenseMatch from licensedcode.match import set_matched_lines from licensedcode.match import is_extra_words_position_valid +from licensedcode.match import 
is_extra_words_at_valid_positions from licensedcode.models import compute_relevance from licensedcode.models import Rule from licensedcode.models import UnDetectedRule @@ -1167,21 +1168,6 @@ def has_low_rule_relevance(license_matches): ) -def is_extra_words_at_valid_positions(license_matches): - """ - Return True if all the matches in `license_matches List of LicenseMatch - has extra words are in the correct place. - """ - for match in license_matches: - # check when we have `extra-words` detection - # if `query_coverage_coefficient` is positive number then 'extra-words` exit - if calculate_query_coverage_coefficient(match) > 0: - if not is_extra_words_position_valid(match): - return False - - # at the end return True if all matches have no extra-wors or this extra-words are in the right place - return True - def is_false_positive(license_matches, package_license=False): """ Return True if all of the matches in ``license_matches`` List of LicenseMatch diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 0997e334773..ef6bcec226b 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -1133,18 +1133,27 @@ def is_extra_words_position_valid(match): rule_index+=1 - # check if any `extra-words` is present and return False because this `extra-words` are not at marked place + # check if any `extra-words` is present after checking all `extra-phrase-spans` in rules while (matched_index < len(matched_tokens) and matched_tokens[matched_index] == rule_tokens[rule_index]): matched_index+=1 rule_index+=1 - + + # some `extra-words` are found if matched_index != len(matched_tokens): return False return True +def is_extra_words_at_valid_positions(license_matches): + """ + Return True if at least one of the matches in `license_matches` has its `extra-words` + at the positions allowed by its rule.
+ """ + return any(is_extra_words_position_valid(match) for match in license_matches) + + def filter_contained_matches( matches, trace=TRACE_FILTER_CONTAINED, diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py index fdf32086464..ed95babffff 100644 --- a/tests/licensedcode/test_match.py +++ b/tests/licensedcode/test_match.py @@ -21,6 +21,7 @@ from licensedcode.match import get_full_matched_text from licensedcode.match import get_matching_regions from licensedcode.match import is_extra_words_position_valid +from licensedcode.match import is_extra_words_at_valid_positions from licensedcode.match import LicenseMatch from licensedcode.match import merge_matches from licensedcode.match import reportable_tokens @@ -1400,9 +1401,11 @@ def test_extra_words_at_wrong_position(self): idx = index.LicenseIndex([rule]) query = """ - Redistribution and amazing use in great source and binary forms are permitted. + Redistribution and amazing use in source and binary forms are permitted. """ + # here 'amazing' word are at wrong place match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False def test_exact_match_without_extra_markers(self): @@ -1437,6 +1440,140 @@ def test_extra_words_one_at_right_place_and_one_at_not_right_place(self): match = idx.match(query_string=query, _skip_hash_match=True)[0] assert is_extra_words_position_valid(match) is False + def test_extra_words_if_one_match_have_extra_words_at_right_place_and_another_match_have_no_extra_words(self): + r1_text = "Redistribution and use [[3]] in source and binary forms are permitted." 
+ r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "under the MIT license" + r2 = create_rule_from_text_and_expression( + license_expression='mit', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. + under the MIT license + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + assert is_extra_words_at_valid_positions(matches) is True + + def test_extra_words_if_one_match_have_extra_words_at_right_place_but_exceed_limit_and_another_match_have_no_extra_words(self): + r1_text = "Redistribution and use [[3]] in source and binary forms are permitted." + r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "under the MIT license" + r2 = create_rule_from_text_and_expression( + license_expression='mit', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use of this software AAA in source and binary forms are permitted. + under the MIT license + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + + # one match has `extra-words` at the marked position, but they exceed the limit: there are + # four `extra-words`, i.e. 'of', 'this', 'software', 'AAA' + assert is_extra_words_at_valid_positions(matches) is False + + def test_extra_words_if_all_match_have_no_extra_words(self): + r1_text = "Redistribution and use in source and binary forms are permitted." + r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "under the MIT license" + r2 = create_rule_from_text_and_expression( + license_expression='mit', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use in source and binary forms are permitted.
+ under the MIT license + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + + assert is_extra_words_at_valid_positions(matches) is False + + def test_extra_words_if_one_match_have_extra_words_at_right_place_and_another_match_at_wrong_place(self): + r1_text = "Redistribution and use [[3]] in source and binary forms are permitted." + r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "Neither the name of [[3]] nor the names of its" + r2 = create_rule_from_text_and_expression( + license_expression='extra-words2', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. + Neither the name of William Henry James nor the names of Harris its + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + + # the first match has all of its `extra-words` at the marked position; the second + # match has `extra-words` at the marked position plus one word, 'Harris', at a wrong + # position. `is_extra_words_at_valid_positions` still returns True because at least + # one match has all of its `extra-words` at the marked position + assert is_extra_words_at_valid_positions(matches) is True + + def test_extra_words_all_match_have_extra_words_at_right_place(self): + r1_text = "Redistribution and use [[3]] in source and binary forms are permitted." + r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "Neither the name of [[3]] nor the names of its" + r2 = create_rule_from_text_and_expression( + license_expression='extra-words2', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted.
+ Neither the name of William Henry James nor the names of its + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + assert is_extra_words_at_valid_positions(matches) is True + class TestLicenseMatchScore(FileBasedTesting): test_data_dir = TEST_DATA_DIR