diff --git a/src/licensedcode/data/rules/bsd-new_158.RULE b/src/licensedcode/data/rules/bsd-new_158.RULE index 90af8ee4a6..b083545777 100644 --- a/src/licensedcode/data/rules/bsd-new_158.RULE +++ b/src/licensedcode/data/rules/bsd-new_158.RULE @@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -Neither the name of nor the names of its +Neither the name of [[6]] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/src/licensedcode/data/rules/bsd-new_newlib3.RULE b/src/licensedcode/data/rules/bsd-new_newlib3.RULE index 997f553743..1de2ceccdc 100644 --- a/src/licensedcode/data/rules/bsd-new_newlib3.RULE +++ b/src/licensedcode/data/rules/bsd-new_newlib3.RULE @@ -11,7 +11,7 @@ are permitted provided that the following conditions are met: * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of the University nor the names of its contributors + * Neither the name of the [[3]] University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py index 34cbe582e6..2bd5ca5f99 100644 --- a/src/licensedcode/detection.py +++ b/src/licensedcode/detection.py @@ -30,6 +30,8 @@ from licensedcode.cache import get_licensing from licensedcode.match import LicenseMatch from licensedcode.match import set_matched_lines +from licensedcode.match import is_extra_words_position_valid +from licensedcode.match import is_extra_words_at_valid_positions from licensedcode.models import compute_relevance from licensedcode.models import Rule from licensedcode.models import UnDetectedRule @@ -110,6 +112,7 @@ class DetectionCategory(Enum): PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file' PACKAGE_ADD_FROM_FILE = 'from-package-file' EXTRA_WORDS = 'extra-words' + EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule' UNKNOWN_MATCH = 'unknown-match' LICENSE_CLUES = 'license-clues' LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches' @@ -129,6 +132,7 @@ class DetectionRule(Enum): """ UNKNOWN_MATCH = 'unknown-match' EXTRA_WORDS = 'extra-words' + EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule' LICENSE_CLUES = 'license-clues' LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches' IMPERFECT_COVERAGE = 'imperfect-match-coverage' @@ -405,8 +409,13 @@ def score(self): by the length of a match to the overall detection length. """ length = self.length - weighted_scores = (m.score() * (m.len() / length) for m in self.matches) - return min([round(sum(weighted_scores), 2), 100]) + weighted_scores = 0 + for m in self.matches: + # Check whether extra words in the matched text appear in allowed positions, + # and do not exceed the maximum allowed word count at those positions.
+ score = 100 if is_extra_words_position_valid(m) else m.score() + weighted_scores += score * (m.len() / length) + return min([round(weighted_scores, 2), 100]) def append( self, @@ -1072,6 +1081,7 @@ def is_correct_detection_non_unknown(license_matches): is_correct_detection(license_matches) and not has_unknown_matches(license_matches) and not has_extra_words(license_matches) + and not is_extra_words_at_valid_positions(license_matches) ) @@ -1087,7 +1097,7 @@ def is_correct_detection(license_matches): ] return ( - all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers) + all(matcher in ("1-hash", "1-spdx-id", "2-aho", "3-seq") for matcher in matchers) and all(is_match_coverage_perfect) ) @@ -1570,6 +1580,12 @@ def get_detected_license_expression( detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value) return detection_log, combined_expression + elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value: + if TRACE_ANALYSIS: + logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}') + matches_for_expression = license_matches + detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value) + elif analysis == DetectionCategory.EXTRA_WORDS.value: if TRACE_ANALYSIS: logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}') @@ -1810,7 +1826,11 @@ def analyze_detection(license_matches, package_license=False): # Case where at least one of the match have extra words elif has_extra_words(license_matches=license_matches): - return DetectionCategory.EXTRA_WORDS.value + # Case where `extra-words` are in the right place + if is_extra_words_at_valid_positions(license_matches=license_matches): + return DetectionCategory.EXTRA_WORDS_PERMITTED.value + else: + return DetectionCategory.EXTRA_WORDS.value # Cases where Match Coverage is a perfect 100 for all matches else: diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 90eba30d55..ef6bcec226 100644 --- a/src/licensedcode/match.py +++
b/src/licensedcode/match.py @@ -826,7 +826,11 @@ def to_dict( result['start_line'] = self.start_line result['end_line'] = self.end_line result['matcher'] = self.matcher - result['score'] = self.score() + # report a score of 100 when the `extra-words` are in the right place + if(is_extra_words_position_valid(match=self)): + result['score'] = 100 + else: + result['score'] = self.score() result['matched_length'] = self.len() result['match_coverage'] = self.coverage() result['rule_relevance'] = self.rule.relevance @@ -1071,6 +1075,84 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE): # early from the loops: trying to check containment on wildly separated matches # does not make sense +def is_extra_words_position_valid(match): + """ + Return True if the extra words appear in valid positions and + do not exceed the maximum allowed word count at those positions. + Otherwise, return False. + """ + # Score from coverage and relevance alone: any gap to `match.score()` reveals `extra-words` + score_coverage_relevance = ( + match.coverage() * match.rule.relevance + ) / 100 + + # Calculate the query coverage coefficient + query_coverage_coefficient = score_coverage_relevance - match.score() + + # Return False if the match has no extra words + if query_coverage_coefficient == 0: + return False + + matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False))) + rule_tokens = list(index_tokenizer(match.rule.text)) + extra_phrase_spans = match.rule.extra_phrase_spans + + if not extra_phrase_spans: + return False + + # Count of `extra-words` tokens, i.e. tokens inserted in `matched_tokens` + matched_count = 0 + + # Count of extra phrase markers + extra_phrase_count = 0 + + rule_index = 0 + matched_index = 0 + + for span, allowed_extra_word in extra_phrase_spans: + rule_index = span.start + + matched_index = span.start + matched_count - extra_phrase_count + extra_words_count = 0 + + # return False if the token before the `extra-words` in `matched_tokens` is not the same as the token before the
`extra-phrases` marker in `rule_tokens` + if(matched_tokens[matched_index-1] != rule_tokens[rule_index-1]): + return False + + # Count how many tokens in `matched_text` do not match the next rule token + while (matched_index < len(matched_tokens) and + matched_tokens[matched_index] != rule_tokens[rule_index + 1]): + matched_index += 1 + matched_count += 1 + extra_words_count += 1 + + if extra_words_count > allowed_extra_word: + return False + + extra_phrase_count += 1 + + rule_index+=1 + + # check if any `extra-words` is present after checking all `extra-phrase-spans` in rules + while (matched_index < len(matched_tokens) and + matched_tokens[matched_index] == rule_tokens[rule_index]): + matched_index+=1 + rule_index+=1 + + # some `extra-words` are found + if matched_index != len(matched_tokens): + return False + + return True + + +def is_extra_words_at_valid_positions(license_matches): + """ + Return True if any of the matches in `license_matches` that have `extra-words` + are in the right place. + """ + return any(is_extra_words_position_valid(match) for match in license_matches) + def filter_contained_matches( matches, diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index 354d93f52d..3153037f38 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -43,6 +43,7 @@ from licensedcode.tokenize import index_tokenizer from licensedcode.tokenize import index_tokenizer_with_stopwords from licensedcode.tokenize import query_lines +from licensedcode.tokenize import get_extra_phrase_spans from scancode.api import SCANCODE_LICENSEDB_URL from scancode.api import SCANCODE_LICENSE_URL from scancode.api import SCANCODE_RULE_URL @@ -1683,6 +1684,17 @@ class BasicRule: ) ) + extra_phrase_spans = attr.ib( + default=attr.Factory(list), + repr=False, + metadata=dict( + help='List of tuples `(Span, int)` representing extra phrases for this rule. '
+ 'Each tuple contains a Span of token positions in the rule text and an integer ' + 'indicating the maximum number of extra tokens allowed at that position. ' + 'Extra phrases are enclosed in [[double square brackets]] in the rule text.' + ) + ) + source = attr.ib( default=None, repr=False, @@ -2317,6 +2329,9 @@ def tokens(self): "is_continuous", "minimum_coverage" and "stopword_by_pos" are recomputed as a side effect. """ + + # identify and capture the spans of extra phrases specified within the rule + self.extra_phrase_spans = list(self.extra_phrases()) text = self.text # We tag this rule as being a bare URL if it starts with a scheme and is @@ -2353,6 +2368,17 @@ def _set_continuous(self): ): self.is_continuous = True + def extra_phrases(self): + """ + Return an iterable of `(Span, int)` tuples marking the positions of extra phrases in the rule text. + + Each tuple consists of: + - a `Span` object representing the position in the tokenized rule text, and + - an integer `n` indicating how many extra tokens are allowed at that position.
+ """ + if self.text: + yield from get_extra_phrase_spans(self.text) + def build_required_phrase_spans(self): """ Return a list of Spans marking required phrases token positions of that must diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py index bea07dd5a2..1bc7c592cf 100644 --- a/src/licensedcode/tokenize.py +++ b/src/licensedcode/tokenize.py @@ -81,12 +81,82 @@ def query_lines( required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})' required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall + +extra_phrase_pattern = '(?:' + query_pattern + r'|\[\[|\]\])' +extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall + + +# pattern to match and remove extra phrases like [[1]], [[4]], etc. from the text +extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]') + REQUIRED_PHRASE_OPEN = '{{' REQUIRED_PHRASE_CLOSE = '}}' +EXTRA_PHRASE_OPEN = '[[' +EXTRA_PHRASE_CLOSE = ']]' + # FIXME: this should be folded in a single pass tokenization with the index_tokenizer +def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False): + """ + Yield tokens from a rule ``text`` including extra phrases [[n]] markers. + This n denotes the maximum number of extra words that are valid at that position. + This is same as ``required_phrase_tokenizer``. + """ + if not text: + return + if not preserve_case: + text = text.lower() + + for token in extra_phrase_splitter(text): + if token and token not in stopwords: + yield token + + +def get_extra_phrase_spans(text): + """ + Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase found in ``text``. + Here, `n` should always be a digit token inside the extra phrase brackets.
+ + Example: + >>> text = 'Neither the name [[3]] of nor the names of its' + >>> # 0 1 2 3 4 5 6 7 8 9 + >>> x = get_extra_phrase_spans(text) + >>> assert x == [(Span([3]), 3)], x + """ + ipos = 0 + in_extra_phrase = False + current_phrase_value = [] + extra_phrase_spans = [] + + for token in extra_phrase_tokenizer(text): + if token == EXTRA_PHRASE_OPEN: + in_extra_phrase = True + current_phrase_value = [] + continue + + elif token == EXTRA_PHRASE_CLOSE: + if in_extra_phrase: + # the marker is valid only if exactly one token sits inside ``[[token]]`` + # and that token is a decimal number (same as ``\d+`` in the removal pattern) + if len(current_phrase_value) == 1 and current_phrase_value[0].isdecimal(): + extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0]))) + + in_extra_phrase = False + current_phrase_value = [] + continue + + if in_extra_phrase: + # collect every token seen after ``[[`` so that a marker holding + # more than one token fails the single-token check at ``]]`` + current_phrase_value.append(token) + + ipos += 1 + + return extra_phrase_spans + + def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False): """ Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
@@ -282,6 +352,8 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS): """ if not text: return [], {} + + text = extra_phrase_removal_pattern.sub('', text) tokens = [] tokens_append = tokens.append diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json index fcdb8639dd..5a4bab7dc4 100644 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json @@ -6,7 +6,7 @@ "license_expression_spdx": "BSD-3-Clause", "detection_count": 1, "detection_log": [ - "extra-words" + "extra-words-permitted-in-rule" ], "reference_matches": [ { @@ -16,7 +16,7 @@ "start_line": 4, "end_line": 27, "matcher": "2-aho", - "score": 99.53, + "score": 100, "matched_length": 210, "match_coverage": 100.0, "rule_relevance": 100, @@ -46,7 +46,7 @@ "start_line": 4, "end_line": 27, "matcher": "2-aho", - "score": 99.53, + "score": 100, "matched_length": 210, "match_coverage": 100.0, "rule_relevance": 100, @@ -57,7 +57,7 @@ } ], "detection_log": [ - "extra-words" + "extra-words-permitted-in-rule" ], "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50" } diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json deleted file mode 100644 index c078b0be9f..0000000000 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "license_detections": [ - { - "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e", - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "detection_count": 1, - "detection_log": [ - "extra-words" - ], - 
"reference_matches": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "from_file": "scan-extra-words-3-seq-license/LICENSE", - "start_line": 1, - "end_line": 31, - "matcher": "3-seq", - "score": 92.67, - "matched_length": 215, - "match_coverage": 100.0, - "rule_relevance": 100, - "rule_identifier": "bsd-new_578.RULE", - "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", - "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", - "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." - } - ] - } - ], - "files": [ - { - "path": "LICENSE", - "type": "file", - "detected_license_expression": "bsd-new", - "detected_license_expression_spdx": "BSD-3-Clause", - "license_detections": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "matches": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "from_file": "scan-extra-words-3-seq-license/LICENSE", - "start_line": 1, - "end_line": 31, - "matcher": "3-seq", - "score": 92.67, - "matched_length": 215, - "match_coverage": 100.0, - "rule_relevance": 100, - "rule_identifier": "bsd-new_578.RULE", - "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", - "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to 
endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", - "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE 
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." - } - ], - "detection_log": [ - "extra-words" - ], - "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e" - } - ], - "license_clues": [], - "percentage_of_license_text": 92.67, - "scan_errors": [] - } - ] -} \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/LICENSE b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/with-copyrights/LICENSE similarity index 98% rename from tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/LICENSE rename to tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/with-copyrights/LICENSE index 4e7146ed78..61c97f0f82 100644 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/LICENSE +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/with-copyrights/LICENSE @@ -1,31 +1,31 @@ -Software License Agreement (BSD License) - -Copyright (c) 2009-2015, Kevin Decker - -All rights reserved. - -Redistribution and use of this software in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above - copyright notice, this list of conditions and the - following disclaimer. 
- -* Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the - following disclaimer in the documentation and/or other - materials provided with the distribution. - -* Neither the name of Kevin Decker nor the names of its - contributors may be used to endorse or promote products - derived from this software without specific prior - written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR -IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND -FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +Software License Agreement (BSD License) + +Copyright (c) 2009-2015, Kevin Decker + +All rights reserved. + +Redistribution and use of this software in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the + following disclaimer in the documentation and/or other + materials provided with the distribution. + +* Neither the name of Kevin Decker nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior + written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights/LICENSE b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights/LICENSE new file mode 100644 index 0000000000..2a99b023de --- /dev/null +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights/LICENSE @@ -0,0 +1,32 @@ +Copyright (c) 2007, 2008, 2009 Mutsuo Saito, Makoto Matsumoto +and Hiroshima University. +Copyright (c) 2011, 2002 Mutsuo Saito, Makoto Matsumoto, Hiroshima +University and The University of Tokyo. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ * Neither the name of the Hiroshima University nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json new file mode 100644 index 0000000000..a3d4937782 --- /dev/null +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json @@ -0,0 +1,70 @@ +{ + "license_detections": [ + { + "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e", + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "detection_count": 1, + "detection_log": [ + "extra-words" + ], + "reference_matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "with-copyrights/LICENSE", + "start_line": 1, + "end_line": 31, + "matcher": "3-seq", + "score": 92.67, + "matched_length": 215, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": 
"bsd-new_578.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", + "matched_text": "Software License Agreement (BSD License)\r\n\r\nCopyright (c) 2009-2015, Kevin Decker \r\n\r\nAll rights reserved.\r\n\r\nRedistribution and use of this software in source and binary forms, with or without modification,\r\nare permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer in the documentation and/or other\r\n materials provided with the distribution.\r\n\r\n* Neither the name of Kevin Decker nor the names of its\r\n contributors may be used to endorse or promote products\r\n derived from this software without specific prior\r\n written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\r\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\r\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\r\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\r\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Software License Agreement (BSD License)\r\n\r\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\r\n\r\n[All] [rights] [reserved].\r\n\r\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\r\nare permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer in the documentation and/or other\r\n materials provided with the distribution.\r\n\r\n* Neither the name of [Kevin] [Decker] nor the names of its\r\n contributors may be used to endorse or promote products\r\n derived from this software without specific prior\r\n written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\r\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\r\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\r\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\r\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ] + } + ], + "files": [ + { + "path": "LICENSE", + "type": "file", + "detected_license_expression": "bsd-new", + "detected_license_expression_spdx": "BSD-3-Clause", + "license_detections": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "with-copyrights/LICENSE", + "start_line": 1, + "end_line": 31, + "matcher": "3-seq", + "score": 92.67, + "matched_length": 215, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_578.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", + "matched_text": "Software License Agreement (BSD License)\r\n\r\nCopyright (c) 2009-2015, Kevin Decker \r\n\r\nAll rights reserved.\r\n\r\nRedistribution and use of this software in source and binary forms, with or without modification,\r\nare permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer in the documentation and/or other\r\n materials provided with the distribution.\r\n\r\n* Neither the name of Kevin Decker nor the names of its\r\n 
contributors may be used to endorse or promote products\r\n derived from this software without specific prior\r\n written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\r\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\r\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\r\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\r\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Software License Agreement (BSD License)\r\n\r\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\r\n\r\n[All] [rights] [reserved].\r\n\r\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\r\nare permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the\r\n following disclaimer in the documentation and/or other\r\n materials provided with the distribution.\r\n\r\n* Neither the name of [Kevin] [Decker] nor the names of its\r\n contributors may be used to endorse or promote products\r\n derived from this software without specific prior\r\n written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\r\nIMPLIED WARRANTIES, INCLUDING, BUT 
NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\r\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\r\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\r\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ], + "detection_log": [ + "extra-words" + ], + "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e" + } + ], + "license_clues": [], + "percentage_of_license_text": 92.67, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json new file mode 100644 index 0000000000..062fe08931 --- /dev/null +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json @@ -0,0 +1,70 @@ +{ + "license_detections": [ + { + "identifier": "bsd_new-4b08a4bf-cc63-bee9-d78c-bec80b3f58f4", + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "detection_count": 1, + "detection_log": [ + "extra-words-permitted-in-rule" + ], + "reference_matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "without-copyrights/LICENSE", + "start_line": 7, + "end_line": 32, + "matcher": "3-seq", + "score": 100, + "matched_length": 212, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_newlib3.RULE", + "rule_url": 
"https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_newlib3.RULE", + "matched_text": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are\r\nmet:\r\n\r\n * Redistributions of source code must retain the above copyright\r\n notice, this list of conditions and the following disclaimer.\r\n * Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the following\r\n disclaimer in the documentation and/or other materials provided\r\n with the distribution.\r\n * Neither the name of the Hiroshima University nor the names of\r\n its contributors may be used to endorse or promote products\r\n derived from this software without specific prior written\r\n permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\r\nOWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r\nLIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r\nTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are\r\nmet:\r\n\r\n * Redistributions of source code must retain the above copyright\r\n notice, this list of conditions and the following disclaimer.\r\n * Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the following\r\n disclaimer in the documentation and/or other materials provided\r\n with the distribution.\r\n * Neither the name of the [Hiroshima] University nor the names of\r\n its contributors may be used to endorse or promote products\r\n derived from this software without specific prior written\r\n permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\r\nOWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r\nLIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r\nTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ] + } + ], + "files": [ + { + "path": "LICENSE", + "type": "file", + "detected_license_expression": "bsd-new", + "detected_license_expression_spdx": "BSD-3-Clause", + "license_detections": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "without-copyrights/LICENSE", + "start_line": 7, + "end_line": 32, + "matcher": "3-seq", + "score": 100, + "matched_length": 212, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_newlib3.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_newlib3.RULE", + "matched_text": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are\r\nmet:\r\n\r\n * Redistributions of source code must retain the above copyright\r\n notice, this list of conditions and the following disclaimer.\r\n * Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the following\r\n disclaimer in the documentation and/or other materials provided\r\n with the distribution.\r\n * Neither the name of the Hiroshima University nor the names of\r\n its contributors may be used to endorse or promote products\r\n derived from this software without specific prior 
written\r\n permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r\nA PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r\nOWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r\nLIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r\nTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are\r\nmet:\r\n\r\n * Redistributions of source code must retain the above copyright\r\n notice, this list of conditions and the following disclaimer.\r\n * Redistributions in binary form must reproduce the above\r\n copyright notice, this list of conditions and the following\r\n disclaimer in the documentation and/or other materials provided\r\n with the distribution.\r\n * Neither the name of the [Hiroshima] University nor the names of\r\n its contributors may be used to endorse or promote products\r\n derived from this software without specific prior written\r\n permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\r\nOWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r\nLIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r\nTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ], + "detection_log": [ + "extra-words-permitted-in-rule" + ], + "identifier": "bsd_new-4b08a4bf-cc63-bee9-d78c-bec80b3f58f4" + } + ], + "license_clues": [], + "percentage_of_license_text": 87.24, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/licensedcode/test_license_models.py b/tests/licensedcode/test_license_models.py index 6c47d92a59..fec3dd7fdd 100644 --- a/tests/licensedcode/test_license_models.py +++ b/tests/licensedcode/test_license_models.py @@ -591,6 +591,14 @@ def test_key_phrases_yields_spans(self): key_phrase_spans = list(rule.build_required_phrase_spans()) assert key_phrase_spans == [Span(4), Span(7, 9)] + def test_extra_phrases_yields_spans(self): + rule_text = ( + 'Neither the name of [[3]] nor the names of its' + ) + rule = models.Rule(license_expression='bsd-new', text=rule_text) + extra_phrase_spans = list(rule.extra_phrases()) + assert extra_phrase_spans == [(Span(4),3)] + def test_key_phrases_raises_exception_when_markup_is_not_closed(self): rule_text = ( 'This released software is {{released}} by under {{the MIT license. 
' diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py index 0afab2a7fd..ed95babfff 100644 --- a/tests/licensedcode/test_match.py +++ b/tests/licensedcode/test_match.py @@ -20,6 +20,8 @@ from licensedcode.match import filter_overlapping_matches from licensedcode.match import get_full_matched_text from licensedcode.match import get_matching_regions +from licensedcode.match import is_extra_words_position_valid +from licensedcode.match import is_extra_words_at_valid_positions from licensedcode.match import LicenseMatch from licensedcode.match import merge_matches from licensedcode.match import reportable_tokens @@ -1321,6 +1323,258 @@ def test_get_matching_regions_3_lines_enough(self): assert matches[5].qspan in regions[1] +class TestExtraWordsPosition(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_valid_extra_words_within_limit(self): + rule_text = """ + Redistribution and use [[4]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is True + + def test_invalid_extra_words_exceed_limit(self): + rule_text = """ + Redistribution and use [[2]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_no_extra_words_allowed(self): + rule_text = """ + Redistribution and use in source and binary forms are permitted. 
+ """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of software in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_multiple_extra_spans_valid(self): + rule_text = """ + Redistribution [[2]] and use [[1]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution of content and use again in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is True + + def test_extra_words_at_wrong_position(self): + rule_text = """ + Redistribution and use [[2]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and amazing use in source and binary forms are permitted. + """ + # here 'amazing' word are at wrong place + match = idx.match(query_string=query, _skip_hash_match=True)[0] + + assert is_extra_words_position_valid(match) is False + + def test_exact_match_without_extra_markers(self): + rule_text = """ + Redistribution and use in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use in source and binary forms are permitted. 
+ """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_extra_words_one_at_right_place_and_one_at_not_right_place(self): + rule_text = """ + Redistribution and use [[3]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of this software in source and binary extra-words forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_extra_words_if_one_match_have_extra_words_at_right_place_and_another_match_have_no_extra_words(self): + r1_text = "Redistribution and use [[3]] in source and binary forms are permitted." + r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "under the MIT license" + r2 = create_rule_from_text_and_expression( + license_expression='mit', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. + under the MIT license + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + assert is_extra_words_at_valid_positions(matches) is True + + def test_extra_words_if_one_match_have_extra_words_at_right_place_but_exceed_limit_and_another_match_have_no_extra_words(self): + r1_text = "Redistribution and use [[3]] in source and binary forms are permitted." 
+ r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "under the MIT license" + r2 = create_rule_from_text_and_expression( + license_expression='mit', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use of this software AAA in source and binary forms are permitted. + under the MIT license + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + + # one match has `extra-words` but they exceed the limit: the rule allows three and there are + # four `extra-words`, i.e. 'of', 'this', 'software', 'AAA' + assert is_extra_words_at_valid_positions(matches) is False + + def test_extra_words_if_all_match_have_no_extra_words(self): + r1_text = "Redistribution and use in source and binary forms are permitted." + r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "under the MIT license" + r2 = create_rule_from_text_and_expression( + license_expression='mit', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use in source and binary forms are permitted. + under the MIT license + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + + assert is_extra_words_at_valid_positions(matches) is False + + def test_extra_words_if_one_match_have_extra_words_at_right_place_and_another_match_at_wrong_place(self): + r1_text = "Redistribution and use [[3]] in source and binary forms are permitted." + r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "Neither the name of [[3]] nor the names of its" + r2 = create_rule_from_text_and_expression( + license_expression='extra-words2', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. 
+ Neither the name of William Henry James nor the names of Harris its + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + + # one match has its `extra-words` at the correct place; the other match also + # has `extra-words` at the correct place, but one word, 'Harris', is at the wrong place. + # `is_extra_words_at_valid_positions` returns True here because one match + # has its `extra-words` at the correct place + assert is_extra_words_at_valid_positions(matches) is True + + def test_extra_words_all_match_have_extra_words_at_right_place(self): + r1_text = "Redistribution and use [[3]] in source and binary forms are permitted." + r1 = create_rule_from_text_and_expression( + license_expression='extra-words', + text=r1_text + ) + + r2_text = "Neither the name of [[3]] nor the names of its" + r2 = create_rule_from_text_and_expression( + license_expression='extra-words2', + text=r2_text + ) + + idx = index.LicenseIndex([r1,r2]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. 
+ Neither the name of William Henry James nor the names of its + """ + + matches = idx.match(query_string=query, _skip_hash_match=True) + + assert len(matches) == 2 + assert is_extra_words_at_valid_positions(matches) is True + + class TestLicenseMatchScore(FileBasedTesting): test_data_dir = TEST_DATA_DIR diff --git a/tests/licensedcode/test_plugin_license_detection.py b/tests/licensedcode/test_plugin_license_detection.py index ff249bc9c5..da98f5b8da 100644 --- a/tests/licensedcode/test_plugin_license_detection.py +++ b/tests/licensedcode/test_plugin_license_detection.py @@ -113,7 +113,7 @@ def test_license_match_unknown_clues_is_not_in_expression(): def test_license_match_extra_words_3_seq(): - test_dir = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-3-seq-license/') + test_dir = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-3-seq-license/with-copyrights') result_file = test_env.get_temp_file('json') args = [ '--license', @@ -126,7 +126,25 @@ def test_license_match_extra_words_3_seq(): test_dir, ] run_scan_click(args) - test_loc = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json') + test_loc = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-with-copyrights-license.expected.json') + check_json_scan(test_loc, result_file, regen=REGEN_TEST_FIXTURES) + + +def test_license_match_extra_words_3_seq_without_copyrights(): + test_dir = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-3-seq-license/without-copyrights') + result_file = test_env.get_temp_file('json') + args = [ + '--license', + '--license-text', + '--license-text-diagnostics', + '--license-diagnostics', + '--strip-root', + '--verbose', + '--json', result_file, + test_dir, + ] + run_scan_click(args) + test_loc = test_env.get_test_loc('plugin_license/extra-words/scan-extra-words-without-copyrights-license.expected.json') check_json_scan(test_loc, result_file, regen=REGEN_TEST_FIXTURES) diff 
--git a/tests/licensedcode/test_tokenize.py b/tests/licensedcode/test_tokenize.py index 950a94cf76..06cb446e7f 100644 --- a/tests/licensedcode/test_tokenize.py +++ b/tests/licensedcode/test_tokenize.py @@ -19,13 +19,16 @@ from licensedcode.spans import Span from licensedcode.tokenize import get_existing_required_phrase_spans +from licensedcode.tokenize import get_extra_phrase_spans from licensedcode.tokenize import index_tokenizer +from licensedcode.tokenize import index_tokenizer_with_stopwords from licensedcode.tokenize import InvalidRuleRequiredPhrase from licensedcode.tokenize import matched_query_text_tokenizer from licensedcode.tokenize import ngrams from licensedcode.tokenize import query_lines from licensedcode.tokenize import query_tokenizer from licensedcode.tokenize import required_phrase_tokenizer +from licensedcode.tokenize import extra_phrase_tokenizer from licensedcode.tokenize import select_ngrams from licensedcode.tokenize import tokens_and_non_tokens from licensedcode.tokenize import word_splitter @@ -585,6 +588,104 @@ def test_get_existing_required_phrase_spans_with_markup(self): assert get_existing_required_phrase_spans(text=text) == [Span(18, 19)] +class TestExtraPhraseTokenizer(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_extra_phrase_tokenizer_handles_empty_string(self): + text = '' + result = list(extra_phrase_tokenizer(text)) + assert result == [] + + def test_extra_phrase_tokenizer_handles_blank_lines(self): + text = u' \n\n\t ' + result = list(extra_phrase_tokenizer(text)) + assert result == [] + + def test_extra_phrase_tokenizer_handles_only_brackets(self): + text = '[[3]]' + assert list(extra_phrase_tokenizer(text)) == ['[[', '3', ']]'] + + def test_extra_phrase_tokenizer_parses_text_with_extra_phrase_marker(self): + text = 'Neither the name of [[3]] nor the names of its' + assert list(extra_phrase_tokenizer(text)) == [ + 'neither', 'the', 'name', 'of', '[[', '3', ']]', 'nor', 'the', 'names', 'of', 'its' + ] + + def 
test_get_extra_phrase_spans_simple(self): + text = 'This is [[2]] an example.' + spans = get_extra_phrase_spans(text) + assert spans == [(Span([2]), 2)] + + def test_get_extra_phrase_spans_multiple(self): + text = 'Some [[4]] text [[6]] with multiple markers.' + spans = get_extra_phrase_spans(text) + assert spans == [(Span([1]), 4), (Span([3]), 6)] + + def test_get_extra_phrase_spans_returns_nothing_if_none_found(self): + text = 'Just some normal text.' + assert get_extra_phrase_spans(text) == [] + + def test_get_extra_phrase_spans_ignores_non_numeric_values(self): + text = 'Just some [[normal]] text.' + assert get_extra_phrase_spans(text) == [] + + def test_extra_phrase_tokenizer_returns_same_word_tokens_as_index_tokenizer(self): + text = 'This [[1]] is a test.' + ep_tokens = [t for t in extra_phrase_tokenizer(text) if t not in ('[[', ']]')] + idx_tokens = list(index_tokenizer(text)) + assert ep_tokens == idx_tokens + + def test_get_extra_phrase_spans_ignores_unclosed_opening_bracket(self): + text = 'Neither the name of [[3 nor the names of its' + assert get_extra_phrase_spans(text) == [] + + def test_get_extra_phrase_spans_ignores_unopened_closing_bracket(self): + text = 'Neither the name of 3]] nor the names of its' + assert get_extra_phrase_spans(text) == [] + + +class TestIndexTokenizerWithStopwords(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_index_tokenizer_with_stopwords_empty_input(self): + toks, stops = index_tokenizer_with_stopwords('') + assert toks == [] + assert stops == {} + + def test_index_tokenizer_with_stopwords_removes_extra_phrase(self): + text = 'Neither the name of [[3]] nor the names of its' + toks, stops = index_tokenizer_with_stopwords(text) + assert toks == ['neither', 'the', 'name', 'of', 'nor', 'the', 'names', 'of', 'its'] + assert stops == {} + + def test_index_tokenizer_with_stopwords_removes_curly_phrase(self): + text = '{{Hi}}some {{}}Text with{{junk}}spAces!' 
+ toks, stops = index_tokenizer_with_stopwords(text) + assert toks == ['hi', 'some', 'text', 'with', 'junk', 'spaces'] + assert stops == {} + + def test_index_tokenizer_with_custom_stopwords(self): + stops_set = set(['is', 'a']) + text = 'This is a test' + toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set) + assert toks == ['this', 'test'] + assert stops == {0: 2} + + def test_index_tokenizer_with_leading_stopwords(self): + stops_set = set(['is', 'a', 'the']) + text = 'The is a test with result' + toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set) + assert toks == ['test', 'with', 'result'] + assert stops == {-1: 3} + + def test_index_tokenizer_with_embedded_stopwords_after_position(self): + stops_set = set(['markup', 'lt', 'gt', 'quot']) + text = 'some "< markup >"' + toks, stops = index_tokenizer_with_stopwords(text, stopwords=stops_set) + assert toks == ['some'] + assert stops == {0: 5} + + class TestNgrams(FileBasedTesting): test_data_dir = TEST_DATA_DIR