Skip to content

Commit 8edfb92

Browse files
Make LicenseMatch grouping affected by presence of license intro
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 64bd3d8 commit 8edfb92

File tree

5 files changed

+288
-7
lines changed

5 files changed

+288
-7
lines changed

src/licensedcode/detection.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -843,17 +843,40 @@ def group_matches(license_matches, lines_threshold=LINES_THRESHOLD):
843843
"""
844844
Given a list of `matches` LicenseMatch objects, yield lists of grouped matches
845845
together where each group is less than `lines_threshold` apart.
846+
A license intro match starts a new group (unless it directly follows another
847+
intro), and the match following a license intro always stays in the intro's group.
846848
"""
847849
group_of_license_matches = []
850+
848851
for license_match in license_matches:
852+
# If this is the first match or the start of another group after yielding
853+
# the contents of the previous group
849854
if not group_of_license_matches:
850855
group_of_license_matches.append(license_match)
851856
continue
857+
852858
previous_match = group_of_license_matches[-1]
853-
is_in_group = license_match.start_line <= previous_match.end_line + lines_threshold
854-
if is_in_group:
859+
is_in_group_by_threshold = license_match.start_line <= previous_match.end_line + lines_threshold
860+
861+
# If the previous match is an intro, keep this match in the same group,
862+
# even when the line gap between them exceeds the threshold.
863+
if previous_match.rule.is_license_intro:
864+
group_of_license_matches.append(license_match)
865+
continue
866+
867+
# If the current match is an intro (and the previous one is not), start a new group,
868+
# even when the line gap between them is within the threshold.
869+
elif license_match.rule.is_license_intro:
870+
yield group_of_license_matches
871+
group_of_license_matches = [license_match]
872+
873+
# If neither the previous nor the current match is a license intro, we look at line numbers
874+
# If line number difference is within threshold, we keep the current match in the group
875+
elif is_in_group_by_threshold:
855876
group_of_license_matches.append(license_match)
856877
continue
878+
879+
# If line number difference is outside threshold, we make a new group
857880
else:
858881
yield group_of_license_matches
859882
group_of_license_matches = [license_match]
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
{
2+
"files": [
3+
{
4+
"path": "LICENSE-dist.txt",
5+
"type": "file",
6+
"detected_license_expression": "x11-lucent AND bzip2-libbzip-2010",
7+
"detected_license_expression_spdx": "LicenseRef-scancode-x11-lucent AND bzip2-1.0.6",
8+
"license_detections": [
9+
{
10+
"license_expression": "x11-lucent",
11+
"detection_rules": [
12+
"unknown-intro-followed-by-match"
13+
],
14+
"matches": [
15+
{
16+
"score": 100.0,
17+
"start_line": 1,
18+
"end_line": 1,
19+
"matched_length": 5,
20+
"match_coverage": 100.0,
21+
"matcher": "2-aho",
22+
"license_expression": "unknown-license-reference",
23+
"rule_identifier": "license-intro_4.RULE",
24+
"referenced_filenames": [],
25+
"is_license_text": false,
26+
"is_license_notice": false,
27+
"is_license_reference": false,
28+
"is_license_tag": false,
29+
"is_license_intro": true,
30+
"rule_length": 5,
31+
"rule_relevance": 100,
32+
"matched_text": "licensed under the following terms:",
33+
"licenses": [
34+
{
35+
"key": "unknown-license-reference",
36+
"name": "Unknown License file reference",
37+
"short_name": "Unknown License reference",
38+
"category": "Unstated License",
39+
"is_exception": false,
40+
"is_unknown": true,
41+
"owner": "Unspecified",
42+
"homepage_url": null,
43+
"text_url": "",
44+
"reference_url": "https://scancode-licensedb.aboutcode.org/unknown-license-reference",
45+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
46+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.yml",
47+
"spdx_license_key": "LicenseRef-scancode-unknown-license-reference",
48+
"spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE"
49+
}
50+
]
51+
},
52+
{
53+
"score": 100.0,
54+
"start_line": 8,
55+
"end_line": 14,
56+
"matched_length": 93,
57+
"match_coverage": 100.0,
58+
"matcher": "2-aho",
59+
"license_expression": "x11-lucent",
60+
"rule_identifier": "x11-lucent_1.RULE",
61+
"referenced_filenames": [],
62+
"is_license_text": true,
63+
"is_license_notice": false,
64+
"is_license_reference": false,
65+
"is_license_tag": false,
66+
"is_license_intro": false,
67+
"rule_length": 93,
68+
"rule_relevance": 100,
69+
"matched_text": "Permission to use, copy, modify, and distribute this software for any purpose without\n fee is hereby granted, provided that this entire notice is included in all copies of any\n software which is or includes a copy or modification of this software and in all copies\n of the supporting documentation for such software. THIS SOFTWARE IS BEING PROVIDED \"AS\n IS\", WITHOUT ANY EXPRESS OR IMPLIED WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR\n LUCENT TECHNOLOGIES MAKE ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE\n MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.",
70+
"licenses": [
71+
{
72+
"key": "x11-lucent",
73+
"name": "X11-Style (Lucent)",
74+
"short_name": "X11-Style (Lucent)",
75+
"category": "Permissive",
76+
"is_exception": false,
77+
"is_unknown": false,
78+
"owner": "Alcatel-Lucent",
79+
"homepage_url": null,
80+
"text_url": "",
81+
"reference_url": "https://scancode-licensedb.aboutcode.org/x11-lucent",
82+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/x11-lucent.LICENSE",
83+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/x11-lucent.yml",
84+
"spdx_license_key": "LicenseRef-scancode-x11-lucent",
85+
"spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/x11-lucent.LICENSE"
86+
}
87+
]
88+
}
89+
]
90+
},
91+
{
92+
"license_expression": "bzip2-libbzip-2010",
93+
"detection_rules": [
94+
"unknown-intro-followed-by-match"
95+
],
96+
"matches": [
97+
{
98+
"score": 100.0,
99+
"start_line": 18,
100+
"end_line": 18,
101+
"matched_length": 5,
102+
"match_coverage": 100.0,
103+
"matcher": "2-aho",
104+
"license_expression": "unknown-license-reference",
105+
"rule_identifier": "license-intro_4.RULE",
106+
"referenced_filenames": [],
107+
"is_license_text": false,
108+
"is_license_notice": false,
109+
"is_license_reference": false,
110+
"is_license_tag": false,
111+
"is_license_intro": true,
112+
"rule_length": 5,
113+
"rule_relevance": 100,
114+
"matched_text": "licensed under the following terms:",
115+
"licenses": [
116+
{
117+
"key": "unknown-license-reference",
118+
"name": "Unknown License file reference",
119+
"short_name": "Unknown License reference",
120+
"category": "Unstated License",
121+
"is_exception": false,
122+
"is_unknown": true,
123+
"owner": "Unspecified",
124+
"homepage_url": null,
125+
"text_url": "",
126+
"reference_url": "https://scancode-licensedb.aboutcode.org/unknown-license-reference",
127+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE",
128+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.yml",
129+
"spdx_license_key": "LicenseRef-scancode-unknown-license-reference",
130+
"spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE"
131+
}
132+
]
133+
},
134+
{
135+
"score": 100.0,
136+
"start_line": 25,
137+
"end_line": 54,
138+
"matched_length": 233,
139+
"match_coverage": 100.0,
140+
"matcher": "2-aho",
141+
"license_expression": "bzip2-libbzip-2010",
142+
"rule_identifier": "bzip2-libbzip-2010.LICENSE",
143+
"referenced_filenames": [],
144+
"is_license_text": true,
145+
"is_license_notice": false,
146+
"is_license_reference": false,
147+
"is_license_tag": false,
148+
"is_license_intro": false,
149+
"rule_length": 233,
150+
"rule_relevance": 100,
151+
"matched_text": "Redistribution and use in source and binary forms, with or without\n modification, are permitted provided that the following conditions\n are met:\n\n 1. Redistributions of source code must retain the above copyright\n notice, this list of conditions and the following disclaimer.\n\n 2. The origin of this software must not be misrepresented; you must\n not claim that you wrote the original software. If you use this\n software in a product, an acknowledgment in the product\n documentation would be appreciated but is not required.\n\n 3. Altered source versions must be plainly marked as such, and must\n not be misrepresented as being the original software.\n\n 4. The name of the author may not be used to endorse or promote\n products derived from this software without specific prior written\n permission.\n\n THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS\n OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY\n DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE\n GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.",
152+
"licenses": [
153+
{
154+
"key": "bzip2-libbzip-2010",
155+
"name": "bzip2 License 2010",
156+
"short_name": "bzip2 License 2010",
157+
"category": "Permissive",
158+
"is_exception": false,
159+
"is_unknown": false,
160+
"owner": "bzip",
161+
"homepage_url": "https://github.com/asimonov-im/bzip2/blob/master/LICENSE",
162+
"text_url": "",
163+
"reference_url": "https://scancode-licensedb.aboutcode.org/bzip2-libbzip-2010",
164+
"scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/bzip2-libbzip-2010.LICENSE",
165+
"scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/bzip2-libbzip-2010.yml",
166+
"spdx_license_key": "bzip2-1.0.6",
167+
"spdx_url": "https://spdx.org/licenses/bzip2-1.0.6"
168+
}
169+
]
170+
}
171+
]
172+
}
173+
],
174+
"license_clues": [],
175+
"percentage_of_license_text": 87.73,
176+
"scan_errors": []
177+
}
178+
]
179+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
be/src/gutil/utf/*: licensed under the following terms:
2+
3+
UTF-8 Library
4+
5+
The authors of this software are Rob Pike and Ken Thompson.
6+
Copyright (c) 1998-2002 by Lucent Technologies.
7+
8+
Permission to use, copy, modify, and distribute this software for any purpose without
9+
fee is hereby granted, provided that this entire notice is included in all copies of any
10+
software which is or includes a copy or modification of this software and in all copies
11+
of the supporting documentation for such software. THIS SOFTWARE IS BEING PROVIDED "AS
12+
IS", WITHOUT ANY EXPRESS OR IMPLIED WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR
13+
LUCENT TECHNOLOGIES MAKE ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
14+
MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15+
16+
--------------------------------------------------------------------------------
17+
18+
be/src/gutil/valgrind.h: licensed under the following terms:
19+
20+
This file is part of Valgrind, a dynamic binary instrumentation
21+
framework.
22+
23+
Copyright (C) 2000-2008 Julian Seward. All rights reserved.
24+
25+
Redistribution and use in source and binary forms, with or without
26+
modification, are permitted provided that the following conditions
27+
are met:
28+
29+
1. Redistributions of source code must retain the above copyright
30+
notice, this list of conditions and the following disclaimer.
31+
32+
2. The origin of this software must not be misrepresented; you must
33+
not claim that you wrote the original software. If you use this
34+
software in a product, an acknowledgment in the product
35+
documentation would be appreciated but is not required.
36+
37+
3. Altered source versions must be plainly marked as such, and must
38+
not be misrepresented as being the original software.
39+
40+
4. The name of the author may not be used to endorse or promote
41+
products derived from this software without specific prior written
42+
permission.
43+
44+
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
45+
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
46+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
47+
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
48+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
49+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
50+
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
51+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
52+
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
53+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
54+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

tests/licensedcode/test_plugin_license_detection.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,23 @@ def test_license_match_unknown_license_intro_eclipse_foundation_tycho():
8989
check_json_scan(test_loc, result_file, regen=REGEN_TEST_FIXTURES)
9090

9191

92+
def test_license_match_unknown_license_intro_with_long_gaps_between():
93+
test_dir = test_env.get_test_loc('plugin_license/unknown_intro/scan-unknown-intro-long-gaps-between/', copy=True)
94+
result_file = test_env.get_temp_file('json')
95+
args = [
96+
'--license',
97+
'--license-text',
98+
'--license-text-diagnostics',
99+
'--strip-root',
100+
'--verbose',
101+
'--json', result_file,
102+
test_dir,
103+
]
104+
run_scan_click(args)
105+
test_loc = test_env.get_test_loc('plugin_license/unknown_intro/scan-unknown-intro-long-gaps-between.expected.json')
106+
check_json_scan(test_loc, result_file, regen=REGEN_TEST_FIXTURES)
107+
108+
92109
def test_license_match_unknown_license_with_license_ref_to_key_file_at_root():
93110
test_dir = test_env.get_test_loc('plugin_license/license_reference/scan/unknown-ref-to-key-file-root', copy=True)
94111
result_file = test_env.get_temp_file('json')

tests/packagedcode/data/pypi/setup.py/simple-setup.py-expected.json

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@
4141
"code_view_url": null,
4242
"vcs_url": null,
4343
"copyright": null,
44-
"declared_license_expression": "((apache-2.0 AND scancode-acknowledgment) AND cc0-1.0) AND (apache-2.0 AND cc0-1.0)",
45-
"declared_license_expression_spdx": "((Apache-2.0 AND LicenseRef-scancode-scancode-acknowledgment) AND CC0-1.0) AND (Apache-2.0 AND CC0-1.0)",
44+
"declared_license_expression": "((apache-2.0 AND scancode-acknowledgment) AND cc0-1.0) AND apache-2.0 AND cc0-1.0",
45+
"declared_license_expression_spdx": "((Apache-2.0 AND LicenseRef-scancode-scancode-acknowledgment) AND CC0-1.0) AND Apache-2.0 AND CC0-1.0",
4646
"license_detections": [
4747
{
4848
"license_expression": "(apache-2.0 AND scancode-acknowledgment) AND cc0-1.0",
@@ -143,9 +143,9 @@
143143
]
144144
},
145145
{
146-
"license_expression": "apache-2.0 AND cc0-1.0",
146+
"license_expression": "apache-2.0",
147147
"detection_rules": [
148-
"unknown-intro-followed-by-match"
148+
"not-combined"
149149
],
150150
"matches": [
151151
{
@@ -184,7 +184,15 @@
184184
"spdx_url": "https://spdx.org/licenses/Apache-2.0"
185185
}
186186
]
187-
},
187+
}
188+
]
189+
},
190+
{
191+
"license_expression": "cc0-1.0",
192+
"detection_rules": [
193+
"unknown-intro-followed-by-match"
194+
],
195+
"matches": [
188196
{
189197
"score": 100.0,
190198
"start_line": 1,

0 commit comments

Comments
 (0)