Skip to content

Commit fc8a1dc

Browse files
committed
Improve Maven extracted_license storage and detection
We were only storing the list of attributes but the now "license" top level elements. Because of this a maven tag with only name: mit would not be detected properly as there is no rule for this and this would yield too many false positive. Now we have a structure that is a list of license as {license: {name: mit, url: foo: comments: bar}} This ensures a more accurate detection. With this also regenerate all Maven related tests, and also ensure we filter early empty and unused license attributes (such as the "distribution") Also add new tests and apply mild refactoring for maven license detection for clarity. Remove redundant tests and simplify some other tests for clarity. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 5b6998d commit fc8a1dc

File tree

279 files changed

+3144
-26945
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

279 files changed

+3144
-26945
lines changed

src/packagedcode/maven.py

Lines changed: 105 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -766,13 +766,14 @@ def _get_comments(self, xml=None):
766766
def _find_licenses(self):
767767
"""Return an iterable of license mappings."""
768768
for lic in self.pom_data.findall('licenses/license'):
769-
yield dict([
770-
('name', self._get_attribute('name', lic)),
771-
('url', self._get_attribute('url', lic)),
772-
('comments', self._get_attribute('comments', lic)),
773-
# arcane and seldom used
774-
('distribution', self._get_attribute('distribution', lic)),
775-
])
769+
yield {"license": dict([
770+
('name', self._get_attribute('name', lic)),
771+
('url', self._get_attribute('url', lic)),
772+
('comments', self._get_attribute('comments', lic)),
773+
# arcane and seldom used
774+
('distribution', self._get_attribute('distribution', lic)),
775+
])
776+
}
776777

777778
def _find_parties(self, key='developers/developer'):
778779
"""Return an iterable of party mappings for a given xpath."""
@@ -1254,7 +1255,7 @@ def _parse(
12541255
# complex defeinition in Maven
12551256
qualifiers['type'] = extension
12561257

1257-
extracted_license_statement = pom.licenses
1258+
extracted_license_statement = clean_licenses(pom.licenses) or None
12581259

12591260
group_id = pom.group_id
12601261
artifact_id = pom.artifact_id
@@ -1325,52 +1326,121 @@ def get_license_detections_for_extracted_license_statement(
13251326
approximate=True,
13261327
expression_symbols=None,
13271328
):
1329+
"""
1330+
Return license detections from a Maven POM license data structure.
1331+
This looks like this in XML, and some attributes are more important than others.
1332+
Which one exists and whether we can detect a proper license in each also determines which
1333+
attribute we need to consider.
1334+
The original XML has this shape:
1335+
<licenses>
1336+
<license>
1337+
<name>Apache-2.0</name>
1338+
<url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
1339+
<distribution>repo</distribution>
1340+
<comments> notes... </comments>
1341+
</license>
1342+
</licenses>
1343+
The data structure we keep has this shape:
1344+
[{"license":
1345+
{
1346+
"name": "Apache-2.0",
1347+
"url": "https://www.apache.org/licenses/LICENSE-2.0.txt",
1348+
"comments": "notes...",
1349+
}
1350+
},
1351+
.... other license]
1352+
"""
1353+
13281354
from packagedcode.licensing import get_normalized_license_detections
13291355
from packagedcode.licensing import get_license_detections_for_extracted_license_statement
13301356

1331-
if not cls.check_extracted_license_statement_structure(extracted_license):
1357+
if not is_standard_maven_license_data_structure(licenses=extracted_license):
1358+
# use the generic detection
13321359
return get_normalized_license_detections(
13331360
extracted_license=extracted_license,
13341361
try_as_expression=try_as_expression,
13351362
approximate=approximate,
13361363
expression_symbols=expression_symbols,
13371364
)
1365+
extracted_license = clean_licenses(extracted_license)
1366+
extracted_license_statement = saneyaml.dump(extracted_license)
13381367

1339-
new_extracted_license = extracted_license.copy()
1340-
1341-
for license_entry in new_extracted_license:
1342-
license_entry.pop("distribution")
1343-
if not license_entry.get("name"):
1344-
license_entry.pop("name")
1345-
if not license_entry.get("url"):
1346-
license_entry.pop("url")
1347-
if not license_entry.get("comments"):
1348-
license_entry.pop("comments")
1349-
1350-
extracted_license_statement = saneyaml.dump(new_extracted_license)
1351-
1352-
return get_license_detections_for_extracted_license_statement(
1368+
detections = get_license_detections_for_extracted_license_statement(
13531369
extracted_license_statement=extracted_license_statement,
13541370
try_as_expression=try_as_expression,
13551371
approximate=approximate,
13561372
expression_symbols=expression_symbols,
13571373
)
1374+
# TODO: if we have any unknown license, we need to try harder
1375+
# We can detect each license item individually and check if the unknown was detected
1376+
# in the name, URL or comment field.
1377+
# name, URL, comments
1378+
# name unknwon: keep that unknown in all cases
1379+
# URL or comments with unknown, but name not unknown: we want to combine the unknown
1380+
# matches with the correct name match
13581381

1359-
@classmethod
1360-
def check_extracted_license_statement_structure(cls, extracted_license):
1382+
return detections
13611383

1362-
is_list_of_mappings = False
1363-
if not isinstance(extracted_license, list):
1364-
return is_list_of_mappings
1365-
else:
1366-
is_list_of_mappings = True
13671384

1368-
for extracted_license_item in extracted_license:
1369-
if not isinstance(extracted_license_item, dict):
1370-
is_list_of_mappings = False
1371-
break
1385+
def clean_licenses(licenses):
1386+
"""
1387+
Return a modified, cleaned ``licenses`` list of POM license data cleaned from unwanted data
1388+
(some fields, empty entries, etc).
1389+
Each item in the list has this shape:
1390+
[
1391+
{"license": {"name": "Apache-2.0", "url": "https://www... ", "comments": "..."} },
1392+
{"license": {other fields} },
1393+
]
1394+
"""
1395+
for licitem in (licenses or []):
1396+
if not isinstance(licitem, dict):
1397+
continue
1398+
1399+
license_attributes = licitem.get("license")
1400+
if not license_attributes or not len(licitem) == 1:
1401+
continue
1402+
1403+
license_attributes.pop("distribution", None)
1404+
if not license_attributes.get("name"):
1405+
license_attributes.pop("name", None)
1406+
if not license_attributes.get("url"):
1407+
license_attributes.pop("url", None)
1408+
if not license_attributes.get("comments"):
1409+
license_attributes.pop("comments", None)
1410+
1411+
return licenses
13721412

1373-
return is_list_of_mappings
1413+
1414+
def is_standard_maven_license_data_structure(licenses):
1415+
"""
1416+
Return True if ``licenses`` has the structure expected from a Maven POM license data. The data
1417+
is a list of dicts of dicts, each top dict with a single item as {"license" : {mapping of
1418+
attributes}. We expect the POM license data to be in that shape in most cases, except for legacy
1419+
non POM 4 data.
1420+
Each item in the list has this shape:
1421+
[
1422+
{"license": {"name": "Apache-2.0", "url": "https://www... ", "comments": "..."} },
1423+
{"license": {other fields} },
1424+
]
1425+
1426+
"""
1427+
if not isinstance(licenses, list):
1428+
return False
1429+
1430+
fields = ("name", "url", "comment",)
1431+
1432+
for item in licenses:
1433+
if not isinstance(item, dict):
1434+
return False
1435+
if not len(item) == 1:
1436+
return False
1437+
litem = item.get('license') or {}
1438+
if not isinstance(litem, dict):
1439+
return False
1440+
if not any(field in item for field in fields):
1441+
return False
1442+
1443+
return True
13741444

13751445

13761446
def build_vcs_and_code_view_urls(scm):

tests/formattedcode/data/common/manifests-expected.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
"rule_relevance": 100,
4545
"rule_identifier": "cddl-1.0_98.RULE",
4646
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
47-
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
47+
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
4848
}
4949
],
5050
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
@@ -53,7 +53,7 @@
5353
"other_license_expression": null,
5454
"other_license_expression_spdx": null,
5555
"other_license_detections": [],
56-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
56+
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
5757
"notice_text": null,
5858
"source_packages": [
5959
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"
@@ -1142,7 +1142,7 @@
11421142
"rule_relevance": 100,
11431143
"rule_identifier": "cddl-1.0_98.RULE",
11441144
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
1145-
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
1145+
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
11461146
}
11471147
],
11481148
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
@@ -1151,7 +1151,7 @@
11511151
"other_license_expression": null,
11521152
"other_license_expression_spdx": null,
11531153
"other_license_detections": [],
1154-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
1154+
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
11551155
"notice_text": null,
11561156
"source_packages": [
11571157
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"

tests/formattedcode/data/common/manifests-expected.jsonlines

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
"rule_relevance": 100,
7777
"rule_identifier": "cddl-1.0_98.RULE",
7878
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
79-
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
79+
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
8080
}
8181
],
8282
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
@@ -85,7 +85,7 @@
8585
"other_license_expression": null,
8686
"other_license_expression_spdx": null,
8787
"other_license_detections": [],
88-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
88+
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
8989
"notice_text": null,
9090
"source_packages": [
9191
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"
@@ -1188,7 +1188,7 @@
11881188
"rule_relevance": 100,
11891189
"rule_identifier": "cddl-1.0_98.RULE",
11901190
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
1191-
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
1191+
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
11921192
}
11931193
],
11941194
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
@@ -1197,7 +1197,7 @@
11971197
"other_license_expression": null,
11981198
"other_license_expression_spdx": null,
11991199
"other_license_detections": [],
1200-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
1200+
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
12011201
"notice_text": null,
12021202
"source_packages": [
12031203
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"

tests/formattedcode/data/common/manifests-expected.yaml

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,15 +105,16 @@ packages:
105105
rule_identifier: cddl-1.0_98.RULE
106106
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
107107
matched_text: |
108-
- name: Common Development and Distribution License (CDDL) v1.0
109-
url: http://www.sun.com/cddl/cddl.html
108+
name: Common Development and Distribution License (CDDL) v1.0
109+
url: http://www.sun.com/cddl/cddl.html
110110
identifier: cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c
111111
other_license_expression:
112112
other_license_expression_spdx:
113113
other_license_detections: []
114114
extracted_license_statement: |
115-
- name: Common Development and Distribution License (CDDL) v1.0
116-
url: http://www.sun.com/cddl/cddl.html
115+
- license:
116+
name: Common Development and Distribution License (CDDL) v1.0
117+
url: http://www.sun.com/cddl/cddl.html
117118
notice_text:
118119
source_packages:
119120
- pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources
@@ -818,8 +819,8 @@ license_detections:
818819
rule_identifier: cddl-1.0_98.RULE
819820
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
820821
matched_text: |
821-
- name: Common Development and Distribution License (CDDL) v1.0
822-
url: http://www.sun.com/cddl/cddl.html
822+
name: Common Development and Distribution License (CDDL) v1.0
823+
url: http://www.sun.com/cddl/cddl.html
823824
- identifier: lgpl_3_0-272571eb-5e68-95b6-ddb0-71de2d8df321
824825
license_expression: lgpl-3.0
825826
license_expression_spdx: LGPL-3.0-only
@@ -2006,15 +2007,16 @@ files:
20062007
rule_identifier: cddl-1.0_98.RULE
20072008
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
20082009
matched_text: |
2009-
- name: Common Development and Distribution License (CDDL) v1.0
2010-
url: http://www.sun.com/cddl/cddl.html
2010+
name: Common Development and Distribution License (CDDL) v1.0
2011+
url: http://www.sun.com/cddl/cddl.html
20112012
identifier: cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c
20122013
other_license_expression:
20132014
other_license_expression_spdx:
20142015
other_license_detections: []
20152016
extracted_license_statement: |
2016-
- name: Common Development and Distribution License (CDDL) v1.0
2017-
url: http://www.sun.com/cddl/cddl.html
2017+
- license:
2018+
name: Common Development and Distribution License (CDDL) v1.0
2019+
url: http://www.sun.com/cddl/cddl.html
20182020
notice_text:
20192021
source_packages:
20202022
- pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources

tests/packagedcode/data/m2/aopalliance/aopalliance/1.0/aopalliance-1.0.pom.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
"organization_url": null,
1515
"licenses": [
1616
{
17-
"name": "Public Domain",
18-
"url": null,
19-
"comments": null,
20-
"distribution": null
17+
"license": {
18+
"name": "Public Domain",
19+
"url": null,
20+
"comments": null,
21+
"distribution": null
22+
}
2123
}
2224
],
2325
"developers": [],

tests/packagedcode/data/m2/aopalliance/aopalliance/1.0/aopalliance-1.0.pom.package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
"rule_relevance": 70,
4343
"rule_identifier": "public-domain_bare_words.RULE",
4444
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/public-domain_bare_words.RULE",
45-
"matched_text": "- name: Public Domain"
45+
"matched_text": "name: Public Domain"
4646
}
4747
],
4848
"identifier": "public_domain-3dd945ae-65df-7d90-6467-60f8ecf2eb77"
@@ -51,7 +51,7 @@
5151
"other_license_expression": null,
5252
"other_license_expression_spdx": null,
5353
"other_license_detections": [],
54-
"extracted_license_statement": "- name: Public Domain\n",
54+
"extracted_license_statement": "- license:\n name: Public Domain\n",
5555
"notice_text": null,
5656
"source_packages": [
5757
"pkg:maven/aopalliance/aopalliance@1.0?classifier=sources"

tests/packagedcode/data/m2/aspectj/aspectjrt/1.5.3/aspectjrt-1.5.3.pom.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
"organization_url": null,
1515
"licenses": [
1616
{
17-
"name": "Eclipse Public License - v 1.0",
18-
"url": "http://www.eclipse.org/legal/epl-v10.html",
19-
"comments": null,
20-
"distribution": "repo"
17+
"license": {
18+
"name": "Eclipse Public License - v 1.0",
19+
"url": "http://www.eclipse.org/legal/epl-v10.html",
20+
"comments": null,
21+
"distribution": "repo"
22+
}
2123
}
2224
],
2325
"developers": [],

tests/packagedcode/data/m2/aspectj/aspectjrt/1.5.3/aspectjrt-1.5.3.pom.package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
"rule_relevance": 100,
4343
"rule_identifier": "epl-1.0_4.RULE",
4444
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/epl-1.0_4.RULE",
45-
"matched_text": "- name: Eclipse Public License - v 1.0"
45+
"matched_text": "name: Eclipse Public License - v 1.0"
4646
},
4747
{
4848
"license_expression": "epl-1.0",
@@ -57,7 +57,7 @@
5757
"rule_relevance": 100,
5858
"rule_identifier": "epl-1.0.RULE",
5959
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/epl-1.0.RULE",
60-
"matched_text": " url: http://www.eclipse.org/legal/epl-v10.html"
60+
"matched_text": "url: http://www.eclipse.org/legal/epl-v10.html"
6161
}
6262
],
6363
"identifier": "epl_1_0-48d15cd3-0ccf-4f62-3d30-24dc9b7308e5"
@@ -66,7 +66,7 @@
6666
"other_license_expression": null,
6767
"other_license_expression_spdx": null,
6868
"other_license_detections": [],
69-
"extracted_license_statement": "- name: Eclipse Public License - v 1.0\n url: http://www.eclipse.org/legal/epl-v10.html\n",
69+
"extracted_license_statement": "- license:\n name: Eclipse Public License - v 1.0\n url: http://www.eclipse.org/legal/epl-v10.html\n",
7070
"notice_text": null,
7171
"source_packages": [
7272
"pkg:maven/aspectj/aspectjrt@1.5.3?classifier=sources"

0 commit comments

Comments
 (0)