Skip to content

Commit 8ed3a53

Browse files
Merge pull request #4261 from aboutcode-org/improve-maven-license
Improve maven license detection
2 parents 427e8ae + f2f5ef9 commit 8ed3a53

File tree

279 files changed

+3144
-26945
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

279 files changed

+3144
-26945
lines changed

src/packagedcode/maven.py

Lines changed: 105 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -766,13 +766,14 @@ def _get_comments(self, xml=None):
766766
def _find_licenses(self):
767767
"""Return an iterable of license mappings."""
768768
for lic in self.pom_data.findall('licenses/license'):
769-
yield dict([
770-
('name', self._get_attribute('name', lic)),
771-
('url', self._get_attribute('url', lic)),
772-
('comments', self._get_attribute('comments', lic)),
773-
# arcane and seldom used
774-
('distribution', self._get_attribute('distribution', lic)),
775-
])
769+
yield {"license": dict([
770+
('name', self._get_attribute('name', lic)),
771+
('url', self._get_attribute('url', lic)),
772+
('comments', self._get_attribute('comments', lic)),
773+
# arcane and seldom used
774+
('distribution', self._get_attribute('distribution', lic)),
775+
])
776+
}
776777

777778
def _find_parties(self, key='developers/developer'):
778779
"""Return an iterable of party mappings for a given xpath."""
@@ -1254,7 +1255,7 @@ def _parse(
12541255
# complex defeinition in Maven
12551256
qualifiers['type'] = extension
12561257

1257-
extracted_license_statement = pom.licenses
1258+
extracted_license_statement = clean_licenses(pom.licenses) or None
12581259

12591260
group_id = pom.group_id
12601261
artifact_id = pom.artifact_id
@@ -1325,52 +1326,121 @@ def get_license_detections_for_extracted_license_statement(
13251326
approximate=True,
13261327
expression_symbols=None,
13271328
):
1329+
"""
1330+
Return license detections from a Maven POM license data structure.
1331+
This looks like this in XML, and some attributes are more important than others.
1332+
Which one exists and whether we can detect a proper license in each also determines which
1333+
attribute we need to consider.
1334+
The original XML has this shape:
1335+
<licenses>
1336+
<license>
1337+
<name>Apache-2.0</name>
1338+
<url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
1339+
<distribution>repo</distribution>
1340+
<comments> notes... </comments>
1341+
</license>
1342+
</licenses>
1343+
The data structure we keep has this shape:
1344+
[{"license":
1345+
{
1346+
"name": "Apache-2.0",
1347+
"url": "https://www.apache.org/licenses/LICENSE-2.0.txt",
1348+
"comments": "notes...",
1349+
}
1350+
},
1351+
.... other license]
1352+
"""
1353+
13281354
from packagedcode.licensing import get_normalized_license_detections
13291355
from packagedcode.licensing import get_license_detections_for_extracted_license_statement
13301356

1331-
if not cls.check_extracted_license_statement_structure(extracted_license):
1357+
if not is_standard_maven_license_data_structure(licenses=extracted_license):
1358+
# use the generic detection
13321359
return get_normalized_license_detections(
13331360
extracted_license=extracted_license,
13341361
try_as_expression=try_as_expression,
13351362
approximate=approximate,
13361363
expression_symbols=expression_symbols,
13371364
)
1365+
extracted_license = clean_licenses(extracted_license)
1366+
extracted_license_statement = saneyaml.dump(extracted_license)
13381367

1339-
new_extracted_license = extracted_license.copy()
1340-
1341-
for license_entry in new_extracted_license:
1342-
license_entry.pop("distribution")
1343-
if not license_entry.get("name"):
1344-
license_entry.pop("name")
1345-
if not license_entry.get("url"):
1346-
license_entry.pop("url")
1347-
if not license_entry.get("comments"):
1348-
license_entry.pop("comments")
1349-
1350-
extracted_license_statement = saneyaml.dump(new_extracted_license)
1351-
1352-
return get_license_detections_for_extracted_license_statement(
1368+
detections = get_license_detections_for_extracted_license_statement(
13531369
extracted_license_statement=extracted_license_statement,
13541370
try_as_expression=try_as_expression,
13551371
approximate=approximate,
13561372
expression_symbols=expression_symbols,
13571373
)
1374+
# TODO: if we have any unknown license, we need to try harder
1375+
# We can detect each license item individually and check if the unknown was detected
1376+
# in the name, URL or comment field.
1377+
# name, URL, comments
1378+
# name unknwon: keep that unknown in all cases
1379+
# URL or comments with unknown, but name not unknown: we want to combine the unknown
1380+
# matches with the correct name match
13581381

1359-
@classmethod
1360-
def check_extracted_license_statement_structure(cls, extracted_license):
1382+
return detections
13611383

1362-
is_list_of_mappings = False
1363-
if not isinstance(extracted_license, list):
1364-
return is_list_of_mappings
1365-
else:
1366-
is_list_of_mappings = True
13671384

1368-
for extracted_license_item in extracted_license:
1369-
if not isinstance(extracted_license_item, dict):
1370-
is_list_of_mappings = False
1371-
break
1385+
def clean_licenses(licenses):
1386+
"""
1387+
Return a modified, cleaned ``licenses`` list of POM license data cleaned from unwanted data
1388+
(some fields, empty entries, etc).
1389+
Each item in the list has this shape:
1390+
[
1391+
{"license": {"name": "Apache-2.0", "url": "https://www... ", "comments": "..."} },
1392+
{"license": {other fields} },
1393+
]
1394+
"""
1395+
for licitem in (licenses or []):
1396+
if not isinstance(licitem, dict):
1397+
continue
1398+
1399+
license_attributes = licitem.get("license")
1400+
if not license_attributes or not len(licitem) == 1:
1401+
continue
1402+
1403+
license_attributes.pop("distribution", None)
1404+
if not license_attributes.get("name"):
1405+
license_attributes.pop("name", None)
1406+
if not license_attributes.get("url"):
1407+
license_attributes.pop("url", None)
1408+
if not license_attributes.get("comments"):
1409+
license_attributes.pop("comments", None)
1410+
1411+
return licenses
13721412

1373-
return is_list_of_mappings
1413+
1414+
def is_standard_maven_license_data_structure(licenses):
1415+
"""
1416+
Return True if ``licenses`` has the structure expected from a Maven POM license data. The data
1417+
is a list of dicts of dicts, each top dict with a single item as {"license" : {mapping of
1418+
attributes}. We expect the POM license data to be in that shape in most cases, except for legacy
1419+
non POM 4 data.
1420+
Each item in the list has this shape:
1421+
[
1422+
{"license": {"name": "Apache-2.0", "url": "https://www... ", "comments": "..."} },
1423+
{"license": {other fields} },
1424+
]
1425+
1426+
"""
1427+
if not isinstance(licenses, list):
1428+
return False
1429+
1430+
fields = ("name", "url", "comment",)
1431+
1432+
for item in licenses:
1433+
if not isinstance(item, dict):
1434+
return False
1435+
if not len(item) == 1:
1436+
return False
1437+
litem = item.get('license') or {}
1438+
if not isinstance(litem, dict):
1439+
return False
1440+
if not any(field in item for field in fields):
1441+
return False
1442+
1443+
return True
13741444

13751445

13761446
def build_vcs_and_code_view_urls(scm):

tests/formattedcode/data/common/manifests-expected.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
"rule_relevance": 100,
4545
"rule_identifier": "cddl-1.0_98.RULE",
4646
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
47-
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
47+
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
4848
}
4949
],
5050
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
@@ -53,7 +53,7 @@
5353
"other_license_expression": null,
5454
"other_license_expression_spdx": null,
5555
"other_license_detections": [],
56-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
56+
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
5757
"notice_text": null,
5858
"source_packages": [
5959
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"
@@ -1142,7 +1142,7 @@
11421142
"rule_relevance": 100,
11431143
"rule_identifier": "cddl-1.0_98.RULE",
11441144
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
1145-
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
1145+
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
11461146
}
11471147
],
11481148
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
@@ -1151,7 +1151,7 @@
11511151
"other_license_expression": null,
11521152
"other_license_expression_spdx": null,
11531153
"other_license_detections": [],
1154-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
1154+
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
11551155
"notice_text": null,
11561156
"source_packages": [
11571157
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"

tests/formattedcode/data/common/manifests-expected.jsonlines

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
"rule_relevance": 100,
7777
"rule_identifier": "cddl-1.0_98.RULE",
7878
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
79-
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
79+
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
8080
}
8181
],
8282
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
@@ -85,7 +85,7 @@
8585
"other_license_expression": null,
8686
"other_license_expression_spdx": null,
8787
"other_license_detections": [],
88-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
88+
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
8989
"notice_text": null,
9090
"source_packages": [
9191
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"
@@ -1188,7 +1188,7 @@
11881188
"rule_relevance": 100,
11891189
"rule_identifier": "cddl-1.0_98.RULE",
11901190
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE",
1191-
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html"
1191+
"matched_text": "name: Common Development and Distribution License (CDDL) v1.0\nurl: http://www.sun.com/cddl/cddl.html"
11921192
}
11931193
],
11941194
"identifier": "cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c"
@@ -1197,7 +1197,7 @@
11971197
"other_license_expression": null,
11981198
"other_license_expression_spdx": null,
11991199
"other_license_detections": [],
1200-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
1200+
"extracted_license_statement": "- license:\n name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
12011201
"notice_text": null,
12021202
"source_packages": [
12031203
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"

tests/formattedcode/data/common/manifests-expected.yaml

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,15 +105,16 @@ packages:
105105
rule_identifier: cddl-1.0_98.RULE
106106
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
107107
matched_text: |
108-
- name: Common Development and Distribution License (CDDL) v1.0
109-
url: http://www.sun.com/cddl/cddl.html
108+
name: Common Development and Distribution License (CDDL) v1.0
109+
url: http://www.sun.com/cddl/cddl.html
110110
identifier: cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c
111111
other_license_expression:
112112
other_license_expression_spdx:
113113
other_license_detections: []
114114
extracted_license_statement: |
115-
- name: Common Development and Distribution License (CDDL) v1.0
116-
url: http://www.sun.com/cddl/cddl.html
115+
- license:
116+
name: Common Development and Distribution License (CDDL) v1.0
117+
url: http://www.sun.com/cddl/cddl.html
117118
notice_text:
118119
source_packages:
119120
- pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources
@@ -818,8 +819,8 @@ license_detections:
818819
rule_identifier: cddl-1.0_98.RULE
819820
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
820821
matched_text: |
821-
- name: Common Development and Distribution License (CDDL) v1.0
822-
url: http://www.sun.com/cddl/cddl.html
822+
name: Common Development and Distribution License (CDDL) v1.0
823+
url: http://www.sun.com/cddl/cddl.html
823824
- identifier: lgpl_3_0-272571eb-5e68-95b6-ddb0-71de2d8df321
824825
license_expression: lgpl-3.0
825826
license_expression_spdx: LGPL-3.0-only
@@ -2006,15 +2007,16 @@ files:
20062007
rule_identifier: cddl-1.0_98.RULE
20072008
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_98.RULE
20082009
matched_text: |
2009-
- name: Common Development and Distribution License (CDDL) v1.0
2010-
url: http://www.sun.com/cddl/cddl.html
2010+
name: Common Development and Distribution License (CDDL) v1.0
2011+
url: http://www.sun.com/cddl/cddl.html
20112012
identifier: cddl_1_0-b17acf03-1e4f-20e6-cbb8-1b6945ee4c4c
20122013
other_license_expression:
20132014
other_license_expression_spdx:
20142015
other_license_detections: []
20152016
extracted_license_statement: |
2016-
- name: Common Development and Distribution License (CDDL) v1.0
2017-
url: http://www.sun.com/cddl/cddl.html
2017+
- license:
2018+
name: Common Development and Distribution License (CDDL) v1.0
2019+
url: http://www.sun.com/cddl/cddl.html
20182020
notice_text:
20192021
source_packages:
20202022
- pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources

tests/packagedcode/data/m2/aopalliance/aopalliance/1.0/aopalliance-1.0.pom.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
"organization_url": null,
1515
"licenses": [
1616
{
17-
"name": "Public Domain",
18-
"url": null,
19-
"comments": null,
20-
"distribution": null
17+
"license": {
18+
"name": "Public Domain",
19+
"url": null,
20+
"comments": null,
21+
"distribution": null
22+
}
2123
}
2224
],
2325
"developers": [],

tests/packagedcode/data/m2/aopalliance/aopalliance/1.0/aopalliance-1.0.pom.package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
"rule_relevance": 70,
4343
"rule_identifier": "public-domain_bare_words.RULE",
4444
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/public-domain_bare_words.RULE",
45-
"matched_text": "- name: Public Domain"
45+
"matched_text": "name: Public Domain"
4646
}
4747
],
4848
"identifier": "public_domain-3dd945ae-65df-7d90-6467-60f8ecf2eb77"
@@ -51,7 +51,7 @@
5151
"other_license_expression": null,
5252
"other_license_expression_spdx": null,
5353
"other_license_detections": [],
54-
"extracted_license_statement": "- name: Public Domain\n",
54+
"extracted_license_statement": "- license:\n name: Public Domain\n",
5555
"notice_text": null,
5656
"source_packages": [
5757
"pkg:maven/aopalliance/aopalliance@1.0?classifier=sources"

tests/packagedcode/data/m2/aspectj/aspectjrt/1.5.3/aspectjrt-1.5.3.pom.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
"organization_url": null,
1515
"licenses": [
1616
{
17-
"name": "Eclipse Public License - v 1.0",
18-
"url": "http://www.eclipse.org/legal/epl-v10.html",
19-
"comments": null,
20-
"distribution": "repo"
17+
"license": {
18+
"name": "Eclipse Public License - v 1.0",
19+
"url": "http://www.eclipse.org/legal/epl-v10.html",
20+
"comments": null,
21+
"distribution": "repo"
22+
}
2123
}
2224
],
2325
"developers": [],

tests/packagedcode/data/m2/aspectj/aspectjrt/1.5.3/aspectjrt-1.5.3.pom.package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
"rule_relevance": 100,
4343
"rule_identifier": "epl-1.0_4.RULE",
4444
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/epl-1.0_4.RULE",
45-
"matched_text": "- name: Eclipse Public License - v 1.0"
45+
"matched_text": "name: Eclipse Public License - v 1.0"
4646
},
4747
{
4848
"license_expression": "epl-1.0",
@@ -57,7 +57,7 @@
5757
"rule_relevance": 100,
5858
"rule_identifier": "epl-1.0.RULE",
5959
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/epl-1.0.RULE",
60-
"matched_text": " url: http://www.eclipse.org/legal/epl-v10.html"
60+
"matched_text": "url: http://www.eclipse.org/legal/epl-v10.html"
6161
}
6262
],
6363
"identifier": "epl_1_0-48d15cd3-0ccf-4f62-3d30-24dc9b7308e5"
@@ -66,7 +66,7 @@
6666
"other_license_expression": null,
6767
"other_license_expression_spdx": null,
6868
"other_license_detections": [],
69-
"extracted_license_statement": "- name: Eclipse Public License - v 1.0\n url: http://www.eclipse.org/legal/epl-v10.html\n",
69+
"extracted_license_statement": "- license:\n name: Eclipse Public License - v 1.0\n url: http://www.eclipse.org/legal/epl-v10.html\n",
7070
"notice_text": null,
7171
"source_packages": [
7272
"pkg:maven/aspectj/aspectjrt@1.5.3?classifier=sources"

0 commit comments

Comments
 (0)