Skip to content

Commit b1cef69

Browse files
committed
Improve package datafile handlers
* Add new attributes to the DatafileHandler class for a datasource type and the supported operating systems * Add and test new validate() method to validate DatafileHandler collection correctness * Apply minor refactorings and code formatting Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent dd675aa commit b1cef69

File tree

11 files changed

+141
-35
lines changed

11 files changed

+141
-35
lines changed

src/packagedcode/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
import attr
11-
1210
from commoncode.system import on_linux
1311
from packagedcode import about
1412
from packagedcode import alpine
@@ -254,6 +252,7 @@
254252
]
255253
)
256254

255+
# registry of all handler classes keyed by datasource_id
257256
HANDLER_BY_DATASOURCE_ID = {handler.datasource_id: handler for handler in ALL_DATAFILE_HANDLERS}
258257

259258

@@ -263,8 +262,8 @@ class UnknownPackageDatasource(Exception):
263262

264263
def get_package_handler(package_data):
265264
"""
266-
Return the DatafileHandler class that corresponds to a ``package_data``
267-
PackageData object. Raise a UnknownPackageDatasource error if the
265+
Return the DatafileHandler class for the datasource_id of a ``package_data``
266+
PackageData object. Raise a UnknownPackageDatasource error if the
268267
DatafileHandler is not found.
269268
"""
270269
ppc = HANDLER_BY_DATASOURCE_ID.get(package_data.datasource_id)

src/packagedcode/alpine.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def get_license_detections_and_expression(package):
5858

5959
class AlpineInstalledDatabaseHandler(models.DatafileHandler):
6060
datasource_id = 'alpine_installed_db'
61+
datasource_type = 'sys'
6162
path_patterns = ('*lib/apk/db/installed',)
6263
default_package_type = 'alpine'
6364
description = 'Alpine Linux installed package database'

src/packagedcode/debian.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder)
223223

224224
class DebianInstalledStatusDatabaseHandler(models.DatafileHandler):
225225
datasource_id = 'debian_installed_status_db'
226+
datasource_type = 'sys'
226227
default_package_type = 'deb'
227228
path_patterns = ('*var/lib/dpkg/status',)
228229
description = 'Debian installed packages database'
@@ -391,6 +392,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
391392

392393
class DebianDistrolessInstalledDatabaseHandler(models.DatafileHandler):
393394
datasource_id = 'debian_distroless_installed_db'
395+
datasource_type = 'sys'
394396
default_package_type = 'deb'
395397
path_patterns = ('*var/lib/dpkg/status.d/*',)
396398
description = 'Debian distroless installed database'
@@ -474,6 +476,7 @@ class DebianInstalledFilelistHandler(models.DatafileHandler):
474476
# seen in installed rootfs in:
475477
# - /var/lib/dpkg/info/<package name>.list
476478
datasource_id = 'debian_installed_files_list'
479+
datasource_type = 'sys'
477480
default_package_type = 'deb'
478481
path_patterns = (
479482
'*var/lib/dpkg/info/*.list',
@@ -499,6 +502,7 @@ class DebianInstalledMd5sumFilelistHandler(models.DatafileHandler):
499502
# - /var/lib/dpkg/info/<package name>.md5sums
500503
# - /var/lib/dpkg/info/<package name:arch>.md5sums
501504
datasource_id = 'debian_installed_md5sums'
505+
datasource_type = 'sys'
502506
default_package_type = 'deb'
503507
path_patterns = (
504508
'*var/lib/dpkg/info/*.md5sums',

src/packagedcode/maven.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
there is no pom.properties check if there are side-by-side artifacts
5656
"""
5757

58+
5859
class MavenBasePackageHandler(models.DatafileHandler):
5960

6061
@classmethod
@@ -71,7 +72,7 @@ def assemble(cls, package_data, resource, codebase, package_adder=models.add_to_
7172
datafile_path = resource.path
7273

7374
# This order is important as we want pom.xml to be used for package
74-
# creation and then to update from MANIFEST later
75+
# creation and then to update from MANIFEST later
7576
manifest_path_pattern = '*/META-INF/MANIFEST.MF'
7677
nested_pom_xml_path_pattern = '*/META-INF/maven/**/pom.xml'
7778
datafile_name_patterns = (nested_pom_xml_path_pattern, manifest_path_pattern)
@@ -103,7 +104,7 @@ def assemble(cls, package_data, resource, codebase, package_adder=models.add_to_
103104
return
104105

105106
if manifests and pom_xmls:
106-
#raise Exception(resource.path, meta_inf_resource, datafile_name_patterns, package_adder)
107+
# raise Exception(resource.path, meta_inf_resource, datafile_name_patterns, package_adder)
107108
parent_resource = meta_inf_resource.parent(codebase)
108109
if not parent_resource:
109110
parent_resource = meta_inf_resource
@@ -272,7 +273,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder)
272273
for child in root.walk(codebase):
273274
if 'pom.xml' in child.path:
274275
number_poms += 1
275-
276+
276277
if number_poms > 1:
277278
root = resource
278279
else:
@@ -315,7 +316,7 @@ def parse(cls, location, package_only=False):
315316
if TRACE:
316317
logger.debug(f'MavenPomPropertiesHandler.parse: properties: {properties!r}')
317318
if properties:
318-
yield from cls.parse_pom_properties(properties=properties, package_only=package_only)
319+
yield from cls.parse_pom_properties(properties=properties, package_only=package_only)
319320

320321
@classmethod
321322
def parse_pom_properties(cls, properties, package_only=False):
@@ -1308,11 +1309,14 @@ def _parse(
13081309
)
13091310
return MavenPackageData.from_data(package_data, package_only)
13101311

1312+
13111313
class MavenPackageData(models.PackageData):
13121314

13131315
datasource_id = 'maven_pom'
13141316

1317+
@classmethod
13151318
def get_license_detections_for_extracted_license_statement(
1319+
cls,
13161320
extracted_license,
13171321
try_as_expression=True,
13181322
approximate=True,
@@ -1321,16 +1325,16 @@ def get_license_detections_for_extracted_license_statement(
13211325
from packagedcode.licensing import get_normalized_license_detections
13221326
from packagedcode.licensing import get_license_detections_for_extracted_license_statement
13231327

1324-
if not MavenPackageData.check_extracted_license_statement_structure(extracted_license):
1328+
if not cls.check_extracted_license_statement_structure(extracted_license):
13251329
return get_normalized_license_detections(
13261330
extracted_license=extracted_license,
13271331
try_as_expression=try_as_expression,
13281332
approximate=approximate,
13291333
expression_symbols=expression_symbols,
13301334
)
1331-
1335+
13321336
new_extracted_license = extracted_license.copy()
1333-
1337+
13341338
for license_entry in new_extracted_license:
13351339
license_entry.pop("distribution")
13361340
if not license_entry.get("name"):
@@ -1349,8 +1353,8 @@ def get_license_detections_for_extracted_license_statement(
13491353
expression_symbols=expression_symbols,
13501354
)
13511355

1352-
1353-
def check_extracted_license_statement_structure(extracted_license):
1356+
@classmethod
1357+
def check_extracted_license_statement_structure(cls, extracted_license):
13541358

13551359
is_list_of_mappings = False
13561360
if not isinstance(extracted_license, list):
@@ -1362,7 +1366,7 @@ def check_extracted_license_statement_structure(extracted_license):
13621366
if not isinstance(extracted_license_item, dict):
13631367
is_list_of_mappings = False
13641368
break
1365-
1369+
13661370
return is_list_of_mappings
13671371

13681372

src/packagedcode/models.py

Lines changed: 64 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,29 +7,30 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
import logging
1011
import os
1112
import uuid
12-
from fnmatch import fnmatchcase
13-
import logging
1413
import sys
1514

15+
from fnmatch import fnmatchcase
16+
1617
import attr
17-
from packageurl import normalize_qualifiers
18-
from packageurl import PackageURL
1918
import saneyaml
2019

2120
from commoncode import filetype
21+
from commoncode.fileutils import as_posixpath
2222
from commoncode.datautils import choices
2323
from commoncode.datautils import Boolean
2424
from commoncode.datautils import Date
2525
from commoncode.datautils import Integer
2626
from commoncode.datautils import List
2727
from commoncode.datautils import Mapping
2828
from commoncode.datautils import String
29-
from commoncode.fileutils import as_posixpath
3029
from commoncode.resource import Resource
3130
from license_expression import combine_expressions
3231
from license_expression import Licensing
32+
from packageurl import normalize_qualifiers
33+
from packageurl import PackageURL
3334

3435
try:
3536
from typecode import contenttype
@@ -41,6 +42,7 @@
4142
except ImportError:
4243
licensing = None
4344

45+
# FIXME: what if licensing is not importable?
4446
from packagedcode.licensing import get_declared_license_expression_spdx
4547

4648
"""
@@ -963,7 +965,7 @@ def get_license_detections_and_expression(self):
963965
return [], None
964966

965967
if self.datasource_id:
966-
default_relation_license=get_default_relation_license(
968+
default_relation_license = get_default_relation_license(
967969
datasource_id=self.datasource_id,
968970
)
969971
else:
@@ -1017,12 +1019,11 @@ def add_to_package(package_uid, resource, codebase):
10171019

10181020
class DatafileHandler:
10191021
"""
1020-
A base handler class to handle any package manifests, lockfiles and data
1021-
files. Each subclass handles a package datafile format to parse datafiles
1022-
and assemble Package and Depdencies from these:
1022+
A base handler class to handle any package manifest, lockfile, package database
1023+
and related data files. Each subclass handles a package datafile format to parse
1024+
datafiles and assemble Package and Dependencies from these:
10231025
10241026
- parses a datafile format and yields package data.
1025-
10261027
- assembles this datafile package data in top-level packages and dependencies
10271028
- assigns package files to their package
10281029
"""
@@ -1033,6 +1034,16 @@ class DatafileHandler:
10331034
# can only contain ASCII letters, digits and underscore. Must be lowercase
10341035
datasource_id = None
10351036

1037+
# style of package data processed by this handler, either app for application package like npm,
1038+
# sys for system packages like rpm, or info for informational data file that provides hints but
1039+
# is not a package manifest, like with a README file
1040+
# possible values are app, sys and info
1041+
datasource_type = 'app'
1042+
1043+
# tuple of specifically supported operating systems. If None or empty, all platforms are supported
1044+
# possible values are win, mac, linux, freebsd
1045+
supported_oses = tuple()
1046+
10361047
# Sequence of known fnmatch-style case-insensitive glob patterns (e.g., Unix
10371048
# shell style patterns) that apply on the whole POSIX path for package
10381049
# datafiles recognized and parsed by this parser. See fnmatch.fnmatch().
@@ -1053,7 +1064,7 @@ class DatafileHandler:
10531064
# Informational: Default primary language for this parser.
10541065
default_primary_language = None
10551066

1056-
# If the datafilehandler contains only resolved dependencies
1067+
# If the handler is for a lockfile that contains locked/pinned, pre-resolved dependencies
10571068
is_lockfile = False
10581069

10591070
# Informational: Description of this parser
@@ -1062,7 +1073,9 @@ class DatafileHandler:
10621073
# Informational: URL that documents this file format
10631074
documentation_url = None
10641075

1065-
# Default Relation between license elements detected in an `extracted_license_statement`
1076+
# Default license expression relation between the license detected in an
1077+
# `extracted_license_statement` for this data file.
1078+
# This may vary for each data file based on conventions and specifications.
10661079
default_relation_license = None
10671080

10681081
@classmethod
@@ -1491,11 +1504,44 @@ def get_top_level_resources(cls, manifest_resource, codebase):
14911504
"""
14921505
pass
14931506

1507+
@classmethod
def validate(cls):
    """
    Validate the configuration of this handler class.

    Check in order that:
    - ``datasource_id`` is not empty,
    - ``datasource_type`` is one of 'app', 'sys' or 'info',
    - ``supported_oses`` is either empty or contains only known operating
      system codes.

    Return None when the class is valid.
    Raise ImproperlyConfiguredDatafileHandler exception on errors.
    """
    did = cls.datasource_id
    if not did:
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {cls!r} has an empty datasource_id {did!r}.')

    DATASOURCE_TYPES = 'app', 'sys', 'info',
    dfs = cls.datasource_type
    if dfs not in DATASOURCE_TYPES:
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {did!r} : {cls!r} has an invalid '
            f'datasource_type: {dfs!r}: must be one of {DATASOURCE_TYPES!r}.'
        )

    # The documented codes for supported_oses are win, mac, linux and freebsd:
    # the macOS code is 'mac' (this was previously misspelled as 'max', which
    # rejected handlers that correctly declared 'mac').
    oses = 'linux', 'win', 'mac', 'freebsd',
    soses = cls.supported_oses
    # An empty or None supported_oses means that all platforms are supported.
    if soses and not all(s in oses for s in soses):
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {did!r} : {cls!r} has invalid '
            f'supported_oses: {soses!r}: must be empty or among {oses!r}'
        )
1534+
1535+
1536+
class ImproperlyConfiguredDatafileHandler(Exception):
1537+
"""
Raised when a ScanCode package DatafileHandler class is not properly
configured, such as when its datasource_id is empty.
"""
1538+
pass
1539+
14941540

14951541
class NonAssemblableDatafileHandler(DatafileHandler):
14961542
"""
1497-
A handler that has no default implmentation for the assemble method, e.g.,
1498-
it will not alone trigger the creation of a top-level Pacakge.
1543+
A handler with a default implementation of an assemble method doing nothing, i.e.,
1544+
it will not alone trigger the creation of a top-level Package.
14991545
"""
15001546

15011547
@classmethod
@@ -1528,8 +1574,8 @@ def build_purl(mapping):
15281574
subpath = mapping.get('subpath')
15291575
return PackageURL(
15301576
type=ptype,
1531-
name=name,
15321577
namespace=namespace,
1578+
name=name,
15331579
version=version,
15341580
qualifiers=qualifiers,
15351581
subpath=subpath,
@@ -1601,10 +1647,10 @@ def from_package_data(cls, package_data, datafile_path, package_only=False):
16011647
license_match['from_file'] = datafile_path
16021648

16031649
package = cls.from_dict(package_data_mapping)
1604-
1650+
16051651
if not package.package_uid:
16061652
package.package_uid = build_package_uid(package.purl)
1607-
1653+
16081654
if not package_only:
16091655
package.populate_license_fields()
16101656
package.populate_holder_field()
@@ -1763,7 +1809,7 @@ def refresh_license_expressions(self, default_relation='AND'):
17631809
self.declared_license_expression_spdx = get_declared_license_expression_spdx(
17641810
declared_license_expression=self.declared_license_expression,
17651811
)
1766-
1812+
17671813
if self.other_license_detections:
17681814
self.other_license_expression = str(combine_expressions(
17691815
expressions=[

src/packagedcode/msi.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ class MsiInstallerHandler(models.DatafileHandler):
195195
default_package_type = 'msi'
196196
description = 'Microsoft MSI installer'
197197
documentation_url = 'https://docs.microsoft.com/en-us/windows/win32/msi/windows-installer-portal'
198+
supported_oses = ('linux',)
198199

199200
@classmethod
200201
def parse(cls, location, package_only=False):

src/packagedcode/pypi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1093,7 +1093,7 @@ def parse(cls, location, package_only=False):
10931093
package_only=package_only,
10941094
)
10951095

1096-
1096+
# FIXME: this is NOT used
10971097
class PypiSdistArchiveHandler(models.DatafileHandler):
10981098
datasource_id = 'pypi_sdist'
10991099
path_patterns = ('*.tar.gz', '*.tar.bz2', '*.zip',)

0 commit comments

Comments
 (0)