Skip to content

Commit cac2c25

Browse files
pombredanne authored and aayushkdev committed
Improve package datafile handlers
* Add new attributes to the DatafileHandler class for a datasource type and the supported operating systems
* Add and test new validate() method to validate DatafileHandler collection correctness
* Apply minor refactorings and code formatting

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 824163f commit cac2c25

File tree

11 files changed

+139
-33
lines changed

11 files changed

+139
-33
lines changed

src/packagedcode/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
import attr
11-
1210
from commoncode.system import on_linux
1311
from packagedcode import about
1412
from packagedcode import alpine
@@ -260,6 +258,7 @@
260258
]
261259
)
262260

261+
# registry of all handler classes keyed by datasource_id
263262
HANDLER_BY_DATASOURCE_ID = {handler.datasource_id: handler for handler in ALL_DATAFILE_HANDLERS}
264263

265264

@@ -269,8 +268,8 @@ class UnknownPackageDatasource(Exception):
269268

270269
def get_package_handler(package_data):
271270
"""
272-
Return the DatafileHandler class that corresponds to a ``package_data``
273-
PackageData object. Raise a UnknownPackageDatasource error if the
271+
Return the DatafileHandler class for the ``datasource_id`` of a ``package_data``
272+
PackageData object. Raise an UnknownPackageDatasource error if the
274273
DatafileHandler is not found.
275274
"""
276275
ppc = HANDLER_BY_DATASOURCE_ID.get(package_data.datasource_id)

src/packagedcode/alpine.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def get_license_detections_and_expression(package):
5858

5959
class AlpineInstalledDatabaseHandler(models.DatafileHandler):
6060
datasource_id = 'alpine_installed_db'
61+
datasource_type = 'sys'
6162
path_patterns = ('*lib/apk/db/installed',)
6263
default_package_type = 'alpine'
6364
description = 'Alpine Linux installed package database'

src/packagedcode/debian.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder)
223223

224224
class DebianInstalledStatusDatabaseHandler(models.DatafileHandler):
225225
datasource_id = 'debian_installed_status_db'
226+
datasource_type = 'sys'
226227
default_package_type = 'deb'
227228
path_patterns = ('*var/lib/dpkg/status',)
228229
description = 'Debian installed packages database'
@@ -391,6 +392,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
391392

392393
class DebianDistrolessInstalledDatabaseHandler(models.DatafileHandler):
393394
datasource_id = 'debian_distroless_installed_db'
395+
datasource_type = 'sys'
394396
default_package_type = 'deb'
395397
path_patterns = ('*var/lib/dpkg/status.d/*',)
396398
description = 'Debian distroless installed database'
@@ -474,6 +476,7 @@ class DebianInstalledFilelistHandler(models.DatafileHandler):
474476
# seen in installed rootfs in:
475477
# - /var/lib/dpkg/info/<package name>.list
476478
datasource_id = 'debian_installed_files_list'
479+
datasource_type = 'sys'
477480
default_package_type = 'deb'
478481
path_patterns = (
479482
'*var/lib/dpkg/info/*.list',
@@ -499,6 +502,7 @@ class DebianInstalledMd5sumFilelistHandler(models.DatafileHandler):
499502
# - /var/lib/dpkg/info/<package name>.md5sums
500503
# - /var/lib/dpkg/info/<package name:arch>.md5sums
501504
datasource_id = 'debian_installed_md5sums'
505+
datasource_type = 'sys'
502506
default_package_type = 'deb'
503507
path_patterns = (
504508
'*var/lib/dpkg/info/*.md5sums',

src/packagedcode/maven.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
there is no pom.properties check if there are side-by-side artifacts
5656
"""
5757

58+
5859
class MavenBasePackageHandler(models.DatafileHandler):
5960

6061
@classmethod
@@ -71,7 +72,7 @@ def assemble(cls, package_data, resource, codebase, package_adder=models.add_to_
7172
datafile_path = resource.path
7273

7374
# This order is important as we want pom.xml to be used for package
74-
# creation and then to update from MANIFEST later
75+
# creation and then to update from MANIFEST later
7576
manifest_path_pattern = '*/META-INF/MANIFEST.MF'
7677
nested_pom_xml_path_pattern = '*/META-INF/maven/**/pom.xml'
7778
datafile_name_patterns = (nested_pom_xml_path_pattern, manifest_path_pattern)
@@ -103,7 +104,7 @@ def assemble(cls, package_data, resource, codebase, package_adder=models.add_to_
103104
return
104105

105106
if manifests and pom_xmls:
106-
#raise Exception(resource.path, meta_inf_resource, datafile_name_patterns, package_adder)
107+
# raise Exception(resource.path, meta_inf_resource, datafile_name_patterns, package_adder)
107108
parent_resource = meta_inf_resource.parent(codebase)
108109
if not parent_resource:
109110
parent_resource = meta_inf_resource
@@ -272,7 +273,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder)
272273
for child in root.walk(codebase):
273274
if 'pom.xml' in child.path:
274275
number_poms += 1
275-
276+
276277
if number_poms > 1:
277278
root = resource
278279
else:
@@ -315,7 +316,7 @@ def parse(cls, location, package_only=False):
315316
if TRACE:
316317
logger.debug(f'MavenPomPropertiesHandler.parse: properties: {properties!r}')
317318
if properties:
318-
yield from cls.parse_pom_properties(properties=properties, package_only=package_only)
319+
yield from cls.parse_pom_properties(properties=properties, package_only=package_only)
319320

320321
@classmethod
321322
def parse_pom_properties(cls, properties, package_only=False):
@@ -1308,11 +1309,14 @@ def _parse(
13081309
)
13091310
return MavenPackageData.from_data(package_data, package_only)
13101311

1312+
13111313
class MavenPackageData(models.PackageData):
13121314

13131315
datasource_id = 'maven_pom'
13141316

1317+
@classmethod
13151318
def get_license_detections_for_extracted_license_statement(
1319+
cls,
13161320
extracted_license,
13171321
try_as_expression=True,
13181322
approximate=True,
@@ -1321,16 +1325,16 @@ def get_license_detections_for_extracted_license_statement(
13211325
from packagedcode.licensing import get_normalized_license_detections
13221326
from packagedcode.licensing import get_license_detections_for_extracted_license_statement
13231327

1324-
if not MavenPackageData.check_extracted_license_statement_structure(extracted_license):
1328+
if not cls.check_extracted_license_statement_structure(extracted_license):
13251329
return get_normalized_license_detections(
13261330
extracted_license=extracted_license,
13271331
try_as_expression=try_as_expression,
13281332
approximate=approximate,
13291333
expression_symbols=expression_symbols,
13301334
)
1331-
1335+
13321336
new_extracted_license = extracted_license.copy()
1333-
1337+
13341338
for license_entry in new_extracted_license:
13351339
license_entry.pop("distribution")
13361340
if not license_entry.get("name"):
@@ -1349,8 +1353,8 @@ def get_license_detections_for_extracted_license_statement(
13491353
expression_symbols=expression_symbols,
13501354
)
13511355

1352-
1353-
def check_extracted_license_statement_structure(extracted_license):
1356+
@classmethod
1357+
def check_extracted_license_statement_structure(cls, extracted_license):
13541358

13551359
is_list_of_mappings = False
13561360
if not isinstance(extracted_license, list):
@@ -1362,7 +1366,7 @@ def check_extracted_license_statement_structure(extracted_license):
13621366
if not isinstance(extracted_license_item, dict):
13631367
is_list_of_mappings = False
13641368
break
1365-
1369+
13661370
return is_list_of_mappings
13671371

13681372

src/packagedcode/models.py

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,29 +7,30 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
import logging
1011
import os
1112
import uuid
12-
from fnmatch import fnmatchcase
13-
import logging
1413
import sys
1514

15+
from fnmatch import fnmatchcase
16+
1617
import attr
17-
from packageurl import normalize_qualifiers
18-
from packageurl import PackageURL
1918
import saneyaml
2019

2120
from commoncode import filetype
21+
from commoncode.fileutils import as_posixpath
2222
from commoncode.datautils import choices
2323
from commoncode.datautils import Boolean
2424
from commoncode.datautils import Date
2525
from commoncode.datautils import Integer
2626
from commoncode.datautils import List
2727
from commoncode.datautils import Mapping
2828
from commoncode.datautils import String
29-
from commoncode.fileutils import as_posixpath
3029
from commoncode.resource import Resource
3130
from license_expression import combine_expressions
3231
from license_expression import Licensing
32+
from packageurl import normalize_qualifiers
33+
from packageurl import PackageURL
3334

3435
try:
3536
from typecode import contenttype
@@ -41,6 +42,7 @@
4142
except ImportError:
4243
licensing = None
4344

45+
# FIXME: what if licensing is not importable?
4446
from packagedcode.licensing import get_declared_license_expression_spdx
4547

4648
"""
@@ -963,7 +965,7 @@ def get_license_detections_and_expression(self):
963965
return [], None
964966

965967
if self.datasource_id:
966-
default_relation_license=get_default_relation_license(
968+
default_relation_license = get_default_relation_license(
967969
datasource_id=self.datasource_id,
968970
)
969971
else:
@@ -1020,12 +1022,11 @@ def add_to_package(package_uid, resource, codebase):
10201022

10211023
class DatafileHandler:
10221024
"""
1023-
A base handler class to handle any package manifests, lockfiles and data
1024-
files. Each subclass handles a package datafile format to parse datafiles
1025-
and assemble Package and Depdencies from these:
1025+
A base handler class to handle any package manifest, lockfile, package database
1026+
and related data files. Each subclass handles a package datafile format to parse
1027+
datafiles and assemble Package and Dependencies from these:
10261028
10271029
- parses a datafile format and yields package data.
1028-
10291030
- assembles this datafile package data in top-level packages and dependencies
10301031
- assigns package files to their package
10311032
"""
@@ -1036,6 +1037,16 @@ class DatafileHandler:
10361037
# can only contain ASCII letters, digits and underscore. Must be lowercase
10371038
datasource_id = None
10381039

1040+
# style of package data processed by this handler, either app for application package like npm,
1041+
# sys for system packages like rpm, or info for informational data file that provides hints but
1042+
# is not a package manifest, like with a README file
1043+
# possible values are app, sys and info
1044+
datasource_type = 'app'
1045+
1046+
# tuple of specifically supported operating systems. If None or empty, all platforms are supported
1047+
# possible values are win, mac, linux, freebsd
1048+
supported_oses = tuple()
1049+
10391050
# Sequence of known fnmatch-style case-insensitive glob patterns (e.g., Unix
10401051
# shell style patterns) that apply on the whole POSIX path for package
10411052
# datafiles recognized and parsed by this parser. See fnmatch.fnmatch().
@@ -1056,7 +1067,7 @@ class DatafileHandler:
10561067
# Informational: Default primary language for this parser.
10571068
default_primary_language = None
10581069

1059-
# If the datafilehandler contains only resolved dependencies
1070+
# If the handler is for a lockfile that contains locked/pinned, pre-resolved dependencies
10601071
is_lockfile = False
10611072

10621073
# Informational: Description of this parser
@@ -1065,7 +1076,9 @@ class DatafileHandler:
10651076
# Informational: URL that documents this file format
10661077
documentation_url = None
10671078

1068-
# Default Relation between license elements detected in an `extracted_license_statement`
1079+
# Default license expression relation between the license detected in an
1080+
# `extracted_license_statement` for this data file.
1081+
# This may vary for each data file based on conventions and specifications.
10691082
default_relation_license = None
10701083

10711084
@classmethod
@@ -1494,11 +1507,44 @@ def get_top_level_resources(cls, manifest_resource, codebase):
14941507
"""
14951508
pass
14961509

1510+
# NOTE(review): in models.py this classmethod sits in the body of the
# DatafileHandler class, right before the exception class defined below.
@classmethod
def validate(cls):
    """
    Validate the configuration of this handler class.

    Check that:

    - ``datasource_id`` is not empty,
    - ``datasource_type`` is one of "app", "sys" or "info",
    - ``supported_oses`` is empty or contains only values among "linux",
      "win", "mac" and "freebsd".

    Return None when the class is valid.
    Raise an ImproperlyConfiguredDatafileHandler exception on errors.
    """
    did = cls.datasource_id
    if not did:
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {cls!r} has an empty datasource_id {did!r}.')

    DATASOURCE_TYPES = 'app', 'sys', 'info',
    dfs = cls.datasource_type
    if dfs not in DATASOURCE_TYPES:
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {did!r} : {cls!r} has an invalid '
            f'datasource_type: {dfs!r}: must be one of {DATASOURCE_TYPES!r}.'
        )

    # The macOS value is "mac": it was misspelled as "max" here, which
    # rejected valid handlers and let the typo pass validation.
    oses = 'linux', 'win', 'mac', 'freebsd',
    soses = cls.supported_oses
    # An empty or None supported_oses means that all platforms are supported.
    if soses and not all(s in oses for s in soses):
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {did!r} : {cls!r} has invalid '
            f'supported_oses: {soses!r}: must be empty or among {oses!r}'
        )


class ImproperlyConfiguredDatafileHandler(Exception):
    """ScanCode Package Datafile Handler is not properly configured"""
1542+
14971543

14981544
class NonAssemblableDatafileHandler(DatafileHandler):
14991545
"""
1500-
A handler that has no default implmentation for the assemble method, e.g.,
1501-
it will not alone trigger the creation of a top-level Pacakge.
1546+
A handler with a default implementation of an assemble method doing nothing, i.e.,
1547+
it will not alone trigger the creation of a top-level Package.
15021548
"""
15031549

15041550
@classmethod
@@ -1531,8 +1577,8 @@ def build_purl(mapping):
15311577
subpath = mapping.get('subpath')
15321578
return PackageURL(
15331579
type=ptype,
1534-
name=name,
15351580
namespace=namespace,
1581+
name=name,
15361582
version=version,
15371583
qualifiers=qualifiers,
15381584
subpath=subpath,
@@ -1769,7 +1815,7 @@ def refresh_license_expressions(self, default_relation='AND'):
17691815
self.declared_license_expression_spdx = get_declared_license_expression_spdx(
17701816
declared_license_expression=self.declared_license_expression,
17711817
)
1772-
1818+
17731819
if self.other_license_detections:
17741820
self.other_license_expression = str(combine_expressions(
17751821
expressions=[

src/packagedcode/msi.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ class MsiInstallerHandler(models.DatafileHandler):
195195
default_package_type = 'msi'
196196
description = 'Microsoft MSI installer'
197197
documentation_url = 'https://docs.microsoft.com/en-us/windows/win32/msi/windows-installer-portal'
198+
supported_oses = ('linux',)
198199

199200
@classmethod
200201
def parse(cls, location, package_only=False):

src/packagedcode/pypi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1130,7 +1130,7 @@ def parse(cls, location, package_only=False):
11301130
package_only=package_only,
11311131
)
11321132

1133-
1133+
# FIXME: this is NOT used
11341134
class PypiSdistArchiveHandler(models.DatafileHandler):
11351135
datasource_id = 'pypi_sdist'
11361136
path_patterns = ('*.tar.gz', '*.tar.bz2', '*.zip',)

src/packagedcode/rpm.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ def assemble(cls, package_data, resource, codebase, package_adder):
229229
class RpmInstalledNdbDatabaseHandler(BaseRpmInstalledDatabaseHandler):
230230
# used by recent Suse
231231
datasource_id = 'rpm_installed_database_ndb'
232+
datasource_type = 'sys'
233+
supported_oses = ('linux',)
232234
path_patterns = ('*usr/lib/sysimage/rpm/Packages.db',)
233235
default_package_type = 'rpm'
234236
default_package_namespace = 'TBD'
@@ -243,6 +245,8 @@ class RpmInstalledSqliteDatabaseHandler(BaseRpmInstalledDatabaseHandler):
243245
# Mimetype: application/vnd.sqlite3
244246

245247
datasource_id = 'rpm_installed_database_sqlite'
248+
datasource_type = 'sys'
249+
supported_oses = ('linux',)
246250
path_patterns = ('*rpm/rpmdb.sqlite',)
247251
default_package_type = 'rpm'
248252
default_package_namespace = 'TBD'
@@ -254,6 +258,8 @@ class RpmInstalledSqliteDatabaseHandler(BaseRpmInstalledDatabaseHandler):
254258
class RpmInstalledBdbDatabaseHandler(BaseRpmInstalledDatabaseHandler):
255259
# used by legacy RHEL/CentOS/Fedora/Suse
256260
datasource_id = 'rpm_installed_database_bdb'
261+
datasource_type = 'sys'
262+
supported_oses = ('linux',)
257263
path_patterns = ('*var/lib/rpm/Packages',)
258264
filetypes = ('berkeley',)
259265
default_package_type = 'rpm'
@@ -381,6 +387,7 @@ def parse(cls, location, package_only=False):
381387

382388
class RpmMarinerContainerManifestHandler(models.DatafileHandler):
383389
datasource_id = 'rpm_mariner_manifest'
390+
datasource_type = 'sys'
384391
# container-manifest-1 is more minimal and has the same data
385392
path_patterns = ('*var/lib/rpmmanifest/container-manifest-2',)
386393
default_package_type = 'rpm'
@@ -502,6 +509,7 @@ def clean_mariner_manifest_data(package_data):
502509

503510
class RpmLicenseFilesHandler(models.NonAssemblableDatafileHandler):
504511
datasource_id = 'rpm_package_licenses'
512+
datasource_type = 'sys'
505513
path_patterns = (
506514
'*usr/share/licenses/*/COPYING*',
507515
'*usr/share/licenses/*/LICENSE*',

0 commit comments

Comments
 (0)