Skip to content

Commit bfd88b6

Browse files
Merge pull request #3689 from nexB/purls-only-v2
Add a faster package scan with `--package-only`
2 parents 95f7e55 + 578d289 commit bfd88b6

File tree

73 files changed

+4453
-314
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+4453
-314
lines changed

CHANGELOG.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ v33.0.0 (next next, roadmap)
3737
v32.1.0 (next, roadmap)
3838
----------------------------
3939

40+
New CLI options:
41+
42+
- A new CLI option ``--package-only`` has been added which performs
43+
a faster package scan by skipping the package assembly step and
44+
also skipping license/copyright detection on package metadata.
45+
4046
Major API/other changes:
4147

4248
- Output Format Version updated to 3.1.0 (minor version bump)

docs/source/rst_snippets/basic_options.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ documenting a program's options. For example:
3333
--system-package Scan ``<input>`` for installed system package
3434
databases.
3535

36+
--package-only Scan ``<input>`` for system and application packages,
37+
collecting only package metadata, without license/
38+
copyright detection and package assembly.
39+
3640
-e, --email Scan ``<input>`` for emails.
3741

3842
Sub-Options:

src/packagedcode/about.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class AboutFileHandler(models.DatafileHandler):
4747
documentation_url = 'https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html'
4848

4949
@classmethod
50-
def parse(cls, location):
50+
def parse(cls, location, package_only=False):
5151
"""
5252
Yield one or more Package manifest objects given a file ``location`` pointing to a
5353
package archive, manifest or similar.
@@ -90,7 +90,7 @@ def parse(cls, location):
9090
file_references.append(models.FileReference(path=about_resource))
9191

9292
# FIXME: we should put the unprocessed attributes in extra data
93-
yield models.PackageData(
93+
package_data = dict(
9494
datasource_id=cls.datasource_id,
9595
type=package_type,
9696
namespace=package_ns,
@@ -103,6 +103,7 @@ def parse(cls, location):
103103
download_url=download_url,
104104
file_references=file_references,
105105
)
106+
yield models.PackageData.from_data(package_data, package_only)
106107

107108
@classmethod
108109
def assemble(cls, package_data, resource, codebase, package_adder):

src/packagedcode/alpine.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,12 @@ class AlpineInstalledDatabaseHandler(models.DatafileHandler):
6363
description = 'Alpine Linux installed package database'
6464

6565
@classmethod
66-
def parse(cls, location):
66+
def parse(cls, location, package_only=False):
6767
yield from parse_alpine_installed_db(
6868
location=location,
6969
datasource_id=cls.datasource_id,
7070
package_type=cls.default_package_type,
71+
package_only=package_only,
7172
)
7273

7374
@classmethod
@@ -134,9 +135,14 @@ class AlpineApkbuildHandler(models.DatafileHandler):
134135
documentation_url = 'https://wiki.alpinelinux.org/wiki/APKBUILD_Reference'
135136

136137
@classmethod
137-
def parse(cls, location):
138-
package_data = parse_apkbuild(location, strict=True)
139-
cls.populate_license_fields(package_data)
138+
def parse(cls, location, package_only=False):
139+
package_data = parse_apkbuild(
140+
location=location,
141+
strict=True,
142+
package_only=package_only
143+
)
144+
if not package_only:
145+
cls.populate_license_fields(package_data)
140146
if package_data:
141147
yield package_data
142148

@@ -165,7 +171,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder)
165171
)
166172

167173

168-
def parse_alpine_installed_db(location, datasource_id, package_type):
174+
def parse_alpine_installed_db(location, datasource_id, package_type, package_only=False):
169175
"""
170176
Yield PackageData objects from an installed database file at `location`
171177
or None. Typically found at '/lib/apk/db/installed' in an Alpine
@@ -179,6 +185,7 @@ def parse_alpine_installed_db(location, datasource_id, package_type):
179185
package_fields=package_fields,
180186
datasource_id=datasource_id,
181187
package_type=package_type,
188+
package_only=package_only,
182189
)
183190

184191

@@ -241,7 +248,7 @@ def get_alpine_installed_db_fields(location):
241248
])
242249

243250

244-
def parse_apkbuild(location, strict=False):
251+
def parse_apkbuild(location, strict=False, package_only=False):
245252
"""
246253
Return a PackageData object from an APKBUILD file at ``location`` or None.
247254
@@ -256,6 +263,7 @@ def parse_apkbuild(location, strict=False):
256263
datasource_id=AlpineApkbuildHandler.datasource_id,
257264
package_type=AlpineApkbuildHandler.default_package_type,
258265
strict=strict,
266+
package_only=package_only,
259267
)
260268

261269

@@ -732,7 +740,7 @@ def fix_apkbuild(text):
732740
return text
733741

734742

735-
def parse_apkbuild_text(text, datasource_id, package_type, strict=False):
743+
def parse_apkbuild_text(text, datasource_id, package_type, strict=False, package_only=False):
736744
"""
737745
Return a PackageData object from an APKBUILD text context or None. Only
738746
consider variables with a name listed in the ``names`` set.
@@ -761,7 +769,8 @@ def parse_apkbuild_text(text, datasource_id, package_type, strict=False):
761769
package = build_package_data(
762770
variables,
763771
datasource_id=datasource_id,
764-
package_type=package_type
772+
package_type=package_type,
773+
package_only=package_only,
765774
)
766775

767776
if package and unresolved:
@@ -800,7 +809,7 @@ def parse_pkginfo(location):
800809
raise NotImplementedError
801810

802811

803-
def build_package_data(package_fields, datasource_id, package_type):
812+
def build_package_data(package_fields, datasource_id, package_type, package_only=False):
804813
"""
805814
Return a PackageData object from a ``package_fields`` iterable of (name,
806815
value) tuples.
@@ -850,7 +859,16 @@ def build_package_data(package_fields, datasource_id, package_type):
850859

851860
converted_fields.update(converted)
852861

853-
return models.PackageData.from_dict(converted_fields)
862+
fields_not_required = ["current_file", "current_dir"]
863+
for field in fields_not_required:
864+
value = converted_fields.get(field)
865+
if value:
866+
converted_fields.pop(field)
867+
868+
return models.PackageData.from_data(
869+
package_data=converted_fields,
870+
package_only=package_only,
871+
)
854872

855873
#####################################
856874
# Note: all handlers MUST accept **kwargs as they also receive the current data

src/packagedcode/bower.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class BowerJsonHandler(models.DatafileHandler):
2525
documentation_url = 'https://bower.io'
2626

2727
@classmethod
28-
def parse(cls, location):
28+
def parse(cls, location, package_only=False):
2929
with io.open(location, encoding='utf-8') as loc:
3030
package_data = json.load(loc)
3131

@@ -87,7 +87,7 @@ def parse(cls, location):
8787
)
8888
)
8989

90-
yield models.PackageData(
90+
package_data = dict(
9191
datasource_id=cls.datasource_id,
9292
type=cls.default_package_type,
9393
name=name,
@@ -98,5 +98,6 @@ def parse(cls, location):
9898
parties=parties,
9999
homepage_url=homepage_url,
100100
vcs_url=vcs_url,
101-
dependencies=dependencies
101+
dependencies=dependencies,
102102
)
103+
yield models.PackageData.from_data(package_data, package_only)

src/packagedcode/build.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ class AutotoolsConfigureHandler(models.NonAssemblableDatafileHandler):
5555
documentation_url = 'https://www.gnu.org/software/automake/'
5656

5757
@classmethod
58-
def parse(cls, location):
58+
def parse(cls, location, package_only=False):
5959
# we use the parent directory as a package name
6060
name = fileutils.file_name(fileutils.parent_directory(location))
6161
# we could use checksums as version in the future
@@ -67,12 +67,13 @@ def parse(cls, location):
6767
# there are dependencies we could use
6868
# dependencies = []
6969

70-
yield models.PackageData(
70+
package_data = dict(
7171
datasource_id=cls.datasource_id,
7272
type=cls.default_package_type,
7373
name=name,
7474
version=version,
7575
)
76+
yield models.PackageData.from_data(package_data, package_only)
7677

7778

7879

@@ -104,6 +105,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
104105
package = models.Package.from_package_data(
105106
package_data=package_data,
106107
datafile_path=resource.path,
108+
package_only=True,
107109
)
108110

109111
if TRACE:
@@ -135,8 +137,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
135137
yield resource
136138

137139
@classmethod
138-
def parse(cls, location):
139-
140+
def parse(cls, location, package_only=False):
140141
# Thanks to Starlark being a Python dialect, we can use `ast` to parse it
141142
with open(location, 'rb') as f:
142143
tree = ast.parse(f.read())
@@ -188,23 +189,28 @@ def parse(cls, location):
188189
if TRACE:
189190
logger_debug(f"build: parse: license_files: {license_files}")
190191

191-
package_data = models.PackageData(
192+
package_data = dict(
192193
datasource_id=cls.datasource_id,
193194
type=cls.default_package_type,
194195
name=name,
196+
extracted_license_statement=license_files,
197+
)
198+
# Always pass `package_only=True` here, regardless of the caller's
199+
# value: license detection for this handler is done on assembly.
200+
yield models.PackageData.from_data(
201+
package_data=package_data,
202+
package_only=True,
195203
)
196-
197-
package_data.extracted_license_statement = license_files
198-
yield package_data
199204

200205
else:
201206
# If we don't find anything in the pkgdata file, we yield a Package
202207
# with the parent directory as the name
203-
yield models.PackageData(
208+
package_data = dict(
204209
datasource_id=cls.datasource_id,
205210
type=cls.default_package_type,
206211
name=fileutils.file_name(fileutils.parent_directory(location))
207212
)
213+
yield models.PackageData.from_data(package_data, package_only)
208214

209215
@classmethod
210216
def assign_package_to_resources(cls, package, resource, codebase, package_adder, skip_name=None):
@@ -326,7 +332,7 @@ class BuckMetadataBzlHandler(BaseStarlarkManifestHandler):
326332
documentation_url = 'https://buck.build/'
327333

328334
@classmethod
329-
def parse(cls, location):
335+
def parse(cls, location, package_only=True):
330336

331337
with open(location, 'rb') as f:
332338
tree = ast.parse(f.read())
@@ -378,7 +384,7 @@ def parse(cls, location):
378384
):
379385
# TODO: Create function that determines package type from download URL,
380386
# then create a package of that package type from the metadata info
381-
yield models.PackageData(
387+
package_data = dict(
382388
datasource_id=cls.datasource_id,
383389
type=metadata_fields.get('upstream_type', cls.default_package_type),
384390
name=metadata_fields.get('name'),
@@ -388,6 +394,7 @@ def parse(cls, location):
388394
homepage_url=metadata_fields.get('upstream_address', ''),
389395
# TODO: Store 'upstream_hash` somewhere
390396
)
397+
yield models.PackageData.from_data(package_data, package_only=True)
391398

392399
if (
393400
'package_type'
@@ -401,7 +408,7 @@ def parse(cls, location):
401408
and 'vcs_commit_hash'
402409
in metadata_fields
403410
):
404-
yield models.PackageData(
411+
package_data = dict(
405412
datasource_id=cls.datasource_id,
406413
type=metadata_fields.get('package_type', cls.default_package_type),
407414
name=metadata_fields.get('name'),
@@ -414,6 +421,7 @@ def parse(cls, location):
414421
sha1=metadata_fields.get('download_archive_sha1', ''),
415422
extra_data=dict(vcs_commit_hash=metadata_fields.get('vcs_commit_hash', ''))
416423
)
424+
yield models.PackageData.from_data(package_data, package_only=True)
417425

418426
@classmethod
419427
def assign_package_to_resources(cls, package, resource, codebase, package_adder):

src/packagedcode/build_gradle.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ class BuildGradleHandler(models.DatafileHandler):
5959
description = 'Gradle build script'
6060

6161
@classmethod
62-
def parse(cls, location):
62+
def parse(cls, location, package_only=False):
6363
dependencies = get_dependencies(location)
64-
return build_package(cls, dependencies)
64+
return build_package(cls, dependencies, package_only)
6565

6666
# TODO: handle complex cases of nested builds with many packages
6767
@classmethod
@@ -328,7 +328,7 @@ def get_dependencies(build_gradle_location):
328328
return list(get_dependencies_from_parse_tree(parse_tree))
329329

330330

331-
def build_package(cls, dependencies):
331+
def build_package(cls, dependencies, package_only=False):
332332
"""
333333
Yield PackageData from a ``dependencies`` list of mappings.
334334
"""
@@ -364,10 +364,11 @@ def build_package(cls, dependencies):
364364
)
365365
)
366366

367-
yield models.PackageData(
367+
package_data = dict(
368368
datasource_id=cls.datasource_id,
369369
type=cls.default_package_type,
370370
primary_language=BuildGradleHandler.default_primary_language,
371371
dependencies=package_dependencies,
372372
)
373+
yield models.PackageData.from_data(package_data, package_only)
373374

src/packagedcode/cargo.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ class CargoTomlHandler(CargoBaseHandler):
104104
documentation_url = 'https://doc.rust-lang.org/cargo/reference/manifest.html'
105105

106106
@classmethod
107-
def parse(cls, location):
107+
def parse(cls, location, package_only=False):
108108
package_data = toml.load(location, _dict=dict)
109109
core_package_data = package_data.get('package', {})
110110
workspace = package_data.get('workspace', {})
@@ -149,7 +149,7 @@ def parse(cls, location):
149149
if workspace:
150150
extra_data["workspace"] = workspace
151151

152-
yield models.PackageData(
152+
package_data = dict(
153153
datasource_id=cls.datasource_id,
154154
type=cls.default_package_type,
155155
name=name,
@@ -166,6 +166,7 @@ def parse(cls, location):
166166
dependencies=dependencies,
167167
extra_data=extra_data,
168168
)
169+
yield models.PackageData.from_data(package_data, package_only)
169170

170171

171172
CARGO_ATTRIBUTE_MAPPING = {
@@ -200,7 +201,7 @@ class CargoLockHandler(CargoBaseHandler):
200201
# ]
201202

202203
@classmethod
203-
def parse(cls, location):
204+
def parse(cls, location, package_only=False):
204205
cargo_lock = toml.load(location, _dict=dict)
205206
dependencies = []
206207
package = cargo_lock.get('package', [])
@@ -221,12 +222,13 @@ def parse(cls, location):
221222
)
222223
)
223224

224-
yield models.PackageData(
225+
package_data = dict(
225226
datasource_id=cls.datasource_id,
226227
type=cls.default_package_type,
227228
primary_language=cls.default_primary_language,
228229
dependencies=dependencies,
229230
)
231+
yield models.PackageData.from_data(package_data, package_only)
230232

231233

232234
def dependency_mapper(dependencies, scope='dependencies'):

0 commit comments

Comments
 (0)