Skip to content

Commit 79aae34

Browse files
Merge pull request #3647 from nexB/fix-debian-namespace
Update debian package manifest parsing
2 parents 9600610 + 0f25e55 commit 79aae34

28 files changed

+2949
-1986
lines changed

CHANGELOG.rst

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,27 @@ v33.0.0 (next next, roadmap)
1212
- Fallback packages for non-native dependencies of SCTK.
1313
- Dependencies for
1414
- Support for copyright detection objects.
15-
- Bump commoncode to v31.0.3
15+
16+
- A new field in packages with the license category for the
17+
detected license expression and also an API function to
18+
compute license categories from license expressions.
19+
See https://github.com/nexB/scancode-toolkit/issues/2897
20+
21+
- More support for tabular output formats: New command-line
22+
options for XLSX output, and the old `--csv` command line
23+
option is removed.
24+
See https://github.com/nexB/scancode-toolkit/issues/830
25+
26+
- `--unknown-licenses` is removed and this is always enabled
27+
and only used in case of improper detections automatically.
28+
Also tag all license rules with required phrases to improve
29+
license detection and reduce false positives.
30+
See https://github.com/nexB/scancode-toolkit/issues/3300
31+
32+
- File categorization support added, a post scan plugin tagging
33+
files with priority levels for review, and also take advantage
34+
of these in other summary plugins.
35+
See https://github.com/nexB/scancode-toolkit/issues/1745
1636

1737
v32.1.0 (next, roadmap)
1838
----------------------------
@@ -68,38 +88,23 @@ Changes in Output Data Structure:
6888
referenced_filenames, and the boolean attributes (like
6989
is_license_notice, is_license_intro etc, as applicable).
7090

71-
- A new field in packages with the license category for the
72-
detected license expression and also an API function to
73-
compute license categories from license expressions.
74-
See https://github.com/nexB/scancode-toolkit/issues/2897
75-
76-
- More support for tabular output formats: New command-line
77-
options for XSLX output, and the old `--csv` command line
78-
option is removed.
79-
See https://github.com/nexB/scancode-toolkit/issues/830
80-
81-
- `--unknown-licenses` is removed and this is always enabled
82-
and only used in case of improper detections automatically.
83-
Also tag all license rules with required phrases to improve
84-
license detection and reduce false positives.
85-
See https://github.com/nexB/scancode-toolkit/issues/3300
86-
87-
- A new `--todo` option is added to show the todo items that
88-
should be reviewed, which are ambiguous license/package
89-
detections.
90-
91-
- File categorization support added, a post scan plugin tagging
92-
files with priority levels for review, and also take advantage
93-
of these in other summary plugins.
94-
See https://github.com/nexB/scancode-toolkit/issues/1745
95-
9691
- New and updated licenses, including support for newly released
9792
SPDX license list version 3.22. This release of the SPDX license
9893
list had 48 new licenses, and several of them we already had as
9994
licenses/rules, and these have been modified to be consistent with
10095
the SPDX list. And the rest have been added as new licenses.
10196
For more details see https://github.com/nexB/scancode-toolkit/pull/3554
10297

98+
- Improve debian namespace detection based on clues and fix
99+
namespace and qualifier bugs for debian purls.
100+
For more details see https://github.com/nexB/scancode.io/issues/899
101+
and https://github.com/nexB/scancode-toolkit/issues/3443
102+
Also improve debian manifests parsing and purl parsing from
103+
filenames. Support for https://github.com/nexB/purldb/issues/245
104+
Bumps debian-inspector to v31.1.0
105+
106+
- Bump commoncode to v31.0.3
107+
103108
- Upgraded spdx-tools dependency to v0.8.
104109
See https://github.com/nexB/scancode-toolkit/issues/3455
105110

docs/source/reference/available_package_parsers.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,13 @@ parsers in scancode-toolkit during documentation builds.
255255
- ``debian_copyright_in_source``
256256
- None
257257
- https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
258+
* - Debian machine readable file standalone
259+
- ``*/copyright``
260+
``*_copyright``
261+
- ``deb``
262+
- ``debian_copyright_standalone``
263+
- None
264+
- https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
258265
* - Debian binary package archive
259266
- ``*.deb``
260267
- ``deb``

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ commoncode==31.0.3
1313
construct==2.10.68
1414
container-inspector==31.1.0
1515
cryptography==37.0.4
16-
debian-inspector==31.0.0
16+
debian-inspector==31.1.0
1717
dockerfile-parse==1.2.0
1818
dparse2==0.7.0
1919
extractcode==31.0.0

setup-mini.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ install_requires =
7272
colorama >= 0.3.9
7373
commoncode >= 31.0.2
7474
container-inspector >= 31.0.0
75-
debian-inspector >= 31.0.0
75+
debian-inspector >= 31.1.0
7676
dparse2 >= 0.7.0
7777
fasteners
7878
fingerprints >= 0.6.0

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ install_requires =
7272
colorama >= 0.3.9
7373
commoncode >= 31.0.3
7474
container-inspector >= 31.0.0
75-
debian-inspector >= 31.0.0
75+
debian-inspector >= 31.1.0
7676
dparse2 >= 0.7.0
7777
fasteners
7878
fingerprints >= 0.6.0

src/packagedcode/__init__.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,13 +212,17 @@
212212
debian_copyright.DebianCopyrightFileInPackageHandler,
213213
debian_copyright.DebianCopyrightFileInSourceHandler,
214214

215-
# TODO: consider activating? debian_copyright.StandaloneDebianCopyrightFileHandler,
216-
217215
debian.DebianDistrolessInstalledDatabaseHandler,
218216

219217
debian.DebianInstalledFilelistHandler,
220218
debian.DebianInstalledMd5sumFilelistHandler,
221219
debian.DebianInstalledStatusDatabaseHandler,
220+
debian.DebianControlFileInSourceHandler,
221+
debian.DebianDscFileHandler,
222+
debian.DebianSourcePackageTarballHandler,
223+
debian.DebianSourcePackageMetadataTarballHandler,
224+
debian.DebianDebPackageHandler,
225+
debian_copyright.StandaloneDebianCopyrightFileHandler
222226
]
223227

224228
if on_linux:

src/packagedcode/debian.py

Lines changed: 102 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import os
1111
import logging
12+
from collections import Counter
1213
from pathlib import Path
1314

1415
from commoncode import fileutils
@@ -137,6 +138,7 @@ def parse(cls, location):
137138
debian_data=get_paragraph_data_from_file(location=location),
138139
datasource_id=cls.datasource_id,
139140
package_type=cls.default_package_type,
141+
distro='debian',
140142
)
141143

142144
@classmethod
@@ -157,15 +159,19 @@ class DebianControlFileInSourceHandler(models.DatafileHandler):
157159

158160
@classmethod
159161
def parse(cls, location):
160-
# TODO: we cannot know the distro from the name only
161162
# NOTE: a control file in a source repo or debian.tar tarball can contain more than one package
163+
debian_packages = []
162164
for debian_data in get_paragraphs_data_from_file(location=location):
163-
yield build_package_data(
164-
debian_data,
165-
datasource_id=cls.datasource_id,
166-
package_type=cls.default_package_type,
165+
debian_packages.append(
166+
build_package_data(
167+
debian_data=debian_data,
168+
datasource_id=cls.datasource_id,
169+
package_type=cls.default_package_type,
170+
)
167171
)
168172

173+
yield from populate_debian_namespace(debian_packages)
174+
169175
@classmethod
170176
def assign_package_to_resources(cls, package, resource, codebase, package_adder):
171177
# two levels up
@@ -191,11 +197,19 @@ def parse(cls, location):
191197
location=location,
192198
remove_pgp_signature=True,
193199
)
194-
yield build_package_data(
200+
201+
package_data_from_file = build_package_data_from_package_filename(
202+
filename=os.path.basename(location),
203+
datasource_id=cls.datasource_id,
204+
package_type=cls.default_package_type,
205+
)
206+
package_data = build_package_data(
195207
debian_data=debian_data,
196208
datasource_id=cls.datasource_id,
197209
package_type=cls.default_package_type,
198210
)
211+
package_data.update_purl_fields(package_data=package_data_from_file)
212+
yield package_data
199213

200214
@classmethod
201215
def assign_package_to_resources(cls, package, resource, codebase, package_adder):
@@ -214,13 +228,18 @@ class DebianInstalledStatusDatabaseHandler(models.DatafileHandler):
214228
def parse(cls, location):
215229
# note that we do not know yet the distro at this stage
216230
# we could get it... but we get that later during assemble()
217-
for debian_data in get_paragraphs_data_from_file(location):
218-
yield build_package_data(
219-
debian_data,
220-
datasource_id=cls.datasource_id,
221-
package_type=cls.default_package_type,
231+
debian_packages = []
232+
for debian_data in get_paragraphs_data_from_file(location=location):
233+
debian_packages.append(
234+
build_package_data(
235+
debian_data=debian_data,
236+
datasource_id=cls.datasource_id,
237+
package_type=cls.default_package_type,
238+
)
222239
)
223240

241+
yield from populate_debian_namespace(debian_packages)
242+
224243
@classmethod
225244
def assemble(cls, package_data, resource, codebase, package_adder):
226245
# get the root resource of the rootfs
@@ -260,7 +279,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
260279

261280
# We only need to adjust the md5sum/list path in the case of `same`
262281
qualifiers = package_data.qualifiers or {}
263-
architecture = qualifiers.get('architecture')
282+
architecture = qualifiers.get('arch')
264283

265284
multi_arch = package_data.extra_data.get('multi_arch')
266285

@@ -305,6 +324,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
305324
package.update(
306325
package_data=package_data,
307326
datafile_path=res.path,
327+
check_compatible=False,
308328
replace=False,
309329
include_version=False,
310330
include_qualifiers=False,
@@ -379,14 +399,18 @@ def parse(cls, location):
379399
rootfs installation. distroless is derived from Debian but each package
380400
has its own status file.
381401
"""
382-
for debian_data in get_paragraphs_data_from_file(location):
383-
yield build_package_data(
384-
debian_data,
385-
datasource_id=cls.datasource_id,
386-
package_type=cls.default_package_type,
387-
distro='distroless',
402+
debian_packages = []
403+
for debian_data in get_paragraphs_data_from_file(location=location):
404+
debian_packages.append(
405+
build_package_data(
406+
debian_data=debian_data,
407+
datasource_id=cls.datasource_id,
408+
package_type=cls.default_package_type,
409+
)
388410
)
389411

412+
yield from populate_debian_namespace(debian_packages)
413+
390414
@classmethod
391415
def assemble(cls, package_data, resource, codebase, package_adder):
392416
# get the root resource of the rootfs
@@ -523,6 +547,9 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty
523547
"""
524548

525549
# TODO: we cannot know the distro from the name only
550+
# PURLs without a namespace are invalid, so we need to
551+
# have a default value for this
552+
distro = 'debian'
526553
deb = DebArchive.from_filename(filename=filename)
527554

528555
if deb.architecture:
@@ -538,6 +565,7 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty
538565
datasource_id=datasource_id,
539566
type=package_type,
540567
name=deb.name,
568+
namespace=distro,
541569
version=version,
542570
qualifiers=qualifiers,
543571
)
@@ -598,7 +626,7 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
598626
qualifiers = {}
599627
architecture = debian_data.get('architecture')
600628
if architecture:
601-
qualifiers['architecture'] = architecture
629+
qualifiers['arch'] = architecture
602630

603631
extra_data = {}
604632
# Multi-Arch can be: "foreign", "same", "allowed", "all", "optional" or
@@ -628,13 +656,27 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
628656
if keyword:
629657
keywords.append(keyword)
630658

659+
# Get distro/namespace information from clues in package data
660+
if not distro:
661+
if version:
662+
for clue, namespace in version_clues_for_namespace.items():
663+
if clue in version:
664+
distro = namespace
665+
break
666+
667+
if maintainer:
668+
for clue, namespace in maintainer_clues_for_namespace.items():
669+
if clue in maintainer:
670+
distro = namespace
671+
break
672+
631673
source_packages = []
632674
source = debian_data.get('source')
633675
if source:
634676
source_pkg_purl = PackageURL(
635677
type=package_type,
636678
name=source,
637-
namespace=distro
679+
namespace=distro,
638680
).to_string()
639681

640682
source_packages.append(source_pkg_purl)
@@ -656,6 +698,46 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No
656698
)
657699

658700

701+
def populate_debian_namespace(packages):
    """
    Yield each package from the `packages` iterable of debian packages,
    after filling in the namespace of packages that have none.

    The namespace used for filling is the most frequently occurring
    non-empty namespace among `packages`. On a tie, the namespace seen
    first wins. If no package has a namespace, the default namespace
    'debian' is used. Packages that already have a namespace are yielded
    unchanged. Packages are updated in place and yielded in input order.
    """
    # Materialize once: the input is traversed twice (count, then yield),
    # which would silently yield nothing for a generator or iterator.
    packages = list(packages)

    # Only count real namespaces. Counting missing (None or empty) ones
    # would let "no namespace" win the vote whenever most packages have
    # no clue, discarding a namespace detected on the remaining packages.
    namespace_counts = Counter(
        package.namespace
        for package in packages
        if package.namespace
    )

    if namespace_counts:
        distro = max(namespace_counts, key=namespace_counts.get)
    else:
        distro = 'debian'

    for package in packages:
        if not package.namespace:
            package.namespace = distro
        yield package
722+
723+
724+
version_clues_for_namespace = {
725+
'deb': 'debian',
726+
'ubuntu': 'ubuntu',
727+
}
728+
729+
730+
maintainer_clues_for_namespace = {
731+
'packages.debian.org': 'debian',
732+
'lists.debian.org': 'debian',
733+
'lists.alioth.debian.org': 'debian',
734+
'@debian.org': 'debian',
735+
'debian-init-diversity@': 'debian',
736+
'lists.ubuntu.com': 'ubuntu',
737+
'@canonical.com': 'ubuntu',
738+
}
739+
740+
659741
ignored_root_dirs = {
660742
'/.',
661743
'/bin',

0 commit comments

Comments
 (0)