Skip to content

Commit 2b09be1

Browse files
Merge pull request #3869 from nexB/update-pypi-package-detection
Fix python package detection issues #3859
2 parents 1dc4827 + d78f193 commit 2b09be1

23 files changed

+3038
-950
lines changed

src/packagedcode/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1189,7 +1189,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder=
11891189
starting ``resource`` in the ``codebase``.
11901190
11911191
This default implementation assigns the package to the whole
1192-
``resource`` tree. Since ``resource`` is a file y default, this means
1192+
``resource`` tree. Since ``resource`` is a file by default, this means
11931193
that only the datafile ``resource`` is assigned to the ``package`` by
11941194
default.
11951195

src/packagedcode/pypi.py

Lines changed: 77 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import tempfile
1919
import zipfile
2020
from configparser import ConfigParser
21+
from fnmatch import fnmatchcase
2122
from pathlib import Path
2223
from typing import NamedTuple
2324

@@ -83,6 +84,10 @@ class PythonEggPkgInfoFile(models.DatafileHandler):
8384

8485
@classmethod
8586
def parse(cls, location, package_only=False):
87+
"""
88+
Parse package data from a PKG-INFO file and other manifests present in
89+
neighboring files as needed when an installed layout is found.
90+
"""
8691
yield parse_metadata(
8792
location=location,
8893
datasource_id=cls.datasource_id,
@@ -108,6 +113,10 @@ class PythonEditableInstallationPkgInfoFile(models.DatafileHandler):
108113

109114
@classmethod
110115
def parse(cls, location, package_only=False):
116+
"""
117+
Parse package data from a PKG-INFO file and other manifests present in
118+
neighboring files as needed when an installed layout is found.
119+
"""
111120
yield parse_metadata(
112121
location=location,
113122
datasource_id=cls.datasource_id,
@@ -150,12 +159,11 @@ class BaseExtractedPythonLayout(models.DatafileHandler):
150159
def assemble(cls, package_data, resource, codebase, package_adder):
151160
# a source distribution can have many manifests
152161
datafile_name_patterns = (
153-
'Pipfile.lock',
154-
'Pipfile',
155-
) + PipRequirementsFileHandler.path_patterns + PyprojectTomlHandler.path_patterns
162+
PipfileHandler.path_patterns + PipfileLockHandler.path_patterns
163+
+ PipRequirementsFileHandler.path_patterns + PyprojectTomlHandler.path_patterns
164+
)
156165

157-
# TODO: we want PKG-INFO first, then (setup.py, setup.cfg), then pyproject.toml for poetry
158-
# then we have the rest of the lock files (pipfile, pipfile.lock, etc.)
166+
is_datafile_pypi = any(fnmatchcase(resource.path, pat) for pat in datafile_name_patterns)
159167

160168
package_resource = None
161169
if resource.name == 'PKG-INFO':
@@ -186,18 +194,21 @@ def assemble(cls, package_data, resource, codebase, package_adder):
186194
continue
187195
package_resource = child
188196
break
189-
elif resource.name in datafile_name_patterns:
197+
198+
elif is_datafile_pypi:
190199
if resource.has_parent():
191200
siblings = resource.siblings(codebase)
192-
package_resource = [r for r in siblings if r.name == 'PKG-INFO']
201+
package_resources = [r for r in siblings if r.name == 'PKG-INFO']
193202
if package_resource:
194-
package_resource = package_resource[0]
203+
package_resource = package_resources[0]
195204

196205
package = None
197206
if package_resource:
198207
pkg_data = package_resource.package_data[0]
199208
pkg_data = models.PackageData.from_dict(pkg_data)
200209
if pkg_data.purl:
210+
# We yield only the package and the resource, and not dependencies because
211+
# PKG-INFO also has the dependencies from its sibling manifests, which are yielded from those siblings below.
201212
package = create_package_from_package_data(
202213
package_data=pkg_data,
203214
datafile_path=package_resource.path
@@ -207,11 +218,6 @@ def assemble(cls, package_data, resource, codebase, package_adder):
207218
package_adder(package.package_uid, package_resource, codebase)
208219
yield package_resource
209220

210-
yield from yield_dependencies_from_package_data(
211-
package_data=pkg_data,
212-
datafile_path=package_resource.path,
213-
package_uid=package.package_uid
214-
)
215221
else:
216222
setup_resources = []
217223
if resource.has_parent():
@@ -221,31 +227,50 @@ def assemble(cls, package_data, resource, codebase, package_adder):
221227
if r.name in ('setup.py', 'setup.cfg')
222228
and r.package_data
223229
]
224-
225-
setup_package_data = [
226-
(setup_resource, models.PackageData.from_dict(setup_resource.package_data[0]))
227-
for setup_resource in setup_resources
228-
]
229-
setup_package_data = sorted(setup_package_data, key=lambda s: bool(s[1].purl), reverse=True)
230-
for setup_resource, setup_pkg_data in setup_package_data:
231-
if setup_pkg_data.purl:
232-
if not package:
233-
package = create_package_from_package_data(
230+
if setup_resources:
231+
setup_package_data = [
232+
(setup_resource, models.PackageData.from_dict(setup_resource.package_data[0]))
233+
for setup_resource in setup_resources
234+
]
235+
setup_package_data = sorted(setup_package_data, key=lambda s: bool(s[1].purl), reverse=True)
236+
for setup_resource, setup_pkg_data in setup_package_data:
237+
if setup_pkg_data.purl:
238+
if not package:
239+
package = create_package_from_package_data(
240+
package_data=setup_pkg_data,
241+
datafile_path=setup_resource.path,
242+
)
243+
yield package
244+
package_resource = setup_resource
245+
else:
246+
package.update(setup_pkg_data, setup_resource.path)
247+
if package:
248+
for setup_resource, setup_pkg_data in setup_package_data:
249+
package_adder(package.package_uid, setup_resource, codebase)
250+
yield setup_resource
251+
252+
yield from yield_dependencies_from_package_data(
234253
package_data=setup_pkg_data,
235254
datafile_path=setup_resource.path,
255+
package_uid=package.package_uid
236256
)
237-
yield package
238-
package_resource = setup_resource
239-
else:
240-
package.update(setup_pkg_data, setup_resource.path)
241-
if package:
242-
for setup_resource, setup_pkg_data in setup_package_data:
243-
package_adder(package.package_uid, setup_resource, codebase)
244-
yield setup_resource
257+
else:
258+
package_resource = resource
259+
pkg_data = package_resource.package_data[0]
260+
pkg_data = models.PackageData.from_dict(pkg_data)
261+
if pkg_data.purl:
262+
package = create_package_from_package_data(
263+
package_data=pkg_data,
264+
datafile_path=package_resource.path
265+
)
266+
yield package
267+
268+
package_adder(package.package_uid, package_resource, codebase)
269+
yield package_resource
245270

246271
yield from yield_dependencies_from_package_data(
247-
package_data=setup_pkg_data,
248-
datafile_path=setup_resource.path,
272+
package_data=pkg_data,
273+
datafile_path=package_resource.path,
249274
package_uid=package.package_uid
250275
)
251276

@@ -275,12 +300,20 @@ def assemble(cls, package_data, resource, codebase, package_adder):
275300
else:
276301
package_uid = None
277302

303+
# Yield dependencies from sibling manifests
278304
if package_resource:
279305
for sibling in package_resource.siblings(codebase):
280-
if sibling and sibling.name in datafile_name_patterns:
306+
if not sibling:
307+
continue
308+
309+
is_sibling_pypi_manifest = any(
310+
fnmatchcase(sibling.path, pat)
311+
for pat in datafile_name_patterns
312+
)
313+
if is_sibling_pypi_manifest:
281314
yield from yield_dependencies_from_package_resource(
282315
resource=sibling,
283-
package_uid=package_uid
316+
package_uid=package_uid,
284317
)
285318

286319
if package_uid and package_uid not in sibling.for_packages:
@@ -981,6 +1014,10 @@ def parse_metadata(location, datasource_id, package_type, package_only=False):
9811014
if license_file:
9821015
extra_data['license_file'] = license_file
9831016

1017+
# FIXME: We are getting dependencies from other sibling files. This is duplicated
1018+
# data at the package_data level: is this necessary? We also have the entire set of dependency
1019+
# relationships in the requires.txt present in ``.egg-info``: should we store these
1020+
# nicely?
9841021
dependencies = get_dist_dependencies(dist)
9851022
file_references = list(get_file_references(dist))
9861023

@@ -1240,6 +1277,8 @@ def parse(cls, location, package_only=False):
12401277
with open(location) as f:
12411278
parser.read_file(f)
12421279

1280+
extra_data = {}
1281+
12431282
for section in parser.values():
12441283
if section.name == 'options':
12451284
scope_by_sub_section = {
@@ -1255,22 +1294,10 @@ def parse(cls, location, package_only=False):
12551294
reqs = list(get_requirement_from_section(section=section, sub_section=sub_section))
12561295
dependent_packages.extend(cls.parse_reqs(reqs, scope))
12571296
continue
1297+
1298+
# This is not a dependency, merely a required python version
12581299
python_requires_specifier = section[sub_section]
1259-
purl = PackageURL(
1260-
type="generic",
1261-
name="python",
1262-
)
1263-
resolved_purl = get_resolved_purl(purl=purl, specifiers=SpecifierSet(python_requires_specifier))
1264-
dependent_packages.append(
1265-
models.DependentPackage(
1266-
purl=str(resolved_purl.purl),
1267-
scope=scope,
1268-
is_runtime=True,
1269-
is_optional=False,
1270-
is_resolved=resolved_purl.is_resolved,
1271-
extracted_requirement=f"python_requires{python_requires_specifier}",
1272-
)
1273-
)
1300+
extra_data["python_requires"] = python_requires_specifier
12741301

12751302
if section.name == "options.extras_require":
12761303
for sub_section in section:

0 commit comments

Comments
 (0)