Skip to content

Commit 2663ec0

Browse files
authored
Use the collect endpoint for the enrich_with_purldb pipeline (#1336)
Signed-off-by: tdruez <tdruez@nexb.com>
1 parent 6b91509 commit 2663ec0

File tree

8 files changed

+149
-45
lines changed

8 files changed

+149
-45
lines changed

scanpipe/models.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,6 +1231,7 @@ def add_message(
12311231
details=None,
12321232
exception=None,
12331233
resource=None,
1234+
package=None,
12341235
):
12351236
"""
12361237
Create a ProjectMessage record for this Project.
@@ -1252,9 +1253,15 @@ def add_message(
12521253
description = str(exception)
12531254

12541255
details = details or {}
1256+
1257+
# Do not change the following field names as those have special behavior in
1258+
# templates.
12551259
if resource:
1256-
# Do not change this field name as it has special behavior in templates.
12571260
details["resource_path"] = resource.path
1261+
if package:
1262+
details.update(
1263+
{"package_url": package.package_url, "package_uuid": package.uuid}
1264+
)
12581265

12591266
return ProjectMessage.objects.create(
12601267
project=self,
@@ -1272,11 +1279,12 @@ def add_info(
12721279
details=None,
12731280
exception=None,
12741281
resource=None,
1282+
package=None,
12751283
):
12761284
"""Create an INFO ProjectMessage record for this project."""
12771285
severity = ProjectMessage.Severity.INFO
12781286
return self.add_message(
1279-
severity, description, model, details, exception, resource
1287+
severity, description, model, details, exception, resource, package
12801288
)
12811289

12821290
def add_warning(
@@ -1286,11 +1294,12 @@ def add_warning(
12861294
details=None,
12871295
exception=None,
12881296
resource=None,
1297+
package=None,
12891298
):
12901299
"""Create a WARNING ProjectMessage record for this project."""
12911300
severity = ProjectMessage.Severity.WARNING
12921301
return self.add_message(
1293-
severity, description, model, details, exception, resource
1302+
severity, description, model, details, exception, resource, package
12941303
)
12951304

12961305
def add_error(
@@ -1300,11 +1309,12 @@ def add_error(
13001309
details=None,
13011310
exception=None,
13021311
resource=None,
1312+
package=None,
13031313
):
13041314
"""Create an ERROR ProjectMessage record using for this project."""
13051315
severity = ProjectMessage.Severity.ERROR
13061316
return self.add_message(
1307-
severity, description, model, details, exception, resource
1317+
severity, description, model, details, exception, resource, package
13081318
)
13091319

13101320
def get_absolute_url(self):

scanpipe/pipes/purldb.py

Lines changed: 91 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,27 @@ class PurlDBException(Exception):
6969
# This key can be used for filtering
7070
ENRICH_EXTRA_DATA_KEY = "enrich_with_purldb"
7171

72+
# Subset of fields kept when multiple entries are found in the PurlDB.
73+
CROSS_VERSION_COMMON_FIELDS = [
74+
"primary_language",
75+
"description",
76+
"parties",
77+
"keywords",
78+
"homepage_url",
79+
"bug_tracking_url",
80+
"code_view_url",
81+
"vcs_url",
82+
"repository_homepage_url",
83+
"copyright",
84+
"holder",
85+
"declared_license_expression",
86+
"declared_license_expression_spdx",
87+
"other_license_expression",
88+
"other_license_expression_spdx",
89+
"extracted_license_statement",
90+
"notice_text",
91+
]
92+
7293

7394
def is_configured():
7495
"""Return True if the required PurlDB settings have been set."""
@@ -101,7 +122,7 @@ def check_service_availability(*args):
101122
raise Exception(f"{label} is not available.")
102123

103124

104-
def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT):
125+
def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT, raise_on_error=False):
105126
"""Wrap the HTTP request calls on the API."""
106127
if not url:
107128
return
@@ -112,13 +133,17 @@ def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT):
112133
if payload:
113134
params.update(payload)
114135

115-
logger.debug(f"{label}: url={url} params={params}")
136+
logger.debug(f"[{label}] Requesting URL: {url} with params: {params}")
116137
try:
117138
response = session.get(url, params=params, timeout=timeout)
118139
response.raise_for_status()
119140
return response.json()
120-
except (requests.RequestException, ValueError, TypeError) as exception:
121-
logger.debug(f"{label} [Exception] {exception}")
141+
except requests.RequestException: # raise_for_status
142+
return
143+
except (ValueError, TypeError) as exception:
144+
logger.debug(f"[{label}] Request to {url} failed with exception: {exception}")
145+
if raise_on_error:
146+
raise PurlDBException(exception)
122147

123148

124149
def request_post(url, data=None, headers=None, files=None, timeout=DEFAULT_TIMEOUT):
@@ -353,12 +378,6 @@ def populate_purldb_with_discovered_dependencies(project, logger=logger.info):
353378
)
354379

355380

356-
def get_package_by_purl(package_url):
357-
"""Get a Package details entry providing its `package_url`."""
358-
if results := find_packages({"purl": str(package_url)}):
359-
return results[0]
360-
361-
362381
def find_packages(payload):
363382
"""Get Packages using provided `payload` filters on the PurlDB package list."""
364383
package_api_url = f"{PURLDB_API_URL}packages/"
@@ -367,6 +386,31 @@ def find_packages(payload):
367386
return response.get("results")
368387

369388

389+
def get_packages_for_purl(package_url):
    """
    Return the PurlDB package list results matching the provided `package_url`.

    The lookup is delegated to `find_packages` using the string form of the
    purl and a "-version" sort filter.
    """
    filters = dict(purl=str(package_url), sort="-version")
    return find_packages(filters)
396+
397+
398+
def collect_data_for_purl(package_url, raise_on_error=False):
    """
    Return the data from the PurlDB "collect/" endpoint for `package_url`.

    The `raise_on_error` flag is forwarded unchanged to `request_get`.
    Return None when the response is empty or missing.
    """
    response = request_get(
        url=f"{PURLDB_API_URL}collect/",
        payload={"purl": str(package_url), "sort": "-version"},
        raise_on_error=raise_on_error,
    )
    # Any falsy response (None, empty list, empty dict) is reported as None.
    return response or None
412+
413+
370414
def get_next_download_url(timeout=DEFAULT_TIMEOUT, api_url=PURLDB_API_URL):
371415
"""
372416
Return the ScannableURI UUID, download URL, and pipelines for the next
@@ -464,12 +508,43 @@ def get_run_status(run, **kwargs):
464508

465509
def enrich_package(package):
    """
    Enrich the provided ``package`` with the PurlDB data.

    The entries are fetched with ``collect_data_for_purl``. A ``PurlDBException``
    raised by that call is recorded as an error on the package's project and
    nothing is updated.

    With a single entry, all of its data except ``package_uid`` is used.
    With multiple entries, a warning is recorded on the project and only the
    ``CROSS_VERSION_COMMON_FIELDS`` of the first entry are used.

    Return the value of ``package.update_from_data`` when it is truthy,
    None otherwise.
    """
    package_url = package.package_url
    project = package.project

    try:
        purldb_entries = collect_data_for_purl(package_url, raise_on_error=True)
    except PurlDBException as exception:
        project.add_error(model="PurlDB", exception=exception, package=package)
        return None

    if not purldb_entries:
        return None

    if len(purldb_entries) == 1:
        # Single match, all the PurlDB data are used to enrich the package,
        # except the package_uid as it is not relevant to capture the value from
        # PurlDB. A filtered copy is built instead of popping the key, so the
        # entry returned by ``collect_data_for_purl`` is left unmodified.
        purldb_entry = {
            field: value
            for field, value in purldb_entries[0].items()
            if field != "package_uid"
        }
    else:
        project.add_warning(
            model="PurlDB",
            description=(
                f'Multiple entries found in the PurlDB for "{package_url}". '
                "Using data from the most recent version."
            ),
            package=package,
        )
        # Do not set version-specific fields, such as the download_url.
        # The package_uid is not part of the common fields, so it is excluded
        # here as well.
        purldb_entry = {
            field: value
            for field, value in purldb_entries[0].items()
            if field in CROSS_VERSION_COMMON_FIELDS
        }

    package_data = _clean_package_data(purldb_entry)
    updated_fields = package.update_from_data(package_data)
    if not updated_fields:
        return None

    # Keep track of the fields that were set from the PurlDB data.
    package.update_extra_data({ENRICH_EXTRA_DATA_KEY: updated_fields})
    return updated_fields
473548

474549

475550
def enrich_discovered_packages(project, logger=logger.info):

scanpipe/templates/scanpipe/message_list.html

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@
4646
</a>
4747
</div>
4848
{% endif %}
49+
{% if message.details.package_uuid %}
50+
<div>
51+
<strong>Package</strong>:
52+
<a href="{% url 'package_detail' project.slug message.details.package_uuid %}" target="_blank">
53+
{{ message.details.package_url|default_if_none:message.details.package_uuid }}
54+
</a>
55+
</div>
56+
{% endif %}
4957
{% for key, value in message.details.items %}
5058
<strong>{{ key }}</strong>: {{ value }}<br>
5159
{% endfor %}

scanpipe/templates/scanpipe/tabset/tab_purldb_content.html

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@
44
You are looking at the details for this software package as defined
55
in the PurlDB which was scanned automatically from a public source.
66
</div>
7+
{% if has_multiple_purldb_entries %}
8+
<div class="notification is-warning is-light has-text-weight-semibold p-3 mb-4">
9+
<i class="fa-solid fa-warning mr-1"></i>
10+
Multiple packages were found in the PurlDB for "{{ object.package_url }}".
11+
The data below corresponds to the most recent version of this package.
12+
</div>
13+
{% endif %}
714
{% include 'scanpipe/tabset/tab_default.html' %}
815
{% else %}
916
<div class="notification is-warning is-light has-text-weight-semibold p-3 mb-4">

scanpipe/tests/pipes/test_purldb.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -237,17 +237,17 @@ def test_scanpipe_pipes_purldb_create_project_name(self):
237237
project_name = purldb.create_project_name(download_url, scannable_uri_uuid)
238238
self.assertEqual("httpsregistrynpmjsorgasdf-asdf-101tgz-52b2930d", project_name)
239239

240-
@mock.patch("scanpipe.pipes.purldb.get_package_by_purl")
241-
def test_scanpipe_pipes_purldb_enrich_package(self, mock_get_package_by_purl):
240+
@mock.patch("scanpipe.pipes.purldb.collect_data_for_purl")
241+
def test_scanpipe_pipes_purldb_enrich_package(self, mock_collect_data):
242242
package1 = make_package(self.project1, package_url="pkg:npm/csvtojson@2.0.10")
243243

244-
mock_get_package_by_purl.return_value = {}
244+
mock_collect_data.return_value = []
245245
updated_fields = purldb.enrich_package(package=package1)
246246
self.assertIsNone(updated_fields)
247247

248248
purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json"
249249
purldb_entry = json.loads(purldb_entry_file.read_text())
250-
mock_get_package_by_purl.return_value = purldb_entry
250+
mock_collect_data.return_value = [purldb_entry]
251251
updated_fields = purldb.enrich_package(package=package1)
252252
self.assertTrue(updated_fields)
253253
self.assertIn("homepage_url", updated_fields)
@@ -258,13 +258,11 @@ def test_scanpipe_pipes_purldb_enrich_package(self, mock_get_package_by_purl):
258258
self.assertEqual(purldb_entry.get("sha256"), package1.sha256)
259259
self.assertEqual(purldb_entry.get("copyright"), package1.copyright)
260260

261-
@mock.patch("scanpipe.pipes.purldb.get_package_by_purl")
262-
def test_scanpipe_pipes_purldb_enrich_discovered_packages(
263-
self, mock_get_package_by_purl
264-
):
261+
@mock.patch("scanpipe.pipes.purldb.collect_data_for_purl")
262+
def test_scanpipe_pipes_purldb_enrich_discovered_packages(self, mock_collect_data):
265263
package1 = make_package(self.project1, package_url="pkg:npm/csvtojson@2.0.10")
266264

267-
mock_get_package_by_purl.return_value = {}
265+
mock_collect_data.return_value = []
268266
buffer = io.StringIO()
269267
updated_package_count = purldb.enrich_discovered_packages(
270268
project=self.project1,
@@ -276,7 +274,7 @@ def test_scanpipe_pipes_purldb_enrich_discovered_packages(
276274

277275
purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json"
278276
purldb_entry = json.loads(purldb_entry_file.read_text())
279-
mock_get_package_by_purl.return_value = purldb_entry
277+
mock_collect_data.return_value = [purldb_entry]
280278
buffer = io.StringIO()
281279
updated_package_count = purldb.enrich_discovered_packages(
282280
project=self.project1,

scanpipe/tests/test_pipelines.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1709,9 +1709,9 @@ def test_scanpipe_collect_symbols_tree_sitter_pipeline_integration(self):
17091709

17101710
@mock.patch("scanpipe.pipes.purldb.is_available")
17111711
@mock.patch("scanpipe.pipes.purldb.is_configured")
1712-
@mock.patch("scanpipe.pipes.purldb.get_package_by_purl")
1712+
@mock.patch("scanpipe.pipes.purldb.collect_data_for_purl")
17131713
def test_scanpipe_enrich_with_purldb_pipeline_integration(
1714-
self, mock_get_package, mock_is_configured, mock_is_available
1714+
self, mock_collect_data, mock_is_configured, mock_is_available
17151715
):
17161716
pipeline_name = "enrich_with_purldb"
17171717
project1 = Project.objects.create(name="Analysis")
@@ -1722,7 +1722,7 @@ def test_scanpipe_enrich_with_purldb_pipeline_integration(
17221722

17231723
purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json"
17241724
purldb_entry = json.loads(purldb_entry_file.read_text())
1725-
mock_get_package.return_value = purldb_entry
1725+
mock_collect_data.return_value = [purldb_entry]
17261726

17271727
run = project1.add_pipeline(pipeline_name)
17281728
pipeline = run.make_pipeline_instance()

scanpipe/tests/test_views.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,21 +1048,23 @@ def test_scanpipe_views_discovered_package_purldb_tab_view(self, mock_configured
10481048
self.assertContains(response, "tab-purldb")
10491049
self.assertContains(response, '<section id="tab-purldb"')
10501050

1051-
with mock.patch("scanpipe.pipes.purldb.get_package_by_purl") as get_package:
1052-
get_package.return_value = None
1051+
with mock.patch("scanpipe.pipes.purldb.get_packages_for_purl") as get_packages:
1052+
get_packages.return_value = None
10531053
purldb_tab_url = f"{package_url}purldb_tab/"
10541054
response = self.client.get(purldb_tab_url)
10551055
msg = "No entries found in the PurlDB for this package"
10561056
self.assertContains(response, msg)
10571057

1058-
get_package.return_value = {
1059-
"uuid": "9261605f-e2fb-4db9-94ab-0d82d3273cdf",
1060-
"filename": "abab-2.0.3.tgz",
1061-
"type": "npm",
1062-
"name": "abab",
1063-
"version": "2.0.3",
1064-
"primary_language": "JavaScript",
1065-
}
1058+
get_packages.return_value = [
1059+
{
1060+
"uuid": "9261605f-e2fb-4db9-94ab-0d82d3273cdf",
1061+
"filename": "abab-2.0.3.tgz",
1062+
"type": "npm",
1063+
"name": "abab",
1064+
"version": "2.0.3",
1065+
"primary_language": "JavaScript",
1066+
}
1067+
]
10661068
response = self.client.get(purldb_tab_url)
10671069
self.assertContains(response, "abab-2.0.3.tgz")
10681070
self.assertContains(response, "2.0.3")

scanpipe/views.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,9 +1999,13 @@ def get_context_data(self, **kwargs):
19991999
if not purldb.is_configured():
20002000
raise Http404("PurlDB access is not configured.")
20012001

2002-
if purldb_entry := purldb.get_package_by_purl(self.object.package_url):
2003-
fields = self.get_fields_data(purldb_entry)
2002+
if purldb_entries := purldb.get_packages_for_purl(self.object.package_url):
2003+
# Always display the most recent version entry.
2004+
fields = self.get_fields_data(purldb_entries[0])
20042005
context["tab_data"] = {"fields": fields}
2006+
# Display a warning if multiple packages found in PurlDB for this purl.
2007+
if len(purldb_entries) > 1:
2008+
context["has_multiple_purldb_entries"] = True
20052009

20062010
return context
20072011

0 commit comments

Comments
 (0)