Skip to content

Commit bb98875

Browse files
authored
Create "client" matching pipeline (#1042)
* Create pipeline for purldb matching Signed-off-by: Jono Yang <jyang@nexb.com> * Update matching pipeline logic Signed-off-by: Jono Yang <jyang@nexb.com> * Create test for match_to_purldb Signed-off-by: Jono Yang <jyang@nexb.com> * Fix code style Signed-off-by: Jono Yang <jyang@nexb.com> * Update matching pipeline to be an addon pipeline * Update test Signed-off-by: Jono Yang <jyang@nexb.com> * Rename matching pipeline to match_codebase_to_purldb Signed-off-by: Jono Yang <jyang@nexb.com> * Rename pipeline to match_to_purldb Signed-off-by: Jono Yang <jyang@nexb.com> * Remove create_codebase_json step Signed-off-by: Jono Yang <jyang@nexb.com> * Use context manager when opening project_json_location Signed-off-by: Jono Yang <jyang@nexb.com> * Split match_to_purldb into multiple functions * Create tests for new functions Signed-off-by: Jono Yang <jyang@nexb.com> * Split match_to_purldb into three steps Signed-off-by: Jono Yang <jyang@nexb.com> * Add step to ensure PurlDB is available Signed-off-by: Jono Yang <jyang@nexb.com> * Return only run_url in send_project_json_to_matchcode Signed-off-by: Jono Yang <jyang@nexb.com> * Remove unused function * Update docstrings Signed-off-by: Jono Yang <jyang@nexb.com> * Update CHANGELOG.rst * Update match_to_purldb description Signed-off-by: Jono Yang <jyang@nexb.com> * Declare match_to_purldb pipeline in alphabetical order Signed-off-by: Jono Yang <jyang@nexb.com> * Create function to get match results * Update poll_until_success to return True when a run is successful, instead of returning the match results Signed-off-by: Jono Yang <jyang@nexb.com> * Raise an exception if the match run has stopped Signed-off-by: Jono Yang <jyang@nexb.com> * Address review comments * Create new test cases for poll_until_success Signed-off-by: Jono Yang <jyang@nexb.com> * Remove params from project_url * Fix indent in send_project_json_to_matchcode * Update docstring Signed-off-by: Jono Yang <jyang@nexb.com> --------- Signed-off-by: Jono Yang 
<jyang@nexb.com>
1 parent 2f2c0e3 commit bb98875

File tree

9 files changed

+683
-2
lines changed

9 files changed

+683
-2
lines changed

CHANGELOG.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ v33.0.0 (2024-01-16)
6868
include credentials such as "user:pass@domain".
6969
https://github.com/nexB/scancode.io/issues/998
7070

71+
- Add a new pipeline, ``match_to_purldb``, that checks CodebaseResources of a
72+
Project against PurlDB for Package matches.
73+
7174
v32.7.0 (2023-10-25)
7275
--------------------
7376

scanpipe/pipelines/match_to_purldb.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/nexB/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import purldb
25+
26+
27+
class MatchToPurlDB(Pipeline):
    """
    Check CodebaseResources of a Project against PurlDB for Package matches.

    This involves creating a JSON scan of the Project codebase, sending it to
    MatchCode on PurlDB, waiting for match results, creating DiscoveredPackages
    from the match results Package data and associating the proper
    CodebaseResources to those DiscoveredPackages.
    """

    download_inputs = False
    is_addon = True

    @classmethod
    def steps(cls):
        """Return the ordered steps of this pipeline."""
        return (
            cls.check_purldb_service_availability,
            cls.send_project_json_to_matchcode,
            cls.poll_matching_results,
            cls.create_packages_from_match_results,
        )

    def check_purldb_service_availability(self):
        """Check if the PurlDB service is configured and available."""
        # Checks run in order; the first failing one aborts the pipeline.
        requirements = (
            (purldb.is_configured, "PurlDB is not configured."),
            (purldb.is_available, "PurlDB is not available."),
        )
        for requirement_is_met, error_message in requirements:
            if not requirement_is_met():
                raise Exception(error_message)

    def send_project_json_to_matchcode(self):
        """Create a JSON scan of the project Codebase and send it to MatchCode."""
        # The run URL is kept on the instance for the following steps.
        self.run_url = purldb.send_project_json_to_matchcode(self.project)

    def poll_matching_results(self):
        """Wait until the match results are ready by polling the match run status."""
        purldb.poll_until_success(self.run_url)

    def create_packages_from_match_results(self):
        """Create DiscoveredPackages from match results."""
        purldb.create_packages_from_match_results(
            self.project,
            purldb.get_match_results(self.run_url),
        )

scanpipe/pipes/purldb.py

Lines changed: 122 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
import json
2424
import logging
25+
import time
26+
from collections import defaultdict
2527

2628
from django.conf import settings
2729

@@ -30,7 +32,15 @@
3032
from univers.version_range import RANGE_CLASS_BY_SCHEMES
3133
from univers.version_range import InvalidVersionRange
3234

35+
from scanpipe.models import AbstractTaskFieldsModel
3336
from scanpipe.pipes import LoopProgress
37+
from scanpipe.pipes import flag
38+
from scanpipe.pipes.output import to_json
39+
40+
41+
class PurlDBException(Exception):
    """Error raised by the PurlDB helpers of this module."""
43+
3444

3545
label = "PurlDB"
3646
logger = logging.getLogger(__name__)
@@ -100,9 +110,11 @@ def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT):
100110
logger.debug(f"{label} [Exception] {exception}")
101111

102112

103-
def request_post(url, data, headers=None, timeout=DEFAULT_TIMEOUT):
113+
def request_post(url, data=None, headers=None, files=None, timeout=DEFAULT_TIMEOUT):
104114
try:
105-
response = session.post(url, data=data, timeout=timeout, headers=headers)
115+
response = session.post(
116+
url, data=data, timeout=timeout, headers=headers, files=files
117+
)
106118
response.raise_for_status()
107119
return response.json()
108120
except (requests.RequestException, ValueError, TypeError) as exception:
@@ -320,3 +332,111 @@ def populate_purldb_with_discovered_dependencies(project, logger=logger.info):
320332
chunk_size=10,
321333
logger=logger,
322334
)
335+
336+
337+
def send_project_json_to_matchcode(
    project, timeout=DEFAULT_TIMEOUT, api_url=PURLDB_API_URL
):
    """
    Given a `project`, create a JSON scan of the `project` CodebaseResources and
    send it to PurlDB for matching. Return the URL, as a string, of the match
    run created for the uploaded scan.

    Raise a PurlDBException when no run URL can be read from the response.
    """
    scan_output_location = to_json(project)
    # The file must stay open while the request is sent, since it is streamed
    # as the multipart "upload_file" field.
    with open(scan_output_location, "rb") as f:
        files = {"upload_file": f}
        response = request_post(
            url=f"{api_url}matching/",
            timeout=timeout,
            files=files,
        )

    # `request_post` catches request errors itself, so the response may not be
    # the expected mapping (presumably None on failure -- confirm against
    # `request_post`). Report this as a PurlDB error rather than as an opaque
    # TypeError/KeyError/IndexError.
    try:
        return response["runs"][0]["url"]
    except (KeyError, IndexError, TypeError) as error:
        msg = f"Unable to get a match run URL from the response: {response!r}"
        raise PurlDBException(msg) from error
355+
356+
357+
def poll_until_success(run_url, sleep=10):
    """
    Given a URL to a scancode.io run instance, `run_url`, return True when the
    run instance has completed successfully.

    The run status is fetched every `sleep` seconds until the run has
    completed.

    Raise a PurlDBException when the run instance has failed, stopped, or gone
    stale.
    """
    run_status = AbstractTaskFieldsModel.Status
    halted_statuses = (
        run_status.FAILURE,
        run_status.STOPPED,
        run_status.STALE,
    )

    while True:
        response = request_get(run_url)
        if response:
            status = response["status"]
            if status == run_status.SUCCESS:
                return True

            if status in halted_statuses:
                log = response["log"]
                msg = f"Matching run has stopped:\n\n{log}"
                raise PurlDBException(msg)

        # The run is still pending (not started, queued, running) or the status
        # could not be fetched: always wait before polling again, so the remote
        # service is not queried in a tight loop.
        time.sleep(sleep)
390+
391+
392+
def get_match_results(run_url):
    """
    Given the `run_url` for a pipeline running the matchcode matching pipeline,
    return the match results for that run.

    The results are fetched from the "results/" endpoint of the project
    referenced by the run.

    Raise a PurlDBException when the run details cannot be fetched.
    """
    response = request_get(run_url)
    if not response:
        raise PurlDBException(f"Unable to get the match run details from {run_url}")

    project_url = response["project"]
    # `project_url` can have params, such as "?format=json". Keep only what
    # comes before the first "?"; `partition` also copes with URLs that contain
    # no "?" or several of them.
    project_url, _, _ = project_url.partition("?")
    project_url = project_url.rstrip("/")
    results_url = project_url + "/results/"
    return request_get(results_url)
405+
406+
407+
def map_match_results(match_results):
    """
    Group the Resource paths found in `match_results` by matched package.

    `match_results` is a mapping of ScanCode.io codebase results. Return a
    defaultdict(list) keyed by package_uid, where each value is the list of
    paths of the "files" entries that reference that package_uid in their
    "for_packages".
    """
    paths_by_uid = defaultdict(list)
    for file_entry in match_results.get("files", []):
        # Entries without any "for_packages" contribute nothing.
        for uid in file_entry.get("for_packages", []):
            paths_by_uid[uid].append(file_entry["path"])
    return paths_by_uid
421+
422+
423+
def create_packages_from_match_results(project, match_results):
    """
    Create DiscoveredPackages for `project` from the Package data of
    `match_results`, a mapping of ScanCode.io codebase results, and associate
    the matching Resources of `project` to each created DiscoveredPackage.
    """
    from scanpipe.pipes.d2d import create_package_from_purldb_data

    paths_by_uid = map_match_results(match_results)
    for package_data in match_results.get("packages", []):
        uid = package_data["package_uid"]
        # Resources of the project whose path was reported for this package.
        package_resources = project.codebaseresources.filter(
            path__in=paths_by_uid[uid]
        )
        create_package_from_purldb_data(
            project,
            resources=package_resources,
            package_data=package_data,
            status=flag.MATCHED_TO_PURLDB_PACKAGE,
        )
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
{
2+
"headers": [
3+
{
4+
"tool_name": "scanpipe",
5+
"tool_version": "v3.0.0-241-g45d653b",
6+
"other_tools": [
7+
"pkg:pypi/scancode-toolkit@32.0.8"
8+
],
9+
"notice": "Generated with ScanCode.io and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied.\nNo content created from ScanCode.io should be considered or used as legal advice.\nConsult an Attorney for any legal advice.\nScanCode.io is a free software code scanning tool from nexB Inc. and others\nlicensed under the Apache License version 2.0.\nScanCode is a trademark of nexB Inc.\nVisit https://github.com/nexB/scancode.io for support and download.\n",
10+
"uuid": "b47c79e8-14f1-4991-a584-717b39233750",
11+
"created_date": "2024-01-09T00:59:38.649Z",
12+
"notes": "",
13+
"settings": {},
14+
"input_sources": [
15+
{
16+
"filename": "test-out.json",
17+
"source": "uploaded"
18+
}
19+
],
20+
"runs": [],
21+
"extra_data": {}
22+
}
23+
],
24+
"packages": [],
25+
"dependencies": [],
26+
"files": [
27+
{
28+
"path": "elasticsearch-x-content-7.17.9-sources.jar",
29+
"type": "file",
30+
"name": "elasticsearch-x-content-7.17.9-sources.jar",
31+
"status": "matched-to-purldb-package",
32+
"tag": "",
33+
"extension": ".jar",
34+
"size": 89821,
35+
"md5": "abcc7ad93deadc72f4b4369cb933e8e1",
36+
"sha1": "30d21add57abe04beece3f28a079671dbc9043e4",
37+
"sha256": "dc433d60ef7e43670500bd97c4f1d217a1880774a1dfb75de7132fb0b5208acd",
38+
"sha512": "",
39+
"mime_type": "application/zip",
40+
"file_type": "Zip archive data, at least v1.0 to extract",
41+
"programming_language": "",
42+
"is_binary": true,
43+
"is_text": false,
44+
"is_archive": true,
45+
"is_media": false,
46+
"is_key_file": false,
47+
"detected_license_expression": "",
48+
"detected_license_expression_spdx": "",
49+
"license_detections": [],
50+
"license_clues": [],
51+
"percentage_of_license_text": null,
52+
"compliance_alert": "",
53+
"copyrights": [],
54+
"holders": [],
55+
"authors": [],
56+
"package_data": [],
57+
"for_packages": [],
58+
"emails": [],
59+
"urls": [],
60+
"extra_data": {}
61+
}
62+
],
63+
"relations": []
64+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"url": "http://192.168.1.12/api/runs/52b2930d-6e85-4b3e-ba3e-17dd9a618650/",
3+
"pipeline_name": "matching",
4+
"status": "success",
5+
"description": "",
6+
"project": "http://192.168.1.12/api/matching/65bf1e6d-6bff-4841-9c9b-db5cf25edfa7/",
7+
"uuid": "52b2930d-6e85-4b3e-ba3e-17dd9a618650",
8+
"created_date": "2024-01-08T23:24:50.063864Z",
9+
"scancodeio_version": "v3.0.0-241-g45d653b",
10+
"task_id": "52b2930d-6e85-4b3e-ba3e-17dd9a618650",
11+
"task_start_date": "2024-01-08T23:24:50.107013Z",
12+
"task_end_date": "2024-01-08T23:24:53.095046Z",
13+
"task_exitcode": 0,
14+
"task_output": "",
15+
"log": "2024-01-08 23:24:50.10 Pipeline [matching] starting\n2024-01-08 23:24:50.11 Step [get_inputs] starting\n2024-01-08 23:24:50.11 Step [get_inputs] completed in 0 seconds\n2024-01-08 23:24:50.11 Step [build_inventory_from_scans] starting\n2024-01-08 23:24:50.28 Step [build_inventory_from_scans] completed in 0 seconds\n2024-01-08 23:24:50.29 Step [flag_empty_files] starting\n2024-01-08 23:24:50.29 Step [flag_empty_files] completed in 0 seconds\n2024-01-08 23:24:50.29 Step [match_archives_to_purldb_packages] starting\n2024-01-08 23:24:50.29 Matching 1 resources in PurlDB, using SHA1\n2024-01-08 23:24:52.98 58 resources matched in PurlDB using 1 SHA1s\n2024-01-08 23:24:52.98 Step [match_archives_to_purldb_packages] completed in 3 seconds\n2024-01-08 23:24:52.99 Step [match_archives_to_purldb_resources] starting\n2024-01-08 23:24:52.99 Skipping resource matching as there are 0\n2024-01-08 23:24:52.99 0 resources matched in PurlDB using 0 SHA1s\n2024-01-08 23:24:53.00 Step [match_archives_to_purldb_resources] completed in 0 seconds\n2024-01-08 23:24:53.00 Step [fingerprint_codebase_directories] starting\n2024-01-08 23:24:53.04 Step [fingerprint_codebase_directories] completed in 0 seconds\n2024-01-08 23:24:53.04 Step [match_directories_exact_to_purldb] starting\n2024-01-08 23:24:53.04 Matching 0 directories against PurlDB\n2024-01-08 23:24:53.05 0 directories matched in PurlDB\n2024-01-08 23:24:53.05 Step [match_directories_exact_to_purldb] completed in 0 seconds\n2024-01-08 23:24:53.05 Step [match_resources_to_purldb] starting\n2024-01-08 23:24:53.05 Skipping resource matching as there are 0\n2024-01-08 23:24:53.06 0 resources matched in PurlDB using 0 SHA1s\n2024-01-08 23:24:53.06 Step [match_resources_to_purldb] completed in 0 seconds\n2024-01-08 23:24:53.06 Step [match_directories_to_purldb] starting\n2024-01-08 23:24:53.06 Matching 0 directories against PurlDB\n2024-01-08 23:24:53.07 0 directories matched in PurlDB\n2024-01-08 23:24:53.07 Step 
[match_directories_to_purldb] completed in 0 seconds\n2024-01-08 23:24:53.07 Step [match_purldb_resources_post_process] starting\n2024-01-08 23:24:53.07 Refining matching for 1 matched-to-purldb-resource archives.\n2024-01-08 23:24:53.08 0 resource processed\n2024-01-08 23:24:53.08 Step [match_purldb_resources_post_process] completed in 0 seconds\n2024-01-08 23:24:53.08 Step [remove_packages_without_resources] starting\n2024-01-08 23:24:53.09 Step [remove_packages_without_resources] completed in 0 seconds\n2024-01-08 23:24:53.09 Pipeline completed in 3 seconds\n",
16+
"execution_time": 2
17+
}

0 commit comments

Comments
 (0)