Skip to content

Commit 310d0a8

Browse files
JonoYang and tdruez
authored
Fingerprint codebase resources (#1163)
* Create fingerprint_codebase_resources Signed-off-by: Jono Yang <jyang@nexb.com> * Update docstring * Use specific matchcode-toolkit branch in setup.cfg Signed-off-by: Jono Yang <jyang@nexb.com> * Add test for fingerprint_codebase_resources Signed-off-by: Jono Yang <jyang@nexb.com> * Update CHANGELOG.rst Signed-off-by: Jono Yang <jyang@nexb.com> * Use scancode API to run fingerprinting function Signed-off-by: Jono Yang <jyang@nexb.com> * Move changelog entry Signed-off-by: Jono Yang <jyang@nexb.com> * Bump matchcode-toolkit version to v4.1.0 Signed-off-by: Jono Yang <jyang@nexb.com> * Remove scan_func_kwargs from fingerprint_codebase_resources Signed-off-by: Jono Yang <jyang@nexb.com> --------- Signed-off-by: Jono Yang <jyang@nexb.com> Co-authored-by: tdruez <489057+tdruez@users.noreply.github.com>
1 parent 7d413bc commit 310d0a8

File tree

4 files changed

+93
-3
lines changed

4 files changed

+93
-3
lines changed

CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@ v34.4.0 (unreleased)
1111
Also, the CycloneDX outputs can be downloaded as 1.6, 1.5, and 1.4 spec versions.
1212
https://github.com/nexB/scancode.io/pull/1165
1313

14+
- Update matchcode-toolkit to v4.1.0
15+
16+
- Add a new function
17+
`scanpipe.pipes.matchcode.fingerprint_codebase_resources()`, which computes
18+
approximate file matching fingerprints for text files using the new
19+
`get_file_fingerprint_hashes` function from matchcode-toolkit.
20+
1421
- Rename the `purldb-scan-queue-worker` management command to `purldb-scan-worker`.
1522

1623
- Add `docker-compose.purldb-scan-worker.yml` to run ScanCode.io as a PurlDB

scanpipe/pipes/matchcode.py

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,15 @@
2727

2828
import requests
2929
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
30+
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
31+
from scancode import Scanner
3032

3133
from scanpipe.pipes import codebase
3234
from scanpipe.pipes import flag
3335
from scanpipe.pipes import poll_until_success
3436
from scanpipe.pipes.output import to_json
37+
from scanpipe.pipes.scancode import _scan_resource
38+
from scanpipe.pipes.scancode import scan_resources
3539

3640

3741
class MatchCodeIOException(Exception):
@@ -174,10 +178,12 @@ def save_directory_fingerprints(project, virtual_codebase, to_codebase_only=Fals
174178

175179
def fingerprint_codebase_directories(project, to_codebase_only=False):
176180
"""
177-
Compute directory fingerprints for the directories of the to/ codebase from
178-
`project`.
181+
Compute directory fingerprints for the directories from `project`.
179182
180183
These directory fingerprints are used for matching purposes on matchcode.
184+
185+
If `to_codebase_only` is True, only the directories from the `to/` codebase
186+
are fingerprinted.
181187
"""
182188
resources = project.codebaseresources.all()
183189
if to_codebase_only:
@@ -189,6 +195,61 @@ def fingerprint_codebase_directories(project, to_codebase_only=False):
189195
)
190196

191197

198+
def fingerprint_codebase_resource(location, with_threading=True, **kwargs):
    """
    Run the fingerprint scanner on the resource found at `location` through
    the scancode-toolkit direct API.

    The value returned by `_scan_resource` is handed back unchanged: a
    dictionary of scan `results` and a list of `errors`.

    Any extra `kwargs` are accepted but not used by this function.
    """
    # NOTE(review): `kwargs` is presumably kept so this function can be used
    # as a `scan_func` that receives extra options -- confirm against
    # `scan_resources`.
    fingerprint_scanner = Scanner("fingerprints", get_file_fingerprint_hashes)
    return _scan_resource(
        location,
        [fingerprint_scanner],
        with_threading=with_threading,
    )
209+
210+
211+
def save_resource_fingerprints(resource, scan_results, scan_errors=None):
    """
    Merge the fingerprints found in `scan_results` into `resource.extra_data`
    and save the `resource`.

    When `scan_errors` is not empty, the errors are recorded on the `resource`
    and its status is set to `flag.SCANNED_WITH_ERROR`.
    """
    extra_data = resource.extra_data
    extra_data.update(scan_results)
    resource.save()

    # Nothing more to do when the scan completed without errors.
    if not scan_errors:
        return

    resource.add_errors(scan_errors)
    resource.update(status=flag.SCANNED_WITH_ERROR)
222+
223+
224+
def fingerprint_codebase_resources(
    project, resource_qs=None, progress_logger=None, to_codebase_only=False
):
    """
    Compute fingerprints for the resources of `project`.

    These resource fingerprints are used for matching purposes on matchcode.

    When `resource_qs` is not provided, the text resources of the `project`
    (`is_text=True`) are fingerprinted.

    If `to_codebase_only` is True, only the resources from the `to/` codebase
    are fingerprinted.

    Multiprocessing is enabled by default on this pipe. The number of
    processes can be controlled through the SCANCODEIO_PROCESSES setting.
    """
    # Compare against None explicitly: an empty queryset passed by the caller
    # is a valid value and must not be replaced by the default one.
    selected_resources = (
        project.codebaseresources.filter(is_text=True)
        if resource_qs is None
        else resource_qs
    )

    if to_codebase_only:
        selected_resources = selected_resources.to_codebase()

    scan_options = {
        "resource_qs": selected_resources,
        "scan_func": fingerprint_codebase_resource,
        "save_func": save_resource_fingerprints,
        "progress_logger": progress_logger,
    }
    scan_resources(**scan_options)
251+
252+
192253
def send_project_json_to_matchcode(
193254
project, timeout=DEFAULT_TIMEOUT, api_url=MATCHCODEIO_API_URL
194255
):

scanpipe/tests/pipes/test_matchcode.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@
2929
from django.test import TestCase
3030

3131
from scanpipe.models import AbstractTaskFieldsModel
32+
from scanpipe.models import CodebaseResource
3233
from scanpipe.models import Project
3334
from scanpipe.pipes import matchcode
35+
from scanpipe.pipes.input import copy_input
3436
from scanpipe.tests import make_resource_file
3537

3638

@@ -311,3 +313,23 @@ def test_scanpipe_pipes_matchcode_get_match_results(
311313
match_results = matchcode.get_match_results(run_url)
312314

313315
self.assertEqual(mock_request_get_results_return, match_results)
316+
317+
def test_scanpipe_pipes_matchcode_fingerprint_codebase_resources(self):
    """
    Check that `fingerprint_codebase_resources` stores a fingerprint in the
    `extra_data` of a text resource and leaves a non-text resource untouched.
    """
    project = self.project1
    codebase_path = project.codebase_path

    # Text resource: expected to receive a fingerprint.
    copy_input(self.data_location / "notice.NOTICE", codebase_path)
    text_resource = CodebaseResource.objects.create(
        project=project,
        path="notice.NOTICE",
        is_text=True,
    )

    # Created without `is_text=True`: this resource should not have a
    # fingerprint.
    copy_input(self.data_location / "is-npm-1.0.0.tgz", codebase_path)
    archive_resource = CodebaseResource.objects.create(
        project=project,
        path="is-npm-1.0.0.tgz",
    )

    matchcode.fingerprint_codebase_resources(project)
    for resource in (text_resource, archive_resource):
        resource.refresh_from_db()

    self.assertEqual(
        {"halo1": "ef420f7e84c8c74c691315f0a06ac4f0"},
        text_resource.extra_data,
    )
    self.assertFalse(archive_resource.extra_data)

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ install_requires =
9595
# Font Awesome
9696
fontawesomefree==6.5.1
9797
# MatchCode-toolkit
98-
matchcode-toolkit==4.0.0
98+
matchcode-toolkit==4.1.0
9999
# Univers
100100
univers==30.11.0
101101
# Markdown

0 commit comments

Comments
 (0)