Skip to content

Commit 696deb5

Browse files
authored
Refine the extraction errors reporting to include the resource path #1273 (#1276)
Signed-off-by: tdruez <tdruez@nexb.com>
1 parent a2e392e commit 696deb5

File tree

9 files changed

+246
-121
lines changed

9 files changed

+246
-121
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ v34.6.4 (unreleased)
77
- Add all "classify" plugin fields from scancode-toolkit on the CodebaseResource model.
88
https://github.com/nexB/scancode.io/issues/1275
99

10+
- Refine the extraction errors reporting to include the resource path for rendering
11+
links to the related resources in the UI.
12+
https://github.com/nexB/scancode.io/issues/1273
13+
1014
- Add a ``flush-projects`` management command, to delete all project data and their
1115
related work directories created more than a specified number of days ago.
1216
https://github.com/nexB/scancode.io/issues/1289

scanpipe/pipelines/__init__.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import traceback
2626
from contextlib import contextmanager
2727
from functools import wraps
28+
from pathlib import Path
2829
from pydoc import getdoc
2930
from pydoc import splitdoc
3031
from timeit import default_timer as timer
@@ -301,17 +302,45 @@ def flag_ignored_resources(self):
301302
if ignored_patterns := self.env.get("ignored_patterns"):
302303
flag.flag_ignored_patterns(self.project, patterns=ignored_patterns)
303304

304-
def extract_archives(self):
305+
def extract_archive(self, location, target):
306+
"""Extract archive at `location` to `target`. Save errors as messages."""
307+
from scanpipe.pipes import scancode
308+
309+
extract_errors = scancode.extract_archive(location, target)
310+
311+
for resource_location, errors in extract_errors.items():
312+
resource_path = Path(resource_location)
313+
314+
if resource_path.is_relative_to(self.project.codebase_path):
315+
resource_path = resource_path.relative_to(self.project.codebase_path)
316+
details = {"resource_path": str(resource_path)}
317+
elif resource_path.is_relative_to(self.project.input_path):
318+
resource_path = resource_path.relative_to(self.project.input_path)
319+
details = {"path": f"input/{str(resource_path)}"}
320+
else:
321+
details = {"filename": str(resource_path.name)}
322+
323+
self.project.add_error(
324+
description="\n".join(errors),
325+
model="extract_archive",
326+
details=details,
327+
)
328+
329+
def extract_archives(self, location=None):
305330
"""Extract archives located in the codebase/ directory with extractcode."""
306331
from scanpipe.pipes import scancode
307332

308-
extract_errors = scancode.extract_archives(
309-
location=self.project.codebase_path,
310-
recurse=True,
311-
)
333+
if not location:
334+
location = self.project.codebase_path
312335

313-
if extract_errors:
314-
self.add_error("\n".join(extract_errors))
336+
extract_errors = scancode.extract_archives(location=location, recurse=True)
337+
338+
for resource_path, errors in extract_errors.items():
339+
self.project.add_error(
340+
description="\n".join(errors),
341+
model="extract_archives",
342+
details={"resource_path": resource_path},
343+
)
315344

316345
# Reload the project env post-extraction as the scancode-config.yml file
317346
# may be located in one of the extracted archives.

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -127,13 +127,9 @@ def extract_inputs_to_codebase_directory(self):
127127
(self.to_files, self.project.codebase_path / d2d.TO),
128128
]
129129

130-
errors = []
131130
for input_files, codebase_path in inputs_with_codebase_path_destination:
132131
for input_file_path in input_files:
133-
errors += scancode.extract_archive(input_file_path, codebase_path)
134-
135-
if errors:
136-
self.add_error("\n".join(errors))
132+
self.extract_archive(input_file_path, codebase_path)
137133

138134
# Reload the project env post-extraction as the scancode-config.yml file
139135
# may be located in one of the extracted archives.

scanpipe/pipelines/root_filesystem.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,10 @@ def extract_input_files_to_codebase_directory(self):
5353
"""Extract root filesystem input archives with extractcode."""
5454
input_files = self.project.inputs("*")
5555
target_path = self.project.codebase_path
56-
errors = []
5756

5857
for input_file in input_files:
5958
extract_target = target_path / f"{input_file.name}{EXTRACT_SUFFIX}"
60-
extract_errors = scancode.extract_archive(input_file, extract_target)
61-
errors.extend(extract_errors)
62-
63-
if errors:
64-
self.add_error("\n".join(errors))
59+
self.extract_archive(input_file, extract_target)
6560

6661
# Reload the project env post-extraction as the scancode-config.yml file
6762
# may be located in one of the extracted archives.

scanpipe/pipelines/scan_single_package.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,7 @@ def extract_input_to_codebase_directory(self):
9393
copy_input(self.input_path, self.project.codebase_path)
9494
return
9595

96-
extract_errors = scancode.extract_archive(
97-
location=self.input_path,
98-
target=self.project.codebase_path,
99-
)
100-
101-
if extract_errors:
102-
self.add_error("\n".join(extract_errors))
96+
self.extract_archive(self.input_path, self.project.codebase_path)
10397

10498
# Reload the project env post-extraction as the scancode-config.yml file
10599
# may be located in one of the extracted archives.

scanpipe/pipes/scancode.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,15 @@ def extract_archive(location, target):
102102
Extract a single archive or compressed file at `location` to the `target`
103103
directory.
104104
105-
Return a list of extraction errors.
105+
Return a dict of extraction errors, keyed by the resource location.
106106
107107
Wrapper of the `extractcode.api.extract_archive` function.
108108
"""
109-
errors = []
109+
errors = {}
110110

111111
for event in extractcode_api.extract_archive(location, target):
112-
if event.done:
113-
errors.extend(event.errors)
112+
if event.done and event.errors:
113+
errors[str(event.source)] = event.errors
114114

115115
return errors
116116

@@ -125,7 +125,7 @@ def extract_archives(location, recurse=False):
125125
126126
If `recurse` is True, extract nested archives-in-archives recursively.
127127
128-
Return a list of extraction errors.
128+
Return a dict of extraction errors, keyed by the resource location.
129129
130130
Wrapper of the `extractcode.api.extract_archives` function.
131131
"""
@@ -135,10 +135,10 @@ def extract_archives(location, recurse=False):
135135
"all_formats": True,
136136
}
137137

138-
errors = []
138+
errors = {}
139139
for event in extractcode_api.extract_archives(location, **options):
140-
if event.done:
141-
errors.extend(event.errors)
140+
if event.done and event.errors:
141+
errors[str(event.source)] = event.errors
142142

143143
return errors
144144

235 Bytes
Binary file not shown.

scanpipe/tests/pipes/test_scancode.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def test_scanpipe_pipes_scancode_extract_archive(self):
5959
input_location = str(self.data_location / "archive.zip")
6060

6161
errors = scancode.extract_archive(input_location, target)
62-
self.assertEqual([], errors)
62+
self.assertEqual({}, errors)
6363

6464
results = [path.name for path in list(Path(target).glob("**/*"))]
6565
expected = [
@@ -72,13 +72,25 @@ def test_scanpipe_pipes_scancode_extract_archive(self):
7272
for path in expected:
7373
self.assertIn(path, results)
7474

75+
def test_scanpipe_pipes_scancode_extract_archive_errors(self):
76+
target = tempfile.mkdtemp()
77+
input_location = str(self.data_location / "scancode" / "corrupted.tar.gz")
78+
errors = scancode.extract_archive(input_location, target)
79+
80+
error_message = "gzip decompression failed"
81+
if sys.platform == "darwin":
82+
error_message += " (zlib returned error -3, msg invalid code lengths set)"
83+
84+
expected = {input_location: [error_message]}
85+
self.assertEqual(expected, errors)
86+
7587
def test_scanpipe_pipes_scancode_extract_archives(self):
7688
tempdir = Path(tempfile.mkdtemp())
7789
input_location = str(self.data_location / "archive.zip")
7890
copy_input(input_location, tempdir)
7991

8092
errors = scancode.extract_archives(tempdir)
81-
self.assertEqual([], errors)
93+
self.assertEqual({}, errors)
8294

8395
results = [path.name for path in list(tempdir.glob("**/*"))]
8496
self.assertEqual(9, len(results))
@@ -93,6 +105,19 @@ def test_scanpipe_pipes_scancode_extract_archives(self):
93105
for path in expected:
94106
self.assertIn(path, results)
95107

108+
def test_scanpipe_pipes_scancode_extract_archives_errors(self):
109+
tempdir = Path(tempfile.mkdtemp())
110+
input_location = str(self.data_location / "scancode" / "corrupted.tar.gz")
111+
target = copy_input(input_location, tempdir)
112+
errors = scancode.extract_archives(tempdir)
113+
114+
error_message = "gzip decompression failed"
115+
if sys.platform == "darwin":
116+
error_message += " (zlib returned error -3, msg invalid code lengths set)"
117+
118+
expected = {str(target): [error_message]}
119+
self.assertEqual(expected, errors)
120+
96121
@skipIf(sys.platform != "linux", "QCOW2 extraction is not available on macOS.")
97122
def test_scanpipe_pipes_scancode_extract_archive_vmimage_qcow2(self):
98123
target = tempfile.mkdtemp()
@@ -104,7 +129,7 @@ def test_scanpipe_pipes_scancode_extract_archive_vmimage_qcow2(self):
104129

105130
# The VM image extraction features are available in the Docker image context.
106131
if from_docker_image:
107-
self.assertEqual([], errors)
132+
self.assertEqual({}, errors)
108133
results = [path.name for path in list(Path(target).glob("**/*"))]
109134
expected = [
110135
"bin",
@@ -118,15 +143,17 @@ def test_scanpipe_pipes_scancode_extract_archive_vmimage_qcow2(self):
118143
self.assertEqual(sorted(expected), sorted(results))
119144

120145
else:
121-
error = errors[0]
122-
self.assertTrue(
123-
any(
124-
[
125-
"Unable to read kernel" in error,
126-
"VM Image extraction only supported on Linux." in error,
127-
]
128-
)
129-
)
146+
expected = {
147+
str(input_location): [
148+
"Unable to read kernel at: /boot/vmlinuz-6.5.0-1022-azure.\n"
149+
"libguestfs requires the kernel executable to be readable.\n"
150+
"This is the case by default on most Linux distributions except on "
151+
"Ubuntu.\nPlease follow the ExtractCode installation instructions "
152+
"in the README.rst at:\n"
153+
"https://github.com/nexB/extractcode/blob/main/README.rst '\n"
154+
]
155+
}
156+
self.assertEqual(expected, errors)
130157

131158
def test_scanpipe_pipes_scancode_get_resource_info(self):
132159
input_location = str(self.data_location / "notice.NOTICE")

0 commit comments

Comments
 (0)