Skip to content

Commit 5de4116

Browse files
authored
Add support for the extra_data value from the JSON input in load_inventory #926 (#1507)
Signed-off-by: tdruez <tdruez@nexb.com>
1 parent 7b00d3f commit 5de4116

File tree

5 files changed

+87
-5
lines changed

5 files changed

+87
-5
lines changed

CHANGELOG.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ v34.9.4 (unreleased)
1010
Any paginated view can now be navigated using the left/right keyboard keys.
1111
https://github.com/aboutcode-org/scancode.io/issues/1200
1212

13+
- Add support for importing the ``extra_data`` value from the JSON input with the
14+
``load_inventory`` pipeline.
15+
When multiple JSON files are provided as inputs, the ``extra_data`` is prefixed with
16+
the input filename.
17+
https://github.com/aboutcode-org/scancode.io/issues/926
18+
1319
v34.9.3 (2024-12-31)
1420
--------------------
1521

scanpipe/pipelines/load_inventory.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,28 @@ def build_inventory_from_scans(self):
5454
Process JSON scan results files to populate packages, dependencies, and
5555
resources.
5656
"""
57+
self.input_paths = list(self.input_paths)
58+
is_single_input = len(self.input_paths) == 1
59+
5760
for input_path in self.input_paths:
61+
extra_data_prefix = None if is_single_input else input_path.name
62+
5863
if input_path.suffix.endswith(".xlsx"):
59-
input.load_inventory_from_xlsx(self.project, input_path)
64+
input.load_inventory_from_xlsx(
65+
self.project, input_path, extra_data_prefix
66+
)
6067
continue
6168

6269
scan_data = json.loads(input_path.read_text())
6370
tool_name = input.get_tool_name_from_scan_headers(scan_data)
6471

6572
if tool_name == "scancode-toolkit":
6673
input.load_inventory_from_toolkit_scan(self.project, input_path)
74+
6775
elif tool_name == "scanpipe":
68-
input.load_inventory_from_scanpipe(self.project, scan_data)
76+
input.load_inventory_from_scanpipe(
77+
self.project, scan_data, extra_data_prefix
78+
)
79+
6980
else:
7081
raise Exception(f"Input not supported: {str(input_path)} ")

scanpipe/pipes/input.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,14 @@ def get_tool_name_from_scan_headers(scan_data):
8282
return tool_name
8383

8484

85+
def get_extra_data_from_scan_headers(scan_data):
    """
    Return the ``extra_data`` of the first header in the provided ``scan_data``.

    Return ``None`` when ``scan_data`` has no headers, or when the first header
    has no ``extra_data`` or an empty one.
    """
    headers = scan_data.get("headers")
    if not headers:
        return None

    # Only the first header is inspected; any following headers are ignored.
    extra_data = headers[0].get("extra_data")
    if not extra_data:
        return None

    return extra_data
91+
92+
8593
def is_archive(location):
8694
"""Return True if the file at ``location`` is an archive."""
8795
return get_type(location).is_archive
@@ -100,10 +108,13 @@ def load_inventory_from_toolkit_scan(project, input_location):
100108
)
101109

102110

103-
def load_inventory_from_scanpipe(project, scan_data):
111+
def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None):
104112
"""
105113
Create packages, dependencies, resources, and relations loaded from a ScanCode.io
106114
JSON output provided as ``scan_data``.
115+
116+
An ``extra_data_prefix`` can be provided in case multiple input files are loaded
117+
into the same project. The prefix is usually the filename of the input.
107118
"""
108119
for package_data in scan_data.get("packages", []):
109120
pipes.update_or_create_package(project, package_data)
@@ -117,6 +128,11 @@ def load_inventory_from_scanpipe(project, scan_data):
117128
for relation_data in scan_data.get("relations", []):
118129
pipes.get_or_create_relation(project, relation_data)
119130

131+
if extra_data := get_extra_data_from_scan_headers(scan_data):
132+
if extra_data_prefix:
133+
extra_data = {extra_data_prefix: extra_data}
134+
project.update_extra_data(extra_data)
135+
120136

121137
model_to_object_maker_func = {
122138
DiscoveredPackage: pipes.update_or_create_package,
@@ -186,10 +202,13 @@ def clean_xlsx_data_to_model_data(model_class, xlsx_data):
186202
return cleaned_data
187203

188204

189-
def load_inventory_from_xlsx(project, input_location):
205+
def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None):
190206
"""
191207
Create packages, dependencies, resources, and relations loaded from XLSX file
192208
located at ``input_location``.
209+
210+
An ``extra_data_prefix`` can be provided in case multiple input files are loaded
211+
into the same project. The prefix is usually the filename of the input.
193212
"""
194213
workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True)
195214

@@ -206,4 +225,7 @@ def load_inventory_from_xlsx(project, input_location):
206225

207226
if "LAYERS" in workbook:
208227
layers_data = get_worksheet_data(worksheet=workbook["LAYERS"])
209-
project.update_extra_data({"layers": layers_data})
228+
extra_data = {"layers": layers_data}
229+
if extra_data_prefix:
230+
extra_data = {extra_data_prefix: extra_data}
231+
project.update_extra_data(extra_data)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"layers": [
3+
{
4+
"layer_tag": "img-12ebda-layer-01-1a058d",
5+
"created_by": "/bin/sh -c #(nop) ADD file:762c899ec0505d1a32930ee804c5b008825f41611161be104076cba33b7e5b2b in / ",
6+
"layer_id": "1a058d5342cc722ad5439cacae4b2b4eedde51d8fe8800fcf28444302355c16d",
7+
"image_id": "12ebda3111cec73a788b0e802a00de04ebf5e9765043925dd396c2d03a7c1e66",
8+
"created": "2021-11-12T17:19:44.795237917Z",
9+
"size": "5886464",
10+
"author": null,
11+
"comment": null,
12+
"archive_location": "ghcr_io_kyverno_sbom.tar-extract/1a058d5342cc722ad5439cacae4b2b4eedde51d8fe8800fcf28444302355c16d.tar",
13+
"xlsx_errors": null
14+
}
15+
]
16+
}

scanpipe/tests/pipes/test_input.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,25 @@ def test_scanpipe_pipes_scancode_load_inventory_from_scanpipe_with_relations(sel
109109
self.assertEqual(57, project.codebaseresources.count())
110110
self.assertEqual(18, project.codebaserelations.count())
111111

112+
def test_scanpipe_pipes_scancode_load_inventory_extra_data(self):
    """Check that the first header ``extra_data`` is loaded on the project."""
    project = Project.objects.create(name="1")
    scan_file = self.data / "asgiref" / "asgiref-3.3.0_scanpipe_output.json"
    scan_data = json.loads(scan_file.read_text())

    # Inject an ``extra_data`` value in the first header of the scan.
    header_extra_data = {"key": "value"}
    scan_data["headers"][0]["extra_data"] = header_extra_data

    # Without a prefix, the project ``extra_data`` equals the header value.
    input.load_inventory_from_scanpipe(project, scan_data)
    project.refresh_from_db()
    self.assertEqual(header_extra_data, project.extra_data)

    # Reset the project before loading the same scan with a prefix.
    project.extra_data = {}
    project.save()

    # With a prefix, the header value is nested under the prefix key.
    prefix = "file.ext"
    input.load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=prefix)
    project.refresh_from_db()
    self.assertEqual({"file.ext": header_extra_data}, project.extra_data)
130+
112131
def test_scanpipe_pipes_input_load_inventory_from_xlsx(self):
113132
project1 = Project.objects.create(name="Analysis")
114133
input_location = self.data / "outputs" / "asgiref-3.6.0-output.xlsx"
@@ -129,6 +148,14 @@ def test_scanpipe_pipes_input_load_inventory_from_xlsx_layers_sheet(self):
129148
expected = json.loads(expected_location.read_text())
130149
self.assertEqual(expected, project1.extra_data)
131150

151+
project1.extra_data = {}
152+
project1.save()
153+
input.load_inventory_from_xlsx(
154+
project1, input_location, extra_data_prefix="file.ext"
155+
)
156+
project1.refresh_from_db()
157+
self.assertEqual({"file.ext": expected}, project1.extra_data)
158+
132159
def test_scanpipe_pipes_input_load_inventory_from_project_xlsx_output(self):
133160
fixtures = self.data / "asgiref" / "asgiref-3.3.0_fixtures.json"
134161
call_command("loaddata", fixtures, **{"verbosity": 0})

0 commit comments

Comments
 (0)