
Commit 77faa3f

Raise the minimum RAM required per CPU in the docs #1191 (#1192)
Signed-off-by: tdruez <tdruez@nexb.com>
1 parent bb521c1 commit 77faa3f

5 files changed (+62, -14 lines)


CHANGELOG.rst

Lines changed: 9 additions & 0 deletions
@@ -31,6 +31,15 @@ v34.5.0 (unreleased)
 - Add "Product name" and "Product version" as new project settings.
   https://github.com/nexB/scancode.io/issues/1197
 
+- Add "Product name" and "Product version" as new project settings.
+  https://github.com/nexB/scancode.io/issues/1197
+
+- Raise the minimum RAM required per CPU in the docs.
+  A good rule of thumb is to allow **2 GB of memory per CPU**.
+  For example, if Docker is configured for 8 CPUs, a minimum of 16 GB of memory is
+  required.
+  https://github.com/nexB/scancode.io/issues/1191
+
 v34.4.0 (2024-04-22)
 --------------------
 

docs/installation.rst

Lines changed: 2 additions & 2 deletions
@@ -74,8 +74,8 @@ An overview of the web application usage is available at :ref:`user_interface`.
 
 **Make sure to allow enough memory to support each CPU processes**.
 
-A good rule of thumb is to allow **1 GB of memory per CPU**.
-For example, if Docker is configured for 8 CPUs, a minimum of 8 GB of memory is
+A good rule of thumb is to allow **2 GB of memory per CPU**.
+For example, if Docker is configured for 8 CPUs, a minimum of 16 GB of memory is
 required.
 
 .. tip::
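
As an illustration of the updated rule (not part of the diff above), a minimal sketch that prints the recommended minimum memory for the CPUs visible to the Python process; the GB_PER_CPU constant is a name made up for this sketch.

import os

# Rule of thumb from docs/installation.rst: at least 2 GB of memory per CPU.
GB_PER_CPU = 2  # hypothetical constant, not part of the codebase
cpu_count = os.cpu_count() or 1
print(f"{cpu_count} CPUs available: allocate at least {cpu_count * GB_PER_CPU} GB of memory.")

For 8 CPUs this prints 16 GB, matching the example added to the docs.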

scanpipe/pipelines/__init__.py

Lines changed: 15 additions & 3 deletions
@@ -168,6 +168,19 @@ def log(self, message):
         logger.info(message)
         self.run.append_to_log(message)
 
+    @staticmethod
+    def output_from_exception(exception):
+        """Return a formatted error message including the traceback."""
+        output = f"{exception}\n\n"
+
+        if exception.__cause__ and str(exception.__cause__) != str(exception):
+            output += f"Cause: {exception.__cause__}\n\n"
+
+        traceback_formatted = "".join(traceback.format_tb(exception.__traceback__))
+        output += f"Traceback:\n{traceback_formatted}"
+
+        return output
+
     def execute(self):
         """Execute each steps in the order defined on this pipeline class."""
         self.log(f"Pipeline [{self.pipeline_name}] starting")
@@ -189,10 +202,9 @@ def execute(self):
 
             try:
                 step(self)
-            except Exception as e:
+            except Exception as exception:
                 self.log("Pipeline failed")
-                tb = "".join(traceback.format_tb(e.__traceback__))
-                return 1, f"{e}\n\nTraceback:\n{tb}"
+                return 1, self.output_from_exception(exception)
 
             step_run_time = timer() - step_start_time
             self.log(f"Step [{step_name}] completed in {humanize_time(step_run_time)}")
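
For context rather than as part of the diff, here is a minimal standalone sketch of what the new output_from_exception helper produces for a chained exception. The function body mirrors the hunk above; the try/except demo and its exception messages are made up for illustration.

import traceback


def output_from_exception(exception):
    """Return a formatted error message including the traceback."""
    output = f"{exception}\n\n"

    if exception.__cause__ and str(exception.__cause__) != str(exception):
        output += f"Cause: {exception.__cause__}\n\n"

    traceback_formatted = "".join(traceback.format_tb(exception.__traceback__))
    output += f"Traceback:\n{traceback_formatted}"

    return output


try:
    try:
        raise ValueError("not enough memory")  # made-up root cause
    except ValueError as cause:
        raise RuntimeError("scan step failed") from cause  # made-up pipeline error
except RuntimeError as exc:
    # The failure report now includes the message, its Cause:, and the traceback,
    # instead of only the message and traceback as before this commit.
    print(output_from_exception(exc))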

scanpipe/pipes/scancode.py

Lines changed: 32 additions & 7 deletions
@@ -20,13 +20,14 @@
 # ScanCode.io is a free software code scanning tool from nexB Inc. and others.
 # Visit https://github.com/nexB/scancode.io for support and download.
 
-import concurrent.futures
 import json
 import logging
 import multiprocessing
 import os
 import shlex
+import warnings
 from collections import defaultdict
+from concurrent import futures
 from functools import partial
 from pathlib import Path
 
@@ -57,6 +58,10 @@
 scanpipe_app = apps.get_app_config("scanpipe")
 
 
+class InsufficientResourcesError(Exception):
+    pass
+
+
 def get_max_workers(keep_available):
     """
     Return the `SCANCODEIO_PROCESSES` if defined in the setting,
@@ -67,16 +72,28 @@ def get_max_workers(keep_available):
     but for example "spawn", such as on macOS, multiprocessing and threading are
     disabled by default returning -1 `max_workers`.
     """
-    processes = settings.SCANCODEIO_PROCESSES
-    if processes is not None:
-        return processes
+    processes_from_settings = settings.SCANCODEIO_PROCESSES
+    if processes_from_settings in [-1, 0, 1]:
+        return processes_from_settings
 
     if multiprocessing.get_start_method() != "fork":
         return -1
 
     max_workers = os.cpu_count() - keep_available
     if max_workers < 1:
         return 1
+
+    if processes_from_settings is not None:
+        if processes_from_settings <= max_workers:
+            return processes_from_settings
+        else:
+            msg = (
+                f"The value {processes_from_settings} specified in SCANCODEIO_PROCESSES"
+                f" exceeds the number of available CPUs on this machine."
+                f" {max_workers} CPUs will be used instead for multiprocessing."
+            )
+            warnings.warn(msg, ResourceWarning)
+
     return max_workers
 
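
A compressed sketch of the new capping behavior, standalone here with the settings value passed as an argument instead of read from Django settings: explicit values of -1, 0, or 1 are returned as-is, and anything larger than the CPUs actually available triggers a ResourceWarning and falls back to the machine's own maximum.

import multiprocessing
import os
import warnings


def capped_max_workers(processes_from_settings, keep_available=0):
    # Simplified stand-in for scanpipe.pipes.scancode.get_max_workers().
    if processes_from_settings in [-1, 0, 1]:
        return processes_from_settings
    if multiprocessing.get_start_method() != "fork":
        return -1
    max_workers = max(os.cpu_count() - keep_available, 1)
    if processes_from_settings is not None and processes_from_settings > max_workers:
        warnings.warn(
            f"{processes_from_settings} exceeds the available CPUs;"
            f" {max_workers} will be used instead.",
            ResourceWarning,
        )
        return max_workers
    return processes_from_settings or max_workers


print(capped_max_workers(processes_from_settings=128))  # capped on most machines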

@@ -305,20 +322,28 @@ def scan_resources(
 
     logger.info(f"Starting ProcessPoolExecutor with {max_workers} max_workers")
 
-    with concurrent.futures.ProcessPoolExecutor(max_workers) as executor:
+    with futures.ProcessPoolExecutor(max_workers) as executor:
         future_to_resource = {
             executor.submit(scan_func, resource.location): resource
             for resource in resource_iterator
         }
 
         # Iterate over the Futures as they complete (finished or cancelled)
-        future_as_completed = concurrent.futures.as_completed(future_to_resource)
+        future_as_completed = futures.as_completed(future_to_resource)
 
         for future in progress.iter(future_as_completed):
            resource = future_to_resource[future]
            progress.log_progress()
            logger.debug(f"{scan_func.__name__} pk={resource.pk}")
-            scan_results, scan_errors = future.result()
+            try:
+                scan_results, scan_errors = future.result()
+            except futures.process.BrokenProcessPool as broken_pool_error:
+                message = (
+                    "You may not have enough resources to complete this operation. "
+                    "Please ensure that there is at least 2 GB of available memory per "
+                    "CPU core for successful execution."
+                )
+                raise broken_pool_error from InsufficientResourcesError(message)
             save_func(resource, scan_results, scan_errors)
 
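
Again as illustration only, the pattern the hunk above introduces, reduced to a self-contained sketch: a worker crash surfaces as futures.process.BrokenProcessPool, which is re-raised chained to a more actionable error. The work function and the pool size are hypothetical stand-ins for scan_func and max_workers.

from concurrent import futures


class InsufficientResourcesError(Exception):
    pass


def work(value):  # hypothetical worker standing in for scan_func
    return value * 2


if __name__ == "__main__":
    with futures.ProcessPoolExecutor(max_workers=2) as executor:
        future_to_value = {executor.submit(work, value): value for value in range(4)}
        for future in futures.as_completed(future_to_value):
            try:
                result = future.result()
            except futures.process.BrokenProcessPool as broken_pool_error:
                # Raised when a worker dies unexpectedly, e.g. killed by the OOM killer.
                message = (
                    "You may not have enough resources to complete this operation. "
                    "Please ensure that there is at least 2 GB of available memory "
                    "per CPU core for successful execution."
                )
                raise broken_pool_error from InsufficientResourcesError(message)
            print(future_to_value[future], "->", result)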

scanpipe/tests/pipes/test_scancode.py

Lines changed: 4 additions & 2 deletions
@@ -21,6 +21,7 @@
 # Visit https://github.com/nexB/scancode.io for support and download.
 
 import json
+import multiprocessing
 import os
 import sys
 import tempfile
@@ -434,10 +435,11 @@ def test_scanpipe_pipes_scancode_run_scan_args(self, mock_run_scan):
         run_scan_kwargs = mock_run_scan.call_args.kwargs
         self.assertEqual(10, run_scan_kwargs.get("timeout"))
 
-        with override_settings(SCANCODEIO_PROCESSES=10):
+        expected_processes = -1 if multiprocessing.get_start_method() != "fork" else 2
+        with override_settings(SCANCODEIO_PROCESSES=2):
             scancode.run_scan(location=None, output_file=output_file, run_scan_args={})
             run_scan_kwargs = mock_run_scan.call_args.kwargs
-            self.assertEqual(10, run_scan_kwargs.get("processes"))
+            self.assertEqual(expected_processes, run_scan_kwargs.get("processes"))
 
     def test_scanpipe_pipes_scancode_make_results_summary(self, regen=FIXTURES_REGEN):
         # Ensure the policies index is empty to avoid any side effect on results
