diff --git a/scancodeio/settings.py b/scancodeio/settings.py index 2ffacb19f..0351e61e2 100644 --- a/scancodeio/settings.py +++ b/scancodeio/settings.py @@ -20,6 +20,7 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import os import sys import tempfile from pathlib import Path @@ -367,6 +368,15 @@ PROJECT_DIR("static"), ] +# Media files (Uploaded package archives, etc.) + +MEDIA_URL = "/media/" +MEDIA_ROOT = os.path.join(str(ROOT_DIR), "media") + +# Package storage settings + +ENABLE_LOCAL_PACKAGE_STORAGE = env.bool("ENABLE_LOCAL_PACKAGE_STORAGE", default=False) + # Third-party apps CRISPY_TEMPLATE_PACK = "bootstrap3" diff --git a/scancodeio/urls.py b/scancodeio/urls.py index f0e475e17..35851e0fc 100644 --- a/scancodeio/urls.py +++ b/scancodeio/urls.py @@ -21,6 +21,7 @@ # Visit https://github.com/aboutcode-org/scancode.io for support and download. from django.conf import settings +from django.conf.urls.static import static from django.contrib.auth import views as auth_views from django.urls import include from django.urls import path @@ -54,6 +55,8 @@ path("", RedirectView.as_view(url="project/")), ] +urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) + if settings.SCANCODEIO_ENABLE_ADMIN_SITE: urlpatterns.append(path("admin/", admin_site.urls)) diff --git a/scanpipe/forms.py b/scanpipe/forms.py index f9f46da67..6537ddeee 100644 --- a/scanpipe/forms.py +++ b/scanpipe/forms.py @@ -165,6 +165,7 @@ class Meta: "pipeline", "execute_now", "selected_groups", + "use_local_storage", ] def __init__(self, *args, **kwargs): @@ -178,6 +179,12 @@ def __init__(self, *args, **kwargs): pipeline_choices = scanpipe_app.get_pipeline_choices(include_addon=False) self.fields["pipeline"].choices = pipeline_choices + self.fields["use_local_storage"].label = "Store packages locally" + self.fields[ + "use_local_storage" + ].help_text = "If checked, 
packages will be stored on the local filesystem." + self.fields["use_local_storage"].widget.attrs.update({"class": "checkbox"}) + def clean_name(self): return " ".join(self.cleaned_data["name"].split()) diff --git a/scanpipe/migrations/0068_squashed_package_archive.py b/scanpipe/migrations/0068_squashed_package_archive.py new file mode 100644 index 000000000..ffd6cd2c0 --- /dev/null +++ b/scanpipe/migrations/0068_squashed_package_archive.py @@ -0,0 +1,93 @@ +# Generated by Django 5.1.1 on 2025-07-09 + +import django.db.models.deletion +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('scanpipe', '0067_discoveredpackage_notes'), + ] + + operations = [ + migrations.CreateModel( + name='PackageArchive', + fields=[ + ('uuid', models.UUIDField( + db_index=True, default=uuid.uuid4, editable=False, primary_key=True, + serialize=False, verbose_name='UUID' + )), + ('checksum_sha256', models.CharField( + db_index=True, help_text='SHA256 checksum of the package archive file.', + max_length=64, unique=True + )), + ('storage_path', models.CharField( + blank=True, help_text='Path to the stored archive file', max_length=1024 + )), + ('created_date', models.DateTimeField( + auto_now_add=True, help_text='Date when the archive was added to storage.' 
+ )), + ('package_file', models.FileField( + blank=True, help_text='The actual package archive file ( ZIP or TAR).', + null=True, upload_to='packages/' + )), + ], + options={ + 'indexes': [models.Index(fields=['checksum_sha256'], name='checksum_idx')], + }, + ), + + migrations.CreateModel( + name='DownloadedPackage', + fields=[ + ('uuid', models.UUIDField( + db_index=True, default=uuid.uuid4, editable=False, primary_key=True, + serialize=False, verbose_name='UUID' + )), + ('url', models.URLField( + blank=True, db_index=True, help_text='URL from which the package was downloaded, if applicable.', + max_length=1024 + )), + ('filename', models.CharField( + help_text='Name of the package file.', max_length=255 + )), + ('download_date', models.DateTimeField( + auto_now_add=True, help_text='Date when the package was downloaded or added.' + )), + ('scan_log', models.TextField( + blank=True, help_text='Log output from scanning the package.' + )), + ('scan_date', models.DateTimeField( + blank=True, help_text='Date when the package was scanned.', null=True + )), + ('project', models.ForeignKey( + editable=False, on_delete=django.db.models.deletion.CASCADE, + related_name='downloadedpackages', to='scanpipe.project' + )), + ('package_archive', models.ForeignKey( + help_text='The stored archive file associated with this package.', + on_delete=django.db.models.deletion.CASCADE, to='scanpipe.packagearchive' + )), + ], + options={ + 'indexes': [models.Index(fields=['url'], name='url_idx')], + 'constraints': [ + models.UniqueConstraint( + condition=models.Q(('url__gt', '')), + fields=('url', 'project'), + name='scanpipe_downloadedpackage_unique_url_project' + ), + ], + }, + ), + + migrations.AddField( + model_name='project', + name='use_local_storage', + field=models.BooleanField( + default=False, help_text='Store packages locally if enabled.' 
+ ),
+ ),
+ ] diff --git a/scanpipe/models.py b/scanpipe/models.py index d763b9c03..10da7d769 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -585,6 +585,7 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model): ) notes = models.TextField(blank=True) settings = models.JSONField(default=dict, blank=True) + labels = TaggableManager(through=UUIDTaggedItem, ordering=["name"]) purl = models.CharField( max_length=2048, @@ -597,6 +598,9 @@ ), ) + use_local_storage = models.BooleanField( + default=False, help_text="Store packages locally if enabled." + ) objects = ProjectQuerySet.as_manager() class Meta: @@ -4393,6 +4397,116 @@ def success(self): return self.response_status_code in (200, 201, 202) +class PackageArchive(UUIDPKModel): + """ + Stores metadata about a package archive file stored in the project's storage. + Each archive is uniquely identified by its SHA256 checksum. + """ + + checksum_sha256 = models.CharField( + max_length=64, + unique=True, + db_index=True, + help_text=_("SHA256 checksum of the package archive file."), + ) + storage_path = models.CharField( + max_length=1024, + blank=True, + help_text=_("Path to the stored archive file"), + ) + package_file = models.FileField( + upload_to="packages/", + null=True, + blank=True, + help_text=_("The actual package archive file ( ZIP or TAR)."), + ) + created_date = models.DateTimeField( + auto_now_add=True, + help_text=_("Date when the archive was added to storage."), + ) + + class Meta: + pass + + def __str__(self): + return ( + f"Archive {self.checksum_sha256[:8]} at " + f" {self.storage_path or self.package_file.name}" + ) + + +class DownloadedPackage(UUIDPKModel): + """ + Tracks packages downloaded or provided as input for a project, linked to a
Each instance represents a package associated with a project, + including its source URL (if downloaded) and scan details. + """ + + project = models.ForeignKey( + Project, + related_name="downloadedpackages", + on_delete=models.CASCADE, + editable=False, + ) + url = models.URLField( + max_length=1024, + db_index=True, + blank=True, + help_text=_("URL from which the package was downloaded, if applicable."), + ) + filename = models.CharField( + max_length=255, + help_text=_("Name of the package file."), + ) + download_date = models.DateTimeField( + auto_now_add=True, + help_text=_("Date when the package was downloaded or added."), + ) + scan_log = models.TextField( + blank=True, + help_text=_("Log output from scanning the package."), + ) + scan_date = models.DateTimeField( + null=True, + blank=True, + help_text=_("Date when the package was scanned."), + ) + package_archive = models.ForeignKey( + PackageArchive, + on_delete=models.CASCADE, + help_text=_("The stored archive file associated with this package."), + ) + scancode_version = models.CharField( + max_length=50, + blank=True, + help_text=_("ScanCode version used for scanning."), + ) + pipeline_name = models.CharField( + max_length=100, + blank=True, + help_text=_("Pipeline used to process the package."), + ) + + class Meta: + indexes = [ + models.Index(fields=["url"], name="url_idx"), + ] + constraints = [ + models.UniqueConstraint( + fields=["url", "project"], + condition=Q(url__gt=""), + name="%(app_label)s_%(class)s_unique_url_project", + ), + models.UniqueConstraint( + fields=["project", "package_archive"], + name="%(app_label)s_%(class)s_unique_project_archive", + ), + ] + + def __str__(self): + return f"{self.filename} for project {self.project.name}" + + @receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL) def create_auth_token(sender, instance=None, created=False, **kwargs): """Create an API key token on user creation, using the signal system.""" diff --git 
a/scanpipe/pipelines/analyze_docker.py b/scanpipe/pipelines/analyze_docker.py index 59d7a3d92..a8d9983d2 100644 --- a/scanpipe/pipelines/analyze_docker.py +++ b/scanpipe/pipelines/analyze_docker.py @@ -20,9 +20,15 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import logging +from pathlib import Path + from scanpipe.pipelines.analyze_root_filesystem import RootFS from scanpipe.pipes import docker from scanpipe.pipes import rootfs +from scanpipe.pipes.fetch import store_package_archive + +logger = logging.getLogger(__name__) class Docker(RootFS): @@ -36,6 +42,7 @@ def steps(cls): cls.find_images_os_and_distro, cls.collect_images_information, cls.collect_and_create_codebase_resources, + cls.store_package_archives, cls.collect_and_create_system_packages, cls.flag_uninteresting_codebase_resources, cls.flag_empty_files, @@ -74,6 +81,40 @@ def collect_and_create_codebase_resources(self): """Collect and labels all image files as CodebaseResources.""" for image in self.images: docker.create_codebase_resources(self.project, image) + self.package_files = [] + for resource in self.project.codebaseresources.filter(extension=".deb"): + self.package_files.append(resource.path) + logger.debug(f"Found package file: {resource.path}") + + def store_package_archives(self): + """Store identified package archives.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." + "Skipping package storage." 
+ )
+ return []
+
+ logger.info(
+ f"Storing package archives for project: {self.project.name},"
+ f"files: {self.package_files}"
+ )
+ stored_files = []
+ for package_path in self.package_files:
+ if not Path(package_path).exists():
+ logger.error(f"Invalid or missing package path: {package_path}")
+ continue
+ package_path_str = str(package_path)
+ logger.info(f"Storing package archive: {package_path_str}")
+ try:
+ result = store_package_archive(
+ self.project, url=None, file_path=package_path_str
+ )
+ logger.info(f"Stored package archive {package_path_str}: {result}")
+ stored_files.append(result)
+ except Exception as e:
+ logger.error(f"Failed to store {package_path_str}: {e}")
+ return stored_files def collect_and_create_system_packages(self): """Collect installed system packages for each layer based on the distro.""" diff --git a/scanpipe/pipelines/analyze_docker_windows.py b/scanpipe/pipelines/analyze_docker_windows.py index d0809a51b..deb0d7819 100644 --- a/scanpipe/pipelines/analyze_docker_windows.py +++ b/scanpipe/pipelines/analyze_docker_windows.py @@ -20,10 +20,17 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+import logging +from pathlib import Path + from scanpipe.pipelines.analyze_docker import Docker from scanpipe.pipes import docker from scanpipe.pipes import rootfs from scanpipe.pipes import windows +from scanpipe.pipes.fetch import store_package_archive +from scanpipe.pipes.input import is_archive + +logger = logging.getLogger(__name__) class DockerWindows(Docker): @@ -37,6 +44,7 @@ def steps(cls): cls.find_images_os_and_distro, cls.collect_images_information, cls.collect_and_create_codebase_resources, + cls.store_package_archives, cls.collect_and_create_system_packages, cls.flag_known_software_packages, cls.flag_uninteresting_codebase_resources, @@ -50,6 +58,41 @@ def steps(cls): cls.flag_not_analyzed_codebase_resources, ) + def store_package_archives(self): + """Store identified package archives for Windows images.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." + "Skipping package storage." + ) + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + + package_files = [ + resource.path + for resource in self.project.codebaseresources.all() + if is_archive(resource.path) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + def flag_known_software_packages(self): """Flag files from known software packages by checking common install paths.""" windows.flag_known_software(self.project) diff --git 
a/scanpipe/pipelines/analyze_root_filesystem.py b/scanpipe/pipelines/analyze_root_filesystem.py index 76478ce6d..5446a1180 100644 --- a/scanpipe/pipelines/analyze_root_filesystem.py +++ b/scanpipe/pipelines/analyze_root_filesystem.py @@ -20,12 +20,17 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +from pathlib import Path + from extractcode import EXTRACT_SUFFIX +from scanpipe.models import DiscoveredPackage from scanpipe.pipelines import Pipeline from scanpipe.pipes import flag from scanpipe.pipes import rootfs from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive +from scanpipe.pipes.input import is_archive class RootFS(Pipeline): @@ -39,6 +44,7 @@ def steps(cls): cls.collect_rootfs_information, cls.collect_and_create_codebase_resources, cls.collect_and_create_system_packages, + cls.store_package_archives, cls.flag_uninteresting_codebase_resources, cls.flag_empty_files, cls.flag_ignored_resources, @@ -87,6 +93,76 @@ def collect_and_create_system_packages(self): for rfs in self.root_filesystems: rootfs.scan_rootfs_for_system_packages(self.project, rfs) + def store_package_archives(self): + """ + Store package archives (.deb, .apk) found in the root filesystem or fetch + them for detected system packages if configured to do so. + """ + if not self.project.use_local_storage: + self.log( + f"Local storage is disabled for project: {self.project.name}." + "Skipping package storage." 
+ )
+ return []
+ if not self.env.get("STORE_DOWNLOADED_PACKAGES", True):
+ self.log("Package storage skipped: STORE_DOWNLOADED_PACKAGES is disabled")
+ return
+
+ self.log(f"Storing package archives for project: {self.project.name}")
+ stored_files = []
+
+ package_files = [
+ resource.path
+ for resource in self.project.codebaseresources.all()
+ if is_archive(resource.path)
+ ]
+ for package_path in package_files:
+ if not Path(package_path).exists():
+ self.log(f"Package file not found: {package_path}", level="ERROR")
+ continue
+ result = store_package_archive(
+ self.project, url=None, file_path=str(package_path)
+ )
+ self.log(f"Stored package archive: {package_path}, Result: {result}")
+ stored_files.append(result)
+ system_packages = DiscoveredPackage.objects.filter(project=self.project)
+ self.log(f"Found {system_packages.count()} system packages")
+
+ for pkg in system_packages:
+ if "alpine" in pkg.purl:
+ pkg_name = pkg.name
+ pkg_version = pkg.version
+ apk_url = f"http://dl-cdn.alpinelinux.org/alpine/v3.18/main/x86_64/{pkg_name}-{pkg_version}.apk"
+ try:
+ import requests
+
+ response = requests.get(apk_url, stream=True, timeout=10)
+ response.raise_for_status()
+ dest_path = (
+ Path(self.project.work_directory)
+ / "tmp"
+ / f"{pkg_name}-{pkg_version}.apk"
+ )
+ dest_path.parent.mkdir(exist_ok=True)
+ with open(dest_path, "wb") as f:
+ for chunk in response.iter_content(1024):
+ f.write(chunk)
+ result = store_package_archive(
+ self.project, url=apk_url, file_path=str(dest_path)
+ )
+ self.log(
+ f"Stored system package archive: {pkg_name}, URL: {apk_url},"
+ f"Result: {result}"
+ )
+ stored_files.append(result)
+ except Exception as e:
+ self.log(
+ f"Failed to fetch/store {pkg_name} from {apk_url}: {e}",
+ level="WARNING",
+ )
+
+ return stored_files
+
 def flag_uninteresting_codebase_resources(self):
 """Flag files—not worth tracking—that don’t belong to any system packages."""
 rootfs.flag_uninteresting_codebase_resources(self.project) diff 
--git a/scanpipe/pipelines/deploy_to_develop.py b/scanpipe/pipelines/deploy_to_develop.py index 33f699472..30cde5154 100644 --- a/scanpipe/pipelines/deploy_to_develop.py +++ b/scanpipe/pipelines/deploy_to_develop.py @@ -20,6 +20,9 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import logging +from pathlib import Path + from aboutcode.pipeline import optional_step from scanpipe import pipes from scanpipe.pipelines import Pipeline @@ -30,6 +33,10 @@ from scanpipe.pipes import matchcode from scanpipe.pipes import purldb from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive +from scanpipe.pipes.input import is_archive + +logger = logging.getLogger(__name__) class DeployToDevelop(Pipeline): @@ -61,6 +68,7 @@ def steps(cls): cls.extract_inputs_to_codebase_directory, cls.extract_archives, cls.collect_and_create_codebase_resources, + cls.store_package_archives, cls.fingerprint_codebase_directories, cls.flag_empty_files, cls.flag_whitespace_files, @@ -98,6 +106,67 @@ def steps(cls): cls.create_local_files_packages, ) + purldb_package_extensions = [".jar", ".war", ".zip"] + purldb_resource_extensions = [ + ".map", + ".js", + ".mjs", + ".ts", + ".d.ts", + ".jsx", + ".tsx", + ".css", + ".scss", + ".less", + ".sass", + ".soy", + ".class", + ] + doc_extensions = [ + ".pdf", + ".doc", + ".docx", + ".ppt", + ".pptx", + ".tex", + ".odt", + ".odp", + ] + + def store_package_archives(self): + """Store package archives locally if enabled.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." + "Skipping package storage." 
+ ) + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + package_files = [ + resource.path + for resource in self.project.codebaseresources.all() + if is_archive(resource.path) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + def get_inputs(self): """Locate the ``from`` and ``to`` input files.""" self.from_files, self.to_files = d2d.get_inputs(self.project) diff --git a/scanpipe/pipelines/inspect_packages.py b/scanpipe/pipelines/inspect_packages.py index 7674f7f25..c48fe53ff 100644 --- a/scanpipe/pipelines/inspect_packages.py +++ b/scanpipe/pipelines/inspect_packages.py @@ -20,10 +20,14 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
-from aboutcode.pipeline import optional_step +import logging + +from aboutcode.pipeline import group from scanpipe.pipelines.scan_codebase import ScanCodebase from scanpipe.pipes import scancode +logger = logging.getLogger(__name__) + class InspectPackages(ScanCodebase): """ @@ -50,6 +54,7 @@ def steps(cls): cls.flag_empty_files, cls.flag_ignored_resources, cls.scan_for_application_packages, + cls.store_package_archives, cls.resolve_dependencies, ) @@ -65,7 +70,7 @@ def scan_for_application_packages(self): progress_logger=self.log, ) - @optional_step("StaticResolver") + @group("StaticResolver") def resolve_dependencies(self): """ Create packages and dependency relationships from diff --git a/scanpipe/pipelines/load_inventory.py b/scanpipe/pipelines/load_inventory.py index bcc10d61e..fbe3d9c66 100644 --- a/scanpipe/pipelines/load_inventory.py +++ b/scanpipe/pipelines/load_inventory.py @@ -21,9 +21,13 @@ # Visit https://github.com/aboutcode-org/scancode.io for support and download. import json +import logging from scanpipe.pipelines import Pipeline from scanpipe.pipes import input +from scanpipe.pipes.fetch import store_package_archive + +logger = logging.getLogger(__name__) class LoadInventory(Pipeline): @@ -42,6 +46,7 @@ class LoadInventory(Pipeline): def steps(cls): return ( cls.get_inputs, + cls.store_inventory_files, cls.build_inventory_from_scans, ) @@ -49,6 +54,35 @@ def get_inputs(self): """Locate all the supported input files from the project's input/ directory.""" self.input_paths = self.project.inputs(extensions=self.supported_extensions) + def store_inventory_files(self): + """Store input inventory files locally if enabled.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." + "Skipping file storage." 
+ ) + return [] + + logger.info(f"Storing inventory files for project: {self.project.name}") + stored_files = [] + + for input_path in self.input_paths: + if not input_path.exists(): + logger.error(f"Invalid or missing file path: {input_path}") + continue + input_path_str = str(input_path) + logger.info(f"Storing inventory file: {input_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=input_path_str + ) + logger.info(f"Stored inventory file {input_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {input_path_str}: {e}") + + return stored_files + def build_inventory_from_scans(self): """ Process JSON scan results files to populate packages, dependencies, and diff --git a/scanpipe/pipelines/load_sbom.py b/scanpipe/pipelines/load_sbom.py index 955e54dd9..7f5960e75 100644 --- a/scanpipe/pipelines/load_sbom.py +++ b/scanpipe/pipelines/load_sbom.py @@ -20,8 +20,14 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import logging +from pathlib import Path + from scanpipe.pipelines.scan_codebase import ScanCodebase from scanpipe.pipes import resolve +from scanpipe.pipes.fetch import store_package_archive + +logger = logging.getLogger(__name__) class LoadSBOM(ScanCodebase): @@ -44,6 +50,7 @@ def steps(cls): cls.flag_empty_files, cls.flag_ignored_resources, cls.get_sbom_inputs, + cls.store_sbom_files, cls.get_packages_from_sboms, cls.create_packages_from_sboms, cls.create_dependencies_from_sboms, @@ -53,6 +60,36 @@ def get_sbom_inputs(self): """Locate all the SBOMs among the codebase resources.""" self.manifest_resources = resolve.get_manifest_resources(self.project) + def store_sbom_files(self): + """Store SBOM files locally if enabled.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." 
+ "Skipping file storage." + ) + return [] + + logger.info(f"Storing SBOM files for project: {self.project.name}") + stored_files = [] + + for resource in self.manifest_resources: + resource_path = resource.path + if not Path(resource_path).exists(): + logger.error(f"Invalid or missing file path: {resource_path}") + continue + resource_path_str = str(resource_path) + logger.info(f"Storing SBOM file: {resource_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=resource_path_str + ) + logger.info(f"Stored SBOM file {resource_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {resource_path_str}: {e}") + + return stored_files + def get_packages_from_sboms(self): """Get packages data from SBOMs.""" self.packages = resolve.get_packages( diff --git a/scanpipe/pipelines/resolve_dependencies.py b/scanpipe/pipelines/resolve_dependencies.py index 0646a3fda..76fead95f 100644 --- a/scanpipe/pipelines/resolve_dependencies.py +++ b/scanpipe/pipelines/resolve_dependencies.py @@ -20,10 +20,18 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+import logging +from pathlib import Path + +from aboutcode.pipeline import group from aboutcode.pipeline import optional_step from scanpipe.pipelines.scan_codebase import ScanCodebase from scanpipe.pipes import resolve from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive +from scanpipe.pipes.input import is_archive + +logger = logging.getLogger(__name__) class ResolveDependencies(ScanCodebase): @@ -47,7 +55,9 @@ def steps(cls): cls.collect_and_create_codebase_resources, cls.flag_ignored_resources, cls.get_manifest_inputs, + cls.store_manifest_files, cls.scan_for_application_packages, + cls.store_package_archives, cls.create_packages_and_dependencies, cls.get_packages_from_manifest, cls.create_resolved_packages, @@ -57,7 +67,37 @@ def get_manifest_inputs(self): """Locate package manifest files with a supported package resolver.""" self.manifest_resources = resolve.get_manifest_resources(self.project) - @optional_step("StaticResolver") + def store_manifest_files(self): + """Store manifest files locally if enabled.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." + "Skipping file storage." 
+ ) + return [] + + logger.info(f"Storing manifest files for project: {self.project.name}") + stored_files = [] + + for resource in self.manifest_resources: + resource_path = resource.path + if not Path(resource_path).exists(): + logger.error(f"Invalid or missing file path: {resource_path}") + continue + resource_path_str = str(resource_path) + logger.info(f"Storing manifest file: {resource_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=resource_path_str + ) + logger.info(f"Stored manifest file {resource_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {resource_path_str}: {e}") + + return stored_files + + @group("StaticResolver") def scan_for_application_packages(self): """ Scan and assemble application packages from package manifests @@ -70,7 +110,41 @@ def scan_for_application_packages(self): progress_logger=self.log, ) - @optional_step("StaticResolver") + def store_package_archives(self): + """Store package archives locally if enabled.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." + "Skipping package storage." 
+ ) + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + package_files = [ + resource.path + for resource in self.project.codebaseresources.all() + if is_archive(resource.path) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + + @group("StaticResolver") def create_packages_and_dependencies(self): """ Create the statically resolved packages and their dependencies diff --git a/scanpipe/pipelines/scan_codebase.py b/scanpipe/pipelines/scan_codebase.py index d5bbe992c..df9be8198 100644 --- a/scanpipe/pipelines/scan_codebase.py +++ b/scanpipe/pipelines/scan_codebase.py @@ -20,10 +20,17 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+import logging +from pathlib import Path + from scanpipe import pipes from scanpipe.pipelines import Pipeline from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive from scanpipe.pipes.input import copy_inputs +from scanpipe.pipes.input import is_archive + +logger = logging.getLogger(__name__) class ScanCodebase(Pipeline): @@ -41,6 +48,7 @@ def steps(cls): cls.copy_inputs_to_codebase_directory, cls.extract_archives, cls.collect_and_create_codebase_resources, + cls.store_package_archives, cls.flag_empty_files, cls.flag_ignored_resources, cls.scan_for_application_packages, @@ -54,6 +62,40 @@ def copy_inputs_to_codebase_directory(self): """ copy_inputs(self.project.inputs("*"), self.project.codebase_path) + def store_package_archives(self): + """Store package archives locally if enabled.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." + "Skipping package storage." + ) + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + package_files = [ + resource.path + for resource in self.project.codebaseresources.all() + if is_archive(resource.path) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + def collect_and_create_codebase_resources(self): """Collect and create codebase resources.""" pipes.collect_and_create_codebase_resources(self.project) diff --git a/scanpipe/pipelines/scan_single_package.py 
def store_package(self):
    """
    Store the package archive and create a DownloadedPackage entry if applicable.

    Uses the ``input_path`` collected by ``get_package_input``. The source URL
    is not known at this stage, so the archive is stored from the local file
    only (``url=None``).

    Returns the ``store_package_archive`` result, ``None`` when no input file
    is available, or an empty list when local storage is disabled.
    """
    if not self.project.use_local_storage:
        # Fix: add the missing space between the two concatenated sentences.
        self.log(
            f"Local storage is disabled for project: {self.project.name}. "
            "Skipping package storage."
        )
        return []

    # Fix: the second fragment was a plain string, so "{self.input_path}"
    # was logged literally instead of being interpolated.
    logger.info(
        f"Starting store_package for project: {self.project}, "
        f"input: {self.input_path}"
    )

    file_path = self.input_path
    if not file_path:
        logger.error("No input file path available for storage")
        return None

    # The URL is unknown for pre-uploaded inputs; downloads are handled
    # separately in fetch_url().
    url = None

    logger.info(f"Calling store_package_archive with URL: {url}, File: {file_path}")
    result = store_package_archive(self.project, url=url, file_path=file_path)

    logger.info(f"store_package completed, result: {result}")
    return result
def store_package_archive(project, url=None, file_path=None, pipeline_name=None):
    """
    Store a package in PackageArchive and link it to DownloadedPackage.

    Args:
        project: The ScanCode.io Project instance.
        url (str, optional): The URL from which the package was downloaded.
        file_path (str or Path, optional): Path to the package file.
        pipeline_name: The name of the pipeline storing the package.

    Returns:
        DownloadedPackage: The created DownloadedPackage instance, or
        None if storage is disabled or an error occurs.

    """
    logger.info(
        f"store_package_archive called with project: {project}, "
        f"url: {url}, "
        f"file_path: {file_path}"
    )

    # Fix: settings.py defines ENABLE_LOCAL_PACKAGE_STORAGE; the previous
    # check used the non-existent ENABLE_PACKAGE_STORAGE name, so storage
    # was silently always disabled.
    if not getattr(settings, "ENABLE_LOCAL_PACKAGE_STORAGE", False):
        logger.info("Package storage disabled (ENABLE_LOCAL_PACKAGE_STORAGE=False)")
        return None

    if not file_path and not url:
        logger.error("Either file_path or url must be provided")
        return None

    content, filename = get_package_content_and_filename(file_path, url)
    if not content:
        return None

    archive = get_or_create_archive(content, file_path, filename)
    if not archive:
        return None

    return get_or_create_downloaded_package(
        project, url, filename, archive, pipeline_name
    )


def get_package_content_and_filename(file_path, url):
    """
    Return a ``(content, filename)`` tuple for the package, reading from
    ``file_path`` when provided, otherwise downloading from ``url``.
    Returns ``(None, None)`` on a missing file or a failed download.
    """
    if file_path:
        file_path = str(file_path)
        if not Path(file_path).exists():
            logger.error(f"File not found: {file_path}")
            return None, None
        with open(file_path, "rb") as f:
            content = f.read()
        filename = os.path.basename(file_path)
    else:
        try:
            response = requests.get(url, stream=True, timeout=HTTP_REQUEST_TIMEOUT)
            response.raise_for_status()
            content = response.content
            # Strip the query string before deriving a filename from the URL.
            filename = os.path.basename(url.split("?")[0])
        except requests.RequestException as e:
            logger.error(f"Failed to download {url}: {e}")
            return None, None
    return content, filename


def get_or_create_archive(content, file_path, filename):
    """
    Return the PackageArchive matching ``content``'s SHA256 checksum,
    creating it (and saving the file into storage) when none exists yet.
    Returns None when the archive cannot be created.
    """
    checksum = hashlib.sha256(content).hexdigest()
    logger.info(f"Calculated SHA256: {checksum}")

    # Deduplicate on checksum: identical archives share a single stored file.
    existing_archive = PackageArchive.objects.filter(checksum_sha256=checksum).first()
    if existing_archive:
        logger.info(f"Using existing package: {existing_archive.package_file.name}")
        return existing_archive

    try:
        archive = PackageArchive(
            checksum_sha256=checksum,
            size=len(content),
        )
        # Stream from disk when we have a path; fall back to the in-memory
        # content for URL-fetched packages.
        source = open(file_path, "rb") if file_path else BytesIO(content)
        with source as f:
            archive.package_file.save(filename, File(f), save=False)
        archive.save()
        logger.info(f"Created PackageArchive: {archive.checksum_sha256}")
        return archive
    except Exception as e:
        # Best-effort: storage failure must not abort the calling pipeline.
        logger.error(f"Error creating PackageArchive: {e}")
        return None
def get_or_create_downloaded_package(project, url, filename, archive, pipeline_name):
    """
    Create and return the DownloadedPackage row linking ``project`` to the
    stored ``archive``. Returns None when creation fails.
    """
    try:
        dp = DownloadedPackage.objects.create(
            project=project,
            url=url or "",
            filename=filename,
            package_archive=archive,
            scancode_version=scanpipe.__version__,
            pipeline_name=pipeline_name or "",
        )
        logger.info(f"DownloadedPackage created: {dp.url}, {dp.filename}")
        return dp
    except Exception as e:
        logger.error(f"Error creating DownloadedPackage: {e}")
        return None


def fetch_url(url, project=None):
    """
    Fetch provided `url` and returns the result as a `Download` object.

    When a ``project`` is provided and local package storage is enabled
    both globally and on the project, the downloaded archive is also
    stored through ``store_package_archive``.
    """
    fetcher = get_fetcher(url)
    # Guard: get_fetcher currently maps "pkg:" URLs back to fetch_url
    # itself, which would recurse forever. Fail loudly instead.
    if fetcher is fetch_url:
        raise ValueError(f"No direct fetcher available for {url}.")
    logger.info(f'Fetching "{url}" using {fetcher.__name__}')
    downloaded = fetcher(url)
    # Fix: gate on ENABLE_LOCAL_PACKAGE_STORAGE, the name actually defined
    # in settings.py (ENABLE_PACKAGE_STORAGE never exists).
    if (
        project
        and getattr(settings, "ENABLE_LOCAL_PACKAGE_STORAGE", False)
        and project.use_local_storage
    ):
        logger.info(f"Storing package for project: {project.name} with url={url}")
        store_package_archive(project, url, downloaded.path)
    elif project and not project.use_local_storage:
        logger.info(
            f"Skipping package storage for project: {project.name} "
            "(local storage disabled)"
        )
    return downloaded

+