Skip to content

add support to store packages/archives locally #1685

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 34 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0cade5e
add support to store packages/archives locally
VarshaUN Jun 10, 2025
0515605
resolve conflicts
VarshaUN Jun 11, 2025
6d140c2
Merge branch 'main' into store-all-downloaded-packages
VarshaUN Jun 11, 2025
c845a20
Add required imports
VarshaUN Jun 12, 2025
ca3a1ac
Merge branch 'main' into store-all-downloaded-packages
VarshaUN Jun 23, 2025
993d80c
add support to store packages/archives locally
VarshaUN Jun 10, 2025
59fc6fa
Add support for using Package URL (purl) as project input. (#1686)
tdruez Jun 11, 2025
dd80c83
Add required imports
VarshaUN Jun 12, 2025
3494bee
Upgrade the scancode-action references to aboutcode-org
tdruez Jun 11, 2025
33fc3c7
Add filtering by label and pipeline in the ``flush-projects`` managem…
tdruez Jun 13, 2025
ecf3a21
Upgrade Ace library to latest version 1.42.0 (#1692)
tdruez Jun 16, 2025
5c83405
Display matched snippets details in "Resource viewer" (#1693)
tdruez Jun 16, 2025
5c452b0
Rename the parent_path function to parent_directory #1691 (#1694)
tdruez Jun 16, 2025
d0f272f
Upgrade Django to latest security release (#1695)
tdruez Jun 18, 2025
ad6b14e
Add "(No value detected)" for Copyright and Holder charts #1697 (#1698)
tdruez Jun 19, 2025
5ebe370
Add support for Python3.13 (#1469)
tdruez Jun 19, 2025
9384133
Add "Package Compliance Alert" chart in the Policies section (#1699)
tdruez Jun 19, 2025
50fa74b
Add unit test for the "Package Compliance Alert" chart #1699
tdruez Jun 19, 2025
b117064
Add d2d symbols matching for winpe macho binaries (#1674)
AyanSinhaMahapatra Jun 20, 2025
a773009
Bump version to v35.0.0 for release
tdruez Jun 23, 2025
f4bad69
add required imports
VarshaUN Jun 28, 2025
22c50af
Merge branch 'main' into store-all-downloaded-packages
VarshaUN Jul 1, 2025
7d553ca
Update scancodeio/settings.py
VarshaUN Jul 1, 2025
81942b6
add support for extra extensions
VarshaUN Jul 2, 2025
f818e62
fix CI failures
VarshaUN Jul 2, 2025
18e66a3
add required imports
VarshaUN Jul 2, 2025
04b6cef
remove duplicate code snippets
VarshaUN Jul 2, 2025
e0a59f6
fix errors
VarshaUN Jul 2, 2025
2b62163
fix linters
VarshaUN Jul 2, 2025
5632bf7
fix errors
VarshaUN Jul 3, 2025
d81ebd0
add support to not re-scan packages
VarshaUN Jul 3, 2025
33dfe79
fix errors
VarshaUN Jul 8, 2025
435b2d9
add support for functionc all
VarshaUN Jul 8, 2025
68bb105
squash the migration files
VarshaUN Jul 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions scancodeio/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import os
import sys
import tempfile
from pathlib import Path
Expand Down Expand Up @@ -367,6 +368,15 @@
PROJECT_DIR("static"),
]

# Media files (Uploaded package archives, etc.)

# URL prefix and on-disk directory (a "media" folder under ROOT_DIR) for media
# files.
MEDIA_URL = "/media/"
MEDIA_ROOT = os.path.join(str(ROOT_DIR), "media")

# Package storage settings

# Opt-in flag read from the ENABLE_LOCAL_PACKAGE_STORAGE environment variable;
# falls back to False when the variable is not set.
ENABLE_LOCAL_PACKAGE_STORAGE = env.bool("ENABLE_LOCAL_PACKAGE_STORAGE", default=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on our review session what about using this instead?

ENABLE_DOWNLOAD_ARCHIVING = env.bool("ENABLE_DOWNLOAD_ARCHIVING", default=False)
# Provider name, e.g. localstorage or s3
DOWNLOAD_ARCHIVING_PROVIDER = env.str("DOWNLOAD_ARCHIVING_PROVIDER", default=None) 
# For local storage, the root path would be stored in the setting below
DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict("DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None) 


# Third-party apps

CRISPY_TEMPLATE_PACK = "bootstrap3"
Expand Down
3 changes: 3 additions & 0 deletions scancodeio/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from django.conf import settings
from django.conf.urls.static import static
from django.contrib.auth import views as auth_views
from django.urls import include
from django.urls import path
Expand Down Expand Up @@ -54,6 +55,8 @@
path("", RedirectView.as_view(url="project/")),
]

urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do not use media for our storage; we are using our own storage mechanism instead.



if settings.SCANCODEIO_ENABLE_ADMIN_SITE:
urlpatterns.append(path("admin/", admin_site.urls))
Expand Down
7 changes: 7 additions & 0 deletions scanpipe/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class Meta:
"pipeline",
"execute_now",
"selected_groups",
"use_local_storage",
]

def __init__(self, *args, **kwargs):
Expand All @@ -178,6 +179,12 @@ def __init__(self, *args, **kwargs):
pipeline_choices = scanpipe_app.get_pipeline_choices(include_addon=False)
self.fields["pipeline"].choices = pipeline_choices

self.fields["use_local_storage"].label = "Store packages locally"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leave forms for later

self.fields[
"use_local_storage"
].help_text = "If checked, packages will be stored on the local filesystem."
self.fields["use_local_storage"].widget.attrs.update({"class": "checkbox"})

def clean_name(self):
return " ".join(self.cleaned_data["name"].split())

Expand Down
93 changes: 93 additions & 0 deletions scanpipe/migrations/0068_squashed_package_archive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Generated by Django 5.1.1 on 2025-07-09

import django.db.models.deletion
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No model change so no migrations.

import uuid
from django.db import migrations, models


class Migration(migrations.Migration):
    """
    Create the PackageArchive and DownloadedPackage models and add the
    ``use_local_storage`` flag to Project.

    The field list, indexes and constraints mirror the model declarations so
    that ``makemigrations`` does not detect pending changes after this
    migration is applied.
    """

    dependencies = [
        ('scanpipe', '0067_discoveredpackage_notes'),
    ]

    operations = [
        migrations.CreateModel(
            name='PackageArchive',
            fields=[
                ('uuid', models.UUIDField(
                    db_index=True, default=uuid.uuid4, editable=False, primary_key=True,
                    serialize=False, verbose_name='UUID'
                )),
                ('checksum_sha256', models.CharField(
                    db_index=True, help_text='SHA256 checksum of the package archive file.',
                    max_length=64, unique=True
                )),
                ('storage_path', models.CharField(
                    blank=True, help_text='Path to the stored archive file', max_length=1024
                )),
                ('created_date', models.DateTimeField(
                    auto_now_add=True, help_text='Date when the archive was added to storage.'
                )),
                ('package_file', models.FileField(
                    blank=True, help_text='The actual package archive file ( ZIP or TAR).',
                    null=True, upload_to='packages/'
                )),
            ],
            # No "indexes" option: PackageArchive.Meta declares none, and the
            # checksum_sha256 field is already declared unique and db_index.
        ),

        migrations.CreateModel(
            name='DownloadedPackage',
            fields=[
                ('uuid', models.UUIDField(
                    db_index=True, default=uuid.uuid4, editable=False, primary_key=True,
                    serialize=False, verbose_name='UUID'
                )),
                ('url', models.URLField(
                    blank=True, db_index=True, help_text='URL from which the package was downloaded, if applicable.',
                    max_length=1024
                )),
                ('filename', models.CharField(
                    help_text='Name of the package file.', max_length=255
                )),
                ('download_date', models.DateTimeField(
                    auto_now_add=True, help_text='Date when the package was downloaded or added.'
                )),
                ('scan_log', models.TextField(
                    blank=True, help_text='Log output from scanning the package.'
                )),
                ('scan_date', models.DateTimeField(
                    blank=True, help_text='Date when the package was scanned.', null=True
                )),
                ('project', models.ForeignKey(
                    editable=False, on_delete=django.db.models.deletion.CASCADE,
                    related_name='downloadedpackages', to='scanpipe.project'
                )),
                ('package_archive', models.ForeignKey(
                    help_text='The stored archive file associated with this package.',
                    on_delete=django.db.models.deletion.CASCADE, to='scanpipe.packagearchive'
                )),
                # The two fields below are declared on the model and were
                # missing from this migration.
                ('scancode_version', models.CharField(
                    blank=True, help_text='ScanCode version used for scanning.', max_length=50
                )),
                ('pipeline_name', models.CharField(
                    blank=True, help_text='Pipeline used to process the package.', max_length=100
                )),
            ],
            options={
                'indexes': [models.Index(fields=['url'], name='url_idx')],
                'constraints': [
                    # Only enforced for rows with a non-empty URL.
                    models.UniqueConstraint(
                        condition=models.Q(('url__gt', '')),
                        fields=('url', 'project'),
                        name='scanpipe_downloadedpackage_unique_url_project'
                    ),
                    # Declared in DownloadedPackage.Meta; was missing here.
                    models.UniqueConstraint(
                        fields=('project', 'package_archive'),
                        name='scanpipe_downloadedpackage_unique_project_archive'
                    ),
                ],
            },
        ),

        migrations.AddField(
            model_name='project',
            name='use_local_storage',
            field=models.BooleanField(
                default=False, help_text='Store packages locally if enabled.'
            ),
        ),
    ]
115 changes: 115 additions & 0 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,7 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model):
)
notes = models.TextField(blank=True)
settings = models.JSONField(default=dict, blank=True)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the latest review, we do not need models yet.

labels = TaggableManager(through=UUIDTaggedItem, ordering=["name"])
purl = models.CharField(
max_length=2048,
Expand All @@ -597,6 +598,10 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model):
),
)

labels = TaggableManager(through=UUIDTaggedItem)
use_local_storage = models.BooleanField(
default=False, help_text="Store packages locally if enabled."
)
objects = ProjectQuerySet.as_manager()

class Meta:
Expand Down Expand Up @@ -4393,6 +4398,116 @@ def success(self):
return self.response_status_code in (200, 201, 202)


class PackageArchive(UUIDPKModel):
    """
    Store metadata about a package archive file kept in storage.

    Each archive is uniquely identified by its SHA256 checksum.
    """

    checksum_sha256 = models.CharField(
        max_length=64,
        unique=True,
        db_index=True,
        help_text=_("SHA256 checksum of the package archive file."),
    )
    storage_path = models.CharField(
        max_length=1024,
        blank=True,
        help_text=_("Path to the stored archive file"),
    )
    package_file = models.FileField(
        upload_to="packages/",
        null=True,
        blank=True,
        help_text=_("The actual package archive file ( ZIP or TAR)."),
    )
    created_date = models.DateTimeField(
        auto_now_add=True,
        help_text=_("Date when the archive was added to storage."),
    )

    class Meta:
        pass

    def __str__(self):
        """Return a short label: checksum prefix plus where the file lives."""
        # Fall back to the FileField name when no explicit storage path is set.
        location = self.storage_path or self.package_file.name
        # Single space after "at": the previous two-fragment f-string emitted
        # "at  <location>" with a doubled space.
        return f"Archive {self.checksum_sha256[:8]} at {location}"


class DownloadedPackage(UUIDPKModel):
    """
    Record one package that was downloaded for, or supplied as input to, a
    project.

    Every row points at the PackageArchive holding the file and keeps the
    source URL (when there is one) together with details about its scan.
    """

    project = models.ForeignKey(
        Project,
        on_delete=models.CASCADE,
        related_name="downloadedpackages",
        editable=False,
    )
    url = models.URLField(
        blank=True,
        db_index=True,
        max_length=1024,
        help_text=_("URL from which the package was downloaded, if applicable."),
    )
    filename = models.CharField(
        help_text=_("Name of the package file."),
        max_length=255,
    )
    download_date = models.DateTimeField(
        help_text=_("Date when the package was downloaded or added."),
        auto_now_add=True,
    )
    scan_log = models.TextField(
        help_text=_("Log output from scanning the package."),
        blank=True,
    )
    scan_date = models.DateTimeField(
        blank=True,
        null=True,
        help_text=_("Date when the package was scanned."),
    )
    package_archive = models.ForeignKey(
        PackageArchive,
        help_text=_("The stored archive file associated with this package."),
        on_delete=models.CASCADE,
    )
    scancode_version = models.CharField(
        blank=True,
        max_length=50,
        help_text=_("ScanCode version used for scanning."),
    )
    pipeline_name = models.CharField(
        blank=True,
        max_length=100,
        help_text=_("Pipeline used to process the package."),
    )

    class Meta:
        constraints = [
            # A non-empty URL may appear only once per project.
            models.UniqueConstraint(
                name="%(app_label)s_%(class)s_unique_url_project",
                fields=["url", "project"],
                condition=Q(url__gt=""),
            ),
            # A given archive may be attached to a project only once.
            models.UniqueConstraint(
                name="%(app_label)s_%(class)s_unique_project_archive",
                fields=["project", "package_archive"],
            ),
        ]
        indexes = [
            models.Index(name="url_idx", fields=["url"]),
        ]

    def __str__(self):
        """Return the file name followed by the owning project's name."""
        project_name = self.project.name
        return f"{self.filename} for project {project_name}"


@receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL)
def create_auth_token(sender, instance=None, created=False, **kwargs):
"""Create an API key token on user creation, using the signal system."""
Expand Down
41 changes: 41 additions & 0 deletions scanpipe/pipelines/analyze_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import logging
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's instead make this change once in the superclass rather than modifying each of the pipelines.

from pathlib import Path

from scanpipe.pipelines.analyze_root_filesystem import RootFS
from scanpipe.pipes import docker
from scanpipe.pipes import rootfs
from scanpipe.pipes.fetch import store_package_archive

logger = logging.getLogger(__name__)


class Docker(RootFS):
Expand All @@ -36,6 +42,7 @@ def steps(cls):
cls.find_images_os_and_distro,
cls.collect_images_information,
cls.collect_and_create_codebase_resources,
cls.store_package_archives,
cls.collect_and_create_system_packages,
cls.flag_uninteresting_codebase_resources,
cls.flag_empty_files,
Expand Down Expand Up @@ -74,6 +81,40 @@ def collect_and_create_codebase_resources(self):
"""Collect and labels all image files as CodebaseResources."""
for image in self.images:
docker.create_codebase_resources(self.project, image)
self.package_files = []
for resource in self.project.codebaseresources.filter(extension=".deb"):
self.package_files.append(resource.path)
logger.debug(f"Found package file: {resource.path}")

def store_package_archives(self):
    """
    Store the package archives listed in ``self.package_files``.

    Return an empty list without storing anything when the project's
    ``use_local_storage`` flag is off. Otherwise pass each existing path to
    ``store_package_archive``; missing paths and storage failures are logged
    and skipped so that one bad file does not abort the whole step.

    Return the list of ``store_package_archive`` results, one per file that
    was stored.
    """
    if not self.project.use_local_storage:
        # Trailing space added: the adjacent literals were previously joined
        # as "...<name>.Skipping package storage."
        logger.info(
            f"Local storage is disabled for project: {self.project.name}. "
            "Skipping package storage."
        )
        return []

    # The second fragment needs its own ``f`` prefix; without it the
    # placeholder was logged literally instead of the file list.
    logger.info(
        f"Storing package archives for project: {self.project.name}, "
        f"files: {self.package_files}"
    )
    stored_files = []
    for package_path in self.package_files:
        # NOTE(review): package_files is filled from CodebaseResource.path
        # values; confirm those resolve from the current working directory,
        # otherwise every entry is reported missing here.
        if not Path(package_path).exists():
            logger.error(f"Invalid or missing package path: {package_path}")
            continue

        package_path_str = str(package_path)
        logger.info(f"Storing package archive: {package_path_str}")
        try:
            result = store_package_archive(
                self.project, url=None, file_path=package_path_str
            )
        except Exception:
            # Deliberately best-effort: keep going with the remaining files,
            # but record the full traceback rather than only the message.
            logger.exception(f"Failed to store {package_path_str}")
            continue

        logger.info(f"Stored package archive {package_path_str}: {result}")
        stored_files.append(result)
    return stored_files

def collect_and_create_system_packages(self):
"""Collect installed system packages for each layer based on the distro."""
Expand Down
Loading
Loading