Skip to content

add support to store packages/archives locally #1685

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 34 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0cade5e
add support to store packages/archives locally
VarshaUN Jun 10, 2025
0515605
resolve conflicts
VarshaUN Jun 11, 2025
6d140c2
Merge branch 'main' into store-all-downloaded-packages
VarshaUN Jun 11, 2025
c845a20
Add required imports
VarshaUN Jun 12, 2025
ca3a1ac
Merge branch 'main' into store-all-downloaded-packages
VarshaUN Jun 23, 2025
993d80c
add support to store packages/archives locally
VarshaUN Jun 10, 2025
59fc6fa
Add support for using Package URL (purl) as project input. (#1686)
tdruez Jun 11, 2025
dd80c83
Add required imports
VarshaUN Jun 12, 2025
3494bee
Upgrade the scancode-action references to aboutcode-org
tdruez Jun 11, 2025
33fc3c7
Add filtering by label and pipeline in the ``flush-projects`` managem…
tdruez Jun 13, 2025
ecf3a21
Upgrade Ace library to latest version 1.42.0 (#1692)
tdruez Jun 16, 2025
5c83405
Display matched snippets details in "Resource viewer" (#1693)
tdruez Jun 16, 2025
5c452b0
Rename the parent_path function to parent_directory #1691 (#1694)
tdruez Jun 16, 2025
d0f272f
Upgrade Django to latest security release (#1695)
tdruez Jun 18, 2025
ad6b14e
Add "(No value detected)" for Copyright and Holder charts #1697 (#1698)
tdruez Jun 19, 2025
5ebe370
Add support for Python3.13 (#1469)
tdruez Jun 19, 2025
9384133
Add "Package Compliance Alert" chart in the Policies section (#1699)
tdruez Jun 19, 2025
50fa74b
Add unit test for the "Package Compliance Alert" chart #1699
tdruez Jun 19, 2025
b117064
Add d2d symbols matching for winpe macho binaries (#1674)
AyanSinhaMahapatra Jun 20, 2025
a773009
Bump version to v35.0.0 for release
tdruez Jun 23, 2025
f4bad69
add required imports
VarshaUN Jun 28, 2025
22c50af
Merge branch 'main' into store-all-downloaded-packages
VarshaUN Jul 1, 2025
7d553ca
Update scancodeio/settings.py
VarshaUN Jul 1, 2025
81942b6
add support for extra extensions
VarshaUN Jul 2, 2025
f818e62
fix CI failures
VarshaUN Jul 2, 2025
18e66a3
add required imports
VarshaUN Jul 2, 2025
04b6cef
remove duplicate code snippets
VarshaUN Jul 2, 2025
e0a59f6
fix errors
VarshaUN Jul 2, 2025
2b62163
fix linters
VarshaUN Jul 2, 2025
5632bf7
fix errors
VarshaUN Jul 3, 2025
d81ebd0
add support to not re-scan packages
VarshaUN Jul 3, 2025
33dfe79
fix errors
VarshaUN Jul 8, 2025
435b2d9
add support for functionc all
VarshaUN Jul 8, 2025
68bb105
squash the migration files
VarshaUN Jul 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions scancodeio/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import os
import sys
import tempfile
from pathlib import Path
Expand Down Expand Up @@ -367,6 +368,15 @@
PROJECT_DIR("static"),
]

# Media files (Uploaded package archives, etc.)
# MEDIA_URL is the URL prefix and MEDIA_ROOT the on-disk directory, located in
# a "media" folder directly under ROOT_DIR.

MEDIA_URL = "/media/"
MEDIA_ROOT = os.path.join(str(ROOT_DIR), "media")

# Package storage settings
# Flag looked up under the "ENABLE_PACKAGE_STORAGE" key through ``env.bool``;
# it defaults to False when that key is not provided.

ENABLE_PACKAGE_STORAGE = env.bool("ENABLE_PACKAGE_STORAGE", default=False)

# Third-party apps

CRISPY_TEMPLATE_PACK = "bootstrap3"
Expand Down
3 changes: 3 additions & 0 deletions scancodeio/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from django.conf import settings
from django.conf.urls.static import static
from django.contrib.auth import views as auth_views
from django.urls import include
from django.urls import path
Expand Down Expand Up @@ -54,6 +55,8 @@
path("", RedirectView.as_view(url="project/")),
]

urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do not use media for our storage, instead we are using our own thing.



if settings.SCANCODEIO_ENABLE_ADMIN_SITE:
urlpatterns.append(path("admin/", admin_site.urls))
Expand Down
6 changes: 6 additions & 0 deletions scanpipe/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class Meta:
"pipeline",
"execute_now",
"selected_groups",
"use_local_storage",
]

def __init__(self, *args, **kwargs):
Expand All @@ -178,6 +179,11 @@ def __init__(self, *args, **kwargs):
pipeline_choices = scanpipe_app.get_pipeline_choices(include_addon=False)
self.fields["pipeline"].choices = pipeline_choices

self.fields["use_local_storage"].label = "Store packages locally"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leave forms for later

self.fields["use_local_storage"].help_text = "If checked, " \
"packages will be stored on the local filesystem."
self.fields["use_local_storage"].widget.attrs.update({"class": "checkbox"})

def clean_name(self):
    """Return the cleaned ``name`` with each whitespace run collapsed to one space."""
    name_parts = self.cleaned_data["name"].split()
    return " ".join(name_parts)

Expand Down
44 changes: 44 additions & 0 deletions scanpipe/migrations/0068_packagearchive_downloadedpackage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Generated by Django 5.1.1 on 2025-05-10 06:55

import django.db.models.deletion
import uuid
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('scanpipe', '0067_discoveredpackage_notes'),
]

operations = [
migrations.CreateModel(
name='PackageArchive',
fields=[
('uuid', models.UUIDField(db_index=True, default=uuid.uuid4, editable=False, primary_key=True, serialize=False, verbose_name='UUID')),
('checksum_sha256', models.CharField(db_index=True, help_text='SHA256 checksum of the package archive file.', max_length=64, unique=True)),
('storage_path', models.CharField(help_text='Path to the stored archive file (e.g., file:///path/to/file).', max_length=1024)),
('created_date', models.DateTimeField(auto_now_add=True, help_text='Date when the archive was added to storage.')),
],
options={
'indexes': [models.Index(fields=['checksum_sha256'], name='checksum_idx')],
},
),
migrations.CreateModel(
name='DownloadedPackage',
fields=[
('uuid', models.UUIDField(db_index=True, default=uuid.uuid4, editable=False, primary_key=True, serialize=False, verbose_name='UUID')),
('url', models.URLField(blank=True, db_index=True, help_text='URL from which the package was downloaded, if applicable.', max_length=1024)),
('filename', models.CharField(help_text='Name of the package file.', max_length=255)),
('download_date', models.DateTimeField(auto_now_add=True, help_text='Date when the package was downloaded or added.')),
('scan_log', models.TextField(blank=True, help_text='Log output from scanning the package.')),
('scan_date', models.DateTimeField(blank=True, help_text='Date when the package was scanned.', null=True)),
('project', models.ForeignKey(editable=False, on_delete=django.db.models.deletion.CASCADE, related_name='downloadedpackages', to='scanpipe.project')),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we want to put projects on_delete like this here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to think about how we are going to handle the same package archive being used in two different projects using different pipelines, or processed with different SCIO versions.

For example, the same package archive is scanned with inspect_packages and scan_single_package.

Additionally, we need to look into having a help text show up with projects which were run on the same package.

Consider this when you build the models, but we can also update them later as this is preliminary anyway.

('package_archive', models.ForeignKey(help_text='The stored archive file associated with this package.', on_delete=django.db.models.deletion.CASCADE, to='scanpipe.packagearchive')),
],
options={
'indexes': [models.Index(fields=['url'], name='url_idx')],
'constraints': [models.UniqueConstraint(condition=models.Q(('url__gt', '')), fields=('url', 'project'), name='scanpipe_downloadedpackage_unique_url_project')],
},
),
]
23 changes: 23 additions & 0 deletions scanpipe/migrations/0069_packagearchive_package_file_and_more.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 5.1.1 on 2025-05-12 09:41

from django.db import migrations, models


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please merge your migrations into one file, since they are for the same fields?
You can always generate multiple migration files as you go, but it's best to merge them and reorder based on merged branches before finally merging itself.

class Migration(migrations.Migration):
    """Add ``PackageArchive.package_file`` and make ``storage_path`` optional."""

    dependencies = [
        ("scanpipe", "0068_packagearchive_downloadedpackage"),
    ]

    operations = [
        # New optional file field holding the archive itself.
        migrations.AddField(
            model_name="packagearchive",
            name="package_file",
            field=models.FileField(
                blank=True,
                help_text="The actual package archive file (e.g., ZIP or TAR).",
                null=True,
                upload_to="packages/",
            ),
        ),
        # ``storage_path`` may now be left blank.
        migrations.AlterField(
            model_name="packagearchive",
            name="storage_path",
            field=models.CharField(
                blank=True,
                help_text=(
                    "Path to the stored archive file (e.g., file:///path/to/file)."
                ),
                max_length=1024,
            ),
        ),
    ]
28 changes: 28 additions & 0 deletions scanpipe/migrations/0070_project_use_local_storage_and_more.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by Django 5.1.1 on 2025-05-26 09:19

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please merge your migrations into one file, since they are for the same fields?

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add ``Project.use_local_storage`` and refresh two ``PackageArchive`` help texts."""

    dependencies = [
        ("scanpipe", "0069_packagearchive_package_file_and_more"),
    ]

    operations = [
        # Per-project opt-in flag, disabled by default.
        migrations.AddField(
            model_name="project",
            name="use_local_storage",
            field=models.BooleanField(
                default=False,
                help_text="Store packages locally if enabled.",
            ),
        ),
        # Only the help_text differs from the previous migration.
        migrations.AlterField(
            model_name="packagearchive",
            name="package_file",
            field=models.FileField(
                blank=True,
                help_text="The actual package archive file ( ZIP or TAR).",
                null=True,
                upload_to="packages/",
            ),
        ),
        # Only the help_text differs from the previous migration.
        migrations.AlterField(
            model_name="packagearchive",
            name="storage_path",
            field=models.CharField(
                blank=True,
                help_text="Path to the stored archive file",
                max_length=1024,
            ),
        ),
    ]
111 changes: 99 additions & 12 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,18 +585,9 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model):
)
notes = models.TextField(blank=True)
settings = models.JSONField(default=dict, blank=True)
labels = TaggableManager(through=UUIDTaggedItem, ordering=["name"])
purl = models.CharField(
max_length=2048,
blank=True,
help_text=_(
"Package URL (PURL) for the project, required for pushing the project's "
"scan result to FederatedCode. For example, if the input is an input URL "
"like https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz, the "
"corresponding PURL would be pkg:npm/lodash@4.17.21."
),
)

labels = TaggableManager(through=UUIDTaggedItem)
use_local_storage = models.BooleanField(default=False,
help_text="Store packages locally if enabled.")
objects = ProjectQuerySet.as_manager()

class Meta:
Expand Down Expand Up @@ -4386,6 +4377,102 @@ def success(self):
return self.response_status_code in (200, 201, 202)


class PackageArchive(UUIDPKModel):
    """
    Stores metadata about a package archive file stored in the project's storage.
    Each archive is uniquely identified by its SHA256 checksum.

    The archive location is recorded either as a ``storage_path`` string or as an
    uploaded ``package_file``; both are optional at the model level.
    """

    # NOTE(review): ``unique=True`` already gets an index from Django, so
    # ``db_index=True`` and the ``checksum_idx`` index below look redundant.
    # Dropping them needs a schema migration, so they are kept as declared.
    checksum_sha256 = models.CharField(
        max_length=64,
        unique=True,
        db_index=True,
        help_text=_("SHA256 checksum of the package archive file."),
    )
    storage_path = models.CharField(
        max_length=1024,
        blank=True,
        help_text=_("Path to the stored archive file"),
    )
    package_file = models.FileField(
        upload_to="packages/",
        null=True,
        blank=True,
        help_text=_("The actual package archive file ( ZIP or TAR)."),
    )
    created_date = models.DateTimeField(
        auto_now_add=True,
        help_text=_("Date when the archive was added to storage."),
    )

    class Meta:
        indexes = [
            models.Index(fields=["checksum_sha256"], name="checksum_idx"),
        ]

    def __str__(self):
        """Return the short checksum followed by the archive location."""
        # Resolve the location outside the f-string: a replacement field that
        # spans two lines in a single-quoted f-string is a SyntaxError on
        # Python versions before 3.12.
        location = self.storage_path or self.package_file.name
        return f"Archive {self.checksum_sha256[:8]} at {location}"


class DownloadedPackage(UUIDPKModel):
    """
    Tracks packages downloaded or provided as input for a project, linked to a
    PackageArchive. Each instance represents a package associated with a project,
    including its source URL (if downloaded) and scan details.

    A non-empty ``url`` can appear only once per project; rows with an empty
    ``url`` are not covered by that uniqueness constraint.
    """

    # Deleting the project deletes its DownloadedPackage rows (CASCADE).
    project = models.ForeignKey(
        Project,
        related_name="downloadedpackages",
        on_delete=models.CASCADE,
        editable=False,
    )
    # Optional: left blank when the package was not downloaded from a URL.
    # NOTE(review): ``db_index=True`` plus the ``url_idx`` index in Meta looks
    # like two indexes on the same column -- confirm whether both are wanted.
    url = models.URLField(
        max_length=1024,
        db_index=True,
        blank=True,
        help_text=_("URL from which the package was downloaded, if applicable."),
    )
    filename = models.CharField(
        max_length=255,
        help_text=_("Name of the package file."),
    )
    # Set automatically when the row is first created.
    download_date = models.DateTimeField(
        auto_now_add=True,
        help_text=_("Date when the package was downloaded or added."),
    )
    scan_log = models.TextField(
        blank=True,
        help_text=_("Log output from scanning the package."),
    )
    # Null until a scan date is recorded.
    scan_date = models.DateTimeField(
        null=True,
        blank=True,
        help_text=_("Date when the package was scanned."),
    )
    # Deleting the archive deletes the DownloadedPackage rows pointing to it.
    package_archive = models.ForeignKey(
        PackageArchive,
        on_delete=models.CASCADE,
        help_text=_("The stored archive file associated with this package."),
    )

    class Meta:
        indexes = [
            models.Index(fields=["url"], name="url_idx"),
        ]
        constraints = [
            # Conditional uniqueness: enforced only for rows whose url is
            # greater than the empty string, i.e. non-empty.
            models.UniqueConstraint(
                fields=["url", "project"],
                condition=Q(url__gt=""),
                name="%(app_label)s_%(class)s_unique_url_project",
            ),
        ]

    def __str__(self):
        """Return the file name together with the owning project's name."""
        return f"{self.filename} for project {self.project.name}"


@receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL)
def create_auth_token(sender, instance=None, created=False, **kwargs):
"""Create an API key token on user creation, using the signal system."""
Expand Down
41 changes: 40 additions & 1 deletion scanpipe/pipelines/analyze_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipelines.analyze_root_filesystem import RootFS
import logging
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use instead a modification of the super class, and not a modification to each of the pipelines.

from pathlib import Path

from scanpipe.pipelines import RootFS
from scanpipe.pipes import docker
from scanpipe.pipes import rootfs
from scanpipe.pipes.fetch import store_package_archive

logger = logging.getLogger(__name__)


class Docker(RootFS):
Expand All @@ -36,6 +42,7 @@ def steps(cls):
cls.find_images_os_and_distro,
cls.collect_images_information,
cls.collect_and_create_codebase_resources,
cls.store_package_archives,
cls.collect_and_create_system_packages,
cls.flag_uninteresting_codebase_resources,
cls.flag_empty_files,
Expand Down Expand Up @@ -74,6 +81,38 @@ def collect_and_create_codebase_resources(self):
"""Collect and labels all image files as CodebaseResources."""
for image in self.images:
docker.create_codebase_resources(self.project, image)
self.package_files = []
for resource in self.project.codebaseresources.filter(extension=".deb"):
self.package_files.append(resource.path)
logger.debug(f"Found package file: {resource.path}")

def store_package_archives(self):
    """
    Store the package archives listed in ``self.package_files``.

    Nothing is stored when ``use_local_storage`` is disabled on the project.
    Paths that do not exist are logged and skipped. A failure to store one
    archive is logged with its traceback and does not stop the remaining ones.

    Return the list of values returned by ``store_package_archive`` for each
    archive stored successfully; the list is empty when storage is disabled.
    """
    project = self.project
    if not project.use_local_storage:
        logger.info(
            f"Local storage is disabled for project: {project.name}. "
            "Skipping package storage."
        )
        return []

    # ``package_files`` is assigned by the resource collection step; fall back
    # to an empty list so this step is a no-op when it has not been set.
    package_files = getattr(self, "package_files", [])
    logger.info(
        f"Storing package archives for project: {project.name}, "
        f"files: {package_files}"
    )

    stored_files = []
    for package_path in package_files:
        package_path_str = str(package_path)
        # NOTE(review): a relative path is checked against the current working
        # directory here -- confirm the collected paths are absolute locations.
        if not Path(package_path_str).exists():
            logger.error(f"Invalid or missing package path: {package_path_str}")
            continue

        logger.info(f"Storing package archive: {package_path_str}")
        try:
            result = store_package_archive(
                project, url=None, file_path=package_path_str
            )
        except Exception as error:
            # Deliberately best-effort: keep going, but record the traceback.
            logger.exception(f"Failed to store {package_path_str}: {error}")
            continue

        logger.info(f"Stored package archive {package_path_str}: {result}")
        stored_files.append(result)

    return stored_files

def collect_and_create_system_packages(self):
"""Collect installed system packages for each layer based on the distro."""
Expand Down
Loading
Loading