aboutcode-org · quepop · Aug 5, 2021 · Sep 20, 2021 · tdruez · Sep 23, 2021
diff --git a/scanpipe/pipelines/alpine_packages.py b/scanpipe/pipelines/alpine_packages.py
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/nexB/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode.io for support and download.
+
+from scanpipe.pipelines import Pipeline
+from scanpipe.pipes.alpine import download_or_checkout_aports
+from scanpipe.pipes.alpine import extract_summary_fields
+from scanpipe.pipes.alpine import get_unscanned_packages_from_db
+from scanpipe.pipes.alpine import prepare_scan_dir
+from scanpipe.pipes.scancode import run_extractcode
+from scanpipe.pipes.scancode import run_scancode
+
+
+class AlpinePackages(Pipeline):
+    """
+    A pipeline to complement missing alpine package data.
+    Downloads and extracts needed information from the aports repository and
+    package source files.
+    Alpine Linux does not provide copyrights and (in some cases) licenses for its
+    packages.
+    """
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.create_alpine_versions_dict,
+            cls.download_aports_repo,
+            cls.complement_missing_package_data,
+        )
+
+    # Options forwarded to `run_scancode` for every scanned package.
+    scancode_options = ["--copyright", "--summary"]
+
+    def create_alpine_versions_dict(self):
+        """
+        Create a dict mapping alpine image ids from the database to alpine versions.
+        Images whose distro identifier is not "alpine" are left out.
+        """
+        self.alpine_versions = {
+            image["image_id"]: image["distro"]["version_id"]
+            for image in self.project.extra_data["images"]
+            if image["distro"]["identifier"] == "alpine"
+        }
+
+    def download_aports_repo(self):
+        """
+        Set the pipeline's `aports_dir_path` attribute to the project temporary path.
+        Iterate over every alpine version associated with this project.
+        Download the corresponding aports repository branches (alpine versions).
+        """
+        self.aports_dir_path = self.project.tmp_path
+        # Only the versions are needed here, the image ids are irrelevant.
+        for alpine_version in self.alpine_versions.values():
+            download_or_checkout_aports(
+                aports_dir_path=self.aports_dir_path, alpine_version=alpine_version
+            )
+
+    def complement_missing_package_data(self):
+        """
+        Iterate over alpine packages associated with this project.
+        Checkout aports repository to the corresponding alpine version and a commit.
+        Prepare scan target directory - download and extract package's sources.
+        Run scancode and extract missing data (only copyrights for now).
+        Update and save package's missing data to database.
+        Packages for which the checkout or the scan directory preparation returns
+        a falsy value are skipped.
+        """
+        for (
+            alpine_version,
+            commit_id,
+            scan_target_path,
+            scan_result_path,
+            package,
+        ) in get_unscanned_packages_from_db(
+            project=self.project, alpine_versions=self.alpine_versions
+        ):
+            checked_out = download_or_checkout_aports(
+                aports_dir_path=self.aports_dir_path,
+                alpine_version=alpine_version,
+                commit_id=commit_id,
+            )
+            if not checked_out:
+                continue
+            # The sources are only gathered once the checkout above went through.
+            if not prepare_scan_dir(
+                package_name=package.name, scan_target_path=scan_target_path
+            ):
+                continue
+            run_extractcode(location=str(scan_target_path))
+            run_scancode(
+                location=str(scan_target_path),
+                output_file=str(scan_result_path),
+                options=self.scancode_options,
+            )
+            package.update_extra_data(
+                data=extract_summary_fields(
+                    scan_result_path=scan_result_path,
+                    summary_field_names=["copyrights"],
+                )
+            )
diff --git a/scanpipe/pipes/alpine.py b/scanpipe/pipes/alpine.py
@@ -20,8 +20,114 @@
 # ScanCode.io is a free software code scanning tool from nexB Inc. and others.
 # Visit https://github.com/nexB/scancode.io for support and download.
 
+
+import json
+from shutil import copytree
+
+from fetchcode import fetch
+from fetchcode.vcs.git import fetch_via_git
 from packagedcode import alpine
 
+from scanpipe.models import DiscoveredPackage
+
+# Git URL of the aports repository; a branch or commit reference is appended to
+# it when calling `fetch_via_git`.
+APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git"
+# Name of the directory, below a given base path, that holds the aports checkout.
+APORTS_DIR_NAME = "aports"
+# Repository subdirectories searched, in this order, for a package's directory.
+APORTS_SUBDIRS = ["main", "non-free", "testing", "community", "unmaintained"]
+
+
+def download_or_checkout_aports(aports_dir_path, alpine_version, commit_id=None):
+    """
+    Fetch the aports repository into the `APORTS_DIR_NAME` directory located
+    under `aports_dir_path`, using the `<major>.<minor>-stable` branch derived
+    from `alpine_version`.
+    If `commit_id` is provided, also fetch that commit into the same location.
+    Return the checkout location as a string.
+    """
+    # TODO: Report failed checkouts to the caller, a proper fetchcode patch is
+    # required (extending #54).
+    version_components = alpine_version.split(".")[:2]
+    major, minor = version_components
+    checkout_location = str(aports_dir_path / APORTS_DIR_NAME)
+    branch_url = f"git+{APORTS_URL}@{major}.{minor}-stable"
+    fetch_via_git(url=branch_url, location=checkout_location)
+    if commit_id:
+        commit_url = f"git+{APORTS_URL}@{commit_id}"
+        fetch_via_git(url=commit_url, location=checkout_location)
+    return checkout_location
+
+
+def get_unscanned_packages_from_db(project, alpine_versions):
+    """
+    Return an iterator of 5-tuples
+    (alpine_version, commit_id, scan_target_path, scan_result_path, package) where:
+    `alpine_version` is the alpine version a package comes from (obtained from the
+    `alpine_versions` dict using the package's `image_id` extra data),
+    `commit_id` is the id of the aports repository commit that added the
+    corresponding version of a package (the part of `vcs_url` following "id="),
+    `scan_target_path` is the path of the directory on which a scan will be performed,
+    `scan_result_path` is the path of the scan result json file,
+    `package` is a DiscoveredPackage instance of the alpine package type that
+    belongs to `project`.
+    The returned iterator contains not-a-subpackage alpine packages that don't have
+    an existing scan result file.
+    Packages whose alpine version is unknown, or whose `vcs_url` carries no "id="
+    commit reference, are skipped since no aports checkout is possible for them.
+    """
+    for package in DiscoveredPackage.objects.filter(project=project, type="alpine"):
+        # Apply the filters first: the data of a skipped package is never parsed,
+        # so it cannot make the whole iteration fail.
+        not_a_subpackage = (
+            not package.source_packages or package.source_packages[0] in package.purl
+        )
+        if not not_a_subpackage:
+            continue
+
+        scan_id = f"{package.name}_{package.version}"
+        scan_result_path = project.output_path / (scan_id + ".json")
+        if scan_result_path.exists():
+            continue
+
+        image_id = (package.extra_data or {}).get("image_id")
+        alpine_version = alpine_versions.get(image_id)
+        vcs_url_parts = (package.vcs_url or "").split("id=")
+        if alpine_version is None or len(vcs_url_parts) < 2:
+            continue
+
+        commit_id = vcs_url_parts[1]
+        scan_target_path = project.tmp_path / scan_id
+        yield alpine_version, commit_id, scan_target_path, scan_result_path, package
+
+
+def prepare_scan_dir(package_name, scan_target_path, aports_dir_path=None):
+    """
+    Gather all the source files of a package in `scan_target_path`.
+    The source files of an alpine package are obtained from its aports directory,
+    whose location has to be guessed: it is present in one of the five aports
+    repository subdirectories (main, non-free, testing, community, unmaintained)
+    and is named after the package's `name` field (hence `package_name`).
+    Here are some path examples:
+    .../aports/main/acf-db
+    .../aports/non-free/mongodb
+    Inside, there are some extra files (patches) and an APKBUILD which contains
+    urls to source tarballs.
+    All these files (including the APKBUILD) are copied to `scan_target_path`,
+    then every source that has a url is fetched to `scan_target_path` as well.
+    `aports_dir_path` defaults to the parent of `scan_target_path`.
+    Return `scan_target_path` when the package's aports directory is found and
+    is not empty, None otherwise.
+    """
+    if aports_dir_path is None:
+        base_path = scan_target_path.parent
+    else:
+        base_path = aports_dir_path
+    aports_root = base_path / APORTS_DIR_NAME
+
+    for subdir_name in APORTS_SUBDIRS:
+        package_dir = aports_root / subdir_name / package_name
+        if not package_dir.exists():
+            continue
+        if not any(package_dir.iterdir()):
+            # An empty package directory ends the search: the remaining
+            # subdirectories are not looked at.
+            return None
+
+        copytree(package_dir, scan_target_path)
+        apkbuild_data = alpine.parse_apkbuild(scan_target_path / "APKBUILD").to_dict()
+        package_sources = apkbuild_data.get("extra_data").get("sources") or []
+        source_urls = (source.get("url") for source in package_sources)
+        for source_url in source_urls:
+            if source_url:
+                fetch(source_url, scan_target_path)
+        return scan_target_path
+
+    return None
+
+
+def extract_summary_fields(scan_result_path, summary_field_names):
+    """
+    Extract values from the `summary` section of a scancode result file
+    (`scan_result_path`).
+    Return a dict mapping each name of `summary_field_names` to the list of the
+    non-empty `value` entries found for that field in the summary.
+    A field that is absent from the summary is mapped to an empty list.
+    Raise a KeyError when the scan result has no `summary` section.
+    """
+    # The context manager closes the file even when the content is not valid
+    # JSON or when the `summary` section is missing.
+    with open(scan_result_path, encoding="utf-8") as scan_result:
+        summaries = json.load(scan_result)["summary"]
+
+    result = {}
+    for field_name in summary_field_names:
+        values = (summary["value"] for summary in summaries.get(field_name, []))
+        result[field_name] = [value for value in values if value]
+    return result
+
 
 def package_getter(root_dir, **kwargs):
     """

diff --git a/scanpipe/pipes/docker.py b/scanpipe/pipes/docker.py
@@ -122,6 +122,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):
 
     for i, (purl, package, layer) in enumerate(installed_packages):
         logger.info(f"Creating package #{i}: {purl}")
+        package.extra_data = {"image_id": image.image_id}
         created_package = pipes.update_or_create_package(project, package.to_dict())
 
         # We have no files for this installed package, we cannot go further.

diff --git a/scanpipe/tests/data/aports/community/A/APKBUILD b/scanpipe/tests/data/aports/community/A/APKBUILD
diff --git a/scanpipe/tests/data/aports/community/D/APKBUILD b/scanpipe/tests/data/aports/community/D/APKBUILD
diff --git a/scanpipe/tests/data/aports/community/E/NOTAPKBUILD b/scanpipe/tests/data/aports/community/E/NOTAPKBUILD
diff --git a/scanpipe/tests/data/aports/example/C/APKBUILD b/scanpipe/tests/data/aports/example/C/APKBUILD
diff --git a/scanpipe/tests/data/example_scan_summary.json b/scanpipe/tests/data/example_scan_summary.json
@@ -0,0 +1,34 @@
+{
+  "summary": {
+    "copyrights": [
+      {
+        "value": "Copyright (c) A B",
+        "count": 51
+      },
+      {
+        "value": "Copyright (c) C D",
+        "count": 8
+      }
+    ],
+    "holders": [
+      {
+        "value": "A B",
+        "count": 51
+      },
+      {
+        "value": "C D",
+        "count": 41
+      }
+    ],
+    "authors": [
+      {
+        "value": "A B",
+        "count": 2
+      },
+      {
+        "value": "C D",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/scanpipe/tests/test_pipes.py b/scanpipe/tests/test_pipes.py
@@ -38,6 +38,7 @@
 from scanpipe.models import CodebaseResource
 from scanpipe.models import DiscoveredPackage
 from scanpipe.models import Project
+from scanpipe.pipes import alpine
 from scanpipe.pipes import codebase
 from scanpipe.pipes import docker
 from scanpipe.pipes import fetch
@@ -756,6 +757,102 @@ def test_scanpipe_pipes_rootfs_has_hash_diff(self):
         codebase_resource = CodebaseResource(sha256="sha256", md5="md5")
         self.assertFalse(rootfs.has_hash_diff(install_file, codebase_resource))
 
+    @mock.patch("scanpipe.pipes.alpine.fetch_via_git")
+    def test_scanpipe_pipes_alpine_download_or_checkout_aports(self, fetch_via_git):
+        example_path = Path()
+        aports_path = str(example_path / alpine.APORTS_DIR_NAME)
+        branch_call = mock.call(
+            url=f"git+{alpine.APORTS_URL}@3.13-stable", location=aports_path
+        )
+        commit_call = mock.call(
+            url=f"git+{alpine.APORTS_URL}@1", location=aports_path
+        )
+
+        # Without a commit id only the stable branch of the version is fetched.
+        returned_value = alpine.download_or_checkout_aports(
+            aports_dir_path=example_path, alpine_version="3.13.14"
+        )
+        self.assertEqual(aports_path, returned_value)
+        self.assertEqual([branch_call], fetch_via_git.call_args_list)
+
+        # With a commit id the branch is fetched first, then the commit.
+        # The whole call list is compared since `assert_called_with` only looks
+        # at the last call.
+        fetch_via_git.reset_mock()
+        returned_value = alpine.download_or_checkout_aports(
+            aports_dir_path=example_path, alpine_version="3.13.14", commit_id="1"
+        )
+        self.assertEqual(aports_path, returned_value)
+        self.assertEqual([branch_call, commit_call], fetch_via_git.call_args_list)
+
+    def test_scanpipe_pipes_alpine_get_unscanned_packages_from_db(self):
+        project = Project.objects.create(name="example")
+        alpine_versions = {"1": "3.12", "2": "3.13"}
+        package_field_names = (
+            "type",
+            "name",
+            "version",
+            "vcs_url",
+            "source_packages",
+            "extra_data",
+        )
+        package_data = [
+            ("debian",),
+            ("rpm",),
+            ("alpine", "A", "1.0", "id=A", [], {"image_id": "1"}),
+            ("alpine", "B", "1.0", "id=B", [], {"image_id": "2"}),
+        ]
+        # The test will get bigger (thus arrays and loops instead of consecutive
+        # function calls) - further patches for this function are expected.
+        expected_package_tuples = [
+            (
+                "3.13",
+                "B",
+                project.tmp_path / "B_1.0",
+                project.output_path / "B_1.0.json",
+            ),
+        ]
+        # Package "A" already has a scan result file, it must not be yielded.
+        (project.output_path / "A_1.0.json").touch()
+        for package_data_tuple in package_data:
+            DiscoveredPackage.objects.create(
+                project=project, **dict(zip(package_field_names, package_data_tuple))
+            )
+        yielded_package_tuples = list(
+            alpine.get_unscanned_packages_from_db(
+                project=project, alpine_versions=alpine_versions
+            )
+        )
+        # Compare whole lists: a loop over the yielded tuples would pass without
+        # checking anything when nothing is yielded.
+        self.assertEqual(
+            expected_package_tuples,
+            [package_tuple[:4] for package_tuple in yielded_package_tuples],
+        )
+        self.assertEqual(
+            ["B"], [package_tuple[4].name for package_tuple in yielded_package_tuples]
+        )
+
+    @mock.patch("scanpipe.pipes.alpine.alpine.parse_apkbuild")
+    @mock.patch("scanpipe.pipes.alpine.copytree")
+    def test_scanpipe_pipes_alpine_prepare_scan_dir(self, copytree, parse_apkbuild):
+        scan_target_path = Path()
+
+        # Make sure these package directories exist in the test data.
+        aports_path = self.data_location / alpine.APORTS_DIR_NAME
+        created_package_dirs = (
+            ("main", "A"),
+            ("non-free", "A"),
+            ("community", "B"),
+        )
+        for subdir_name, package_name in created_package_dirs:
+            (aports_path / subdir_name / package_name).mkdir(
+                parents=True, exist_ok=True
+            )
+
+        expected_by_package_name = {
+            # "A", "B" and "C" are expected not to be prepared; "C" only has test
+            # data under "example", which is not one of `APORTS_SUBDIRS`.
+            "A": None,
+            "B": None,
+            "C": None,
+            # "community/D" holds an APKBUILD and "community/E" another file.
+            "D": scan_target_path,
+            "E": scan_target_path,
+        }
+
+        for package_name, expected_value in expected_by_package_name.items():
+            returned_value = alpine.prepare_scan_dir(
+                package_name=package_name,
+                scan_target_path=scan_target_path,
+                aports_dir_path=self.data_location,
+            )
+            self.assertEqual(returned_value, expected_value)
+
+    def test_scanpipe_pipes_alpine_extract_summary_fields(self):
+        # Every requested field is expected to map to the list of its values.
+        scan_result_path = self.data_location / "example_scan_summary.json"
+        summary_field_names = ["copyrights", "holders", "authors"]
+        expected_fields = {
+            "copyrights": ["Copyright (c) A B", "Copyright (c) C D"],
+            "holders": ["A B", "C D"],
+            "authors": ["A B", "C D"],
+        }
+        returned_value = alpine.extract_summary_fields(
+            scan_result_path, summary_field_names
+        )
+        self.assertEqual(returned_value, expected_fields)
+
 
 class ScanPipePipesTransactionTest(TransactionTestCase):
     """

diff --git a/setup.py b/setup.py
@@ -61,6 +61,7 @@
             "root_filesystems = scanpipe.pipelines.root_filesystems:RootFS",
             "scan_codebase = scanpipe.pipelines.scan_codebase:ScanCodebase",
             "scan_package = scanpipe.pipelines.scan_package:ScanPackage",
+            "alpine_packages = scanpipe.pipelines.alpine_packages:AlpinePackages"
         ],
     },
     classifiers=[