-
Notifications
You must be signed in to change notification settings - Fork 109
Add AlpinePackages pipeline #272
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# http://nexb.com and https://github.com/nexB/scancode.io | ||
# The ScanCode.io software is licensed under the Apache License version 2.0. | ||
# Data generated with ScanCode.io is provided as-is without warranties. | ||
# ScanCode is a trademark of nexB Inc. | ||
# | ||
# You may not use this software except in compliance with the License. | ||
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software distributed | ||
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
# CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations under the License. | ||
# | ||
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES | ||
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from | ||
# ScanCode.io should be considered or used as legal advice. Consult an Attorney | ||
# for any legal advice. | ||
# | ||
# ScanCode.io is a free software code scanning tool from nexB Inc. and others. | ||
# Visit https://github.com/nexB/scancode.io for support and download. | ||
|
||
from scanpipe.pipelines import Pipeline | ||
from scanpipe.pipes.alpine import download_or_checkout_aports | ||
from scanpipe.pipes.alpine import extract_summary_fields | ||
from scanpipe.pipes.alpine import get_unscanned_packages_from_db | ||
from scanpipe.pipes.alpine import prepare_scan_dir | ||
from scanpipe.pipes.scancode import run_extractcode | ||
from scanpipe.pipes.scancode import run_scancode | ||
|
||
|
||
class AlpinePackages(Pipeline):
    """
    A pipeline to complement missing alpine package data.
    Downloads and extracts needed information from aports repository and package
    source files.
    Alpine Linux does not provide copyrights and (in some cases) licenses for its
    packages.
    """

    @classmethod
    def steps(cls):
        """
        Return the pipeline steps in the order they have to be executed.
        """
        return (
            cls.create_alpine_versions_dict,
            cls.download_aports_repo,
            cls.complement_missing_package_data,
        )

    # Options passed to every ScanCode run started by this pipeline.
    scancode_options = ["--copyright", "--summary"]

    def create_alpine_versions_dict(self):
        """
        Create a dict mapping alpine image ids from the database to alpine versions.
        Images whose distro identifier is not "alpine" are left out.
        """
        self.alpine_versions = {
            image["image_id"]: image["distro"]["version_id"]
            for image in self.project.extra_data["images"]
            if image["distro"]["identifier"] == "alpine"
        }

    def download_aports_repo(self):
        """
        Set pipeline's `aports_dir_path` variable to its project temporary path.
        Iterate over every alpine version associated with this project.
        Download corresponding aports repository branches (alpine versions).
        """
        self.aports_dir_path = self.project.tmp_path
        # Only the versions are needed here, the image ids are not used.
        for alpine_version in self.alpine_versions.values():
            download_or_checkout_aports(
                aports_dir_path=self.aports_dir_path,
                alpine_version=alpine_version,
            )

    def complement_missing_package_data(self):
        """
        Iterate over alpine packages associated with this project.
        Checkout aports repository to the corresponding alpine version and a commit.
        Prepare scan target directory - download and extract package's sources.
        Run scancode and extract missing data (only copyrights for now).
        Update and save package's missing data to database.
        """
        unscanned_packages = get_unscanned_packages_from_db(
            project=self.project, alpine_versions=self.alpine_versions
        )
        for (
            alpine_version,
            commit_id,
            scan_target_path,
            scan_result_path,
            package,
        ) in unscanned_packages:
            # Skip the package when the aports checkout is not available.
            aports_location = download_or_checkout_aports(
                aports_dir_path=self.aports_dir_path,
                alpine_version=alpine_version,
                commit_id=commit_id,
            )
            if not aports_location:
                continue

            # Skip the package when its aports directory is missing or empty.
            scan_dir = prepare_scan_dir(
                package_name=package.name, scan_target_path=scan_target_path
            )
            if not scan_dir:
                continue

            run_extractcode(location=str(scan_target_path))
            run_scancode(
                location=str(scan_target_path),
                output_file=str(scan_result_path),
                options=self.scancode_options,
            )
            package.update_extra_data(
                data=extract_summary_fields(
                    scan_result_path=scan_result_path,
                    summary_field_names=["copyrights"],
                )
            )
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,8 +20,114 @@ | |
# ScanCode.io is a free software code scanning tool from nexB Inc. and others. | ||
# Visit https://github.com/nexB/scancode.io for support and download. | ||
|
||
|
||
import json | ||
from shutil import copytree | ||
|
||
from fetchcode import fetch | ||
from fetchcode.vcs.git import fetch_via_git | ||
from packagedcode import alpine | ||
|
||
from scanpipe.models import DiscoveredPackage | ||
|
||
APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git" | ||
APORTS_DIR_NAME = "aports" | ||
APORTS_SUBDIRS = ["main", "non-free", "testing", "community", "unmaintained"] | ||
|
||
|
||
def download_or_checkout_aports(aports_dir_path, alpine_version, commit_id=None):
    """
    Download the aports repository and check out the `<major>.<minor>-stable`
    branch derived from `alpine_version`.
    If `commit_id` is provided, also check out that commit.
    Return the location of the aports checkout (as a string).
    Return None, without fetching anything, when `alpine_version` is empty or does
    not contain at least a major and a minor component.
    """
    # TODO: Proper fetchcode patch required (extending #54)
    # Callers may pass the result of a dict lookup, so tolerate a missing version
    # instead of failing on `.split()` or on the tuple unpacking below.
    version_segments = alpine_version.split(".") if alpine_version else []
    if len(version_segments) < 2:
        return None
    major, minor = version_segments[:2]

    aports_dir_path = str(aports_dir_path / APORTS_DIR_NAME)
    fetch_via_git(
        url=f"git+{APORTS_URL}@{major}.{minor}-stable", location=aports_dir_path
    )
    if commit_id:
        fetch_via_git(url=f"git+{APORTS_URL}@{commit_id}", location=aports_dir_path)
    return aports_dir_path
|
||
|
||
def get_unscanned_packages_from_db(project, alpine_versions):
    """
    Yield 5-tuples
    (alpine_version, commit_id, scan_target_path, scan_result_path, package) for
    the alpine packages of `project` that still have to be scanned, where:
    `alpine_version` is looked up in the `alpine_versions` dict using the image id
    stored in the package's extra data,
    `commit_id` is the part of the package's `vcs_url` that follows "id=",
    `scan_target_path` is the directory (under the project temporary path) on
    which a scan will be performed,
    `scan_result_path` is the scan result json file (under the project output
    path),
    `package` is a DiscoveredPackage of `project` with the "alpine" type.
    Subpackages and packages that already have a scan result file are skipped.
    """
    alpine_packages = DiscoveredPackage.objects.filter(project=project, type="alpine")
    for package in alpine_packages:
        scan_id = f"{package.name}_{package.version}"
        scan_result_path = project.output_path / f"{scan_id}.json"
        alpine_version = alpine_versions.get(package.extra_data["image_id"])
        commit_id = package.vcs_url.split("id=")[1]
        scan_target_path = project.tmp_path / scan_id

        # A package counts as a subpackage when it lists source packages and the
        # first of them does not appear in its own purl.
        source_packages = package.source_packages
        is_subpackage = (
            bool(source_packages) and source_packages[0] not in package.purl
        )
        already_scanned = scan_result_path.exists()
        if is_subpackage or already_scanned:
            continue

        yield alpine_version, commit_id, scan_target_path, scan_result_path, package
|
||
|
||
def prepare_scan_dir(package_name, scan_target_path, aports_dir_path=None):
    """
    Gather all the package's source files in `scan_target_path`.
    Source files of an alpine package are obtained from its aports directory whose
    location has to be guessed.
    Such directory is present in one of the five aports repository subdirectories
    (main, non-free, testing, community, unmaintained).
    Its name is the same as the value of the corresponding package's `name` field
    (hence the `package_name` parameter).
    Here are some path examples:
    .../aports/main/acf-db
    .../aports/non-free/mongodb
    Inside, there are some extra files (patches) and an APKBUILD which contains
    urls to source tarballs.
    The function copies all these files (including APKBUILD) and downloads all the
    source tarballs to `scan_target_path`.
    The default value of `aports_dir_path` is set to the parent of the
    `scan_target_path`.
    Return `scan_target_path` if the package's aports directory is found and is
    not empty, None otherwise.
    """
    if aports_dir_path is None:
        aports_dir_path = scan_target_path.parent

    for subdir_name in APORTS_SUBDIRS:
        apkbuild_dir = aports_dir_path / APORTS_DIR_NAME / subdir_name / package_name
        if not apkbuild_dir.exists():
            continue
        if not any(apkbuild_dir.iterdir()):
            # The directory exists but is empty: there is nothing to scan and the
            # remaining subdirectories are not searched.
            break

        # Work on a copy: the sources are downloaded (and later extracted) next to
        # the APKBUILD and must not end up inside the aports checkout, which is
        # checked out again for every package. A single directory also means a
        # single scan and keeps the files around for later investigation.
        # NOTE(review): `copytree` raises FileExistsError when `scan_target_path`
        # already exists (e.g. left over from an interrupted run) - confirm this
        # cannot happen or clean the directory up beforehand.
        copytree(apkbuild_dir, scan_target_path)

        apkbuild_data = alpine.parse_apkbuild(scan_target_path / "APKBUILD").to_dict()
        # Tolerate a missing or empty `extra_data` / `sources` entry.
        package_sources = (apkbuild_data.get("extra_data") or {}).get("sources") or []
        for source in package_sources:
            source_url = source.get("url")
            if source_url:
                fetch(source_url, scan_target_path)
        return scan_target_path

    return None
|
||
|
||
def extract_summary_fields(scan_result_path, summary_field_names):
    """
    Read the `summary` section of the scancode result file at `scan_result_path`.
    Return a dict mapping every name of `summary_field_names` to the list of the
    non-empty `value` entries found for that field in the summary.
    A field that is absent from the summary is mapped to an empty list.
    Raise a KeyError if the scan result has no `summary` section.
    """
    # The context manager closes the file even when the JSON is malformed or the
    # `summary` section is missing.
    with open(scan_result_path, encoding="utf-8") as scan_result:
        summaries = json.load(scan_result)["summary"]

    result = {}
    for field_name in summary_field_names:
        values = (entry["value"] for entry in summaries.get(field_name, []))
        result[field_name] = [value for value in values if value]
    return result
|
||
|
||
def package_getter(root_dir, **kwargs): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
{ | ||
"summary": { | ||
"copyrights": [ | ||
{ | ||
"value": "Copyright (c) A B", | ||
"count": 51 | ||
}, | ||
{ | ||
"value": "Copyright (c) C D", | ||
"count": 8 | ||
} | ||
], | ||
"holders": [ | ||
{ | ||
"value": "A B", | ||
"count": 51 | ||
}, | ||
{ | ||
"value": "C D", | ||
"count": 41 | ||
} | ||
], | ||
"authors": [ | ||
{ | ||
"value": "A B", | ||
"count": 2 | ||
}, | ||
{ | ||
"value": "C D", | ||
"count": 1 | ||
} | ||
] | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.