diff --git a/scanpipe/filters.py b/scanpipe/filters.py index cdba64d8a..1e7efb25c 100644 --- a/scanpipe/filters.py +++ b/scanpipe/filters.py @@ -409,6 +409,8 @@ def filter(self, qs, value): ("js_path", "js path"), ("path", "path"), ("sha1", "sha1"), + ("dwarf_included_paths", "dwarf_included_paths"), + ("dwarf_compiled_paths", "dwarf_compiled_paths"), ) @@ -492,6 +494,7 @@ class ResourceFilterSet(FilterSetUtilsMixin, django_filters.FilterSet): relation_map_type = RelationMapTypeFilter( label="Relation map type", field_name="related_from__map_type", + distinct=True, ) class Meta: diff --git a/scanpipe/pipelines/d2d_dwarf.py b/scanpipe/pipelines/d2d_dwarf.py new file mode 100644 index 000000000..120622118 --- /dev/null +++ b/scanpipe/pipelines/d2d_dwarf.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. 
+ +from scanpipe.pipelines.deploy_to_develop import DeployToDevelop +from scanpipe.pipes import d2d +from scanpipe.pipes import input + + +class DWARF(DeployToDevelop): + """ELFs and DWARFs.""" + + @classmethod + def steps(cls): + return ( + cls.get_inputs, + cls.build_inventory_from_scans, + cls.flag_ignored_resources, + cls.map_dwarf_paths, + cls.flag_mapped_resources_archives_and_ignored_directories, + ) + + + + def build_inventory_from_scans(self): + """Build inventories""" + for input_paths, tag in [(self.from_files, "from"), (self.to_files, "to")]: + for input_path in input_paths: + input.load_inventory_from_toolkit_scan( + self.project, input_path, resource_defaults={"tag": tag} + ) + + def map_dwarf_paths(self): + """Map DWARF paths""" + d2d.map_dwarf_path(project=self.project, logger=self.log) diff --git a/scanpipe/pipes/__init__.py b/scanpipe/pipes/__init__.py index 5a189379a..990f1dfd6 100644 --- a/scanpipe/pipes/__init__.py +++ b/scanpipe/pipes/__init__.py @@ -239,25 +239,55 @@ def update_or_create_dependency( def get_or_create_relation(project, relation_data): """ - Get or create a CodebaseRelation then return it. + Get or create a CodebaseRelation then return it. The support for update is not useful as there is no fields on the model that could be updated. 
""" + resource_qs = project.codebaseresources + from_resource_path = relation_data.get("from_resource") + from_resource = resource_qs.get(path=from_resource_path) + to_resource_path = relation_data.get("to_resource") - resource_qs = project.codebaseresources + to_resource = resource_qs.get(path=to_resource_path) + + map_type = relation_data.get("map_type") + + codebase_relation, _ = get_or_create_relation_from_resources( + from_resource=from_resource, + to_resource=to_resource, + map_type=map_type, + ) + return codebase_relation + + +def get_or_create_relation_from_resources( + from_resource, to_resource, map_type, extra_data=None +): + """ + Get or create a CodebaseRelation relation of type ``map_type`` between the + ``from_resource`` and the ``to_resource`` and return it. + ``extra_data`` if any will override any pre-existing value for these. + """ codebase_relation, _ = CodebaseRelation.objects.get_or_create( - project=project, - from_resource=resource_qs.get(path=from_resource_path), - to_resource=resource_qs.get(path=to_resource_path), - map_type=relation_data.get("map_type"), + project=from_resource.project, + from_resource=from_resource, + to_resource=to_resource, + map_type=map_type, ) + if extra_data: + codebase_relation.extra_data = extra_data + codebase_relation.save() return codebase_relation def make_relation(from_resource, to_resource, map_type, **extra_fields): + """ + Create a CodebaseRelation relation of type ``map_type`` between the + ``from_resource`` and the ``to_resource`` and return it. + """ return CodebaseRelation.objects.create( project=from_resource.project, from_resource=from_resource, @@ -387,11 +417,14 @@ def iter(self, iterator): yield item -def get_text_str_diff_ratio(str_a, str_b): +def get_text_str_diff_ratio(str_a, str_b, as_lines=True): """ Return a similarity ratio as a float between 0 and 1 by comparing the text content of the ``str_a`` and ``str_b``. + Split the text in lines and compare lines if ``as_lines`` is True. 
+ Otherwise, process the input as-is. + Return None if any of the two resources str is empty. """ if not (str_a and str_b): @@ -399,20 +432,29 @@ def get_text_str_diff_ratio(str_a, str_b): if not isinstance(str_a, str) or not isinstance(str_b, str): raise ValueError("Values must be str") - - matcher = difflib.SequenceMatcher(a=str_a.splitlines(), b=str_b.splitlines()) + if as_lines: + a = str_a.splitlines() + b = str_b.splitlines() + else: + a = str_a + b = str_b + matcher = difflib.SequenceMatcher(a=a, b=b) return matcher.quick_ratio() -def get_resource_diff_ratio(resource_a, resource_b): +def get_resource_diff_ratio(resource_a, resource_b, as_lines=True): """ Return a similarity ratio as a float between 0 and 1 by comparing the text content of the CodebaseResource ``resource_a`` and ``resource_b``. + Split the text in lines and compare lines if ``as_lines`` is True. + Otherwise, compare the files text content as-is. + Return None if any of the two resources are not readable as text. """ with suppress(IOError): return get_text_str_diff_ratio( str_a=resource_a.file_content, str_b=resource_b.file_content, + as_lines=as_lines, ) diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index 6e1214061..b61a437de 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -1409,3 +1409,130 @@ def flag_whitespace_files(project): # If resource contains only whitespace characters. if not non_whitespace_bytes: resource.update(status=flag.IGNORED_WHITESPACE_FILE) + + +def _map_dwarf_path_resource( + to_resource, from_resources, from_resources_index, logger=None, +): + """ + Map DWARF dwarf_paths found in the ``to_resource`` extra_data to + dwarf_paths of the ``from_resources`` CodebaseResource queryset using the + precomputed ``from_resources_index`` path index. 
+ """ + dwarf_source_paths = to_resource.extra_data.get("dwarf_source_paths") or {} + compiled_paths = dwarf_source_paths.get("compiled_paths") or [] + included_paths = dwarf_source_paths.get("included_paths") or [] + dwarf_paths_and_map_type = [ + (compiled_paths, "dwarf_compiled_paths"), + (included_paths, "dwarf_included_paths"), + ] + + dpnm = to_resource.extra_data["dwarf_paths_not_mapped"] = [] + relations = {} + + for dwarf_paths, map_type in dwarf_paths_and_map_type: + for dwarf_path in dwarf_paths: + + match = pathmap.find_paths(dwarf_path, from_resources_index) + if not match: + dpnm.append(dwarf_path) + continue + + # short dwarf path matched more than once is treated as not mapped for now + matched_path_length = match.matched_path_length + + if matched_path_length == 1 and len(match.resource_ids) != 1: + dpnm.append(dwarf_path) + continue + + # Sort match by most similar to the From/ side dwarf_path e.g. if we match + # some/foo/bar/baz.c and this/other/foo/bar/baz.c and the From is + # that/foo/bar/baz.c, some/foo/bar/baz.c has the most segments + # matched wins, e.g., the shortest From/ path wins. 
+ matched_from_resources = [ + from_resources.get(id=rid) for rid in match.resource_ids + ] + matched_from_resources.sort(key=lambda res: (len(res.path.strip("/").split("/")), res.path)) + winning_from_resource = matched_from_resources[0] + + # Do not count the "to/" segment as it is not "matchable" + # always strip leading segment ("to" or "from" first segment) + dwarf_path_length = len(dwarf_path.strip("/").split("/")) - 1 + + extra_data = { + "path_score": f"{matched_path_length}/{dwarf_path_length}", + "dwarf_path": dwarf_path, + } + + rel_key = (winning_from_resource.path, to_resource.path, map_type) + if rel_key not in relations: + relation = CodebaseRelation( + project=winning_from_resource.project, + from_resource=winning_from_resource, + to_resource=to_resource, + map_type=map_type, + extra_data=extra_data, + ) + relations[rel_key] = relation + + if relations: + rels = CodebaseRelation.objects.bulk_create(relations.values()) + if logger: + logger(f"Created {len(rels)} mapping using DWARF for: {to_resource.path!r}") + else: + if logger: + logger(f"No mapping using DWARF for: {to_resource.path!r}") + + if dpnm: + # save the "dwarf dwarf_paths not mapped" + to_resource.save() + if logger: + logger(f"WARNING: DWARF paths NOT mapped for: {to_resource.path!r}: " + ", ".join(map(repr, dpnm))) + + +def map_dwarf_path(project, logger=None): + """Map DWARF paths using similarities of path suffixes.""" + project_files = project.codebaseresources.files().no_status() + from_resources = project_files.from_codebase() + to_resources = project_files.to_codebase().has_no_relation() + to_resources = to_resources.filter(extra_data__has_key="dwarf_source_paths") + resource_count = to_resources.count() + + if logger: + logger( + f"Mapping {resource_count:,d} to/ resources using DWARF paths " + f"with {from_resources.count():,d} from/ resources." 
+ ) + + from_resources_index = pathmap.build_index( + from_resources.values_list("id", "path"), with_subpaths=True + ) + + if logger: + logger( + f"Done building from/ resources index." + ) + + resource_iterator = to_resources.iterator(chunk_size=2000) + last_percent = 0 + start_time = timer() + for resource_index, to_resource in enumerate(resource_iterator): + last_percent = pipes.log_progress( + logger, + resource_index, + resource_count, + last_percent, + increment_percent=10, + start_time=start_time, + ) + if logger: + logger( + f"Mapping to/ resource: {to_resource.path!r} using DWARF paths." + ) + + _map_dwarf_path_resource( + to_resource, + from_resources, + from_resources_index, + logger=logger, + ) diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index fa52e02b5..206d24c2d 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -65,14 +65,16 @@ def get_tool_name_from_scan_headers(scan_data): return tool_name -def load_inventory_from_toolkit_scan(project, input_location): +def load_inventory_from_toolkit_scan(project, input_location, resource_defaults=None): """ Create packages, dependencies, and resources loaded from the ScanCode-toolkit scan results located at ``input_location``. 
""" scanned_codebase = scancode.get_virtual_codebase(project, input_location) scancode.create_discovered_packages(project, scanned_codebase) - scancode.create_codebase_resources(project, scanned_codebase) + scancode.create_codebase_resources( + project, scanned_codebase, defaults=resource_defaults + ) scancode.create_discovered_dependencies( project, scanned_codebase, strip_datafile_path_root=True ) diff --git a/scanpipe/pipes/pathmap.py b/scanpipe/pipes/pathmap.py index 0de855376..a1d014cb7 100644 --- a/scanpipe/pipes/pathmap.py +++ b/scanpipe/pipes/pathmap.py @@ -92,7 +92,7 @@ def find_paths(path, index): return Match(matched_length, resource_ids) -def build_index(resource_id_and_paths, with_subpaths=True): +def build_index(resource_id_and_paths, with_subpaths=True, logger=None): """ Return an index (an index) built from a ``resource_id_and_paths`` iterable of tuples of (resource_id int, resource_path string). @@ -111,7 +111,7 @@ def build_index(resource_id_and_paths, with_subpaths=True): # create a new empty automaton. 
index = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_STRING) - for resource_id, resource_path in resource_id_and_paths: + for i, (resource_id, resource_path) in enumerate(resource_id_and_paths): segments = get_reversed_path_segments(resource_path) segments_count = len(segments) if with_subpaths: @@ -119,6 +119,9 @@ def build_index(resource_id_and_paths, with_subpaths=True): else: add_path(resource_id, segments, segments_count, index) + if logger: + logger(f"Indexed {i} total resources") + index.make_automaton() return index diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index 1a4323a1f..0885d0d3f 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -511,7 +511,7 @@ def get_virtual_codebase(project, input_location): return VirtualCodebase(input_location, temp_dir=str(temp_path), max_in_memory=0) -def create_codebase_resources(project, scanned_codebase): +def create_codebase_resources(project, scanned_codebase, defaults=None): """ Save the resources of a ScanCode `scanned_codebase` scancode.resource.Codebase object to the database as a CodebaseResource of the `project`. @@ -521,19 +521,35 @@ def create_codebase_resources(project, scanned_codebase): """ for scanned_resource in scanned_codebase.walk(skip_root=True): resource_data = {} + extra_data = {} - for field in CodebaseResource._meta.fields: + known_field_names = set(field.name for field in CodebaseResource._meta.fields) + for field_name, value in scanned_resource.to_dict().items(): # Do not include the path as provided by the scanned_resource since it # includes the "root". The `get_path` method is used instead. 
- if field.name == "path": + if field_name == "path": continue - value = getattr(scanned_resource, field.name, None) if value is not None: - resource_data[field.name] = value + if field_name in known_field_names: + resource_data[field_name] = value + else: + extra_data[field_name] = value resource_type = "FILE" if scanned_resource.is_file else "DIRECTORY" resource_data["type"] = CodebaseResource.Type[resource_type] - resource_path = scanned_resource.get_path(strip_root=True) + + # TODO review this: + # resource_path = scanned_resource.get_path(strip_root=True) + resource_path = scanned_resource.get_path(strip_root=False) + + if defaults: + resource_data.update(defaults) + # TODO: Use a new path_prefix attribute? + if tag := defaults.get("tag"): + resource_path = f"{tag}/{resource_path.lstrip('/')}" + + if extra_data: + resource_data["extra_data"] = extra_data codebase_resource, _ = CodebaseResource.objects.get_or_create( project=project, diff --git a/setup.cfg b/setup.cfg index eb0b8f467..85e02f862 100644 --- a/setup.cfg +++ b/setup.cfg @@ -128,6 +128,7 @@ scancodeio_pipelines = deploy_to_develop = scanpipe.pipelines.deploy_to_develop:DeployToDevelop docker = scanpipe.pipelines.docker:Docker docker_windows = scanpipe.pipelines.docker_windows:DockerWindows + d2d_dwarf = scanpipe.pipelines.d2d_dwarf:DWARF find_vulnerabilities = scanpipe.pipelines.find_vulnerabilities:FindVulnerabilities inspect_manifest = scanpipe.pipelines.inspect_manifest:InspectManifest load_inventory = scanpipe.pipelines.load_inventory:LoadInventory