Skip to content

Commit c0150e9

Browse files
committed
Improve creation of CodebaseRelation
Signed-off-by: Tushar Goel <tushar.goel.dav@gmail.com>
1 parent a2d0974 commit c0150e9

File tree

2 files changed

+52
-53
lines changed

2 files changed

+52
-53
lines changed

scanpipe/models.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1993,8 +1993,7 @@ def has_directory_content_fingerprint(self):
19931993

19941994
def elfs(self):
19951995
"""
1996-
Resources that are ``files`` and their filetype starts with "ELF" and
1997-
contains any of these "executable", "relocatable", "shared object".
1996+
ELF executable and shared object Resources.
19981997
Keep in sync with the content type implementation at ``typecode.contenttype``.
19991998
"""
20001999
return (
@@ -2011,18 +2010,16 @@ def elfs(self):
20112010

20122011
def win_exes(self):
20132012
"""
2014-
Resources that are ``files`` and their filetype contains "for ms windows" or
2015-
starts with "pe32".
2013+
Windows executable and DLL Resources.
20162014
Keep in sync with the content type implementation at ``typecode.contenttype``.
20172015
"""
20182016
return self.files().filter(
20192017
Q(file_type__icontains="for ms windows") | Q(file_type__istartswith="pe32")
20202018
)
20212019

2022-
def mach_os(self):
2020+
def macho_binaries(self):
20232021
"""
2024-
Resources that are ``files`` and their filetype contains "for ms mach-o" or
2025-
contains "application/x-mach-binary".
2022+
Mach-O binary Resources.
20262023
Keep in sync with the content type implementation at ``typecode.contenttype``.
20272024
"""
20282025
return self.files().filter(
@@ -2031,7 +2028,7 @@ def mach_os(self):
20312028
)
20322029

20332030
def executable_binaries(self):
2034-
return self.union(self.win_exes(), self.mach_os(), self.elfs())
2031+
return self.union(self.win_exes(), self.macho_binaries(), self.elfs())
20352032

20362033

20372034
class ScanFieldsModelMixin(models.Model):

scanpipe/pipes/d2d.py

Lines changed: 47 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1673,62 +1673,65 @@ def map_paths_resource(
16731673
Map paths found in the ``to_resource`` extra_data to paths of the ``from_resources``
16741674
CodebaseResource queryset using the precomputed ``from_resources_index`` path index.
16751675
"""
1676-
relations = {}
1676+
# Accumulate unique relation objects for bulk creation
1677+
relations_to_create = {}
16771678

16781679
for map_type in map_types:
1679-
paths = to_resource.extra_data.get(map_type, [])
1680-
not_mapped_paths = to_resource.extra_data[f"{map_type}_not_mapped"] = []
1681-
process_relations(
1682-
to_resource,
1683-
from_resources,
1684-
from_resources_index,
1685-
relations,
1686-
map_type,
1687-
paths,
1688-
not_mapped_paths,
1689-
)
1680+
paths_in_binary = to_resource.extra_data.get(map_type, [])
1681+
paths_not_mapped = to_resource.extra_data[f"{map_type}_not_mapped"] = []
1682+
for item in process_paths_in_binary(
1683+
to_resource=to_resource,
1684+
from_resources=from_resources,
1685+
from_resources_index=from_resources_index,
1686+
map_type=map_type,
1687+
paths_in_binary=paths_in_binary,
1688+
):
1689+
if isinstance(item, Path):
1690+
paths_not_mapped.append(item)
1691+
else:
1692+
rel_key, relation = item
1693+
if rel_key not in relations_to_create:
1694+
relations_to_create[rel_key] = relation
1695+
if paths_not_mapped:
1696+
to_resource.save()
1697+
logger(
1698+
f"WARNING: #{len(paths_not_mapped)} {map_type} paths NOT mapped for: "
1699+
f"{to_resource.path!r}"
1700+
)
16901701

1691-
if relations:
1692-
rels = CodebaseRelation.objects.bulk_create(relations.values())
1702+
if relations_to_create:
1703+
rels = CodebaseRelation.objects.bulk_create(relations_to_create.values())
16931704
logger(
16941705
f"Created {len(rels)} mappings using "
1695-
f"{', '.join(map_types).upper()} for: {to_resource.path!r}"
1706+
f"{', '.join(map_types)} for: {to_resource.path!r}"
16961707
)
16971708
else:
16981709
logger(
1699-
f"No mappings using {', '.join(map_types).upper()} for: "
1700-
f"{to_resource.path!r}"
1710+
f"No mappings using {', '.join(map_types)} for: " f"{to_resource.path!r}"
17011711
)
17021712

1703-
for map_type in map_types:
1704-
if to_resource.extra_data.get(f"{map_type}_not_mapped"):
1705-
to_resource.save()
1706-
logger(
1707-
f"WARNING: {map_type.upper()} paths NOT mapped for: "
1708-
f"{to_resource.path!r}: "
1709-
+ ", ".join(map(repr, to_resource.extra_data[f"{map_type}_not_mapped"]))
1710-
)
1711-
17121713

1713-
def process_relations(
1714+
def process_paths_in_binary(
17141715
to_resource,
17151716
from_resources,
17161717
from_resources_index,
1717-
relations,
17181718
map_type,
1719-
paths,
1720-
not_mapped_paths,
1719+
paths_in_binary,
17211720
):
1722-
"""Process relations between resources."""
1723-
for path in paths:
1721+
"""
1722+
Process the list of paths found in a binary and yield either:
1723+
- a tuple of (unique key for a relationship, ``CodebaseRelation`` object)
1724+
- or the path itself if it was not mapped
1725+
"""
1726+
for path in paths_in_binary:
17241727
match = pathmap.find_paths(path, from_resources_index)
17251728
if not match:
1726-
not_mapped_paths.append(path)
1729+
yield path
17271730
continue
17281731

17291732
matched_path_length = match.matched_path_length
17301733
if is_invalid_match(match, matched_path_length):
1731-
not_mapped_paths.append(path)
1734+
yield path
17321735
continue
17331736

17341737
matched_from_resources = [
@@ -1744,15 +1747,14 @@ def process_relations(
17441747
}
17451748

17461749
rel_key = (winning_from_resource.path, to_resource.path, map_type)
1747-
if rel_key not in relations:
1748-
relation = CodebaseRelation(
1749-
project=winning_from_resource.project,
1750-
from_resource=winning_from_resource,
1751-
to_resource=to_resource,
1752-
map_type=map_type,
1753-
extra_data=extra_data,
1754-
)
1755-
relations[rel_key] = relation
1750+
relation = CodebaseRelation(
1751+
project=winning_from_resource.project,
1752+
from_resource=winning_from_resource,
1753+
to_resource=to_resource,
1754+
map_type=map_type,
1755+
extra_data=extra_data,
1756+
)
1757+
yield rel_key, relation
17561758

17571759

17581760
def sort_matched_from_resources(matched_from_resources):
@@ -1814,7 +1816,7 @@ def map_paths(project, file_type, collect_paths_func, map_types, logger=None):
18141816

18151817

18161818
def map_elfs(project, logger=None):
1817-
"""Map ELF file paths in a ``project``."""
1819+
"""Map ELF binaries to their sources in ``project``."""
18181820
from_resources = project.codebaseresources.files().from_codebase()
18191821
to_resources = (
18201822
project.codebaseresources.files().to_codebase().has_no_relation().elfs()
@@ -1877,7 +1879,7 @@ def get_go_file_paths(location):
18771879

18781880

18791881
def map_go_paths(project, logger=None):
1880-
"""Map Go file paths in a project."""
1882+
"""Map Go binaries to their sources in ``project``."""
18811883
from_resources = project.codebaseresources.files().from_codebase()
18821884
to_resources = (
18831885
project.codebaseresources.files()

0 commit comments

Comments
 (0)