Skip to content

Commit 868b0bf

Browse files
authored
Merge pull request #1170 from nexB/elf_d2d
Add D2D for ELFs and Go binaries #1113 #1114
2 parents 4238641 + dbb0fec commit 868b0bf

File tree

10 files changed

+324
-3
lines changed

10 files changed

+324
-3
lines changed

scanpipe/filters.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,9 @@ def filter(self, qs, value):
409409
("js_path", "js path"),
410410
("path", "path"),
411411
("sha1", "sha1"),
412+
("dwarf_included_paths", "dwarf_included_paths"),
413+
("dwarf_compiled_paths", "dwarf_compiled_paths"),
414+
("go_file_paths", "go_file_paths"),
412415
)
413416

414417

scanpipe/models.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1993,8 +1993,7 @@ def has_directory_content_fingerprint(self):
19931993

19941994
def elfs(self):
19951995
"""
1996-
Resources that are ``files`` and their filetype starts with "ELF" and
1997-
contains any of these "executable", "relocatable", "shared object".
1996+
ELF executable and shared object Resources.
19981997
Keep sync with the content type implementation at ``typecode.contenttype``.
19991998
"""
20001999
return (
@@ -2009,6 +2008,28 @@ def elfs(self):
20092008
)
20102009
)
20112010

2011+
def win_exes(self):
    """
    Return the file Resources that are Windows executables or DLLs.

    A file is selected when its ``file_type`` contains "for ms windows" or
    starts with "pe32"; both lookups are case-insensitive.
    Keep sync with the content type implementation at ``typecode.contenttype``.
    """
    is_for_windows = Q(file_type__icontains="for ms windows")
    is_pe32 = Q(file_type__istartswith="pe32")
    return self.files().filter(is_for_windows | is_pe32)
2019+
2020+
def macho_binaries(self):
    """
    Return the file Resources that are Mach-O binaries.

    A file is selected when its ``file_type`` contains "mach-o" or its
    ``mime_type`` contains "application/x-mach-binary"; both lookups are
    case-insensitive.
    Keep sync with the content type implementation at ``typecode.contenttype``.
    """
    has_macho_file_type = models.Q(file_type__icontains="mach-o")
    has_macho_mime_type = models.Q(mime_type__icontains="application/x-mach-binary")
    return self.files().filter(has_macho_file_type | has_macho_mime_type)
2029+
2030+
def executable_binaries(self):
    """
    Return the Windows, Mach-O and ELF binary Resources of this queryset.

    The union starts from ``win_exes()`` rather than from ``self``: calling
    ``self.union(...)`` would fold every row of the current queryset into the
    result, not only the executable binaries.
    """
    return self.win_exes().union(self.macho_binaries(), self.elfs())
2032+
20122033

20132034
class ScanFieldsModelMixin(models.Model):
20142035
"""Fields returned by the ScanCode-toolkit scans."""

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ def steps(cls):
7070
cls.map_java_to_class,
7171
cls.map_jar_to_source,
7272
cls.map_javascript,
73+
cls.map_elf,
74+
cls.map_go,
7375
cls.match_directories_to_purldb,
7476
cls.match_resources_to_purldb,
7577
cls.map_javascript_post_purldb_match,
@@ -189,6 +191,16 @@ def map_javascript(self):
189191
"""
190192
d2d.map_javascript(project=self.project, logger=self.log)
191193

194+
@group("Elf")
def map_elf(self):
    """Map ELF binaries to their sources by delegating to ``d2d.map_elfs``."""
    project = self.project
    d2d.map_elfs(project=project, logger=self.log)
198+
199+
@group("Go")
def map_go(self):
    """Map Go binaries to their sources by delegating to ``d2d.map_go_paths``."""
    project = self.project
    d2d.map_go_paths(project=project, logger=self.log)
203+
192204
def match_directories_to_purldb(self):
193205
"""Match selected directories in PurlDB."""
194206
if not purldb.is_available():

scanpipe/pipes/d2d.py

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@
3838
from django.template.defaultfilters import pluralize
3939

4040
from commoncode.paths import common_prefix
41+
from elf_inspector.dwarf import get_dwarf_paths
4142
from extractcode import EXTRACT_SUFFIX
43+
from go_inspector.plugin import collect_and_parse_symbols
4244
from packagedcode.npm import NpmPackageJsonHandler
4345
from summarycode.classify import LEGAL_STARTS_ENDS
4446

@@ -1662,3 +1664,224 @@ def _match_purldb_resources_post_process(
16621664
package.add_resources(unmapped_resources)
16631665

16641666
return interesting_codebase_resources.count()
1667+
1668+
1669+
def map_paths_resource(
    to_resource, from_resources, from_resources_index, map_types, logger=None
):
    """
    Map paths found in the ``to_resource`` extra_data to paths of the ``from_resources``
    CodebaseResource queryset using the precomputed ``from_resources_index`` path index.

    For each ``map_type`` of ``map_types``, the paths stored under that key in
    ``to_resource.extra_data`` are processed. Paths that could not be mapped are
    stored under the ``<map_type>_not_mapped`` extra_data key and the resource is
    saved when there is at least one of them. The relations found are then
    created in bulk, keeping one relation per unique key.

    ``logger`` is an optional callable receiving message strings; nothing is
    logged when it is ``None``.
    """
    # Accumulate unique relation objects for bulk creation
    relations_to_create = {}

    for map_type in map_types:
        # These are of type string
        paths_in_binary = to_resource.extra_data.get(map_type, [])
        paths_not_mapped = to_resource.extra_data[f"{map_type}_not_mapped"] = []
        for item in process_paths_in_binary(
            to_resource=to_resource,
            from_resources=from_resources,
            from_resources_index=from_resources_index,
            map_type=map_type,
            paths_in_binary=paths_in_binary,
        ):
            # A bare string is an unmapped path, anything else is a
            # (relation key, relation) tuple.
            if isinstance(item, str):
                paths_not_mapped.append(item)
            else:
                rel_key, relation = item
                if rel_key not in relations_to_create:
                    relations_to_create[rel_key] = relation

        if paths_not_mapped:
            to_resource.save()
            # ``logger`` defaults to None: only log when one was provided.
            if logger:
                logger(
                    f"WARNING: #{len(paths_not_mapped)} {map_type} paths NOT "
                    f"mapped for: {to_resource.path!r}"
                )

    if relations_to_create:
        rels = CodebaseRelation.objects.bulk_create(relations_to_create.values())
        if logger:
            logger(
                f"Created {len(rels)} mappings using "
                f"{', '.join(map_types)} for: {to_resource.path!r}"
            )
    elif logger:
        logger(f"No mappings using {', '.join(map_types)} for: {to_resource.path!r}")
1713+
1714+
1715+
def process_paths_in_binary(
    to_resource,
    from_resources,
    from_resources_index,
    map_type,
    paths_in_binary,
):
    """
    Look up each path of ``paths_in_binary`` in ``from_resources_index`` and
    yield one item per path, either:
    - a tuple of (unique key for a relationship, ``CodebaseRelation`` object)
      built from the first of the sorted matched ``from_resources``
    - or the path itself when there is no match or the match is invalid
    """
    for binary_path in paths_in_binary:
        found = pathmap.find_paths(binary_path, from_resources_index)
        matched_length = found.matched_path_length if found else None
        if not found or is_invalid_match(found, matched_length):
            yield binary_path
            continue

        candidates = sort_matched_from_resources(
            [from_resources.get(id=rid) for rid in found.resource_ids]
        )
        winner = candidates[0]

        # The score is "matched segments / segments of the binary path minus one".
        total_length = count_path_segments(binary_path) - 1
        relation = CodebaseRelation(
            project=winner.project,
            from_resource=winner,
            to_resource=to_resource,
            map_type=map_type,
            extra_data={
                "path_score": f"{matched_length}/{total_length}",
                map_type: binary_path,
            },
        )
        yield (winner.path, to_resource.path, map_type), relation
1759+
1760+
1761+
def count_path_segments(path):
    """
    Return the number of "/"-separated segments in the POSIX ``path`` string,
    ignoring leading and trailing slashes.
    """
    trimmed = path.strip("/")
    # One more segment than there are separators.
    return trimmed.count("/") + 1
1764+
1765+
1766+
def sort_matched_from_resources(matched_from_resources):
    """
    Return a new list of ``matched_from_resources`` ordered by the number of
    segments in each resource path, then by the path itself.
    """
    return sorted(
        matched_from_resources,
        key=lambda resource: (count_path_segments(resource.path), resource.path),
    )
1776+
1777+
1778+
def is_invalid_match(match, matched_path_length):
    """
    Return True when ``match`` is invalid: only one path segment was matched
    (``matched_path_length`` is 1) and ``match.resource_ids`` does not hold
    exactly one resource ID.
    """
    if matched_path_length != 1:
        return False
    return len(match.resource_ids) != 1
1784+
1785+
1786+
def map_elfs(project, logger=None):
    """
    Map ELF binaries to their sources in ``project``.

    The DWARF paths of each to/ ELF resource without relation are first
    collected and stored in its extra_data. These paths are then mapped to the
    from/ file resources using the "dwarf_compiled_paths" and
    "dwarf_included_paths" map types.

    ``logger`` is an optional callable receiving message strings.
    """
    from_resources = project.codebaseresources.files().from_codebase()
    to_resources = (
        project.codebaseresources.files().to_codebase().has_no_relation().elfs()
    )
    for resource in to_resources:
        try:
            paths = get_elf_file_dwarf_paths(resource.location_path)
            resource.update_extra_data(paths)
        except Exception as e:
            # Best effort: a binary that cannot be parsed is reported and
            # skipped. ``logger`` defaults to None, so it must be checked here
            # too, otherwise the handler itself would raise a TypeError.
            if logger:
                logger(f"Can not parse {resource.location_path!r} {e!r}")

    if logger:
        logger(
            f"Mapping {to_resources.count():,d} to/ resources using paths "
            f"with {from_resources.count():,d} from/ resources."
        )

    from_resources_index = pathmap.build_index(
        from_resources.values_list("id", "path"), with_subpaths=True
    )

    if logger:
        logger("Done building from/ resources index.")

    resource_iterator = to_resources.iterator(chunk_size=2000)
    progress = LoopProgress(to_resources.count(), logger)
    for to_resource in progress.iter(resource_iterator):
        map_paths_resource(
            to_resource,
            from_resources,
            from_resources_index,
            map_types=["dwarf_compiled_paths", "dwarf_included_paths"],
            logger=logger,
        )
1822+
1823+
1824+
def get_elf_file_dwarf_paths(location):
    """
    Return a mapping of the DWARF paths found in the ELF file at ``location``.

    The "compiled_paths" and "included_paths" values returned by
    ``get_dwarf_paths`` are stored under the "dwarf_compiled_paths" and
    "dwarf_included_paths" keys. A key is only present when its list of paths
    is not empty.
    """
    collected = get_dwarf_paths(location)
    key_mapping = (
        ("compiled_paths", "dwarf_compiled_paths"),
        ("included_paths", "dwarf_included_paths"),
    )
    dwarf_paths = {}
    for source_key, target_key in key_mapping:
        found = collected.get(source_key)
        if found:
            dwarf_paths[target_key] = found
    return dwarf_paths
1835+
1836+
1837+
def get_go_file_paths(location):
    """
    Return a mapping of the Go file paths found in the binary at ``location``.

    The "file_paths" of the collected "go_symbols" are stored under the
    "go_file_paths" key. The mapping is empty when there are no such paths.
    """
    collected = collect_and_parse_symbols(location, check_type=False)
    go_symbols = collected.get("go_symbols") or {}
    found = go_symbols.get("file_paths")
    if not found:
        return {}
    return {"go_file_paths": found}
1847+
1848+
1849+
def map_go_paths(project, logger=None):
    """
    Map Go binaries to their source in ``project``.

    The Go file paths of each to/ executable binary resource without relation
    are first collected and stored in its extra_data. These paths are then
    mapped to the from/ file resources using the "go_file_paths" map type.

    ``logger`` is an optional callable receiving message strings.
    """
    from_resources = project.codebaseresources.files().from_codebase()
    to_resources = (
        project.codebaseresources.files()
        .to_codebase()
        .has_no_relation()
        .executable_binaries()
    )
    for resource in to_resources:
        try:
            paths = get_go_file_paths(resource.location_path)
            resource.update_extra_data(paths)
        except Exception as e:
            # Best effort: a binary that cannot be parsed is reported and
            # skipped. ``logger`` defaults to None, so it must be checked here
            # too, otherwise the handler itself would raise a TypeError.
            if logger:
                logger(f"Can not parse {resource.location_path!r} {e!r}")

    if logger:
        logger(
            f"Mapping {to_resources.count():,d} to/ resources using paths "
            f"with {from_resources.count():,d} from/ resources."
        )

    from_resources_index = pathmap.build_index(
        from_resources.values_list("id", "path"), with_subpaths=True
    )

    if logger:
        logger("Done building from/ resources index.")

    resource_iterator = to_resources.iterator(chunk_size=2000)
    progress = LoopProgress(to_resources.count(), logger)
    for to_resource in progress.iter(resource_iterator):
        map_paths_resource(
            to_resource,
            from_resources,
            from_resources_index,
            map_types=["go_file_paths"],
            logger=logger,
        )
376 Bytes
Binary file not shown.
61.9 KB
Binary file not shown.
756 Bytes
Binary file not shown.
530 KB
Binary file not shown.

scanpipe/tests/pipes/test_d2d.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from django.test import TestCase
3030

3131
from scanpipe import pipes
32+
from scanpipe.models import CodebaseRelation
3233
from scanpipe.models import CodebaseResource
3334
from scanpipe.models import Project
3435
from scanpipe.pipes import d2d
@@ -1461,3 +1462,63 @@ def test_scanpipe_pipes_d2d_match_purldb_resources_post_process(self):
14611462

14621463
self.assertEqual(2, package1_resource_count)
14631464
self.assertEqual(0, package2_resource_count)
1465+
1466+
def test_scanpipe_pipes_d2d_map_elfs(self):
    """``map_elfs`` creates one "dwarf_included_paths" relation for the d2d-elfs data."""
    project = self.project1
    input_dir = project.input_path
    archives = [
        self.data_location / "d2d-elfs/to-data.zip",
        self.data_location / "d2d-elfs/from-data.zip",
    ]
    copy_inputs(archives, input_dir)
    self.from_files, self.to_files = d2d.get_inputs(project)

    # Extract the from/ and to/ inputs in their own codebase directory.
    destinations = [
        (self.from_files, project.codebase_path / d2d.FROM),
        (self.to_files, project.codebase_path / d2d.TO),
    ]
    for files, destination in destinations:
        for file_path in files:
            scancode.extract_archive(file_path, destination)

    scancode.extract_archives(project.codebase_path, recurse=True)
    pipes.collect_and_create_codebase_resources(project)

    log_buffer = io.StringIO()
    d2d.map_elfs(project=project, logger=log_buffer.write)

    relations = CodebaseRelation.objects.filter(
        project=project, map_type="dwarf_included_paths"
    )
    self.assertEqual(1, relations.count())
1495+
1496+
def test_scanpipe_pipes_d2d_map_go_paths(self):
    """``map_go_paths`` creates one "go_file_paths" relation for the d2d-go data."""
    project = self.project1
    input_dir = project.input_path
    archives = [
        self.data_location / "d2d-go/to-data.zip",
        self.data_location / "d2d-go/from-data.zip",
    ]
    copy_inputs(archives, input_dir)
    self.from_files, self.to_files = d2d.get_inputs(project)

    # Extract the from/ and to/ inputs in their own codebase directory.
    destinations = [
        (self.from_files, project.codebase_path / d2d.FROM),
        (self.to_files, project.codebase_path / d2d.TO),
    ]
    for files, destination in destinations:
        for file_path in files:
            scancode.extract_archive(file_path, destination)

    scancode.extract_archives(project.codebase_path, recurse=True)
    pipes.collect_and_create_codebase_resources(project)

    log_buffer = io.StringIO()
    d2d.map_go_paths(project=project, logger=log_buffer.write)

    relations = CodebaseRelation.objects.filter(
        project=project, map_type="go_file_paths"
    )
    self.assertEqual(1, relations.count())

setup.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,9 @@ install_requires =
7878
# FetchCode
7979
fetchcode-container==1.2.3.210512; sys_platform == "linux"
8080
# Inspectors
81-
python-inspector==0.12.0
8281
elf-inspector==0.0.1
82+
go-inspector==0.2.2
83+
python-inspector==0.12.0
8384
source-inspector==0.3.0
8485
aboutcode-toolkit==10.1.0
8586
# Utilities

0 commit comments

Comments
 (0)