Skip to content

Commit ac378c7

Browse files
committed
Add d2d pipeline for mapping elfs
Signed-off-by: Tushar Goel <tushar.goel.dav@gmail.com>
1 parent d6389b2 commit ac378c7

File tree

4 files changed

+127
-1
lines changed

4 files changed

+127
-1
lines changed

scanpipe/filters.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,8 @@ def filter(self, qs, value):
409409
("js_path", "js path"),
410410
("path", "path"),
411411
("sha1", "sha1"),
412+
("dwarf_included_paths", "dwarf_included_paths"),
413+
("dwarf_compiled_paths", "dwarf_compiled_paths"),
412414
)
413415

414416

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,13 @@ def map_javascript(self):
188188
to its source.
189189
"""
190190
d2d.map_javascript(project=self.project, logger=self.log)
191+
192+
@group("Elf")
def map_elf(self):
    """Map DWARF paths using similarities of path suffixes."""
    # Delegates to the d2d pipe, passing this pipeline's project and logger.
    d2d.map_elf(logger=self.log, project=self.project)
191198

192199
def match_directories_to_purldb(self):
193200
"""Match selected directories in PurlDB."""

scanpipe/pipes/d2d.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from extractcode import EXTRACT_SUFFIX
4242
from packagedcode.npm import NpmPackageJsonHandler
4343
from summarycode.classify import LEGAL_STARTS_ENDS
44+
from elf_inspector.dwarf import get_dwarf_paths
4445

4546
from scanpipe import pipes
4647
from scanpipe.models import CodebaseRelation
@@ -1662,3 +1663,117 @@ def _match_purldb_resources_post_process(
16621663
package.add_resources(unmapped_resources)
16631664

16641665
return interesting_codebase_resources.count()
1666+
1667+
1668+
def _map_dwarf_path_resource(
    to_resource, from_resources, from_resources_index, logger=None,
):
    """
    Map the DWARF paths found in the ``to_resource`` extra_data to resources of
    the ``from_resources`` CodebaseResource queryset using the precomputed
    ``from_resources_index`` path index.

    The ``compiled_paths`` and ``included_paths`` lists of the extra_data are
    looked up in the index and yield ``dwarf_compiled_paths`` and
    ``dwarf_included_paths`` relations respectively. At most one
    CodebaseRelation is created per (from resource, to resource, map type)
    combination, and its extra_data describes the first DWARF path that
    produced it.

    DWARF paths that cannot be mapped are collected under the
    ``dwarf_paths_not_mapped`` key of the ``to_resource`` extra_data; the
    resource is saved only when that list is not empty.

    ``logger`` is an optional callable that receives progress messages.
    """
    compiled_paths = to_resource.extra_data.get("compiled_paths") or []
    included_paths = to_resource.extra_data.get("included_paths") or []
    dwarf_paths_and_map_type = [
        (compiled_paths, "dwarf_compiled_paths"),
        (included_paths, "dwarf_included_paths"),
    ]

    # ``dpnm`` aliases the list stored in extra_data, so appending to it also
    # updates what gets persisted by ``to_resource.save()`` below.
    dpnm = to_resource.extra_data["dwarf_paths_not_mapped"] = []
    relations = {}

    for dwarf_paths, map_type in dwarf_paths_and_map_type:
        for dwarf_path in dwarf_paths:
            match = pathmap.find_paths(dwarf_path, from_resources_index)
            if not match:
                dpnm.append(dwarf_path)
                continue

            # A short DWARF path (a single matched segment) that matched more
            # than once is ambiguous and is treated as not mapped for now.
            matched_path_length = match.matched_path_length
            if matched_path_length == 1 and len(match.resource_ids) != 1:
                dpnm.append(dwarf_path)
                continue

            # Fetch all the candidates with a single query rather than one
            # query per matched id.
            candidates = list(from_resources.filter(id__in=match.resource_ids))
            if not candidates:
                # The index references resources that are not in the queryset
                # anymore: nothing to relate this path to.
                dpnm.append(dwarf_path)
                continue

            # Pick the candidate most similar to the DWARF path: every
            # candidate shares the same matched suffix, so the from/ path with
            # the fewest segments wins, e.g. some/foo/bar/baz.c wins over
            # this/other/foo/bar/baz.c. Ties are broken on the path itself to
            # keep the selection deterministic.
            winning_from_resource = min(
                candidates,
                key=lambda res: (len(res.path.strip("/").split("/")), res.path),
            )

            # One leading segment is not counted in the path length.
            # NOTE(review): this mirrors the handling of paths that start with
            # a "to/" or "from/" segment; confirm that it is also intended for
            # DWARF paths, which may not carry such a prefix.
            dwarf_path_length = len(dwarf_path.strip("/").split("/")) - 1

            extra_data = {
                "path_score": f"{matched_path_length}/{dwarf_path_length}",
                "dwarf_path": dwarf_path,
            }

            rel_key = (winning_from_resource.path, to_resource.path, map_type)
            if rel_key not in relations:
                relations[rel_key] = CodebaseRelation(
                    project=winning_from_resource.project,
                    from_resource=winning_from_resource,
                    to_resource=to_resource,
                    map_type=map_type,
                    extra_data=extra_data,
                )

    if relations:
        rels = CodebaseRelation.objects.bulk_create(relations.values())
        if logger:
            logger(
                f"Created {len(rels)} mapping using DWARF for: "
                f"{to_resource.path!r}"
            )
    else:
        if logger:
            logger(f"No mapping using DWARF for: {to_resource.path!r}")

    if dpnm:
        # Persist the DWARF paths that could not be mapped.
        to_resource.save()
        if logger:
            logger(
                f"WARNING: DWARF paths NOT mapped for: {to_resource.path!r}: "
                + ", ".join(map(repr, dpnm))
            )
1744+
1745+
1746+
def map_elf(project, logger=None):
    """
    Map DWARF paths using similarities of path suffixes.

    Collect the DWARF paths of each to/ ELF resource of the ``project`` that
    has no status and no relation yet, store them in the resource extra_data,
    then map each of these resources against an index of the from/ resources
    paths. ``logger`` is an optional callable that receives progress messages.
    """
    elf_resources = project.codebaseresources.elfs().no_status()
    # NOTE(review): the from/ side is restricted to ELF files too, while DWARF
    # paths presumably reference source files; confirm this filter is intended.
    from_resources = elf_resources.from_codebase()
    to_resources = elf_resources.to_codebase().has_no_relation()

    # Store the DWARF paths in the extra_data of each to/ ELF before mapping.
    for elf_resource in to_resources:
        elf_resource.update_extra_data(get_dwarf_paths(elf_resource.location_path))
    resource_count = to_resources.count()

    if logger:
        logger(
            f"Mapping {resource_count:,d} to/ resources using DWARF paths "
            f"with {from_resources.count():,d} from/ resources."
        )

    from_resources_index = pathmap.build_index(
        from_resources.values_list("id", "path"), with_subpaths=True
    )

    if logger:
        logger("Done building from/ resources index.")

    progress = LoopProgress(resource_count, logger)
    for to_resource in progress.iter(to_resources.iterator(chunk_size=2000)):
        _map_dwarf_path_resource(
            to_resource,
            from_resources,
            from_resources_index,
            logger=logger,
        )

scanpipe/pipes/elf.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@
2121
# Visit https://github.com/nexB/scancode.io for support and download.
2222

2323
from elf_inspector.dwarf import get_dwarf_paths
24-
24+
from scanpipe.pipes import LoopProgress, pathmap
25+
from scanpipe.models import CodebaseRelation
26+
from scanpipe import pipes
2527

2628
def collect_dwarf_source_path_references(resource):
2729
"""Collect and store the DWARF debug paths of the provided ELF ``resource``."""

0 commit comments

Comments
 (0)