Skip to content

Commit 90f396b

Browse files
JonoYangtdruez
andauthored
Update D2D pipeline with Cython source to binary matching (#1703)
* Add step to map Cython files to binaries in d2d pipeline Signed-off-by: Jono Yang <jyang@nexb.com> * Add comments, fix style Signed-off-by: Jono Yang <jyang@nexb.com> * Use branch of source-inspector for get_tree_and_language_info Signed-off-by: Jono Yang <jyang@nexb.com> * Add test for python pyx matching Signed-off-by: Jono Yang <jyang@nexb.com> * Update CHANGELOG.rst Signed-off-by: Jono Yang <jyang@nexb.com> * Update source-inspector Signed-off-by: Jono Yang <jyang@nexb.com> * Add Python to ECOSYSTEM_CONFIGS Signed-off-by: Jono Yang <jyang@nexb.com> * Fix the build and test suite for macOS #1703 Signed-off-by: tdruez <tdruez@nexb.com> --------- Signed-off-by: Jono Yang <jyang@nexb.com> Signed-off-by: tdruez <tdruez@nexb.com> Co-authored-by: tdruez <489057+tdruez@users.noreply.github.com> Co-authored-by: tdruez <tdruez@nexb.com>
1 parent fce1cee commit 90f396b

File tree

8 files changed

+96
-1
lines changed

8 files changed

+96
-1
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ v35.1.0 (unreleased)
1616
license rules used during the scan.
1717
https://github.com/aboutcode-org/scancode.io/issues/1657
1818

19+
- Add a new optional ``map_python`` step to the ``DeployToDevelop`` pipeline to match
20+
Cython source files (.pyx) to their compiled binaries.
21+
https://github.com/aboutcode-org/scancode.io/pull/1703
22+
1923
- Update scancode-toolkit to v32.4.0. See CHANGELOG for updates:
2024
https://github.com/aboutcode-org/scancode-toolkit/releases/tag/v32.4.0
2125
Adds a new ``git_sha1`` attribute to the ``CodebaseResource`` model as this

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def steps(cls):
8181
cls.map_winpe,
8282
cls.map_go,
8383
cls.map_rust,
84+
cls.map_python,
8485
cls.match_directories_to_purldb,
8586
cls.match_resources_to_purldb,
8687
cls.map_javascript_post_purldb_match,
@@ -221,6 +222,14 @@ def map_rust(self):
221222
"""Map Rust binaries to their sources using symbols."""
222223
d2d.map_rust_binaries_with_symbols(project=self.project, logger=self.log)
223224

225+
@optional_step("Python")
def map_python(self):
    """
    Map Cython source files to their compiled binaries.

    Delegates to ``d2d.map_python_pyx_to_binaries``, passing this pipeline's
    project and ``self.log`` as the logger. That function relates a Cython
    source to a binary when a function name defined in the source is found
    in the symbols collected from the binary.
    """
    d2d.map_python_pyx_to_binaries(project=self.project, logger=self.log)
232+
224233
def match_directories_to_purldb(self):
225234
"""Match selected directories in PurlDB."""
226235
if not purldb.is_available():

scanpipe/pipes/d2d.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2254,3 +2254,55 @@ def _map_javascript_strings(to_resource, javascript_from_resources, logger):
22542254
to_resource.update(status=flag.MAPPED)
22552255
return 1
22562256
return 0
2257+
2258+
2259+
def map_python_pyx_to_binaries(project, logger=None):
    """
    Map Cython source to their compiled binaries in ``project``.

    The symbols of each candidate to/ ELF resource (selected with
    ``has_no_relation().elfs()``) are collected and stored in its
    ``extra_data``. Each from/ resource whose extension is listed in the
    "Python" ecosystem ``source_symbol_extensions`` is then parsed, and the
    names of its top-level function definitions are searched for in the
    ``extra_data`` of the candidate ELFs. A ``python_pyx_match`` relation is
    created for every ELF where at least one name is found.

    ``logger`` is an optional callable taking a single message string. It is
    only used to report errors raised while collecting binary symbols.
    """
    from source_inspector.symbols_tree_sitter import get_tree_and_language_info

    python_config = d2d_config.get_ecosystem_config(ecosystem="Python")
    from_resources = (
        project.codebaseresources.files()
        .from_codebase()
        .filter(extension__in=python_config.source_symbol_extensions)
    )
    to_resources = (
        project.codebaseresources.files().to_codebase().has_no_relation().elfs()
    )

    # Collect binary symbols from binaries
    for resource in to_resources:
        try:
            binary_symbols = collect_and_parse_elf_symbols(resource.location)
            resource.update_extra_data(binary_symbols)
        except Exception as e:
            # ``logger`` defaults to None: calling it unconditionally would
            # replace the best-effort handling with a TypeError.
            if logger:
                logger(
                    "Error parsing binary symbols at: "
                    f"{resource.location_path!r} {e!r}"
                )

    for resource in from_resources:
        # Open Cython source file, create AST, parse it for function definitions
        # and save their names in a list
        tree, _ = get_tree_and_language_info(resource.location)
        function_definitions = [
            node
            for node in tree.root_node.children
            if node.type == "function_definition"
        ]
        identifiers = []
        for node in function_definitions:
            identifiers.extend(
                child.text.decode()
                for child in node.children
                if child.type == "identifier"
            )

        # Without any identifier the lookup below would be an empty ``Q()``,
        # which does not restrict the queryset: this source would then be
        # related to every candidate binary. Skip it instead.
        if not identifiers:
            continue

        # Find matching to/ resource by checking to see which to/ resource's
        # extra_data field contains function definitions found from Cython
        # source files
        identifiers_qs = Q()
        for identifier in identifiers:
            identifiers_qs |= Q(extra_data__icontains=identifier)

        for matching_elf in to_resources.filter(identifiers_qs):
            pipes.make_relation(
                from_resource=resource,
                to_resource=matching_elf,
                map_type="python_pyx_match",
            )

scanpipe/pipes/d2d_config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,10 @@ class EcosystemConfig:
131131
ecosystem_option="Windows",
132132
source_symbol_extensions=[".c", ".cpp", ".h", ".cs"],
133133
),
134+
"Python": EcosystemConfig(
135+
ecosystem_option="Python",
136+
source_symbol_extensions=[".pyx", ".pxd"],
137+
),
134138
}
135139

136140

Binary file not shown.
Binary file not shown.

scanpipe/tests/pipes/test_d2d.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1677,6 +1677,32 @@ def test_scanpipe_pipes_d2d_map_macho_symbols(self):
16771677
).count(),
16781678
)
16791679

1680+
@skipIf(sys.platform == "darwin", "Test is failing on macOS")
def test_scanpipe_pipes_d2d_map_python_pyx(self):
    """
    Check that mapping the intbitset from/ and to/ test archives creates
    exactly one ``python_pyx_match`` relation.
    """
    project = self.project1
    archives = [
        self.data / "d2d-python/to-intbitset.whl",
        self.data / "d2d-python/from-intbitset.tar.gz",
    ]
    copy_inputs(archives, project.input_path)

    # Extract each input under its own from/ or to/ codebase directory.
    self.from_files, self.to_files = d2d.get_inputs(project)
    from_dir = project.codebase_path / d2d.FROM
    to_dir = project.codebase_path / d2d.TO
    for archive_path in self.from_files:
        scancode.extract_archive(archive_path, from_dir)
    for archive_path in self.to_files:
        scancode.extract_archive(archive_path, to_dir)

    scancode.extract_archives(project.codebase_path, recurse=True)
    pipes.collect_and_create_codebase_resources(project)

    log_output = io.StringIO()
    d2d.map_python_pyx_to_binaries(project=project, logger=log_output.write)

    relation_count = CodebaseRelation.objects.filter(
        project=project, map_type="python_pyx_match"
    ).count()
    self.assertEqual(1, relation_count)
1705+
16801706
@skipIf(sys.platform == "darwin", "Test is failing on macOS")
16811707
def test_scanpipe_pipes_d2d_map_winpe_symbols(self):
16821708
input_dir = self.project1.input_path

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ install_requires =
8686
rust-inspector==0.1.0
8787
binary-inspector==0.1.2
8888
python-inspector==0.14.0
89-
source-inspector==0.6.1; sys_platform != "darwin" and platform_machine != "arm64"
89+
source-inspector==0.7.0; sys_platform != "darwin" and platform_machine != "arm64"
9090
aboutcode-toolkit==11.1.1
9191
# Utilities
9292
XlsxWriter==3.2.5

0 commit comments

Comments
 (0)