Skip to content

Commit 7b00d3f

Browse files
Add rust binary support #1435 (#1488)
* Add rust binary support for packages and symbols * Add support to get packages from rust binaries * Add support for rust source to binary mapping using symbols Reference: #1435 Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com> * Add support for single binary file on the to/ side We were failing on the d2d pipelines if we uploaded a single binary on the to side, as the d2d pipeline requires uploading an archive only. Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com> * Add warning instead of logger line in go/elf map We were having a lot of extra logger lines in certain cases previously on selecting wrong options, as the traceback was being shown in the log too. So this creates a warning instead of a log entry. Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com> * Address feedback Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com> * Update CHANGELOG and documentation Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com> --------- Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 68ade0a commit 7b00d3f

16 files changed

+5265
-11
lines changed

CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ v34.9.3 (2024-12-31)
1818
* Add SCANCODEIO_RQ_REDIS_SSL setting to enable SSL.
1919
https://github.com/aboutcode-org/scancode.io/issues/1465
2020

21+
- Add support to map binaries to source files using symbols
22+
for Rust binaries and source files. This also adds the use of
23+
``rust-inspector`` to extract symbols from Rust binaries.
24+
This is a new optional ``Rust`` step in the
25+
``map_deploy_to_develop`` pipeline.
26+
https://github.com/aboutcode-org/scancode.io/issues/1435
27+
2128
v34.9.2 (2024-12-10)
2229
--------------------
2330

docs/scanpipe-pipes.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,11 @@ SPDX
110110
.. automodule:: scanpipe.pipes.spdx
111111
:members:
112112

113+
Symbolmap
114+
---------
115+
.. automodule:: scanpipe.pipes.symbolmap
116+
:members:
117+
113118
Symbols
114119
-------
115120
.. automodule:: scanpipe.pipes.symbols

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from scanpipe.pipelines import Pipeline
2626
from scanpipe.pipes import d2d
2727
from scanpipe.pipes import flag
28+
from scanpipe.pipes import input
2829
from scanpipe.pipes import matchcode
2930
from scanpipe.pipes import purldb
3031
from scanpipe.pipes import scancode
@@ -72,6 +73,7 @@ def steps(cls):
7273
cls.map_javascript,
7374
cls.map_elf,
7475
cls.map_go,
76+
cls.map_rust,
7577
cls.match_directories_to_purldb,
7678
cls.match_resources_to_purldb,
7779
cls.map_javascript_post_purldb_match,
@@ -129,7 +131,10 @@ def extract_inputs_to_codebase_directory(self):
129131

130132
for input_files, codebase_path in inputs_with_codebase_path_destination:
131133
for input_file_path in input_files:
132-
self.extract_archive(input_file_path, codebase_path)
134+
if input.is_archive(input_file_path):
135+
self.extract_archive(input_file_path, codebase_path)
136+
else:
137+
input.copy_input(input_file_path, codebase_path)
133138

134139
# Reload the project env post-extraction as the scancode-config.yml file
135140
# may be located in one of the extracted archives.
@@ -198,9 +203,14 @@ def map_elf(self):
198203

199204
@optional_step("Go")
200205
def map_go(self):
201-
"""Map Go binaries to their sources."""
206+
"""Map Go binaries to their sources using paths."""
202207
d2d.map_go_paths(project=self.project, logger=self.log)
203208

209+
@optional_step("Rust")
210+
def map_rust(self):
211+
"""Map Rust binaries to their sources using symbols."""
212+
d2d.map_rust_paths(project=self.project, logger=self.log)
213+
204214
def match_directories_to_purldb(self):
205215
"""Match selected directories in PurlDB."""
206216
if not purldb.is_available():

scanpipe/pipes/d2d.py

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from extractcode import EXTRACT_SUFFIX
4343
from go_inspector.plugin import collect_and_parse_symbols
4444
from packagedcode.npm import NpmPackageJsonHandler
45+
from rust_inspector.binary import collect_and_parse_rust_symbols
4546
from summarycode.classify import LEGAL_STARTS_ENDS
4647

4748
from aboutcode.pipeline import LoopProgress
@@ -57,6 +58,8 @@
5758
from scanpipe.pipes import purldb
5859
from scanpipe.pipes import resolve
5960
from scanpipe.pipes import scancode
61+
from scanpipe.pipes import symbolmap
62+
from scanpipe.pipes import symbols
6063

6164
FROM = "from/"
6265
TO = "to/"
@@ -1794,8 +1797,14 @@ def map_elfs(project, logger=None):
17941797
try:
17951798
paths = get_elf_file_dwarf_paths(resource.location_path)
17961799
resource.update_extra_data(paths)
1797-
except Exception as e:
1798-
logger(f"Can not parse {resource.location_path!r} {e!r}")
1800+
except Exception as exception:
1801+
project.add_warning(
1802+
exception=exception,
1803+
object_instance=resource,
1804+
description=f"Cannot parse binary at {resource.path}",
1805+
model="map_elfs",
1806+
details={"path": resource.path},
1807+
)
17991808

18001809
if logger:
18011810
logger(
@@ -1860,8 +1869,14 @@ def map_go_paths(project, logger=None):
18601869
try:
18611870
paths = get_go_file_paths(resource.location_path)
18621871
resource.update_extra_data(paths)
1863-
except Exception as e:
1864-
logger(f"Can not parse {resource.location_path!r} {e!r}")
1872+
except Exception as exception:
1873+
project.add_warning(
1874+
exception=exception,
1875+
object_instance=resource,
1876+
description=f"Cannot parse binary at {resource.path}",
1877+
model="map_go_paths",
1878+
details={"path": resource.path},
1879+
)
18651880

18661881
if logger:
18671882
logger(
@@ -1886,3 +1901,54 @@ def map_go_paths(project, logger=None):
18861901
map_types=["go_file_paths"],
18871902
logger=logger,
18881903
)
1904+
1905+
1906+
def map_rust_paths(project, logger=None):
    """
    Map Rust binaries to their source files in ``project`` using symbols.

    Source symbols are collected from the ``.rs`` files of the from/ codebase
    and binary symbols from the to/ files without relation selected by
    ``executable_binaries()``. Each to/ resource that has ``rust_symbols`` in
    its extra_data is then mapped to the Rust source files with
    ``symbolmap.map_resources_with_symbols``.

    A binary that cannot be parsed is reported as a project warning and does
    not stop the mapping. The optional ``logger`` callable is used for progress
    messages only.
    """
    from_resources = project.codebaseresources.files().from_codebase()
    to_resources = (
        project.codebaseresources.files()
        .to_codebase()
        .has_no_relation()
        .executable_binaries()
    )

    # Collect source symbols from rust source files
    rust_from_resources = from_resources.filter(extension=".rs")
    symbols.collect_and_store_tree_sitter_symbols_and_strings(
        project=project,
        logger=logger,
        project_files=rust_from_resources,
    )

    # Collect binary symbols from rust binaries
    for resource in to_resources:
        try:
            binary_symbols = collect_and_parse_rust_symbols(resource.location_path)
            resource.update_extra_data(binary_symbols)
        except Exception as exception:
            # Report a warning like map_elfs and map_go_paths do, rather than
            # calling ``logger``, which is optional and may be None here.
            project.add_warning(
                exception=exception,
                object_instance=resource,
                description=f"Cannot parse binary at {resource.path}",
                model="map_rust_paths",
                details={"path": resource.path},
            )

    to_resources_count = to_resources.count()
    if logger:
        logger(
            f"Mapping {to_resources_count:,d} to/ resources using symbols "
            f"with {rust_from_resources.count():,d} from/ resources."
        )

    resource_iterator = to_resources.iterator(chunk_size=2000)
    progress = LoopProgress(to_resources_count, logger)
    for to_resource in progress.iter(resource_iterator):
        binary_symbols = to_resource.extra_data.get("rust_symbols")
        if not binary_symbols:
            continue

        if logger:
            logger(f"Mapping source files to binary at {to_resource.path}")

        symbolmap.map_resources_with_symbols(
            to_resource=to_resource,
            from_resources=rust_from_resources,
            binary_symbols=binary_symbols,
            map_type="rust_symbols",
            logger=logger,
        )

scanpipe/pipes/flag.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949

5050
ABOUT_MAPPED = "about-mapped"
5151
MAPPED = "mapped"
52+
MAPPED_BY_SYMBOL = "mapped-by-symbol"
5253
ARCHIVE_PROCESSED = "archive-processed"
5354
MATCHED_TO_PURLDB_PACKAGE = "matched-to-purldb-package"
5455
MATCHED_TO_PURLDB_RESOURCE = "matched-to-purldb-resource"

scanpipe/pipes/input.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
import os
2224
import shutil
2325
from pathlib import Path
2426

@@ -41,11 +43,14 @@
4143
def copy_input(input_location, dest_path):
    """
    Copy the ``input_location`` (file or directory) to the ``dest_path``.

    The copy is placed inside ``dest_path`` under the same name as the input.
    When the input is a single file, the ``dest_path`` directory is created
    first if it does not exist yet.
    Return the path of the copied file or directory as a ``Path``.
    """
    input_path = Path(input_location)
    destination_dir = Path(dest_path)
    destination = destination_dir / input_path.name

    if input_path.is_dir():
        # copytree creates the destination and its missing parents itself.
        shutil.copytree(input_location, destination)
    else:
        # copyfile does not create parent directories. Using ``exist_ok``
        # avoids the race of a separate "exists?" check followed by a create.
        destination_dir.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(input_location, destination)

    return destination

scanpipe/pipes/matchcode.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,10 @@ def fingerprint_codebase_directories(project, to_codebase_only=False):
188188
resources = project.codebaseresources.all()
189189
if to_codebase_only:
190190
resources = resources.to_codebase()
191+
192+
if not resources.directories():
193+
return
194+
191195
virtual_codebase = codebase.get_basic_virtual_codebase(resources)
192196
virtual_codebase = compute_codebase_directory_fingerprints(virtual_codebase)
193197
save_directory_fingerprints(

scanpipe/pipes/symbolmap.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from collections import Counter
24+
25+
from aboutcode.pipeline import LoopProgress
26+
from scanpipe.models import CodebaseRelation
27+
from scanpipe.pipes import flag
28+
29+
"""
30+
Path matching using source and binary symbols.
31+
32+
The approach is to build a set of the symbols extracted from each Rust binary
33+
and match it against the symbols extracted from each source file.
34+
"""
35+
36+
MATCHING_RATIO_RUST = 0.5
37+
SMALL_FILE_SYMBOLS_THRESHOLD = 20
38+
MATCHING_RATIO_RUST_SMALL_FILE = 0.4
39+
40+
41+
def map_resources_with_symbols(
    to_resource, from_resources, binary_symbols, map_type, logger=None
):
    """
    Map the ``to_resource`` binary to the resources of the ``from_resources``
    queryset whose source symbols match the ``binary_symbols``.

    A ``CodebaseRelation`` of type ``map_type`` is created for each matched
    source resource, and the ``from_resources`` that have a relation get the
    ``flag.MAPPED_BY_SYMBOL`` status.
    The paths of the source resources that did not match are collected in the
    ``to_resource`` extra_data under the ``{map_type}_not_mapped`` key. When any
    of these paths is not under a ``/tests/`` directory, the ``to_resource``
    status is set to ``flag.REQUIRES_REVIEW``.

    Do nothing when ``binary_symbols`` is empty. The optional ``logger``
    callable receives progress and summary messages.
    """
    if not binary_symbols:
        return

    # Accumulate unique relation objects for bulk creation
    relations_to_create = {}

    # These are of type string
    # NOTE(review): the list is only set on the in-memory extra_data and no
    # explicit save of extra_data happens in this function; confirm that it is
    # persisted as intended.
    paths_not_mapped = to_resource.extra_data[f"{map_type}_not_mapped"] = []
    for item in match_source_paths_to_binary(
        to_resource=to_resource,
        from_resources=from_resources,
        binary_symbols=binary_symbols,
        map_type=map_type,
        logger=logger,
    ):
        if isinstance(item, str):
            paths_not_mapped.append(item)
        else:
            rel_key, relation = item
            # Keep the first relation seen for a given key.
            relations_to_create.setdefault(rel_key, relation)

    # If there are any non-test files in the rust source files which
    # are not mapped, we mark the binary as REQUIRES_REVIEW
    has_non_test_unmapped_files = any(
        "/tests/" not in path for path in paths_not_mapped
    )
    if has_non_test_unmapped_files:
        to_resource.update(status=flag.REQUIRES_REVIEW)
        if logger:
            logger(
                f"WARNING: #{len(paths_not_mapped)} {map_type} paths NOT mapped for: "
                f"{to_resource.path!r}"
            )

    if relations_to_create:
        rels = CodebaseRelation.objects.bulk_create(relations_to_create.values())
        from_resources.has_relation().update(status=flag.MAPPED_BY_SYMBOL)
        if logger:
            logger(
                f"Created {len(rels)} mappings using "
                f"{map_type} for: {to_resource.path!r}"
            )

    elif logger:
        logger(f"No mappings using {map_type} for: {to_resource.path!r}")
94+
95+
96+
def match_source_symbols_to_binary(source_symbols, binary_symbols):
    """
    Return an ``(is_matched, stats)`` tuple telling whether enough of the
    ``source_symbols`` are also present in the ``binary_symbols``.

    ``stats`` is a mapping with two ratios:

    - ``common_symbols_ratio``: occurrences of source symbols that are found in
      the binary, divided by the total number of source symbol occurrences.
    - ``common_symbols_unique_ratio``: unique source symbols found in the
      binary, divided by the number of unique source symbols.

    The source is matched when either ratio is above ``MATCHING_RATIO_RUST``,
    or when it has more than ``SMALL_FILE_SYMBOLS_THRESHOLD`` symbols and either
    ratio is above ``MATCHING_RATIO_RUST_SMALL_FILE``.
    Empty ``source_symbols`` never match and yield ratios of zero.
    """
    if not source_symbols:
        # Nothing to compare: avoid dividing by a zero symbol count below.
        return False, {
            "common_symbols_unique_ratio": 0.0,
            "common_symbols_ratio": 0.0,
        }

    source_symbols_counter = Counter(source_symbols)
    source_symbols_count = len(source_symbols)
    source_symbols_unique_count = len(source_symbols_counter)

    common_symbols = source_symbols_counter.keys() & set(binary_symbols)
    # Weight each common symbol by how many times it occurs in the source.
    common_symbols_count = sum(
        source_symbols_counter[symbol] for symbol in common_symbols
    )

    common_symbols_ratio = common_symbols_count / source_symbols_count
    common_symbols_unique_ratio = len(common_symbols) / source_symbols_unique_count
    stats = {
        "common_symbols_unique_ratio": common_symbols_unique_ratio,
        "common_symbols_ratio": common_symbols_ratio,
    }

    best_ratio = max(common_symbols_ratio, common_symbols_unique_ratio)
    if best_ratio > MATCHING_RATIO_RUST:
        return True, stats

    # NOTE(review): the lower "small file" ratio is applied to sources with MORE
    # symbols than the threshold; confirm this is the intended direction.
    if (
        source_symbols_count > SMALL_FILE_SYMBOLS_THRESHOLD
        and best_ratio > MATCHING_RATIO_RUST_SMALL_FILE
    ):
        return True, stats

    return False, stats
130+
131+
132+
def match_source_paths_to_binary(
    to_resource,
    from_resources,
    binary_symbols,
    map_type,
    logger=None,
):
    """
    Yield one item for each resource of the ``from_resources`` queryset.

    The item is the resource path, as a string, when the resource has no
    ``source_symbols`` in its extra_data or when these do not match the
    ``binary_symbols``. Otherwise it is a ``(relation_key, relation)`` tuple
    where ``relation`` is a not yet saved ``CodebaseRelation`` of type
    ``map_type`` from the resource to the ``to_resource``, carrying the match
    statistics as extra_data.
    """
    sources = from_resources.iterator(chunk_size=2000)
    progress = LoopProgress(from_resources.count(), logger)

    for source in progress.iter(sources):
        symbols_in_source = source.extra_data.get("source_symbols")

        matched, stats = False, None
        if symbols_in_source:
            matched, stats = match_source_symbols_to_binary(
                source_symbols=symbols_in_source,
                binary_symbols=binary_symbols,
            )

        if matched:
            relation_key = (source.path, to_resource.path, map_type)
            yield relation_key, CodebaseRelation(
                project=source.project,
                from_resource=source,
                to_resource=to_resource,
                map_type=map_type,
                extra_data=stats,
            )
        else:
            # Unmatched sources are reported by their path only.
            yield source.path

0 commit comments

Comments
 (0)