Skip to content

Commit 6529a1e

Browse files
authored
Refine ScanCode.io d2d pipeline for JavaScript using symbol mapping (#1622)
* Add d2d step to map javascript using symbols - Map source and deployed javascript files based on Jensen–Shannon divergence Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Use simple set operation for symbol mapping Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Use JSD for precise JavaScript symbol matching Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Add test for javascript symbol matching Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Update test fixture Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Add the missing unit tests Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Fix docstring Signed-off-by: Keshav Priyadarshi <git@keshav.space> --------- Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent d6afc94 commit 6529a1e

File tree

11 files changed

+1034
-1
lines changed

11 files changed

+1034
-1
lines changed

scanpipe/filters.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,7 @@ def filter(self, qs, value):
483483
("about_file", "about file"),
484484
("java_to_class", "java to class"),
485485
("jar_to_source", "jar to source"),
486+
("javascript_symbols", "js symbols"),
486487
("js_compiled", "js compiled"),
487488
("js_colocation", "js colocation"),
488489
("js_path", "js path"),

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def steps(cls):
7171
cls.map_java_to_class,
7272
cls.map_jar_to_source,
7373
cls.map_javascript,
74+
cls.map_javascript_symbols,
7475
cls.map_elf,
7576
cls.map_go,
7677
cls.map_rust,
@@ -196,6 +197,11 @@ def map_javascript(self):
196197
"""
197198
d2d.map_javascript(project=self.project, logger=self.log)
198199

200+
@optional_step("JavaScript")
def map_javascript_symbols(self):
    """
    Map deployed JavaScript and TypeScript files to their sources using
    symbols, by delegating to ``d2d.map_javascript_symbols`` with this
    pipeline's project and logger.
    """
    project, log = self.project, self.log
    d2d.map_javascript_symbols(project=project, logger=log)
204+
199205
@optional_step("Elf")
200206
def map_elf(self):
201207
"""Map ELF binaries to their sources."""

scanpipe/pipes/d2d.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1960,3 +1960,98 @@ def map_rust_paths(project, logger=None):
19601960
map_type="rust_symbols",
19611961
logger=logger,
19621962
)
1963+
1964+
1965+
def map_javascript_symbols(project, logger=None):
    """
    Map deployed JavaScript and TypeScript files to their sources by comparing
    the symbols collected from each side.

    Only ``.js`` and ``.ts`` files are considered: to/ files that have no
    relation yet, and from/ files whose path does not contain ``/test/``.
    Nothing is done when either side has no candidate file.
    """
    js_extensions = [".ts", ".js"]
    all_files = project.codebaseresources.files()

    to_candidates = (
        all_files.to_codebase().has_no_relation().filter(extension__in=js_extensions)
    )
    from_candidates = (
        all_files.from_codebase()
        .exclude(path__contains="/test/")
        .filter(extension__in=js_extensions)
    )

    # Both sides are required for a mapping to be possible.
    if not from_candidates.exists() or not to_candidates.exists():
        return

    # Collect and store symbols for the from/ side first, then the to/ side.
    for candidates in (from_candidates, to_candidates):
        symbols.collect_and_store_tree_sitter_symbols_and_strings(
            project=project,
            logger=logger,
            project_files=candidates,
        )

    # Drop the resources whose extra_data is still empty after collection.
    from_with_symbols = from_candidates.exclude(extra_data={})
    to_with_symbols = to_candidates.exclude(extra_data={})

    javascript_from_resources_count = from_with_symbols.count()
    javascript_to_resources_count = to_with_symbols.count()
    if logger:
        logger(
            f"Mapping {javascript_to_resources_count:,d} JavaScript resources using"
            f" symbols against {javascript_from_resources_count:,d} from/ codebase."
        )

    resource_iterator = to_with_symbols.iterator(chunk_size=2000)
    progress = LoopProgress(javascript_to_resources_count, logger)

    # Each call returns 1 when a relation was created and 0 otherwise.
    resource_mapped = sum(
        _map_javascript_symbols(to_resource, from_with_symbols, logger)
        for to_resource in progress.iter(resource_iterator)
    )

    if logger:
        logger(f"{resource_mapped:,d} resource mapped using symbols")
2020+
2021+
2022+
def _map_javascript_symbols(to_resource, javascript_from_resources, logger):
    """
    Map a deployed JavaScript resource to its best matching source using
    symbols and return 1 if a relation is created, otherwise return 0.

    Every resource in ``javascript_from_resources`` that has stored
    ``source_symbols`` is compared against the symbols of ``to_resource``.
    Among the candidates reported as a match, the one with the highest
    similarity is kept (the first one wins on ties) and its score is stored
    on the relation as ``jsd_similarity_score``.

    No mapping is attempted when ``to_resource`` has no symbols or when its
    symbols are flagged by ``symbolmap.is_decomposed_javascript``.

    ``logger`` is accepted for signature consistency and is currently unused.
    """
    to_symbols = to_resource.extra_data.get("source_symbols")

    if not to_symbols or symbolmap.is_decomposed_javascript(to_symbols):
        return 0

    best_matching_score = 0
    best_match = None
    for source_js in javascript_from_resources:
        from_symbols = source_js.extra_data.get("source_symbols")
        if not from_symbols:
            continue

        is_match, similarity = symbolmap.match_javascript_source_symbols_to_deployed(
            source_symbols=from_symbols,
            deployed_symbols=to_symbols,
        )

        if is_match and similarity > best_matching_score:
            best_matching_score = similarity
            best_match = source_js

    if best_match is None:
        return 0

    pipes.make_relation(
        from_resource=best_match,
        to_resource=to_resource,
        map_type="javascript_symbols",
        # Record the score of the selected match. ``similarity`` only holds
        # the score of the last candidate compared in the loop above.
        extra_data={"jsd_similarity_score": best_matching_score},
    )
    to_resource.update(status=flag.MAPPED)
    return 1

scanpipe/pipes/symbolmap.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
from collections import Counter
2424

25+
from scipy.spatial.distance import jensenshannon
26+
2527
from aboutcode.pipeline import LoopProgress
2628
from scanpipe.models import CodebaseRelation
2729
from scanpipe.pipes import flag
@@ -37,6 +39,11 @@
3739
SMALL_FILE_SYMBOLS_THRESHOLD = 20
3840
MATCHING_RATIO_RUST_SMALL_FILE = 0.4
3941

42+
MATCHING_RATIO_JAVASCRIPT = 0.7
43+
SMALL_FILE_SYMBOLS_THRESHOLD_JAVASCRIPT = 30
44+
MATCHING_RATIO_JAVASCRIPT_SMALL_FILE = 0.5
45+
JAVASCRIPT_DECOMPOSED_SYMBOLS_THRESHOLD = 0.5
46+
4047

4148
def map_resources_with_symbols(
4249
to_resource, from_resources, binary_symbols, map_type, logger=None
@@ -162,3 +169,69 @@ def match_source_paths_to_binary(
162169
extra_data=match_stats,
163170
)
164171
yield rel_key, relation
172+
173+
174+
def is_decomposed_javascript(symbols):
    """
    Return True when the share of symbols longer than three characters is
    below JAVASCRIPT_DECOMPOSED_SYMBOLS_THRESHOLD.

    A collection with no such symbol (including an empty one) has a share
    of 0.
    """
    long_symbols_count = len([name for name in symbols if len(name) > 3])

    if long_symbols_count > 0:
        share = long_symbols_count / len(symbols)
    else:
        share = 0

    return share < JAVASCRIPT_DECOMPOSED_SYMBOLS_THRESHOLD
180+
181+
182+
def get_symbols_probability_distribution(symbols, unique_symbols):
    """
    Return the relative frequency of each entry of ``unique_symbols`` within
    ``symbols``, as a list following the iteration order of ``unique_symbols``.

    Entries that do not occur in ``symbols`` get a probability of 0.
    """
    frequencies = Counter(symbols)
    symbols_total = len(symbols)

    distribution = []
    for symbol in unique_symbols:
        if symbol in frequencies:
            distribution.append(frequencies[symbol] / symbols_total)
        else:
            distribution.append(0)

    return distribution
193+
194+
195+
def get_similarity_between_source_and_deployed_symbols(
    source_symbols,
    deployed_symbols,
    matching_ratio,
    matching_ratio_small_file,
    small_file_threshold,
):
    """
    Compute similarity between source and deployed symbols based on the
    Jensen-Shannon distance and return whether they match based on the
    provided thresholds.

    Return an ``(is_match, similarity)`` tuple where ``similarity`` is
    ``1 - distance``. ``matching_ratio_small_file`` is used as the threshold
    instead of ``matching_ratio`` when ``deployed_symbols`` holds at most
    ``small_file_threshold`` entries.

    When either symbol list is empty there is nothing to compare and
    ``(False, 0.0)`` is returned.
    """
    # An empty list would produce an all-zero distribution, which
    # ``jensenshannon`` normalizes into NaN values.
    if not source_symbols or not deployed_symbols:
        return False, 0.0

    unique_symbols = set(source_symbols).union(set(deployed_symbols))

    # Both distributions are built over the same ``unique_symbols`` object so
    # that their positions line up.
    source_probability_dist = get_symbols_probability_distribution(
        source_symbols, unique_symbols
    )
    deployed_probability_dist = get_symbols_probability_distribution(
        deployed_symbols, unique_symbols
    )

    # NOTE: ``jensenshannon`` is called with its default logarithm base; with
    # the natural logarithm the distance is bounded by sqrt(ln 2) (~0.83), so
    # fully disjoint inputs still score about 0.17 rather than 0.
    divergence = jensenshannon(source_probability_dist, deployed_probability_dist)
    similarity = 1 - divergence

    matching_threshold = matching_ratio
    if len(deployed_symbols) <= small_file_threshold:
        matching_threshold = matching_ratio_small_file

    is_match = similarity >= matching_threshold

    # Return plain Python types, consistent with the empty-input path.
    return bool(is_match), float(similarity)
225+
226+
227+
def match_javascript_source_symbols_to_deployed(source_symbols, deployed_symbols):
    """
    Compare source and deployed JavaScript symbols using Jensen-Shannon
    Divergence with the JavaScript-specific thresholds, and return the
    resulting ``(is_match, similarity)`` pair.
    """
    javascript_thresholds = {
        "matching_ratio": MATCHING_RATIO_JAVASCRIPT,
        "matching_ratio_small_file": MATCHING_RATIO_JAVASCRIPT_SMALL_FILE,
        "small_file_threshold": SMALL_FILE_SYMBOLS_THRESHOLD_JAVASCRIPT,
    }
    matched, score = get_similarity_between_source_and_deployed_symbols(
        source_symbols=source_symbols,
        deployed_symbols=deployed_symbols,
        **javascript_thresholds,
    )
    return matched, score

scanpipe/pipes/symbols.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def collect_and_store_tree_sitter_symbols_and_strings(
139139
if logger:
140140
logger(
141141
f"Getting source symbols and strings from {resources_count:,d}"
142-
" from/ resources using tree-sitter."
142+
" resources using tree-sitter."
143143
)
144144

145145
resource_iterator = resources.iterator(chunk_size=2000)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
about_resource: cesium
2+
name: cesium
3+
version: 1.125
4+
download_url: https://github.com/CesiumGS/cesium/archive/refs/tags/1.125.zip
5+
homepage_url: https://github.com/CesiumGS/cesium
6+
license_expression: apache-2.0
7+
attribute: yes
8+
package_url: pkg:github/CesiumGS/cesium@1.125

0 commit comments

Comments
 (0)