Skip to content

Commit 58a45cc

Browse files
authored
Addon pipeline to collect tree-sitter symbols (#1181)
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent 1bd0e4d commit 58a45cc

File tree

8 files changed

+152
-2
lines changed

8 files changed

+152
-2
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ v34.5.0 (unreleased)
1919
SBOMs that contain properties with no values.
2020
https://github.com/nexB/scancode.io/issues/1185
2121

22+
- Add a new `CollectTreeSitterSymbolsAndStrings` pipeline (addon) for collecting source
23+
symbols and strings using tree-sitter.
24+
https://github.com/nexB/scancode.io/pull/1181
25+
2226
v34.4.0 (2024-04-22)
2327
--------------------
2428

docs/built-in-pipelines.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,14 @@ Collect Codebase Symbols (addon)
6666
:members:
6767
:member-order: bysource
6868

69+
.. _collect_tree_sitter_symbols:
70+
71+
Collect Tree-Sitter Source Symbols (addon)
72+
------------------------------------------
73+
.. autoclass:: scanpipe.pipelines.collect_tree_sitter_symbols.CollectTreeSitterSymbolsAndStrings()
74+
:members:
75+
:member-order: bysource
76+
6977
.. _pipeline_find_vulnerabilities:
7078

7179
Find Vulnerabilities (addon)

scanpipe/pipelines/collect_tree_sitter_symbols.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class CollectTreeSitterSymbolsAndStrings(Pipeline):
2828
"""
2929
Collect codebase symbols using tree-sitter and keep them in extra data field.
3030
31-
Also collect strings and comments.
31+
Also collect strings.
3232
"""
3333

3434
download_inputs = False
@@ -40,7 +40,7 @@ def steps(cls):
4040

4141
def collect_and_store_tree_sitter_symbols_and_strings(self):
4242
"""
43-
Collect symbols, strings and comments from codebase files using tree-sitter
43+
Collect symbols and strings from codebase files using tree-sitter
4444
and store them in the extra data field.
4545
"""
4646
symbols.collect_and_store_tree_sitter_symbols_and_strings(

scanpipe/pipes/symbols.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,11 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/nexB/scancode.io for support and download.
2222

23+
from django.db.models import Q
24+
2325
from source_inspector import symbols_ctags
2426
from source_inspector import symbols_pygments
27+
from source_inspector import symbols_tree_sitter
2528

2629
from scanpipe.pipes import LoopProgress
2730

@@ -103,3 +106,44 @@ def _collect_and_store_pygments_symbols_and_strings(resource):
103106
"source_comments": result.get("source_comments"),
104107
}
105108
)
109+
110+
111+
def collect_and_store_tree_sitter_symbols_and_strings(project, logger=None):
112+
"""
113+
Collect symbols and strings from codebase files using tree-sitter and store
114+
them in the extra data field.
115+
"""
116+
project_files = project.codebaseresources.files()
117+
118+
language_qs = Q()
119+
120+
for language in symbols_tree_sitter.TS_LANGUAGE_WHEELS.keys():
121+
language_qs |= Q(programming_language__iexact=language)
122+
123+
resources = project_files.filter(
124+
is_binary=False,
125+
is_archive=False,
126+
is_media=False,
127+
).filter(language_qs)
128+
129+
resources_count = resources.count()
130+
131+
resource_iterator = resources.iterator(chunk_size=2000)
132+
progress = LoopProgress(resources_count, logger)
133+
134+
for resource in progress.iter(resource_iterator):
135+
_collect_and_store_tree_sitter_symbols_and_strings(resource)
136+
137+
138+
def _collect_and_store_tree_sitter_symbols_and_strings(resource):
139+
"""
140+
Collect symbols and strings from a resource using tree-sitter and store
141+
them in the extra data field.
142+
"""
143+
result = symbols_tree_sitter.get_treesitter_symbols(resource.location)
144+
resource.update_extra_data(
145+
{
146+
"source_symbols": result.get("source_symbols"),
147+
"source_strings": result.get("source_strings"),
148+
}
149+
)
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
{
2+
"source_strings": [
3+
"Testing dummy functions:\n",
4+
"Addition: %d + %d = %d\n",
5+
"Subtraction: %d - %d = %d\n"
6+
],
7+
"source_symbols": [
8+
"add",
9+
"a",
10+
"b",
11+
"a",
12+
"b",
13+
"subtract",
14+
"a",
15+
"b",
16+
"a",
17+
"b",
18+
"main",
19+
"x",
20+
"y",
21+
"printf",
22+
"printf",
23+
"x",
24+
"y",
25+
"add",
26+
"x",
27+
"y",
28+
"printf",
29+
"x",
30+
"y",
31+
"subtract",
32+
"x",
33+
"y"
34+
]
35+
}

scanpipe/tests/pipes/test_symbols.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,29 @@ def test_scanpipe_pipes_collect_and_store_pygments_symbols_and_strings(self):
7979
expected_extra_data = json.load(f)
8080

8181
self.assertDictEqual(expected_extra_data, result_extra_data)
82+
83+
def test_scanpipe_pipes_collect_and_store_tree_sitter_symbols_and_strings(self):
84+
dir = self.project1.codebase_path / "codefile"
85+
dir.mkdir(parents=True)
86+
87+
file_location = self.data_location / "source-inspector" / "test3.cpp"
88+
copy_input(file_location, dir)
89+
90+
pipes.collect_and_create_codebase_resources(self.project1)
91+
92+
symbols.collect_and_store_tree_sitter_symbols_and_strings(self.project1)
93+
94+
main_file = self.project1.codebaseresources.files()[0]
95+
96+
result_extra_data = main_file.extra_data
97+
98+
expected_extra_data = (
99+
self.data_location
100+
/ "source-inspector"
101+
/ "test3.cpp-tree-sitter-expected.json"
102+
)
103+
104+
with open(expected_extra_data) as f:
105+
expected_extra_data = json.load(f)
106+
107+
self.assertDictEqual(expected_extra_data, result_extra_data)

scanpipe/tests/test_pipelines.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,3 +1297,35 @@ def test_scanpipe_collect_pygments_symbols_pipeline_integration(self):
12971297
expected_extra_data = json.load(f)
12981298

12991299
self.assertDictEqual(expected_extra_data, result_extra_data)
1300+
1301+
def test_scanpipe_collect_tree_sitter_symbols_pipeline_integration(self):
1302+
pipeline_name = "collect_tree_sitter_symbols"
1303+
project1 = Project.objects.create(name="Analysis")
1304+
1305+
dir = project1.codebase_path / "codefile"
1306+
dir.mkdir(parents=True)
1307+
1308+
file_location = self.data_location / "source-inspector" / "test3.cpp"
1309+
copy_input(file_location, dir)
1310+
1311+
pipes.collect_and_create_codebase_resources(project1)
1312+
1313+
run = project1.add_pipeline(pipeline_name)
1314+
pipeline = run.make_pipeline_instance()
1315+
1316+
exitcode, out = pipeline.execute()
1317+
self.assertEqual(0, exitcode, msg=out)
1318+
1319+
main_file = project1.codebaseresources.files()[0]
1320+
result_extra_data = main_file.extra_data
1321+
1322+
expected_extra_data = (
1323+
self.data_location
1324+
/ "source-inspector"
1325+
/ "test3.cpp-tree-sitter-expected.json"
1326+
)
1327+
1328+
with open(expected_extra_data) as f:
1329+
expected_extra_data = json.load(f)
1330+
1331+
self.assertDictEqual(expected_extra_data, result_extra_data)

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ scancodeio_pipelines =
135135
collect_pygments_symbols = scanpipe.pipelines.collect_pygments_symbols:CollectPygmentsSymbolsAndStrings
136136
collect_source_strings = scanpipe.pipelines.collect_source_strings:CollectSourceStrings
137137
collect_symbols = scanpipe.pipelines.collect_symbols:CollectSymbols
138+
collect_tree_sitter_symbols = scanpipe.pipelines.collect_tree_sitter_symbols:CollectTreeSitterSymbolsAndStrings
138139
find_vulnerabilities = scanpipe.pipelines.find_vulnerabilities:FindVulnerabilities
139140
inspect_elf_binaries = scanpipe.pipelines.inspect_elf_binaries:InspectELFBinaries
140141
inspect_packages = scanpipe.pipelines.inspect_packages:InspectPackages

0 commit comments

Comments
 (0)