Skip to content

Commit 0bdb8d3

Browse files
authored
Addon pipeline to collect pygments symbols (#1179)
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent ee313f2 commit 0bdb8d3

File tree

10 files changed

+249
-1
lines changed

10 files changed

+249
-1
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ v34.5.0 (unreleased)
1111
datafile_resource fields do not have a value.
1212
https://github.com/nexB/scancode.io/issues/1177
1313

14+
- Add a new `CollectPygmentsSymbolsAndStrings` pipeline (addon) for collecting source
15+
symbols, strings and comments using Pygments.
16+
https://github.com/nexB/scancode.io/pull/1179
17+
1418
v34.4.0 (2024-04-22)
1519
--------------------
1620

docs/built-in-pipelines.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ Analyse Docker Windows Image
4242
:members:
4343
:member-order: bysource
4444

45+
.. _collect_pygments_symbols:
46+
47+
Collect Pygments Source Symbols (addon)
48+
---------------------------------------
49+
.. autoclass:: scanpipe.pipelines.collect_pygments_symbols.CollectPygmentsSymbolsAndStrings()
50+
:members:
51+
:member-order: bysource
52+
4553
.. _pipeline_collect_source_strings:
4654

4755
Collect Source Strings (addon)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/nexB/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import symbols
25+
26+
27+
class CollectPygmentsSymbolsAndStrings(Pipeline):
    """
    Collect source symbols, strings and comments from the codebase using Pygments.

    The collected values are stored in the extra data field of each resource.
    """

    # Pipeline configuration flags; their semantics are defined by the base
    # ``Pipeline`` class.
    download_inputs = False
    is_addon = True

    @classmethod
    def steps(cls):
        """Return the steps of this pipeline: a single collection step."""
        pipeline_steps = (cls.collect_and_store_pygments_symbols_and_strings,)
        return pipeline_steps

    def collect_and_store_pygments_symbols_and_strings(self):
        """
        Collect symbols, strings and comments from the codebase files using
        Pygments and store them in the extra data field of each resource.
        """
        # All the work is delegated to the ``symbols`` pipe; ``self.log`` is
        # handed over as the logger used for progress reporting.
        symbols.collect_and_store_pygments_symbols_and_strings(
            project=self.project,
            logger=self.log,
        )
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/nexB/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import symbols
25+
26+
27+
class CollectTreeSitterSymbolsAndStrings(Pipeline):
    """
    Collect source symbols, strings and comments from the codebase using
    tree-sitter.

    The collected values are kept in the extra data field.
    """

    # Pipeline configuration flags; their semantics are defined by the base
    # ``Pipeline`` class.
    download_inputs = False
    is_addon = True

    @classmethod
    def steps(cls):
        """Return the steps of this pipeline: a single collection step."""
        pipeline_steps = (cls.collect_and_store_tree_sitter_symbols_and_strings,)
        return pipeline_steps

    def collect_and_store_tree_sitter_symbols_and_strings(self):
        """
        Collect symbols, strings and comments from the codebase files using
        tree-sitter and store them in the extra data field.
        """
        # All the work is delegated to the ``symbols`` pipe, which receives the
        # project and this pipeline's ``log`` callable.
        project = self.project
        log = self.log
        symbols.collect_and_store_tree_sitter_symbols_and_strings(project, log)

scanpipe/pipes/symbols.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# Visit https://github.com/nexB/scancode.io for support and download.
2222

2323
from source_inspector import symbols_ctags
24+
from source_inspector import symbols_pygments
2425

2526
from scanpipe.pipes import LoopProgress
2627

@@ -65,3 +66,40 @@ def _collect_and_store_resource_symbols(resource):
6566
symbols = symbols_ctags.collect_symbols(resource.location)
6667
tags = [symbol["name"] for symbol in symbols if "name" in symbol]
6768
resource.update_extra_data({"source_symbols": tags})
69+
70+
71+
def collect_and_store_pygments_symbols_and_strings(project, logger=None):
    """
    Collect symbols, strings and comments with pygments for the file resources
    of ``project`` and store them in the extra data field of each resource.

    Only file resources that are not flagged as binary, archive or media are
    processed. ``logger`` is passed to ``LoopProgress`` for progress reporting.
    """
    text_files = project.codebaseresources.files().filter(
        is_binary=False,
        is_archive=False,
        is_media=False,
    )

    # The total is computed up front so that LoopProgress knows the loop size.
    progress = LoopProgress(text_files.count(), logger)

    # Stream the resources in chunks rather than loading them all at once.
    for text_file in progress.iter(text_files.iterator(chunk_size=2000)):
        _collect_and_store_pygments_symbols_and_strings(text_file)
91+
92+
93+
def _collect_and_store_pygments_symbols_and_strings(resource):
    """
    Run the pygments collector on the file at ``resource.location`` and store
    the symbols, strings and comments it reports in the resource extra data.

    A key missing from the collector result is stored as ``None``.
    """
    collected = symbols_pygments.get_pygments_symbols(resource.location)
    extra_data_keys = ("source_symbols", "source_strings", "source_comments")
    resource.update_extra_data({key: collected.get(key) for key in extra_data_keys})
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#include <stdio.h>
2+
3+
// Function to add two integers
4+
int add(int a, int b) {
5+
return a + b;
6+
}
7+
8+
// Function to subtract two integers
9+
int subtract(int a, int b) {
10+
return a - b;
11+
}
12+
13+
int main() {
14+
int x = 10;
15+
int y = 5;
16+
17+
printf("Testing dummy functions:\n");
18+
19+
printf("Addition: %d + %d = %d\n", x, y, add(x, y));
20+
printf("Subtraction: %d - %d = %d\n", x, y, subtract(x, y));
21+
22+
return 0;
23+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
{
2+
"source_strings": [
3+
"10",
4+
"5",
5+
"\"",
6+
"Testing dummy functions:",
7+
"\\n",
8+
"\"",
9+
"\"",
10+
"Addition: %d + %d = %d",
11+
"\\n",
12+
"\"",
13+
"\"",
14+
"Subtraction: %d - %d = %d",
15+
"\\n",
16+
"\"",
17+
"0"
18+
],
19+
"source_symbols": [
20+
"add",
21+
"subtract",
22+
"main"
23+
],
24+
"source_comments": []
25+
}

scanpipe/tests/pipes/test_symbols.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/nexB/scancode.io for support and download.
2222

23+
import json
2324
import sys
2425
from pathlib import Path
2526
from unittest import skipIf
@@ -54,3 +55,27 @@ def test_scanpipe_pipes_symbols_collect_and_store_resource_symbols(self):
5455
result_extra_data_symbols = main_file.extra_data.get("source_symbols")
5556
expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"]
5657
self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols)
58+
59+
def test_scanpipe_pipes_collect_and_store_pygments_symbols_and_strings(self):
    # Put the C++ fixture in the project codebase and index it as a resource.
    codefile_dir = self.project1.codebase_path / "codefile"
    codefile_dir.mkdir(parents=True)

    fixtures_dir = self.data_location / "source-inspector"
    copy_input(fixtures_dir / "test3.cpp", codefile_dir)
    pipes.collect_and_create_codebase_resources(self.project1)

    symbols.collect_and_store_pygments_symbols_and_strings(self.project1)

    main_file = self.project1.codebaseresources.files()[0]
    result_extra_data = main_file.extra_data

    # The whole extra data dict must match the stored expectation.
    expected_file = fixtures_dir / "test3.cpp-pygments-expected.json"
    with open(expected_file) as f:
        expected_extra_data = json.load(f)

    self.assertDictEqual(expected_extra_data, result_extra_data)

scanpipe/tests/test_pipelines.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1267,3 +1267,33 @@ def test_scanpipe_collect_source_strings_pipeline_integration(self):
12671267
"Enter the desired length of your password:",
12681268
]
12691269
self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings)
1270+
1271+
def test_scanpipe_collect_pygments_symbols_pipeline_integration(self):
    pipeline_name = "collect_pygments_symbols"
    project1 = Project.objects.create(name="Analysis")

    # Put the C++ fixture in the project codebase and index it as a resource.
    codefile_dir = project1.codebase_path / "codefile"
    codefile_dir.mkdir(parents=True)

    fixtures_dir = self.data_location / "source-inspector"
    copy_input(fixtures_dir / "test3.cpp", codefile_dir)
    pipes.collect_and_create_codebase_resources(project1)

    # Run the add-on pipeline end to end; a non-zero exit code is a failure.
    run = project1.add_pipeline(pipeline_name)
    pipeline = run.make_pipeline_instance()
    exitcode, out = pipeline.execute()
    self.assertEqual(0, exitcode, msg=out)

    main_file = project1.codebaseresources.files()[0]
    result_extra_data = main_file.extra_data

    # The whole extra data dict must match the stored expectation.
    expected_file = fixtures_dir / "test3.cpp-pygments-expected.json"
    with open(expected_file) as f:
        expected_extra_data = json.load(f)

    self.assertDictEqual(expected_extra_data, result_extra_data)

setup.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ install_requires =
8181
elf-inspector==0.0.1
8282
go-inspector==0.2.2
8383
python-inspector==0.12.0
84-
source-inspector==0.3.0
84+
source-inspector==0.5.0
8585
aboutcode-toolkit==10.1.0
8686
# Utilities
8787
XlsxWriter==3.2.0
@@ -132,6 +132,7 @@ scancodeio_pipelines =
132132
analyze_docker_image = scanpipe.pipelines.docker:Docker
133133
analyze_root_filesystem_or_vm_image = scanpipe.pipelines.root_filesystem:RootFS
134134
analyze_windows_docker_image = scanpipe.pipelines.docker_windows:DockerWindows
135+
collect_pygments_symbols = scanpipe.pipelines.collect_pygments_symbols:CollectPygmentsSymbolsAndStrings
135136
collect_source_strings = scanpipe.pipelines.collect_source_strings:CollectSourceStrings
136137
collect_symbols = scanpipe.pipelines.collect_symbols:CollectSymbols
137138
find_vulnerabilities = scanpipe.pipelines.find_vulnerabilities:FindVulnerabilities

0 commit comments

Comments
 (0)