Skip to content

Commit eec8b12

Browse files
authored
Addon pipeline for symbol collection (#1116)
* Add addon pipeline for symbol collection Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Update dockerfile to install universal-ctags Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Update docs to include Universal Ctags as system dependency Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Add test for collect_symbols pipeline Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Update CI to install Universal Ctags Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Use source-inspector for symbol collection Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Bump source-inspector to v0.2.0 Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Add CollectSymbols in built-in-pipelines doc Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Use proper ref for CollectSymbols pipeline Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Move symbol collection to new pipe Signed-off-by: Keshav Priyadarshi <git@keshav.space> * Add CHANGELOG for CollectSymbols pipeline Signed-off-by: Keshav Priyadarshi <git@keshav.space> --------- Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent 63b78de commit eec8b12

File tree

11 files changed

+310
-0
lines changed

11 files changed

+310
-0
lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ jobs:
4242
with:
4343
python-version: ${{ matrix.python-version }}
4444

45+
- name: Install universal ctags
46+
run: sudo apt-get install -y universal-ctags
47+
4548
- name: Install dependencies
4649
run: make dev envfile
4750

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ v34.1.0 (unreleased)
1111
The docstrings are converted from Markdown to HTML for proper rendering.
1212
https://github.com/nexB/scancode.io/pull/1105
1313

14+
- Add a new `CollectSymbols` pipeline (addon) for collecting codebase symbols using
15+
Universal Ctags.
16+
https://github.com/nexB/scancode.io/pull/1116
17+
1418
v34.0.0 (2024-03-04)
1519
--------------------
1620

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ ENV PYTHONPATH $PYTHONPATH:$APP_DIR
4040

4141
# OS requirements as per
4242
# https://scancode-toolkit.readthedocs.io/en/latest/getting-started/install.html
43+
# Also install universal-ctags for symbol collection.
4344
RUN apt-get update \
4445
&& apt-get install -y --no-install-recommends \
4546
bzip2 \
@@ -58,6 +59,7 @@ RUN apt-get update \
5859
linux-image-amd64 \
5960
git \
6061
wait-for-it \
62+
universal-ctags \
6163
&& apt-get clean \
6264
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
6365

docs/built-in-pipelines.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ Analyse Docker Windows Image
4242
:members:
4343
:member-order: bysource
4444

45+
.. _pipeline_collect_symbols:
46+
47+
Collect Codebase Symbols (addon)
48+
---------------------------------
49+
.. autoclass:: scanpipe.pipelines.collect_symbols.CollectSymbols()
50+
:members:
51+
:member-order: bysource
52+
4553
.. _pipeline_find_vulnerabilities:
4654

4755
Find Vulnerabilities (addon)

docs/installation.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,15 @@ Make sure those are installed before attempting the ScanCode.io installation::
260260
See also `ScanCode-toolkit Prerequisites <https://scancode-toolkit.readthedocs.io/en/
261261
latest/getting-started/install.html#prerequisites>`_ for more details.
262262

263+
For the :ref:`pipeline_collect_symbols` pipeline, `Universal Ctags <https://github.com/universal-ctags/ctags>`_ is needed.
264+
On **Linux** install it using::
265+
266+
sudo apt-get install universal-ctags
267+
268+
On **macOS** install Universal Ctags using Homebrew::
269+
270+
brew install universal-ctags
271+
263272
Clone and Configure
264273
^^^^^^^^^^^^^^^^^^^
265274

scanpipe/pipelines/collect_symbols.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/nexB/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import symbols
25+
26+
27+
class CollectSymbols(Pipeline):
    """Collect symbols from codebase files and keep them in the extra data field."""

    download_inputs = False
    is_addon = True

    @classmethod
    def steps(cls):
        """Return the ordered steps of this pipeline (a single step)."""
        pipeline_steps = (cls.collect_and_store_resource_symbols,)
        return pipeline_steps

    def collect_and_store_resource_symbols(self):
        """
        Collect symbols from the project codebase files using Ctags and
        store them in the extra data field of each resource.
        """
        symbols.collect_and_store_resource_symbols(self.project, self.log)

scanpipe/pipes/symbols.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/nexB/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/scancode.io for support and download.
22+
23+
from source_inspector import symbols_ctags
24+
25+
from scanpipe.pipes import LoopProgress
26+
27+
28+
class UniversalCtagsNotFound(Exception):
    """Raised when symbol collection is requested but Ctags is not installed."""
30+
31+
32+
def collect_and_store_resource_symbols(project, logger=None):
    """
    Collect symbols from the ``project`` codebase files using Ctags and store
    them in the extra data field of each resource.

    Only file resources that are not flagged as binary, archive, or media are
    processed. The optional ``logger`` is handed to ``LoopProgress`` to report
    progress while iterating over the resources.

    Raise ``UniversalCtagsNotFound`` when ``symbols_ctags`` reports that Ctags
    is not installed.
    """
    if not symbols_ctags.is_ctags_installed():
        # Adjacent string literals are concatenated verbatim: the trailing space
        # keeps the two sentences from running together in the message.
        raise UniversalCtagsNotFound(
            "``Universal Ctags`` not found. "
            "Install ``Universal Ctags`` to use this pipeline."
        )

    project_files = project.codebaseresources.files()

    resources = project_files.filter(
        is_binary=False,
        is_archive=False,
        is_media=False,
    )

    resources_count = resources.count()

    # Fetch the resources in chunks instead of loading the whole QuerySet.
    resource_iterator = resources.iterator(chunk_size=2000)
    progress = LoopProgress(resources_count, logger)

    for resource in progress.iter(resource_iterator):
        _collect_and_store_resource_symbols(resource)
58+
59+
60+
def _collect_and_store_resource_symbols(resource):
    """
    Run Ctags on a single ``resource`` and save the names of the collected
    symbols under the ``source_symbols`` key of its extra data.
    """
    tags = []
    for entry in symbols_ctags.collect_symbols(resource.location):
        # Entries without a "name" key are skipped.
        if "name" in entry:
            tags.append(entry["name"])

    resource.update_extra_data({"source_symbols": tags})
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
{
2+
"headers": [
3+
{
4+
"tool_name": "scanpipe",
5+
"notice": "Generated with ScanCode.io and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied.\nNo content created from ScanCode.io should be considered or used as legal advice.\nConsult an Attorney for any legal advice.\nScanCode.io is a free software code scanning tool from nexB Inc. and others\nlicensed under the Apache License version 2.0.\nScanCode is a trademark of nexB Inc.\nVisit https://github.com/nexB/scancode.io for support and download.\n",
6+
"input_sources": [],
7+
"runs": [
8+
{
9+
"pipeline_name": "collect_symbols",
10+
"status": "not_started",
11+
"scancodeio_version": "",
12+
"task_id": null,
13+
"task_start_date": null,
14+
"task_end_date": null,
15+
"task_exitcode": null,
16+
"task_output": "",
17+
"execution_time": null
18+
}
19+
],
20+
"extra_data": {}
21+
}
22+
],
23+
"packages": [],
24+
"dependencies": [],
25+
"files": [
26+
{
27+
"path": "codefile",
28+
"type": "directory",
29+
"name": "codefile",
30+
"status": "",
31+
"tag": "",
32+
"extension": "",
33+
"md5": "",
34+
"sha1": "",
35+
"sha256": "",
36+
"sha512": "",
37+
"programming_language": "",
38+
"is_binary": false,
39+
"is_text": false,
40+
"is_archive": false,
41+
"is_media": false,
42+
"is_key_file": false,
43+
"detected_license_expression": "",
44+
"detected_license_expression_spdx": "",
45+
"license_detections": [],
46+
"license_clues": [],
47+
"percentage_of_license_text": null,
48+
"copyrights": [],
49+
"holders": [],
50+
"authors": [],
51+
"package_data": [],
52+
"for_packages": [],
53+
"emails": [],
54+
"urls": [],
55+
"extra_data": {}
56+
},
57+
{
58+
"path": "codefile/main.js",
59+
"type": "file",
60+
"name": "main.js",
61+
"status": "",
62+
"tag": "",
63+
"extension": ".js",
64+
"md5": "5a6e6fa1e732b600d4c2260bc49ed73f",
65+
"sha1": "d6bfcf7d1f8a00cc639b3a186a52453d37c52f61",
66+
"sha256": "adf540c42cfd6b8413d7232fcd6e5df39fa990be6f280531f9ca05d92c6bc0d6",
67+
"sha512": "",
68+
"programming_language": "JavaScript",
69+
"is_binary": false,
70+
"is_text": true,
71+
"is_archive": false,
72+
"is_media": false,
73+
"is_key_file": false,
74+
"detected_license_expression": "",
75+
"detected_license_expression_spdx": "",
76+
"license_detections": [],
77+
"license_clues": [],
78+
"percentage_of_license_text": null,
79+
"copyrights": [],
80+
"holders": [],
81+
"authors": [],
82+
"package_data": [],
83+
"for_packages": [],
84+
"emails": [],
85+
"urls": [],
86+
"extra_data": {
87+
"source_symbols": [
88+
"passwordLength",
89+
"generatePassword",
90+
"charSet"
91+
]
92+
}
93+
}
94+
],
95+
"relations": []
96+
}

scanpipe/tests/pipes/test_symbols.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/nexB/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/scancode.io for support and download.
22+
23+
from pathlib import Path
24+
25+
from django.test import TestCase
26+
27+
from scanpipe import pipes
28+
from scanpipe.models import Project
29+
from scanpipe.pipes import symbols
30+
from scanpipe.pipes.input import copy_input
31+
32+
33+
class ScanPipeSymbolsPipesTest(TestCase):
    """Tests for the ``scanpipe.pipes.symbols`` pipe functions."""

    data_location = Path(__file__).parent.parent / "data"

    def setUp(self):
        self.project1 = Project.objects.create(name="Analysis")

    def test_scanpipe_pipes_symbols_collect_and_store_resource_symbols(self):
        """Symbols of a JavaScript file are stored under ``source_symbols``."""
        # ``codebase_dir`` rather than ``dir``, which would shadow the builtin.
        codebase_dir = self.project1.codebase_path / "codefile"
        codebase_dir.mkdir(parents=True)

        file_location = self.data_location / "d2d-javascript" / "from" / "main.js"
        copy_input(file_location, codebase_dir)

        pipes.collect_and_create_codebase_resources(self.project1)

        symbols.collect_and_store_resource_symbols(self.project1)

        main_file = self.project1.codebaseresources.files()[0]
        result_extra_data_symbols = main_file.extra_data.get("source_symbols")
        expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"]
        # Order of the collected symbols is not asserted, only their content.
        self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols)

scanpipe/tests/test_pipelines.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,3 +1211,26 @@ def mock_request_post_return(url, data, headers, timeout):
12111211
)
12121212
self.assertIn("1 PURLs were already present in PurlDB index queue", run.log)
12131213
self.assertIn("Couldn't index 1 unsupported PURLs", run.log)
1214+
1215+
def test_scanpipe_collect_symbols_pipeline_integration(self):
    """Run the ``collect_symbols`` pipeline on a project with one JS file."""
    pipeline_name = "collect_symbols"
    project1 = Project.objects.create(name="Analysis")

    # ``codebase_dir`` rather than ``dir``, which would shadow the builtin.
    codebase_dir = project1.codebase_path / "codefile"
    codebase_dir.mkdir(parents=True)

    file_location = self.data_location / "d2d-javascript" / "from" / "main.js"
    copy_input(file_location, codebase_dir)

    pipes.collect_and_create_codebase_resources(project1)

    run = project1.add_pipeline(pipeline_name)
    pipeline = run.make_pipeline_instance()

    exitcode, out = pipeline.execute()
    self.assertEqual(0, exitcode, msg=out)

    main_file = project1.codebaseresources.files()[0]
    result_extra_data_symbols = main_file.extra_data.get("source_symbols")
    expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"]
    # Order of the collected symbols is not asserted, only their content.
    self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols)

0 commit comments

Comments
 (0)