Skip to content

Commit 77988dc

Browse files
Add functions to create fingerprints from stemmed code, highlight code snippet matches from matchcode (#1656)
* Add scanner to compute stem code fingerprint
  Signed-off-by: Keshav Priyadarshi <git@keshav.space>
* Bump matchcode-toolkit
  Signed-off-by: Keshav Priyadarshi <git@keshav.space>
* Update function names
  Signed-off-by: Jono Yang <jyang@nexb.com>
* Update matchcode-toolkit to 7.2.1
  Signed-off-by: Jono Yang <jyang@nexb.com>
* Use matchcode-toolkit branch
  * there is a bug in the code stemming function
  Signed-off-by: Jono Yang <jyang@nexb.com>
* Use matchcode-toolkit 7.2.2
  Signed-off-by: Jono Yang <jyang@nexb.com>
* Add test for fingerprint_stemmed_codebase_resources
  Signed-off-by: Jono Yang <jyang@nexb.com>
* Create test for get_matched_snippet_annotations
  * Update test js file
  Signed-off-by: Jono Yang <jyang@nexb.com>
* Update CHANGELOG.rst
  Signed-off-by: Jono Yang <jyang@nexb.com>
---------
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
Signed-off-by: Jono Yang <jyang@nexb.com>
Co-authored-by: Keshav Priyadarshi <git@keshav.space>
1 parent 3ce578d commit 77988dc

File tree

8 files changed

+471
-1
lines changed

8 files changed

+471
-1
lines changed

CHANGELOG.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ v34.10.2 (unreleased)
88
Use the UUID for the DiscoveredDependency spdx_id for better SPDX compatibility.
99
https://github.com/aboutcode-org/scancode.io/issues/1651
1010

11+
- Add MatchCode-specific functions to compute fingerprints from stemmed code
12+
files. Update CodebaseResource file content view to display snippet matches,
13+
if available, when the codebase has been sent for matching to MatchCode.
14+
https://github.com/aboutcode-org/scancode.io/pull/1656
15+
1116
v34.10.1 (2025-03-26)
1217
---------------------
1318

scanpipe/pipes/matchcode.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
import requests
2929
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
3030
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
31+
from matchcode_toolkit.fingerprinting import get_line_by_pos
32+
from matchcode_toolkit.fingerprinting import get_stemmed_file_fingerprint_hashes
3133
from scancode import Scanner
3234

3335
from scanpipe.pipes import codebase
@@ -254,6 +256,48 @@ def fingerprint_codebase_resources(
254256
)
255257

256258

259+
def fingerprint_stemmed_codebase_resource(location, with_threading=True, **kwargs):
    """
    Run the stemmed-code fingerprint scanner on the resource at `location`
    through the scancode-toolkit direct API.

    A single "stemmed_fingerprints" scanner, backed by
    `get_stemmed_file_fingerprint_hashes`, is handed to `_scan_resource`
    together with the `with_threading` flag. Any extra keyword arguments are
    accepted for call compatibility but are not used here.

    Return a dictionary of scan `results` and a list of `errors`.
    """
    stemmed_scanner = Scanner(
        "stemmed_fingerprints", get_stemmed_file_fingerprint_hashes
    )
    return _scan_resource(location, [stemmed_scanner], with_threading=with_threading)
270+
271+
272+
def fingerprint_stemmed_codebase_resources(
    project, resource_qs=None, progress_logger=None, to_codebase_only=False
):
    """
    Compute stemmed code fingerprints for the resources of `project`.

    The resulting fingerprints are used for matching purposes on matchcode.

    Multiprocessing is enabled by default on this pipe; the number of processes
    can be controlled through the SCANCODEIO_PROCESSES setting.

    When `resource_qs` is not provided, all text resources of the project are
    selected. If `to_codebase_only` is True, only the resources from the `to/`
    codebase are fingerprinted.
    """
    # Test against None explicitly: an empty queryset supplied by the caller
    # must be honored rather than replaced by the default selection.
    selected_resources = (
        project.codebaseresources.filter(is_text=True)
        if resource_qs is None
        else resource_qs
    )

    if to_codebase_only:
        selected_resources = selected_resources.to_codebase()

    scan_resources(
        resource_qs=selected_resources,
        scan_func=fingerprint_stemmed_codebase_resource,
        save_func=save_resource_fingerprints,
        progress_logger=progress_logger,
    )
299+
300+
257301
def send_project_json_to_matchcode(
258302
project, timeout=DEFAULT_TIMEOUT, api_url=MATCHCODEIO_API_URL
259303
):
@@ -362,3 +406,14 @@ def create_packages_from_match_results(project, match_results):
362406
package_data=matched_package,
363407
status=flag.MATCHED_TO_PURLDB_PACKAGE,
364408
)
409+
match_resources = match_results.get("files", [])
410+
for match_resource in match_resources:
411+
match_resource_extra_data = match_resource["extra_data"]
412+
if match_resource_extra_data:
413+
resource = project.codebaseresources.get(path=match_resource["path"])
414+
# compute line_by_pos for displaying matches in CodebaseResource detail view
415+
with open(resource.location) as f:
416+
content = f.read()
417+
line_by_pos = get_line_by_pos(content)
418+
match_resource_extra_data["line_by_pos"] = line_by_pos
419+
resource.update_extra_data(match_resource_extra_data)
Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
{
2+
"halo1": "0000004f5cc2ec9a5ebdaa44336f53be569d6829",
3+
"snippets": [
4+
{
5+
"snippet": "24a1651c51468fb8cf1ac6c38a2c4add",
6+
"position": "0"
7+
},
8+
{
9+
"snippet": "7b1cbef763885c6856df8b15fa4e57a5",
10+
"position": 5
11+
},
12+
{
13+
"snippet": "46828d9d4a64300b1543e4e5a6356ed5",
14+
"position": 12
15+
},
16+
{
17+
"snippet": "c0496b020a8d87a3b1bf1a83c67c16d5",
18+
"position": 14
19+
},
20+
{
21+
"snippet": "b2ec716c571a0368ea37dbb7821c6945",
22+
"position": 15
23+
},
24+
{
25+
"snippet": "8dd2b57022204ecd9ea4a2471f224fd4",
26+
"position": 22
27+
},
28+
{
29+
"snippet": "cb9216ce4ad33a5d6feb378dbf0404c8",
30+
"position": 30
31+
},
32+
{
33+
"snippet": "034b634f1c726c9c0f7740ea9723637b",
34+
"position": 37
35+
},
36+
{
37+
"snippet": "d0bb8a1740512218c8e87bbaa5f5d9a6",
38+
"position": 38
39+
},
40+
{
41+
"snippet": "7ae529b13ddb3b0c74421772d78821a7",
42+
"position": 41
43+
},
44+
{
45+
"snippet": "b2aad3c6ab2c2c9ba1a95edac417aa09",
46+
"position": 42
47+
},
48+
{
49+
"snippet": "be339f1c1670b7789e83f875978c1e06",
50+
"position": 46
51+
},
52+
{
53+
"snippet": "a895f0ff2b99352b33392fda0a87a4cf",
54+
"position": 53
55+
},
56+
{
57+
"snippet": "6819c7f718a1fa7f2501009d21ee46d7",
58+
"position": 57
59+
},
60+
{
61+
"snippet": "97ecd33b1ca08589363df198458d976f",
62+
"position": 61
63+
},
64+
{
65+
"snippet": "2c73086d098f182cf8441046b97af434",
66+
"position": 64
67+
},
68+
{
69+
"snippet": "3ba6ad01d6f9130be38df14a44633abd",
70+
"position": 67
71+
}
72+
],
73+
"line_by_pos": {
74+
"0": 1,
75+
"1": 1,
76+
"2": 1,
77+
"3": 3,
78+
"4": 3,
79+
"5": 3,
80+
"6": 3,
81+
"7": 3,
82+
"8": 4,
83+
"9": 4,
84+
"10": 5,
85+
"11": 5,
86+
"12": 6,
87+
"13": 6,
88+
"14": 6,
89+
"15": 6,
90+
"16": 6,
91+
"17": 6,
92+
"18": 7,
93+
"19": 7,
94+
"20": 7,
95+
"21": 7,
96+
"22": 7,
97+
"23": 7,
98+
"24": 8,
99+
"25": 8,
100+
"26": 8,
101+
"27": 8,
102+
"28": 8,
103+
"29": 8,
104+
"30": 11,
105+
"31": 11,
106+
"32": 11,
107+
"33": 11,
108+
"34": 11,
109+
"35": 11,
110+
"36": 11,
111+
"37": 12,
112+
"38": 12,
113+
"39": 12,
114+
"40": 15,
115+
"41": 15,
116+
"42": 16,
117+
"43": 16,
118+
"44": 16,
119+
"45": 16,
120+
"46": 17,
121+
"47": 17,
122+
"48": 17,
123+
"49": 18,
124+
"50": 18,
125+
"51": 19,
126+
"52": 19,
127+
"53": 19,
128+
"54": 20,
129+
"55": 20,
130+
"56": 20,
131+
"57": 20,
132+
"58": 21,
133+
"59": 21,
134+
"60": 21,
135+
"61": 21,
136+
"62": 21,
137+
"63": 22,
138+
"64": 22,
139+
"65": 22,
140+
"66": 22,
141+
"67": 22,
142+
"68": 23,
143+
"69": 23,
144+
"70": 24,
145+
"71": 24,
146+
"72": 24,
147+
"73": 25,
148+
"74": 25,
149+
"75": 25,
150+
"76": 27,
151+
"77": 27,
152+
"78": 28,
153+
"79": 28,
154+
"80": 28,
155+
"81": 29,
156+
"82": 29
157+
},
158+
"stemmed_halo1": "000000240a64b6c8aae4625491a8aa77ffd9b2a6",
159+
"stemmed_snippets": [
160+
{
161+
"snippet": "8e5f6fead6d0469a9af967bd3b3c823c",
162+
"position": "0"
163+
},
164+
{
165+
"snippet": "3b4fb17158ed94e2babd49970af94d06",
166+
"position": 2
167+
},
168+
{
169+
"snippet": "b0607c96667235727aa1e4212e907f7b",
170+
"position": 3
171+
},
172+
{
173+
"snippet": "65aecd343e17c78db5cfca34a8a4fa02",
174+
"position": 4
175+
},
176+
{
177+
"snippet": "89a7bf1c4ead7854f274e6f41b7654da",
178+
"position": 5
179+
},
180+
{
181+
"snippet": "8c38b55be87ffec2c0b91d6085f12e69",
182+
"position": 6
183+
},
184+
{
185+
"snippet": "5e0ddfbe6eeaa0bbe00f0a3bcb4183a8",
186+
"position": 7
187+
},
188+
{
189+
"snippet": "f8a7cabd43fb2d8a40a23d83217e3d8b",
190+
"position": 8
191+
},
192+
{
193+
"snippet": "fdc4910fe720d6b9f20196d306e7aedc",
194+
"position": 9
195+
},
196+
{
197+
"snippet": "7a5ee56ca82edc1c76e0b0b9322129dd",
198+
"position": 10
199+
},
200+
{
201+
"snippet": "6b93bb4ea1623dd6946a21f99418a3fa",
202+
"position": 11
203+
},
204+
{
205+
"snippet": "8f2a211b1a10cbd28fb8f1ad21dbf5fb",
206+
"position": 12
207+
},
208+
{
209+
"snippet": "c3c82df4de85b1c9dbf69b2b5a45935c",
210+
"position": 13
211+
},
212+
{
213+
"snippet": "216e662345dd2969bff90aefdae76672",
214+
"position": 14
215+
},
216+
{
217+
"snippet": "24d9e003c332e26e2cae1263d18e0ef6",
218+
"position": 15
219+
},
220+
{
221+
"snippet": "7210020de6bfe60b69ca8ec908845a15",
222+
"position": 17
223+
},
224+
{
225+
"snippet": "667f800b10c105c2418effd6035e6763",
226+
"position": 18
227+
},
228+
{
229+
"snippet": "c18caedb3daf59b210278b2b6d1d0db5",
230+
"position": 19
231+
},
232+
{
233+
"snippet": "a19fe989f63161a76526933a34593741",
234+
"position": 20
235+
},
236+
{
237+
"snippet": "f782389ac40b56bc81a7c92f40d87a83",
238+
"position": 21
239+
},
240+
{
241+
"snippet": "4ed61cd372dcc7d88c95d899271fd138",
242+
"position": 22
243+
},
244+
{
245+
"snippet": "e9c74c50192eb95bc4595254fc253427",
246+
"position": 23
247+
},
248+
{
249+
"snippet": "5a908af743b549f1f0ef8ab02c9053eb",
250+
"position": 24
251+
}
252+
],
253+
"matched_snippets": [
254+
{
255+
"package": "pkg:github/isaacs/inherits@v2.0.3",
256+
"resource": "inherits-2.0.3/inherits.js",
257+
"similarity": "1.0",
258+
"match_detections": [
259+
0,
260+
1,
261+
2,
262+
3,
263+
4,
264+
5,
265+
6,
266+
7,
267+
8,
268+
9,
269+
10,
270+
11,
271+
12,
272+
13,
273+
14,
274+
15
275+
]
276+
}
277+
]
278+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
module.exports = inherits
2+
3+
function inherits (c, p, proto) {
4+
proto = proto || {}
5+
var e = {}
6+
;[c.prototype, proto].forEach(function (s) {
7+
Object.getOwnPropertyNames(s).forEach(function (k) {
8+
e[k] = Object.getOwnPropertyDescriptor(s, k)
9+
})
10+
})
11+
c.prototype = Object.create(p.prototype, e)
12+
c.super = p
13+
}
14+
15+
//function Child () {
16+
// Child.super.call(this)
17+
// console.error([this
18+
// ,this.constructor
19+
// ,this.constructor === Child
20+
// ,this.constructor.super === Parent
21+
// ,Object.getPrototypeOf(this) === Child.prototype
22+
// ,Object.getPrototypeOf(Object.getPrototypeOf(this))
23+
// === Parent.prototype
24+
// ,this instanceof Child
25+
// ,this instanceof Parent])
26+
//}
27+
//function Parent () {}
28+
//inherits(Child, Parent)
29+
//new Child

0 commit comments

Comments
 (0)