19
19
from git .raw_commit import RawCommit
20
20
from log .logger import logger
21
21
from stats .execution import execution_statistics , measure_execution_time
22
- from util .lsh import (
23
- build_lsh_index ,
24
- compute_minhash ,
25
- encode_minhash ,
26
- get_encoded_minhash ,
27
- )
22
+ from util .lsh import get_encoded_minhash
28
23
29
24
# GIT_CACHE = os.getenv("GIT_CACHE")
30
25
GITHUB_TOKEN = os .getenv ("GITHUB_TOKEN" )
@@ -118,7 +113,6 @@ def __init__(
118
113
self .shallow_clone = shallow
119
114
self .exec = Exec (workdir = self .path )
120
115
self .storage = None
121
- # self.lsh_index = build_lsh_index()
122
116
123
117
def execute (self , cmd : str , silent : bool = False ):
124
118
return self .exec .run (cmd , silent = silent , cache = True )
@@ -202,7 +196,7 @@ def clone(self, shallow=None, skip_existing=False):
202
196
else :
203
197
logger .debug (f"Found repo { self .url } in { self .path } .\n Fetching...." )
204
198
205
- self .execute ("git fetch --progress --all --tags" )
199
+ self .execute ("git fetch --progress --all --tags --force " )
206
200
return
207
201
208
202
if os .path .exists (self .path ):
@@ -239,10 +233,6 @@ def clone(self, shallow=None, skip_existing=False):
239
233
shutil .rmtree (self .path )
240
234
raise e
241
235
242
- def get_tags ():
243
- cmd = "git log --tags --format=%H - %D"
244
- pass
245
-
246
236
@measure_execution_time (execution_statistics .sub_collection ("core" ))
247
237
def create_commits (
248
238
self ,
@@ -259,33 +249,22 @@ def create_commits(
259
249
if ancestors_of is None or find_twins :
260
250
cmd += " --all"
261
251
262
- # by filtering the dates of the tags we can reduce the commit range safely (in theory)
263
252
if ancestors_of :
264
253
if not find_twins :
265
254
cmd += f" { ancestors_of } "
266
255
until = self .extract_tag_timestamp (ancestors_of )
256
+ cmd += f" --until={ until } "
267
257
# TODO: if find twins is true, we dont need the ancestors, only the timestamps
268
258
if exclude_ancestors_of :
269
259
if not find_twins :
270
260
cmd += f" ^{ exclude_ancestors_of } "
271
261
since = self .extract_tag_timestamp (exclude_ancestors_of )
272
-
273
- if since :
274
262
cmd += f" --since={ since } "
275
263
276
- if until :
277
- cmd += f" --until={ until } "
278
-
279
- # for ext in FILTERING_EXTENSIONS:
280
- # cmd += f" *.{ext}"
281
-
282
264
try :
283
265
logger .debug (cmd )
284
266
out = self .execute (cmd )
285
267
# if --all is used, we are traversing all branches and therefore we can check for twins
286
-
287
- # TODO: problem -> twins can be merge commits, same commits for different branches, not only security related fixes
288
-
289
268
return self .parse_git_output (out , find_twins , ancestors_of )
290
269
291
270
except Exception :
@@ -302,8 +281,9 @@ def parse_git_output(
302
281
if line == GIT_SEPARATOR :
303
282
if sector == 3 :
304
283
sector = 1
305
- if 0 < len (commit .changed_files ) < 100 :
284
+ if 0 < len (commit .changed_files ) < 100 and len ( commit . msg ) < 5000 :
306
285
commit .msg = commit .msg .strip ()
286
+
307
287
# TODO: should work here
308
288
# commit.set_tags(next_tag)
309
289
if find_twins :
@@ -326,7 +306,14 @@ def parse_git_output(
326
306
elif sector == 2 :
327
307
commit .msg += line + " "
328
308
elif sector == 3 and not any (
329
- x in line for x in ("test" , ".md" , "/docs/" )
309
+ x in line
310
+ for x in (
311
+ "test" ,
312
+ ".md" ,
313
+ "docs/" ,
314
+ ".meta" ,
315
+ ".utf8" ,
316
+ ) # TODO: build a list for these. If there are no . then is not relevant
330
317
):
331
318
commit .add_changed_file (line )
332
319
@@ -360,71 +347,71 @@ def get_issues(self, since=None) -> Dict[str, str]:
360
347
break
361
348
r = requests .get (query_url , params = params , headers = headers )
362
349
363
- # @measure_execution_time(execution_statistics.sub_collection("core"))
364
- def get_commits (
365
- self ,
366
- ancestors_of = None ,
367
- exclude_ancestors_of = None ,
368
- since = None ,
369
- until = None ,
370
- find_in_code = "" ,
371
- find_in_msg = "" ,
372
- ):
373
- cmd = "git log --format=%H"
374
-
375
- if ancestors_of is None :
376
- cmd += " --all"
377
-
378
- # by filtering the dates of the tags we can reduce the commit range safely (in theory)
379
- if ancestors_of :
380
- cmd += f" { ancestors_of } "
381
- until = self .extract_tag_timestamp (ancestors_of )
382
-
383
- if exclude_ancestors_of :
384
- cmd += f" ^{ exclude_ancestors_of } "
385
- since = self .extract_tag_timestamp (exclude_ancestors_of )
386
-
387
- if since :
388
- cmd += f" --since={ since } "
389
-
390
- if until :
391
- cmd += f" --until={ until } "
392
-
393
- for ext in FILTERING_EXTENSIONS :
394
- cmd += f" *.{ ext } "
395
-
396
- # What is this??
397
- if find_in_code :
398
- cmd += f" -S{ find_in_code } "
399
-
400
- if find_in_msg :
401
- cmd += f" --grep={ find_in_msg } "
402
-
403
- try :
404
- logger .debug (cmd )
405
- out = self .execute (cmd )
406
-
407
- except Exception :
408
- logger .error ("Git command failed, cannot get commits" , exc_info = True )
409
- out = []
410
-
411
- return out
412
-
413
- def get_commits_between_two_commit (self , commit_from : str , commit_to : str ):
414
- """
415
- Return the commits between the start commit and the end commmit if there are path between them or empty list
416
- """
417
- try :
418
- cmd = f"git rev-list --ancestry-path { commit_from } ..{ commit_to } "
419
-
420
- path = self .execute (cmd ) # ???
421
- if len (path ) > 0 :
422
- path .pop (0 )
423
- path .reverse ()
424
- return path
425
- except :
426
- logger .error ("Failed to obtain commits, details below:" , exc_info = True )
427
- return []
350
+ # # @measure_execution_time(execution_statistics.sub_collection("core"))
351
+ # def get_commits(
352
+ # self,
353
+ # ancestors_of=None,
354
+ # exclude_ancestors_of=None,
355
+ # since=None,
356
+ # until=None,
357
+ # find_in_code="",
358
+ # find_in_msg="",
359
+ # ):
360
+ # cmd = "git log --format=%H"
361
+
362
+ # if ancestors_of is None:
363
+ # cmd += " --all"
364
+
365
+ # # by filtering the dates of the tags we can reduce the commit range safely (in theory)
366
+ # if ancestors_of:
367
+ # cmd += f" {ancestors_of}"
368
+ # until = self.extract_tag_timestamp(ancestors_of)
369
+
370
+ # if exclude_ancestors_of:
371
+ # cmd += f" ^{exclude_ancestors_of}"
372
+ # since = self.extract_tag_timestamp(exclude_ancestors_of)
373
+
374
+ # if since:
375
+ # cmd += f" --since={since}"
376
+
377
+ # if until:
378
+ # cmd += f" --until={until}"
379
+
380
+ # for ext in FILTERING_EXTENSIONS:
381
+ # cmd += f" *.{ext}"
382
+
383
+ # # What is this??
384
+ # if find_in_code:
385
+ # cmd += f" -S{find_in_code}"
386
+
387
+ # if find_in_msg:
388
+ # cmd += f" --grep={find_in_msg}"
389
+
390
+ # try:
391
+ # logger.debug(cmd)
392
+ # out = self.execute(cmd)
393
+
394
+ # except Exception:
395
+ # logger.error("Git command failed, cannot get commits", exc_info=True)
396
+ # out = []
397
+
398
+ # return out
399
+
400
+ # def get_commits_between_two_commit(self, commit_from: str, commit_to: str):
401
+ # """
402
+ # Return the commits between the start commit and the end commit if there is a path between them, or an empty list otherwise
403
+ # """
404
+ # try:
405
+ # cmd = f"git rev-list --ancestry-path {commit_from}..{commit_to}"
406
+
407
+ # path = self.execute(cmd) # ???
408
+ # if len(path) > 0:
409
+ # path.pop(0)
410
+ # path.reverse()
411
+ # return path
412
+ # except:
413
+ # logger.error("Failed to obtain commits, details below:", exc_info=True)
414
+ # return []
428
415
429
416
@measure_execution_time (execution_statistics .sub_collection ("core" ))
430
417
def get_commit (self , id ):
0 commit comments