Skip to content

Commit cdb2460

Browse files
sacca97 authored and copernico committed
Cleaned up some code and added a tolerance time delta when extracting the tag dates, so that we can also get commits that were ported a few days later. The tolerance is currently set to 2 weeks.
1 parent 56149c3 commit cdb2460

File tree

1 file changed

+23
-72
lines changed

1 file changed

+23
-72
lines changed

prospector/git/git.py

Lines changed: 23 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626

2727
GIT_SEPARATOR = "-@-@-@-@-"
2828

29+
TEN_DAYS_TIME_DELTA = 14 * 24 * 60 * 60
30+
2931
FILTERING_EXTENSIONS = ["java", "c", "cpp", "py", "js", "go", "php", "h", "jsp"]
3032
RELEVANT_EXTENSIONS = [
3133
"java",
@@ -200,7 +202,7 @@ def clone(self, shallow=None, skip_existing=False):
200202
return
201203

202204
if os.path.exists(self.path):
203-
logger.debug(f"Folder {self.path} is not a git repository.")
205+
logger.info(f"Folder {self.path} is not a git repository.")
204206
return
205207

206208
os.makedirs(self.path)
@@ -236,117 +238,66 @@ def clone(self, shallow=None, skip_existing=False):
236238
@measure_execution_time(execution_statistics.sub_collection("core"))
237239
def create_commits(
238240
self,
239-
ancestors_of=None,
240-
exclude_ancestors_of=None,
241+
next_tag=None,
242+
prev_tag=None,
241243
since=None,
242244
until=None,
245+
filter_extension=None,
243246
find_in_code="",
244247
find_in_msg="",
245-
find_twins=True,
246248
) -> Dict[str, RawCommit]:
247-
cmd = f"git log --name-only --full-index --format=%n{GIT_SEPARATOR}%n%H:%at:%P%n{GIT_SEPARATOR}%n%B%n{GIT_SEPARATOR}%n"
248-
249-
if ancestors_of is None or find_twins:
250-
cmd += " --all"
249+
cmd = f"git log --all --name-only --full-index --format=%n{GIT_SEPARATOR}%n%H:%at:%P%n{GIT_SEPARATOR}%n%B%n{GIT_SEPARATOR}%n"
251250

252-
if ancestors_of:
253-
if not find_twins:
254-
cmd += f" {ancestors_of}"
255-
until = self.extract_tag_timestamp(ancestors_of)
251+
if next_tag:
252+
until = self.extract_tag_timestamp(next_tag) + TEN_DAYS_TIME_DELTA
256253
cmd += f" --until={until}"
254+
257255
# TODO: if find twins is true, we dont need the ancestors, only the timestamps
258-
if exclude_ancestors_of:
259-
if not find_twins:
260-
cmd += f" ^{exclude_ancestors_of}"
261-
since = self.extract_tag_timestamp(exclude_ancestors_of)
256+
if prev_tag:
257+
since = self.extract_tag_timestamp(prev_tag) - TEN_DAYS_TIME_DELTA
262258
cmd += f" --since={since}"
263259

260+
if filter_extension:
261+
cmd += " *." + " *.".join(filter_extension)
262+
264263
try:
265264
logger.debug(cmd)
266265
out = self.execute(cmd)
267-
# if --all is used, we are traversing all branches and therefore we can check for twins
268-
return self.parse_git_output(out, find_twins, ancestors_of)
266+
return self.parse_git_output(out)
269267

270268
except Exception:
271269
logger.error("Git command failed, cannot get commits", exc_info=True)
272270
return dict()
273271

274-
def parse_git_output(
275-
self, raw: List[str], find_twins: bool = False, next_tag: Optional[str] = None
276-
):
272+
def parse_git_output(self, raw: List[str]) -> Dict[str, RawCommit]:
277273
commits: Dict[str, RawCommit] = dict()
278274
commit = None
279275
sector = 0
276+
raw.append(GIT_SEPARATOR)
280277
for line in raw:
281278
if line == GIT_SEPARATOR:
282279
if sector == 3:
283280
sector = 1
284-
if 0 < len(commit.changed_files) < 100 and len(commit.msg) < 5000:
285-
commit.msg = commit.msg.strip()
286-
287-
# TODO: should work here
288-
# commit.set_tags(next_tag)
289-
if find_twins:
290-
commit.minhash = get_encoded_minhash(commit.msg[:50])
291-
292-
commits[commit.id] = commit
293-
281+
commit.msg = commit.msg.strip()
282+
commits[commit.id] = commit
294283
else:
295284
sector += 1
296285
else:
297286
if sector == 1:
298287
id, timestamp, parent = line.split(":")
299-
parent = parent.split(" ")[0]
300288
commit = RawCommit(
301289
repository=self,
302290
commit_id=id,
303291
timestamp=int(timestamp),
304-
parent_id=parent,
292+
parent_id=parent.split()[0],
305293
)
306294
elif sector == 2:
307295
commit.msg += line + " "
308-
elif sector == 3 and not any(
309-
x in line
310-
for x in (
311-
"test",
312-
".md",
313-
"docs/",
314-
".meta",
315-
".utf8",
316-
) # TODO: build a list for these. If there are no . then is not relevant
317-
):
318-
commit.add_changed_file(line)
296+
elif sector == 3:
297+
commit.changed_files.append(line)
319298

320299
return commits
321300

322-
def get_issues(self, since=None) -> Dict[str, str]:
323-
owner, repo = self.url.split("/")[-2:]
324-
query_url = f"https://api.github.com/repos/{owner}/{repo}/issues"
325-
# /repos/{owner}/{repo}/issues/{issue_number}
326-
params = {
327-
"state": "closed",
328-
"per_page": 100,
329-
"since": since,
330-
"page": 1,
331-
}
332-
headers = {
333-
"Authorization": f"Bearer {GITHUB_TOKEN}",
334-
"Accept": "application/vnd.github+json",
335-
}
336-
r = requests.get(query_url, params=params, headers=headers)
337-
338-
while len(r.json()) > 0:
339-
for elem in r.json():
340-
body = elem["body"] or ""
341-
self.issues[str(elem["number"])] = (
342-
elem["title"] + " " + " ".join(body.split())
343-
)
344-
345-
params["page"] += 1
346-
if params["page"] > 10:
347-
break
348-
r = requests.get(query_url, params=params, headers=headers)
349-
350301
# # @measure_execution_time(execution_statistics.sub_collection("core"))
351302
# def get_commits(
352303
# self,

0 commit comments

Comments
 (0)