|
26 | 26 |
|
# Marker line placed between the header, message and file-list sections of
# every record in the custom `git log --format` used by this module.
GIT_SEPARATOR = "-@-@-@-@-"

# Padding, in seconds, added after / subtracted before tag timestamps when
# bounding the `git log` time window.
FOURTEEN_DAYS_TIME_DELTA = 14 * 24 * 60 * 60
# Backward-compatible alias: the name says "ten days" but the value is 14 days.
# Kept so that existing references keep working with the same value.
TEN_DAYS_TIME_DELTA = FOURTEEN_DAYS_TIME_DELTA

FILTERING_EXTENSIONS = ["java", "c", "cpp", "py", "js", "go", "php", "h", "jsp"]
|
30 | 32 | RELEVANT_EXTENSIONS = [
|
31 | 33 | "java",
|
@@ -200,7 +202,7 @@ def clone(self, shallow=None, skip_existing=False):
|
200 | 202 | return
|
201 | 203 |
|
202 | 204 | if os.path.exists(self.path):
|
203 |
| - logger.debug(f"Folder {self.path} is not a git repository.") |
| 205 | + logger.info(f"Folder {self.path} is not a git repository.") |
204 | 206 | return
|
205 | 207 |
|
206 | 208 | os.makedirs(self.path)
|
@@ -236,117 +238,66 @@ def clone(self, shallow=None, skip_existing=False):
|
@measure_execution_time(execution_statistics.sub_collection("core"))
def create_commits(
    self,
    next_tag=None,
    prev_tag=None,
    since=None,
    until=None,
    filter_extension=None,
    find_in_code="",
    find_in_msg="",
) -> Dict[str, RawCommit]:
    """Run ``git log --all`` and parse its output into ``RawCommit`` objects.

    Args:
        next_tag: If given, the upper time bound becomes this tag's timestamp
            plus ``TEN_DAYS_TIME_DELTA`` seconds (overrides ``until``).
        prev_tag: If given, the lower time bound becomes this tag's timestamp
            minus ``TEN_DAYS_TIME_DELTA`` seconds (overrides ``since``).
        since: Lower bound passed to ``--since`` when ``prev_tag`` is not set.
        until: Upper bound passed to ``--until`` when ``next_tag`` is not set.
        filter_extension: Iterable of file extensions (without the dot); when
            non-empty, the log is restricted to ``*.<ext>`` pathspecs.
        find_in_code: Currently unused by this method.
        find_in_msg: Currently unused by this method.

    Returns:
        The mapping produced by ``parse_git_output``; an empty dict if the
        git command or the parsing raises.
    """
    cmd = f"git log --all --name-only --full-index --format=%n{GIT_SEPARATOR}%n%H:%at:%P%n{GIT_SEPARATOR}%n%B%n{GIT_SEPARATOR}%n"

    # A tag, when supplied, takes precedence over the explicit bound.
    if next_tag:
        until = self.extract_tag_timestamp(next_tag) + TEN_DAYS_TIME_DELTA
    if until is not None:
        cmd += f" --until={until}"

    if prev_tag:
        since = self.extract_tag_timestamp(prev_tag) - TEN_DAYS_TIME_DELTA
    if since is not None:
        cmd += f" --since={since}"

    if filter_extension:
        # "--" separates options/revisions from pathspecs so that a pattern
        # can never be interpreted as a revision.
        # NOTE(review): patterns are passed unquoted; confirm self.execute
        # does not route the command through a shell that would expand them.
        cmd += " -- " + " ".join(f"*.{ext}" for ext in filter_extension)

    try:
        logger.debug(cmd)
        out = self.execute(cmd)
        return self.parse_git_output(out)
    except Exception:
        # Deliberate best-effort: callers receive an empty result on failure.
        logger.error("Git command failed, cannot get commits", exc_info=True)
        return dict()
|
273 | 271 |
|
274 |
| - def parse_git_output( |
275 |
| - self, raw: List[str], find_twins: bool = False, next_tag: Optional[str] = None |
276 |
| - ): |
def parse_git_output(self, raw: List[str]) -> Dict[str, RawCommit]:
    """Parse the lines of the custom ``git log`` output into commits.

    Each record consists of three sections, each introduced by a
    ``GIT_SEPARATOR`` line: the ``<hash>:<timestamp>:<parents>`` header,
    the commit message lines, and the changed file names.

    Args:
        raw: Output lines of the ``git log`` command. The list is not
            modified.

    Returns:
        A dict mapping each commit id to its ``RawCommit``.
    """
    commits: Dict[str, RawCommit] = {}
    commit = None
    sector = 0  # 0: before first record, 1: header, 2: message, 3: files
    for line in raw:
        if line == GIT_SEPARATOR:
            if sector == 3:
                # A separator after the file list starts the next record:
                # store the commit that has just been completed.
                commit.msg = commit.msg.strip()
                commits[commit.id] = commit
                sector = 1
            else:
                sector += 1
        elif sector == 1:
            commit_id, timestamp, parents = line.split(":")
            # %P is empty for a root commit; fall back to "" instead of
            # raising IndexError on the empty split result.
            parent_ids = parents.split()
            commit = RawCommit(
                repository=self,
                commit_id=commit_id,
                timestamp=int(timestamp),
                parent_id=parent_ids[0] if parent_ids else "",
            )
        elif sector == 2:
            commit.msg += line + " "
        elif sector == 3:
            commit.changed_files.append(line)

    # The last record has no trailing separator; flush it here rather than
    # appending a sentinel to the caller's list.
    if sector == 3:
        commit.msg = commit.msg.strip()
        commits[commit.id] = commit

    return commits
|
321 | 300 |
|
322 |
| - def get_issues(self, since=None) -> Dict[str, str]: |
323 |
| - owner, repo = self.url.split("/")[-2:] |
324 |
| - query_url = f"https://api.github.com/repos/{owner}/{repo}/issues" |
325 |
| - # /repos/{owner}/{repo}/issues/{issue_number} |
326 |
| - params = { |
327 |
| - "state": "closed", |
328 |
| - "per_page": 100, |
329 |
| - "since": since, |
330 |
| - "page": 1, |
331 |
| - } |
332 |
| - headers = { |
333 |
| - "Authorization": f"Bearer {GITHUB_TOKEN}", |
334 |
| - "Accept": "application/vnd.github+json", |
335 |
| - } |
336 |
| - r = requests.get(query_url, params=params, headers=headers) |
337 |
| - |
338 |
| - while len(r.json()) > 0: |
339 |
| - for elem in r.json(): |
340 |
| - body = elem["body"] or "" |
341 |
| - self.issues[str(elem["number"])] = ( |
342 |
| - elem["title"] + " " + " ".join(body.split()) |
343 |
| - ) |
344 |
| - |
345 |
| - params["page"] += 1 |
346 |
| - if params["page"] > 10: |
347 |
| - break |
348 |
| - r = requests.get(query_url, params=params, headers=headers) |
349 |
| - |
350 | 301 | # # @measure_execution_time(execution_statistics.sub_collection("core"))
|
351 | 302 | # def get_commits(
|
352 | 303 | # self,
|
|
0 commit comments