Commit e6a5e3f

Upstream commit metrics scraping script and container definition (#483)
This upstreams a script for daily scraping of [llvm/llvm-project](https://github.com/llvm/llvm-project) for new commits. The scraped commits are queried against [GitHub Archive BigQuery](https://www.gharchive.org/#bigquery) to determine useful metrics such as how many commits are submitted with or without a pull request and how many of those pull requests are reviewed or approved. This container will run in the cluster and will push information to a Grafana dashboard similar to https://llvm.grafana.net/public-dashboards/21c6e0a7cdd14651a90e118df46be4cc
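
For reference, the script reports these counts as a single InfluxDB line-protocol data point pushed to the Grafana Cloud Influx write endpoint; a hypothetical payload (the counts below are illustrative, not real data) looks like:

llvm_project_main_daily_commits approval_count=214,review_count=18,pull_request_count=5,push_count=9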
1 parent df463d2 commit e6a5e3f

File tree

4 files changed: +687 -0 lines changed

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
FROM docker.io/python:3.12

COPY requirements.lock.txt ./
RUN pip3 install --no-cache-dir -r requirements.lock.txt
COPY process_llvm_commits.py ./

CMD ["python3", "process_llvm_commits.py"]
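
The Dockerfile above only packages the script. Below is a rough sketch of how the image might be built and run by hand; the image tag and host data path are assumptions, and only the environment variable names and the /data layout come from the script itself, which expects an existing llvm/llvm-project checkout at /data/llvm-project.

# Hypothetical image tag and host data path.
docker build -t llvm-commit-metrics .
docker run \
  -e GRAFANA_API_KEY="$GRAFANA_API_KEY" \
  -e GRAFANA_METRICS_USERID="$GRAFANA_METRICS_USERID" \
  -v /path/to/data:/data \
  llvm-commit-metrics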
Lines changed: 308 additions & 0 deletions
@@ -0,0 +1,308 @@
import dataclasses
import datetime
import logging
import os
import git
from google.cloud import bigquery
import requests

GRAFANA_URL = (
    "https://influx-prod-13-prod-us-east-0.grafana.net/api/v1/push/influx/write"
)

# Path to the checked-out llvm/llvm-project repository
REPOSITORY_PATH = "/data/llvm-project"

# Path to the record of most recently processed commits
DATA_PATH = "/data/recent_commits.csv"

# Number of days to look back for new commits.
# We allow some buffer time between when a commit is made and when it is
# queried for reviews. This allows time for any events to propagate in the
# GitHub Archive BigQuery tables.
LOOKBACK_DAYS = 2

# Template query to find pull requests associated with commits on a given day.
# Searches for pull requests within a lower and upper bound of GitHub Archive
# event dates.
GITHUB_ARCHIVE_REVIEW_QUERY = """
WITH PullRequestReviews AS (
  SELECT DISTINCT
    JSON_VALUE(payload, '$.pull_request.id') AS pr_id,
    JSON_VALUE(payload, '$.review.state') AS review_state,
  FROM `githubarchive.day.20*`
  WHERE
    repo.id = 75821432
    AND `type` = 'PullRequestReviewEvent'
    AND (_TABLE_SUFFIX BETWEEN '{lower_review_bound}' AND '{upper_review_bound}')
)
SELECT DISTINCT
  JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') AS merge_commit_sha,
  JSON_VALUE(pr_event.payload, '$.pull_request.number') AS pull_request_number,
  pr_review.review_state AS review_state
FROM `githubarchive.day.{commit_date}` AS pr_event
LEFT JOIN PullRequestReviews AS pr_review ON
  JSON_VALUE(pr_event.payload, '$.pull_request.id') = pr_review.pr_id  # PR ID should match the review events
WHERE
  pr_event.repo.id = 75821432
  AND pr_event.`type` = 'PullRequestEvent'
  AND JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') IS NOT NULL
"""


@dataclasses.dataclass
class LLVMCommitInfo:
  commit_sha: str
  commit_datetime: datetime.datetime
  commit_timestamp_seconds: int
  has_pull_request: bool = False
  pr_number: int = 0
  is_reviewed: bool = False
  is_approved: bool = False


def read_past_commits() -> list[list[str]]:
  """Read recently scraped commits from the data path.

  Returns:
    List of commits that have been scraped.
  """
  # If the data path doesn't exist, we haven't scraped any commits yet.
  if not os.path.exists(DATA_PATH):
    logging.warning(
        "Data path %s does not exist. No past commits found.", DATA_PATH
    )
    return []

  # Read the past commits from the data path.
  with open(DATA_PATH, "r") as f:
    f.readline()  # Skip header
    rows = f.readlines()
  commit_history = [row.strip().split(",") for row in rows if row.strip()]
  return commit_history


def record_new_commits(new_commits: list[LLVMCommitInfo]) -> None:
  """Record newly scraped commits to the data path.

  Args:
    new_commits: List of commits to record.

  Returns:
    None
  """
  with open(DATA_PATH, "w") as f:
    # Write CSV header
    f.write(
        ",".join([
            "commit_sha",
            "commit_datetime",
            "has_pull_request",
            "pull_request_number",
            "is_reviewed",
            "is_approved",
        ])
        + "\n"
    )

    # We want the newest commit as the last entry, so iterate backwards.
    for i in range(len(new_commits) - 1, -1, -1):
      commit_info = new_commits[i]
      record = ",".join([
          commit_info.commit_sha,
          commit_info.commit_datetime.astimezone(
              datetime.timezone.utc
          ).isoformat(),
          str(commit_info.has_pull_request),
          str(commit_info.pr_number),
          str(commit_info.is_reviewed),
          str(commit_info.is_approved),
      ])
      f.write(f"{record}\n")


def scrape_new_commits_by_date(
    last_known_commit: str, target_datetime: datetime.datetime
) -> list[git.Commit]:
  """Scrape new commits made on a given date.

  Args:
    last_known_commit: The last known scraped commit.
    target_datetime: The date to scrape for new commits.

  Returns:
    List of new commits made on the given date.
  """
  # Pull any new commits into the local repository.
  repo = git.Repo(REPOSITORY_PATH)
  repo.remotes.origin.pull()

  # Scrape for new commits.
  # iter_commits() yields commits in reverse chronological order.
  new_commits = []
  for commit in repo.iter_commits():
    # Skip commits that are too new.
    committed_datetime = commit.committed_datetime.astimezone(
        datetime.timezone.utc
    )
    if committed_datetime.date() > target_datetime.date():
      continue
    # Stop scraping if the commit is older than the target date.
    if committed_datetime.date() < target_datetime.date():
      break
    # Stop scraping if we've already recorded this commit.
    if commit.hexsha == last_known_commit:
      break

    new_commits.append(commit)

  logging.info("Found %d new commits", len(new_commits))
  return new_commits


def query_for_reviews(
    new_commits: list[git.Commit], commit_datetime: datetime.datetime
) -> list[LLVMCommitInfo]:
  """Query GitHub Archive BigQuery for reviews of new commits.

  Args:
    new_commits: List of new commits to query for reviews.
    commit_datetime: The date that the new commits were made on.

  Returns:
    List of LLVMCommitInfo objects with each commit's review information.
  """
  # Search for reviews in the last 4 weeks.
  earliest_review_date = (
      commit_datetime - datetime.timedelta(weeks=4)
  ).strftime("%Y%m%d")
  latest_review_date = datetime.datetime.now(datetime.timezone.utc).strftime(
      "%Y%m%d"
  )

  # Create a map of commit SHA to info.
  new_commits = {
      commit.hexsha: LLVMCommitInfo(
          commit.hexsha, commit.committed_datetime, commit.committed_date
      )
      for commit in new_commits
  }

  # Query each relevant daily GitHub Archive table.
  query = GITHUB_ARCHIVE_REVIEW_QUERY.format(
      commit_date=commit_datetime.strftime("%Y%m%d"),
      lower_review_bound=earliest_review_date.removeprefix("20"),
      upper_review_bound=latest_review_date.removeprefix("20"),
  )
  bq_client = bigquery.Client()
  query_job = bq_client.query(query)
  results = query_job.result()

  # Process each found merge commit.
  for row in results:
    # If this commit is irrelevant, skip it. Not every merge_commit_sha makes
    # it into main; a "merge commit" can mean different things depending on
    # the state of the pull request. See
    # docs.github.com/en/rest/pulls/pulls#get-a-pull-request for more details.
    merge_commit_sha = row["merge_commit_sha"]
    if merge_commit_sha not in new_commits:
      continue

    commit_info = new_commits[merge_commit_sha]
    commit_info.has_pull_request = True
    commit_info.pr_number = row["pull_request_number"]
    commit_info.is_reviewed = row["review_state"] is not None
    commit_info.is_approved = row["review_state"] == "approved"

  logging.info(
      "Total gigabytes processed: %d GB",
      query_job.total_bytes_processed / (1024**3),
  )

  return list(new_commits.values())


def upload_daily_metrics(
    grafana_api_key: str,
    grafana_metrics_userid: str,
    new_commits: list[LLVMCommitInfo],
) -> None:
  """Upload daily commit metrics to Grafana.

  Args:
    grafana_api_key: The key to make API requests with.
    grafana_metrics_userid: The user to make API requests with.
    new_commits: List of commits to process & upload to Grafana.

  Returns:
    None
  """
  # Count each type of commit made.
  approval_count = 0
  review_count = 0
  pull_request_count = 0
  push_count = 0
  for commit in new_commits:
    if commit.is_approved:
      approval_count += 1
    elif commit.is_reviewed:
      review_count += 1
    elif commit.has_pull_request:
      pull_request_count += 1
    else:
      push_count += 1

  # Post data via InfluxDB API call.
  request_data = (
      "llvm_project_main_daily_commits"
      " approval_count={},review_count={},pull_request_count={},push_count={}"
  ).format(approval_count, review_count, pull_request_count, push_count)
  response = requests.post(
      GRAFANA_URL,
      headers={"Content-Type": "text/plain"},
      data=request_data,
      auth=(grafana_metrics_userid, grafana_api_key),
  )

  if response.status_code < 200 or response.status_code >= 300:
    logging.error("Failed to submit data to Grafana: %s", response.text)


def main() -> None:
  grafana_api_key = os.environ["GRAFANA_API_KEY"]
  grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]

  logging.info("Reading recently processed commits.")
  recorded_commits = read_past_commits()

  last_known_commit = recorded_commits[-1][0] if recorded_commits else ""

  # Scrape new commits, if any.
  date_to_scrape = datetime.datetime.now(
      datetime.timezone.utc
  ) - datetime.timedelta(days=LOOKBACK_DAYS)
  logging.info(
      "Scraping checked out llvm/llvm-project for new commits on %s",
      date_to_scrape.strftime("%Y-%m-%d"),
  )
  new_commits = scrape_new_commits_by_date(last_known_commit, date_to_scrape)
  if not new_commits:
    logging.info("No new commits found. Exiting.")
    return

  logging.info("Querying for reviews of new commits.")
  new_commit_info = query_for_reviews(new_commits, date_to_scrape)

  logging.info("Uploading metrics to Grafana.")
  upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)

  logging.info("Recording new commits.")
  record_new_commits(new_commit_info)


if __name__ == "__main__":
  logging.basicConfig(level=logging.INFO)
  main()
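To make the query bounds concrete: for a commit date of 2025-03-01 (after LOOKBACK_DAYS is applied) and a run on 2025-03-03, commit_date is formatted as "20250301", while the review bounds become "250201" and "250303" after removeprefix("20"), since the wildcard table `githubarchive.day.20*` already pins the leading "20" and _TABLE_SUFFIX contains only the remaining digits. The dates here are illustrative.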
