Skip to content

Commit 77c2b00

Browse files
[CI] Upstream metrics script and container definition (llvm#117461)
This patch includes the script that pulls information from Github and pushes it to Grafana. This is currently running in the cluster and pushes information to https://llvm.grafana.net/public-dashboards/6a1c1969b6794e0a8ee5d494c72ce2cd. This script is designed to accept other jobs relatively easily and can be easily modified to look at other metrics.
1 parent 8fb748b commit 77c2b00

File tree

4 files changed

+525
-0
lines changed

4 files changed

+525
-0
lines changed

.ci/metrics/Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
FROM docker.io/python:3.12
2+
3+
COPY requirements.lock.txt ./
4+
RUN pip3 install --no-cache-dir -r requirements.lock.txt
5+
COPY metrics.py ./
6+
7+
CMD ["python3", "metrics.py"]

.ci/metrics/metrics.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
import requests
2+
import time
3+
import os
4+
from dataclasses import dataclass
5+
import sys
6+
7+
import github
8+
from github import Github
9+
from github import Auth
10+
11+
# Grafana Cloud Influx-compatible push endpoint that metrics are written to.
GRAFANA_URL = (
    "https://influx-prod-13-prod-us-east-0.grafana.net/api/v1/push/influx/write"
)
# Repository whose workflow runs are monitored.
GITHUB_PROJECT = "llvm/llvm-project"
# Workflow names (as shown in the Actions UI) to collect metrics for.
WORKFLOWS_TO_TRACK = ["Check code formatting"]
# Delay between successive scrapes of the Github API.
SCRAPE_INTERVAL_SECONDS = 5 * 60
17+
18+
19+
@dataclass
class JobMetrics:
    """One sampled data point for a single workflow job."""

    # Human-readable workflow name, e.g. "Check code formatting".
    job_name: str
    # Seconds the job spent queued between creation and start.
    queue_time: int
    # Seconds the job spent running.
    run_time: int
    # 1 if the job concluded successfully, 0 otherwise.
    status: int
    # Job creation time in nanoseconds since the epoch (Grafana expects ns).
    created_at_ns: int
    # ID of the workflow run, used to deduplicate across scrapes.
    workflow_id: int


def get_metrics(github_repo: "github.Repository", workflows_to_track: dict):
    """Gets the metrics for specified Github workflows.

    Walks the repository's workflow runs (the API returns them most recent
    first) and collects one JobMetrics entry per new completed invocation of
    each tracked workflow.

    Args:
      github_repo: A github repo object to use to query the relevant information.
      workflows_to_track: A dictionary mapping workflow names to the last
        invocation ID where metrics have been collected, or None to collect
        only the most recent completed invocation.

    Returns:
      Returns a list of JobMetrics objects, containing the relevant metrics
      about the workflow.

    Raises:
      ValueError: If a workflow run unexpectedly contains more than one job.
    """
    workflow_metrics = []

    # Workflows we still need newer samples for. A name is removed once we
    # either reach its previously-seen invocation or take its latest sample.
    workflows_to_include = set(workflows_to_track.keys())

    # Iterate directly instead of calling next() on an explicit iterator:
    # the original raised an unhandled StopIteration if the run list was
    # exhausted before every tracked workflow had been found.
    for workflow_run in github_repo.get_workflow_runs():
        if not workflows_to_include:
            break

        if workflow_run.status != "completed":
            continue

        # This workflow was already sampled for this run, or is not tracked at
        # all. Ignoring.
        if workflow_run.name not in workflows_to_include:
            continue

        # There were no new workflow invocations since the previous scrape.
        # Runs are sorted newest first, so we can stop looking for this
        # particular workflow while continuing to scan for the others.
        if workflows_to_track[workflow_run.name] == workflow_run.id:
            workflows_to_include.remove(workflow_run.name)
            continue

        workflow_jobs = workflow_run.jobs()
        if workflow_jobs.totalCount == 0:
            continue
        if workflow_jobs.totalCount > 1:
            raise ValueError(
                f"Encountered an unexpected number of jobs: {workflow_jobs.totalCount}"
            )

        job = workflow_jobs[0]
        created_at = job.created_at
        started_at = job.started_at
        completed_at = job.completed_at

        job_result = int(job.conclusion == "success")

        # total_seconds() rather than .seconds: the latter only yields the
        # seconds component of the timedelta and wraps for spans >= 1 day.
        queue_time = int((started_at - created_at).total_seconds())
        run_time = int((completed_at - started_at).total_seconds())

        # Skip anomalous zero-length runs; they carry no useful timing data.
        if run_time == 0:
            continue

        # No previously-seen invocation to stop at, so take only the most
        # recent completed run for this workflow. (The original's extra
        # "== workflow_run.id" disjunct and break were unreachable: that case
        # is already handled above.)
        if workflows_to_track[workflow_run.name] is None:
            workflows_to_include.remove(workflow_run.name)

        # The timestamp associated with the event is expected by Grafana to be
        # in nanoseconds.
        created_at_ns = int(created_at.timestamp()) * 10**9

        workflow_metrics.append(
            JobMetrics(
                workflow_run.name,
                queue_time,
                run_time,
                job_result,
                created_at_ns,
                workflow_run.id,
            )
        )

    return workflow_metrics
117+
118+
119+
def upload_metrics(workflow_metrics, metrics_userid, api_key):
    """Upload metrics to Grafana.

    Serializes each metric into Influx line-protocol form and pushes the
    batch to Grafana in a single REST request.

    Args:
      workflow_metrics: A list of metrics to upload to Grafana.
      metrics_userid: The userid to use for the upload.
      api_key: The API key to use for the upload.
    """
    payload = "\n".join(
        f"{metric.job_name.lower().replace(' ', '_')} "
        f"queue_time={metric.queue_time},run_time={metric.run_time},"
        f"status={metric.status} {metric.created_at_ns}"
        for metric in workflow_metrics
    )

    response = requests.post(
        GRAFANA_URL,
        headers={"Content-Type": "text/plain"},
        data=payload,
        auth=(metrics_userid, api_key),
    )

    # Anything outside the 2xx range indicates the push was not accepted.
    if not 200 <= response.status_code < 300:
        print(
            f"Failed to submit data to Grafana: {response.status_code}", file=sys.stderr
        )
149+
150+
151+
def main():
    """Poll Github for workflow metrics and push them to Grafana forever.

    Reads GITHUB_TOKEN, GRAFANA_API_KEY, and GRAFANA_METRICS_USERID from the
    environment, then scrapes every SCRAPE_INTERVAL_SECONDS.
    """
    # Authenticate with Github
    auth = Auth.Token(os.environ["GITHUB_TOKEN"])
    github_object = Github(auth=auth)
    # Use the shared constant so the tracked repository is configured in
    # exactly one place (was previously hardcoded here as well).
    github_repo = github_object.get_repo(GITHUB_PROJECT)

    grafana_api_key = os.environ["GRAFANA_API_KEY"]
    grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]

    # Start with no known last-seen invocation for any tracked workflow.
    workflows_to_track = {name: None for name in WORKFLOWS_TO_TRACK}

    # Enter the main loop. Every five minutes we wake up and dump metrics for
    # the relevant jobs.
    while True:
        current_metrics = get_metrics(github_repo, workflows_to_track)
        if current_metrics:
            upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)
            print(f"Uploaded {len(current_metrics)} metrics", file=sys.stderr)

            # Remember the newest invocation seen per workflow. Metrics come
            # back newest-first, so iterating in reverse leaves the most
            # recent ID in place.
            for workflow_metric in reversed(current_metrics):
                workflows_to_track[workflow_metric.job_name] = (
                    workflow_metric.workflow_id
                )
        else:
            print("No metrics found to upload.", file=sys.stderr)

        # Sleep unconditionally: the previous `continue` on the empty case
        # skipped this sleep and busy-polled the Github API in a tight loop.
        time.sleep(SCRAPE_INTERVAL_SECONDS)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)