Skip to content

Commit df153bd

Browse files
committed
ci: migrate to new slo action
1 parent cea4669 commit df153bd

File tree

6 files changed

+171
-117
lines changed

6 files changed

+171
-117
lines changed

.github/workflows/slo-report.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: slo-report
2+
3+
on:
4+
workflow_run:
5+
workflows: ['slo']
6+
types:
7+
- completed
8+
9+
jobs:
10+
test-ydb-slo-action:
11+
runs-on: ubuntu-latest
12+
name: Publish YDB SLO Report
13+
permissions:
14+
contents: read
15+
pull-requests: write
16+
if: github.event.workflow_run.conclusion == 'success'
17+
steps:
18+
- name: Publish YDB SLO Report
19+
uses: ydb-platform/ydb-slo-action/report@main
20+
with:
21+
token: ${{ secrets.GITHUB_TOKEN }}
22+
run_id: ${{ github.event.workflow_run.id }}

.github/workflows/slo.yml

Lines changed: 60 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,78 @@
1-
name: SLO
1+
name: slo
22

33
on:
4+
push:
5+
branches:
6+
- main
47
pull_request:
5-
branches: [main]
8+
branches:
9+
- main
610
workflow_dispatch:
11+
inputs:
12+
github_pull_request_number:
13+
required: true
14+
slo_workload_duration_seconds:
15+
default: '600'
16+
required: false
17+
slo_workload_read_max_rps:
18+
default: '1000'
19+
required: false
20+
slo_workload_write_max_rps:
21+
default: '100'
22+
required: false
723

824
jobs:
9-
test-slo:
10-
concurrency:
11-
group: slo-${{ github.ref }}
25+
ydb-slo-action-init:
1226
if: (!contains(github.event.pull_request.labels.*.name, 'no slo'))
1327

28+
concurrency:
29+
group: slo-${{ github.ref }}-${{ matrix.sdk }}
30+
cancel-in-progress: true
31+
32+
name: Run YDB SLO Tests
1433
runs-on: ubuntu-latest
15-
name: SLO test
16-
permissions:
17-
checks: write
18-
pull-requests: write
19-
contents: read
20-
issues: write
34+
35+
strategy:
36+
matrix:
37+
sdk:
38+
- py-sync-table
39+
- py-sync-query
2140

2241
steps:
2342
- name: Checkout repository
24-
uses: actions/checkout@v3
25-
if: env.DOCKER_REPO != null
26-
env:
27-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
43+
uses: actions/checkout@v4
2844

29-
- name: Run SLO
30-
uses: ydb-platform/slo-tests@main
31-
if: env.DOCKER_REPO != null
32-
env:
33-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
34-
continue-on-error: true
45+
- name: Install Python3
46+
uses: actions/setup-python@v5
3547
with:
36-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
37-
KUBECONFIG_B64: ${{ secrets.SLO_KUBE_CONFIG }}
38-
AWS_CREDENTIALS_B64: ${{ secrets.SLO_AWS_CREDENTIALS }}
39-
AWS_CONFIG_B64: ${{ secrets.SLO_AWS_CONFIG }}
40-
DOCKER_USERNAME: ${{ secrets.SLO_DOCKER_USERNAME }}
41-
DOCKER_PASSWORD: ${{ secrets.SLO_DOCKER_PASSWORD }}
42-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
43-
DOCKER_FOLDER: ${{ secrets.SLO_DOCKER_FOLDER }}
44-
s3_endpoint: ${{ secrets.SLO_S3_ENDPOINT }}
45-
s3_images_folder: ${{ vars.SLO_S3_IMAGES_FOLDER }}
46-
grafana_domain: ${{ vars.SLO_GRAFANA_DOMAIN }}
47-
grafana_dashboard: ${{ vars.SLO_GRAFANA_DASHBOARD }}
48-
ydb_version: 'newest'
49-
timeBetweenPhases: 30
50-
shutdownTime: 30
48+
python-version: '3.8'
49+
cache: 'pip'
5150

52-
language_id0: sync-python-table
53-
language0: Python SDK over Table Service
54-
workload_path0: tests/slo
55-
workload_build_context0: ../..
56-
workload_build_options0: -f Dockerfile --build-arg SDK_SERVICE=sync-python-table
51+
- name: Install dependencies
52+
run: |
53+
python -m pip install --no-cache-dir --upgrade pip
54+
python -m pip install --no-cache-dir -e .
55+
python -m pip install --no-cache-dir -r tests/slo/requirements.txt
5756
58-
language_id1: sync-python-query
59-
language1: Python SDK over Query Service
60-
workload_path1: tests/slo
61-
workload_build_context1: ../..
62-
workload_build_options1: -f Dockerfile --build-arg SDK_SERVICE=sync-python-query
57+
- name: Initialize YDB SLO
58+
uses: ydb-platform/ydb-slo-action/init@main
59+
with:
60+
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
61+
github_token: ${{ secrets.GITHUB_TOKEN }}
62+
sdk_name: ${{ matrix.sdk }}
6363

64-
- uses: actions/upload-artifact@v3
65-
if: env.DOCKER_REPO != null
64+
- name: Run SLO Tests
6665
env:
67-
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
68-
with:
69-
name: slo-logs
70-
path: logs/
66+
# `env:` values are passed verbatim (no shell parameter expansion), so resolve
# the branch with an Actions expression: PR head branch, else the pushed ref name.
REF: ${{ github.head_ref || github.ref_name }}
67+
SDK_SERVICE: '${{ matrix.sdk }}'
68+
run: |
69+
python ./tests/slo/src create grpc://localhost:2135 /Root/testdb
70+
python ./tests/slo/src run grpc://localhost:2135 /Root/testdb \
71+
--prom-pgw localhost:9091 \
72+
--report-period 250 \
73+
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
74+
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
75+
--read-timeout 10000 \
76+
--write-timeout 10000 \
77+
--time ${{inputs.slo_workload_duration_seconds || 600}}
78+
python ./tests/slo/src cleanup grpc://localhost:2135 /Root/testdb

tests/slo/Dockerfile

Lines changed: 0 additions & 11 deletions
This file was deleted.

tests/slo/src/jobs.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import threading
1010

11-
from metrics import Metrics, JOB_WRITE_LABEL, JOB_READ_LABEL
11+
from metrics import Metrics, OP_TYPE_WRITE, OP_TYPE_READ
1212

1313
from generator import RowGenerator
1414

@@ -106,7 +106,7 @@ def check_result(result):
106106
query=query,
107107
params=params,
108108
metrics=metrics,
109-
labels=(JOB_READ_LABEL,),
109+
labels=(OP_TYPE_READ,),
110110
request_settings=request_settings,
111111
retry_settings=retry_setting,
112112
check_result_cb=check_result,
@@ -163,7 +163,7 @@ def check_result(result):
163163
query=query,
164164
params=params,
165165
metrics=metrics,
166-
labels=(JOB_READ_LABEL,),
166+
labels=(OP_TYPE_READ,),
167167
request_settings=request_settings,
168168
retry_settings=retry_setting,
169169
check_result_cb=check_result,
@@ -220,7 +220,7 @@ def run_writes(driver, query, row_generator, metrics, limiter, runtime, timeout)
220220
query=query,
221221
params=params,
222222
metrics=metrics,
223-
labels=(JOB_WRITE_LABEL,),
223+
labels=(OP_TYPE_WRITE,),
224224
request_settings=request_settings,
225225
retry_settings=retry_setting,
226226
)
@@ -285,7 +285,7 @@ def check_result(result):
285285
query=query,
286286
params=params,
287287
metrics=metrics,
288-
labels=(JOB_WRITE_LABEL,),
288+
labels=(OP_TYPE_WRITE,),
289289
request_settings=request_settings,
290290
retry_settings=retry_setting,
291291
check_result_cb=check_result,

tests/slo/src/metrics.py

Lines changed: 82 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -7,55 +7,91 @@
77

88
environ["PROMETHEUS_DISABLE_CREATED_SERIES"] = "True"
99

10-
from prometheus_client import CollectorRegistry, Gauge, Histogram, push_to_gateway # noqa: E402
11-
from summary import Summary # noqa: E402
10+
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, push_to_gateway # noqa: E402
1211

13-
JOB_READ_LABEL, JOB_WRITE_LABEL = "read", "write"
14-
JOB_STATUS_OK, JOB_STATUS_ERR = "ok", "err"
12+
OP_TYPE_READ, OP_TYPE_WRITE = "read", "write"
13+
OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "err"
1514

16-
SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "sync-python-table")
15+
REF = environ.get("REF", "main")
16+
SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "py-sync-table")
1717

1818

1919
class Metrics:
2020
def __init__(self, push_gateway):
2121
self._push_gtw = push_gateway
2222
self._registry = CollectorRegistry()
2323
self._metrics = dict(
24-
oks=Gauge(
25-
"oks",
26-
"amount of OK requests",
27-
labelnames=("jobName",),
24+
errors_total=Counter(
25+
"sdk_errors_total",
26+
"Total number of errors encountered, categorized by error type.",
27+
labelnames=("operation_type", "error_type"),
2828
registry=self._registry,
2929
),
30-
not_oks=Gauge(
31-
"not_oks",
32-
"amount of not OK requests",
33-
labelnames=("jobName",),
30+
operations_total=Counter(
31+
"sdk_operations_total",
32+
"Total number of operations, categorized by type attempted by the SDK.",
33+
labelnames=("operation_type",),
3434
registry=self._registry,
3535
),
36-
inflight=Gauge(
37-
"inflight",
38-
"amount of requests in flight",
39-
labelnames=("jobName",),
36+
operations_success_total=Counter(
37+
"sdk_operations_success_total",
38+
"Total number of successful operations, categorized by type.",
39+
labelnames=("operation_type",),
4040
registry=self._registry,
4141
),
42-
latency=Summary(
43-
"latency",
44-
"summary of latencies in ms",
45-
labelnames=("jobName", "status"),
42+
operations_failure_total=Counter(
43+
"sdk_operations_failure_total",
44+
"Total number of failed operations, categorized by type.",
45+
labelnames=("operation_type",),
4646
registry=self._registry,
47-
objectives=(
48-
(0.5, 0.01),
49-
(0.99, 0.001),
50-
(1.0, 0.0),
47+
),
48+
operation_latency_seconds=Histogram(
49+
"sdk_operation_latency_seconds",
50+
"Latency of operations performed by the SDK in seconds, categorized by type and status.",
51+
labelnames=(
52+
"operation_type",
53+
"operation_status",
54+
),
55+
registry=self._registry,
56+
buckets=(
57+
0.001, # 1 ms
58+
0.002, # 2 ms
59+
0.003, # 3 ms
60+
0.004, # 4 ms
61+
0.005, # 5 ms
62+
0.0075, # 7.5 ms
63+
0.010, # 10 ms
64+
0.020, # 20 ms
65+
0.050, # 50 ms
66+
0.100, # 100 ms
67+
0.200, # 200 ms
68+
0.500, # 500 ms
69+
1.000, # 1 s
5170
),
5271
),
53-
attempts=Histogram(
54-
"attempts",
55-
"histogram of amount of requests",
56-
labelnames=("jobName", "status"),
72+
retry_attempts_total=Counter(
73+
"sdk_retry_attempts_total",
74+
"Total number of retry attempts, categorized by operation type.",
75+
labelnames=("operation_type",),
76+
registry=self._registry,
77+
),
78+
retries_success_total=Counter(
79+
"sdk_retries_success_total",
80+
"Total number of successful retries, categorized by operation type.",
81+
labelnames=("operation_type",),
82+
registry=self._registry,
83+
),
84+
retries_failure_total=Counter(
85+
"sdk_retries_failure_total",
86+
"Total number of failed retries, categorized by operation type.",
87+
labelnames=("operation_type",),
88+
registry=self._registry,
89+
),
90+
pending_operations=Gauge(
91+
"sdk_pending_operations",
92+
"Current number of pending operations, categorized by type.",
93+
labelnames=("operation_type",),
5794
registry=self._registry,
58-
buckets=tuple(range(1, 11)),
5995
),
6096
)
6197
self.reset()
@@ -81,44 +117,44 @@ def start(self, labels):
81117
if not isinstance(labels, Iterable):
82118
labels = (labels,)
83119

84-
self.inflight.labels(*labels).inc()
120+
self.pending_operations.labels(*labels).inc()
85121
return time.time()
86122

87123
def stop(self, labels, start_time, attempts=1, error=None):
88-
runtime_ms = 1000 * (time.time() - start_time)
124+
duration = time.time() - start_time
89125

90126
if not isinstance(labels, Iterable):
91127
labels = (labels,)
92128

93-
self.inflight.labels(*labels).dec()
129+
self.operations_total.labels(*labels).inc()
130+
self.pending_operations.labels(*labels).dec()
131+
self.retry_attempts_total.labels(*labels).inc(attempts)
94132

95133
if error:
96-
self.not_oks.labels(*labels).inc()
97-
self.latency.labels(*labels, JOB_STATUS_ERR).observe(runtime_ms)
134+
self.errors_total.labels(*labels, type(error).__name__).inc()
135+
self.retries_failure_total.labels(*labels).inc(attempts)
136+
self.operations_failure_total.labels(*labels).inc()
137+
self.operation_latency_seconds.labels(*labels, OP_STATUS_FAILURE).observe(duration)
98138
return
99139

100-
self.oks.labels(*labels).inc()
101-
self.latency.labels(*labels, JOB_STATUS_OK).observe(runtime_ms)
102-
self.attempts.labels(*labels, JOB_STATUS_OK).observe(attempts)
140+
self.retries_success_total.labels(*labels).inc(attempts)
141+
self.operations_success_total.labels(*labels).inc()
142+
self.operation_latency_seconds.labels(*labels, OP_STATUS_SUCCESS).observe(duration)
103143

104144
def push(self):
105145
push_to_gateway(
106146
self._push_gtw,
107147
job=f"workload-{SDK_SERVICE_NAME}",
108148
registry=self._registry,
109149
grouping_key={
150+
"ref": REF,
110151
"sdk": SDK_SERVICE_NAME,
111-
"sdkVersion": version("ydb"),
152+
"sdk_version": version("ydb"),
112153
},
113154
)
114155

115156
def reset(self):
116-
for label in (JOB_READ_LABEL, JOB_WRITE_LABEL):
117-
self.oks.labels(label).set(0)
118-
self.not_oks.labels(label).set(0)
119-
self.inflight.labels(label).set(0)
120-
121-
self.latency.clear()
122-
self.attempts.clear()
157+
for m in self._metrics.values():
158+
m.clear()
123159

124160
self.push()

0 commit comments

Comments
 (0)