Skip to content

Commit 47d8c3b

Browse files
committed
Performance Alerting
Introduces a series of early-warning Prometheus alerts intended to catch performance issues at an early stage of development. Signed-off-by: Daniel Franz <dfranz@redhat.com>
1 parent 1333f7b commit 47d8c3b

File tree

3 files changed

+118
-6
lines changed

3 files changed

+118
-6
lines changed

.github/workflows/e2e.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ jobs:
3535
- name: Run e2e tests
3636
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
3737

38+
- name: alerts-check
39+
# Grab all current alerts and print a GH Actions annotation string (error if firing, warning if pending)
40+
# containing the alert name and description.
41+
#
42+
# NOTE: Leaving this as annotating-only instead of failing the run until we have some more
43+
# finely-tuned alerts.
44+
run: |
45+
if [[ -s /tmp/artifacts/alerts.out ]]; then \
46+
jq -r 'if .state=="firing" then
47+
"::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
48+
elif .state=="pending" then
49+
"::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
50+
end' /tmp/artifacts/alerts.out
51+
fi
52+
3853
- uses: actions/upload-artifact@v4
3954
if: failure()
4055
with:

Makefile

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ test-e2e: run image-registry prometheus e2e e2e-metrics e2e-coverage kind-clean
272272
test-experimental-e2e: SOURCE_MANIFEST := $(EXPERIMENTAL_E2E_MANIFEST)
273273
test-experimental-e2e: KIND_CLUSTER_NAME := operator-controller-e2e
274274
test-experimental-e2e: GO_BUILD_EXTRA_FLAGS := -cover
275-
test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-metrics e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
275+
test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-metrics #e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster
276276

277277
.PHONY: prometheus
278278
prometheus: PROMETHEUS_NAMESPACE := olmv1-system
@@ -285,11 +285,10 @@ prometheus: #EXHELP Deploy Prometheus into specified namespace
285285
# prometheus. Prometheus will gather metrics we currently query for over the test run,
286286
# and provide alerts from the metrics based on the rules that we set.
287287
.PHONY: e2e-metrics
288-
e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set
289-
curl -X POST \
290-
-H "Content-Type: application/x-www-form-urlencoded" \
291-
--data 'query={pod=~"operator-controller-controller-manager-.*|catalogd-controller-manager-.*"}' \
292-
http://localhost:30900/api/v1/query > $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/metrics.out
288+
e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
289+
e2e-metrics: #EXHELP Request current alerts (pending and firing) from prometheus; place in ARTIFACT_PATH if set
290+
ALERTS_FILE_PATH=$(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
291+
curl -X GET http://localhost:30900/api/v1/alerts | jq 'if (.data.alerts | length) > 0 then .data.alerts.[] else empty end' > $(ALERTS_FILE_PATH)
293292

294293
.PHONY: extension-developer-e2e
295294
extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e

hack/test/setup-monitoring.sh

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#!/bin/bash
22

3+
#TODO dtfranz: The yaml in this file should be pulled out and organized into a kustomization.yaml (where possible) for maintainability/readability
4+
35
set -euo pipefail
46

57
help="setup-monitoring.sh is used to set up prometheus monitoring for e2e testing.
@@ -92,6 +94,7 @@ spec:
9294
runAsUser: 65534
9395
seccompProfile:
9496
type: RuntimeDefault
97+
ruleSelector: {}
9598
serviceDiscoveryRole: EndpointSlice
9699
serviceMonitorSelector: {}
97100
EOF
@@ -115,6 +118,49 @@ spec:
115118
- {} # Allows us to query prometheus
116119
EOF
117120

121+
kubectl apply -f - << EOF
122+
apiVersion: monitoring.coreos.com/v1
123+
kind: ServiceMonitor
124+
metadata:
125+
name: kubelet
126+
namespace: olmv1-system
127+
labels:
128+
k8s-app: kubelet
129+
spec:
130+
jobLabel: k8s-app
131+
endpoints:
132+
- port: https-metrics
133+
scheme: https
134+
path: /metrics
135+
interval: 10s
136+
honorLabels: true
137+
tlsConfig:
138+
insecureSkipVerify: true
139+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
140+
metricRelabelings:
141+
- action: keep
142+
sourceLabels: [pod,container]
143+
regex: (operator-controller|catalogd).*;manager
144+
- port: https-metrics
145+
scheme: https
146+
path: /metrics/cadvisor
147+
interval: 10s
148+
honorLabels: true
149+
tlsConfig:
150+
insecureSkipVerify: true
151+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
152+
metricRelabelings:
153+
- action: keep
154+
sourceLabels: [pod,container]
155+
regex: (operator-controller|catalogd).*;manager
156+
selector:
157+
matchLabels:
158+
k8s-app: kubelet
159+
namespaceSelector:
160+
matchNames:
161+
- kube-system
162+
EOF
163+
118164
# Give the operator time to create the pod
119165
kubectl wait --for=create pods -n ${NAMESPACE} prometheus-prometheus-0 --timeout=60s
120166
kubectl wait --for=condition=Ready pods -n ${NAMESPACE} prometheus-prometheus-0 --timeout=120s
@@ -131,6 +177,56 @@ metadata:
131177
kubernetes.io/service-account.name: prometheus
132178
EOF
133179

180+
kubectl apply -f - << EOF
181+
apiVersion: monitoring.coreos.com/v1
182+
kind: PrometheusRule
183+
metadata:
184+
name: controller-alerts
185+
namespace: ${NAMESPACE}
186+
spec:
187+
groups:
188+
- name: controller-panic
189+
rules:
190+
- alert: reconciler-panic
191+
expr: controller_runtime_reconcile_panics_total{} > 0
192+
annotations:
193+
description: "controller of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
194+
- alert: webhook-panic
195+
expr: controller_runtime_webhook_panics_total{} > 0
196+
annotations:
197+
description: "controller webhook of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
198+
- name: resource-usage
199+
rules:
200+
- alert: oom-events
201+
expr: container_oom_events_total > 0
202+
annotations:
203+
description: "container {{ \$labels.container }} of pod {{ \$labels.pod }} experienced OOM event(s); count={{ \$value }}"
204+
- alert: operator-controller-memory-growth
205+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
206+
for: 5m
207+
keep_firing_for: 1d
208+
annotations:
209+
description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
210+
- alert: catalogd-memory-growth
211+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
212+
for: 5m
213+
keep_firing_for: 1d
214+
annotations:
215+
description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
216+
- alert: operator-controller-cpu-usage
217+
expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
218+
for: 5m
219+
keep_firing_for: 1d
220+
annotations:
221+
description: "operator-controller using high cpu resource for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
222+
- alert: catalogd-cpu-usage
223+
expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
224+
for: 5m
225+
keep_firing_for: 1d
226+
annotations:
227+
description: "catalogd using high cpu resources for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
228+
EOF
229+
134230
# ServiceMonitors for operator-controller and catalogd
135231
kubectl apply -f - <<EOF
136232
apiVersion: monitoring.coreos.com/v1
@@ -141,6 +237,7 @@ metadata:
141237
spec:
142238
endpoints:
143239
- path: /metrics
240+
interval: 10s
144241
port: https
145242
scheme: https
146243
authorization:
@@ -178,6 +275,7 @@ spec:
178275
endpoints:
179276
- path: /metrics
180277
port: metrics
278+
interval: 10s
181279
scheme: https
182280
authorization:
183281
credentials:

0 commit comments

Comments
 (0)