Skip to content

✨ Performance Alerting #2081

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ jobs:
# Run the e2e suite. ARTIFACT_PATH is set only for this one make invocation; presumably it is
# what makes the run leave alerts.out under /tmp/artifacts, where the alerts-check step reads
# it -- confirm against the Makefile.
- name: Run e2e tests
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e

- name: alerts-check
  # Read the alerts collected during the e2e run and print one GitHub Actions workflow command
  # per alert, containing the alert name and description: firing alerts are annotated as
  # errors, pending alerts as warnings. Alerts in any other state produce no output.
  #
  # The step is skipped entirely when alerts.out is missing or empty (`-s`).
  #
  # NOTE: Leaving this as annotating-only instead of failing the run until we have some more
  # finely-tuned alerts.
  run: |
    if [[ -s /tmp/artifacts/alerts.out ]]; then
      # The explicit `else empty` keeps the filter valid on jq < 1.7 (where `else` is
      # mandatory) and stops jq >= 1.7 from echoing unmatched alerts as raw JSON.
      jq -r 'if .state=="firing" then
        "::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
      elif .state=="pending" then
        "::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
      else
        empty
      end' /tmp/artifacts/alerts.out
    fi

- uses: actions/upload-artifact@v4
if: failure()
with:
Expand Down
8 changes: 3 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -285,11 +285,9 @@ prometheus: #EXHELP Deploy Prometheus into specified namespace
# prometheus. Prometheus will gather metrics we currently query for over the test run,
# and provide alerts from the metrics based on the rules that we set.
.PHONY: e2e-metrics
e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set
curl -X POST \
-H "Content-Type: application/x-www-form-urlencoded" \
--data 'query={pod=~"operator-controller-controller-manager-.*|catalogd-controller-manager-.*"}' \
http://localhost:30900/api/v1/query > $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/metrics.out
# ALERTS_FILE_PATH is a target-specific make variable. Assigning it on a recipe line would only
# set a shell variable inside that line's own shell, leaving $(ALERTS_FILE_PATH) empty in the
# curl line and the redirect without a target.
e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
e2e-metrics: #EXHELP Request active (pending and firing) alerts from prometheus; place in ARTIFACT_PATH if set
	# Writes one JSON object per alert; the file is left empty when there are no alerts.
	# `.data.alerts[]?` is accepted by every jq release (`.alerts.[]` needs jq >= 1.7) and
	# yields nothing for an empty or missing alert list.
	curl http://localhost:30900/api/v1/alerts | jq '.data.alerts[]?' > $(ALERTS_FILE_PATH)
Comment on lines +288 to +290
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

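This may be complaining because the ALERTS_FILE_PATH assignment is indented, so make treats it as a recipe line: it sets a shell variable in that line's own shell rather than a make variable, and `$(ALERTS_FILE_PATH)` on the next line expands to nothing. So you either need shell syntax (braces, i.e. `$${ALERTS_FILE_PATH}`, with the assignment and the curl joined into one recipe line), or you unindent the assignment and use `:=`.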


.PHONY: extension-developer-e2e
extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e
Expand Down
98 changes: 98 additions & 0 deletions hack/test/setup-monitoring.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

#TODO dtfranz: The yaml in this file should be pulled out and organized into a kustomization.yaml (where possible) for maintainability/readability

set -euo pipefail

help="setup-monitoring.sh is used to set up prometheus monitoring for e2e testing.
Expand Down Expand Up @@ -92,6 +94,7 @@ spec:
runAsUser: 65534
seccompProfile:
type: RuntimeDefault
ruleSelector: {}
serviceDiscoveryRole: EndpointSlice
serviceMonitorSelector: {}
EOF
Expand All @@ -115,6 +118,49 @@ spec:
- {} # Allows us to query prometheus
EOF

# ServiceMonitor for the kubelet: scrapes both the kubelet's own /metrics and its cAdvisor
# endpoint (/metrics/cadvisor) from the kubelet service in kube-system every 10s.
#
# The metricRelabelings keep only series whose pod label starts with operator-controller or
# catalogd and whose container label is "manager"; everything else is dropped at scrape time.
#
# The ServiceMonitor is created in ${NAMESPACE}, like the other monitoring resources in this
# script, instead of a hard-coded namespace, so it follows the Prometheus instance when a
# different namespace is used.
kubectl apply -f - << EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
  namespace: ${NAMESPACE}
  labels:
    k8s-app: kubelet
spec:
  jobLabel: k8s-app
  endpoints:
    - port: https-metrics
      scheme: https
      path: /metrics
      interval: 10s
      honorLabels: true
      tlsConfig:
        insecureSkipVerify: true
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      metricRelabelings:
        - action: keep
          sourceLabels: [pod,container]
          regex: (operator-controller|catalogd).*;manager
    - port: https-metrics
      scheme: https
      path: /metrics/cadvisor
      interval: 10s
      honorLabels: true
      tlsConfig:
        insecureSkipVerify: true
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      metricRelabelings:
        - action: keep
          sourceLabels: [pod,container]
          regex: (operator-controller|catalogd).*;manager
  selector:
    matchLabels:
      k8s-app: kubelet
  namespaceSelector:
    matchNames:
      - kube-system
EOF

# Give the operator time to create the pod, then wait for that pod to report Ready.
# ${NAMESPACE} is quoted so the value is always passed to kubectl as a single argument.
kubectl wait --for=create pods -n "${NAMESPACE}" prometheus-prometheus-0 --timeout=60s
kubectl wait --for=condition=Ready pods -n "${NAMESPACE}" prometheus-prometheus-0 --timeout=120s
Expand All @@ -131,6 +177,56 @@ metadata:
kubernetes.io/service-account.name: prometheus
EOF

# Alerting rules for the controllers, applied as a PrometheusRule named controller-alerts in
# ${NAMESPACE}. Two groups are defined:
#   controller-panic - alerts when the controller-runtime reconcile or webhook panic counters
#                      are above zero.
#   resource-usage   - alerts on container OOM events, and on memory growth (deriv over a 5m
#                      subquery > 50_000) or CPU usage (5m rate * 100 > 20) of the
#                      operator-controller and catalogd "manager" containers.
# The memory and CPU alerts must hold for 5m before firing and then keep firing for 1d.
#
# The heredoc delimiter is unquoted, so the shell expands variables in the body: ${NAMESPACE}
# is substituted, while \$labels and \$value are escaped so they reach the rule as literal
# template variables instead of being expanded as (unset) shell variables.
kubectl apply -f - << EOF
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: controller-alerts
namespace: ${NAMESPACE}
spec:
groups:
- name: controller-panic
rules:
- alert: reconciler-panic
expr: controller_runtime_reconcile_panics_total{} > 0
annotations:
description: "controller of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
- alert: webhook-panic
expr: controller_runtime_webhook_panics_total{} > 0
annotations:
description: "controller webhook of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
- name: resource-usage
rules:
- alert: oom-events
expr: container_oom_events_total > 0
annotations:
description: "container {{ \$labels.container }} of pod {{ \$labels.pod }} experienced OOM event(s); count={{ \$value }}"
- alert: operator-controller-memory-growth
expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
for: 5m
keep_firing_for: 1d
annotations:
description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
- alert: catalogd-memory-growth
expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
for: 5m
keep_firing_for: 1d
annotations:
description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
- alert: operator-controller-cpu-usage
expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
for: 5m
keep_firing_for: 1d
annotations:
description: "operator-controller using high cpu resource for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
- alert: catalogd-cpu-usage
expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
for: 5m
keep_firing_for: 1d
annotations:
description: "catalogd using high cpu resources for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
EOF

# ServiceMonitors for operator-controller and catalogd
kubectl apply -f - <<EOF
apiVersion: monitoring.coreos.com/v1
Expand All @@ -141,6 +237,7 @@ metadata:
spec:
endpoints:
- path: /metrics
interval: 10s
port: https
scheme: https
authorization:
Expand Down Expand Up @@ -178,6 +275,7 @@ spec:
endpoints:
- path: /metrics
port: metrics
interval: 10s
scheme: https
authorization:
credentials:
Expand Down
Loading