From bb8a597a53b75cd07ae622a8d8a17192d86fe1fc Mon Sep 17 00:00:00 2001
From: Daniel Franz
Date: Mon, 7 Jul 2025 17:10:27 +0900
Subject: [PATCH] Performance Alerting

Introduces a set of early-warning Prometheus alerts intended to catch
performance issues at an early stage of development.

Signed-off-by: Daniel Franz
---
 .github/workflows/e2e.yaml    | 15 +++++
 Makefile                      |  8 ++-
 hack/test/setup-monitoring.sh | 98 +++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index d0dd6b8f9..e5d4c0858 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -35,6 +35,21 @@ jobs:
       - name: Run e2e tests
         run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
 
+      - name: alerts-check
+        # Grab all current alerts and print a GitHub Actions annotation for each one:
+        # an error for firing alerts and a warning for pending ones.
+        #
+        # NOTE: Leaving this as annotating-only instead of failing the run until we have some more
+        # finely-tuned alerts.
+        run: |
+          if [[ -s /tmp/artifacts/alerts.out ]]; then
+            jq -r 'if .state=="firing" then
+              "::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
+            elif .state=="pending" then
+              "::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
+            else empty end' /tmp/artifacts/alerts.out
+          fi
+
       - uses: actions/upload-artifact@v4
         if: failure()
         with:
diff --git a/Makefile b/Makefile
index e429f88a3..55ea8163e 100644
--- a/Makefile
+++ b/Makefile
@@ -285,11 +285,9 @@ prometheus: #EXHELP Deploy Prometheus into specified namespace
 # prometheus. Prometheus will gather metrics we currently query for over the test run,
 # and provide alerts from the metrics based on the rules that we set.
 .PHONY: e2e-metrics
-e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set
-	curl -X POST \
-	  -H "Content-Type: application/x-www-form-urlencoded" \
-	  --data 'query={pod=~"operator-controller-controller-manager-.*|catalogd-controller-manager-.*"}' \
-	  http://localhost:30900/api/v1/query > $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/metrics.out
+e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
+e2e-metrics: #EXHELP Request current alerts (firing and pending) from prometheus; place in ARTIFACT_PATH if set
+	curl http://localhost:30900/api/v1/alerts | jq '.data.alerts[]?' > $(ALERTS_FILE_PATH)
 
 .PHONY: extension-developer-e2e
 extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e
diff --git a/hack/test/setup-monitoring.sh b/hack/test/setup-monitoring.sh
index 3435988b2..0f48a1746 100755
--- a/hack/test/setup-monitoring.sh
+++ b/hack/test/setup-monitoring.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+# TODO(dtfranz): The YAML in this file should be pulled out and organized into a kustomization.yaml (where possible) for maintainability/readability.
+
 set -euo pipefail
 
 help="setup-monitoring.sh is used to set up prometheus monitoring for e2e testing.
@@ -92,6 +94,7 @@ spec:
     runAsUser: 65534
     seccompProfile:
       type: RuntimeDefault
+  ruleSelector: {}
   serviceDiscoveryRole: EndpointSlice
   serviceMonitorSelector: {}
 EOF
@@ -115,6 +118,49 @@ spec:
   - {} # Allows us to query prometheus
 EOF
 
+kubectl apply -f - << EOF
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: kubelet
+  namespace: olmv1-system
+  labels:
+    k8s-app: kubelet
+spec:
+  jobLabel: k8s-app
+  endpoints:
+  - port: https-metrics
+    scheme: https
+    path: /metrics
+    interval: 10s
+    honorLabels: true
+    tlsConfig:
+      insecureSkipVerify: true
+    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    metricRelabelings:
+    - action: keep
+      sourceLabels: [pod,container]
+      regex: (operator-controller|catalogd).*;manager
+  - port: https-metrics
+    scheme: https
+    path: /metrics/cadvisor
+    interval: 10s
+    honorLabels: true
+    tlsConfig:
+      insecureSkipVerify: true
+    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    metricRelabelings:
+    - action: keep
+      sourceLabels: [pod,container]
+      regex: (operator-controller|catalogd).*;manager
+  selector:
+    matchLabels:
+      k8s-app: kubelet
+  namespaceSelector:
+    matchNames:
+    - kube-system
+EOF
+
 # Give the operator time to create the pod
 kubectl wait --for=create pods -n ${NAMESPACE} prometheus-prometheus-0 --timeout=60s
 kubectl wait --for=condition=Ready pods -n ${NAMESPACE} prometheus-prometheus-0 --timeout=120s
@@ -131,6 +177,56 @@ metadata:
     kubernetes.io/service-account.name: prometheus
 EOF
 
+kubectl apply -f - << EOF
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: controller-alerts
+  namespace: ${NAMESPACE}
+spec:
+  groups:
+  - name: controller-panic
+    rules:
+    - alert: reconciler-panic
+      expr: controller_runtime_reconcile_panics_total{} > 0
+      annotations:
+        description: "controller of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
+    - alert: webhook-panic
+      expr: controller_runtime_webhook_panics_total{} > 0
+      annotations:
+        description: "controller webhook of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
+  - name: resource-usage
+    rules:
+    - alert: oom-events
+      expr: container_oom_events_total > 0
+      annotations:
+        description: "container {{ \$labels.container }} of pod {{ \$labels.pod }} experienced OOM event(s); count={{ \$value }}"
+    - alert: operator-controller-memory-growth
+      expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
+    - alert: catalogd-memory-growth
+      expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
+    - alert: operator-controller-cpu-usage
+      expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller using high CPU for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
+    - alert: catalogd-cpu-usage
+      expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd using high CPU for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
}}%" +EOF + # ServiceMonitors for operator-controller and catalogd kubectl apply -f - <