From bb8a597a53b75cd07ae622a8d8a17192d86fe1fc Mon Sep 17 00:00:00 2001
From: Daniel Franz
Date: Mon, 7 Jul 2025 17:10:27 +0900
Subject: [PATCH] Performance Alerting

Introduces a set of early-warning Prometheus alerts intended to catch
performance issues at an early stage of development.

Signed-off-by: Daniel Franz
---
 .github/workflows/e2e.yaml    | 15 +++++
 Makefile                      |  8 ++-
 hack/test/setup-monitoring.sh | 98 +++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index d0dd6b8f9..e5d4c0858 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -35,6 +35,21 @@ jobs:
       - name: Run e2e tests
         run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
 
+      - name: alerts-check
+        # Grab all current alerts and print a GitHub Actions annotation for each one:
+        # an error for firing alerts and a warning for pending ones.
+        #
+        # NOTE: Leaving this as annotating-only instead of failing the run until we have some more
+        # finely-tuned alerts.
+        run: |
+          if [[ -s /tmp/artifacts/alerts.out ]]; then
+            jq -r 'if .state=="firing" then
+              "::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
+            elif .state=="pending" then
+              "::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
+            else empty end' /tmp/artifacts/alerts.out
+          fi
+
       - uses: actions/upload-artifact@v4
         if: failure()
         with:
diff --git a/Makefile b/Makefile
index e429f88a3..55ea8163e 100644
--- a/Makefile
+++ b/Makefile
@@ -285,11 +285,9 @@ prometheus: #EXHELP Deploy Prometheus into specified namespace
 # prometheus. Prometheus will gather metrics we currently query for over the test run,
 # and provide alerts from the metrics based on the rules that we set.
 .PHONY: e2e-metrics
-e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set
-	curl -X POST \
-	  -H "Content-Type: application/x-www-form-urlencoded" \
-	  --data 'query={pod=~"operator-controller-controller-manager-.*|catalogd-controller-manager-.*"}' \
-	  http://localhost:30900/api/v1/query > $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/metrics.out
+e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
+e2e-metrics: #EXHELP Request current alerts (firing and pending) from prometheus; place in ARTIFACT_PATH if set
+	curl http://localhost:30900/api/v1/alerts | jq '.data.alerts[]?' > $(ALERTS_FILE_PATH)
 
 .PHONY: extension-developer-e2e
 extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e
diff --git a/hack/test/setup-monitoring.sh b/hack/test/setup-monitoring.sh
index 3435988b2..0f48a1746 100755
--- a/hack/test/setup-monitoring.sh
+++ b/hack/test/setup-monitoring.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+# TODO(dtfranz): The YAML in this file should be pulled out and organized into a kustomization.yaml (where possible) for maintainability/readability.
+
 set -euo pipefail
 
 help="setup-monitoring.sh is used to set up prometheus monitoring for e2e testing.
@@ -92,6 +94,7 @@ spec:
     runAsUser: 65534
     seccompProfile:
       type: RuntimeDefault
+  ruleSelector: {}
   serviceDiscoveryRole: EndpointSlice
   serviceMonitorSelector: {}
 EOF
@@ -115,6 +118,49 @@ spec:
   - {} # Allows us to query prometheus
 EOF
 
+kubectl apply -f - << EOF
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: kubelet
+  namespace: olmv1-system
+  labels:
+    k8s-app: kubelet
+spec:
+  jobLabel: k8s-app
+  endpoints:
+  - port: https-metrics
+    scheme: https
+    path: /metrics
+    interval: 10s
+    honorLabels: true
+    tlsConfig:
+      insecureSkipVerify: true
+    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    metricRelabelings:
+    - action: keep
+      sourceLabels: [pod,container]
+      regex: (operator-controller|catalogd).*;manager
+  - port: https-metrics
+    scheme: https
+    path: /metrics/cadvisor
+    interval: 10s
+    honorLabels: true
+    tlsConfig:
+      insecureSkipVerify: true
+    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    metricRelabelings:
+    - action: keep
+      sourceLabels: [pod,container]
+      regex: (operator-controller|catalogd).*;manager
+  selector:
+    matchLabels:
+      k8s-app: kubelet
+  namespaceSelector:
+    matchNames:
+    - kube-system
+EOF
+
 # Give the operator time to create the pod
 kubectl wait --for=create pods -n ${NAMESPACE} prometheus-prometheus-0 --timeout=60s
 kubectl wait --for=condition=Ready pods -n ${NAMESPACE} prometheus-prometheus-0 --timeout=120s
@@ -131,6 +177,56 @@ metadata:
     kubernetes.io/service-account.name: prometheus
 EOF
 
+kubectl apply -f - << EOF
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: controller-alerts
+  namespace: ${NAMESPACE}
+spec:
+  groups:
+  - name: controller-panic
+    rules:
+    - alert: reconciler-panic
+      expr: controller_runtime_reconcile_panics_total{} > 0
+      annotations:
+        description: "controller of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
+    - alert: webhook-panic
+      expr: controller_runtime_webhook_panics_total{} > 0
+      annotations:
+        description: "controller webhook of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
+  - name: resource-usage
+    rules:
+    - alert: oom-events
+      expr: container_oom_events_total > 0
+      annotations:
+        description: "container {{ \$labels.container }} of pod {{ \$labels.pod }} experienced OOM event(s); count={{ \$value }}"
+    - alert: operator-controller-memory-growth
+      expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
+    - alert: catalogd-memory-growth
+      expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
+    - alert: operator-controller-cpu-usage
+      expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller using high CPU for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
+    - alert: catalogd-cpu-usage
+      expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd using high CPU for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
}}%" +EOF + # ServiceMonitors for operator-controller and catalogd kubectl apply -f - <