1
1
#! /bin/bash
2
2
3
# TODO(dtfranz): The YAML in this file should be pulled out and organized into a kustomization.yaml (where possible) for maintainability/readability.
4
+
3
5
set -euo pipefail
4
6
5
7
help=" setup-monitoring.sh is used to set up prometheus monitoring for e2e testing.
92
94
runAsUser: 65534
93
95
seccompProfile:
94
96
type: RuntimeDefault
97
+ ruleSelector: {}
95
98
serviceDiscoveryRole: EndpointSlice
96
99
serviceMonitorSelector: {}
97
100
EOF
@@ -115,6 +118,49 @@ spec:
115
118
- {} # Allows us to query prometheus
116
119
EOF
117
120
121
# ServiceMonitor for the kubelet.
#
# Scrapes two paths on the kubelet's "https-metrics" port every 10s:
#   /metrics           - the kubelet's own metrics
#   /metrics/cadvisor  - per-container resource metrics
# Both endpoints keep only series whose "pod" label starts with
# operator-controller or catalogd and whose "container" label is "manager";
# everything else is dropped by the metricRelabelings "keep" rule.
#
# The monitor is created in ${NAMESPACE}, the same namespace this script uses
# for its PrometheusRule, rather than a hard-coded namespace, so the script
# keeps working when it is invoked with a different namespace.
# NOTE(review): presumably the Prometheus instance only selects ServiceMonitors
# from its own namespace - confirm against the Prometheus resource.
#
# insecureSkipVerify disables TLS verification of the kubelet serving
# certificate; the scrape authenticates with the pod's service account token.
kubectl apply -f - << EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
  namespace: ${NAMESPACE}
  labels:
    k8s-app: kubelet
spec:
  jobLabel: k8s-app
  endpoints:
  - port: https-metrics
    scheme: https
    path: /metrics
    interval: 10s
    honorLabels: true
    tlsConfig:
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    metricRelabelings:
    - action: keep
      sourceLabels: [pod,container]
      regex: (operator-controller|catalogd).*;manager
  - port: https-metrics
    scheme: https
    path: /metrics/cadvisor
    interval: 10s
    honorLabels: true
    tlsConfig:
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    metricRelabelings:
    - action: keep
      sourceLabels: [pod,container]
      regex: (operator-controller|catalogd).*;manager
  selector:
    matchLabels:
      k8s-app: kubelet
  namespaceSelector:
    matchNames:
    - kube-system
EOF
163
+
118
164
# Give the operator time to create the pod, then wait for it to become Ready.
# The first wait blocks until the pod object exists (up to 60s); the second
# blocks until its Ready condition is met (up to 120s). Under `set -e` a
# timeout in either wait aborts the script.
# ${NAMESPACE} is quoted so an unexpected value cannot be word-split or globbed.
kubectl wait --for=create pods -n "${NAMESPACE}" prometheus-prometheus-0 --timeout=60s
kubectl wait --for=condition=Ready pods -n "${NAMESPACE}" prometheus-prometheus-0 --timeout=120s
@@ -131,6 +177,56 @@ metadata:
131
177
kubernetes.io/service-account.name: prometheus
132
178
EOF
133
179
180
# PrometheusRule "controller-alerts", created in ${NAMESPACE}.
#
# Group controller-panic:
#   reconciler-panic / webhook-panic - any controller-runtime reconcile or
#   webhook panic counter above zero.
# Group resource-usage:
#   oom-events        - any container OOM event counter above zero.
#   *-memory-growth   - deriv() of summed working-set bytes over a 5m subquery
#                       above 50_000 (B/sec), sustained for 5m.
#   *-cpu-usage       - 5m rate of CPU seconds * 100 above 20 (%), sustained
#                       for 5m.
# Alerts with "for: 5m" also set keep_firing_for: 1d, so they stay firing for
# a day after the condition clears.
#
# The heredoc delimiter is unquoted so ${NAMESPACE} expands; the template
# variables are therefore written as \$labels / \$value so the shell leaves
# a literal "$" for Prometheus. There must be no space between "$" and the
# variable name, otherwise the template fails to parse.
kubectl apply -f - << EOF
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: controller-alerts
  namespace: ${NAMESPACE}
spec:
  groups:
  - name: controller-panic
    rules:
    - alert: reconciler-panic
      expr: controller_runtime_reconcile_panics_total{} > 0
      annotations:
        description: "controller of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
    - alert: webhook-panic
      expr: controller_runtime_webhook_panics_total{} > 0
      annotations:
        description: "controller webhook of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
  - name: resource-usage
    rules:
    - alert: oom-events
      expr: container_oom_events_total > 0
      annotations:
        description: "container {{ \$labels.container }} of pod {{ \$labels.pod }} experienced OOM event(s); count={{ \$value }}"
    - alert: operator-controller-memory-growth
      expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
      for: 5m
      keep_firing_for: 1d
      annotations:
        description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
    - alert: catalogd-memory-growth
      expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
      for: 5m
      keep_firing_for: 1d
      annotations:
        description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
    - alert: operator-controller-cpu-usage
      expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
      for: 5m
      keep_firing_for: 1d
      annotations:
        description: "operator-controller using high cpu resource for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
    - alert: catalogd-cpu-usage
      expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
      for: 5m
      keep_firing_for: 1d
      annotations:
        description: "catalogd using high cpu resources for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
EOF
229
+
134
230
# ServiceMonitors for operator-controller and catalogd
135
231
kubectl apply -f - << EOF
136
232
apiVersion: monitoring.coreos.com/v1
@@ -141,6 +237,7 @@ metadata:
141
237
spec:
142
238
endpoints:
143
239
- path: /metrics
240
+ interval: 10s
144
241
port: https
145
242
scheme: https
146
243
authorization:
@@ -178,6 +275,7 @@ spec:
178
275
endpoints:
179
276
- path: /metrics
180
277
port: metrics
278
+ interval: 10s
181
279
scheme: https
182
280
authorization:
183
281
credentials:
0 commit comments