Skip to content

Commit d97a45d

Browse files
authored
Explain in metrics.go how all metrics are used (#955)
* Add comments to metrics.go to explain when and how all metrics are used * add to the docs * fix a namespace tag
1 parent 22c86f5 commit d97a45d

File tree

4 files changed

+151
-47
lines changed

4 files changed

+151
-47
lines changed

docs/metrics_events.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,15 @@ Here's the list of metrics sent by the controller and the injector.
2222
* `chaos.controller.disruptions.stuck_on_removal` increments when a disruption is stuck on removal
2323
* `chaos.controller.disruptions.stuck_on_removal_total` is the total count of existing disruptions flagged as stuck on removal
2424
* `chaos.controller.disruptions.gauge` is the total count of existing disruptions
25-
* `chaos.controller.disruptions.count` increments when a disruption is created
25+
* `chaos.controller.disruptions.count` increments when a disruption is finished
26+
* `chaos.controller.watcher.calls_total` increments each time any watcher handles an OnChange event
27+
* `chaos.cron.controller.schedule.too_late` increments each time a DisruptionCron has missed the time to schedule its disruption
28+
* `chaos.cron.controller.schedule.target_missing` increments each time a DisruptionCron cannot find its target
29+
* `chaos.cron.controller.schedule.missing_target_found` increments each time a DisruptionCron that previously couldn't find its target is able to find it again
30+
* `chaos.cron.controller.schedule.missing_target_deleted` increments each time a DisruptionCron self-deletes because its target was missing for too long
31+
* `chaos.cron.controller.schedule.next_scheduled` is the time between now and when the next disruption for this DisruptionCron should run
32+
* `chaos.cron.controller.schedule.disruption_scheduled` increments each time a DisruptionCron schedules a child disruption
33+
* `chaos.cron.controller.schedule.paused` increments each time a DisruptionCron reconciles while in a paused state
2634

2735
#### Admission webhooks
2836

o11y/metrics/datadog/datadog.go

Lines changed: 45 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ func (d Sink) GetPrefix() string {
7979
return d.prefix
8080
}
8181

82-
// MetricInjected increments the injected metric
82+
// MetricInjected is used by the chaos-injector to indicate it has finished trying to inject the disruption into the target,
83+
// the `succeed` bool argument is false if there was an error while injecting.
8384
func (d Sink) MetricInjected(succeed bool, kind string, tags []string) error {
8485
status := boolToStatus(succeed)
8586
t := []string{"status:" + status, "kind:" + kind}
@@ -88,7 +89,9 @@ func (d Sink) MetricInjected(succeed bool, kind string, tags []string) error {
8889
return d.metricWithStatus(d.prefix+"injected", t)
8990
}
9091

91-
// MetricReinjected increments the reinjected metric
92+
// MetricReinjected is used by the chaos-injector to indicate it has finished trying to inject the disruption into the target,
93+
// the `succeed` bool argument is false if there was an error while injecting. This metric is used instead of MetricInjected
94+
// if the chaos-injector pod is performing any injection after its first, i.e., when using the pulse feature
9295
func (d Sink) MetricReinjected(succeed bool, kind string, tags []string) error {
9396
status := boolToStatus(succeed)
9497
t := []string{"status:" + status, "kind:" + kind}
@@ -97,7 +100,8 @@ func (d Sink) MetricReinjected(succeed bool, kind string, tags []string) error {
97100
return d.metricWithStatus(d.prefix+"reinjected", t)
98101
}
99102

100-
// MetricCleanedForReinjection increments the cleanedForReinjection metric
103+
// MetricCleanedForReinjection is used by the chaos-injector to indicate an injector has cleaned the disruptions from the target,
104+
// but expects to reinject, i.e., when using the spec.pulse feature
101105
func (d Sink) MetricCleanedForReinjection(succeed bool, kind string, tags []string) error {
102106
status := boolToStatus(succeed)
103107
t := []string{"status:" + status, "kind:" + kind}
@@ -106,7 +110,8 @@ func (d Sink) MetricCleanedForReinjection(succeed bool, kind string, tags []stri
106110
return d.metricWithStatus(d.prefix+"cleaned_for_reinjection", t)
107111
}
108112

109-
// MetricCleaned increments the cleaned metric
113+
// MetricCleaned is used by the chaos-injector to indicate an injector has cleaned the disruptions from the target,
114+
// and does not intend to re-inject.
110115
func (d Sink) MetricCleaned(succeed bool, kind string, tags []string) error {
111116
status := boolToStatus(succeed)
112117
t := []string{"status:" + status, "kind:" + kind}
@@ -115,55 +120,64 @@ func (d Sink) MetricCleaned(succeed bool, kind string, tags []string) error {
115120
return d.metricWithStatus(d.prefix+"cleaned", t)
116121
}
117122

118-
// MetricReconcile increment reconcile metric
123+
// MetricReconcile is used to count how many times the controller enters any reconcile loop
119124
func (d Sink) MetricReconcile() error {
120125
return d.metricWithStatus(d.prefix+"reconcile", []string{})
121126
}
122127

123-
// MetricReconcileDuration send timing metric for reconcile loop
128+
// MetricReconcileDuration is used at the end of every reconcile loop to report how long that Reconcile() call took
124129
func (d Sink) MetricReconcileDuration(duration time.Duration, tags []string) error {
125130
return d.timing(d.prefix+"reconcile.duration", duration, tags)
126131
}
127132

128-
// MetricCleanupDuration send timing metric for cleanup duration
133+
// MetricCleanupDuration indicates the duration between a Disruption's deletion timestamp, and when the chaos-controller
134+
// removes its finalizer
129135
func (d Sink) MetricCleanupDuration(duration time.Duration, tags []string) error {
130136
return d.timing(d.prefix+"cleanup.duration", duration, tags)
131137
}
132138

133-
// MetricInjectDuration send timing metric for inject duration
139+
// MetricInjectDuration indicates the duration between a Disruption's creation timestamp, and when it reaches a status
140+
// of Injected, indicating all chaos-injector pods have injected into their targets, and we've reached the expected count
134141
func (d Sink) MetricInjectDuration(duration time.Duration, tags []string) error {
135142
return d.timing(d.prefix+"inject.duration", duration, tags)
136143
}
137144

138-
// MetricDisruptionCompletedDuration sends timing metric for entire disruption duration
145+
// MetricDisruptionCompletedDuration indicates the duration between a Disruption's creation timestamp, and when the chaos-controller
146+
// removes its finalizer
139147
func (d Sink) MetricDisruptionCompletedDuration(duration time.Duration, tags []string) error {
140148
return d.timing(d.prefix+"disruption.completed_duration", duration, tags)
141149
}
142150

143-
// MetricDisruptionOngoingDuration sends timing metric for disruption duration so far
151+
// MetricDisruptionOngoingDuration indicates the duration between a Disruption's creation timestamp, and the current time.
152+
// This is emitted approximately once per minute
144153
func (d Sink) MetricDisruptionOngoingDuration(duration time.Duration, tags []string) error {
145154
return d.timing(d.prefix+"disruption.ongoing_duration", duration, tags)
146155
}
147156

148-
// MetricPodsCreated increment pods.created metric
157+
// MetricPodsCreated is used every time the chaos-controller finishes sending a Create request to the k8s api to
158+
// schedule a new chaos-injector pod. The `succeed` bool argument is false if there was an error returned.
149159
func (d Sink) MetricPodsCreated(target, instanceName, namespace string, succeed bool) error {
150160
status := boolToStatus(succeed)
151-
tags := []string{"target:" + target, "disruptionName:" + instanceName, "status:" + status, "namespace:" + namespace}
161+
tags := []string{"target:" + target, "disruptionName:" + instanceName, "status:" + status, "disruptionNamespace:" + namespace}
152162

153163
return d.metricWithStatus(d.prefix+"pods.created", tags)
154164
}
155165

156-
// MetricStuckOnRemoval increments disruptions.stuck_on_removal metric
166+
// MetricStuckOnRemoval is emitted once per minute per disruption, if that disruption is "stuck on removal", i.e.,
167+
// we have attempted to clean and delete the disruption, but that has not worked, and a human needs to intervene.
157168
func (d Sink) MetricStuckOnRemoval(tags []string) error {
158169
return d.metricWithStatus(d.prefix+"disruptions.stuck_on_removal", tags)
159170
}
160171

161-
// MetricStuckOnRemovalGauge sends disruptions.stuck_on_removal_total metric containing the gauge of stuck disruptions
172+
// MetricStuckOnRemovalGauge is emitted once per minute counting the total number of disruptions that are
173+
// "stuck on removal", i.e., we have attempted to clean and delete the disruption, but that has not worked,
174+
// and a human needs to intervene.
162175
func (d Sink) MetricStuckOnRemovalGauge(gauge float64) error {
163176
return d.client.Gauge(d.prefix+"disruptions.stuck_on_removal_total", gauge, []string{}, 1)
164177
}
165178

166-
// MetricDisruptionsGauge sends the disruptions.gauge metric counting ongoing disruptions
179+
// MetricDisruptionsGauge is emitted once per minute counting the total number of ongoing disruptions per namespace,
180+
// or if we fail to determine the namespaced metrics, simply the total number of disruptions found
167181
func (d Sink) MetricDisruptionsGauge(gauge float64, tags []string) error {
168182
return d.client.Gauge(d.prefix+"disruptions.gauge", gauge, tags, 1)
169183
}
@@ -174,37 +188,40 @@ func (d Sink) MetricDisruptionsCount(kind chaostypes.DisruptionKindName, tags []
174188
return d.metricWithStatus(d.prefix+"disruptions.count", tags)
175189
}
176190

177-
// MetricPodsGauge sends the pods.gauge metric counting existing chaos pods
191+
// MetricPodsGauge is emitted once per minute counting the total number of live chaos pods for all ongoing disruptions
178192
func (d Sink) MetricPodsGauge(gauge float64) error {
179193
return d.client.Gauge(d.prefix+"pods.gauge", gauge, []string{}, 1)
180194
}
181195

182-
// MetricRestart sends an increment of the controller restart metric
196+
// MetricRestart is emitted once, every time the manager container of the chaos-controller starts up
183197
func (d Sink) MetricRestart() error {
184198
return d.metricWithStatus(d.prefix+"restart", []string{})
185199
}
186200

187-
// MetricValidationFailed increments the failed validation metric
201+
// MetricValidationFailed is emitted in ValidateCreate and ValidateUpdate in the disruption_webhook, specifically and
202+
// only when DisruptionSpec.Validate() returns an error, OR when trying to remove the finalizer from a disruption with
203+
// chaos pods.
188204
func (d Sink) MetricValidationFailed(tags []string) error {
189205
return d.metricWithStatus(d.prefix+"validation.failed", tags)
190206
}
191207

192-
// MetricValidationCreated increments the created validation metric
208+
// MetricValidationCreated is emitted once per created Disruption, in the webhook after validation completes.
193209
func (d Sink) MetricValidationCreated(tags []string) error {
194210
return d.metricWithStatus(d.prefix+"validation.created", tags)
195211
}
196212

197-
// MetricValidationUpdated increments the updated validation metric
213+
// MetricValidationUpdated is emitted once per Disruption update, in the webhook after validation completes
198214
func (d Sink) MetricValidationUpdated(tags []string) error {
199215
return d.metricWithStatus(d.prefix+"validation.updated", tags)
200216
}
201217

202-
// MetricValidationDeleted increments the deleted validation metric
218+
// MetricValidationDeleted is emitted once per Disruption delete, in the webhook
203219
func (d Sink) MetricValidationDeleted(tags []string) error {
204220
return d.metricWithStatus(d.prefix+"validation.deleted", tags)
205221
}
206222

207-
// MetricInformed increments when the pod informer receives an event to process before reconciliation
223+
// MetricInformed is emitted every time the manager container's informer is called to check a pod in the chaos-controller's
224+
// namespace, to see if that pod is a chaos-injector pod that needs its Disruption reconciled.
208225
func (d Sink) MetricInformed(tags []string) error {
209226
return d.metricWithStatus(d.prefix+"informed", tags)
210227
}
@@ -214,24 +231,24 @@ func (d Sink) MetricOrphanFound(tags []string) error {
214231
return d.metricWithStatus(d.prefix+"orphan.found", tags)
215232
}
216233

217-
// MetricWatcherCalls is a counter of watcher calls.
234+
// MetricWatcherCalls is a counter of watcher calls. This is emitted by every OnChange event for all of our watchers,
235+
// e.g., the chaos pod watcher, the target pod watcher, the disruption watcher.
218236
func (d Sink) MetricWatcherCalls(tags []string) error {
219237
return d.metricWithStatus(d.prefix+"watcher.calls_total", tags)
220238
}
221239

222-
// MetricTooLate reports when a scheduled disruption misses its aloted time to be scheduled
240+
// MetricTooLate reports when a scheduled Disruption misses its configured time to be run,
223241
// specific to cron and rollout controllers
224242
func (d Sink) MetricTooLate(tags []string) error {
225243
return d.metricWithStatus(d.prefix+"schedule.too_late", tags)
226244
}
227245

228-
// MetricTargetMissing reports when a scheduled Disruption can not find its specific target
229-
// either for the first time or multiple times. A deletion occurs on the final alert
246+
// MetricTargetMissing reports any time a scheduled Disruption cannot find its specified target
230247
func (d Sink) MetricTargetMissing(duration time.Duration, tags []string) error {
231248
return d.timing(d.prefix+"schedule.target_missing", duration, tags)
232249
}
233250

234-
// MetricMissingTargetFound reports when a scheduled Disruption which had initially been deemed missing
251+
// MetricMissingTargetFound reports when a scheduled Disruption's target which had initially been deemed missing
235252
// is "found" and running in the kubernetes namespace
236253
func (d Sink) MetricMissingTargetFound(tags []string) error {
237254
return d.metricWithStatus(d.prefix+"schedule.missing_target_found", tags)
@@ -243,7 +260,7 @@ func (d Sink) MetricMissingTargetDeleted(tags []string) error {
243260
return d.metricWithStatus(d.prefix+"schedule.missing_target_deleted", tags)
244261
}
245262

246-
// MetricNextScheduledTime reports the duration until the next scheduled disruption will run
263+
// MetricNextScheduledTime reports the duration until the next disruption for this scheduled Disruption should run
247264
func (d Sink) MetricNextScheduledTime(duration time.Duration, tags []string) error {
248265
return d.timing(d.prefix+"schedule.next_scheduled", duration, tags)
249266
}

0 commit comments

Comments
 (0)