@@ -79,7 +79,8 @@ func (d Sink) GetPrefix() string {
79
79
return d .prefix
80
80
}
81
81
82
- // MetricInjected increments the injected metric
82
+ // MetricInjected is used by the chaos-injector to indicate it has finished trying to inject the disruption into the target,
83
+ // the `succeed` bool argument is false if there was an error while injecting.
83
84
func (d Sink ) MetricInjected (succeed bool , kind string , tags []string ) error {
84
85
status := boolToStatus (succeed )
85
86
t := []string {"status:" + status , "kind:" + kind }
@@ -88,7 +89,9 @@ func (d Sink) MetricInjected(succeed bool, kind string, tags []string) error {
88
89
return d .metricWithStatus (d .prefix + "injected" , t )
89
90
}
90
91
91
- // MetricReinjected increments the reinjected metric
92
+ // MetricReinjected is used by the chaos-injector to indicate it has finished trying to inject the disruption into the target,
93
+ // the `succeed` bool argument is false if there was an error while injecting. This metric is used instead of MetricInjected
94
+ // if the chaos-injector pod is performing any injection after its first, i.e., when using the pulse feature
92
95
func (d Sink ) MetricReinjected (succeed bool , kind string , tags []string ) error {
93
96
status := boolToStatus (succeed )
94
97
t := []string {"status:" + status , "kind:" + kind }
@@ -97,7 +100,8 @@ func (d Sink) MetricReinjected(succeed bool, kind string, tags []string) error {
97
100
return d .metricWithStatus (d .prefix + "reinjected" , t )
98
101
}
99
102
100
- // MetricCleanedForReinjection increments the cleanedForReinjection metric
103
+ // MetricCleanedForReinjection is used by the chaos-injector to indicate an injector has cleaned the disruptions from the target,
104
+ // but expects to reinject, i.e., when using the spec.pulse feature
101
105
func (d Sink ) MetricCleanedForReinjection (succeed bool , kind string , tags []string ) error {
102
106
status := boolToStatus (succeed )
103
107
t := []string {"status:" + status , "kind:" + kind }
@@ -106,7 +110,8 @@ func (d Sink) MetricCleanedForReinjection(succeed bool, kind string, tags []stri
106
110
return d .metricWithStatus (d .prefix + "cleaned_for_reinjection" , t )
107
111
}
108
112
109
- // MetricCleaned increments the cleaned metric
113
+ // MetricCleaned is used by the chaos-injector to indicate an injector has cleaned the disruptions from the target,
114
+ // and does not intend to re-inject.
110
115
func (d Sink ) MetricCleaned (succeed bool , kind string , tags []string ) error {
111
116
status := boolToStatus (succeed )
112
117
t := []string {"status:" + status , "kind:" + kind }
@@ -115,55 +120,64 @@ func (d Sink) MetricCleaned(succeed bool, kind string, tags []string) error {
115
120
return d .metricWithStatus (d .prefix + "cleaned" , t )
116
121
}
117
122
118
- // MetricReconcile increment reconcile metric
123
+ // MetricReconcile is used to count how many times the controller enters any reconcile loop
119
124
func (d Sink ) MetricReconcile () error {
120
125
return d .metricWithStatus (d .prefix + "reconcile" , []string {})
121
126
}
122
127
123
- // MetricReconcileDuration send timing metric for reconcile loop
128
+ // MetricReconcileDuration is used at the end of every reconcile loop to report how long that Reconcile() call took
124
129
func (d Sink ) MetricReconcileDuration (duration time.Duration , tags []string ) error {
125
130
return d .timing (d .prefix + "reconcile.duration" , duration , tags )
126
131
}
127
132
128
- // MetricCleanupDuration send timing metric for cleanup duration
133
+ // MetricCleanupDuration indicates the duration between a Disruption's deletion timestamp, and when the chaos-controller
134
+ // removes its finalizer
129
135
func (d Sink ) MetricCleanupDuration (duration time.Duration , tags []string ) error {
130
136
return d .timing (d .prefix + "cleanup.duration" , duration , tags )
131
137
}
132
138
133
- // MetricInjectDuration send timing metric for inject duration
139
+ // MetricInjectDuration indicates the duration between a Disruption's creation timestamp, and when it reaches a status
140
+ // of Injected, indicating all chaos-injector pods have injected into their targets, and we've reached the expected count
134
141
func (d Sink ) MetricInjectDuration (duration time.Duration , tags []string ) error {
135
142
return d .timing (d .prefix + "inject.duration" , duration , tags )
136
143
}
137
144
138
- // MetricDisruptionCompletedDuration sends timing metric for entire disruption duration
145
+ // MetricDisruptionCompletedDuration indicates the duration between a Disruption's creation timestamp, and when the chaos-controller
146
+ // removes its finalizer
139
147
func (d Sink ) MetricDisruptionCompletedDuration (duration time.Duration , tags []string ) error {
140
148
return d .timing (d .prefix + "disruption.completed_duration" , duration , tags )
141
149
}
142
150
143
- // MetricDisruptionOngoingDuration sends timing metric for disruption duration so far
151
+ // MetricDisruptionOngoingDuration indicates the duration between a Disruption's creation timestamp, and the current time.
152
+ // This is emitted approximately once per minute
144
153
func (d Sink ) MetricDisruptionOngoingDuration (duration time.Duration , tags []string ) error {
145
154
return d .timing (d .prefix + "disruption.ongoing_duration" , duration , tags )
146
155
}
147
156
148
- // MetricPodsCreated increment pods.created metric
157
+ // MetricPodsCreated is used every time the chaos-controller finishes sending a Create request to the k8s api to
158
+ // schedule a new chaos-injector pod. The `succeed` bool argument is false if there was an error returned.
149
159
func (d Sink ) MetricPodsCreated (target , instanceName , namespace string , succeed bool ) error {
150
160
status := boolToStatus (succeed )
151
- tags := []string {"target:" + target , "disruptionName:" + instanceName , "status:" + status , "namespace :" + namespace }
161
+ tags := []string {"target:" + target , "disruptionName:" + instanceName , "status:" + status , "disruptionNamespace :" + namespace }
152
162
153
163
return d .metricWithStatus (d .prefix + "pods.created" , tags )
154
164
}
155
165
156
- // MetricStuckOnRemoval increments disruptions.stuck_on_removal metric
166
+ // MetricStuckOnRemoval is emitted once per minute per disruption, if that disruption is "stuck on removal", i.e.,
167
+ // we have attempted to clean and delete the disruption, but that has not worked, and a human needs to intervene.
157
168
func (d Sink ) MetricStuckOnRemoval (tags []string ) error {
158
169
return d .metricWithStatus (d .prefix + "disruptions.stuck_on_removal" , tags )
159
170
}
160
171
161
- // MetricStuckOnRemovalGauge sends disruptions.stuck_on_removal_total metric containing the gauge of stuck disruptions
172
+ // MetricStuckOnRemovalGauge is emitted once per minute counting the total number of disruptions that are
173
+ // "stuck on removal", i.e., we have attempted to clean and delete the disruption, but that has not worked,
174
+ // and a human needs to intervene.
162
175
func (d Sink ) MetricStuckOnRemovalGauge (gauge float64 ) error {
163
176
return d .client .Gauge (d .prefix + "disruptions.stuck_on_removal_total" , gauge , []string {}, 1 )
164
177
}
165
178
166
- // MetricDisruptionsGauge sends the disruptions.gauge metric counting ongoing disruptions
179
+ // MetricDisruptionsGauge is emitted once per minute counting the total number of ongoing disruptions per namespace,
180
+ // or if we fail to determine the namespaced metrics, simply the total number of disruptions found
167
181
func (d Sink ) MetricDisruptionsGauge (gauge float64 , tags []string ) error {
168
182
return d .client .Gauge (d .prefix + "disruptions.gauge" , gauge , tags , 1 )
169
183
}
@@ -174,37 +188,40 @@ func (d Sink) MetricDisruptionsCount(kind chaostypes.DisruptionKindName, tags []
174
188
return d .metricWithStatus (d .prefix + "disruptions.count" , tags )
175
189
}
176
190
177
- // MetricPodsGauge sends the pods.gauge metric counting existing chaos pods
191
+ // MetricPodsGauge is emitted once per minute counting the total number of live chaos pods for all ongoing disruptions
178
192
func (d Sink ) MetricPodsGauge (gauge float64 ) error {
179
193
return d .client .Gauge (d .prefix + "pods.gauge" , gauge , []string {}, 1 )
180
194
}
181
195
182
- // MetricRestart sends an increment of the controller restart metric
196
+ // MetricRestart is emitted once, every time the manager container of the chaos-controller starts up
183
197
func (d Sink ) MetricRestart () error {
184
198
return d .metricWithStatus (d .prefix + "restart" , []string {})
185
199
}
186
200
187
- // MetricValidationFailed increments the failed validation metric
201
+ // MetricValidationFailed is emitted in ValidateCreate and ValidateUpdate in the disruption_webhook, specifically and
202
+ // only when DisruptionSpec.Validate() returns an error, OR when trying to remove the finalizer from a disruption with
203
+ // chaos pods.
188
204
func (d Sink ) MetricValidationFailed (tags []string ) error {
189
205
return d .metricWithStatus (d .prefix + "validation.failed" , tags )
190
206
}
191
207
192
- // MetricValidationCreated increments the created validation metric
208
+ // MetricValidationCreated is emitted once per created Disruption, in the webhook after validation completes.
193
209
func (d Sink ) MetricValidationCreated (tags []string ) error {
194
210
return d .metricWithStatus (d .prefix + "validation.created" , tags )
195
211
}
196
212
197
- // MetricValidationUpdated increments the updated validation metric
213
+ // MetricValidationUpdated is emitted once per Disruption update, in the webhook after validation completes
198
214
func (d Sink ) MetricValidationUpdated (tags []string ) error {
199
215
return d .metricWithStatus (d .prefix + "validation.updated" , tags )
200
216
}
201
217
202
- // MetricValidationDeleted increments the deleted validation metric
218
+ // MetricValidationDeleted is emitted once per Disruption delete, in the webhook
203
219
func (d Sink ) MetricValidationDeleted (tags []string ) error {
204
220
return d .metricWithStatus (d .prefix + "validation.deleted" , tags )
205
221
}
206
222
207
- // MetricInformed increments when the pod informer receives an event to process before reconciliation
223
+ // MetricInformed is emitted every time the manager container's informer is called to check a pod in the chaos-controller's
224
+ // namespace, to see if that pod is a chaos-injector pod that needs its Disruption reconciled.
208
225
func (d Sink ) MetricInformed (tags []string ) error {
209
226
return d .metricWithStatus (d .prefix + "informed" , tags )
210
227
}
@@ -214,24 +231,24 @@ func (d Sink) MetricOrphanFound(tags []string) error {
214
231
return d .metricWithStatus (d .prefix + "orphan.found" , tags )
215
232
}
216
233
217
- // MetricWatcherCalls is a counter of watcher calls.
234
+ // MetricWatcherCalls is a counter of watcher calls. This is emitted by every OnChange event for all of our watchers,
235
+ // e.g., the chaos pod watcher, the target pod watcher, the disruption watcher.
218
236
func (d Sink ) MetricWatcherCalls (tags []string ) error {
219
237
return d .metricWithStatus (d .prefix + "watcher.calls_total" , tags )
220
238
}
221
239
222
- // MetricTooLate reports when a scheduled disruption misses its aloted time to be scheduled
240
+ // MetricTooLate reports when a scheduled Disruption misses its configured time to be run,
223
241
// specific to cron and rollout controllers
224
242
func (d Sink ) MetricTooLate (tags []string ) error {
225
243
return d .metricWithStatus (d .prefix + "schedule.too_late" , tags )
226
244
}
227
245
228
- // MetricTargetMissing reports when a scheduled Disruption can not find its specific target
229
- // either for the first time or multiple times. A deletion occurs on the final alert
246
+ // MetricTargetMissing reports any time a scheduled Disruption cannot find its specified target
230
247
func (d Sink ) MetricTargetMissing (duration time.Duration , tags []string ) error {
231
248
return d .timing (d .prefix + "schedule.target_missing" , duration , tags )
232
249
}
233
250
234
- // MetricMissingTargetFound reports when a scheduled Disruption which had initially been deemed missing
251
+ // MetricMissingTargetFound reports when a scheduled Disruption's target which had initially been deemed missing
235
252
// is "found" and running in the kubernetes namespace
236
253
func (d Sink ) MetricMissingTargetFound (tags []string ) error {
237
254
return d .metricWithStatus (d .prefix + "schedule.missing_target_found" , tags )
@@ -243,7 +260,7 @@ func (d Sink) MetricMissingTargetDeleted(tags []string) error {
243
260
return d .metricWithStatus (d .prefix + "schedule.missing_target_deleted" , tags )
244
261
}
245
262
246
- // MetricNextScheduledTime reports the duration until the next scheduled disruption will run
263
+ // MetricNextScheduledTime reports the duration until this scheduled Disruption's next scheduled disruption should run
247
264
func (d Sink ) MetricNextScheduledTime (duration time.Duration , tags []string ) error {
248
265
return d .timing (d .prefix + "schedule.next_scheduled" , duration , tags )
249
266
}
0 commit comments