Skip to content

Commit 9790751

Browse files
authored
CHAOSPLT-580: Send metric when cron or rollout self-deletes (#953)
* CHAOSPLT-580: Allow for alerting when a cron is deleted for missing its target * allow for configuring target resource missing threshold
1 parent 301264f commit 9790751

File tree

12 files changed

+97
-19
lines changed

12 files changed

+97
-19
lines changed

chart/templates/configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ data:
6565
minimumCronFrequency: {{ .Values.controller.minimumCronFrequency }}
6666
maxDuration: {{ .Values.controller.maxDuration }}
6767
finalizerDeletionDelay: {{ .Values.controller.finalizerDeletionDelay }}
68+
targetResourceMissingThreshold: {{ .Values.controller.targetResourceMissingThreshold }}
6869
expiredDisruptionGCDelay: {{ .Values.controller.expiredDisruptionGCDelay }}
6970
userInfoHook: {{ .Values.controller.userInfoHook }}
7071
webhook:

chart/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ controller:
7171
defaultCronDelayedStartTolerance: 15m
7272
minimumCronFrequency: 15m # a disruption cron with a spec.schedule that runs more often than this will be rejected.
7373
finalizerDeletionDelay: 20s
74+
targetResourceMissingThreshold: 24h # a disruption cron or rollout deletes itself once its target has been missing for longer than this
7475
expiredDisruptionGCDelay: 10m # time after a disruption expires before deleting it
7576
userInfoHook: true
7677
webhook: # admission webhook configuration

chart/values/local.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ controller:
4343
defaultDuration: 3m
4444
finalizerDeletionDelay: 2s
4545
expiredDisruptionGCDelay: 15s
46+
targetResourceMissingThreshold: 1m
4647
webhook:
4748
certDir: chart/certs
4849
host: ""

config/config.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ type controllerConfig struct {
5555
DisruptionRolloutEnabled bool `json:"disruptionRolloutEnabled" yaml:"disruptionRolloutEnabled"`
5656
DisruptionDeletionTimeout time.Duration `json:"disruptionDeletionTimeout" yaml:"disruptionDeletionTimeout"`
5757
FinalizerDeletionDelay time.Duration `json:"finalizerDeletionDelay" yaml:"finalizerDeletionDelay"`
58+
TargetResourceMissingThreshold time.Duration `json:"targetResourceMissingThreshold" yaml:"targetResourceMissingThreshold"`
5859
DisabledDisruptions []string `json:"disabledDisruptions" yaml:"disabledDisruptions"`
5960
}
6061

@@ -569,6 +570,12 @@ func New(client corev1client.ConfigMapInterface, logger *zap.SugaredLogger, osAr
569570
return cfg, err
570571
}
571572

573+
mainFS.DurationVar(&cfg.Controller.TargetResourceMissingThreshold, "target-resource-missing-threshold", time.Hour*24, "Define the amount of time a cron or rollout will tolerate its target missing before self-deleting")
574+
575+
if err := viper.BindPFlag("controller.targetResourceMissingThreshold", mainFS.Lookup("target-resource-missing-threshold")); err != nil {
576+
return cfg, err
577+
}
578+
572579
if err := preConfigFS.Parse(osArgs); err != nil {
573580
return cfg, fmt.Errorf("unable to retrieve configuration parse from provided flag: %w", err)
574581
}

controllers/cron_rollout_helpers.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,8 @@ import (
2525
)
2626

2727
const (
28-
DisruptionCronNameLabel = chaosv1beta1.GroupName + "/disruption-cron-name"
29-
DisruptionRolloutNameLabel = chaosv1beta1.GroupName + "/disruption-rollout-name"
30-
TargetResourceMissingThreshold = time.Hour * 24
28+
DisruptionCronNameLabel = chaosv1beta1.GroupName + "/disruption-cron-name"
29+
DisruptionRolloutNameLabel = chaosv1beta1.GroupName + "/disruption-rollout-name"
3130
)
3231

3332
// GetChildDisruptions retrieves disruptions associated with a resource by its label.

controllers/disruption_cron_controller.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,13 @@ import (
2727
var DisruptionCronTags = []string{}
2828

2929
type DisruptionCronReconciler struct {
30-
Client client.Client
31-
Scheme *runtime.Scheme
32-
BaseLog *zap.SugaredLogger
33-
log *zap.SugaredLogger
34-
MetricsSink metrics.Sink
35-
FinalizerDeletionDelay time.Duration
30+
Client client.Client
31+
Scheme *runtime.Scheme
32+
BaseLog *zap.SugaredLogger
33+
log *zap.SugaredLogger
34+
MetricsSink metrics.Sink
35+
FinalizerDeletionDelay time.Duration
36+
TargetResourceMissingThreshold time.Duration
3637
}
3738

3839
func (r *DisruptionCronReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) {
@@ -260,7 +261,7 @@ func (r *DisruptionCronReconciler) updateTargetResourcePreviouslyMissing(ctx con
260261
return targetResourceExists, disruptionCronDeleted, r.handleTargetResourceFirstMissing(ctx, instance)
261262
}
262263

263-
if time.Since(instance.Status.TargetResourcePreviouslyMissing.Time) > TargetResourceMissingThreshold {
264+
if time.Since(instance.Status.TargetResourcePreviouslyMissing.Time) > r.TargetResourceMissingThreshold {
264265
r.log.Warnw("target has been missing for over one day, deleting this schedule",
265266
"timeMissing", time.Since(instance.Status.TargetResourcePreviouslyMissing.Time))
266267

@@ -298,6 +299,8 @@ func (r *DisruptionCronReconciler) handleTargetResourceMissingPastExpiration(ctx
298299
return fmt.Errorf("failed to delete instance: %w", err)
299300
}
300301

302+
r.handleMetricSinkError(r.MetricsSink.MetricMissingTargetDeleted(DisruptionCronTags))
303+
301304
return nil
302305
}
303306

controllers/disruption_rollout_controller.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,12 @@ import (
2424
var DisruptionRolloutTags = []string{}
2525

2626
type DisruptionRolloutReconciler struct {
27-
Client client.Client
28-
Scheme *runtime.Scheme
29-
BaseLog *zap.SugaredLogger
30-
log *zap.SugaredLogger
31-
MetricsSink metrics.Sink
27+
Client client.Client
28+
Scheme *runtime.Scheme
29+
BaseLog *zap.SugaredLogger
30+
log *zap.SugaredLogger
31+
MetricsSink metrics.Sink
32+
TargetResourceMissingThreshold time.Duration
3233
}
3334

3435
func (r *DisruptionRolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) {
@@ -203,7 +204,7 @@ func (r *DisruptionRolloutReconciler) updateTargetResourcePreviouslyMissing(ctx
203204
return targetResourceExists, disruptionRolloutDeleted, r.handleTargetResourceFirstMissing(ctx, instance)
204205
}
205206

206-
if time.Since(instance.Status.TargetResourcePreviouslyMissing.Time) > TargetResourceMissingThreshold {
207+
if time.Since(instance.Status.TargetResourcePreviouslyMissing.Time) > r.TargetResourceMissingThreshold {
207208
r.log.Errorw("target has been missing for over one day, deleting this schedule",
208209
"timeMissing", time.Since(instance.Status.TargetResourcePreviouslyMissing.Time))
209210

@@ -241,6 +242,8 @@ func (r *DisruptionRolloutReconciler) handleTargetResourceMissingPastExpiration(
241242
return fmt.Errorf("failed to delete instance: %w", err)
242243
}
243244

245+
r.handleMetricSinkError(r.MetricsSink.MetricMissingTargetDeleted(DisruptionRolloutTags))
246+
244247
return nil
245248
}
246249

main.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,8 @@ func main() {
322322
BaseLog: logger,
323323
Scheme: mgr.GetScheme(),
324324
// new metrics sink for rollout controller
325-
MetricsSink: initMetricsSink(cfg.Controller.MetricsSink, logger, metricstypes.SinkAppRolloutController),
325+
MetricsSink: initMetricsSink(cfg.Controller.MetricsSink, logger, metricstypes.SinkAppRolloutController),
326+
TargetResourceMissingThreshold: cfg.Controller.TargetResourceMissingThreshold,
326327
}
327328

328329
defer closeMetricsSink(logger, disruptionRolloutReconciler.MetricsSink)
@@ -386,8 +387,9 @@ func main() {
386387
BaseLog: logger,
387388
Scheme: mgr.GetScheme(),
388389
// new metrics sink for cron controller
389-
MetricsSink: initMetricsSink(cfg.Controller.MetricsSink, logger, metricstypes.SinkAppCronController),
390-
FinalizerDeletionDelay: cfg.Controller.FinalizerDeletionDelay,
390+
MetricsSink: initMetricsSink(cfg.Controller.MetricsSink, logger, metricstypes.SinkAppCronController),
391+
FinalizerDeletionDelay: cfg.Controller.FinalizerDeletionDelay,
392+
TargetResourceMissingThreshold: cfg.Controller.TargetResourceMissingThreshold,
391393
}
392394

393395
defer closeMetricsSink(logger, disruptionCronReconciler.MetricsSink)

o11y/metrics/datadog/datadog.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,12 @@ func (d Sink) MetricMissingTargetFound(tags []string) error {
242242
return d.metricWithStatus(d.prefix+"schedule.missing_target_found", tags)
243243
}
244244

245+
// MetricMissingTargetDeleted reports when a scheduled Disruption has been deleted by the chaos-controller,
246+
// because its target has been missing for too long
247+
func (d Sink) MetricMissingTargetDeleted(tags []string) error {
248+
return d.metricWithStatus(d.prefix+"schedule.missing_target_deleted", tags)
249+
}
250+
245251
// MetricNextScheduledTime reports the duration until the next scheduled disruption will run
246252
func (d Sink) MetricNextScheduledTime(duration time.Duration, tags []string) error {
247253
return d.timing(d.prefix+"schedule.next_scheduled", duration, tags)

o11y/metrics/metrics.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ type Sink interface {
4949
MetricTooLate(tags []string) error
5050
MetricTargetMissing(duration time.Duration, tags []string) error
5151
MetricMissingTargetFound(tags []string) error
52+
MetricMissingTargetDeleted(tags []string) error
5253
MetricNextScheduledTime(time time.Duration, tags []string) error
5354
MetricDisruptionScheduled(tags []string) error
5455
MetricPausedCron(tags []string) error

o11y/metrics/noop/noop.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,14 @@ func (n Sink) MetricMissingTargetFound(tags []string) error {
234234
return nil
235235
}
236236

237+
// MetricMissingTargetDeleted reports when a scheduled Disruption has been deleted by the chaos-controller,
238+
// because its target has been missing for too long
239+
func (n Sink) MetricMissingTargetDeleted(tags []string) error {
240+
n.log.Debugf("NOOP: MetricMissingTargetDeleted %s\n", tags)
241+
242+
return nil
243+
}
244+
237245
// MetricNextScheduledTime reports the duration until the next scheduled disruption will run
238246
func (n Sink) MetricNextScheduledTime(duration time.Duration, tags []string) error {
239247
n.log.Debugf("NOOP: MetricNextScheduledRun %v, s%s\n", duration, tags)

o11y/metrics/sink_mock.go

Lines changed: 46 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)