Skip to content

Commit e9408ce

Browse files
authored
defer validationFailed so that we always send it correctly (#956)
* defer validationFailed so that we always send it correctly * disruption crons should send validation failed metrics * dont use namespace key for disruption validation metrics
1 parent 9790751 commit e9408ce

File tree

5 files changed

+78
-29
lines changed

5 files changed

+78
-29
lines changed

api/v1beta1/disruption_cron_types.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
package v1beta1
77

88
import (
9+
"fmt"
910
"time"
1011

12+
"github.com/DataDog/chaos-controller/log"
13+
1114
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1215
)
1316

@@ -108,3 +111,13 @@ type DisruptionCronTrigger struct {
108111
func (d *DisruptionCron) IsReadyToRemoveFinalizer(finalizerDelay time.Duration) bool {
109112
return d.DeletionTimestamp != nil && time.Now().After(d.DeletionTimestamp.Add(finalizerDelay))
110113
}
114+
115+
// getMetricsTags parses the disruptioncron to generate metrics tags
116+
func (d *DisruptionCron) getMetricsTags() []string {
117+
tags := []string{
118+
fmt.Sprintf("%s:%s", log.DisruptionCronNameKey, d.Name),
119+
fmt.Sprintf("%s:%s", log.DisruptionCronNamespaceKey, d.Namespace),
120+
}
121+
122+
return tags
123+
}

api/v1beta1/disruption_cron_webhook.go

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"time"
1414

1515
cLog "github.com/DataDog/chaos-controller/log"
16+
"github.com/DataDog/chaos-controller/o11y/metrics"
1617
"github.com/DataDog/chaos-controller/utils"
1718
"github.com/robfig/cron"
1819
"go.uber.org/zap"
@@ -37,6 +38,7 @@ var (
3738
disruptionCronWebhookDeleteOnly bool
3839
disruptionCronPermittedUserGroups map[string]struct{}
3940
disruptionCronPermittedUserGroupString string
41+
disruptionCronMetricsSink metrics.Sink
4042
defaultCronDelayedStartTolerance time.Duration
4143
minimumCronFrequency time.Duration
4244
)
@@ -48,6 +50,7 @@ func (d *DisruptionCron) SetupWebhookWithManager(setupWebhookConfig utils.SetupW
4850
"source", "admission-controller",
4951
"admission-controller", "disruption-cron-webhook",
5052
)
53+
disruptionCronMetricsSink = setupWebhookConfig.MetricsSink
5154

5255
disruptionCronPermittedUserGroups = map[string]struct{}{}
5356

@@ -82,29 +85,39 @@ func (d *DisruptionCron) Default() {
8285
var _ webhook.Validator = &DisruptionCron{}
8386

8487
// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
85-
func (d *DisruptionCron) ValidateCreate() (admission.Warnings, error) {
88+
func (d *DisruptionCron) ValidateCreate() (_ admission.Warnings, err error) {
8689
log := disruptionCronWebhookLogger.With(cLog.DisruptionCronNameKey, d.Name, cLog.DisruptionCronNamespaceKey, d.Namespace)
8790

8891
log.Infow("validating created disruption cron", "spec", d.Spec)
8992

93+
metricTags := d.getMetricsTags()
94+
95+
defer func() {
96+
if err != nil {
97+
if mErr := disruptionCronMetricsSink.MetricValidationFailed(metricTags); mErr != nil {
98+
log.Errorw("error sending a metric", "error", mErr)
99+
}
100+
}
101+
}()
102+
90103
// delete-only mode, reject everything trying to be created
91104
if disruptionCronWebhookDeleteOnly {
92105
return nil, errors.New("the controller is currently in delete-only mode, you can't create new disruption cron for now")
93106
}
94107

95-
if err := validateUserInfoGroup(d, disruptionCronPermittedUserGroups, disruptionCronPermittedUserGroupString); err != nil {
108+
if err = validateUserInfoGroup(d, disruptionCronPermittedUserGroups, disruptionCronPermittedUserGroupString); err != nil {
96109
return nil, err
97110
}
98111

99-
if err := d.validateDisruptionCronName(); err != nil {
112+
if err = d.validateDisruptionCronName(); err != nil {
100113
return nil, err
101114
}
102115

103-
if err := d.validateDisruptionCronSpec(); err != nil {
116+
if err = d.validateDisruptionCronSpec(); err != nil {
104117
return nil, err
105118
}
106119

107-
if err := d.validateMinimumFrequency(minimumCronFrequency); err != nil {
120+
if err = d.validateMinimumFrequency(minimumCronFrequency); err != nil {
108121
return nil, err
109122
}
110123

@@ -114,20 +127,30 @@ func (d *DisruptionCron) ValidateCreate() (admission.Warnings, error) {
114127
return nil, nil
115128
}
116129

117-
func (d *DisruptionCron) ValidateUpdate(oldObject runtime.Object) (admission.Warnings, error) {
130+
func (d *DisruptionCron) ValidateUpdate(oldObject runtime.Object) (_ admission.Warnings, err error) {
118131
log := logger.With(cLog.DisruptionCronNameKey, d.Name, cLog.DisruptionCronNamespaceKey, d.Namespace)
119132

120133
log.Infow("validating updated disruption cron", "spec", d.Spec)
121134

122-
if err := validateUserInfoImmutable(oldObject.(*DisruptionCron), d); err != nil {
135+
metricTags := d.getMetricsTags()
136+
137+
defer func() {
138+
if err != nil {
139+
if mErr := disruptionCronMetricsSink.MetricValidationFailed(metricTags); mErr != nil {
140+
log.Errorw("error sending a metric", "error", mErr)
141+
}
142+
}
143+
}()
144+
145+
if err = validateUserInfoImmutable(oldObject.(*DisruptionCron), d); err != nil {
123146
return nil, err
124147
}
125148

126-
if err := d.validateDisruptionCronName(); err != nil {
149+
if err = d.validateDisruptionCronName(); err != nil {
127150
return nil, err
128151
}
129152

130-
if err := d.validateDisruptionCronSpec(); err != nil {
153+
if err = d.validateDisruptionCronSpec(); err != nil {
131154
return nil, err
132155
}
133156

api/v1beta1/disruption_cron_webhook_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"time"
1111

1212
"github.com/DataDog/chaos-controller/mocks"
13+
metricsnoop "github.com/DataDog/chaos-controller/o11y/metrics/noop"
1314
"github.com/stretchr/testify/mock"
1415
"go.uber.org/zap/zaptest"
1516
authV1 "k8s.io/api/authentication/v1"
@@ -29,6 +30,7 @@ var _ = Describe("DisruptionCron Webhook", func() {
2930

3031
BeforeEach(func() {
3132
disruptionCronWebhookLogger = zaptest.NewLogger(GinkgoT()).Sugar()
33+
disruptionCronMetricsSink = metricsnoop.New(disruptionCronWebhookLogger)
3234
defaultUserGroups = map[string]struct{}{
3335
"group1": {},
3436
"group2": {},

api/v1beta1/disruption_webhook.go

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -112,9 +112,19 @@ func (r *Disruption) Default() {
112112
var _ webhook.Validator = &Disruption{}
113113

114114
// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
115-
func (r *Disruption) ValidateCreate() (admission.Warnings, error) {
115+
func (r *Disruption) ValidateCreate() (_ admission.Warnings, err error) {
116116
log := logger.With(cLog.DisruptionNameKey, r.Name, cLog.DisruptionNamespaceKey, r.Namespace)
117117

118+
metricTags := r.getMetricsTags()
119+
120+
defer func() {
121+
if err != nil {
122+
if mErr := metricsSink.MetricValidationFailed(metricTags); mErr != nil {
123+
log.Errorw("error sending a metric", "error", mErr)
124+
}
125+
}
126+
}()
127+
118128
ctx, err := r.SpanContext(context.Background())
119129
if err != nil {
120130
log.Errorw("did not find span context", "err", err)
@@ -196,11 +206,7 @@ func (r *Disruption) ValidateCreate() (admission.Warnings, error) {
196206
}
197207
}
198208

199-
if err := r.Spec.Validate(); err != nil {
200-
if mErr := metricsSink.MetricValidationFailed(r.getMetricsTags()); mErr != nil {
201-
log.Errorw("error sending a metric", "error", mErr)
202-
}
203-
209+
if err = r.Spec.Validate(); err != nil {
204210
return nil, err
205211
}
206212

@@ -260,15 +266,23 @@ func (r *Disruption) ValidateCreate() (admission.Warnings, error) {
260266
}
261267

262268
// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type
263-
func (r *Disruption) ValidateUpdate(old runtime.Object) (admission.Warnings, error) {
269+
func (r *Disruption) ValidateUpdate(old runtime.Object) (_ admission.Warnings, err error) {
264270
log := logger.With(cLog.DisruptionNameKey, r.Name, cLog.DisruptionNamespaceKey, r.Namespace)
265271
log.Debugw("validating updated disruption", "spec", r.Spec)
266272

267-
var err error
273+
metricTags := r.getMetricsTags()
274+
275+
defer func() {
276+
if err != nil {
277+
if mErr := metricsSink.MetricValidationFailed(metricTags); mErr != nil {
278+
log.Errorw("error sending a metric", "error", mErr)
279+
}
280+
}
281+
}()
268282

269283
oldDisruption := old.(*Disruption)
270284

271-
if err := validateUserInfoImmutable(oldDisruption, r); err != nil {
285+
if err = validateUserInfoImmutable(oldDisruption, r); err != nil {
272286
return nil, err
273287
}
274288

@@ -287,10 +301,7 @@ func (r *Disruption) ValidateUpdate(old runtime.Object) (admission.Warnings, err
287301
oldPodsInfos = append(oldPodsInfos, fmt.Sprintf("%s/%s", oldPod.Namespace, oldPod.Name))
288302
}
289303

290-
metricTags := append(r.getMetricsTags(), "prevent_finalizer_removal:true")
291-
if mErr := metricsSink.MetricValidationFailed(metricTags); mErr != nil {
292-
log.Errorw("error sending a metric", "error", mErr)
293-
}
304+
metricTags = append(metricTags, "prevent_finalizer_removal:true")
294305

295306
return nil, fmt.Errorf(`unable to remove disruption finalizer, disruption '%s/%s' still has associated pods:
296307
- %s
@@ -337,10 +348,6 @@ You first need to remove those chaos pods (and potentially their finalizers) to
337348
}
338349

339350
if err := r.Spec.Validate(); err != nil {
340-
if mErr := metricsSink.MetricValidationFailed(r.getMetricsTags()); mErr != nil {
341-
log.Errorw("error sending a metric", "error", mErr)
342-
}
343-
344351
return nil, err
345352
}
346353

@@ -365,8 +372,8 @@ func (r *Disruption) ValidateDelete() (admission.Warnings, error) {
365372
// getMetricsTags parses the disruption to generate metrics tags
366373
func (r *Disruption) getMetricsTags() []string {
367374
tags := []string{
368-
"disruptionName:" + r.Name,
369-
"namespace:" + r.Namespace,
375+
fmt.Sprintf("%s:%s", cLog.DisruptionNameKey, r.Name),
376+
fmt.Sprintf("%s:%s", cLog.DisruptionNamespaceKey, r.Namespace),
370377
}
371378

372379
if userInfo, err := r.UserInfo(); !errors.Is(err, ErrNoUserInfo) {

main.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,14 +380,17 @@ func main() {
380380

381381
if cfg.Controller.DisruptionCronEnabled {
382382
disruptionCronRecorder := broadcaster.NewRecorder(mgr.GetScheme(), corev1.EventSource{Component: chaosv1beta1.SourceDisruptionCronComponent})
383+
disruptionCronMetricsSink := initMetricsSink(cfg.Controller.MetricsSink, logger, metricstypes.SinkAppCronController)
384+
385+
defer closeMetricsSink(logger, disruptionCronMetricsSink)
383386

384387
// create disruption cron reconciler
385388
disruptionCronReconciler := &controllers.DisruptionCronReconciler{
386389
Client: mgr.GetClient(),
387390
BaseLog: logger,
388391
Scheme: mgr.GetScheme(),
389392
// new metrics sink for cron controller
390-
MetricsSink: initMetricsSink(cfg.Controller.MetricsSink, logger, metricstypes.SinkAppCronController),
393+
MetricsSink: disruptionCronMetricsSink,
391394
FinalizerDeletionDelay: cfg.Controller.FinalizerDeletionDelay,
392395
TargetResourceMissingThreshold: cfg.Controller.TargetResourceMissingThreshold,
393396
}
@@ -408,6 +411,7 @@ func main() {
408411
DefaultCronDelayedStartTolerance: cfg.Controller.DefaultCronDelayedStartTolerance,
409412
MinimumCronFrequency: cfg.Controller.MinimumCronFrequency,
410413
DefaultDurationFlag: cfg.Controller.DefaultDuration,
414+
MetricsSink: disruptionCronMetricsSink,
411415
}
412416

413417
if err = (&chaosv1beta1.DisruptionCron{}).SetupWebhookWithManager(disruptionCronSetupWebhookConfig); err != nil {

0 commit comments

Comments
 (0)