Skip to content

Commit 0fad555

Browse files
chore: Add operator_status_condition_current_status_seconds and operator_status_condition_transitions_total (#62)
* Add status condition metrics on last_transition_time and current_status_seconds * Drop last_condition_time testing * Add testing for metrics on abnormal conditions
1 parent a14aede commit 0fad555

File tree

2 files changed

+240
-14
lines changed

2 files changed

+240
-14
lines changed

status/controller.go

Lines changed: 96 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package status
33
import (
44
"context"
55
"fmt"
6+
"time"
67

78
"github.com/awslabs/operatorpkg/object"
89
"github.com/prometheus/client_golang/prometheus"
@@ -12,6 +13,7 @@ import (
1213
"k8s.io/client-go/tools/record"
1314
controllerruntime "sigs.k8s.io/controller-runtime"
1415
"sigs.k8s.io/controller-runtime/pkg/client"
16+
"sigs.k8s.io/controller-runtime/pkg/controller"
1517
"sigs.k8s.io/controller-runtime/pkg/manager"
1618
"sigs.k8s.io/controller-runtime/pkg/metrics"
1719
"sigs.k8s.io/controller-runtime/pkg/reconcile"
@@ -24,6 +26,7 @@ const (
2426
MetricLabelName = "name"
2527
MetricLabelConditionType = "type"
2628
MetricLabelConditionStatus = "status"
29+
MetricLabelConditionReason = "reason"
2730
)
2831

2932
const (
@@ -45,9 +48,10 @@ func NewController[T Object](client client.Client, eventRecorder record.EventRec
4548
}
4649
}
4750

48-
func (c *Controller[T]) Register(ctx context.Context, m manager.Manager) error {
51+
func (c *Controller[T]) Register(_ context.Context, m manager.Manager) error {
4952
return controllerruntime.NewControllerManagedBy(m).
5053
For(object.New[T]()).
54+
WithOptions(controller.Options{MaxConcurrentReconciles: 10}).
5155
Named("status").
5256
Complete(c)
5357
}
@@ -61,8 +65,14 @@ func (c *Controller[T]) Reconcile(ctx context.Context, req reconcile.Request) (r
6165
ConditionCount.DeletePartialMatch(prometheus.Labels{
6266
MetricLabelGroup: gvk.Group,
6367
MetricLabelKind: gvk.Kind,
64-
MetricLabelNamespace: string(req.Namespace),
65-
MetricLabelName: string(req.Name),
68+
MetricLabelNamespace: req.Namespace,
69+
MetricLabelName: req.Name,
70+
})
71+
ConditionCurrentStatusSeconds.DeletePartialMatch(prometheus.Labels{
72+
MetricLabelGroup: gvk.Group,
73+
MetricLabelKind: gvk.Kind,
74+
MetricLabelNamespace: req.Namespace,
75+
MetricLabelName: req.Name,
6676
})
6777
return reconcile.Result{}, nil
6878
}
@@ -78,21 +88,41 @@ func (c *Controller[T]) Reconcile(ctx context.Context, req reconcile.Request) (r
7888
ConditionCount.With(prometheus.Labels{
7989
MetricLabelGroup: gvk.Group,
8090
MetricLabelKind: gvk.Kind,
81-
MetricLabelNamespace: string(req.Namespace),
82-
MetricLabelName: string(req.Name),
83-
MetricLabelConditionType: string(condition.Type),
91+
MetricLabelNamespace: req.Namespace,
92+
MetricLabelName: req.Name,
93+
MetricLabelConditionType: condition.Type,
8494
MetricLabelConditionStatus: string(condition.Status),
95+
MetricLabelConditionReason: condition.Reason,
8596
}).Set(1)
97+
ConditionCurrentStatusSeconds.With(prometheus.Labels{
98+
MetricLabelGroup: gvk.Group,
99+
MetricLabelKind: gvk.Kind,
100+
MetricLabelNamespace: req.Namespace,
101+
MetricLabelName: req.Name,
102+
MetricLabelConditionType: condition.Type,
103+
MetricLabelConditionStatus: string(condition.Status),
104+
MetricLabelConditionReason: condition.Reason,
105+
}).Set(time.Since(condition.LastTransitionTime.Time).Seconds())
86106
}
87107
for _, observedCondition := range observedConditions.List() {
88108
if currentCondition := currentConditions.Get(observedCondition.Type); currentCondition == nil || currentCondition.Status != observedCondition.Status {
89109
ConditionCount.Delete(prometheus.Labels{
90110
MetricLabelGroup: gvk.Group,
91111
MetricLabelKind: gvk.Kind,
92-
MetricLabelNamespace: string(req.Namespace),
93-
MetricLabelName: string(req.Name),
94-
MetricLabelConditionType: string(observedCondition.Type),
112+
MetricLabelNamespace: req.Namespace,
113+
MetricLabelName: req.Name,
114+
MetricLabelConditionType: observedCondition.Type,
95115
MetricLabelConditionStatus: string(observedCondition.Status),
116+
MetricLabelConditionReason: observedCondition.Reason,
117+
})
118+
ConditionCurrentStatusSeconds.Delete(prometheus.Labels{
119+
MetricLabelGroup: gvk.Group,
120+
MetricLabelKind: gvk.Kind,
121+
MetricLabelNamespace: req.Namespace,
122+
MetricLabelName: req.Name,
123+
MetricLabelConditionType: observedCondition.Type,
124+
MetricLabelConditionStatus: string(observedCondition.Status),
125+
MetricLabelConditionReason: observedCondition.Reason,
96126
})
97127
}
98128
}
@@ -114,25 +144,36 @@ func (c *Controller[T]) Reconcile(ctx context.Context, req reconcile.Request) (r
114144
// time, and our likelihood of observing this is much higher.
115145
for _, condition := range currentConditions.List() {
116146
observedCondition := observedConditions.Get(condition.Type)
117-
if observedCondition == nil || observedCondition.GetStatus() == condition.GetStatus() {
147+
if observedCondition.GetStatus() == condition.GetStatus() {
148+
continue
149+
}
150+
// A condition transitions if it either didn't exist before or it has changed
151+
ConditionTransitionsTotal.With(prometheus.Labels{
152+
MetricLabelGroup: gvk.Group,
153+
MetricLabelKind: gvk.Kind,
154+
MetricLabelConditionType: condition.Type,
155+
MetricLabelConditionStatus: string(condition.Status),
156+
MetricLabelConditionReason: condition.Reason,
157+
}).Inc()
158+
if observedCondition == nil {
118159
continue
119160
}
120161
duration := condition.LastTransitionTime.Time.Sub(observedCondition.LastTransitionTime.Time).Seconds()
121162
ConditionDuration.With(prometheus.Labels{
122163
MetricLabelGroup: gvk.Group,
123164
MetricLabelKind: gvk.Kind,
124-
MetricLabelConditionType: string(observedCondition.Type),
165+
MetricLabelConditionType: observedCondition.Type,
125166
MetricLabelConditionStatus: string(observedCondition.Status),
126-
}).Observe(float64(duration))
127-
c.eventRecorder.Event(o, v1.EventTypeNormal, string(condition.Type), fmt.Sprintf("Status condition transitioned, Type: %s, Status: %s -> %s, Reason: %s%s",
167+
}).Observe(duration)
168+
c.eventRecorder.Event(o, v1.EventTypeNormal, condition.Type, fmt.Sprintf("Status condition transitioned, Type: %s, Status: %s -> %s, Reason: %s%s",
128169
condition.Type,
129170
observedCondition.Status,
130171
condition.Status,
131172
condition.Reason,
132173
lo.Ternary(condition.Message != "", fmt.Sprintf(", Message: %s", condition.Message), ""),
133174
))
134175
}
135-
return reconcile.Result{}, nil
176+
return reconcile.Result{RequeueAfter: time.Second * 10}, nil
136177
}
137178

138179
// Cardinality is limited to # objects * # conditions * # objectives
@@ -166,12 +207,53 @@ var ConditionCount = prometheus.NewGaugeVec(
166207
MetricLabelKind,
167208
MetricLabelConditionType,
168209
MetricLabelConditionStatus,
210+
MetricLabelConditionReason,
211+
},
212+
)
213+
214+
// Cardinality is limited to # objects * # conditions
215+
// NOTE: This metric is based on a requeue so it won't show the current status seconds with extremely high accuracy.
216+
// This metric is useful for aggregations. If you need a high-accuracy metric, use operator_status_condition_last_transition_time_seconds
217+
var ConditionCurrentStatusSeconds = prometheus.NewGaugeVec(
218+
prometheus.GaugeOpts{
219+
Namespace: MetricNamespace,
220+
Subsystem: MetricSubsystem,
221+
Name: "current_status_seconds",
222+
Help: "The current amount of time in seconds that a status condition has been in a specific state. Alarm := P99(Updated=Unknown) > 5 minutes",
223+
},
224+
[]string{
225+
MetricLabelNamespace,
226+
MetricLabelName,
227+
MetricLabelGroup,
228+
MetricLabelKind,
229+
MetricLabelConditionType,
230+
MetricLabelConditionStatus,
231+
MetricLabelConditionReason,
232+
},
233+
)
234+
235+
// Cardinality is limited to # kinds * # condition types * # statuses * # reasons (this metric carries no per-object namespace/name labels)
236+
var ConditionTransitionsTotal = prometheus.NewCounterVec(
237+
prometheus.CounterOpts{
238+
Namespace: MetricNamespace,
239+
Subsystem: MetricSubsystem,
240+
Name: "transitions_total",
241+
Help: "The count of transitions of a given object, type and status.",
242+
},
243+
[]string{
244+
MetricLabelGroup,
245+
MetricLabelKind,
246+
MetricLabelConditionType,
247+
MetricLabelConditionStatus,
248+
MetricLabelConditionReason,
169249
},
170250
)
171251

172252
func init() {
173253
metrics.Registry.MustRegister(
174254
ConditionCount,
175255
ConditionDuration,
256+
ConditionTransitionsTotal,
257+
ConditionCurrentStatusSeconds,
176258
)
177259
}

0 commit comments

Comments
 (0)