Skip to content

Commit a1e44c3

Browse files
committed
feat(slm-collector): cluster label added to slm metrics
1 parent be01f3f commit a1e44c3

File tree

2 files changed

+111
-36
lines changed

2 files changed

+111
-36
lines changed

collector/slm.go

Lines changed: 95 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,69 +31,81 @@ var (
3131
slmRetentionRunsTotal = prometheus.NewDesc(
3232
prometheus.BuildFQName(namespace, "slm_stats", "retention_runs_total"),
3333
"Total retention runs",
34-
nil, nil,
34+
[]string{"cluster"}, nil,
3535
)
3636
slmRetentionFailedTotal = prometheus.NewDesc(
3737
prometheus.BuildFQName(namespace, "slm_stats", "retention_failed_total"),
3838
"Total failed retention runs",
39-
nil, nil,
39+
[]string{"cluster"}, nil,
4040
)
4141
slmRetentionTimedOutTotal = prometheus.NewDesc(
4242
prometheus.BuildFQName(namespace, "slm_stats", "retention_timed_out_total"),
4343
"Total timed out retention runs",
44-
nil, nil,
44+
[]string{"cluster"}, nil,
4545
)
4646
slmRetentionDeletionTimeSeconds = prometheus.NewDesc(
4747
prometheus.BuildFQName(namespace, "slm_stats", "retention_deletion_time_seconds"),
4848
"Retention run deletion time",
49-
nil, nil,
49+
[]string{"cluster"}, nil,
5050
)
5151
slmTotalSnapshotsTaken = prometheus.NewDesc(
5252
prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_taken_total"),
5353
"Total snapshots taken",
54-
nil, nil,
54+
[]string{"cluster"}, nil,
5555
)
5656
slmTotalSnapshotsFailed = prometheus.NewDesc(
5757
prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_failed_total"),
5858
"Total snapshots failed",
59-
nil, nil,
59+
[]string{"cluster"}, nil,
6060
)
6161
slmTotalSnapshotsDeleted = prometheus.NewDesc(
6262
prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_deleted_total"),
6363
"Total snapshots deleted",
64-
nil, nil,
64+
[]string{"cluster"}, nil,
6565
)
6666
slmTotalSnapshotsDeleteFailed = prometheus.NewDesc(
6767
prometheus.BuildFQName(namespace, "slm_stats", "total_snapshot_deletion_failures_total"),
6868
"Total snapshot deletion failures",
69-
nil, nil,
69+
[]string{"cluster"}, nil,
7070
)
7171

7272
slmOperationMode = prometheus.NewDesc(
7373
prometheus.BuildFQName(namespace, "slm_stats", "operation_mode"),
7474
"Operating status of SLM",
75-
[]string{"operation_mode"}, nil,
75+
[]string{"cluster", "operation_mode"}, nil,
7676
)
7777

7878
slmSnapshotsTaken = prometheus.NewDesc(
7979
prometheus.BuildFQName(namespace, "slm_stats", "snapshots_taken_total"),
8080
"Total snapshots taken",
81-
[]string{"policy"}, nil,
81+
[]string{
82+
"policy",
83+
"cluster",
84+
}, nil,
8285
)
8386
slmSnapshotsFailed = prometheus.NewDesc(
8487
prometheus.BuildFQName(namespace, "slm_stats", "snapshots_failed_total"),
8588
"Total snapshots failed",
86-
[]string{"policy"}, nil,
89+
[]string{
90+
"policy",
91+
"cluster",
92+
}, nil,
8793
)
8894
slmSnapshotsDeleted = prometheus.NewDesc(
8995
prometheus.BuildFQName(namespace, "slm_stats", "snapshots_deleted_total"),
9096
"Total snapshots deleted",
91-
[]string{"policy"}, nil,
97+
[]string{
98+
"policy",
99+
"cluster",
100+
}, nil,
92101
)
93102
slmSnapshotsDeletionFailure = prometheus.NewDesc(
94103
prometheus.BuildFQName(namespace, "slm_stats", "snapshot_deletion_failures_total"),
95104
"Total snapshot deletion failures",
96-
[]string{"policy"}, nil,
105+
[]string{
106+
"policy",
107+
"cluster",
108+
}, nil,
97109
)
98110
)
99111

@@ -103,18 +115,67 @@ func init() {
103115

104116
// SLM information struct
105117
type SLM struct {
106-
logger *slog.Logger
107-
hc *http.Client
108-
u *url.URL
118+
logger *slog.Logger
119+
hc *http.Client
120+
u *url.URL
121+
clusterInfoCh chan *clusterinfo.Response
122+
lastClusterInfo *clusterinfo.Response
109123
}
110124

111125
// NewSLM defines SLM Prometheus metrics
112126
func NewSLM(logger *slog.Logger, u *url.URL, hc *http.Client, ci *clusterinfo.Retriever) (Collector, error) {
113-
return &SLM{
114-
logger: logger,
115-
hc: hc,
116-
u: u,
117-
}, nil
127+
slm := &SLM{
128+
logger: logger,
129+
hc: hc,
130+
u: u,
131+
clusterInfoCh: make(chan *clusterinfo.Response),
132+
lastClusterInfo: &clusterinfo.Response{
133+
ClusterName: "unknown_cluster",
134+
},
135+
}
136+
137+
err := ci.RegisterConsumer(slm)
138+
if err != nil {
139+
return slm, err
140+
}
141+
142+
// Start a goroutine that receives cluster info updates and stores the latest non-nil one in lastClusterInfo.
143+
go func() {
144+
logger.Debug("starting cluster info receive loop")
145+
for ci := range slm.clusterInfoCh {
146+
if ci != nil {
147+
logger.Debug("received cluster info update", "cluster", ci.ClusterName)
148+
slm.lastClusterInfo = ci
149+
}
150+
}
151+
logger.Debug("exiting cluster info receive loop")
152+
}()
153+
154+
return slm, nil
155+
}
156+
157+
func (s *SLM) Describe(ch chan<- *prometheus.Desc) {
158+
ch <- slmRetentionRunsTotal
159+
ch <- slmRetentionFailedTotal
160+
ch <- slmRetentionTimedOutTotal
161+
ch <- slmRetentionDeletionTimeSeconds
162+
ch <- slmTotalSnapshotsTaken
163+
ch <- slmTotalSnapshotsFailed
164+
ch <- slmTotalSnapshotsDeleted
165+
ch <- slmTotalSnapshotsDeleteFailed
166+
ch <- slmOperationMode
167+
ch <- slmSnapshotsTaken
168+
ch <- slmSnapshotsFailed
169+
ch <- slmSnapshotsDeleted
170+
ch <- slmSnapshotsDeletionFailure
171+
}
172+
173+
func (s *SLM) ClusterLabelUpdates() *chan *clusterinfo.Response {
174+
return &s.clusterInfoCh
175+
}
176+
177+
func (s *SLM) String() string {
178+
return namespace + "slm"
118179
}
119180

120181
// SLMStatsResponse is a representation of the SLM stats
@@ -181,6 +242,7 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
181242
slmOperationMode,
182243
prometheus.GaugeValue,
183244
value,
245+
s.lastClusterInfo.ClusterName,
184246
status,
185247
)
186248
}
@@ -189,43 +251,51 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
189251
slmRetentionRunsTotal,
190252
prometheus.CounterValue,
191253
float64(slmStatsResp.RetentionRuns),
254+
s.lastClusterInfo.ClusterName,
192255
)
193256

194257
ch <- prometheus.MustNewConstMetric(
195258
slmRetentionFailedTotal,
196259
prometheus.CounterValue,
197260
float64(slmStatsResp.RetentionFailed),
261+
s.lastClusterInfo.ClusterName,
198262
)
199263

200264
ch <- prometheus.MustNewConstMetric(
201265
slmRetentionTimedOutTotal,
202266
prometheus.CounterValue,
203267
float64(slmStatsResp.RetentionTimedOut),
268+
s.lastClusterInfo.ClusterName,
204269
)
205270
ch <- prometheus.MustNewConstMetric(
206271
slmRetentionDeletionTimeSeconds,
207272
prometheus.GaugeValue,
208273
float64(slmStatsResp.RetentionDeletionTimeMillis)/1000,
274+
s.lastClusterInfo.ClusterName,
209275
)
210276
ch <- prometheus.MustNewConstMetric(
211277
slmTotalSnapshotsTaken,
212278
prometheus.CounterValue,
213279
float64(slmStatsResp.TotalSnapshotsTaken),
280+
s.lastClusterInfo.ClusterName,
214281
)
215282
ch <- prometheus.MustNewConstMetric(
216283
slmTotalSnapshotsFailed,
217284
prometheus.CounterValue,
218285
float64(slmStatsResp.TotalSnapshotsFailed),
286+
s.lastClusterInfo.ClusterName,
219287
)
220288
ch <- prometheus.MustNewConstMetric(
221289
slmTotalSnapshotsDeleted,
222290
prometheus.CounterValue,
223291
float64(slmStatsResp.TotalSnapshotsDeleted),
292+
s.lastClusterInfo.ClusterName,
224293
)
225294
ch <- prometheus.MustNewConstMetric(
226295
slmTotalSnapshotsDeleteFailed,
227296
prometheus.CounterValue,
228297
float64(slmStatsResp.TotalSnapshotDeletionFailures),
298+
s.lastClusterInfo.ClusterName,
229299
)
230300

231301
for _, policy := range slmStatsResp.PolicyStats {
@@ -234,24 +304,28 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
234304
prometheus.CounterValue,
235305
float64(policy.SnapshotsTaken),
236306
policy.Policy,
307+
s.lastClusterInfo.ClusterName,
237308
)
238309
ch <- prometheus.MustNewConstMetric(
239310
slmSnapshotsFailed,
240311
prometheus.CounterValue,
241312
float64(policy.SnapshotsFailed),
242313
policy.Policy,
314+
s.lastClusterInfo.ClusterName,
243315
)
244316
ch <- prometheus.MustNewConstMetric(
245317
slmSnapshotsDeleted,
246318
prometheus.CounterValue,
247319
float64(policy.SnapshotsDeleted),
248320
policy.Policy,
321+
s.lastClusterInfo.ClusterName,
249322
)
250323
ch <- prometheus.MustNewConstMetric(
251324
slmSnapshotsDeletionFailure,
252325
prometheus.CounterValue,
253326
float64(policy.SnapshotDeletionFailures),
254327
policy.Policy,
328+
s.lastClusterInfo.ClusterName,
255329
)
256330
}
257331

collector/slm_test.go

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -48,45 +48,45 @@ func TestSLM(t *testing.T) {
4848
file: "7.15.0.json",
4949
want: `# HELP elasticsearch_slm_stats_operation_mode Operating status of SLM
5050
# TYPE elasticsearch_slm_stats_operation_mode gauge
51-
elasticsearch_slm_stats_operation_mode{operation_mode="RUNNING"} 0
52-
elasticsearch_slm_stats_operation_mode{operation_mode="STOPPED"} 0
53-
elasticsearch_slm_stats_operation_mode{operation_mode="STOPPING"} 0
51+
elasticsearch_slm_stats_operation_mode{cluster="unknown_cluster",operation_mode="RUNNING"} 0
52+
elasticsearch_slm_stats_operation_mode{cluster="unknown_cluster",operation_mode="STOPPED"} 0
53+
elasticsearch_slm_stats_operation_mode{cluster="unknown_cluster",operation_mode="STOPPING"} 0
5454
# HELP elasticsearch_slm_stats_retention_deletion_time_seconds Retention run deletion time
5555
# TYPE elasticsearch_slm_stats_retention_deletion_time_seconds gauge
56-
elasticsearch_slm_stats_retention_deletion_time_seconds 72.491
56+
elasticsearch_slm_stats_retention_deletion_time_seconds{cluster="unknown_cluster"} 72.491
5757
# HELP elasticsearch_slm_stats_retention_failed_total Total failed retention runs
5858
# TYPE elasticsearch_slm_stats_retention_failed_total counter
59-
elasticsearch_slm_stats_retention_failed_total 0
59+
elasticsearch_slm_stats_retention_failed_total{cluster="unknown_cluster"} 0
6060
# HELP elasticsearch_slm_stats_retention_runs_total Total retention runs
6161
# TYPE elasticsearch_slm_stats_retention_runs_total counter
62-
elasticsearch_slm_stats_retention_runs_total 9
62+
elasticsearch_slm_stats_retention_runs_total{cluster="unknown_cluster"} 9
6363
# HELP elasticsearch_slm_stats_retention_timed_out_total Total timed out retention runs
6464
# TYPE elasticsearch_slm_stats_retention_timed_out_total counter
65-
elasticsearch_slm_stats_retention_timed_out_total 0
65+
elasticsearch_slm_stats_retention_timed_out_total{cluster="unknown_cluster"} 0
6666
# HELP elasticsearch_slm_stats_snapshot_deletion_failures_total Total snapshot deletion failures
6767
# TYPE elasticsearch_slm_stats_snapshot_deletion_failures_total counter
68-
elasticsearch_slm_stats_snapshot_deletion_failures_total{policy="everything"} 0
68+
elasticsearch_slm_stats_snapshot_deletion_failures_total{cluster="unknown_cluster",policy="everything"} 0
6969
# HELP elasticsearch_slm_stats_snapshots_deleted_total Total snapshots deleted
7070
# TYPE elasticsearch_slm_stats_snapshots_deleted_total counter
71-
elasticsearch_slm_stats_snapshots_deleted_total{policy="everything"} 20
71+
elasticsearch_slm_stats_snapshots_deleted_total{cluster="unknown_cluster",policy="everything"} 20
7272
# HELP elasticsearch_slm_stats_snapshots_failed_total Total snapshots failed
7373
# TYPE elasticsearch_slm_stats_snapshots_failed_total counter
74-
elasticsearch_slm_stats_snapshots_failed_total{policy="everything"} 2
74+
elasticsearch_slm_stats_snapshots_failed_total{cluster="unknown_cluster",policy="everything"} 2
7575
# HELP elasticsearch_slm_stats_snapshots_taken_total Total snapshots taken
7676
# TYPE elasticsearch_slm_stats_snapshots_taken_total counter
77-
elasticsearch_slm_stats_snapshots_taken_total{policy="everything"} 50
77+
elasticsearch_slm_stats_snapshots_taken_total{cluster="unknown_cluster",policy="everything"} 50
7878
# HELP elasticsearch_slm_stats_total_snapshot_deletion_failures_total Total snapshot deletion failures
7979
# TYPE elasticsearch_slm_stats_total_snapshot_deletion_failures_total counter
80-
elasticsearch_slm_stats_total_snapshot_deletion_failures_total 0
80+
elasticsearch_slm_stats_total_snapshot_deletion_failures_total{cluster="unknown_cluster"} 0
8181
# HELP elasticsearch_slm_stats_total_snapshots_deleted_total Total snapshots deleted
8282
# TYPE elasticsearch_slm_stats_total_snapshots_deleted_total counter
83-
elasticsearch_slm_stats_total_snapshots_deleted_total 20
83+
elasticsearch_slm_stats_total_snapshots_deleted_total{cluster="unknown_cluster"} 20
8484
# HELP elasticsearch_slm_stats_total_snapshots_failed_total Total snapshots failed
8585
# TYPE elasticsearch_slm_stats_total_snapshots_failed_total counter
86-
elasticsearch_slm_stats_total_snapshots_failed_total 2
86+
elasticsearch_slm_stats_total_snapshots_failed_total{cluster="unknown_cluster"} 2
8787
# HELP elasticsearch_slm_stats_total_snapshots_taken_total Total snapshots taken
8888
# TYPE elasticsearch_slm_stats_total_snapshots_taken_total counter
89-
elasticsearch_slm_stats_total_snapshots_taken_total 103
89+
elasticsearch_slm_stats_total_snapshots_taken_total{cluster="unknown_cluster"} 103
9090
`,
9191
},
9292
}
@@ -128,6 +128,7 @@ func TestSLM(t *testing.T) {
128128

129129
logger := promslog.NewNopLogger()
130130
ci := clusterinfo.New(logger, http.DefaultClient, u, time.Duration(300000000000))
131+
131132
s, err := NewSLM(logger, u, http.DefaultClient, ci)
132133
if err != nil {
133134
t.Fatal(err)

0 commit comments

Comments
 (0)