@@ -31,69 +31,81 @@ var (
31
31
// Cluster-wide SLM statistics. Every descriptor below carries exactly one
// variable label, "cluster", and no constant labels.

// Retention runs.
slmRetentionRunsTotal = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "retention_runs_total"),
	"Total retention runs",
	[]string{"cluster"},
	nil,
)
slmRetentionFailedTotal = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "retention_failed_total"),
	"Total failed retention runs",
	[]string{"cluster"},
	nil,
)
slmRetentionTimedOutTotal = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "retention_timed_out_total"),
	"Total timed out retention runs",
	[]string{"cluster"},
	nil,
)
// Update fills this one from RetentionDeletionTimeMillis divided by 1000.
slmRetentionDeletionTimeSeconds = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "retention_deletion_time_seconds"),
	"Retention run deletion time",
	[]string{"cluster"},
	nil,
)

// Snapshot totals across all policies.
slmTotalSnapshotsTaken = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_taken_total"),
	"Total snapshots taken",
	[]string{"cluster"},
	nil,
)
slmTotalSnapshotsFailed = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_failed_total"),
	"Total snapshots failed",
	[]string{"cluster"},
	nil,
)
slmTotalSnapshotsDeleted = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_deleted_total"),
	"Total snapshots deleted",
	[]string{"cluster"},
	nil,
)
slmTotalSnapshotsDeleteFailed = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "total_snapshot_deletion_failures_total"),
	"Total snapshot deletion failures",
	[]string{"cluster"},
	nil,
)
71
71
72
72
// slmOperationMode describes the SLM operating-status gauge. It has two
// variable labels, "cluster" then "operation_mode"; Update supplies the
// label values in that same order (cluster name first, then the status).
//
// Label names must not contain whitespace: a name such as " operation_mode"
// is not a valid Prometheus label name and makes the descriptor unusable.
slmOperationMode = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "operation_mode"),
	"Operating status of SLM",
	[]string{"cluster", "operation_mode"}, nil,
)
77
77
78
78
// Per-policy SLM statistics. Every descriptor below carries two variable
// labels, "policy" then "cluster"; Update supplies the policy name first and
// the cluster name second.
slmSnapshotsTaken = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "snapshots_taken_total"),
	"Total snapshots taken",
	[]string{"policy", "cluster"}, nil,
)
slmSnapshotsFailed = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "snapshots_failed_total"),
	"Total snapshots failed",
	[]string{"policy", "cluster"}, nil,
)
slmSnapshotsDeleted = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "snapshots_deleted_total"),
	"Total snapshots deleted",
	[]string{"policy", "cluster"}, nil,
)
slmSnapshotsDeletionFailure = prometheus.NewDesc(
	prometheus.BuildFQName(namespace, "slm_stats", "snapshot_deletion_failures_total"),
	"Total snapshot deletion failures",
	[]string{"policy", "cluster"}, nil,
)
98
110
)
99
111
@@ -103,18 +115,67 @@ func init() {
103
115
104
116
// SLM information struct
105
117
type SLM struct {
106
- logger * slog.Logger
107
- hc * http.Client
108
- u * url.URL
118
+ logger * slog.Logger
119
+ hc * http.Client
120
+ u * url.URL
121
+ clusterInfoCh chan * clusterinfo.Response
122
+ lastClusterInfo * clusterinfo.Response
109
123
}
110
124
111
125
// NewSLM defines SLM Prometheus metrics
112
126
func NewSLM (logger * slog.Logger , u * url.URL , hc * http.Client , ci * clusterinfo.Retriever ) (Collector , error ) {
113
- return & SLM {
114
- logger : logger ,
115
- hc : hc ,
116
- u : u ,
117
- }, nil
127
+ slm := & SLM {
128
+ logger : logger ,
129
+ hc : hc ,
130
+ u : u ,
131
+ clusterInfoCh : make (chan * clusterinfo.Response ),
132
+ lastClusterInfo : & clusterinfo.Response {
133
+ ClusterName : "unknown_cluster" ,
134
+ },
135
+ }
136
+
137
+ err := ci .RegisterConsumer (slm )
138
+ if err != nil {
139
+ return slm , err
140
+ }
141
+
142
+ // start go routine to fetch clusterinfo updates and save them to lastClusterinfo
143
+ go func () {
144
+ logger .Debug ("starting cluster info receive loop" )
145
+ for ci := range slm .clusterInfoCh {
146
+ if ci != nil {
147
+ logger .Debug ("received cluster info update" , "cluster" , ci .ClusterName )
148
+ slm .lastClusterInfo = ci
149
+ }
150
+ }
151
+ logger .Debug ("exiting cluster info receive loop" )
152
+ }()
153
+
154
+ return slm , nil
155
+ }
156
+
157
+ func (s * SLM ) Describe (ch chan <- * prometheus.Desc ) {
158
+ ch <- slmRetentionRunsTotal
159
+ ch <- slmRetentionFailedTotal
160
+ ch <- slmRetentionTimedOutTotal
161
+ ch <- slmRetentionDeletionTimeSeconds
162
+ ch <- slmTotalSnapshotsTaken
163
+ ch <- slmTotalSnapshotsFailed
164
+ ch <- slmTotalSnapshotsDeleted
165
+ ch <- slmTotalSnapshotsDeleteFailed
166
+ ch <- slmOperationMode
167
+ ch <- slmSnapshotsTaken
168
+ ch <- slmSnapshotsFailed
169
+ ch <- slmSnapshotsDeleted
170
+ ch <- slmSnapshotsDeletionFailure
171
+ }
172
+
173
+ func (s * SLM ) ClusterLabelUpdates () * chan * clusterinfo.Response {
174
+ return & s .clusterInfoCh
175
+ }
176
+
177
+ func (s * SLM ) String () string {
178
+ return namespace + "slm"
118
179
}
119
180
120
181
// SLMStatsResponse is a representation of the SLM stats
@@ -181,6 +242,7 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
181
242
slmOperationMode ,
182
243
prometheus .GaugeValue ,
183
244
value ,
245
+ s .lastClusterInfo .ClusterName ,
184
246
status ,
185
247
)
186
248
}
@@ -189,43 +251,51 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
189
251
slmRetentionRunsTotal ,
190
252
prometheus .CounterValue ,
191
253
float64 (slmStatsResp .RetentionRuns ),
254
+ s .lastClusterInfo .ClusterName ,
192
255
)
193
256
194
257
ch <- prometheus .MustNewConstMetric (
195
258
slmRetentionFailedTotal ,
196
259
prometheus .CounterValue ,
197
260
float64 (slmStatsResp .RetentionFailed ),
261
+ s .lastClusterInfo .ClusterName ,
198
262
)
199
263
200
264
ch <- prometheus .MustNewConstMetric (
201
265
slmRetentionTimedOutTotal ,
202
266
prometheus .CounterValue ,
203
267
float64 (slmStatsResp .RetentionTimedOut ),
268
+ s .lastClusterInfo .ClusterName ,
204
269
)
205
270
ch <- prometheus .MustNewConstMetric (
206
271
slmRetentionDeletionTimeSeconds ,
207
272
prometheus .GaugeValue ,
208
273
float64 (slmStatsResp .RetentionDeletionTimeMillis )/ 1000 ,
274
+ s .lastClusterInfo .ClusterName ,
209
275
)
210
276
ch <- prometheus .MustNewConstMetric (
211
277
slmTotalSnapshotsTaken ,
212
278
prometheus .CounterValue ,
213
279
float64 (slmStatsResp .TotalSnapshotsTaken ),
280
+ s .lastClusterInfo .ClusterName ,
214
281
)
215
282
ch <- prometheus .MustNewConstMetric (
216
283
slmTotalSnapshotsFailed ,
217
284
prometheus .CounterValue ,
218
285
float64 (slmStatsResp .TotalSnapshotsFailed ),
286
+ s .lastClusterInfo .ClusterName ,
219
287
)
220
288
ch <- prometheus .MustNewConstMetric (
221
289
slmTotalSnapshotsDeleted ,
222
290
prometheus .CounterValue ,
223
291
float64 (slmStatsResp .TotalSnapshotsDeleted ),
292
+ s .lastClusterInfo .ClusterName ,
224
293
)
225
294
ch <- prometheus .MustNewConstMetric (
226
295
slmTotalSnapshotsDeleteFailed ,
227
296
prometheus .CounterValue ,
228
297
float64 (slmStatsResp .TotalSnapshotDeletionFailures ),
298
+ s .lastClusterInfo .ClusterName ,
229
299
)
230
300
231
301
for _ , policy := range slmStatsResp .PolicyStats {
@@ -234,24 +304,28 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
234
304
prometheus .CounterValue ,
235
305
float64 (policy .SnapshotsTaken ),
236
306
policy .Policy ,
307
+ s .lastClusterInfo .ClusterName ,
237
308
)
238
309
ch <- prometheus .MustNewConstMetric (
239
310
slmSnapshotsFailed ,
240
311
prometheus .CounterValue ,
241
312
float64 (policy .SnapshotsFailed ),
242
313
policy .Policy ,
314
+ s .lastClusterInfo .ClusterName ,
243
315
)
244
316
ch <- prometheus .MustNewConstMetric (
245
317
slmSnapshotsDeleted ,
246
318
prometheus .CounterValue ,
247
319
float64 (policy .SnapshotsDeleted ),
248
320
policy .Policy ,
321
+ s .lastClusterInfo .ClusterName ,
249
322
)
250
323
ch <- prometheus .MustNewConstMetric (
251
324
slmSnapshotsDeletionFailure ,
252
325
prometheus .CounterValue ,
253
326
float64 (policy .SnapshotDeletionFailures ),
254
327
policy .Policy ,
328
+ s .lastClusterInfo .ClusterName ,
255
329
)
256
330
}
257
331
0 commit comments