Skip to content

Commit 2c56556

Browse files
committed
Reduce cardinality of histogram metrics
The combined cardinality of the labels and buckets was too taxing for prometheus.
1 parent d2c76d1 commit 2c56556

File tree

2 files changed

+25
-11
lines changed

2 files changed

+25
-11
lines changed

protocol/protocol.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,22 @@ const (
101101
OpError Op = 0xFF
102102
)
103103

104+
// Type returns a coarse category name for the given Op: "rsa" for the RSA
// decrypt and sign opcodes, "ecdsa" for the ECDSA sign opcodes, "rpc" for
// OpRPC, and "other" for OpSeal, OpUnseal, OpPing, OpPong, OpResponse and
// OpError. Any opcode not listed in the switch yields "unknown".
105+
func (op Op) Type() string {
106+
switch op {
107+
case OpRSADecrypt, OpRSASignMD5SHA1, OpRSASignSHA1, OpRSASignSHA224, OpRSASignSHA256, OpRSASignSHA384, OpRSASignSHA512, OpRSAPSSSignSHA256, OpRSAPSSSignSHA384, OpRSAPSSSignSHA512:
108+
return "rsa"
109+
case OpECDSASignMD5SHA1, OpECDSASignSHA1, OpECDSASignSHA224, OpECDSASignSHA256, OpECDSASignSHA384, OpECDSASignSHA512:
110+
return "ecdsa"
111+
case OpRPC:
112+
return "rpc"
113+
case OpSeal, OpUnseal, OpPing, OpPong, OpResponse, OpError:
114+
return "other"
115+
default:
// Opcode is not named in any case above.
116+
return "unknown"
117+
}
118+
}
119+
104120
// Error defines a 1-byte error payload.
105121
type Error byte
106122

server/metrics.go

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,25 +21,23 @@ type statistics struct {
2121
}
2222

2323
var (
24-
// 1 microsecond as a fraction of 1 second
25-
us = 1e-6
26-
// buckets starting at 1 microsecond and doubling until reaching a maximum of
27-
// ~8 seconds
28-
durationBuckets = prometheus.ExponentialBuckets(us, 2.0, 24)
24+
// buckets starting at 100 microseconds and doubling until reaching a
25+
// maximum of ~1.6 seconds (15 buckets: 1e-4 * 2^14 = 1.6384)
26+
durationBuckets = prometheus.ExponentialBuckets(1e-4, 2.0, 15)
2927
)
3028

3129
func newStatistics() *statistics {
3230
return &statistics{
3331
requestExecDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
3432
Name: "keyless_request_exec_duration_per_opcode",
35-
Help: "Time to execute a request not including time in queues, broken down by opcode and error code.",
33+
Help: "Time to execute a request not including time in queues, broken down by type and error code.",
3634
Buckets: durationBuckets,
37-
}, []string{"opcode", "error"}),
35+
}, []string{"type", "error"}),
3836
requestTotalDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
3937
Name: "keyless_request_total_duration_per_opcode",
40-
Help: "Total time to satisfy a request including time in queues, broken down by opcode and error code.",
38+
Help: "Total time to satisfy a request including time in queues, broken down by type and error code.",
4139
Buckets: durationBuckets,
42-
}, []string{"opcode", "error"}),
40+
}, []string{"type", "error"}),
4341
requests: prometheus.NewCounterVec(prometheus.CounterOpts{
4442
Name: "keyless_requests",
4543
Help: "Total number of requests by opcode.",
@@ -87,11 +85,11 @@ func (stats *statistics) logKeyLoadDuration(loadBegin time.Time) {
8785
// logRequestExecDuration logs the time taken to execute an operation (not
8886
// including queueing).
// The time elapsed since requestBegin is observed, in seconds, on the
// requestExecDuration histogram. The observation is labelled with
// opcode.Type() (opcode.String() in the removed line) and err.String().
8987
func (stats *statistics) logRequestExecDuration(opcode protocol.Op, requestBegin time.Time, err protocol.Error) {
90-
stats.requestExecDuration.WithLabelValues(opcode.String(), err.String()).Observe(time.Since(requestBegin).Seconds())
88+
stats.requestExecDuration.WithLabelValues(opcode.Type(), err.String()).Observe(time.Since(requestBegin).Seconds())
9189
}
9290

9391
// logRequestTotalDuration logs the total time taken to satisfy a request
// (including time in queues). The time elapsed since requestBegin is
// observed, in seconds, on the requestTotalDuration histogram. The
// observation is labelled with opcode.Type() (opcode.String() in the removed
// line) and err.String().
func (stats *statistics) logRequestTotalDuration(opcode protocol.Op, requestBegin time.Time, err protocol.Error) {
94-
stats.requestTotalDuration.WithLabelValues(opcode.String(), err.String()).Observe(time.Since(requestBegin).Seconds())
92+
stats.requestTotalDuration.WithLabelValues(opcode.Type(), err.String()).Observe(time.Since(requestBegin).Seconds())
9593
}
9694

9795
// logEnqueueECDSARequest records that an ECDSA request was enqueued by
// calling Inc on the queuedECDSARequests metric.
func (stats *statistics) logEnqueueECDSARequest() {
	stats.queuedECDSARequests.Inc()
}

0 commit comments

Comments
 (0)