Skip to content

Commit ed02613

Browse files
authored
Extract HDR histogram implementation into a shared package (#4611)
1 parent a3df84f commit ed02613

File tree

8 files changed

+438
-419
lines changed

8 files changed

+438
-419
lines changed

internal/ds/histogram/doc.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// Package histogram provides histogram implementations that are used to track the distribution of metrics.
2+
package histogram

internal/ds/histogram/hdr.go

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
package histogram
2+
3+
import (
4+
"math"
5+
"math/bits"
6+
)
7+
8+
const (
	// defaultMinimumResolution is the default resolution used by Hdr.
	// Observed values are divided by the resolution before being bucketed,
	// so, compared to a resolution of 1.0, it gives a higher granularity
	// and preserves up to 3 fractional decimal digits.
	defaultMinimumResolution = .001

	// lowestTrackable represents the minimum value that the Hdr tracks
	// in its regular buckets. Essentially, it excludes negative numbers.
	// Most of the metrics tracked by histograms are durations,
	// where negative numbers are not expected.
	lowestTrackable = 0
)
20+
21+
// Hdr represents a distribution of metric samples' values as a histogram.
//
// A Hdr is the representation of a base-2 exponential histogram with two layers.
// The first layer has primary buckets whose boundaries are powers of two, and
// the second layer splits each primary bucket into an equal number of
// equally sized secondary buckets.
//
// Hdr has a series of (N * 2^m) buckets, where:
//
//	N = the number of primary buckets
//	m = the exponent defining the number (2^m) of secondary buckets
//	    inside each primary bucket
//
// The current version is: f(N = 25, m = 7) = 25 * 2^7 = 3200.
//
// Use NewHdr to obtain a ready-to-use value.
type Hdr struct {
	// Buckets stores the counters for each bin of the histogram,
	// keyed by bucket index.
	// It does not include counters for the untrackable values,
	// because they are exceptional cases that are tracked in a dedicated way
	// (see ExtraLowBucket and ExtraHighBucket).
	Buckets map[uint32]uint32

	// ExtraLowBucket counts occurrences of observed values smaller
	// than the minimum trackable value.
	ExtraLowBucket uint32

	// ExtraHighBucket counts occurrences of observed values bigger
	// than the maximum trackable value.
	ExtraHighBucket uint32

	// Max is the absolute observed maximum value.
	Max float64

	// Min is the absolute observed minimum value.
	Min float64

	// Sum is the sum of all observed values.
	Sum float64

	// Count is the number of observed values.
	Count uint32

	// MinimumResolution represents the resolution used by Hdr.
	// Every observed value is divided by it before its bucket is resolved.
	MinimumResolution float64
}
61+
62+
// NewHdr creates a new Hdr histogram with default settings.
63+
func NewHdr() *Hdr {
64+
return &Hdr{
65+
MinimumResolution: defaultMinimumResolution,
66+
Buckets: make(map[uint32]uint32),
67+
Max: -math.MaxFloat64,
68+
Min: math.MaxFloat64,
69+
}
70+
}
71+
72+
// Add adds a value to the Hdr histogram.
73+
func (h *Hdr) Add(v float64) {
74+
h.addToBucket(v)
75+
}
76+
77+
// addToBucket increments the counter of the bucket of the provided value.
78+
// If the value is lower or higher than the trackable limits
79+
// then it is counted into specific buckets. All the stats are also updated accordingly.
80+
func (h *Hdr) addToBucket(v float64) {
81+
if v > h.Max {
82+
h.Max = v
83+
}
84+
if v < h.Min {
85+
h.Min = v
86+
}
87+
88+
h.Count++
89+
h.Sum += v
90+
91+
v /= h.MinimumResolution
92+
93+
if v < lowestTrackable {
94+
h.ExtraLowBucket++
95+
return
96+
}
97+
if v > math.MaxInt64 {
98+
h.ExtraHighBucket++
99+
return
100+
}
101+
102+
h.Buckets[resolveBucketIndex(v)]++
103+
}
104+
105+
// resolveBucketIndex returns the index
106+
// of the bucket in the histogram for the provided value.
107+
func resolveBucketIndex(val float64) uint32 {
108+
if val < lowestTrackable {
109+
return 0
110+
}
111+
112+
// We upscale to the next integer to ensure that each sample falls
113+
// within a specific bucket, even when the value is fractional.
114+
// This avoids under-representing the distribution in the Hdr histogram.
115+
upscaled := uint64(math.Ceil(val))
116+
117+
// In Hdr histograms, bucket boundaries are usually defined as multiples of powers of 2,
118+
// allowing for efficient computation of bucket indexes.
119+
//
120+
// We define k=7 in our case, because it allows for sufficient granularity in the
121+
// distribution (2^7=128 primary buckets of which each can be further
122+
// subdivided if needed).
123+
//
124+
// k is the constant balancing factor between granularity and
125+
// computational efficiency.
126+
//
127+
// In our case:
128+
// i.e 2^7 = 128 ~ 100 = 10^2
129+
// 2^10 = 1024 ~ 1000 = 10^3
130+
// f(x) = 3*x + 1 - empiric formula that works for us
131+
// since f(2)=7 and f(3)=10
132+
const k = uint64(7)
133+
134+
// 256 = 1 << (k+1)
135+
if upscaled < 256 {
136+
return uint32(upscaled)
137+
}
138+
139+
// `nkdiff` helps us find the right bucket for `upscaled`. It does so by determining the
140+
// index for the "major" bucket (a set of values within a power of two range) and then
141+
// the "sub" bucket within that major bucket. This system provides us with a fine level
142+
// of granularity within a computationally efficient bucketing system. The result is a
143+
// histogram that provides a detailed representation of the distribution of values.
144+
//
145+
// Here we use some math to get simple formula
146+
// derivation:
147+
// let u = upscaled
148+
// let n = msb(u) - most significant digit position
149+
// i.e. n = floor(log(u, 2))
150+
// major_bucket_index = n - k + 1
151+
// sub_bucket_index = u>>(n - k) - (1<<k)
152+
// bucket = major_bucket_index << k + sub_bucket_index =
153+
// = (n-k+1)<<k + u>>(n-k) - (1<<k) =
154+
// = (n-k)<<k + u>>(n-k)
155+
//
156+
nkdiff := uint64(bits.Len64(upscaled>>k)) - 1 //nolint:gosec // msb index
157+
158+
// We cast safely downscaling because we don't expect we may hit the uint32 limit
159+
// with the bucket index. The bucket represented from the index as MaxUint32
160+
// would be a very huge number bigger than the trackable limits.
161+
return uint32((nkdiff << k) + (upscaled >> nkdiff)) //nolint:gosec
162+
}

0 commit comments

Comments
 (0)