Skip to content

Commit 2891312

Browse files
authored
Merge pull request #125 from postfinance/feat/v1.13
metrics and logging refactor !
2 parents 852ab85 + e324325 commit 2891312

File tree

10 files changed

+105
-112
lines changed

10 files changed

+105
-112
lines changed

.github/actions/check-deployment/action.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ runs:
2222
run: |
2323
echo "Check for unexpected log lines"
2424
test $(kubectl -n ${{ inputs.namespace }} logs -l app.kubernetes.io/name=kubenurse \
25-
| grep -v "I'm ready to help you" \
25+
| grep -v "kubenurse just started" \
2626
| wc -l) -eq 0 \
2727
- shell: bash
2828
run: |
@@ -36,14 +36,14 @@ runs:
3636
curl -k -s --resolve \
3737
${{ inputs.svc-domain }}:443:127.0.0.1 \
3838
https://${{ inputs.svc-domain }}:443/metrics \
39-
| grep "kubenurse_request_" > /dev/null
39+
| grep "kubenurse_httpclient_request_" > /dev/null
4040
- shell: bash
4141
run: |
4242
echo "Check that metrics contain neighbours"
4343
curl -k -s --resolve \
4444
${{ inputs.svc-domain }}:443:127.0.0.1 \
4545
https://${{ inputs.svc-domain }}:443/metrics \
46-
| grep "kubenurse_request_" | grep 'type="path_' > /dev/null
46+
| grep "kubenurse_httpclient_request_" | grep 'type="path_' > /dev/null
4747
- shell: bash
4848
run: |
4949
echo "Check that metrics contain no errors"

CHANGELOG.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,30 @@
1+
## [1.13.0] - 2024-04-12
2+
3+
**Full Changelog**: https://github.com/postfinance/kubenurse/compare/v1.12.3...v1.13.0
4+
5+
### 🚀 Features
6+
7+
- [**breaking**] Use slog and add event type to error counter - ([e9d51a2](https://github.com/postfinance/kubenurse/commit/e9d51a2f781eaa2cb7e415fd175d158fdd681bb3))
8+
9+
### 🐛 Bug Fixes
10+
11+
- *(ci)* Change kubenurse startup string in test condition - ([bd91d7d](https://github.com/postfinance/kubenurse/commit/bd91d7da68b3408d70dfe846562557396b4522c4))
12+
- *(ci)* Update metric name for sanity check - ([c3e7287](https://github.com/postfinance/kubenurse/commit/c3e72874e509cff3fa72381d8aa882878ef32385))
13+
14+
### 🚜 Refactor
15+
16+
- [**breaking**] Remove kubenurse_request_duration histogram - ([3061b64](https://github.com/postfinance/kubenurse/commit/3061b6490d26e39bf096d050e54e15f7d43c0f8e))
17+
18+
### 📚 Documentation
19+
20+
- *(drawings)* Add excalidraw file - ([852ab85](https://github.com/postfinance/kubenurse/commit/852ab85fc2ec00804a290730b5276fb9689116f4))
21+
- *(readme.md)* Format metrics as a table - ([6a35209](https://github.com/postfinance/kubenurse/commit/6a3520955e2e91e3cccb203c528e23cd2032e127))
22+
23+
### Build
24+
25+
- *(artifacthub)* Add validation file - ([e2bf079](https://github.com/postfinance/kubenurse/commit/e2bf079504223a3293ed1be73d061bed49e00e02))
26+
27+
128
## [1.12.1] - 2024-04-05
229

330
**Full Changelog**: https://github.com/postfinance/kubenurse/compare/v1.12.0...1.12.1

README.md

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ The following command can be used to install kubenurse with Helm: `helm upgrade
4747
#### Configuration settings
4848

4949
| Setting | Description | Default |
50-
|------------------------------------|----------------------------------------------------------------------------------------------------------------------|------------------------------------|
50+
| ---------------------------------- | -------------------------------------------------------------------------------------------------------------------- | ---------------------------------- |
5151
| daemonset.image.repository | The repository name | `postfinance/kubenurse` |
5252
| daemonset.image.tag | The tag/ version of the image | `v1.4.0` |
5353
| daemonset.podLabels | Additional labels to be added to the pods of the daemonset | `[]` |
@@ -74,7 +74,7 @@ The following command can be used to install kubenurse with Helm: `helm upgrade
7474
| allow_unschedulable | Sets `KUBENURSE_ALLOW_UNSCHEDULABLE` environment variable | `false` |
7575
| neighbour_filter | Sets `KUBENURSE_NEIGHBOUR_FILTER` environment variable | `app.kubernetes.io/name=kubenurse` |
7676
| neighbour_limit | Sets `KUBENURSE_NEIGHBOUR_LIMIT` environment variable | `10` |
77-
| histogram_buckets | Sets `KUBENURSE_HISTOGRAM_BUCKETS` environment variable | |
77+
| histogram_buckets | Sets `KUBENURSE_HISTOGRAM_BUCKETS` environment variable | |
7878
| extra_ca | Sets `KUBENURSE_EXTRA_CA` environment variable | |
7979
| check_api_server_direct | Sets `KUBENURSE_CHECK_API_SERVER_DIRECT` environment variable | `true` |
8080
| check_api_server_dns | Sets `KUBENURSE_CHECK_API_SERVER_DNS` environment variable | `true` |
@@ -298,13 +298,21 @@ All performed checks expose metrics which can be used to monitor/alert:
298298

299299
At `/metrics` you will find the following metrics:
300300

301-
- `kubenurse_errors_total`: Kubenurse error counter partitioned by error type
302-
- `kubenurse_request_duration`: a histogram for Kubenurse request duration partitioned by error type
303-
- `kubenurse_httpclient_request_duration_seconds`: a latency histogram of request latencies from the kubenurse http client.
304-
- `kubenurse_httpclient_trace_requests_total`: a latency histogram for the http
305-
client _trace_ metric instrumentation, with detailed statistics for e.g.
306-
`dns_start`, `got_conn` events, and more. the details can be seen in the
307-
[`httptrace.go`](https://github.com/postfinance/kubenurse/blob/52767fbb280b65c06ac926dac49dd874e9ec4aee/internal/servicecheck/httptrace.go#L73)
308-
file
309-
- `kubenurse_neighbourhood_incoming_checks`: a gauge which reports how many
310-
unique neighbours have queried the current pod in the last minute
301+
| metric name | labels | description |
302+
| ----------------------------------------------------- | -------------------- | ---------------------------------------------------------------------------------------------------------------------------- |
303+
| ~~`kubenurse request duration`~~ | `type` | (deprecated since v1.13.0) latency histogram for request duration, replaced with the metric below. |
304+
| `kubenurse httpclient request duration seconds` | `type` | latency histogram for request duration, partitioned by request type |
305+
| `kubenurse httpclient trace request duration seconds` | `type, event` | latency histogram for httpclient _trace_ metric instrumentation, partitioned by request type and httptrace connection events |
306+
| `kubenurse httpclient requests total` | `type, code, method` | counter for the total number of http requests, partitioned by HTTP code, method, and request type |
307+
| `kubenurse errors total` | `type, event` | error counter, partitioned by httptrace event and request type |
308+
| `kubenurse neighbourhood incoming checks` | n\a | gauge which reports how many unique neighbours have queried the current pod in the last minute |
309+
310+
For metrics partitioned with a `type` label, it is possible to precisely know
311+
which request type increased an error counter, or to compare the latencies of
312+
multiple request types, for example compare how your service and ingress
313+
latencies differ.
314+
315+
Some `event` labels include `dns_start`, `got_conn`, `tls_handshake_done`, and
316+
more. the details can be seen in the
317+
[`httptrace.go`](https://github.com/postfinance/kubenurse/blob/v1.13.0/internal/servicecheck/httptrace.go#L91)
318+
file.

internal/kubenurse/server.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ package kubenurse
44
import (
55
"context"
66
"fmt"
7-
"log"
7+
"log/slog"
88
"net/http"
99
"os"
1010
"strconv"
@@ -119,10 +119,11 @@ func New(ctx context.Context, c client.Client) (*Server, error) { //nolint:funle
119119

120120
if bucketsString := os.Getenv("KUBENURSE_HISTOGRAM_BUCKETS"); bucketsString != "" {
121121
for _, bucketStr := range strings.Split(bucketsString, ",") {
122-
bucket, e := strconv.ParseFloat(bucketStr, 64)
122+
bucket, err := strconv.ParseFloat(bucketStr, 64)
123123

124-
if e != nil {
125-
log.Fatalf("couldn't parse one of the custom histogram buckets. error:\n%v", e)
124+
if err != nil {
125+
slog.Error("couldn't parse one of the custom histogram buckets", "bucket", bucket, "err", err)
126+
os.Exit(1)
126127
}
127128

128129
histogramBuckets = append(histogramBuckets, bucket)
@@ -243,6 +244,8 @@ func (s *Server) Run() error {
243244
}()
244245
}
245246

247+
slog.Info("kubenurse just started")
248+
246249
wg.Wait()
247250
close(errc)
248251

internal/servicecheck/httptrace.go

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ package servicecheck
33
import (
44
"context"
55
"crypto/tls"
6-
"log"
6+
"log/slog"
77
"net/http"
88
"net/http/httptrace"
99
"time"
@@ -24,7 +24,9 @@ func (rt RoundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) {
2424

2525
// This collects traces and logs errors. As promhttp.InstrumentRoundTripperTrace doesn't process
2626
// errors, this is custom made and inspired by prometheus/client_golang's promhttp
27-
func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durationHistogram []float64) http.RoundTripper {
27+
//
28+
//nolint:funlen // needed to pack all histograms and use them directly in the httptrace wrapper
29+
func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durHist []float64) http.RoundTripper {
2830
httpclientReqTotal := prometheus.NewCounterVec(
2931
prometheus.CounterOpts{
3032
Namespace: MetricsNamespace,
@@ -39,7 +41,7 @@ func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durati
3941
Namespace: MetricsNamespace,
4042
Name: "httpclient_request_duration_seconds",
4143
Help: "A latency histogram of request latencies from the kubenurse http client.",
42-
Buckets: durationHistogram,
44+
Buckets: durHist,
4345
},
4446
[]string{"type"},
4547
)
@@ -49,20 +51,31 @@ func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durati
4951
Namespace: MetricsNamespace,
5052
Name: "httpclient_trace_request_duration_seconds",
5153
Help: "Latency histogram for requests from the kubenurse http client. Time in seconds since the start of the http request.",
52-
Buckets: durationHistogram,
54+
Buckets: durHist,
5355
},
5456
[]string{"event", "type"},
5557
)
5658

57-
registry.MustRegister(httpclientReqTotal, httpclientReqDuration, httpclientTraceReqDuration)
59+
errorCounter := prometheus.NewCounterVec(
60+
prometheus.CounterOpts{
61+
Namespace: MetricsNamespace,
62+
Name: "errors_total",
63+
Help: "Kubenurse error counter partitioned by error type",
64+
},
65+
[]string{"event", "type"},
66+
)
67+
68+
registry.MustRegister(httpclientReqTotal, httpclientReqDuration, httpclientTraceReqDuration, errorCounter)
5869

5970
collectMetric := func(traceEventType string, start time.Time, r *http.Request, err error) {
6071
td := time.Since(start).Seconds()
6172
kubenurseTypeLabel := r.Context().Value(kubenurseTypeKey{}).(string)
6273

63-
// If we got an error inside a trace, log it and do not collect metrics
74+
// If we get an error inside a trace, log it
6475
if err != nil {
65-
log.Printf("httptrace: failed %s for %s with %v", traceEventType, kubenurseTypeLabel, err)
76+
errorCounter.WithLabelValues(traceEventType, kubenurseTypeLabel).Inc()
77+
slog.Error("request failure in httptrace", "event_type", traceEventType, "request_type", kubenurseTypeLabel, "err", err)
78+
6679
return
6780
}
6881

@@ -94,8 +107,8 @@ func withHttptrace(registry *prometheus.Registry, next http.RoundTripper, durati
94107
TLSHandshakeStart: func() {
95108
collectMetric("tls_handshake_start", start, r, nil)
96109
},
97-
TLSHandshakeDone: func(_ tls.ConnectionState, _ error) {
98-
collectMetric("tls_handshake_done", start, r, nil)
110+
TLSHandshakeDone: func(_ tls.ConnectionState, err error) {
111+
collectMetric("tls_handshake_done", start, r, err)
99112
},
100113
WroteRequest: func(info httptrace.WroteRequestInfo) {
101114
collectMetric("wrote_request", start, r, info.Err)

internal/servicecheck/neighbours.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func (c *Checker) checkNeighbours(nh []*Neighbour) {
103103
return c.doRequest(ctx, "http://"+neighbour.PodIP+":8080/alwayshappy", true)
104104
}
105105

106-
_, _ = c.measure(check, "path_"+neighbour.NodeName)
106+
c.measure(check, "path_"+neighbour.NodeName)
107107
}
108108
}
109109

internal/servicecheck/servicecheck.go

Lines changed: 13 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import (
55
"context"
66
"crypto/tls"
77
"fmt"
8-
"log"
8+
"log/slog"
99
"net"
1010
"net/http"
1111
"os"
@@ -26,31 +26,11 @@ const (
2626
// results. Other parameters of the Checker struct need to be configured separately.
2727
func New(_ context.Context, cl client.Client, promRegistry *prometheus.Registry,
2828
allowUnschedulable bool, cacheTTL time.Duration, durationHistogramBuckets []float64) (*Checker, error) {
29-
errorCounter := prometheus.NewCounterVec(
30-
prometheus.CounterOpts{
31-
Namespace: MetricsNamespace,
32-
Name: "errors_total",
33-
Help: "Kubenurse error counter partitioned by error type",
34-
},
35-
[]string{"type"},
36-
)
37-
38-
durationHistogram := prometheus.NewHistogramVec(
39-
prometheus.HistogramOpts{
40-
Namespace: MetricsNamespace,
41-
Name: "request_duration",
42-
Help: "Kubenurse request duration partitioned by target path",
43-
Buckets: durationHistogramBuckets,
44-
},
45-
[]string{"type"},
46-
)
47-
48-
promRegistry.MustRegister(errorCounter, durationHistogram)
49-
5029
// setup http transport
5130
tlsConfig, err := generateTLSConfig(os.Getenv("KUBENURSE_EXTRA_CA"))
5231
if err != nil {
53-
log.Printf("cannot generate tlsConfig with KUBENURSE_EXTRA_CA: %s", err)
32+
slog.Error("cannot generate tlsConfig with provided KUBENURSE_EXTRA_CA. Continuing with default tlsConfig",
33+
"KUBENURSE_EXTRA_CA", os.Getenv("KUBENURSE_EXTRA_CA"), "err", err)
5434

5535
tlsConfig = &tls.Config{MinVersion: tls.VersionTLS12}
5636
}
@@ -82,40 +62,26 @@ func New(_ context.Context, cl client.Client, promRegistry *prometheus.Registry,
8262
client: cl,
8363
httpClient: httpClient,
8464
cacheTTL: cacheTTL,
85-
errorCounter: errorCounter,
86-
durationHistogram: durationHistogram,
8765
stop: make(chan struct{}),
8866
}, nil
8967
}
9068

9169
// Run runs all servicechecks and returns the result togeter with a boolean which indicates success. The cache
9270
// is respected.
93-
func (c *Checker) Run() (Result, bool) {
94-
var (
95-
haserr bool
96-
err error
97-
)
98-
71+
func (c *Checker) Run() {
9972
// Run Checks
10073
res := Result{}
10174

102-
res.APIServerDirect, err = c.measure(c.APIServerDirect, "api_server_direct")
103-
haserr = haserr || (err != nil)
104-
105-
res.APIServerDNS, err = c.measure(c.APIServerDNS, "api_server_dns")
106-
haserr = haserr || (err != nil)
107-
108-
res.MeIngress, err = c.measure(c.MeIngress, "me_ingress")
109-
haserr = haserr || (err != nil)
110-
111-
res.MeService, err = c.measure(c.MeService, "me_service")
112-
haserr = haserr || (err != nil)
75+
res.APIServerDirect = c.measure(c.APIServerDirect, "api_server_direct")
76+
res.APIServerDNS = c.measure(c.APIServerDNS, "api_server_dns")
77+
res.MeIngress = c.measure(c.MeIngress, "me_ingress")
78+
res.MeService = c.measure(c.MeService, "me_service")
11379

11480
if c.SkipCheckNeighbourhood {
11581
res.NeighbourhoodState = skippedStr
11682
} else {
83+
var err error
11784
res.Neighbourhood, err = c.GetNeighbours(context.Background(), c.KubenurseNamespace, c.NeighbourFilter)
118-
haserr = haserr || (err != nil)
11985

12086
// Neighbourhood special error treating
12187
if err != nil {
@@ -130,8 +96,6 @@ func (c *Checker) Run() (Result, bool) {
13096

13197
// Cache result (used for /alive handler)
13298
c.LastCheckResult = &res
133-
134-
return res, haserr
13599
}
136100

137101
// RunScheduled runs the checks in the specified interval which can be used to keep the metrics up-to-date. This
@@ -196,23 +160,13 @@ func (c *Checker) MeService(ctx context.Context) (string, error) {
196160
}
197161

198162
// measure implements metric collections for the check
199-
func (c *Checker) measure(check Check, label string) (string, error) {
200-
start := time.Now()
201-
163+
func (c *Checker) measure(check Check, requestType string) string {
202164
// Add our label (check type) to the context so our http tracer can annotate
203165
// metrics and errors based with the label
204-
ctx := context.WithValue(context.Background(), kubenurseTypeKey{}, label)
166+
ctx := context.WithValue(context.Background(), kubenurseTypeKey{}, requestType)
205167

206168
// Execute check
207-
res, err := check(ctx)
208-
209-
// Process metrics
210-
c.durationHistogram.WithLabelValues(label).Observe(time.Since(start).Seconds())
211-
212-
if err != nil {
213-
log.Printf("failed request for %s with %v", label, err)
214-
c.errorCounter.WithLabelValues(label).Inc()
215-
}
169+
res, _ := check(ctx) // this error is ignored as it is already logged in httptrace
216170

217-
return res, err
171+
return res
218172
}

internal/servicecheck/servicecheck_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ func TestCombined(t *testing.T) {
4242

4343
t.Run("run", func(t *testing.T) {
4444
r := require.New(t)
45-
result, hadError := checker.Run()
46-
r.True(hadError)
47-
r.Len(result.Neighbourhood, 1)
45+
checker.Run()
46+
47+
r.Len(checker.LastCheckResult.Neighbourhood, 1)
4848
})
4949

5050
t.Run("scheduled", func(t *testing.T) {

internal/servicecheck/types.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import (
55
"net/http"
66
"time"
77

8-
"github.com/prometheus/client_golang/prometheus"
98
"sigs.k8s.io/controller-runtime/pkg/client"
109
)
1110

@@ -39,10 +38,6 @@ type Checker struct {
3938
// Controller runtime cached client
4039
client client.Client
4140

42-
// metrics
43-
errorCounter *prometheus.CounterVec
44-
durationHistogram *prometheus.HistogramVec
45-
4641
// Http Client for https requests
4742
httpClient *http.Client
4843

0 commit comments

Comments
 (0)