Skip to content

Commit b28a031

Browse files
committed
feat: implement ready and live health checks
Signed-off-by: Alessandro Yuichi Okimoto <yuichijpn@gmail.com>
1 parent e910a7b commit b28a031

File tree

19 files changed, with 437 additions and 119 deletions.

docker-compose/config/nginx/bucketeer.conf

Lines changed: 50 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ server {
109109
add_header X-Content-Type-Options nosniff;
110110
add_header X-XSS-Protection "1; mode=block";
111111

112-
# Health check endpoint
112+
# Health check endpoints
113113
location /health {
114114
proxy_pass https://api_health_backend;
115115
proxy_set_header Host $host;
@@ -118,6 +118,30 @@ server {
118118
proxy_set_header X-Forwarded-Proto $scheme;
119119
}
120120

121+
location /ready {
122+
proxy_pass https://api_health_backend;
123+
proxy_set_header Host $host;
124+
proxy_set_header X-Real-IP $remote_addr;
125+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
126+
proxy_set_header X-Forwarded-Proto $scheme;
127+
}
128+
129+
location /live {
130+
proxy_pass https://api_health_backend;
131+
proxy_set_header Host $host;
132+
proxy_set_header X-Real-IP $remote_addr;
133+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
134+
proxy_set_header X-Forwarded-Proto $scheme;
135+
}
136+
137+
location /drain {
138+
proxy_pass https://api_health_backend;
139+
proxy_set_header Host $host;
140+
proxy_set_header X-Real-IP $remote_addr;
141+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
142+
proxy_set_header X-Forwarded-Proto $scheme;
143+
}
144+
121145
# API Gateway gRPC service (main API)
122146
location /bucketeer.gateway.Gateway {
123147
grpc_pass grpcs://api_grpc_backend;
@@ -192,7 +216,7 @@ server {
192216
return 204;
193217
}
194218

195-
# Health check endpoint
219+
# Health check endpoints
196220
location /health {
197221
proxy_pass https://web_health_backend;
198222
proxy_set_header Host $host;
@@ -201,6 +225,30 @@ server {
201225
proxy_set_header X-Forwarded-Proto $scheme;
202226
}
203227

228+
location /ready {
229+
proxy_pass https://web_health_backend;
230+
proxy_set_header Host $host;
231+
proxy_set_header X-Real-IP $remote_addr;
232+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
233+
proxy_set_header X-Forwarded-Proto $scheme;
234+
}
235+
236+
location /live {
237+
proxy_pass https://web_health_backend;
238+
proxy_set_header Host $host;
239+
proxy_set_header X-Real-IP $remote_addr;
240+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
241+
proxy_set_header X-Forwarded-Proto $scheme;
242+
}
243+
244+
location /drain {
245+
proxy_pass https://web_health_backend;
246+
proxy_set_header Host $host;
247+
proxy_set_header X-Real-IP $remote_addr;
248+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
249+
proxy_set_header X-Forwarded-Proto $scheme;
250+
}
251+
204252
# gRPC/gRPC-Web service routes (backend handles both protocols)
205253
location /bucketeer.account.AccountService {
206254
if ($is_grpc_web = 1) {

manifests/bucketeer/charts/api/templates/deployment.yaml

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,17 @@ spec:
5252
image: "{{ .Values.image.repository }}:{{ .Values.global.image.tag }}"
5353
imagePullPolicy: {{ .Values.image.pullPolicy }}
5454
args: ["server"]
55+
lifecycle:
56+
preStop:
57+
exec:
58+
command:
59+
- "/bin/sh"
60+
- "-c"
61+
- |
62+
# Drain the application to fail readiness probe
63+
wget --post-data='' --no-check-certificate -O- https://localhost:{{ .Values.env.port }}/drain || true
64+
# Wait a moment for the readiness probe to detect the drain
65+
sleep 3
5566
env:
5667
- name: BUCKETEER_API_PROFILE
5768
value: "{{.Values.env.profile}}"
@@ -157,7 +168,7 @@ spec:
157168
failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
158169
timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
159170
httpGet:
160-
path: /health
171+
path: /live
161172
port: service
162173
scheme: HTTPS
163174
readinessProbe:
@@ -166,7 +177,7 @@ spec:
166177
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
167178
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
168179
httpGet:
169-
path: /health
180+
path: /ready
170181
port: service
171182
scheme: HTTPS
172183
resources:
@@ -181,28 +192,15 @@ spec:
181192
- "/bin/sh"
182193
- "-c"
183194
- |
184-
# Step 1: Fail Envoy health check so K8s removes pod from endpoints
185-
wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
186-
187-
# Step 2: Wait for all active connections to drain (max 25s)
188-
# Uses Istio pattern: dynamically checks all connections excluding Envoy and TIME-WAIT
189195
elapsed=0
190196
max_wait=25
191197
while [ $elapsed -lt $max_wait ]; do
192-
# Count active connections excluding Envoy process and TIME-WAIT states
193198
active_conns=$(ss -Htlp state all | grep -vE '(envoy|TIME-WAIT)' | wc -l | xargs)
194-
if [ "$active_conns" -eq 0 ]; then
195-
echo "All connections drained after ${elapsed}s"
196-
break
197-
fi
198-
echo "Waiting for $active_conns connections to drain..."
199+
[ "$active_conns" -eq 0 ] && echo "Drained in ${elapsed}s" && break
199200
sleep 1
200201
elapsed=$((elapsed + 1))
201202
done
202-
203-
if [ $elapsed -ge $max_wait ]; then
204-
echo "Timeout reached, forcing shutdown with $active_conns remaining connections"
205-
fi
203+
# Report a drain timeout. Written as if/fi rather than `[ ... ] && echo` because
# this is the script's last command: a false test in an AND-list would make the
# preStop hook exit non-zero even when all connections drained cleanly.
if [ $elapsed -ge $max_wait ]; then echo "Timeout: $active_conns connections remain"; fi
206204
command: ["envoy"]
207205
args:
208206
- "-c"
@@ -233,6 +231,7 @@ spec:
233231
initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
234232
periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
235233
failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
234+
timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
236235
httpGet:
237236
path: /ready
238237
port: admin
@@ -241,6 +240,7 @@ spec:
241240
initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
242241
periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
243242
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
243+
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
244244
httpGet:
245245
path: /ready
246246
port: admin

manifests/bucketeer/charts/api/templates/envoy-configmap.yaml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,12 +224,27 @@ data:
224224
allow_credentials: true
225225
max_age: "86400"
226226
routes:
227-
# Health check endpoint
227+
# Health check endpoints
228228
- match:
229229
prefix: /health
230230
route:
231231
cluster: api
232232
timeout: 35s
233+
- match:
234+
prefix: /ready
235+
route:
236+
cluster: api
237+
timeout: 35s
238+
- match:
239+
prefix: /live
240+
route:
241+
cluster: api
242+
timeout: 35s
243+
- match:
244+
prefix: /drain
245+
route:
246+
cluster: api
247+
timeout: 35s
233248
# API REST v1 Gateway routes (Deprecated)
234249
- match:
235250
prefix: /v1/gateway

manifests/bucketeer/charts/api/values.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,13 @@ ingress:
9393
health:
9494
livenessProbe:
9595
initialDelaySeconds: 10
96-
periodSeconds: 3
97-
failureThreshold: 5
98-
timeoutSeconds: 5
96+
periodSeconds: 15
97+
failureThreshold: 3
98+
timeoutSeconds: 3
9999
readinessProbe:
100100
initialDelaySeconds: 10
101101
periodSeconds: 3
102-
failureThreshold: 2
102+
failureThreshold: 1
103103
timeoutSeconds: 3
104104
resources: {}
105105
serviceAccount:

manifests/bucketeer/charts/batch/templates/deployment.yaml

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,17 @@ spec:
6060
{{- range .Values.env.nonPersistentChildRedis.addresses }}
6161
- --non-persistent-child-redis-addresses={{ . }}
6262
{{- end }}
63+
lifecycle:
64+
preStop:
65+
exec:
66+
command:
67+
- "/bin/sh"
68+
- "-c"
69+
- |
70+
# Drain the application to fail readiness probe
71+
wget --post-data='' --no-check-certificate -O- https://localhost:{{ .Values.env.port }}/drain || true
72+
# Wait a moment for the readiness probe to detect the drain
73+
sleep 3
6374
env:
6475
- name: BIGQUERY_WRITER_EMULATOR_HOST
6576
value: "{{.Values.env.bigqueryWriterEmulatorHost}}"
@@ -193,16 +204,18 @@ spec:
193204
initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
194205
periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
195206
failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
207+
timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
196208
httpGet:
197-
path: /health
209+
path: /live
198210
port: service
199211
scheme: HTTPS
200212
readinessProbe:
201213
initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
202214
periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
203215
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
216+
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
204217
httpGet:
205-
path: /health
218+
path: /ready
206219
port: service
207220
scheme: HTTPS
208221
resources:
@@ -217,28 +230,15 @@ spec:
217230
- "/bin/sh"
218231
- "-c"
219232
- |
220-
# Step 1: Fail Envoy health check so K8s removes pod from endpoints
221-
wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
222-
223-
# Step 2: Wait for all active connections to drain (max 25s)
224-
# Uses Istio pattern: dynamically checks all connections excluding Envoy and TIME-WAIT
225233
elapsed=0
226234
max_wait=25
227235
while [ $elapsed -lt $max_wait ]; do
228-
# Count active connections excluding Envoy process and TIME-WAIT states
229236
active_conns=$(ss -Htlp state all | grep -vE '(envoy|TIME-WAIT)' | wc -l | xargs)
230-
if [ "$active_conns" -eq 0 ]; then
231-
echo "All connections drained after ${elapsed}s"
232-
break
233-
fi
234-
echo "Waiting for $active_conns connections to drain..."
237+
[ "$active_conns" -eq 0 ] && echo "Drained in ${elapsed}s" && break
235238
sleep 1
236239
elapsed=$((elapsed + 1))
237240
done
238-
239-
if [ $elapsed -ge $max_wait ]; then
240-
echo "Timeout reached, forcing shutdown with $active_conns remaining connections"
241-
fi
241+
# Report a drain timeout. Written as if/fi rather than `[ ... ] && echo` because
# this is the script's last command: a false test in an AND-list would make the
# preStop hook exit non-zero even when all connections drained cleanly.
if [ $elapsed -ge $max_wait ]; then echo "Timeout: $active_conns connections remain"; fi
242242
command: ["envoy"]
243243
args:
244244
- "-c"
@@ -264,6 +264,7 @@ spec:
264264
initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
265265
periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
266266
failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
267+
timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
267268
httpGet:
268269
path: /ready
269270
port: admin
@@ -272,6 +273,7 @@ spec:
272273
initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
273274
periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
274275
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
276+
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
275277
httpGet:
276278
path: /ready
277279
port: admin

manifests/bucketeer/charts/batch/templates/envoy-configmap.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ data:
158158
- name: :path
159159
string_match:
160160
exact: /health
161-
pass_through_mode: false
161+
# Pass requests matched by this filter through to the upstream application
# so the app, not Envoy, produces the health response.
# NOTE(review): the only path match visible in this hunk is exact /health, so
# /ready, /live and /drain presumably never hit this filter — confirm against
# the full Envoy config before relying on this setting for those paths.
pass_through_mode: true
162162
- name: envoy.filters.http.router
163163
typed_config:
164164
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

manifests/bucketeer/charts/batch/values.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,13 +121,13 @@ gcpMultiCluster:
121121
health:
122122
livenessProbe:
123123
initialDelaySeconds: 10
124-
periodSeconds: 3
125-
failureThreshold: 5
126-
timeoutSeconds: 5
124+
periodSeconds: 15
125+
failureThreshold: 3
126+
timeoutSeconds: 3
127127
readinessProbe:
128128
initialDelaySeconds: 10
129129
periodSeconds: 3
130-
failureThreshold: 2
130+
failureThreshold: 1
131131
timeoutSeconds: 3
132132

133133
resources: {}

0 commit comments

Comments
 (0)