Skip to content

Commit d331673

Browse files
committed
feat: implement ready health check
Signed-off-by: Alessandro Yuichi Okimoto <yuichijpn@gmail.com>
1 parent e910a7b commit d331673

File tree

24 files changed

+676
-264
lines changed

24 files changed

+676
-264
lines changed

docker-compose/config/nginx/bucketeer.conf

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ server {
109109
add_header X-Content-Type-Options nosniff;
110110
add_header X-XSS-Protection "1; mode=block";
111111

112-
# Health check endpoint
112+
# Health check endpoints
113113
location /health {
114114
proxy_pass https://api_health_backend;
115115
proxy_set_header Host $host;
@@ -118,6 +118,14 @@ server {
118118
proxy_set_header X-Forwarded-Proto $scheme;
119119
}
120120

121+
location /ready {
122+
proxy_pass https://api_health_backend;
123+
proxy_set_header Host $host;
124+
proxy_set_header X-Real-IP $remote_addr;
125+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
126+
proxy_set_header X-Forwarded-Proto $scheme;
127+
}
128+
121129
# API Gateway gRPC service (main API)
122130
location /bucketeer.gateway.Gateway {
123131
grpc_pass grpcs://api_grpc_backend;
@@ -192,7 +200,7 @@ server {
192200
return 204;
193201
}
194202

195-
# Health check endpoint
203+
# Health check endpoints
196204
location /health {
197205
proxy_pass https://web_health_backend;
198206
proxy_set_header Host $host;
@@ -201,6 +209,14 @@ server {
201209
proxy_set_header X-Forwarded-Proto $scheme;
202210
}
203211

212+
location /ready {
213+
proxy_pass https://web_health_backend;
214+
proxy_set_header Host $host;
215+
proxy_set_header X-Real-IP $remote_addr;
216+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
217+
proxy_set_header X-Forwarded-Proto $scheme;
218+
}
219+
204220
# gRPC/gRPC-Web service routes (backend handles both protocols)
205221
location /bucketeer.account.AccountService {
206222
if ($is_grpc_web = 1) {

manifests/bucketeer/charts/api/templates/deployment.yaml

Lines changed: 12 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -21,12 +21,7 @@ spec:
2121
annotations:
2222
checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
2323
spec:
24-
# Ensure pod has sufficient time for graceful shutdown before SIGKILL
25-
# This matches GCP Spot VM termination window (30s) and allows:
26-
# - 3s for K8s to detect pod is not ready
27-
# - 20s for application graceful shutdown
28-
# - 7s safety margin
29-
terminationGracePeriodSeconds: 30
24+
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds | default 30 }}
3025
{{- with .Values.global.image.imagePullSecrets }}
3126
imagePullSecrets: {{- toYaml . | nindent 8 }}
3227
{{- end }}
@@ -166,7 +161,7 @@ spec:
166161
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
167162
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
168163
httpGet:
169-
path: /health
164+
path: /ready
170165
port: service
171166
scheme: HTTPS
172167
resources:
@@ -181,28 +176,18 @@ spec:
181176
- "/bin/sh"
182177
- "-c"
183178
- |
184-
# Step 1: Fail Envoy health check so K8s removes pod from endpoints
185-
wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
186-
187-
# Step 2: Wait for all active connections to drain (max 25s)
188-
# Uses Istio pattern: dynamically checks all connections excluding Envoy and TIME-WAIT
179+
# Wait for active requests to finish (not idle keep-alive connections)
189180
elapsed=0
190-
max_wait=25
181+
max_wait=30
191182
while [ $elapsed -lt $max_wait ]; do
192-
# Count active connections excluding Envoy process and TIME-WAIT states
193-
active_conns=$(ss -Htlp state all | grep -vE '(envoy|TIME-WAIT)' | wc -l | xargs)
194-
if [ "$active_conns" -eq 0 ]; then
195-
echo "All connections drained after ${elapsed}s"
196-
break
197-
fi
198-
echo "Waiting for $active_conns connections to drain..."
183+
active_reqs=$(wget -q -O- http://localhost:{{ .Values.envoy.adminPort }}/stats 2>/dev/null | grep "ingress_http.downstream_rq_active" | awk '{print $2}')
184+
[ -z "$active_reqs" ] && active_reqs=0
185+
[ "$active_reqs" -eq 0 ] && echo "All active requests completed in ${elapsed}s" && break
199186
sleep 1
200187
elapsed=$((elapsed + 1))
201188
done
202-
203-
if [ $elapsed -ge $max_wait ]; then
204-
echo "Timeout reached, forcing shutdown with $active_conns remaining connections"
205-
fi
189+
[ $elapsed -ge $max_wait ] && echo "Timeout: $active_reqs active requests remain"
190+
exit 0
206191
command: ["envoy"]
207192
args:
208193
- "-c"
@@ -233,6 +218,7 @@ spec:
233218
initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
234219
periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
235220
failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
221+
timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
236222
httpGet:
237223
path: /ready
238224
port: admin
@@ -241,11 +227,12 @@ spec:
241227
initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
242228
periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
243229
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
230+
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
244231
httpGet:
245232
path: /ready
246233
port: admin
247234
scheme: HTTP
248235
resources:
249236
{{ toYaml .Values.envoy.resources | indent 12 }}
250237
strategy:
251-
type: RollingUpdate
238+
{{ toYaml .Values.strategy | indent 4 }}

manifests/bucketeer/charts/api/templates/envoy-configmap.yaml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -224,12 +224,17 @@ data:
224224
allow_credentials: true
225225
max_age: "86400"
226226
routes:
227-
# Health check endpoint
227+
# Health check endpoints
228228
- match:
229229
prefix: /health
230230
route:
231231
cluster: api
232232
timeout: 35s
233+
- match:
234+
prefix: /ready
235+
route:
236+
cluster: api
237+
timeout: 35s
233238
# API REST v1 Gateway routes (Deprecated)
234239
- match:
235240
prefix: /v1/gateway

manifests/bucketeer/charts/api/values.yaml

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,11 @@ nodeSelector: {}
4848
pdb:
4949
enabled:
5050
maxUnavailable: 20%
51+
strategy:
52+
type: RollingUpdate
53+
rollingUpdate:
54+
maxUnavailable: 10%
55+
maxSurge: 1
5156
hpa:
5257
enabled:
5358
minReplicas:
@@ -90,17 +95,18 @@ ingress:
9095
name: api
9196
port:
9297
number: 9000
98+
terminationGracePeriodSeconds: 30
9399
health:
94100
livenessProbe:
95-
initialDelaySeconds: 10
96-
periodSeconds: 3
101+
initialDelaySeconds: 30
102+
periodSeconds: 15
97103
failureThreshold: 5
98104
timeoutSeconds: 5
99105
readinessProbe:
100-
initialDelaySeconds: 10
106+
initialDelaySeconds: 15
101107
periodSeconds: 3
102-
failureThreshold: 2
103-
timeoutSeconds: 3
108+
failureThreshold: 1
109+
timeoutSeconds: 5
104110
resources: {}
105111
serviceAccount:
106112
annotations: {}

manifests/bucketeer/charts/batch/templates/deployment.yaml

Lines changed: 14 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -22,12 +22,7 @@ spec:
2222
annotations:
2323
checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
2424
spec:
25-
# Ensure pod has sufficient time for graceful shutdown before SIGKILL
26-
# This matches GCP Spot VM termination window (30s) and allows:
27-
# - 3s for K8s to detect pod is not ready
28-
# - 20s for application graceful shutdown
29-
# - 7s safety margin
30-
terminationGracePeriodSeconds: 30
25+
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds | default 30 }}
3126
{{- with .Values.global.image.imagePullSecrets }}
3227
imagePullSecrets: {{- toYaml . | nindent 8 }}
3328
{{- end }}
@@ -193,6 +188,7 @@ spec:
193188
initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
194189
periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
195190
failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
191+
timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
196192
httpGet:
197193
path: /health
198194
port: service
@@ -201,8 +197,9 @@ spec:
201197
initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
202198
periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
203199
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
200+
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
204201
httpGet:
205-
path: /health
202+
path: /ready
206203
port: service
207204
scheme: HTTPS
208205
resources:
@@ -217,28 +214,18 @@ spec:
217214
- "/bin/sh"
218215
- "-c"
219216
- |
220-
# Step 1: Fail Envoy health check so K8s removes pod from endpoints
221-
wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
222-
223-
# Step 2: Wait for all active connections to drain (max 25s)
224-
# Uses Istio pattern: dynamically checks all connections excluding Envoy and TIME-WAIT
217+
# Wait for active requests to finish (not idle keep-alive connections)
225218
elapsed=0
226-
max_wait=25
219+
max_wait=30
227220
while [ $elapsed -lt $max_wait ]; do
228-
# Count active connections excluding Envoy process and TIME-WAIT states
229-
active_conns=$(ss -Htlp state all | grep -vE '(envoy|TIME-WAIT)' | wc -l | xargs)
230-
if [ "$active_conns" -eq 0 ]; then
231-
echo "All connections drained after ${elapsed}s"
232-
break
233-
fi
234-
echo "Waiting for $active_conns connections to drain..."
221+
active_reqs=$(wget -q -O- http://localhost:{{ .Values.envoy.adminPort }}/stats 2>/dev/null | grep "ingress_http.downstream_rq_active" | awk '{print $2}')
222+
[ -z "$active_reqs" ] && active_reqs=0
223+
[ "$active_reqs" -eq 0 ] && echo "All active requests completed in ${elapsed}s" && break
235224
sleep 1
236225
elapsed=$((elapsed + 1))
237226
done
238-
239-
if [ $elapsed -ge $max_wait ]; then
240-
echo "Timeout reached, forcing shutdown with $active_conns remaining connections"
241-
fi
227+
[ $elapsed -ge $max_wait ] && echo "Timeout: $active_reqs active requests remain"
228+
exit 0
242229
command: ["envoy"]
243230
args:
244231
- "-c"
@@ -264,6 +251,7 @@ spec:
264251
initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
265252
periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
266253
failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
254+
timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
267255
httpGet:
268256
path: /ready
269257
port: admin
@@ -272,6 +260,7 @@ spec:
272260
initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
273261
periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
274262
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
263+
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
275264
httpGet:
276265
path: /ready
277266
port: admin
@@ -301,5 +290,5 @@ spec:
301290
resources:
302291
{{ toYaml .Values.httpstan.resources | indent 12 }}
303292
strategy:
304-
type: RollingUpdate
293+
{{ toYaml .Values.strategy | indent 4 }}
305294
{{- end }}

manifests/bucketeer/charts/batch/templates/envoy-configmap.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -158,7 +158,7 @@ data:
158158
- name: :path
159159
string_match:
160160
exact: /health
161-
pass_through_mode: false
161+
pass_through_mode: true # Changed to true to allow app to handle /health and /ready
162162
- name: envoy.filters.http.router
163163
typed_config:
164164
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

manifests/bucketeer/charts/batch/values.yaml

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,11 @@ nodeSelector: {}
5959
pdb:
6060
enabled:
6161
maxUnavailable: 50%
62+
strategy:
63+
type: RollingUpdate
64+
rollingUpdate:
65+
maxUnavailable: 10%
66+
maxSurge: 1
6267

6368
hpa:
6469
enabled:
@@ -120,15 +125,15 @@ gcpMultiCluster:
120125

121126
health:
122127
livenessProbe:
123-
initialDelaySeconds: 10
124-
periodSeconds: 3
128+
initialDelaySeconds: 30
129+
periodSeconds: 15
125130
failureThreshold: 5
126131
timeoutSeconds: 5
127132
readinessProbe:
128-
initialDelaySeconds: 10
133+
initialDelaySeconds: 15
129134
periodSeconds: 3
130-
failureThreshold: 2
131-
timeoutSeconds: 3
135+
failureThreshold: 1
136+
timeoutSeconds: 5
132137

133138
resources: {}
134139

@@ -194,3 +199,4 @@ cronjob:
194199
- name: tag-deleter
195200
jobId: TagDeleter
196201
schedule: "0 0 * * *"
202+
terminationGracePeriodSeconds: 30

0 commit comments

Comments
 (0)