Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b8337b9
fix: improve timeout handling and health check resilience during traf…
cre8ivejp Sep 23, 2025
e616e3d
chore: set missing timeout settings for api server
cre8ivejp Oct 6, 2025
286b464
chore: adjust timeout settings
cre8ivejp Oct 7, 2025
e2f1699
feat: add metrics to monitor grpc server shutdown
cre8ivejp Oct 7, 2025
5618c14
chore: implement prometheus push gateway
cre8ivejp Oct 7, 2025
59fcb8e
fix: metrics service name
cre8ivejp Oct 7, 2025
0a4e7d3
chore: set prometheusPushGatewayURL for all services
cre8ivejp Oct 8, 2025
d4e3fe1
fix: deprecated collector functions
cre8ivejp Oct 8, 2025
72941f4
fix: grouping label conflict
cre8ivejp Oct 8, 2025
d8bc12e
chore: split server and service labels
cre8ivejp Oct 8, 2025
2649024
fix: missing server label
cre8ivejp Oct 8, 2025
c8c4fa3
chore: remove shutdown metrics
cre8ivejp Oct 9, 2025
6047110
chore: rebase
cre8ivejp Oct 9, 2025
cb90076
fix: graceful shutdown for all services
cre8ivejp Oct 9, 2025
95e1943
chore: set the timeout for k8s readiness and liveness
cre8ivejp Oct 9, 2025
abe9498
fix: lint error
cre8ivejp Oct 9, 2025
b66f813
fix: shutting down process
cre8ivejp Oct 9, 2025
cfdfa18
fix: shutdown order
cre8ivejp Oct 9, 2025
cc11c98
chore: remove drain_listeners from envoy prestop
cre8ivejp Oct 9, 2025
1f51d03
fix: 503 errors when shutting down the server
cre8ivejp Oct 9, 2025
e910a7b
chore: remove internal shutdown ready handler
cre8ivejp Oct 9, 2025
b9d6426
feat: implement ready health check
cre8ivejp Oct 15, 2025
faed2ab
chore: change idle timeout settings to improve possible 499 errors
cre8ivejp Oct 16, 2025
4b641a6
fix: health check response when getting sigterm
cre8ivejp Oct 16, 2025
2e31c9a
chore: change retry count
cre8ivejp Oct 16, 2025
eebcfd1
chore: remove envoy healthcheck fail call
cre8ivejp Oct 17, 2025
1d75ac9
chore: sleep before stopping health check
cre8ivejp Oct 17, 2025
f6d8894
chore: change api gateway server context
cre8ivejp Oct 18, 2025
88765b9
chore: change envoy pre stop script
cre8ivejp Oct 20, 2025
25b3cbb
chore: remove duplicate defer functions
cre8ivejp Oct 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions docker-compose/config/nginx/bucketeer.conf
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ server {
add_header X-Content-Type-Options nosniff;
add_header X-XSS-Protection "1; mode=block";

# Health check endpoint
# Health check endpoints
location /health {
proxy_pass https://api_health_backend;
proxy_set_header Host $host;
Expand All @@ -118,6 +118,14 @@ server {
proxy_set_header X-Forwarded-Proto $scheme;
}

# Forward /ready requests to the api_health_backend upstream over HTTPS.
location /ready {
proxy_pass https://api_health_backend;
# Pass the original Host header, client address, and request scheme to the upstream.
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}

# API Gateway gRPC service (main API)
location /bucketeer.gateway.Gateway {
grpc_pass grpcs://api_grpc_backend;
Expand Down Expand Up @@ -192,7 +200,7 @@ server {
return 204;
}

# Health check endpoint
# Health check endpoints
location /health {
proxy_pass https://web_health_backend;
proxy_set_header Host $host;
Expand All @@ -201,6 +209,14 @@ server {
proxy_set_header X-Forwarded-Proto $scheme;
}

# Forward /ready requests to the web_health_backend upstream over HTTPS.
location /ready {
proxy_pass https://web_health_backend;
# Pass the original Host header, client address, and request scheme to the upstream.
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}

# gRPC/gRPC-Web service routes (backend handles both protocols)
location /bucketeer.account.AccountService {
if ($is_grpc_web = 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ spec:
healthCheck:
requestPath: /health
type: HTTP2
timeoutSec: 40
checkIntervalSec: 5
timeoutSec: 5
healthyThreshold: 1
unhealthyThreshold: 1
timeoutSec: 60
connectionDraining:
drainingTimeoutSec: 60
{{- end }}
35 changes: 30 additions & 5 deletions manifests/bucketeer/charts/api/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ spec:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
spec:
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds | default 30 }}
{{- with .Values.global.image.imagePullSecrets }}
imagePullSecrets: {{- toYaml . | nindent 8 }}
{{- end }}
Expand Down Expand Up @@ -145,6 +146,14 @@ spec:
containerPort: {{ .Values.env.port }}
- name: metrics
containerPort: {{ .Values.env.metricsPort }}
startupProbe:
periodSeconds: {{ .Values.health.startupProbe.periodSeconds }}
failureThreshold: {{ .Values.health.startupProbe.failureThreshold }}
timeoutSeconds: {{ .Values.health.startupProbe.timeoutSeconds }}
httpGet:
path: /ready
port: service
scheme: HTTPS
livenessProbe:
initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
Expand All @@ -160,7 +169,7 @@ spec:
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
httpGet:
path: /health
path: /ready
port: service
scheme: HTTPS
resources:
Expand All @@ -172,9 +181,15 @@ spec:
preStop:
exec:
command:
- "/bin/sh"
- "-c"
- "wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail; while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
- /bin/sh
- -c
- |
admin_port={{ .Values.envoy.adminPort }}
# Wait for load balancer propagation (must match app container propagation delay)
sleep 15
wget -q -T 1 -O- --method=POST --body-data='' \
"http://localhost:${admin_port}/drain_listeners?graceful" || true
exit 0
Comment on lines +188 to +192
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ensures that the Envoy container waits until the LB stops sending new requests before it gracefully drains the in-flight requests.

command: ["envoy"]
args:
- "-c"
Expand All @@ -201,10 +216,19 @@ spec:
containerPort: {{ .Values.envoy.port }}
- name: admin
containerPort: {{ .Values.envoy.adminPort }}
startupProbe:
periodSeconds: {{ .Values.health.startupProbe.periodSeconds }}
failureThreshold: {{ .Values.health.startupProbe.failureThreshold }}
timeoutSeconds: {{ .Values.health.startupProbe.timeoutSeconds }}
httpGet:
path: /ready
port: admin
scheme: HTTP
livenessProbe:
initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
httpGet:
path: /ready
port: admin
Expand All @@ -213,11 +237,12 @@ spec:
initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
httpGet:
path: /ready
port: admin
scheme: HTTP
resources:
{{ toYaml .Values.envoy.resources | indent 12 }}
strategy:
type: RollingUpdate
{{ toYaml .Values.strategy | indent 4 }}
Loading
Loading