Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b8337b9
fix: improve timeout handling and health check resilience during traf…
cre8ivejp Sep 23, 2025
e616e3d
chore: set missing timeout settings for api server
cre8ivejp Oct 6, 2025
286b464
chore: adjust timeout settings
cre8ivejp Oct 7, 2025
e2f1699
feat: add metrics to monitor grpc server shutdown
cre8ivejp Oct 7, 2025
5618c14
chore: implement prometheus push gateway
cre8ivejp Oct 7, 2025
59fcb8e
fix: metrics service name
cre8ivejp Oct 7, 2025
0a4e7d3
chore: set prometheusPushGatewayURL for all services
cre8ivejp Oct 8, 2025
d4e3fe1
fix: deprecated collector functions
cre8ivejp Oct 8, 2025
72941f4
fix: grouping label conflict
cre8ivejp Oct 8, 2025
d8bc12e
chore: split server and service labels
cre8ivejp Oct 8, 2025
2649024
fix: missing server label
cre8ivejp Oct 8, 2025
c8c4fa3
chore: remove shutdown metrics
cre8ivejp Oct 9, 2025
6047110
chore: rebase
cre8ivejp Oct 9, 2025
cb90076
fix: graceful shutdown for all services
cre8ivejp Oct 9, 2025
95e1943
chore: set the timeout for k8s readiness and liveness
cre8ivejp Oct 9, 2025
abe9498
fix: lint error
cre8ivejp Oct 9, 2025
b66f813
fix: shutting down process
cre8ivejp Oct 9, 2025
cfdfa18
fix: shutdown order
cre8ivejp Oct 9, 2025
cc11c98
chore: remove drain_listeners from envoy prestop
cre8ivejp Oct 9, 2025
1f51d03
fix: 503 errors when shutting down the server
cre8ivejp Oct 9, 2025
e910a7b
chore: remove internal shutdown ready handler
cre8ivejp Oct 9, 2025
b9d6426
feat: implement ready health check
cre8ivejp Oct 15, 2025
faed2ab
chore: change idle timeout settings to improve possible 499 errors
cre8ivejp Oct 16, 2025
4b641a6
fix: health check response when getting sigterm
cre8ivejp Oct 16, 2025
2e31c9a
chore: change retry count
cre8ivejp Oct 16, 2025
eebcfd1
chore: remove envoy healthcheck fail call
cre8ivejp Oct 17, 2025
1d75ac9
chore: sleep before stopping health check
cre8ivejp Oct 17, 2025
f6d8894
chore: change api gateway server context
cre8ivejp Oct 18, 2025
88765b9
chore: change envoy pre stop script
cre8ivejp Oct 20, 2025
25b3cbb
chore: remove duplicate defer functions
cre8ivejp Oct 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion manifests/bucketeer/charts/api/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ spec:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
spec:
# Ensure pod has sufficient time for graceful shutdown before SIGKILL
# This matches GCP Spot VM termination window (30s) and allows:
# - 3s for K8s to detect pod is not ready
# - 20s for application graceful shutdown
# - 7s safety margin
terminationGracePeriodSeconds: 30
{{- with .Values.global.image.imagePullSecrets }}
imagePullSecrets: {{- toYaml . | nindent 8 }}
{{- end }}
Expand Down Expand Up @@ -174,7 +180,26 @@ spec:
command:
- "/bin/sh"
- "-c"
- "wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail; while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
- |
# Fail Envoy health check immediately
wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail

# Wait for API to signal ready for shutdown (max 22s)
# This is coordinated with the app's 20s shutdown timeout.
# Envoy must wait LONGER than the app timeout to ensure it doesn't
# start draining while the app is still processing requests.
timeout=22
while [ $timeout -gt 0 ]; do
if wget -q -O- --no-check-certificate https://localhost:9090/internal/shutdown-ready 2>/dev/null | grep -q "ready"; then
echo "API ready for shutdown, draining connections..."
break
fi
sleep 1
timeout=$((timeout-1))
done

# Additional drain time for remaining connections
sleep 3
command: ["envoy"]
args:
- "-c"
Expand Down
22 changes: 11 additions & 11 deletions manifests/bucketeer/charts/api/templates/envoy-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ data:
prefix: /bucketeer.feature.FeatureService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -322,7 +322,7 @@ data:
prefix: /bucketeer.account.AccountService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -335,7 +335,7 @@ data:
prefix: /bucketeer.push.PushService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -348,7 +348,7 @@ data:
prefix: /bucketeer.coderef.CodeReferenceService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -361,7 +361,7 @@ data:
prefix: /bucketeer.auditlog.AuditLogService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -374,7 +374,7 @@ data:
prefix: /bucketeer.autoops.AutoOpsService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -387,7 +387,7 @@ data:
prefix: /bucketeer.tag.TagService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -400,7 +400,7 @@ data:
prefix: /bucketeer.team.TeamService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -413,7 +413,7 @@ data:
prefix: /bucketeer.notification.NotificationService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -439,7 +439,7 @@ data:
prefix: /bucketeer.eventcounter.EventCounterService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand All @@ -452,7 +452,7 @@ data:
prefix: /bucketeer.environment.EnvironmentService
route:
cluster: web
timeout: 35s
timeout: 45s
retry_policy:
retry_on: 5xx
num_retries: 1
Expand Down
14 changes: 6 additions & 8 deletions manifests/bucketeer/charts/api/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,13 @@ ingress:
number: 9000
health:
livenessProbe:
initialDelaySeconds: 30
periodSeconds: 10
failureThreshold: 6
timeoutSeconds: 10
initialDelaySeconds: 10
periodSeconds: 3
failureThreshold: 5
readinessProbe:
initialDelaySeconds: 20
periodSeconds: 10
failureThreshold: 3
timeoutSeconds: 5
initialDelaySeconds: 10
periodSeconds: 3
failureThreshold: 2
resources: {}
serviceAccount:
annotations: {}
Expand Down
27 changes: 26 additions & 1 deletion manifests/bucketeer/charts/batch/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ spec:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
spec:
# Ensure pod has sufficient time for graceful shutdown before SIGKILL
# This matches GCP Spot VM termination window (30s) and allows:
# - 3s for K8s to detect pod is not ready
# - 20s for application graceful shutdown
# - 7s safety margin
terminationGracePeriodSeconds: 30
{{- with .Values.global.image.imagePullSecrets }}
imagePullSecrets: {{- toYaml . | nindent 8 }}
{{- end }}
Expand Down Expand Up @@ -210,7 +216,26 @@ spec:
command:
- "/bin/sh"
- "-c"
- "while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
- |
# Fail Envoy health check immediately
wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail

# Wait for Batch service to signal ready for shutdown (max 22s)
# This is coordinated with the app's 20s shutdown timeout.
# Envoy must wait LONGER than the app timeout to ensure it doesn't
# start draining while the app is still processing requests.
timeout=22
while [ $timeout -gt 0 ]; do
if wget -q -O- --no-check-certificate https://localhost:9090/internal/shutdown-ready 2>/dev/null | grep -q "ready"; then
echo "Batch service ready for shutdown, draining connections..."
break
fi
sleep 1
timeout=$((timeout-1))
done

# Additional drain time for remaining connections
sleep 3
command: ["envoy"]
args:
- "-c"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ spec:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
spec:
# Ensure pod has sufficient time for graceful shutdown before SIGKILL
# This matches GCP Spot VM termination window (30s) and allows:
# - 3s for K8s to detect pod is not ready
# - 20s for application graceful shutdown
# - 7s safety margin
terminationGracePeriodSeconds: 30
{{- with .Values.global.image.imagePullSecrets }}
imagePullSecrets: {{- toYaml . | nindent 8 }}
{{- end }}
Expand Down Expand Up @@ -190,7 +196,28 @@ spec:
command:
- "/bin/sh"
- "-c"
- "while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
- |
# Fail Envoy health check immediately
wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail

# For subscriber service, give time for PubSub message processing to complete (max 22s)
# This is coordinated with the app's 20s shutdown timeout.
# Envoy must wait LONGER than the app timeout to ensure it doesn't exit
# while the subscriber is still processing messages.
# Note: Subscriber uses process detection instead of /internal/shutdown-ready endpoint
timeout=22
while [ $timeout -gt 0 ]; do
# Check if subscriber main process is still running (processing messages)
if ! pgrep -f "subscriber" > /dev/null; then
echo "Subscriber process completed, ready for shutdown..."
break
fi
sleep 1
timeout=$((timeout-1))
done

# Additional drain time for remaining connections
sleep 3
command: ["envoy"]
args:
- "-c"
Expand Down
27 changes: 26 additions & 1 deletion manifests/bucketeer/charts/web/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ spec:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
spec:
# Ensure pod has sufficient time for graceful shutdown before SIGKILL
# This matches GCP Spot VM termination window (30s) and allows:
# - 3s for K8s to detect pod is not ready
# - 20s for application graceful shutdown
# - 7s safety margin
terminationGracePeriodSeconds: 30
{{- with .Values.global.image.imagePullSecrets }}
imagePullSecrets: {{- toYaml . | nindent 8 }}
{{- end }}
Expand Down Expand Up @@ -260,7 +266,26 @@ spec:
command:
- "/bin/sh"
- "-c"
- "while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The netstat command was removed from Envoy's image, and the preStop command was failing.

- |
# Fail Envoy health check immediately
wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail

# Wait for Web services to signal ready for shutdown (max 22s)
# This is coordinated with the app's 20s shutdown timeout.
# Envoy must wait LONGER than the app timeout to ensure it doesn't
# start draining while the app is still processing requests.
timeout=22
while [ $timeout -gt 0 ]; do
if wget -q -O- --no-check-certificate https://localhost:9090/internal/shutdown-ready 2>/dev/null | grep -q "ready"; then
echo "Web services ready for shutdown, draining connections..."
break
fi
sleep 1
timeout=$((timeout-1))
done

# Additional drain time for remaining connections
sleep 3
command: ["envoy"]
args:
- "-c"
Expand Down
64 changes: 55 additions & 9 deletions pkg/api/cmd/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@ import (
gwproto "github.com/bucketeer-io/bucketeer/v2/proto/gateway"
)

const command = "server"
const (
command = "server"
serverShutDownTimeout = 20 * time.Second
)

type server struct {
*kingpin.CmdClause
Expand Down Expand Up @@ -503,7 +506,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
rpc.WithService(healthChecker),
rpc.WithHandler("/health", healthChecker),
)
defer server.Stop(10 * time.Second)
go server.Run()

// Set up gRPC Gateway for API service
Expand Down Expand Up @@ -532,7 +534,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
if err := apiGateway.Start(ctx, gatewayHandler); err != nil {
return fmt.Errorf("failed to start API gateway: %v", err)
}
defer apiGateway.Stop(10 * time.Second)

restHealthChecker := health.NewRestChecker(
api.Version, api.Service,
Expand Down Expand Up @@ -563,14 +564,59 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
rest.WithService(restHealthChecker),
rest.WithMetrics(registerer),
)
defer httpServer.Stop(10 * time.Second)
go httpServer.Run()

// Ensure to stop the health check before stopping the application
// so the Kubernetes Readiness can detect it faster and remove the pod
// from the service load balancer.
defer healthChecker.Stop()
defer restHealthChecker.Stop()
// Graceful shutdown sequence optimized for GCP Spot VM constraints (30s termination window):
// 1. Stop health checks immediately to fail Kubernetes readiness probe ASAP
// 2. Gracefully drain all servers in parallel (allows in-flight requests to complete)
// 3. Close clients
//
// This coordinates with Envoy's preStop hook which waits for /internal/shutdown-ready
// to return 200 (set by rpc.Server after graceful shutdown completes).
defer func() {
// Step 1: Stop health checks immediately
// This ensures Kubernetes readiness probe fails on next check (within ~3s),
// preventing new traffic from being routed to this pod.
healthChecker.Stop()
restHealthChecker.Stop()

// Step 2: Gracefully stop all servers in parallel
// Each server will reject new requests and wait for existing requests to complete.
done := make(chan struct{})
go func() {
defer close(done)
server.Stop(serverShutDownTimeout)
}()
go apiGateway.Stop(serverShutDownTimeout)
go httpServer.Stop(serverShutDownTimeout)

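// Wait for `server` to complete shutdown. Note: only server.Stop is awaited via `done`;
// apiGateway.Stop and httpServer.Stop run concurrently and are not awaited here.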
<-done

// Step 3: Close clients
// These are expected to be fast cleanup operations. They are launched asynchronously
// and are not awaited before this deferred function returns.
go goalPublisher.Stop()
go evaluationPublisher.Stop()
if userPublisher != nil {
go userPublisher.Stop()
}
if metricsPublisher != nil {
go metricsPublisher.Stop()
}
go featureClient.Close()
go accountClient.Close()
go pushClient.Close()
go codeRefClient.Close()
go auditLogClient.Close()
go autoOpsClient.Close()
go tagClient.Close()
go teamClient.Close()
go notificationClient.Close()
go experimentClient.Close()
go eventCounterClient.Close()
go environmentClient.Close()
go redisV3Client.Close()
}()

<-ctx.Done()
return nil
Expand Down
Loading
Loading