bucketeer-io · cre8ivejp · Oct 20, 2025 · Sep 23, 2025 · Oct 6, 2025 · Oct 7, 2025
@@ -21,6 +21,12 @@ spec:
       annotations:
         checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
     spec:
+      # Ensure pod has sufficient time for graceful shutdown before SIGKILL
+      # This matches GCP Spot VM termination window (30s) and allows:
+      # - 3s for K8s to detect pod is not ready
+      # - 20s for application graceful shutdown
+      # - 7s safety margin
+      terminationGracePeriodSeconds: 30
       {{- with .Values.global.image.imagePullSecrets }}
       imagePullSecrets: {{- toYaml . | nindent 8 }}
       {{- end }}
@@ -174,7 +180,26 @@ spec:
                 command:
                   - "/bin/sh"
                   - "-c"
-                  - "wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail; while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
+                  - |
+                    # Fail Envoy health check immediately
+                    wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
+
+                    # Wait for API to signal ready for shutdown (max 22s)
+                    # This is coordinated with the app's 20s shutdown timeout.
+                    # Envoy must wait LONGER than the app timeout to ensure it doesn't
+                    # start draining while the app is still processing requests.
+                    timeout=22
+                    while [ $timeout -gt 0 ]; do
+                      if wget -q -O- --no-check-certificate https://localhost:9090/internal/shutdown-ready 2>/dev/null | grep -q "ready"; then
+                        echo "API ready for shutdown, draining connections..."
+                        break
+                      fi
+                      sleep 1
+                      timeout=$((timeout-1))
+                    done
+
+                    # Additional drain time for remaining connections
+                    sleep 3
           command: ["envoy"]
           args:
             - "-c"

@@ -309,7 +309,7 @@ data:
                               prefix: /bucketeer.feature.FeatureService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -322,7 +322,7 @@ data:
                               prefix: /bucketeer.account.AccountService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -335,7 +335,7 @@ data:
                               prefix: /bucketeer.push.PushService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -348,7 +348,7 @@ data:
                               prefix: /bucketeer.coderef.CodeReferenceService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -361,7 +361,7 @@ data:
                               prefix: /bucketeer.auditlog.AuditLogService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -374,7 +374,7 @@ data:
                               prefix: /bucketeer.autoops.AutoOpsService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -387,7 +387,7 @@ data:
                               prefix: /bucketeer.tag.TagService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -400,7 +400,7 @@ data:
                               prefix: /bucketeer.team.TeamService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -413,7 +413,7 @@ data:
                               prefix: /bucketeer.notification.NotificationService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -439,7 +439,7 @@ data:
                               prefix: /bucketeer.eventcounter.EventCounterService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1
@@ -452,7 +452,7 @@ data:
                               prefix: /bucketeer.environment.EnvironmentService
                             route:
                               cluster: web
-                              timeout: 35s
+                              timeout: 45s
                               retry_policy:
                                 retry_on: 5xx
                                 num_retries: 1

@@ -92,15 +92,13 @@ ingress:
             number: 9000
 health:
   livenessProbe:
-    initialDelaySeconds: 30
-    periodSeconds: 10
-    failureThreshold: 6
-    timeoutSeconds: 10
+    initialDelaySeconds: 10
+    periodSeconds: 3
+    failureThreshold: 5
   readinessProbe:
-    initialDelaySeconds: 20
-    periodSeconds: 10
-    failureThreshold: 3
-    timeoutSeconds: 5
+    initialDelaySeconds: 10
+    periodSeconds: 3
+    failureThreshold: 2
 resources: {}
 serviceAccount:
   annotations: {}

@@ -22,6 +22,12 @@ spec:
       annotations:
         checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
     spec:
+      # Ensure pod has sufficient time for graceful shutdown before SIGKILL
+      # This matches GCP Spot VM termination window (30s) and allows:
+      # - 3s for K8s to detect pod is not ready
+      # - 20s for application graceful shutdown
+      # - 7s safety margin
+      terminationGracePeriodSeconds: 30
       {{- with .Values.global.image.imagePullSecrets }}
       imagePullSecrets: {{- toYaml . | nindent 8 }}
       {{- end }}
@@ -210,7 +216,26 @@ spec:
                 command:
                   - "/bin/sh"
                   - "-c"
-                  - "while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
+                  - |
+                    # Fail Envoy health check immediately
+                    wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
+
+                    # Wait for Batch service to signal ready for shutdown (max 22s)
+                    # This is coordinated with the app's 20s shutdown timeout.
+                    # Envoy must wait LONGER than the app timeout to ensure it doesn't
+                    # start draining while the app is still processing requests.
+                    timeout=22
+                    while [ $timeout -gt 0 ]; do
+                      if wget -q -O- --no-check-certificate https://localhost:9090/internal/shutdown-ready 2>/dev/null | grep -q "ready"; then
+                        echo "Batch service ready for shutdown, draining connections..."
+                        break
+                      fi
+                      sleep 1
+                      timeout=$((timeout-1))
+                    done
+
+                    # Additional drain time for remaining connections
+                    sleep 3
           command: ["envoy"]
           args:
             - "-c"

@@ -22,6 +22,12 @@ spec:
       annotations:
         checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
     spec:
+      # Ensure pod has sufficient time for graceful shutdown before SIGKILL
+      # This matches GCP Spot VM termination window (30s) and allows:
+      # - 3s for K8s to detect pod is not ready
+      # - 20s for application graceful shutdown
+      # - 7s safety margin
+      terminationGracePeriodSeconds: 30
       {{- with .Values.global.image.imagePullSecrets }}
       imagePullSecrets: {{- toYaml . | nindent 8 }}
       {{- end }}
@@ -190,7 +196,28 @@ spec:
                 command:
                   - "/bin/sh"
                   - "-c"
-                  - "while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
+                  - |
+                    # Fail Envoy health check immediately
+                    wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
+
+                    # For subscriber service, give time for PubSub message processing to complete (max 22s)
+                    # This is coordinated with the app's 20s shutdown timeout.
+                    # Envoy must wait LONGER than the app timeout to ensure it doesn't exit
+                    # while the subscriber is still processing messages.
+                    # Note: Subscriber uses process detection instead of /internal/shutdown-ready endpoint
+                    timeout=22
+                    while [ $timeout -gt 0 ]; do
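+                      # Check if a subscriber process is still running (processing messages). NOTE(review): pgrep -f matches full command lines, and this script's own text contains "subscriber" — confirm the sh -c process cannot match itself.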
+                      if ! pgrep -f "subscriber" > /dev/null; then
+                        echo "Subscriber process completed, ready for shutdown..."
+                        break
+                      fi
+                      sleep 1
+                      timeout=$((timeout-1))
+                    done
+
+                    # Additional drain time for remaining connections
+                    sleep 3
           command: ["envoy"]
           args:
             - "-c"

@@ -21,6 +21,12 @@ spec:
       annotations:
         checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
     spec:
+      # Ensure pod has sufficient time for graceful shutdown before SIGKILL
+      # This matches GCP Spot VM termination window (30s) and allows:
+      # - 3s for K8s to detect pod is not ready
+      # - 20s for application graceful shutdown
+      # - 7s safety margin
+      terminationGracePeriodSeconds: 30
       {{- with .Values.global.image.imagePullSecrets }}
       imagePullSecrets: {{- toYaml . | nindent 8 }}
       {{- end }}
@@ -260,7 +266,26 @@ spec:
                 command:
                   - "/bin/sh"
                   - "-c"
-                  - "while [ $(netstat -plunt | grep tcp | grep -v envoy | wc -l) -ne 0 ]; do sleep 1; done;"
+                  - |
+                    # Fail Envoy health check immediately
+                    wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
+
+                    # Wait for Web services to signal ready for shutdown (max 22s)
+                    # This is coordinated with the app's 20s shutdown timeout.
+                    # Envoy must wait LONGER than the app timeout to ensure it doesn't
+                    # start draining while the app is still processing requests.
+                    timeout=22
+                    while [ $timeout -gt 0 ]; do
+                      if wget -q -O- --no-check-certificate https://localhost:9090/internal/shutdown-ready 2>/dev/null | grep -q "ready"; then
+                        echo "Web services ready for shutdown, draining connections..."
+                        break
+                      fi
+                      sleep 1
+                      timeout=$((timeout-1))
+                    done
+
+                    # Additional drain time for remaining connections
+                    sleep 3
           command: ["envoy"]
           args:
             - "-c"

@@ -51,7 +51,10 @@ import (
 	gwproto "github.com/bucketeer-io/bucketeer/v2/proto/gateway"
 )
 
-const command = "server"
+const (
+	command               = "server"
+	serverShutDownTimeout = 20 * time.Second
+)
 
 type server struct {
 	*kingpin.CmdClause
@@ -503,7 +506,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		rpc.WithService(healthChecker),
 		rpc.WithHandler("/health", healthChecker),
 	)
-	defer server.Stop(10 * time.Second)
 	go server.Run()
 
 	// Set up gRPC Gateway for API service
@@ -532,7 +534,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	if err := apiGateway.Start(ctx, gatewayHandler); err != nil {
 		return fmt.Errorf("failed to start API gateway: %v", err)
 	}
-	defer apiGateway.Stop(10 * time.Second)
 
 	restHealthChecker := health.NewRestChecker(
 		api.Version, api.Service,
@@ -563,14 +564,59 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		rest.WithService(restHealthChecker),
 		rest.WithMetrics(registerer),
 	)
-	defer httpServer.Stop(10 * time.Second)
 	go httpServer.Run()
 
-	// Ensure to stop the health check before stopping the application
-	// so the Kubernetes Readiness can detect it faster and remove the pod
-	// from the service load balancer.
-	defer healthChecker.Stop()
-	defer restHealthChecker.Stop()
+	// Graceful shutdown sequence optimized for GCP Spot VM constraints (30s termination window):
+	// 1. Stop health checks immediately to fail Kubernetes readiness probe ASAP
+	// 2. Gracefully drain all servers in parallel (allows in-flight requests to complete)
+	// 3. Close clients
+	//
+	// This coordinates with Envoy's preStop hook which waits for /internal/shutdown-ready
+	// to return 200 (set by rpc.Server after graceful shutdown completes).
+	defer func() {
+		// Step 1: Stop health checks immediately
+		// This ensures Kubernetes readiness probe fails on next check (within ~3s),
+		// preventing new traffic from being routed to this pod.
+		healthChecker.Stop()
+		restHealthChecker.Stop()
+
+		// Step 2: Gracefully stop all servers in parallel
+		// Each server will reject new requests and wait for existing requests to complete.
+		done := make(chan struct{})
+		go func() {
+			defer close(done)
+			server.Stop(serverShutDownTimeout)
+		}()
+		go apiGateway.Stop(serverShutDownTimeout)
+		go httpServer.Stop(serverShutDownTimeout)
+
+		// Wait for server.Stop to return; apiGateway.Stop and httpServer.Stop run in the background and are not awaited here
+		<-done
+
+		// Step 3: Close clients
+		// Each Stop/Close below is launched in its own goroutine and is not awaited, so it may not have finished when Run returns.
+		go goalPublisher.Stop()
+		go evaluationPublisher.Stop()
+		if userPublisher != nil {
+			go userPublisher.Stop()
+		}
+		if metricsPublisher != nil {
+			go metricsPublisher.Stop()
+		}
+		go featureClient.Close()
+		go accountClient.Close()
+		go pushClient.Close()
+		go codeRefClient.Close()
+		go auditLogClient.Close()
+		go autoOpsClient.Close()
+		go tagClient.Close()
+		go teamClient.Close()
+		go notificationClient.Close()
+		go experimentClient.Close()
+		go eventCounterClient.Close()
+		go environmentClient.Close()
+		go redisV3Client.Close()
+	}()
 
 	<-ctx.Done()
 	return nil