
Commit f6d8894

chore: change api gateway server context
Signed-off-by: Alessandro Yuichi Okimoto <yuichijpn@gmail.com>
1 parent 1d75ac9 commit f6d8894

File tree

14 files changed: +162 additions, -83 deletions

manifests/bucketeer/charts/api/templates/deployment.yaml

Lines changed: 28 additions & 4 deletions
@@ -185,15 +185,39 @@ spec:
           - -c
           - |
             admin_port={{ .Values.envoy.adminPort }}
-            max_wait=60
+            max_wait=45
+            propagation_delay=15
+
+            sleep "$propagation_delay"

             # Wait for active requests to drain
             for i in $(seq 1 "$max_wait"); do
-              active=$(wget -q -O- "http://localhost:${admin_port}/stats" 2>/dev/null | grep "http.ingress_http.downstream_rq_active" | awk '{print $2}' || echo "0")
-              [ -z "$active" ] && active=0
-              [ "$active" -eq 0 ] && break
+              # Fetch stats and check if request succeeded
+              stats=$(wget -q -T 1 -O- "http://127.0.0.1:${admin_port}/stats" 2>/dev/null)
+              if [ $? -ne 0 ] || [ -z "$stats" ]; then
+                echo "Check $i/$max_wait: Failed to fetch stats, retrying..."
+                sleep 1
+                continue
+              fi
+
+              # Extract active requests metric
+              active=$(echo "$stats" | grep -E '^http\.ingress_http\.downstream_rq_active:' | awk '{print $2}')
+              if [ -z "$active" ]; then
+                echo "Check $i/$max_wait: Metric not found, retrying..."
+                sleep 1
+                continue
+              fi
+
+              echo "Check $i/$max_wait: Active requests: $active"
+              [ "$active" -eq 0 ] && echo "No active requests, exiting gracefully" && break
              sleep 1
            done
+
+            if [ -n "$active" ] && [ "$active" -eq 0 ]; then
+              echo "Graceful shutdown completed successfully"
+            else
+              echo "Warning: Timed out after ${max_wait}s - active=${active:-unknown}"
+            fi
             exit 0
      command: ["envoy"]
      args:
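
The rewritten drain loop distinguishes three cases the old one-liner collapsed into "zero active requests": the admin endpoint being unreachable, the metric being absent from the stats dump, and requests genuinely still in flight. A minimal Go sketch of the same check, for illustration only — the metric name and loop bound come from the script above, while the port value 8001 is a placeholder, not a value from this repository:

package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
	"time"
)

// activeRequests polls the Envoy admin endpoint and returns the value of
// http.ingress_http.downstream_rq_active, mirroring the wget/grep/awk
// pipeline in the preStop script above.
func activeRequests(adminPort int) (int, error) {
	client := &http.Client{Timeout: time.Second} // same budget as wget -T 1
	resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d/stats", adminPort))
	if err != nil {
		return 0, err // stats endpoint unreachable
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		// Envoy emits stats as "name: value", one per line.
		line := scanner.Text()
		if strings.HasPrefix(line, "http.ingress_http.downstream_rq_active: ") {
			var n int
			_, err := fmt.Sscanf(strings.TrimPrefix(line, "http.ingress_http.downstream_rq_active: "), "%d", &n)
			return n, err
		}
	}
	return 0, fmt.Errorf("metric not found") // an absent metric is an error, not zero
}

func main() {
	const maxWait = 45 // checks, matching max_wait in the script
	for i := 1; i <= maxWait; i++ {
		n, err := activeRequests(8001) // 8001 is a placeholder admin port
		if err == nil && n == 0 {
			fmt.Println("No active requests, exiting gracefully")
			return
		}
		time.Sleep(time.Second)
	}
	fmt.Printf("Warning: timed out after %d checks\n", maxWait)
}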

manifests/bucketeer/charts/api/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ terminationGracePeriodSeconds: 75
 health:
   startupProbe:
     periodSeconds: 3
-    failureThreshold: 10
+    failureThreshold: 20
     timeoutSeconds: 5
   livenessProbe:
     initialDelaySeconds: 30
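
With periodSeconds: 3, raising failureThreshold from 10 to 20 gives the container up to 3 s × 20 = 60 s to pass its startup probe, double the previous 30 s budget.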

manifests/bucketeer/charts/batch/templates/deployment.yaml

Lines changed: 25 additions & 5 deletions
@@ -223,19 +223,39 @@ spec:
           - -c
           - |
             admin_port={{ .Values.envoy.adminPort }}
-            max_wait=35
+            max_wait=45
             propagation_delay=15

-            # Wait for GCLB to detect unhealthy status and stop routing
             sleep "$propagation_delay"

             # Wait for active requests to drain
             for i in $(seq 1 "$max_wait"); do
-              active=$(wget -q -O- "http://localhost:${admin_port}/stats" 2>/dev/null | grep "http.ingress_http.downstream_rq_active" | awk '{print $2}' || echo "0")
-              [ -z "$active" ] && active=0
-              [ "$active" -eq 0 ] && break
+              # Fetch stats and check if request succeeded
+              stats=$(wget -q -T 1 -O- "http://127.0.0.1:${admin_port}/stats" 2>/dev/null)
+              if [ $? -ne 0 ] || [ -z "$stats" ]; then
+                echo "Check $i/$max_wait: Failed to fetch stats, retrying..."
+                sleep 1
+                continue
+              fi
+
+              # Extract active requests metric
+              active=$(echo "$stats" | grep -E '^http\.ingress_http\.downstream_rq_active:' | awk '{print $2}')
+              if [ -z "$active" ]; then
+                echo "Check $i/$max_wait: Metric not found, retrying..."
+                sleep 1
+                continue
+              fi
+
+              echo "Check $i/$max_wait: Active requests: $active"
+              [ "$active" -eq 0 ] && echo "No active requests, exiting gracefully" && break
              sleep 1
            done
+
+            if [ -n "$active" ] && [ "$active" -eq 0 ]; then
+              echo "Graceful shutdown completed successfully"
+            else
+              echo "Warning: Timed out after ${max_wait}s - active=${active:-unknown}"
+            fi
             exit 0
      command: ["envoy"]
      args:
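
As a budget check: when every stats fetch returns promptly, the hook spends at most propagation_delay + max_wait × 1 s = 15 + 45 = 60 s, inside the terminationGracePeriodSeconds: 75 shown in the api and subscriber values files. Since max_wait counts checks rather than elapsed seconds, an iteration whose wget hits its 1 s timeout (-T 1) costs about 2 s, so the theoretical worst case is closer to 15 + 90 = 105 s; in that case the grace period, not the loop, becomes the effective bound.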

manifests/bucketeer/charts/batch/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ gcpMultiCluster:
 health:
   startupProbe:
     periodSeconds: 3
-    failureThreshold: 10
+    failureThreshold: 30
     timeoutSeconds: 5
   livenessProbe:
     initialDelaySeconds: 30
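
The batch container gets a larger window than the other charts here: 3 s × 30 = 90 s of startup budget, versus 3 s × 10 = 30 s before.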

manifests/bucketeer/charts/subscriber/templates/deployment.yaml

Lines changed: 25 additions & 5 deletions
@@ -203,19 +203,39 @@ spec:
           - -c
           - |
             admin_port={{ .Values.envoy.adminPort }}
-            max_wait=35
+            max_wait=45
             propagation_delay=15

-            # Wait for GCLB to detect unhealthy status and stop routing
             sleep "$propagation_delay"

             # Wait for active requests to drain
             for i in $(seq 1 "$max_wait"); do
-              active=$(wget -q -O- "http://localhost:${admin_port}/stats" 2>/dev/null | grep "http.ingress_http.downstream_rq_active" | awk '{print $2}' || echo "0")
-              [ -z "$active" ] && active=0
-              [ "$active" -eq 0 ] && break
+              # Fetch stats and check if request succeeded
+              stats=$(wget -q -T 1 -O- "http://127.0.0.1:${admin_port}/stats" 2>/dev/null)
+              if [ $? -ne 0 ] || [ -z "$stats" ]; then
+                echo "Check $i/$max_wait: Failed to fetch stats, retrying..."
+                sleep 1
+                continue
+              fi
+
+              # Extract active requests metric
+              active=$(echo "$stats" | grep -E '^http\.ingress_http\.downstream_rq_active:' | awk '{print $2}')
+              if [ -z "$active" ]; then
+                echo "Check $i/$max_wait: Metric not found, retrying..."
+                sleep 1
+                continue
+              fi
+
+              echo "Check $i/$max_wait: Active requests: $active"
+              [ "$active" -eq 0 ] && echo "No active requests, exiting gracefully" && break
              sleep 1
            done
+
+            if [ -n "$active" ] && [ "$active" -eq 0 ]; then
+              echo "Graceful shutdown completed successfully"
+            else
+              echo "Warning: Timed out after ${max_wait}s - active=${active:-unknown}"
+            fi
             exit 0
      command: ["envoy"]
      args:

manifests/bucketeer/charts/subscriber/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ terminationGracePeriodSeconds: 75
 health:
   startupProbe:
     periodSeconds: 3
-    failureThreshold: 10
+    failureThreshold: 20
     timeoutSeconds: 5
   livenessProbe:
     initialDelaySeconds: 30
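
Same arithmetic as the api chart: 3 s × 20 = 60 s of startup budget, up from 30 s.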

manifests/bucketeer/charts/web/templates/deployment.yaml

Lines changed: 25 additions & 5 deletions
@@ -273,19 +273,39 @@ spec:
           - -c
           - |
             admin_port={{ .Values.envoy.adminPort }}
-            max_wait=35
+            max_wait=45
             propagation_delay=15

-            # Wait for GCLB to detect unhealthy status and stop routing
             sleep "$propagation_delay"

             # Wait for active requests to drain
             for i in $(seq 1 "$max_wait"); do
-              active=$(wget -q -O- "http://localhost:${admin_port}/stats" 2>/dev/null | grep "http.ingress_http.downstream_rq_active" | awk '{print $2}' || echo "0")
-              [ -z "$active" ] && active=0
-              [ "$active" -eq 0 ] && break
+              # Fetch stats and check if request succeeded
+              stats=$(wget -q -T 1 -O- "http://127.0.0.1:${admin_port}/stats" 2>/dev/null)
+              if [ $? -ne 0 ] || [ -z "$stats" ]; then
+                echo "Check $i/$max_wait: Failed to fetch stats, retrying..."
+                sleep 1
+                continue
+              fi
+
+              # Extract active requests metric
+              active=$(echo "$stats" | grep -E '^http\.ingress_http\.downstream_rq_active:' | awk '{print $2}')
+              if [ -z "$active" ]; then
+                echo "Check $i/$max_wait: Metric not found, retrying..."
+                sleep 1
+                continue
+              fi
+
+              echo "Check $i/$max_wait: Active requests: $active"
+              [ "$active" -eq 0 ] && echo "No active requests, exiting gracefully" && break
              sleep 1
            done
+
+            if [ -n "$active" ] && [ "$active" -eq 0 ]; then
+              echo "Graceful shutdown completed successfully"
+            else
+              echo "Warning: Timed out after ${max_wait}s - active=${active:-unknown}"
+            fi
             exit 0
      command: ["envoy"]
      args:

manifests/bucketeer/charts/web/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ service:
 health:
   startupProbe:
     periodSeconds: 3
-    failureThreshold: 10
+    failureThreshold: 20
     timeoutSeconds: 5
   livenessProbe:
     initialDelaySeconds: 30
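
Likewise for the web chart: 3 s × 20 = 60 s of startup budget, up from 30 s.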

pkg/api/cmd/server.go

Lines changed: 16 additions & 20 deletions
@@ -230,9 +230,6 @@ func RegisterCommand(r cli.CommandRegistry, p cli.ParentCommand) cli.Command {
 func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.Logger) error {
 	registerer := metrics.DefaultRegisterer()

-	pubsubCtx, pubsubCancel := context.WithTimeout(ctx, 5*time.Second)
-	defer pubsubCancel()
-
 	// Create PubSub client using the factory
 	pubSubType := factory.PubSubType(*s.pubSubType)
 	factoryOpts := []factory.Option{
@@ -261,6 +258,8 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		factoryOpts = append(factoryOpts, factory.WithPartitionCount(*s.pubSubRedisPartitionCount))
 	}

+	pubsubCtx, pubsubCancel := context.WithCancel(context.Background())
+	defer pubsubCancel()
 	pubsubClient, err := factory.NewClient(pubsubCtx, factoryOpts...)
 	if err != nil {
 		return err
@@ -388,7 +387,7 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	if err != nil {
 		return err
 	}
-	defer auditLogClient.Close()
+	defer autoOpsClient.Close()

 	tagClient, err := tagclient.NewClient(*s.tagService, *s.certPath,
 		client.WithPerRPCCredentials(creds),
@@ -502,8 +501,8 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	// We don't check the Redis health status because if the check fails,
 	// the Kubernetes will restart the container and it might cause internal errors.
 	// Use a dedicated context so we can stop the health checker goroutine cleanly during shutdown
-	healthCheckCtx, healthCheckCancel := context.WithCancel(ctx)
-	defer healthCheckCancel() // Ensure cleanup on all paths (including early returns)
+	healthCheckCtx, healthCheckCancel := context.WithCancel(context.Background())
+	defer healthCheckCancel()

 	healthChecker := health.NewGrpcChecker(
 		health.WithTimeout(5*time.Second),
@@ -545,7 +544,9 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		return fmt.Errorf("failed to create API gateway: %v", err)
 	}

-	if err := apiGateway.Start(ctx, gatewayHandler); err != nil {
+	serverCtx, serverCtxCancel := context.WithCancel(context.Background())
+	defer serverCtxCancel()
+	if err := apiGateway.Start(serverCtx, gatewayHandler); err != nil {
 		return fmt.Errorf("failed to start API gateway: %v", err)
 	}

@@ -584,27 +585,20 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	shutdownStartTime := time.Now()
 	logger.Info("Starting graceful shutdown sequence")

-	waitBeforeUnready := 10 * time.Second
-	logger.Info("Waiting before marking unready",
-		zap.Duration("wait_before_unready", waitBeforeUnready))
-	time.Sleep(waitBeforeUnready)
+	// Wait for K8s endpoint propagation
+	// This prevents "context deadline exceeded" errors during high traffic.
+	time.Sleep(propagationDelay)
+	logger.Info("Starting HTTP/gRPC server shutdown")

-	// Cancel the health checker goroutines to prevent connection errors during shutdown
-	healthCheckCancel()
 	// Mark as unhealthy so readiness probes fail
 	// This ensures Kubernetes readiness probe fails on next check,
 	// preventing new traffic from being routed to this pod.
 	healthChecker.Stop()
 	restHealthChecker.Stop()

-	// Wait for K8s endpoint propagation
-	// This prevents "context deadline exceeded" errors during high traffic.
-	time.Sleep(propagationDelay)
-	logger.Info("Starting HTTP/gRPC server shutdown")
-
-	// CRITICAL: Shutdown order matters due to dependencies:
+	// Shutdown order matters due to dependencies:
 	// 1. apiGateway/httpServer make gRPC calls to the backend server
-	// 2. We MUST drain them BEFORE stopping the backend
+	// 2. We MUST drain them BEFORE stopping the backend server
 	// 3. Otherwise their handlers hang waiting for a dead backend
 	// We run apiGateway and httpServer in parallel since they don't depend on each other
 	var wg sync.WaitGroup
@@ -623,9 +617,11 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	// Wait for HTTP/REST traffic to fully drain
 	wg.Wait()
+	logger.Info("gRPC-gateway and HTTP server shutdown completed")

 	// Now it's safe to stop the gRPC server (no more HTTP→gRPC calls)
 	server.Stop(grpcStopTimeout)
+	logger.Info("gRPC server shutdown completed")

 	// Close clients
 	// These are fast cleanup operations that can run asynchronously.
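
The reordered deferred block is the core of the Go-side change: sleep through endpoint propagation, mark the pod unready, drain the two front ends in parallel, and only then stop the backend gRPC server they call into. A self-contained sketch of that ordering — the component names and durations here are stand-ins, not the repository's types:

package main

import (
	"fmt"
	"sync"
	"time"
)

// stopComponent simulates a graceful stop; in the real server these are
// calls like apiGateway.Stop(...) and the HTTP server's shutdown.
func stopComponent(name string, d time.Duration) {
	time.Sleep(d) // pretend to drain in-flight requests
	fmt.Println(name, "drained")
}

func main() {
	// 1. Drain the two front ends in parallel: they are independent of
	//    each other, but both forward requests to the backend gRPC server.
	var wg sync.WaitGroup
	for _, name := range []string{"apiGateway", "httpServer"} {
		wg.Add(1)
		go func(n string) {
			defer wg.Done()
			stopComponent(n, 100*time.Millisecond)
		}(name)
	}
	wg.Wait() // 2. past this point no more HTTP→gRPC calls can arrive

	// 3. Only now stop the backend; stopping it first would leave the
	//    front ends' in-flight handlers waiting on a dead server.
	stopComponent("grpcServer", 50*time.Millisecond)
	fmt.Println("graceful shutdown completed")
}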

pkg/batch/cmd/server/server.go

Lines changed: 13 additions & 11 deletions
@@ -584,8 +584,8 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	)

 	// Use a dedicated context so we can stop the health checker goroutine cleanly during shutdown
-	healthCheckCtx, healthCheckCancel := context.WithCancel(ctx)
-	defer healthCheckCancel() // Ensure cleanup on all paths (including early returns)
+	healthCheckCtx, healthCheckCancel := context.WithCancel(context.Background())
+	defer healthCheckCancel()

 	healthChecker := health.NewGrpcChecker(
 		health.WithTimeout(time.Second),
@@ -627,31 +627,33 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		return fmt.Errorf("failed to create batch gateway: %v", err)
 	}

-	if err := batchGateway.Start(ctx, batchHandler); err != nil {
+	batchCtx, batchCancel := context.WithCancel(context.Background())
+	defer batchCancel()
+	if err := batchGateway.Start(batchCtx, batchHandler); err != nil {
 		return fmt.Errorf("failed to start batch gateway: %v", err)
 	}

 	defer func() {
 		shutdownStartTime := time.Now()
 		logger.Info("Starting graceful shutdown sequence")

-		// Cancel the health checker goroutines to prevent connection errors during shutdown
-		healthCheckCancel()
-		// Mark as unhealthy so readiness probes fail
-		// This ensures Kubernetes readiness probe fails on next check,
-		// preventing new traffic from being routed to this pod.
-		healthChecker.Stop()
-
 		// Wait for K8s endpoint propagation
 		// This prevents "context deadline exceeded" errors during high traffic.
 		time.Sleep(propagationDelay)
 		logger.Info("Starting HTTP/gRPC server shutdown")

-		// Gracefully stop REST gateway (calls the gRPC server internally)
+		// Mark as unhealthy so readiness probes fail
+		// This ensures Kubernetes readiness probe fails on next check,
+		// preventing new traffic from being routed to this pod.
+		healthChecker.Stop()
+
+		// Gracefully stop gRPC Gateway (calls the gRPC server internally)
 		batchGateway.Stop(serverShutDownTimeout)
+		logger.Info("gRPC-gateway server shutdown completed")

 		// Stop gRPC server (only pure gRPC connections remain)
 		server.Stop(grpcStopTimeout)
+		logger.Info("gRPC server shutdown completed")

 		// Close clients
 		// These are fast cleanup operations that can run asynchronously.