
Commit 7d7e722

chore: change api gateway server context
Signed-off-by: Alessandro Yuichi Okimoto <yuichijpn@gmail.com>
1 parent 1d75ac9 commit 7d7e722

File tree

10 files changed: +70 -79 lines

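Most of the Go changes below follow a single pattern: long-lived pieces of the servers (the PubSub client, the health checkers, and the API/batch gateway servers) are now given contexts derived from context.Background() and are cancelled explicitly with a deferred cancel, instead of inheriting the ctx passed into Run, which is cancelled when the process is told to shut down. That keeps them alive until the graceful-shutdown sequence decides to stop them. A minimal sketch of the pattern, using a plain net/http server as a stand-in for Bucketeer's gateway (startGateway, the port, and the sleep values are assumptions, not the project's real API):

package main

import (
	"context"
	"log"
	"net/http"
	"os/signal"
	"syscall"
	"time"
)

// startGateway is a stand-in for apiGateway.Start in the diff: it serves until
// the given context is cancelled and then drains. The real Bucketeer signature differs.
func startGateway(ctx context.Context, srv *http.Server) {
	go func() { _ = srv.ListenAndServe() }()
	go func() {
		<-ctx.Done() // fires only when the owner cancels, not on the shutdown signal
		shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()
		_ = srv.Shutdown(shutdownCtx)
	}()
}

func main() {
	// ctx is cancelled on SIGTERM, standing in for the ctx handed to Run.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGTERM)
	defer stop()

	// The server runs on a context detached from ctx, so the shutdown signal no
	// longer tears it down before the ordered drain below has run.
	serverCtx, serverCancel := context.WithCancel(context.Background())
	defer serverCancel()
	startGateway(serverCtx, &http.Server{Addr: ":9000", Handler: http.NotFoundHandler()})

	<-ctx.Done()                // shutdown signal received
	time.Sleep(5 * time.Second) // endpoint-propagation wait (propagationDelay in the diff; value assumed)
	serverCancel()              // only now begin draining the server
	time.Sleep(time.Second)     // crude wait for the drain goroutine in this toy example
	log.Println("graceful shutdown complete")
}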

manifests/bucketeer/charts/api/templates/deployment.yaml

Lines changed: 3 additions & 1 deletion

@@ -189,7 +189,9 @@ spec:

              # Wait for active requests to drain
              for i in $(seq 1 "$max_wait"); do
-                active=$(wget -q -O- "http://localhost:${admin_port}/stats" 2>/dev/null | grep "http.ingress_http.downstream_rq_active" | awk '{print $2}' || echo "0")
+                active=$(wget -q -T 1 -O- "http://127.0.0.1:${admin_port}/stats" 2>/dev/null \
+                  | grep -E '^http\.ingress_http\.downstream_rq_active:' \
+                  | awk '{print $2}' || echo "0")
                [ -z "$active" ] && active=0
                [ "$active" -eq 0 ] && break
                sleep 1

manifests/bucketeer/charts/batch/templates/deployment.yaml

Lines changed: 4 additions & 6 deletions

@@ -223,15 +223,13 @@ spec:
            - -c
            - |
              admin_port={{ .Values.envoy.adminPort }}
-              max_wait=35
-              propagation_delay=15
-
-              # Wait for GCLB to detect unhealthy status and stop routing
-              sleep "$propagation_delay"
+              max_wait=60

              # Wait for active requests to drain
              for i in $(seq 1 "$max_wait"); do
-                active=$(wget -q -O- "http://localhost:${admin_port}/stats" 2>/dev/null | grep "http.ingress_http.downstream_rq_active" | awk '{print $2}' || echo "0")
+                active=$(wget -q -T 1 -O- "http://127.0.0.1:${admin_port}/stats" 2>/dev/null \
+                  | grep -E '^http\.ingress_http\.downstream_rq_active:' \
+                  | awk '{print $2}' || echo "0")
                [ -z "$active" ] && active=0
                [ "$active" -eq 0 ] && break
                sleep 1

manifests/bucketeer/charts/subscriber/templates/deployment.yaml

Lines changed: 4 additions & 6 deletions

@@ -203,15 +203,13 @@ spec:
            - -c
            - |
              admin_port={{ .Values.envoy.adminPort }}
-              max_wait=35
-              propagation_delay=15
-
-              # Wait for GCLB to detect unhealthy status and stop routing
-              sleep "$propagation_delay"
+              max_wait=60

              # Wait for active requests to drain
              for i in $(seq 1 "$max_wait"); do
-                active=$(wget -q -O- "http://localhost:${admin_port}/stats" 2>/dev/null | grep "http.ingress_http.downstream_rq_active" | awk '{print $2}' || echo "0")
+                active=$(wget -q -T 1 -O- "http://127.0.0.1:${admin_port}/stats" 2>/dev/null \
+                  | grep -E '^http\.ingress_http\.downstream_rq_active:' \
+                  | awk '{print $2}' || echo "0")
                [ -z "$active" ] && active=0
                [ "$active" -eq 0 ] && break
                sleep 1

manifests/bucketeer/charts/web/templates/deployment.yaml

Lines changed: 4 additions & 6 deletions

@@ -273,15 +273,13 @@ spec:
            - -c
            - |
              admin_port={{ .Values.envoy.adminPort }}
-              max_wait=35
-              propagation_delay=15
-
-              # Wait for GCLB to detect unhealthy status and stop routing
-              sleep "$propagation_delay"
+              max_wait=60

              # Wait for active requests to drain
              for i in $(seq 1 "$max_wait"); do
-                active=$(wget -q -O- "http://localhost:${admin_port}/stats" 2>/dev/null | grep "http.ingress_http.downstream_rq_active" | awk '{print $2}' || echo "0")
+                active=$(wget -q -T 1 -O- "http://127.0.0.1:${admin_port}/stats" 2>/dev/null \
+                  | grep -E '^http\.ingress_http\.downstream_rq_active:' \
+                  | awk '{print $2}' || echo "0")
                [ -z "$active" ] && active=0
                [ "$active" -eq 0 ] && break
                sleep 1
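The four deployment templates above share one preStop-style drain script: poll Envoy's admin /stats endpoint once a second, up to max_wait times, until the http.ingress_http.downstream_rq_active gauge drops to zero. The new grep -E '^http\.ingress_http\.downstream_rq_active:' anchor matches only that exact stat rather than any line containing the substring, and wget gains a 1-second timeout so a hung admin endpoint cannot stall the hook. A rough Go rendering of the same polling logic, for illustration only (the admin port value is an assumption; the real one comes from .Values.envoy.adminPort):

package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strconv"
	"strings"
	"time"
)

// activeRequests reads Envoy's admin /stats output and returns the exact
// http.ingress_http.downstream_rq_active gauge, mirroring the anchored grep.
func activeRequests(adminPort int) (int, error) {
	client := &http.Client{Timeout: time.Second} // like wget -T 1
	resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d/stats", adminPort))
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	const key = "http.ingress_http.downstream_rq_active:"
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		if line := scanner.Text(); strings.HasPrefix(line, key) {
			return strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, key)))
		}
	}
	return 0, scanner.Err() // stat not found: treat as zero, like `active=0`
}

func main() {
	const adminPort, maxWait = 8001, 60 // adminPort is assumed; max_wait=60 as in the diff
	for i := 0; i < maxWait; i++ {
		active, err := activeRequests(adminPort)
		if err != nil || active == 0 { // on error the script falls back to "0" as well
			break
		}
		time.Sleep(time.Second)
	}
	fmt.Println("drain wait finished")
}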

pkg/api/cmd/server.go

Lines changed: 16 additions & 20 deletions

@@ -230,9 +230,6 @@ func RegisterCommand(r cli.CommandRegistry, p cli.ParentCommand) cli.Command {
 func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.Logger) error {
 	registerer := metrics.DefaultRegisterer()

-	pubsubCtx, pubsubCancel := context.WithTimeout(ctx, 5*time.Second)
-	defer pubsubCancel()
-
 	// Create PubSub client using the factory
 	pubSubType := factory.PubSubType(*s.pubSubType)
 	factoryOpts := []factory.Option{

@@ -261,6 +258,8 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		factoryOpts = append(factoryOpts, factory.WithPartitionCount(*s.pubSubRedisPartitionCount))
 	}

+	pubsubCtx, pubsubCancel := context.WithCancel(context.Background())
+	defer pubsubCancel()
 	pubsubClient, err := factory.NewClient(pubsubCtx, factoryOpts...)
 	if err != nil {
 		return err

@@ -388,7 +387,7 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	if err != nil {
 		return err
 	}
-	defer auditLogClient.Close()
+	defer autoOpsClient.Close()

 	tagClient, err := tagclient.NewClient(*s.tagService, *s.certPath,
 		client.WithPerRPCCredentials(creds),

@@ -502,8 +501,8 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	// We don't check the Redis health status because if the check fails,
 	// the Kubernetes will restart the container and it might cause internal errors.
 	// Use a dedicated context so we can stop the health checker goroutine cleanly during shutdown
-	healthCheckCtx, healthCheckCancel := context.WithCancel(ctx)
-	defer healthCheckCancel() // Ensure cleanup on all paths (including early returns)
+	healthCheckCtx, healthCheckCancel := context.WithCancel(context.Background())
+	defer healthCheckCancel()

 	healthChecker := health.NewGrpcChecker(
 		health.WithTimeout(5*time.Second),

@@ -545,7 +544,9 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		return fmt.Errorf("failed to create API gateway: %v", err)
 	}

-	if err := apiGateway.Start(ctx, gatewayHandler); err != nil {
+	serverCtx, serverCtxCancel := context.WithCancel(context.Background())
+	defer serverCtxCancel()
+	if err := apiGateway.Start(serverCtx, gatewayHandler); err != nil {
 		return fmt.Errorf("failed to start API gateway: %v", err)
 	}

@@ -584,27 +585,20 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		shutdownStartTime := time.Now()
 		logger.Info("Starting graceful shutdown sequence")

-		waitBeforeUnready := 10 * time.Second
-		logger.Info("Waiting before marking unready",
-			zap.Duration("wait_before_unready", waitBeforeUnready))
-		time.Sleep(waitBeforeUnready)
+		// Wait for K8s endpoint propagation
+		// This prevents "context deadline exceeded" errors during high traffic.
+		time.Sleep(propagationDelay)
+		logger.Info("Starting HTTP/gRPC server shutdown")

-		// Cancel the health checker goroutines to prevent connection errors during shutdown
-		healthCheckCancel()
 		// Mark as unhealthy so readiness probes fail
 		// This ensures Kubernetes readiness probe fails on next check,
 		// preventing new traffic from being routed to this pod.
 		healthChecker.Stop()
 		restHealthChecker.Stop()

-		// Wait for K8s endpoint propagation
-		// This prevents "context deadline exceeded" errors during high traffic.
-		time.Sleep(propagationDelay)
-		logger.Info("Starting HTTP/gRPC server shutdown")
-
-		// CRITICAL: Shutdown order matters due to dependencies:
+		// Shutdown order matters due to dependencies:
 		// 1. apiGateway/httpServer make gRPC calls to the backend server
-		// 2. We MUST drain them BEFORE stopping the backend
+		// 2. We MUST drain them BEFORE stopping the backend sever
 		// 3. Otherwise their handlers hang waiting for a dead backend
 		// We run apiGateway and httpServer in parallel since they don't depend on each other
 		var wg sync.WaitGroup

@@ -623,9 +617,11 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L

 		// Wait for HTTP/REST traffic to fully drain
 		wg.Wait()
+		logger.Info("gRPC-gateway and HTTP server shutdown completed")

 		// Now it's safe to stop the gRPC server (no more HTTP→gRPC calls)
 		server.Stop(grpcStopTimeout)
+		logger.Info("gRPC server shutdown completed")

 		// Close clients
 		// These are fast cleanup operations that can run asynchronously.
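The reworked defer in pkg/api/cmd/server.go pins down a drain order: wait for Kubernetes endpoint propagation, mark the health checkers unready, drain the gRPC-gateway and the HTTP server in parallel, and only then stop the backend gRPC server they both call. A compressed sketch of that ordering follows; the stop functions are placeholders, since the real apiGateway, httpServer, and server types and their timeouts are not shown in this hunk:

package main

import (
	"log"
	"sync"
	"time"
)

// Placeholders for the components stopped in the diff; the real methods belong
// to Bucketeer's own types and take their own timeouts.
func stopAPIGateway() { time.Sleep(200 * time.Millisecond) }
func stopHTTPServer() { time.Sleep(200 * time.Millisecond) }
func stopGRPCServer() { time.Sleep(100 * time.Millisecond) }
func markUnready()    {}

func gracefulShutdown(propagationDelay time.Duration) {
	start := time.Now()
	log.Println("Starting graceful shutdown sequence")

	// 1. Let Kubernetes endpoint removal propagate so no new traffic arrives.
	time.Sleep(propagationDelay)

	// 2. Fail readiness probes (healthChecker.Stop and restHealthChecker.Stop in the diff).
	markUnready()

	// 3. Drain the gRPC-gateway and the HTTP server in parallel; both call the
	//    backend gRPC server, so they must finish first.
	var wg sync.WaitGroup
	for _, stop := range []func(){stopAPIGateway, stopHTTPServer} {
		wg.Add(1)
		go func(stop func()) {
			defer wg.Done()
			stop()
		}(stop)
	}
	wg.Wait()
	log.Println("gRPC-gateway and HTTP server shutdown completed")

	// 4. Only now stop the backend gRPC server: nothing calls it anymore.
	stopGRPCServer()
	log.Println("gRPC server shutdown completed")

	log.Printf("shutdown took %s", time.Since(start))
}

func main() {
	gracefulShutdown(2 * time.Second)
}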

pkg/batch/cmd/server/server.go

Lines changed: 13 additions & 11 deletions

@@ -584,8 +584,8 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	)

 	// Use a dedicated context so we can stop the health checker goroutine cleanly during shutdown
-	healthCheckCtx, healthCheckCancel := context.WithCancel(ctx)
-	defer healthCheckCancel() // Ensure cleanup on all paths (including early returns)
+	healthCheckCtx, healthCheckCancel := context.WithCancel(context.Background())
+	defer healthCheckCancel()

 	healthChecker := health.NewGrpcChecker(
 		health.WithTimeout(time.Second),

@@ -627,31 +627,33 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		return fmt.Errorf("failed to create batch gateway: %v", err)
 	}

-	if err := batchGateway.Start(ctx, batchHandler); err != nil {
+	batchCtx, batchCancel := context.WithCancel(context.Background())
+	defer batchCancel()
+	if err := batchGateway.Start(batchCtx, batchHandler); err != nil {
 		return fmt.Errorf("failed to start batch gateway: %v", err)
 	}

 	defer func() {
 		shutdownStartTime := time.Now()
 		logger.Info("Starting graceful shutdown sequence")

-		// Cancel the health checker goroutines to prevent connection errors during shutdown
-		healthCheckCancel()
-		// Mark as unhealthy so readiness probes fail
-		// This ensures Kubernetes readiness probe fails on next check,
-		// preventing new traffic from being routed to this pod.
-		healthChecker.Stop()
-
 		// Wait for K8s endpoint propagation
 		// This prevents "context deadline exceeded" errors during high traffic.
 		time.Sleep(propagationDelay)
 		logger.Info("Starting HTTP/gRPC server shutdown")

-		// Gracefully stop REST gateway (calls the gRPC server internally)
+		// Mark as unhealthy so readiness probes fail
+		// This ensures Kubernetes readiness probe fails on next check,
+		// preventing new traffic from being routed to this pod.
+		healthChecker.Stop()
+
+		// Gracefully stop gRPC Gateway (calls the gRPC server internally)
 		batchGateway.Stop(serverShutDownTimeout)
+		logger.Info("gRPC-gateway server shutdown completed")

 		// Stop gRPC server (only pure gRPC connections remain)
 		server.Stop(grpcStopTimeout)
+		logger.Info("gRPC server shutdown completed")

 		// Close clients
 		// These are fast cleanup operations that can run asynchronously.

pkg/subscriber/cmd/server/server.go

Lines changed: 2 additions & 3 deletions

@@ -343,7 +343,7 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	// healthCheckService
 	// Use a dedicated context so we can stop the health checker goroutine cleanly during shutdown
 	healthCheckCtx, healthCheckCancel := context.WithCancel(ctx)
-	defer healthCheckCancel() // Ensure cleanup on all paths (including early returns)
+	defer healthCheckCancel()

 	restHealthChecker := health.NewRestChecker(
 		"", "",

@@ -365,8 +365,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		shutdownStartTime := time.Now()
 		logger.Info("Starting graceful shutdown sequence")

-		// Cancel the health checker goroutines to prevent connection errors during shutdown
-		healthCheckCancel()
 		// Mark as unhealthy so readiness probes fail
 		// This ensures Kubernetes readiness probe fails on next check,
 		// preventing new traffic from being routed to this pod.

@@ -376,6 +374,7 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		// Stop PubSub subscription
 		// This stops receiving new messages and allows in-flight messages to be processed.
 		multiPubSub.Stop()
+		logger.Info("PubSub subscription stopped, all messages processed")

 		// Close clients
 		// These are fast cleanup operations that can run asynchronously.

pkg/subscriber/on_demand_subscriber.go

Lines changed: 3 additions & 2 deletions

@@ -233,8 +233,9 @@ func (s *onDemandSubscriber) createPubSubClient(ctx context.Context) error {
 		}
 	}

-	// Create the PubSub client using the factory
-	pubsubClient, err := factory.NewClient(ctx, factoryOpts...)
+	// Create the PubSub client using the factory with context.Background()
+	// to ensure connections remain healthy until explicitly stopped during graceful shutdown
+	pubsubClient, err := factory.NewClient(context.Background(), factoryOpts...)
 	if err != nil {
 		s.logger.Error("Failed to create pubsub client",
 			zap.Error(err),

pkg/subscriber/subscriber.go

Lines changed: 3 additions & 2 deletions

@@ -201,8 +201,9 @@ func (s pubSubSubscriber) createPuller(
 		}
 	}

-	// Create the PubSub client using the factory
-	pubsubClient, err = factory.NewClient(ctx, factoryOpts...)
+	// Create the PubSub client using the factory with context.Background()
+	// to ensure connections remain healthy until explicitly stopped during graceful shutdown
+	pubsubClient, err = factory.NewClient(context.Background(), factoryOpts...)
 	if err != nil {
 		s.logger.Error("Failed to create pubsub client",
 			zap.Error(err),
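Both subscriber code paths now create the PubSub client with context.Background() instead of the caller's ctx; per the added comment, this is so the connections stay healthy until they are explicitly stopped during graceful shutdown. The sketch below shows the same shape using the public cloud.google.com/go/pubsub package for illustration. Bucketeer builds its clients through its own factory, which may behave differently, and the project and subscription names here are made up:

package main

import (
	"context"
	"log"

	"cloud.google.com/go/pubsub"
)

// newPuller mirrors the shape of the change: the client is created with
// context.Background() so its connections are not tied to a cancellable ctx,
// and it is closed explicitly during shutdown instead.
func newPuller(projectID, subID string) (*pubsub.Client, *pubsub.Subscription, error) {
	// Before the commit, the equivalent call received the caller's ctx.
	client, err := pubsub.NewClient(context.Background(), projectID)
	if err != nil {
		return nil, nil, err
	}
	return client, client.Subscription(subID), nil
}

func main() {
	// Hypothetical names; the real values come from Bucketeer's configuration.
	client, sub, err := newPuller("my-project", "my-subscription")
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close() // the explicit stop during graceful shutdown

	// ctx only bounds how long we receive; cancelling it stops new deliveries
	// while the client connection stays usable until Close.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	if err := sub.Receive(ctx, func(ctx context.Context, m *pubsub.Message) {
		m.Ack()
	}); err != nil {
		log.Printf("receive stopped: %v", err)
	}
}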

pkg/web/cmd/server/server.go

Lines changed: 18 additions & 22 deletions

@@ -404,14 +404,14 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	registerer := metrics.DefaultRegisterer()

 	// dataWarehouse config
-	dataWarehouseConfig, err := s.readDataWarehouseConfig(ctx, logger)
+	dataWarehouseConfig, err := s.readDataWarehouseConfig(logger)
 	if err != nil {
 		logger.Error("Failed to read dataWarehouse config", zap.Error(err))
 		return err
 	}

 	// oauth config
-	oAuthConfig, err := s.readOAuthConfig(ctx, logger)
+	oAuthConfig, err := s.readOAuthConfig(logger)
 	if err != nil {
 		logger.Error("Failed to read OAuth config", zap.Error(err))
 		return err

@@ -427,16 +427,17 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	if err != nil {
 		return err
 	}
-	// healthCheckService
-	// Use a dedicated context so we can stop the health checker goroutine cleanly during shutdown
-	healthCheckCtx, healthCheckCancel := context.WithCancel(ctx)
-	defer healthCheckCancel() // Ensure cleanup on all paths (including early returns)

 	restHealthChecker := health.NewRestChecker(
 		"", "",
 		health.WithTimeout(healthCheckTimeout),
 		health.WithCheck("metrics", metrics.Check),
 	)
+
+	// Use a dedicated context so we can stop the health checker goroutine cleanly during shutdown
+	healthCheckCtx, healthCheckCancel := context.WithCancel(context.Background())
+	defer healthCheckCancel()
+
 	go restHealthChecker.Run(healthCheckCtx)
 	// healthcheckService
 	healthcheckServer := rest.NewServer(

@@ -500,13 +501,16 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	if err != nil {
 		return err
 	}
+
+	pubsubCtx, pubsubCancel := context.WithCancel(context.Background())
+	defer pubsubCancel()
 	// domainTopicPublisher
-	domainTopicPublisher, err := s.createPublisher(ctx, *s.domainTopic, registerer, logger)
+	domainTopicPublisher, err := s.createPublisher(pubsubCtx, *s.domainTopic, registerer, logger)
 	if err != nil {
 		return err
 	}
 	// segmentUsersPublisher
-	segmentUsersPublisher, err := s.createPublisher(ctx, *s.bulkSegmentUsersReceivedTopic, registerer, logger)
+	segmentUsersPublisher, err := s.createPublisher(pubsubCtx, *s.bulkSegmentUsersReceivedTopic, registerer, logger)
 	if err != nil {
 		return err
 	}

@@ -722,7 +726,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L

 	// featureService
 	featureService, err := s.createFeatureService(
-		ctx,
 		accountClient,
 		experimentClient,
 		autoOpsClient,

@@ -861,18 +864,17 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		shutdownStartTime := time.Now()
 		logger.Info("Starting graceful shutdown sequence")

-		// Cancel the health checker goroutines to prevent connection errors during shutdown
-		healthCheckCancel()
-		// Mark as unhealthy so readiness probes fail
-		// This ensures Kubernetes readiness probe fails on next check,
-		// preventing new traffic from being routed to this pod.
-		restHealthChecker.Stop()
-
 		// Wait for K8s endpoint propagation
 		// This prevents "context deadline exceeded" errors during high traffic.
 		time.Sleep(propagationDelay)
 		logger.Info("Starting HTTP/gRPC server shutdown")

+		// Mark as unhealthy so readiness probes fail
+		// This ensures Kubernetes readiness probe fails on next check,
+		// preventing new traffic from being routed to this pod.
+		healthcheckServer.Stop(5 * time.Second)
+		restHealthChecker.Stop()
+
 		// Stop REST servers in parallel (these call gRPC servers internally)
 		// Stop these first to drain REST traffic before stopping gRPC
 		var restWg sync.WaitGroup

@@ -993,9 +995,6 @@ func (s *server) createPublisher(
 	registerer metrics.Registerer,
 	logger *zap.Logger,
 ) (publisher.Publisher, error) {
-	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
-	defer cancel()
-
 	// Create PubSub client using the factory
 	pubSubType := factory.PubSubType(*s.pubSubType)
 	factoryOpts := []factory.Option{

@@ -1035,7 +1034,6 @@ func (s *server) createPublisher(
 }

 func (s *server) readOAuthConfig(
-	ctx context.Context,
 	logger *zap.Logger,
 ) (*auth.OAuthConfig, error) {
 	bytes, err := os.ReadFile(*s.oauthConfigPath)

@@ -1129,7 +1127,6 @@ func (s *server) createEnvironmentService(
 }

 func (s *server) createFeatureService(
-	ctx context.Context,
 	accountClient accountclient.Client,
 	experimentClient experimentclient.Client,
 	autoOpsClient autoopsclient.Client,

@@ -1218,7 +1215,6 @@ func (s *server) createGatewayHandlers() []gatewayapi.HandlerRegistrar {
 }

 func (s *server) readDataWarehouseConfig(
-	ctx context.Context,
 	logger *zap.Logger,
 ) (*DataWarehouseConfig, error) {
 	// If config path is provided, read from file
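In the web server, createPublisher previously wrapped the incoming ctx in a 5-second timeout and cancelled it as soon as the function returned, even though the PubSub client built from that context lives for the whole process. The commit removes the wrapper and instead has Run own a long-lived pubsubCtx (derived from context.Background()) that is passed to both createPublisher calls. A small self-contained sketch of why that matters; the client type here is hypothetical and only simulates a connection tied to its creation context:

package main

import (
	"context"
	"fmt"
	"time"
)

// client simulates a publisher whose connection is tied to the context it was
// created with. It is a stand-in, not Bucketeer's factory.NewClient.
type client struct{ ctx context.Context }

func newClient(ctx context.Context) *client { return &client{ctx: ctx} }

// publish fails once the creation context is cancelled, standing in for the
// torn-down connection the old code risked.
func (c *client) publish() error { return c.ctx.Err() }

func main() {
	// Old shape: a short-lived timeout context created the client and was
	// cancelled when createPublisher returned (simulated by calling cancel here).
	oldCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	oldClient := newClient(oldCtx)
	cancel()
	fmt.Println("old shape publish error:", oldClient.publish()) // context canceled

	// New shape: one long-lived context owned by Run, cancelled only at shutdown.
	pubsubCtx, pubsubCancel := context.WithCancel(context.Background())
	defer pubsubCancel()
	longLived := newClient(pubsubCtx)
	fmt.Println("new shape publish error:", longLived.publish()) // <nil>
}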
