@@ -230,9 +230,6 @@ func RegisterCommand(r cli.CommandRegistry, p cli.ParentCommand) cli.Command {
 func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.Logger) error {
 	registerer := metrics.DefaultRegisterer()
 
-	pubsubCtx, pubsubCancel := context.WithTimeout(ctx, 5*time.Second)
-	defer pubsubCancel()
-
 	// Create PubSub client using the factory
 	pubSubType := factory.PubSubType(*s.pubSubType)
 	factoryOpts := []factory.Option{
@@ -261,6 +258,8 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		factoryOpts = append(factoryOpts, factory.WithPartitionCount(*s.pubSubRedisPartitionCount))
 	}
 
+	pubsubCtx, pubsubCancel := context.WithCancel(context.Background())
+	defer pubsubCancel()
 	pubsubClient, err := factory.NewClient(pubsubCtx, factoryOpts...)
 	if err != nil {
 		return err
@@ -388,7 +387,7 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	if err != nil {
 		return err
 	}
-	defer auditLogClient.Close()
+	defer autoOpsClient.Close()
 
 	tagClient, err := tagclient.NewClient(*s.tagService, *s.certPath,
 		client.WithPerRPCCredentials(creds),
@@ -502,8 +501,8 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 	// We don't check the Redis health status because if the check fails,
 	// the Kubernetes will restart the container and it might cause internal errors.
 	// Use a dedicated context so we can stop the health checker goroutine cleanly during shutdown
-	healthCheckCtx, healthCheckCancel := context.WithCancel(ctx)
-	defer healthCheckCancel() // Ensure cleanup on all paths (including early returns)
+	healthCheckCtx, healthCheckCancel := context.WithCancel(context.Background())
+	defer healthCheckCancel()
 
 	healthChecker := health.NewGrpcChecker(
 		health.WithTimeout(5*time.Second),
@@ -545,7 +544,9 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		return fmt.Errorf("failed to create API gateway: %v", err)
 	}
 
-	if err := apiGateway.Start(ctx, gatewayHandler); err != nil {
+	serverCtx, serverCtxCancel := context.WithCancel(context.Background())
+	defer serverCtxCancel()
+	if err := apiGateway.Start(serverCtx, gatewayHandler); err != nil {
 		return fmt.Errorf("failed to start API gateway: %v", err)
 	}
 
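
The context changes in this commit (pubsubCtx, healthCheckCtx, and serverCtx) all apply the same pattern: the component's lifetime is derived from context.Background() with an explicit cancel instead of from the Run ctx, so cancelling the Run ctx at the start of a shutdown no longer tears these components down while in-flight traffic is still draining; they are stopped explicitly as part of the graceful-shutdown sequence further down. A minimal, self-contained sketch of that pattern (all names and durations below are hypothetical, not the service's real ones):

package main

import (
	"context"
	"fmt"
	"time"
)

// startWorker stands in for a long-lived component such as the PubSub client,
// health checker, or gateway: it runs until its own context is cancelled,
// not until the Run context is.
func startWorker(ctx context.Context, name string) {
	go func() {
		for {
			select {
			case <-ctx.Done():
				fmt.Println(name, "stopped")
				return
			case <-time.After(50 * time.Millisecond):
				// simulated periodic work
			}
		}
	}()
}

func run(runCtx context.Context) {
	// Detached from runCtx on purpose: only the explicit cancel below (or the
	// deferred one on an early-return error path) stops the worker.
	workerCtx, workerCancel := context.WithCancel(context.Background())
	defer workerCancel()

	startWorker(workerCtx, "pubsub-like worker")

	<-runCtx.Done() // e.g. SIGTERM: the Run context is cancelled
	fmt.Println("run context cancelled; worker keeps serving while traffic drains")

	time.Sleep(150 * time.Millisecond) // drain in-flight work first
	workerCancel()                     // then stop the worker explicitly
	time.Sleep(100 * time.Millisecond) // let it report before the process exits
}

func main() {
	runCtx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()
	run(runCtx)
}

The deferred cancels are still kept in the diff above, so early-return error paths continue to clean up these contexts.
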
@@ -584,27 +585,20 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 		shutdownStartTime := time.Now()
 		logger.Info("Starting graceful shutdown sequence")
 
-		waitBeforeUnready := 10 * time.Second
-		logger.Info("Waiting before marking unready",
-			zap.Duration("wait_before_unready", waitBeforeUnready))
-		time.Sleep(waitBeforeUnready)
+		// Wait for K8s endpoint propagation
+		// This prevents "context deadline exceeded" errors during high traffic.
+		time.Sleep(propagationDelay)
+		logger.Info("Starting HTTP/gRPC server shutdown")
 
-		// Cancel the health checker goroutines to prevent connection errors during shutdown
-		healthCheckCancel()
 		// Mark as unhealthy so readiness probes fail
 		// This ensures Kubernetes readiness probe fails on next check,
 		// preventing new traffic from being routed to this pod.
 		healthChecker.Stop()
 		restHealthChecker.Stop()
 
-		// Wait for K8s endpoint propagation
-		// This prevents "context deadline exceeded" errors during high traffic.
-		time.Sleep(propagationDelay)
-		logger.Info("Starting HTTP/gRPC server shutdown")
-
-		// CRITICAL: Shutdown order matters due to dependencies:
+		// Shutdown order matters due to dependencies:
 		// 1. apiGateway/httpServer make gRPC calls to the backend server
-		// 2. We MUST drain them BEFORE stopping the backend
+		// 2. We MUST drain them BEFORE stopping the backend server
 		// 3. Otherwise their handlers hang waiting for a dead backend
 		// We run apiGateway and httpServer in parallel since they don't depend on each other
 		var wg sync.WaitGroup
@@ -623,9 +617,11 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
 
 		// Wait for HTTP/REST traffic to fully drain
 		wg.Wait()
+		logger.Info("gRPC-gateway and HTTP server shutdown completed")
 
 		// Now it's safe to stop the gRPC server (no more HTTP→gRPC calls)
 		server.Stop(grpcStopTimeout)
+		logger.Info("gRPC server shutdown completed")
 
 		// Close clients
 		// These are fast cleanup operations that can run asynchronously.
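
Taken together, the rewritten shutdown sequence is: wait propagationDelay for Kubernetes endpoint changes to propagate, stop the health checkers so readiness probes fail, drain the gRPC-gateway and HTTP server in parallel, and only then stop the gRPC backend they call into, before closing the remaining clients. A compressed, runnable sketch of that ordering (the components and durations below are stand-ins for illustration, not the service's real ones):

package main

import (
	"fmt"
	"sync"
	"time"
)

// Durations are compressed so the sketch finishes quickly.
const propagationDelay = 200 * time.Millisecond

func stopReadiness()   { fmt.Println("health checkers stopped; readiness probes now fail") }
func drainGateway()    { time.Sleep(100 * time.Millisecond); fmt.Println("api gateway drained") }
func drainHTTPServer() { time.Sleep(150 * time.Millisecond); fmt.Println("http server drained") }
func stopGRPCBackend() { fmt.Println("grpc backend stopped") }

func main() {
	// 1. Wait for endpoint propagation, then mark the pod unready so
	//    Kubernetes stops routing new traffic here.
	time.Sleep(propagationDelay)
	stopReadiness()

	// 2. Drain the HTTP-facing servers in parallel; they still call the
	//    gRPC backend while finishing in-flight requests, so the backend
	//    must stay up until both are done.
	var wg sync.WaitGroup
	for _, drain := range []func(){drainGateway, drainHTTPServer} {
		wg.Add(1)
		go func(f func()) {
			defer wg.Done()
			f()
		}(drain)
	}
	wg.Wait()

	// 3. Only now is it safe to stop the gRPC backend.
	stopGRPCBackend()
}
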