30 changes: 25 additions & 5 deletions manifests/bucketeer/charts/batch/templates/cronjob.yaml
@@ -21,6 +21,10 @@ spec:
     spec:
       backoffLimit: 0
       template:
+        metadata:
+          labels:
+            app: {{ template "batch-server.name" $ }}
+            job-type: cronjob

Comment on lines +26 to +27 (Member Author):
Implemented it to report the logs as batch-server for easy tracking on the GCP console.

         spec:
           volumes:
             - name: service-cert-secret
@@ -43,18 +47,34 @@ spec:
               env:
                 - name: WEB_GATEWAY_ADDRESS
                   value: "{{ $.Values.cronjob.webGatewayAddress }}"
+                - name: JOB_NAME
+                  value: "{{ .name }}"
+                - name: JOB_ID
+                  value: "{{ .jobId }}"
               command:
                 - /bin/sh
               args:
                 - -c
                 - |
-                  echo "Start {{ .name }} job."

Comment (Member Author):
I removed the start log and implemented the elapsed time to be reported in the success log.

+                  START_TIME=$(date +%s)
                   ENDPOINT="${WEB_GATEWAY_ADDRESS}/bucketeer.batch.BatchService/ExecuteBatchJob"
                   TOKEN=`cat /usr/local/service-token/token`
-                  RES=`curl -X POST -m 3600 --cacert /usr/local/certs/service/tls.crt -d '{"job": "{{ .jobId }}"}' -H "authorization: bearer ${TOKEN}" -H "Content-Type: application/json" -s -o /dev/null -w '%{http_code}\\n' ${ENDPOINT}`
-                  echo "{{ .name }} job result: ${RES}"
-                  if [ "$RES" = 200 ] || [ "$RES" = 503 ] || [ "$RES" = 000 ]
-                  then
+                  RESPONSE=$(mktemp)
+                  RES=$(curl -X POST -m 3600 --cacert /usr/local/certs/service/tls.crt -d '{"job": "'${JOB_ID}'"}' -H "authorization: bearer ${TOKEN}" -H "Content-Type: application/json" -s -o ${RESPONSE} -w '%{http_code}' ${ENDPOINT})
+                  BODY=$(cat ${RESPONSE})
+                  rm -f ${RESPONSE}
+                  END_TIME=$(date +%s)
+                  DURATION=$((END_TIME - START_TIME))
+                  # API returns empty {} on success, or error details on failure
+                  if [ "${RES}" = "200" ]; then
+                    # Success - use DEBUG to avoid verbose logs
+                    echo '{"severity":"DEBUG","message":"'${JOB_NAME}' job completed successfully","job":"'${JOB_NAME}'","jobId":"'${JOB_ID}'","statusCode":'${RES}',"durationSeconds":'${DURATION}'}'
+                  else
+                    # Failure - log as ERROR with response details for debugging
+                    BODY_ESCAPED=$(echo "${BODY}" | sed 's/\\/\\\\/g; s/"/\\"/g' | tr '\n\r\t' ' ')
+                    echo '{"severity":"ERROR","message":"'${JOB_NAME}' job failed","job":"'${JOB_NAME}'","jobId":"'${JOB_ID}'","statusCode":'${RES}',"durationSeconds":'${DURATION}',"responseBody":"'${BODY_ESCAPED}'"}'

Comment on lines +71 to +75, @cre8ivejp (Member Author), Oct 27, 2025:
The debug logs were always reported as INFO level to GCP, so I formatted them using JSON format.

+                  fi
+                  if [ "${RES}" = "200" ] || [ "${RES}" = "503" ] || [ "${RES}" = "000" ]; then
                     exit 0
                   else
                     exit 1
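
Note on the two comments above: with GKE's default logging agent, a plain text line written to stdout is ingested at the default (INFO) severity, while a JSON line has its "severity" field mapped to the entry's log level, and the new pod labels ("app", "job-type") are attached to each log entry as "k8s-pod/<label>" labels. As a rough sketch of how the cronjob logs could then be tracked from the GCP console or CLI (the rendered label value "batch-server" and the filter itself are illustrative assumptions, not part of this PR):

    # Hypothetical query: list recent failed batch job runs from Cloud Logging.
    # Assumes the default GKE logging agent and an "app" label rendered as "batch-server".
    gcloud logging read \
      'resource.type="k8s_container" AND labels."k8s-pod/app"="batch-server" AND labels."k8s-pod/job-type"="cronjob" AND severity>=ERROR' \
      --limit=20 \
      --format="value(timestamp,jsonPayload.job,jsonPayload.message)"

Dropping the severity clause would also surface the DEBUG success entries together with their durationSeconds field.
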
3 changes: 0 additions & 3 deletions pkg/api/cmd/server.go
@@ -571,7 +571,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
// Wait for K8s endpoint propagation
// This prevents "context deadline exceeded" errors during high traffic.
time.Sleep(propagationDelay)
-logger.Info("Starting HTTP/gRPC server shutdown")

// Mark as unhealthy so readiness probes fail
// This ensures Kubernetes readiness probe fails on next check,
@@ -600,11 +599,9 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L

// Wait for HTTP/REST traffic to fully drain
wg.Wait()
-logger.Info("gRPC-gateway and HTTP server shutdown completed")

// Now it's safe to stop the gRPC server (no more HTTP→gRPC calls)
server.Stop(grpcStopTimeout)
-logger.Info("gRPC server shutdown completed")

// Close clients
// These are fast cleanup operations that can run asynchronously.
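
The shutdown comments kept in this file (and in the batch, subscriber, and web servers below) describe the same ordering: wait for endpoint propagation, fail readiness, drain HTTP/REST traffic, and only then stop gRPC so no in-flight HTTP-to-gRPC calls are cut off. A generic sketch of that sequence, with assumed names and timeouts rather than the project's actual types, might look like:

    // Illustrative only, not Bucketeer's code: shows the shutdown ordering the
    // comments above describe, using standard net/http and google.golang.org/grpc.
    package shutdownsketch

    import (
        "context"
        "net/http"
        "sync/atomic"
        "time"

        "google.golang.org/grpc"
    )

    func gracefulShutdown(ready *atomic.Bool, httpSrv *http.Server, grpcSrv *grpc.Server) {
        // Wait for Kubernetes endpoint changes to propagate before tearing anything down.
        time.Sleep(10 * time.Second)

        // Fail the readiness probe so no new traffic is routed to this pod.
        ready.Store(false)

        // Drain in-flight HTTP/REST requests; these may still call into the gRPC server.
        ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
        defer cancel()
        _ = httpSrv.Shutdown(ctx)

        // Only now stop gRPC: no more HTTP-to-gRPC calls can arrive.
        grpcSrv.GracefulStop()
    }
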
3 changes: 0 additions & 3 deletions pkg/batch/cmd/server/server.go
@@ -636,7 +636,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
// Wait for K8s endpoint propagation
// This prevents "context deadline exceeded" errors during high traffic.
time.Sleep(propagationDelay)
-logger.Info("Starting HTTP/gRPC server shutdown")

// Mark as unhealthy so readiness probes fail
// This ensures Kubernetes readiness probe fails on next check,
@@ -645,11 +644,9 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L

// Gracefully stop gRPC Gateway (calls the gRPC server internally)
batchGateway.Stop(serverShutDownTimeout)
logger.Info("gRPC-gateway server shutdown completed")

// Stop gRPC server (only pure gRPC connections remain)
server.Stop(grpcStopTimeout)
-logger.Info("gRPC server shutdown completed")

// Close clients
// These are fast cleanup operations that can run asynchronously.
2 changes: 1 addition & 1 deletion pkg/batch/jobs/calculator/experiment_calculate.go
@@ -135,7 +135,7 @@ func (e *experimentCalculate) runCalculation() {
return
}
if experiments == nil {
-e.logger.Info("There are no experiments for calculation in the specified environment",
+e.logger.Debug("There are no experiments for calculation in the specified environment",
log.FieldsFromIncomingContext(ctxWithTimeout).AddFields(
zap.String("environmentId", env.Id),
)...,
2 changes: 1 addition & 1 deletion pkg/batch/jobs/opsevent/progressove_rollout_watcher.go
@@ -129,7 +129,7 @@ func (w *progressiveRolloutWatcher) executeProgressiveRollout(
now := time.Now().Unix()
for _, s := range schedules {
if s.TriggeredAt == 0 && s.ExecuteAt <= now {
-w.logger.Info("scheduled time is passed",
+w.logger.Debug("scheduled time is passed",
zap.String("environmentId", environmentId),
zap.String("featureId", progressiveRollout.FeatureId),
zap.String("progressiveRolloutId", progressiveRollout.Id),
11 changes: 5 additions & 6 deletions pkg/batch/jobs/rediscounter/redis_counter_deleter.go
@@ -131,7 +131,7 @@ func (r *redisCounterDeleter) deleteKeysByKind(environmentId, kind string) (int,
return 0, err
}
if len(keys) == 0 {
-r.logger.Info("No keys was found",
+r.logger.Debug("No keys was found",
zap.String("environmentId", environmentId),
zap.String("kind", kind),
)
@@ -141,15 +141,14 @@
if err != nil {
return 0, err
}
-r.logger.Info("Filtered keys older than 31 days",
+r.logger.Debug("Filtered keys older than 31 days",
zap.String("environmentId", environmentId),
zap.String("kind", kind),
zap.Int("filteredKeysSize", len(filteredKeys)),
)
// To avoid blocking Redis for too much time while deleting all the keys
// we split the keys in chunks
chunks := r.chunkSlice(filteredKeys, redisChunkMaxSize)
-r.logger.Info("Chunked the filtered keys", zap.Int("chunkSize", len(chunks)))
deletedKeys := 0
for _, chunk := range chunks {
if err := r.deleteKeys(chunk); err != nil {
@@ -163,7 +162,7 @@ func (r *redisCounterDeleter) deleteKeysByKind(environmentId, kind string) (int,
return deletedKeys, err
}
deletedKeys += len(chunk)
-r.logger.Info("Chunk deleted successfully", zap.Strings("keys", chunk))
+r.logger.Debug("Chunk deleted successfully", zap.Strings("keys", chunk))
}
return deletedKeys, nil
}
@@ -174,7 +173,7 @@ func (r *redisCounterDeleter) newKeyPrefix(environmentId, kind string) string {
}

func (r *redisCounterDeleter) scan(environmentId, kind, key string) ([]string, error) {
-r.logger.Info("Starting scan keys from Redis",
+r.logger.Debug("Starting scan keys from Redis",
zap.String("environmentId", environmentId),
zap.String("kind", kind),
)
@@ -196,7 +195,7 @@ func (r *redisCounterDeleter) scan(environmentId, kind, key string) ([]string, e
if err != nil {
return nil, err
}
-r.logger.Info("Finished scanning keys from Redis",
+r.logger.Debug("Finished scanning keys from Redis",
zap.String("environmentId", environmentId),
zap.String("kind", kind),
zap.Duration("elapsedTime", time.Since(startTime)),
2 changes: 0 additions & 2 deletions pkg/subscriber/cmd/server/server.go
@@ -368,12 +368,10 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
// This ensures Kubernetes readiness probe fails on next check,
// preventing new traffic from being routed to this pod.
restHealthChecker.Stop()
-logger.Info("Health check marked as unhealthy (readiness will fail)")

// Stop PubSub subscription
// This stops receiving new messages and allows in-flight messages to be processed.
multiPubSub.Stop()
-logger.Info("PubSub subscription stopped, all messages processed")

// Close clients
// These are fast cleanup operations that can run asynchronously.
2 changes: 0 additions & 2 deletions pkg/web/cmd/server/server.go
@@ -865,7 +865,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L
// Wait for K8s endpoint propagation
// This prevents "context deadline exceeded" errors during high traffic.
time.Sleep(propagationDelay)
-logger.Info("Starting HTTP/gRPC server shutdown")

// Mark as unhealthy so readiness probes fail
// This ensures Kubernetes readiness probe fails on next check,
@@ -891,7 +890,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L

// Wait for REST traffic to drain
restWg.Wait()
-logger.Info("REST servers shutdown completed")

// Gracefully stop all gRPC servers in parallel
// Now safe to stop since REST traffic has drained