From 1ea358a46b483af3461d255eeffc107b3ac48165 Mon Sep 17 00:00:00 2001 From: Alessandro Yuichi Okimoto Date: Mon, 27 Oct 2025 19:08:08 +0900 Subject: [PATCH] chore: reduce info log verbosity with structured logging Signed-off-by: Alessandro Yuichi Okimoto --- .../charts/batch/templates/cronjob.yaml | 30 +++++++++++++++---- pkg/api/cmd/server.go | 3 -- pkg/batch/cmd/server/server.go | 3 -- .../jobs/calculator/experiment_calculate.go | 2 +- .../opsevent/progressove_rollout_watcher.go | 2 +- .../rediscounter/redis_counter_deleter.go | 11 ++++--- pkg/subscriber/cmd/server/server.go | 2 -- pkg/web/cmd/server/server.go | 2 -- 8 files changed, 32 insertions(+), 23 deletions(-) diff --git a/manifests/bucketeer/charts/batch/templates/cronjob.yaml b/manifests/bucketeer/charts/batch/templates/cronjob.yaml index 12d981f28e..f453b7b432 100644 --- a/manifests/bucketeer/charts/batch/templates/cronjob.yaml +++ b/manifests/bucketeer/charts/batch/templates/cronjob.yaml @@ -21,6 +21,10 @@ spec: spec: backoffLimit: 0 template: + metadata: + labels: + app: {{ template "batch-server.name" $ }} + job-type: cronjob spec: volumes: - name: service-cert-secret @@ -43,18 +47,34 @@ spec: env: - name: WEB_GATEWAY_ADDRESS value: "{{ $.Values.cronjob.webGatewayAddress }}" + - name: JOB_NAME + value: "{{ .name }}" + - name: JOB_ID + value: "{{ .jobId }}" command: - /bin/sh args: - -c - | - echo "Start {{ .name }} job." + START_TIME=$(date +%s) ENDPOINT="${WEB_GATEWAY_ADDRESS}/bucketeer.batch.BatchService/ExecuteBatchJob" TOKEN=`cat /usr/local/service-token/token` - RES=`curl -X POST -m 3600 --cacert /usr/local/certs/service/tls.crt -d '{"job": "{{ .jobId }}"}' -H "authorization: bearer ${TOKEN}" -H "Content-Type: application/json" -s -o /dev/null -w '%{http_code}\\n' ${ENDPOINT}` - echo "{{ .name }} job result: ${RES}" - if [ "$RES" = 200 ] || [ "$RES" = 503 ] || [ "$RES" = 000 ] - then + RESPONSE=$(mktemp) + RES=$(curl -X POST -m 3600 --cacert /usr/local/certs/service/tls.crt -d '{"job": "'${JOB_ID}'"}' -H "authorization: bearer ${TOKEN}" -H "Content-Type: application/json" -s -o ${RESPONSE} -w '%{http_code}' ${ENDPOINT}) + BODY=$(cat ${RESPONSE}) + rm -f ${RESPONSE} + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + # API returns empty {} on success, or error details on failure + if [ "${RES}" = "200" ]; then + # Success - use DEBUG to avoid verbose logs + echo '{"severity":"DEBUG","message":"'${JOB_NAME}' job completed successfully","job":"'${JOB_NAME}'","jobId":"'${JOB_ID}'","statusCode":'${RES}',"durationSeconds":'${DURATION}'}' + else + # Failure - log as ERROR with response details for debugging + BODY_ESCAPED=$(echo "${BODY}" | sed 's/\\/\\\\/g; s/"/\\"/g' | tr '\n\r\t' ' ') + echo '{"severity":"ERROR","message":"'${JOB_NAME}' job failed","job":"'${JOB_NAME}'","jobId":"'${JOB_ID}'","statusCode":'${RES}',"durationSeconds":'${DURATION}',"responseBody":"'${BODY_ESCAPED}'"}' + fi + if [ "${RES}" = "200" ] || [ "${RES}" = "503" ] || [ "${RES}" = "000" ]; then exit 0 else exit 1 diff --git a/pkg/api/cmd/server.go b/pkg/api/cmd/server.go index d1d7d83c11..68e4bbdb2c 100644 --- a/pkg/api/cmd/server.go +++ b/pkg/api/cmd/server.go @@ -571,7 +571,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L // Wait for K8s endpoint propagation // This prevents "context deadline exceeded" errors during high traffic. time.Sleep(propagationDelay) - logger.Info("Starting HTTP/gRPC server shutdown") // Mark as unhealthy so readiness probes fail // This ensures Kubernetes readiness probe fails on next check, @@ -600,11 +599,9 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L // Wait for HTTP/REST traffic to fully drain wg.Wait() - logger.Info("gRPC-gateway and HTTP server shutdown completed") // Now it's safe to stop the gRPC server (no more HTTP→gRPC calls) server.Stop(grpcStopTimeout) - logger.Info("gRPC server shutdown completed") // Close clients // These are fast cleanup operations that can run asynchronously. diff --git a/pkg/batch/cmd/server/server.go b/pkg/batch/cmd/server/server.go index 0548554009..89d42e3907 100644 --- a/pkg/batch/cmd/server/server.go +++ b/pkg/batch/cmd/server/server.go @@ -636,7 +636,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L // Wait for K8s endpoint propagation // This prevents "context deadline exceeded" errors during high traffic. time.Sleep(propagationDelay) - logger.Info("Starting HTTP/gRPC server shutdown") // Mark as unhealthy so readiness probes fail // This ensures Kubernetes readiness probe fails on next check, @@ -645,11 +644,9 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L // Gracefully stop gRPC Gateway (calls the gRPC server internally) batchGateway.Stop(serverShutDownTimeout) - logger.Info("gRPC-gateway server shutdown completed") // Stop gRPC server (only pure gRPC connections remain) server.Stop(grpcStopTimeout) - logger.Info("gRPC server shutdown completed") // Close clients // These are fast cleanup operations that can run asynchronously. diff --git a/pkg/batch/jobs/calculator/experiment_calculate.go b/pkg/batch/jobs/calculator/experiment_calculate.go index 5c7d1efcd6..3b17aa8e76 100644 --- a/pkg/batch/jobs/calculator/experiment_calculate.go +++ b/pkg/batch/jobs/calculator/experiment_calculate.go @@ -135,7 +135,7 @@ func (e *experimentCalculate) runCalculation() { return } if experiments == nil { - e.logger.Info("There are no experiments for calculation in the specified environment", + e.logger.Debug("There are no experiments for calculation in the specified environment", log.FieldsFromIncomingContext(ctxWithTimeout).AddFields( zap.String("environmentId", env.Id), )..., diff --git a/pkg/batch/jobs/opsevent/progressove_rollout_watcher.go b/pkg/batch/jobs/opsevent/progressove_rollout_watcher.go index 53c34c68e9..18696ef09c 100644 --- a/pkg/batch/jobs/opsevent/progressove_rollout_watcher.go +++ b/pkg/batch/jobs/opsevent/progressove_rollout_watcher.go @@ -129,7 +129,7 @@ func (w *progressiveRolloutWatcher) executeProgressiveRollout( now := time.Now().Unix() for _, s := range schedules { if s.TriggeredAt == 0 && s.ExecuteAt <= now { - w.logger.Info("scheduled time is passed", + w.logger.Debug("scheduled time is passed", zap.String("environmentId", environmentId), zap.String("featureId", progressiveRollout.FeatureId), zap.String("progressiveRolloutId", progressiveRollout.Id), diff --git a/pkg/batch/jobs/rediscounter/redis_counter_deleter.go b/pkg/batch/jobs/rediscounter/redis_counter_deleter.go index 31de08f40d..e4f9c32817 100644 --- a/pkg/batch/jobs/rediscounter/redis_counter_deleter.go +++ b/pkg/batch/jobs/rediscounter/redis_counter_deleter.go @@ -131,7 +131,7 @@ func (r *redisCounterDeleter) deleteKeysByKind(environmentId, kind string) (int, return 0, err } if len(keys) == 0 { - r.logger.Info("No keys was found", + r.logger.Debug("No keys was found", zap.String("environmentId", environmentId), zap.String("kind", kind), ) @@ -141,7 +141,7 @@ func (r *redisCounterDeleter) deleteKeysByKind(environmentId, kind string) (int, if err != nil { return 0, err } - r.logger.Info("Filtered keys older than 31 days", + r.logger.Debug("Filtered keys older than 31 days", zap.String("environmentId", environmentId), zap.String("kind", kind), zap.Int("filteredKeysSize", len(filteredKeys)), @@ -149,7 +149,6 @@ func (r *redisCounterDeleter) deleteKeysByKind(environmentId, kind string) (int, // To avoid blocking Redis for too much time while deleting all the keys // we split the keys in chunks chunks := r.chunkSlice(filteredKeys, redisChunkMaxSize) - r.logger.Info("Chunked the filtered keys", zap.Int("chunkSize", len(chunks))) deletedKeys := 0 for _, chunk := range chunks { if err := r.deleteKeys(chunk); err != nil { @@ -163,7 +162,7 @@ func (r *redisCounterDeleter) deleteKeysByKind(environmentId, kind string) (int, return deletedKeys, err } deletedKeys += len(chunk) - r.logger.Info("Chunk deleted successfully", zap.Strings("keys", chunk)) + r.logger.Debug("Chunk deleted successfully", zap.Strings("keys", chunk)) } return deletedKeys, nil } @@ -174,7 +173,7 @@ func (r *redisCounterDeleter) newKeyPrefix(environmentId, kind string) string { } func (r *redisCounterDeleter) scan(environmentId, kind, key string) ([]string, error) { - r.logger.Info("Starting scan keys from Redis", + r.logger.Debug("Starting scan keys from Redis", zap.String("environmentId", environmentId), zap.String("kind", kind), ) @@ -196,7 +195,7 @@ func (r *redisCounterDeleter) scan(environmentId, kind, key string) ([]string, e if err != nil { return nil, err } - r.logger.Info("Finished scanning keys from Redis", + r.logger.Debug("Finished scanning keys from Redis", zap.String("environmentId", environmentId), zap.String("kind", kind), zap.Duration("elapsedTime", time.Since(startTime)), diff --git a/pkg/subscriber/cmd/server/server.go b/pkg/subscriber/cmd/server/server.go index 7def32d83d..34e4413d9c 100644 --- a/pkg/subscriber/cmd/server/server.go +++ b/pkg/subscriber/cmd/server/server.go @@ -368,12 +368,10 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L // This ensures Kubernetes readiness probe fails on next check, // preventing new traffic from being routed to this pod. restHealthChecker.Stop() - logger.Info("Health check marked as unhealthy (readiness will fail)") // Stop PubSub subscription // This stops receiving new messages and allows in-flight messages to be processed. multiPubSub.Stop() - logger.Info("PubSub subscription stopped, all messages processed") // Close clients // These are fast cleanup operations that can run asynchronously. diff --git a/pkg/web/cmd/server/server.go b/pkg/web/cmd/server/server.go index 4b7d4c150d..8801ef56d4 100644 --- a/pkg/web/cmd/server/server.go +++ b/pkg/web/cmd/server/server.go @@ -865,7 +865,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L // Wait for K8s endpoint propagation // This prevents "context deadline exceeded" errors during high traffic. time.Sleep(propagationDelay) - logger.Info("Starting HTTP/gRPC server shutdown") // Mark as unhealthy so readiness probes fail // This ensures Kubernetes readiness probe fails on next check, @@ -891,7 +890,6 @@ func (s *server) Run(ctx context.Context, metrics metrics.Metrics, logger *zap.L // Wait for REST traffic to drain restWg.Wait() - logger.Info("REST servers shutdown completed") // Gracefully stop all gRPC servers in parallel // Now safe to stop since REST traffic has drained