bucketeer-io
diff --git a/‎docker-compose/config/nginx/bucketeer.conf‎
Lines changed: 18 additions & 2 deletions b/‎docker-compose/config/nginx/bucketeer.conf‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎manifests/bucketeer/charts/api/templates/deployment.yaml‎
Lines changed: 12 additions & 25 deletions b/‎manifests/bucketeer/charts/api/templates/deployment.yaml‎
Lines changed: 12 additions & 25 deletions
diff --git a/‎manifests/bucketeer/charts/api/templates/envoy-configmap.yaml‎
Lines changed: 6 additions & 1 deletion b/‎manifests/bucketeer/charts/api/templates/envoy-configmap.yaml‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎manifests/bucketeer/charts/api/values.yaml‎
Lines changed: 11 additions & 5 deletions b/‎manifests/bucketeer/charts/api/values.yaml‎
Lines changed: 11 additions & 5 deletions
diff --git a/‎manifests/bucketeer/charts/batch/templates/deployment.yaml‎
Lines changed: 14 additions & 25 deletions b/‎manifests/bucketeer/charts/batch/templates/deployment.yaml‎
Lines changed: 14 additions & 25 deletions
diff --git a/‎manifests/bucketeer/charts/batch/templates/envoy-configmap.yaml‎
Lines changed: 1 addition & 1 deletion b/‎manifests/bucketeer/charts/batch/templates/envoy-configmap.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎manifests/bucketeer/charts/batch/values.yaml‎
Lines changed: 11 additions & 5 deletions b/‎manifests/bucketeer/charts/batch/values.yaml‎
Lines changed: 11 additions & 5 deletions
@@ -109,7 +109,7 @@ server {
     add_header X-Content-Type-Options nosniff;
     add_header X-XSS-Protection "1; mode=block";
 
-    # Health check endpoint
+    # Health check endpoints
     location /health {
         proxy_pass https://api_health_backend;
         proxy_set_header Host $host;
@@ -118,6 +118,14 @@ server {
         proxy_set_header X-Forwarded-Proto $scheme;
     }
 
+    # Readiness endpoint: proxied to the same upstream as /health (api_health_backend)
+    location /ready {
+        proxy_pass https://api_health_backend;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+
     # API Gateway gRPC service (main API)
     location /bucketeer.gateway.Gateway {
         grpc_pass grpcs://api_grpc_backend;
@@ -192,7 +200,7 @@ server {
         return 204;
     }
 
-    # Health check endpoint
+    # Health check endpoints
     location /health {
         proxy_pass https://web_health_backend;
         proxy_set_header Host $host;
@@ -201,6 +209,14 @@ server {
         proxy_set_header X-Forwarded-Proto $scheme;
     }
 
+    # Readiness endpoint: proxied to the same upstream as /health (web_health_backend)
+    location /ready {
+        proxy_pass https://web_health_backend;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+
     # gRPC/gRPC-Web service routes (backend handles both protocols)
     location /bucketeer.account.AccountService {
         if ($is_grpc_web = 1) {
 
@@ -21,12 +21,7 @@ spec:
       annotations:
         checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
     spec:
-      # Ensure pod has sufficient time for graceful shutdown before SIGKILL
-      # This matches GCP Spot VM termination window (30s) and allows:
-      # - 3s for K8s to detect pod is not ready
-      # - 20s for application graceful shutdown
-      # - 7s safety margin
-      terminationGracePeriodSeconds: 30
+      terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds | default 30 }}
       {{- with .Values.global.image.imagePullSecrets }}
       imagePullSecrets: {{- toYaml . | nindent 8 }}
       {{- end }}
@@ -166,7 +161,7 @@ spec:
             failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
             timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
             httpGet:
-              path: /health
+              path: /ready
               port: service
               scheme: HTTPS
           resources:
@@ -181,28 +176,18 @@ spec:
                   - "/bin/sh"
                   - "-c"
                   - |
-                    # Step 1: Fail Envoy health check so K8s removes pod from endpoints
-                    wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
-
-                    # Step 2: Wait for all active connections to drain (max 25s)
-                    # Uses Istio pattern: dynamically checks all connections excluding Envoy and TIME-WAIT
+                    # Wait for in-flight requests to finish (idle keep-alive connections are ignored).
+                    # The wait budget stays 5s below terminationGracePeriodSeconds so that Envoy still
+                    # receives SIGTERM and can exit on its own before the kubelet sends SIGKILL.
                     elapsed=0
-                    max_wait=25
+                    max_wait={{ max 1 (sub (int (.Values.terminationGracePeriodSeconds | default 30)) 5) }}
                     while [ $elapsed -lt $max_wait ]; do
-                      # Count active connections excluding Envoy process and TIME-WAIT states
-                      active_conns=$(ss -Htlp state all | grep -vE '(envoy|TIME-WAIT)' | wc -l | xargs)
-                      if [ "$active_conns" -eq 0 ]; then
-                        echo "All connections drained after ${elapsed}s"
-                        break
-                      fi
-                      echo "Waiting for $active_conns connections to drain..."
+                      # Read the active downstream request gauge from the Envoy admin /stats endpoint
+                      active_reqs=$(wget -q -O- http://localhost:{{ .Values.envoy.adminPort }}/stats 2>/dev/null | grep "ingress_http.downstream_rq_active" | awk '{print $2}')
+                      # An empty result (admin endpoint unreachable or stat missing) counts as 0 active requests
+                      [ -z "$active_reqs" ] && active_reqs=0
+                      [ "$active_reqs" -eq 0 ] && echo "All active requests completed in ${elapsed}s" && break
                       sleep 1
                       elapsed=$((elapsed + 1))
                     done
-
-                    if [ $elapsed -ge $max_wait ]; then
-                      echo "Timeout reached, forcing shutdown with $active_conns remaining connections"
-                    fi
+                    [ $elapsed -ge $max_wait ] && echo "Timeout: $active_reqs active requests remain"
+                    # Always succeed so a failed test above does not surface as a preStop hook failure
+                    exit 0
           command: ["envoy"]
           args:
             - "-c"
@@ -233,6 +218,7 @@ spec:
             initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
             periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
             failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
+            timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
             httpGet:
               path: /ready
               port: admin
@@ -241,11 +227,12 @@ spec:
             initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
             periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
             failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
+            timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
             httpGet:
               path: /ready
               port: admin
               scheme: HTTP
           resources:
 {{ toYaml .Values.envoy.resources | indent 12 }}
   strategy:
-    type: RollingUpdate
+{{ toYaml .Values.strategy | indent 4 }}
@@ -224,12 +224,17 @@ data:
                             allow_credentials: true
                             max_age: "86400"
                         routes:
-                          # Health check endpoint
+                          # Health check endpoints
                           - match:
                               prefix: /health
                             route:
                               cluster: api
                               timeout: 35s
+                          # Readiness check: routed to the api cluster with the same 35s timeout as /health
+                          - match:
+                              prefix: /ready
+                            route:
+                              cluster: api
+                              timeout: 35s
                           # API REST v1 Gateway routes (Deprecated)
                           - match:
                               prefix: /v1/gateway
 
@@ -48,6 +48,11 @@ nodeSelector: {}
 pdb:
   enabled:
   maxUnavailable: 20%
+# Deployment update strategy; rendered as-is under "strategy:" by templates/deployment.yaml (toYaml).
+strategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxUnavailable: 10%
+    maxSurge: 1
 hpa:
   enabled:
   minReplicas:
@@ -90,17 +95,18 @@ ingress:
           name: api
           port:
             number: 9000
+# Pod termination grace period in seconds; templates/deployment.yaml falls back to 30 when this is unset.
+terminationGracePeriodSeconds: 30
+# Probe settings read by templates/deployment.yaml for the liveness and readiness probes.
 health:
   livenessProbe:
-    initialDelaySeconds: 10
-    periodSeconds: 3
+    initialDelaySeconds: 30
+    periodSeconds: 15
     failureThreshold: 5
     timeoutSeconds: 5
   readinessProbe:
-    initialDelaySeconds: 10
+    initialDelaySeconds: 15
     periodSeconds: 3
-    failureThreshold: 2
-    timeoutSeconds: 3
+    # With a threshold of 1, a single failed readiness probe marks the pod unready
+    failureThreshold: 1
+    timeoutSeconds: 5
 resources: {}
 serviceAccount:
   annotations: {}
 
@@ -22,12 +22,7 @@ spec:
       annotations:
         checksum/config: {{ include (print $.Template.BasePath "/envoy-configmap.yaml") . | sha256sum }}
     spec:
-      # Ensure pod has sufficient time for graceful shutdown before SIGKILL
-      # This matches GCP Spot VM termination window (30s) and allows:
-      # - 3s for K8s to detect pod is not ready
-      # - 20s for application graceful shutdown
-      # - 7s safety margin
-      terminationGracePeriodSeconds: 30
+      terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds | default 30 }}
       {{- with .Values.global.image.imagePullSecrets }}
       imagePullSecrets: {{- toYaml . | nindent 8 }}
       {{- end }}
@@ -193,6 +188,7 @@ spec:
             initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
             periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
             failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
+            timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
             httpGet:
               path: /health
               port: service
@@ -201,8 +197,9 @@ spec:
             initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
             periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
             failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
+            timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
             httpGet:
-              path: /health
+              path: /ready
               port: service
               scheme: HTTPS
           resources:
@@ -217,28 +214,18 @@ spec:
                   - "/bin/sh"
                   - "-c"
                   - |
-                    # Step 1: Fail Envoy health check so K8s removes pod from endpoints
-                    wget -O- --post-data='{}' http://localhost:$ENVOY_ADMIN_PORT/healthcheck/fail
-
-                    # Step 2: Wait for all active connections to drain (max 25s)
-                    # Uses Istio pattern: dynamically checks all connections excluding Envoy and TIME-WAIT
+                    # Wait for in-flight requests to finish (idle keep-alive connections are ignored).
+                    # The wait budget stays 5s below terminationGracePeriodSeconds so that Envoy still
+                    # receives SIGTERM and can exit on its own before the kubelet sends SIGKILL.
                     elapsed=0
-                    max_wait=25
+                    max_wait={{ max 1 (sub (int (.Values.terminationGracePeriodSeconds | default 30)) 5) }}
                     while [ $elapsed -lt $max_wait ]; do
-                      # Count active connections excluding Envoy process and TIME-WAIT states
-                      active_conns=$(ss -Htlp state all | grep -vE '(envoy|TIME-WAIT)' | wc -l | xargs)
-                      if [ "$active_conns" -eq 0 ]; then
-                        echo "All connections drained after ${elapsed}s"
-                        break
-                      fi
-                      echo "Waiting for $active_conns connections to drain..."
+                      # Read the active downstream request gauge from the Envoy admin /stats endpoint
+                      active_reqs=$(wget -q -O- http://localhost:{{ .Values.envoy.adminPort }}/stats 2>/dev/null | grep "ingress_http.downstream_rq_active" | awk '{print $2}')
+                      # An empty result (admin endpoint unreachable or stat missing) counts as 0 active requests
+                      [ -z "$active_reqs" ] && active_reqs=0
+                      [ "$active_reqs" -eq 0 ] && echo "All active requests completed in ${elapsed}s" && break
                       sleep 1
                       elapsed=$((elapsed + 1))
                     done
-
-                    if [ $elapsed -ge $max_wait ]; then
-                      echo "Timeout reached, forcing shutdown with $active_conns remaining connections"
-                    fi
+                    [ $elapsed -ge $max_wait ] && echo "Timeout: $active_reqs active requests remain"
+                    # Always succeed so a failed test above does not surface as a preStop hook failure
+                    exit 0
           command: ["envoy"]
           args:
             - "-c"
@@ -264,6 +251,7 @@ spec:
             initialDelaySeconds: {{ .Values.health.livenessProbe.initialDelaySeconds }}
             periodSeconds: {{ .Values.health.livenessProbe.periodSeconds }}
             failureThreshold: {{ .Values.health.livenessProbe.failureThreshold }}
+            timeoutSeconds: {{ .Values.health.livenessProbe.timeoutSeconds }}
             httpGet:
               path: /ready
               port: admin
@@ -272,6 +260,7 @@ spec:
             initialDelaySeconds: {{ .Values.health.readinessProbe.initialDelaySeconds }}
             periodSeconds: {{ .Values.health.readinessProbe.periodSeconds }}
             failureThreshold: {{ .Values.health.readinessProbe.failureThreshold }}
+            timeoutSeconds: {{ .Values.health.readinessProbe.timeoutSeconds }}
             httpGet:
               path: /ready
               port: admin
@@ -301,5 +290,5 @@ spec:
           resources:
 {{ toYaml .Values.httpstan.resources | indent 12 }}
   strategy:
-    type: RollingUpdate
+{{ toYaml .Values.strategy | indent 4 }}
 {{- end }}
@@ -158,7 +158,7 @@ data:
                             - name: :path
                               string_match:
                                 exact: /health
-                          pass_through_mode: false
+                          pass_through_mode: true  # Pass health checks through so the app itself answers /health and /ready
                       - name: envoy.filters.http.router
                         typed_config:
                           "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
 
@@ -59,6 +59,11 @@ nodeSelector: {}
 pdb:
   enabled:
   maxUnavailable: 50%
+# Deployment update strategy; rendered as-is under "strategy:" by templates/deployment.yaml (toYaml).
+strategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxUnavailable: 10%
+    maxSurge: 1
 
 hpa:
   enabled:
@@ -120,15 +125,15 @@ gcpMultiCluster:
 
+# Probe settings read by templates/deployment.yaml for the liveness and readiness probes.
 health:
   livenessProbe:
-    initialDelaySeconds: 10
-    periodSeconds: 3
+    initialDelaySeconds: 30
+    periodSeconds: 15
     failureThreshold: 5
     timeoutSeconds: 5
   readinessProbe:
-    initialDelaySeconds: 10
+    initialDelaySeconds: 15
     periodSeconds: 3
-    failureThreshold: 2
-    timeoutSeconds: 3
+    # With a threshold of 1, a single failed readiness probe marks the pod unready
+    failureThreshold: 1
+    timeoutSeconds: 5
 
 resources: {}
 
@@ -194,3 +199,4 @@ cronjob:
     - name: tag-deleter
       jobId: TagDeleter
       schedule: "0 0 * * *"
+# Pod termination grace period in seconds; templates/deployment.yaml falls back to 30 when this is unset.
+terminationGracePeriodSeconds: 30