From e76ea4766caae0d9d0c75c6f05ec57319e76567a Mon Sep 17 00:00:00 2001 From: motoki317 Date: Wed, 20 Nov 2024 14:02:29 +0900 Subject: [PATCH 1/7] refactor: Extract out stub_status scraper for external use --- .../ingress/metric/collectors/nginx_status.go | 94 ++++++++++--------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/internal/ingress/metric/collectors/nginx_status.go b/internal/ingress/metric/collectors/nginx_status.go index f3afdc334d..027a3be237 100644 --- a/internal/ingress/metric/collectors/nginx_status.go +++ b/internal/ingress/metric/collectors/nginx_status.go @@ -17,7 +17,7 @@ limitations under the License. package collectors import ( - "log" + "fmt" "regexp" "strconv" @@ -35,7 +35,10 @@ var ( ) type ( + NginxStatusScraper struct{} + nginxStatusCollector struct { + scraper NginxStatusScraper scrapeChan chan scrapeRequest data *nginxStatusData @@ -47,7 +50,7 @@ type ( connections *prometheus.Desc } - basicStatus struct { + NginxStubStatus struct { // Active total number of active connections Active int // Accepted total number of accepted client connections @@ -65,6 +68,49 @@ type ( } ) +func toInt(data []string, pos int) int { + if len(data) == 0 { + return 0 + } + if pos > len(data) { + return 0 + } + if v, err := strconv.Atoi(data[pos]); err == nil { + return v + } + return 0 +} + +func parse(data string) *NginxStubStatus { + acr := ac.FindStringSubmatch(data) + sahrr := sahr.FindStringSubmatch(data) + readingr := reading.FindStringSubmatch(data) + writingr := writing.FindStringSubmatch(data) + waitingr := waiting.FindStringSubmatch(data) + + return &NginxStubStatus{ + toInt(acr, 1), + toInt(sahrr, 1), + toInt(sahrr, 2), + toInt(sahrr, 3), + toInt(readingr, 1), + toInt(writingr, 1), + toInt(waitingr, 1), + } +} + +func (s *NginxStatusScraper) Scrape() (*NginxStubStatus, error) { + klog.V(3).InfoS("starting scraping socket", "path", nginx.StatusPath) + status, data, err := nginx.NewGetStatusRequest(nginx.StatusPath) + if err != nil { + return nil, fmt.Errorf("obtaining nginx status info: %w", err) + } + if status < 200 || status >= 400 { + return nil, fmt.Errorf("obtaining nginx status info (status %v)", status) + } + return parse(string(data)), nil +} + // NGINXStatusCollector defines a status collector interface type NGINXStatusCollector interface { prometheus.Collector @@ -131,54 +177,14 @@ func (p nginxStatusCollector) Stop() { close(p.scrapeChan) } -func toInt(data []string, pos int) int { - if len(data) == 0 { - return 0 - } - if pos > len(data) { - return 0 - } - if v, err := strconv.Atoi(data[pos]); err == nil { - return v - } - return 0 -} - -func parse(data string) *basicStatus { - acr := ac.FindStringSubmatch(data) - sahrr := sahr.FindStringSubmatch(data) - readingr := reading.FindStringSubmatch(data) - writingr := writing.FindStringSubmatch(data) - waitingr := waiting.FindStringSubmatch(data) - - return &basicStatus{ - toInt(acr, 1), - toInt(sahrr, 1), - toInt(sahrr, 2), - toInt(sahrr, 3), - toInt(readingr, 1), - toInt(writingr, 1), - toInt(waitingr, 1), - } -} - // nginxStatusCollector scrape the nginx status func (p nginxStatusCollector) scrape(ch chan<- prometheus.Metric) { - klog.V(3).InfoS("starting scraping socket", "path", nginx.StatusPath) - status, data, err := nginx.NewGetStatusRequest(nginx.StatusPath) + s, err := p.scraper.Scrape() if err != nil { - log.Printf("%v", err) - klog.Warningf("unexpected error obtaining nginx status info: %v", err) + klog.Warningf("failed to scrape nginx status: %v", err) return } - if status < 200 || status >= 400 { - klog.Warningf("unexpected error obtaining nginx status info (status %v)", status) - return - } - - s := parse(string(data)) - ch <- prometheus.MustNewConstMetric(p.data.connectionsTotal, prometheus.CounterValue, float64(s.Accepted), "accepted") ch <- prometheus.MustNewConstMetric(p.data.connectionsTotal, From 25f5d09ec06b7d72c52dfa46ccfbb560c82a9ad9 Mon Sep 17 00:00:00 2001 From: motoki317 Date: Wed, 20 Nov 2024 14:58:20 +0900 Subject: [PATCH 2/7] Improve shutdown logic: wait until no requests are made Pods in Kubernetes endpoints are expected to shut-down 'gracefully' after receiving SIGTERM - we should keep accepting new connections for a while. This is because Kubernetes updates Service endpoints and sends SIGTERM to pods *in parallel*. See https://github.com/kubernetes/kubernetes/issues/106476 for more detail. --- internal/ingress/controller/nginx.go | 61 ++++++++++++- test/e2e/framework/deployment.go | 4 + .../gracefulshutdown/k8s_async_shutdown.go | 87 +++++++++++++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 test/e2e/gracefulshutdown/k8s_async_shutdown.go diff --git a/internal/ingress/controller/nginx.go b/internal/ingress/controller/nginx.go index 20fad5afb8..773868bdcc 100644 --- a/internal/ingress/controller/nginx.go +++ b/internal/ingress/controller/nginx.go @@ -23,6 +23,7 @@ import ( "errors" "fmt" "io/fs" + "k8s.io/ingress-nginx/internal/ingress/metric/collectors" "net" "net/http" "os" @@ -377,6 +378,63 @@ func (n *NGINXController) Start() { } } +// stopWait waits until no more connections are made to nginx. +// +// This waits until all of following conditions are met: +// - No more requests are made to nginx for the last 5 seconds. +// - 'shutdown-grace-period' seconds have passed after calling this method. +// +// Pods in Kubernetes endpoints are expected to shut-down 'gracefully' after receiving SIGTERM - +// we should keep accepting new connections for a while. This is because Kubernetes updates Service endpoints +// and sends SIGTERM to pods *in parallel*. +// If we don't see new requests for 5 seconds, then we assume that this pod was removed from the upstream endpoints +// (AWS ALB endpoints for example), and proceed with shutdown. +// +// See https://github.com/kubernetes/kubernetes/issues/106476 for more detail on this issue. +func (n *NGINXController) stopWait() { + const checkFrequency = time.Second + const waitUntilNoConnectionsFor = int((5 * time.Second) / checkFrequency) + waitAtLeastUntil := time.Now().Add(time.Duration(n.cfg.ShutdownGracePeriod) * time.Second) + + var scraper collectors.NginxStatusScraper + lastRequests := 0 + noChangeTimes := 0 + + for ; ; time.Sleep(checkFrequency) { + st, err := scraper.Scrape() + if err != nil { + klog.Warningf("failed to scrape nginx status: %v", err) + noChangeTimes = 0 + continue + } + + diff := st.Requests - lastRequests + // We assume that there were no client requests to nginx, if and only if + // there were 0 to 2 increase in handled requests from the last scrape. + // 1 is to account for our own stub_status request from this method, + // and the other 1 is to account for the readinessProbe. + // Note that readinessProbe DO happen even when the pod is terminating. + // See: https://github.com/kubernetes/kubernetes/issues/122824#issuecomment-1899224434 + noChange := 0 <= diff && diff <= 2 + if noChange { + noChangeTimes++ + if noChangeTimes >= waitUntilNoConnectionsFor { + // Safe to proceed shutdown, we are seeing no more client request. + break + } + } else { + noChangeTimes = 0 + } + lastRequests = st.Requests + } + + // Wait at least for the configured duration, if any + delay := waitAtLeastUntil.Sub(time.Now()) + if delay > 0 { + time.Sleep(delay) + } +} + // Stop gracefully stops the NGINX master process. func (n *NGINXController) Stop() error { n.isShuttingDown = true @@ -388,7 +446,8 @@ func (n *NGINXController) Stop() error { return fmt.Errorf("shutdown already in progress") } - time.Sleep(time.Duration(n.cfg.ShutdownGracePeriod) * time.Second) + klog.InfoS("Graceful shutdown - waiting until no more requests are made") + n.stopWait() klog.InfoS("Shutting down controller queues") close(n.stopCh) diff --git a/test/e2e/framework/deployment.go b/test/e2e/framework/deployment.go index d6e59f18a9..144154486f 100644 --- a/test/e2e/framework/deployment.go +++ b/test/e2e/framework/deployment.go @@ -624,6 +624,10 @@ func (f *Framework) ScaleDeploymentToZero(name string) { assert.Nil(ginkgo.GinkgoT(), err, "getting deployment") assert.NotNil(ginkgo.GinkgoT(), d, "expected a deployment but none returned") + err = waitForPodsDeleted(f.KubeClientSet, 2*time.Minute, f.Namespace, &metav1.ListOptions{ + LabelSelector: labelSelectorToString(d.Spec.Selector.MatchLabels), + }) + assert.Nil(ginkgo.GinkgoT(), err, "waiting for no pods") err = WaitForEndpoints(f.KubeClientSet, DefaultTimeout, name, f.Namespace, 0) assert.Nil(ginkgo.GinkgoT(), err, "waiting for no endpoints") } diff --git a/test/e2e/gracefulshutdown/k8s_async_shutdown.go b/test/e2e/gracefulshutdown/k8s_async_shutdown.go new file mode 100644 index 0000000000..cf245dc055 --- /dev/null +++ b/test/e2e/gracefulshutdown/k8s_async_shutdown.go @@ -0,0 +1,87 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gracefulshutdown + +import ( + "context" + "fmt" + "github.com/onsi/ginkgo/v2" + "github.com/stretchr/testify/assert" + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/ingress-nginx/test/e2e/framework" + "net/http" + "strings" + "time" +) + +var _ = framework.IngressNginxDescribe("[Shutdown] Asynchronous shutdown", func() { + f := framework.NewDefaultFramework("k8s-async-shutdown", func(f *framework.Framework) { + f.Namespace = "k8s-async-shutdown" + }) + + host := "async-shutdown" + + ginkgo.BeforeEach(func() { + f.NewSlowEchoDeployment() + }) + + ginkgo.It("should not shut down while still receiving traffic", func() { + defer ginkgo.GinkgoRecover() + + err := f.UpdateIngressControllerDeployment(func(deployment *appsv1.Deployment) error { + // Note: e2e's default terminationGracePeriodSeconds is 1 for some reason, so extend it + grace := int64(300) + deployment.Spec.Template.Spec.TerminationGracePeriodSeconds = &grace + _, err := f.KubeClientSet.AppsV1().Deployments(f.Namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{}) + return err + }) + assert.Nil(ginkgo.GinkgoT(), err, "updating ingress controller deployment") + + f.EnsureIngress(framework.NewSingleIngress(host, "/", host, f.Namespace, framework.SlowEchoService, 80, nil)) + + f.WaitForNginxServer(host, + func(server string) bool { + return strings.Contains(server, "server_name "+host) + }) + + // We need to get pod IP first because after the pod becomes terminating, + // it is removed from Service endpoints, and becomes unable to be discovered by "f.HTTPTestClient()". + ip := f.GetNginxPodIP() + + // Assume that the upstream takes 30 seconds to update its endpoints, + // therefore we are still receiving traffic while shutting down + go func() { + defer ginkgo.GinkgoRecover() + for i := 0; i < 120; i++ { + f.HTTPDumbTestClient(). + GET("/"). + WithURL(fmt.Sprintf("http://%s/", ip)). + WithHeader("Host", host). + Expect(). + Status(http.StatusOK) + + framework.Sleep(250 * time.Millisecond) + } + }() + + start := time.Now() + f.ScaleDeploymentToZero("nginx-ingress-controller") + assert.GreaterOrEqualf(ginkgo.GinkgoT(), int(time.Since(start).Seconds()), 35, + "should take more than 30 + 5 seconds for graceful shutdown") + }) +}) From 3fe18f1870655432e2c403193267aa2fc8074e71 Mon Sep 17 00:00:00 2001 From: motoki317 Date: Wed, 20 Nov 2024 15:03:42 +0900 Subject: [PATCH 3/7] Make shutdown grace periods to more reasonable defaults Note that post-shutdown-grace-period doesn't seem to contribute to graceful shutdown, see https://github.com/kubernetes/ingress-nginx/issues/8095 for discussion. --- pkg/flags/flags.go | 6 ++++-- pkg/util/process/sigterm.go | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pkg/flags/flags.go b/pkg/flags/flags.go index ce24160fdc..8f1ee11902 100644 --- a/pkg/flags/flags.go +++ b/pkg/flags/flags.go @@ -222,9 +222,11 @@ Takes the form ":port". If not provided, no admission controller is starte statusUpdateInterval = flags.Int("status-update-interval", status.UpdateInterval, "Time interval in seconds in which the status should check if an update is required. Default is 60 seconds") - shutdownGracePeriod = flags.Int("shutdown-grace-period", 0, "Seconds to wait after receiving the shutdown signal, before stopping the nginx process.") + shutdownGracePeriod = flags.Int("shutdown-grace-period", 10, "Seconds to wait after receiving the shutdown signal, before stopping the nginx process.") - postShutdownGracePeriod = flags.Int("post-shutdown-grace-period", 10, "Seconds to wait after the nginx process has stopped before controller exits.") + postShutdownGracePeriod = flags.Int("post-shutdown-grace-period", 0, `[IN DEPRECATION] Seconds to wait after the nginx process has stopped before controller exits. +Note that increasing this value doesn't seem to contribute to graceful shutdown of the ingress controller. +If you would like to configure period for accepting requests before shutting down, use 'shutdown-grace-period' instead.'`) deepInspector = flags.Bool("deep-inspect", true, "Enables ingress object security deep inspector") diff --git a/pkg/util/process/sigterm.go b/pkg/util/process/sigterm.go index 1c0d729c12..689947c808 100644 --- a/pkg/util/process/sigterm.go +++ b/pkg/util/process/sigterm.go @@ -17,12 +17,11 @@ limitations under the License. package process import ( + klog "k8s.io/klog/v2" "os" "os/signal" "syscall" "time" - - klog "k8s.io/klog/v2" ) type exiter func(code int) @@ -41,8 +40,11 @@ func HandleSigterm(ngx Controller, delay int, exit exiter) { exitCode = 1 } - klog.Infof("Handled quit, delaying controller exit for %d seconds", delay) - time.Sleep(time.Duration(delay) * time.Second) + if delay > 0 { + klog.Warning("[DEPRECATED] Delaying controller exit for %d seconds", delay) + klog.Warning("[DEPRECATED] 'post-shutdown-grace-period' does not have any effect for graceful shutdown - use 'shutdown-grace-period' flag instead.") + time.Sleep(time.Duration(delay) * time.Second) + } klog.InfoS("Exiting", "code", exitCode) exit(exitCode) From ff45a2a6af7041c89ebd1b9543807a7c91c84120 Mon Sep 17 00:00:00 2001 From: motoki317 Date: Thu, 21 Nov 2024 21:16:04 +0900 Subject: [PATCH 4/7] refactor: wait-shutdown preStop hook is not necessary /wait-shutdown preStop script's only job is to send SIGTERM to nginx-ingress-controller, which is PID 1, so it's the same with or without in Kubernetes environments. See https://github.com/kubernetes/ingress-nginx/issues/6287 for discussion. --- build/build.sh | 10 ------ charts/ingress-nginx/README.md | 2 +- charts/ingress-nginx/values.yaml | 13 +------ cmd/waitshutdown/main.go | 43 ----------------------- rootfs/Dockerfile | 1 - rootfs/Dockerfile-chroot | 1 - test/e2e/gracefulshutdown/grace_period.go | 6 ++-- 7 files changed, 4 insertions(+), 72 deletions(-) delete mode 100644 cmd/waitshutdown/main.go diff --git a/build/build.sh b/build/build.sh index bbcaf78e8e..6d6953d93f 100755 --- a/build/build.sh +++ b/build/build.sh @@ -57,13 +57,3 @@ ${GO_BUILD_CMD} \ -X ${PKG}/version.REPO=${REPO_INFO}" \ -buildvcs=false \ -o "${TARGETS_DIR}/dbg" "${PKG}/cmd/dbg" - -echo "Building ${PKG}/cmd/waitshutdown" - -${GO_BUILD_CMD} \ - -trimpath -ldflags="-buildid= -w -s \ - -X ${PKG}/version.RELEASE=${TAG} \ - -X ${PKG}/version.COMMIT=${COMMIT_SHA} \ - -X ${PKG}/version.REPO=${REPO_INFO}" \ - -buildvcs=false \ - -o "${TARGETS_DIR}/wait-shutdown" "${PKG}/cmd/waitshutdown" \ No newline at end of file diff --git a/charts/ingress-nginx/README.md b/charts/ingress-nginx/README.md index c0b00e56d1..0369d685e9 100644 --- a/charts/ingress-nginx/README.md +++ b/charts/ingress-nginx/README.md @@ -372,7 +372,7 @@ metadata: | controller.keda.triggers | list | `[]` | | | controller.kind | string | `"Deployment"` | Use a `DaemonSet` or `Deployment` | | controller.labels | object | `{}` | Labels to be added to the controller Deployment or DaemonSet and other resources that do not have option to specify labels # | -| controller.lifecycle | object | `{"preStop":{"exec":{"command":["/wait-shutdown"]}}}` | Improve connection draining when ingress controller pod is deleted using a lifecycle hook: With this new hook, we increased the default terminationGracePeriodSeconds from 30 seconds to 300, allowing the draining of connections up to five minutes. If the active connections end before that, the pod will terminate gracefully at that time. To effectively take advantage of this feature, the Configmap feature worker-shutdown-timeout new value is 240s instead of 10s. # | +| controller.lifecycle | object | `{}` | | | controller.livenessProbe.failureThreshold | int | `5` | | | controller.livenessProbe.httpGet.path | string | `"/healthz"` | | | controller.livenessProbe.httpGet.port | int | `10254` | | diff --git a/charts/ingress-nginx/values.yaml b/charts/ingress-nginx/values.yaml index 116adf7ca5..4049f42c54 100644 --- a/charts/ingress-nginx/values.yaml +++ b/charts/ingress-nginx/values.yaml @@ -945,18 +945,7 @@ controller: # annotations: # description: Too many 4XXs # summary: More than 5% of all requests returned 4XX, this requires your attention - # -- Improve connection draining when ingress controller pod is deleted using a lifecycle hook: - # With this new hook, we increased the default terminationGracePeriodSeconds from 30 seconds - # to 300, allowing the draining of connections up to five minutes. - # If the active connections end before that, the pod will terminate gracefully at that time. - # To effectively take advantage of this feature, the Configmap feature - # worker-shutdown-timeout new value is 240s instead of 10s. - ## - lifecycle: - preStop: - exec: - command: - - /wait-shutdown + lifecycle: {} priorityClassName: "" # -- Rollback limit ## diff --git a/cmd/waitshutdown/main.go b/cmd/waitshutdown/main.go deleted file mode 100644 index 7bc50d9cb8..0000000000 --- a/cmd/waitshutdown/main.go +++ /dev/null @@ -1,43 +0,0 @@ -/* -Copyright 2019 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package main - -import ( - "os" - "os/exec" - "time" - - "k8s.io/ingress-nginx/internal/nginx" - "k8s.io/klog/v2" -) - -func main() { - err := exec.Command("bash", "-c", "pkill -SIGTERM -f nginx-ingress-controller").Run() - if err != nil { - klog.ErrorS(err, "terminating ingress controller") - os.Exit(1) - } - - // wait for the NGINX process to terminate - timer := time.NewTicker(time.Second * 1) - for range timer.C { - if !nginx.IsRunning() { - timer.Stop() - break - } - } -} diff --git a/rootfs/Dockerfile b/rootfs/Dockerfile index a04cfe3dec..cfcfe0c69f 100644 --- a/rootfs/Dockerfile +++ b/rootfs/Dockerfile @@ -43,7 +43,6 @@ COPY --chown=www-data:www-data etc /etc COPY --chown=www-data:www-data bin/${TARGETARCH}/dbg / COPY --chown=www-data:www-data bin/${TARGETARCH}/nginx-ingress-controller / -COPY --chown=www-data:www-data bin/${TARGETARCH}/wait-shutdown / # Fix permission during the build to avoid issues at runtime # with volumes (custom templates) diff --git a/rootfs/Dockerfile-chroot b/rootfs/Dockerfile-chroot index b719f2fc32..7f66f1da01 100644 --- a/rootfs/Dockerfile-chroot +++ b/rootfs/Dockerfile-chroot @@ -65,7 +65,6 @@ COPY --chown=www-data:www-data etc /chroot/etc COPY --chown=www-data:www-data bin/${TARGETARCH}/dbg / COPY --chown=www-data:www-data bin/${TARGETARCH}/nginx-ingress-controller / -COPY --chown=www-data:www-data bin/${TARGETARCH}/wait-shutdown / COPY --chown=www-data:www-data nginx-chroot-wrapper.sh /usr/bin/nginx WORKDIR /chroot/etc/nginx diff --git a/test/e2e/gracefulshutdown/grace_period.go b/test/e2e/gracefulshutdown/grace_period.go index 123892f3ac..cd000fa00b 100644 --- a/test/e2e/gracefulshutdown/grace_period.go +++ b/test/e2e/gracefulshutdown/grace_period.go @@ -41,16 +41,14 @@ var _ = framework.IngressNginxDescribe("[Shutdown] Grace period shutdown", func( if strings.Contains(v, "--shutdown-grace-period") { continue } - args = append(args, v) } - args = append(args, "--shutdown-grace-period=90") deployment.Spec.Template.Spec.Containers[0].Args = args - cmds := []string{"/wait-shutdown"} - deployment.Spec.Template.Spec.Containers[0].Lifecycle.PreStop.Exec.Command = cmds + grace := int64(3600) deployment.Spec.Template.Spec.TerminationGracePeriodSeconds = &grace + _, err := f.KubeClientSet.AppsV1().Deployments(f.Namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{}) return err }) From 872d64b05cebf742d55290f66801033db6b11367 Mon Sep 17 00:00:00 2001 From: motoki317 Date: Thu, 21 Nov 2024 22:08:21 +0900 Subject: [PATCH 5/7] Fix lint issues --- internal/ingress/controller/nginx.go | 5 +++-- pkg/util/process/sigterm.go | 5 +++-- test/e2e/gracefulshutdown/k8s_async_shutdown.go | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/internal/ingress/controller/nginx.go b/internal/ingress/controller/nginx.go index 773868bdcc..86bc1b7077 100644 --- a/internal/ingress/controller/nginx.go +++ b/internal/ingress/controller/nginx.go @@ -23,7 +23,6 @@ import ( "errors" "fmt" "io/fs" - "k8s.io/ingress-nginx/internal/ingress/metric/collectors" "net" "net/http" "os" @@ -38,6 +37,8 @@ import ( "time" "unicode" + "k8s.io/ingress-nginx/internal/ingress/metric/collectors" + proxyproto "github.com/armon/go-proxyproto" "github.com/eapache/channels" apiv1 "k8s.io/api/core/v1" @@ -429,7 +430,7 @@ func (n *NGINXController) stopWait() { } // Wait at least for the configured duration, if any - delay := waitAtLeastUntil.Sub(time.Now()) + delay := time.Until(waitAtLeastUntil) if delay > 0 { time.Sleep(delay) } diff --git a/pkg/util/process/sigterm.go b/pkg/util/process/sigterm.go index 689947c808..8f25e2d994 100644 --- a/pkg/util/process/sigterm.go +++ b/pkg/util/process/sigterm.go @@ -17,11 +17,12 @@ limitations under the License. package process import ( - klog "k8s.io/klog/v2" "os" "os/signal" "syscall" "time" + + "k8s.io/klog/v2" ) type exiter func(code int) @@ -41,7 +42,7 @@ func HandleSigterm(ngx Controller, delay int, exit exiter) { } if delay > 0 { - klog.Warning("[DEPRECATED] Delaying controller exit for %d seconds", delay) + klog.Warningf("[DEPRECATED] Delaying controller exit for %d seconds", delay) klog.Warning("[DEPRECATED] 'post-shutdown-grace-period' does not have any effect for graceful shutdown - use 'shutdown-grace-period' flag instead.") time.Sleep(time.Duration(delay) * time.Second) } diff --git a/test/e2e/gracefulshutdown/k8s_async_shutdown.go b/test/e2e/gracefulshutdown/k8s_async_shutdown.go index cf245dc055..9a4bc368d7 100644 --- a/test/e2e/gracefulshutdown/k8s_async_shutdown.go +++ b/test/e2e/gracefulshutdown/k8s_async_shutdown.go @@ -19,14 +19,15 @@ package gracefulshutdown import ( "context" "fmt" + "net/http" + "strings" + "time" + "github.com/onsi/ginkgo/v2" "github.com/stretchr/testify/assert" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/ingress-nginx/test/e2e/framework" - "net/http" - "strings" - "time" ) var _ = framework.IngressNginxDescribe("[Shutdown] Asynchronous shutdown", func() { From 4f43d4a78c4953e062a59387c49e3ab61218eb24 Mon Sep 17 00:00:00 2001 From: motoki317 Date: Fri, 22 Nov 2024 00:48:36 +0900 Subject: [PATCH 6/7] fix: Add failure threshold to graceful shutdown --- internal/ingress/controller/nginx.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/internal/ingress/controller/nginx.go b/internal/ingress/controller/nginx.go index 86bc1b7077..945d6047bf 100644 --- a/internal/ingress/controller/nginx.go +++ b/internal/ingress/controller/nginx.go @@ -400,12 +400,18 @@ func (n *NGINXController) stopWait() { var scraper collectors.NginxStatusScraper lastRequests := 0 noChangeTimes := 0 + failures := 0 + const failureThreshold = 5 for ; ; time.Sleep(checkFrequency) { st, err := scraper.Scrape() if err != nil { klog.Warningf("failed to scrape nginx status: %v", err) - noChangeTimes = 0 + failures++ + if failures >= failureThreshold { + klog.Warningf("giving up graceful shutdown: too many nginx status scrape failures") + break + } continue } From ae42b6aeaa708e373ea126db5355aec0d0bf9311 Mon Sep 17 00:00:00 2001 From: motoki317 Date: Fri, 22 Nov 2024 21:19:18 +0900 Subject: [PATCH 7/7] fix: We do not need to account for readinessProbe requests since the probe is handled within ingress-nginx-controller and does not reach nginx process. --- internal/ingress/controller/nginx.go | 9 +++------ test/e2e/gracefulshutdown/k8s_async_shutdown.go | 4 ++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/internal/ingress/controller/nginx.go b/internal/ingress/controller/nginx.go index 945d6047bf..07978cee59 100644 --- a/internal/ingress/controller/nginx.go +++ b/internal/ingress/controller/nginx.go @@ -417,12 +417,9 @@ func (n *NGINXController) stopWait() { diff := st.Requests - lastRequests // We assume that there were no client requests to nginx, if and only if - // there were 0 to 2 increase in handled requests from the last scrape. - // 1 is to account for our own stub_status request from this method, - // and the other 1 is to account for the readinessProbe. - // Note that readinessProbe DO happen even when the pod is terminating. - // See: https://github.com/kubernetes/kubernetes/issues/122824#issuecomment-1899224434 - noChange := 0 <= diff && diff <= 2 + // there were 0 to 1 increase in handled requests from the last scrape - + // 1 is to account for our own stub_status request from this method. + noChange := 0 <= diff && diff <= 1 if noChange { noChangeTimes++ if noChangeTimes >= waitUntilNoConnectionsFor { diff --git a/test/e2e/gracefulshutdown/k8s_async_shutdown.go b/test/e2e/gracefulshutdown/k8s_async_shutdown.go index 9a4bc368d7..c627de2d3f 100644 --- a/test/e2e/gracefulshutdown/k8s_async_shutdown.go +++ b/test/e2e/gracefulshutdown/k8s_async_shutdown.go @@ -68,7 +68,7 @@ var _ = framework.IngressNginxDescribe("[Shutdown] Asynchronous shutdown", func( // therefore we are still receiving traffic while shutting down go func() { defer ginkgo.GinkgoRecover() - for i := 0; i < 120; i++ { + for i := 0; i < 30; i++ { f.HTTPDumbTestClient(). GET("/"). WithURL(fmt.Sprintf("http://%s/", ip)). @@ -76,7 +76,7 @@ var _ = framework.IngressNginxDescribe("[Shutdown] Asynchronous shutdown", func( Expect(). Status(http.StatusOK) - framework.Sleep(250 * time.Millisecond) + framework.Sleep(time.Second) } }()