Skip to content

Commit 4e11074

Browse files
authored
Allow configuring nginx worker reload behaviour, to prevent multiple concurrent worker reloads which can lead to high resource usage and OOMKill (#10884)
* feat: allow configuring nginx worker reload behaviour, to prevent multiple concurrent worker reloads Signed-off-by: Rafael da Fonseca <rafael.fonseca@wildlifestudios.com> * appease linter, remove unnecessary log line Signed-off-by: Rafael da Fonseca <rafael.fonseca@wildlifestudios.com> * Flip to using a positive behaviour flag instead of negative Signed-off-by: Rafael da Fonseca <rafael.fonseca@wildlifestudios.com> * Update helm-docs Signed-off-by: Rafael da Fonseca <rafael.fonseca@wildlifestudios.com> * Avoid calling GetBackendConfiguration() twice, use clearer name for helm chart option Signed-off-by: Rafael da Fonseca <rafael.fonseca@wildlifestudios.com> * Fix helm-docs ordering Signed-off-by: Rafael da Fonseca <rafael.fonseca@wildlifestudios.com> --------- Signed-off-by: Rafael da Fonseca <rafael.fonseca@wildlifestudios.com>
1 parent 689b993 commit 4e11074

File tree

7 files changed

+69
-3
lines changed

7 files changed

+69
-3
lines changed

charts/ingress-nginx/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ As of version `1.26.0` of this chart, by simply not providing any clusterIP valu
301301
| controller.enableAnnotationValidations | bool | `false` | |
302302
| controller.enableMimalloc | bool | `true` | Enable mimalloc as a drop-in replacement for malloc. # ref: https://github.com/microsoft/mimalloc # |
303303
| controller.enableTopologyAwareRouting | bool | `false` | This configuration enables Topology Aware Routing feature, used together with service annotation service.kubernetes.io/topology-mode="auto" Defaults to false |
304+
| controller.enableWorkerSerialReloads | bool | `false` | This configuration defines if NGINX workers should reload serially instead of concurrently when multiple changes that require reloads are queued |
304305
| controller.existingPsp | string | `""` | Use an existing PSP instead of creating one |
305306
| controller.extraArgs | object | `{}` | Additional command line arguments to pass to Ingress-Nginx Controller E.g. to specify the default SSL certificate you can use |
306307
| controller.extraContainers | list | `[]` | Additional containers to be added to the controller pod. See https://github.com/lemonldap-ng-controller/lemonldap-ng-controller as example. |

charts/ingress-nginx/templates/controller-configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ metadata:
1414
namespace: {{ include "ingress-nginx.namespace" . }}
1515
data:
1616
allow-snippet-annotations: "{{ .Values.controller.allowSnippetAnnotations }}"
17+
enable-serial-reloads: "{{ .Values.controller.enableWorkerSerialReloads }}"
1718
{{- if .Values.controller.addHeaders }}
1819
add-headers: {{ include "ingress-nginx.namespace" . }}/{{ include "ingress-nginx.fullname" . }}-custom-add-headers
1920
{{- end }}

charts/ingress-nginx/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ controller:
9393
# when users add those annotations.
9494
# Global snippets in ConfigMap are still respected
9595
allowSnippetAnnotations: false
96+
# -- This configuration defines if NGINX workers should reload serially instead of concurrently when multiple changes that require reloads are queued
97+
enableWorkerSerialReloads: false
9698
# -- Required for use with CNI based kubernetes installations (such as ones set up by kubeadm),
9799
# since CNI and hostport don't mix yet. Can be deprecated once https://github.com/kubernetes/kubernetes/issues/23920
98100
# is merged

docs/user-guide/nginx-configuration/configmap.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ The following table shows a configuration option's name, type, and the default v
114114
|[worker-processes](#worker-processes)| string | `<Number of CPUs>` ||
115115
|[worker-cpu-affinity](#worker-cpu-affinity)| string | "" ||
116116
|[worker-shutdown-timeout](#worker-shutdown-timeout)| string | "240s" ||
117+
|[enable-serial-reloads](#enable-serial-reloads)|bool|"false"||
117118
|[load-balance](#load-balance)| string | "round_robin" ||
118119
|[variables-hash-bucket-size](#variables-hash-bucket-size)| int | 128 ||
119120
|[variables-hash-max-size](#variables-hash-max-size)| int | 2048 ||

internal/ingress/controller/config/config.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,13 @@ type Configuration struct {
477477
// http://nginx.org/en/docs/ngx_core_module.html#worker_processes
478478
WorkerProcesses string `json:"worker-processes,omitempty"`
479479

480+
// Defines whether multiple concurrent reloads of worker processes should occur.
481+
// Set this to false to prevent more than n x 2 workers to exist at any time, to avoid potential OOM situations and high CPU load
482+
// With this setting on false, configuration changes in the queue will be re-queued with an exponential backoff, until the number of worker process is the expected value.
483+
// By default new worker processes are spawned every time there's a change that cannot be applied dynamically with no upper limit to the number of running workers
484+
// http://nginx.org/en/docs/ngx_core_module.html#worker_processes
485+
WorkerSerialReloads bool `json:"enable-serial-reloads,omitempty"`
486+
480487
// Defines a timeout for a graceful shutdown of worker processes
481488
// http://nginx.org/en/docs/ngx_core_module.html#worker_shutdown_timeout
482489
WorkerShutdownTimeout string `json:"worker-shutdown-timeout,omitempty"`
@@ -851,6 +858,7 @@ func NewDefault() Configuration {
851858
UseGeoIP2: false,
852859
GeoIP2AutoReloadMinutes: 0,
853860
WorkerProcesses: strconv.Itoa(runtime.NumCPU()),
861+
WorkerSerialReloads: false,
854862
WorkerShutdownTimeout: "240s",
855863
VariablesHashBucketSize: 256,
856864
VariablesHashMaxSize: 2048,

internal/ingress/controller/nginx.go

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"syscall"
3636
"text/template"
3737
"time"
38+
"unicode"
3839

3940
proxyproto "github.com/armon/go-proxyproto"
4041
"github.com/eapache/channels"
@@ -87,9 +88,10 @@ func NewNGINXController(config *Configuration, mc metric.Collector) *NGINXContro
8788
n := &NGINXController{
8889
isIPV6Enabled: ing_net.IsIPv6Enabled(),
8990

90-
resolver: h,
91-
cfg: config,
92-
syncRateLimiter: flowcontrol.NewTokenBucketRateLimiter(config.SyncRateLimit, 1),
91+
resolver: h,
92+
cfg: config,
93+
syncRateLimiter: flowcontrol.NewTokenBucketRateLimiter(config.SyncRateLimit, 1),
94+
workersReloading: false,
9395

9496
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, apiv1.EventSource{
9597
Component: "nginx-ingress-controller",
@@ -229,6 +231,8 @@ type NGINXController struct {
229231

230232
syncRateLimiter flowcontrol.RateLimiter
231233

234+
workersReloading bool
235+
232236
// stopLock is used to enforce that only a single call to Stop send at
233237
// a given time. We allow stopping through an HTTP endpoint and
234238
// allowing concurrent stoppers leads to stack traces.
@@ -676,6 +680,11 @@ func (n *NGINXController) OnUpdate(ingressCfg ingress.Configuration) error {
676680
cfg := n.store.GetBackendConfiguration()
677681
cfg.Resolver = n.resolver
678682

683+
workerSerialReloads := cfg.WorkerSerialReloads
684+
if workerSerialReloads && n.workersReloading {
685+
return errors.New("worker reload already in progress, requeuing reload")
686+
}
687+
679688
content, err := n.generateTemplate(cfg, ingressCfg)
680689
if err != nil {
681690
return err
@@ -738,9 +747,41 @@ func (n *NGINXController) OnUpdate(ingressCfg ingress.Configuration) error {
738747
return fmt.Errorf("%v\n%v", err, string(o))
739748
}
740749

750+
// Reload status checking runs in a separate goroutine to avoid blocking the sync queue
751+
if workerSerialReloads {
752+
go n.awaitWorkersReload()
753+
}
754+
741755
return nil
742756
}
743757

758+
// awaitWorkersReload checks if the number of workers has returned to the expected count
759+
func (n *NGINXController) awaitWorkersReload() {
760+
n.workersReloading = true
761+
defer func() { n.workersReloading = false }()
762+
763+
expectedWorkers := n.store.GetBackendConfiguration().WorkerProcesses
764+
var numWorkers string
765+
klog.V(3).Infof("waiting for worker count to be equal to %s", expectedWorkers)
766+
for numWorkers != expectedWorkers {
767+
time.Sleep(time.Second)
768+
o, err := exec.Command("/bin/sh", "-c", "pgrep worker | wc -l").Output()
769+
if err != nil {
770+
klog.ErrorS(err, numWorkers)
771+
return
772+
}
773+
// cleanup any non-printable chars from shell output
774+
numWorkers = strings.Map(func(r rune) rune {
775+
if unicode.IsPrint(r) {
776+
return r
777+
}
778+
return -1
779+
}, string(o))
780+
781+
klog.V(3).Infof("Currently running nginx worker processes: %s, expected %s", numWorkers, expectedWorkers)
782+
}
783+
}
784+
744785
// nginxHashBucketSize computes the correct NGINX hash_bucket_size for a hash
745786
// with the given longest key.
746787
func nginxHashBucketSize(longestString int) int {

internal/ingress/controller/template/configmap.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ const (
6969
luaSharedDictsKey = "lua-shared-dicts"
7070
plugins = "plugins"
7171
debugConnections = "debug-connections"
72+
workerSerialReloads = "enable-serial-reloads"
7273
)
7374

7475
var (
@@ -404,6 +405,17 @@ func ReadConfig(src map[string]string) config.Configuration {
404405
delete(conf, workerProcesses)
405406
}
406407

408+
if val, ok := conf[workerSerialReloads]; ok {
409+
boolVal, err := strconv.ParseBool(val)
410+
if err != nil {
411+
to.WorkerSerialReloads = false
412+
klog.Warningf("failed to parse enable-serial-reloads setting, valid values are true or false, found %s", val)
413+
} else {
414+
to.WorkerSerialReloads = boolVal
415+
}
416+
delete(conf, workerSerialReloads)
417+
}
418+
407419
if val, ok := conf[plugins]; ok {
408420
to.Plugins = splitAndTrimSpace(val, ",")
409421
delete(conf, plugins)

0 commit comments

Comments
 (0)