From 4068f9de99a510d4d4acfce1fd187572ddc77e74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Thu, 7 Nov 2024 17:58:17 +0000 Subject: [PATCH 1/2] CA-401651: stunnel_cache: run the cache expiry code periodically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously it'd only run when we added or removed entries, but on an idle system we'd keep a large number of connections open that we don't need, and this then exceeded the connection limits on the coordinator. Signed-off-by: Edwin Török --- ocaml/libs/stunnel/stunnel_cache.ml | 5 +++-- ocaml/libs/stunnel/stunnel_cache.mli | 6 ++++++ ocaml/xapi/xapi_globs.ml | 2 ++ ocaml/xapi/xapi_periodic_scheduler_init.ml | 4 ++++ 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/ocaml/libs/stunnel/stunnel_cache.ml b/ocaml/libs/stunnel/stunnel_cache.ml index d69fbf10091..54ebe9c53df 100644 --- a/ocaml/libs/stunnel/stunnel_cache.ml +++ b/ocaml/libs/stunnel/stunnel_cache.ml @@ -42,9 +42,9 @@ type endpoint = {host: string; port: int} (* Need to limit the absolute number of stunnels as well as the maximum age *) let max_stunnel = 70 -let max_age = 180. *. 60. (* seconds *) +let max_age = ref (180. *. 60.) (* seconds *) -let max_idle = 5. *. 60. (* seconds *) +let max_idle = ref (5. *. 60.) (* seconds *) (* The add function adds the new stunnel before doing gc, so the cache *) (* can briefly contain one more than maximum. *) @@ -104,6 +104,7 @@ let unlocked_gc () = let to_gc = ref [] in (* Find the ones which are too old *) let now = Unix.gettimeofday () in + let max_age = !max_age and max_idle = !max_idle in Tbl.iter !stunnels (fun idx stunnel -> match Hashtbl.find_opt !times idx with | Some time -> diff --git a/ocaml/libs/stunnel/stunnel_cache.mli b/ocaml/libs/stunnel/stunnel_cache.mli index 9a2923dfcbf..6c581c422ff 100644 --- a/ocaml/libs/stunnel/stunnel_cache.mli +++ b/ocaml/libs/stunnel/stunnel_cache.mli @@ -46,3 +46,9 @@ val flush : unit -> unit val gc : unit -> unit (** GCs old stunnels *) + +val max_age : float ref +(** maximum time a connection is kept in the stunnel cache, counted from the time it got initially added to the cache *) + +val max_idle : float ref +(** maximum time a connection is kept in the stunnel cache, counted from the most recent time it got (re)added to the cache. *) diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index ef3c5ce66a9..6a8e7bb21cd 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -1145,6 +1145,8 @@ let xapi_globs_spec = ; ("conn_limit_tcp", Int conn_limit_tcp) ; ("conn_limit_unix", Int conn_limit_unix) ; ("conn_limit_clientcert", Int conn_limit_clientcert) + ; ("stunnel_cache_max_age", Float Stunnel_cache.max_age) + ; ("stunnel_cache_max_idle", Float Stunnel_cache.max_idle) ; ("export_interval", Float export_interval) ; ("max_spans", Int max_spans) ; ("max_traces", Int max_traces) diff --git a/ocaml/xapi/xapi_periodic_scheduler_init.ml b/ocaml/xapi/xapi_periodic_scheduler_init.ml index 5b49ebcde50..6fc6d0de299 100644 --- a/ocaml/xapi/xapi_periodic_scheduler_init.ml +++ b/ocaml/xapi/xapi_periodic_scheduler_init.ml @@ -114,6 +114,10 @@ let register ~__context = Xapi_host.alert_if_tls_verification_was_emergency_disabled ~__context ) ) ; + let stunnel_period = !Stunnel_cache.max_idle /. 2. in + Xapi_periodic_scheduler.add_to_queue "Check stunnel cache expiry" + (Xapi_periodic_scheduler.Periodic stunnel_period) stunnel_period + Stunnel_cache.gc ; if master && Db.Pool.get_update_sync_enabled ~__context From f9a523dccf805af91e8b4bc942cb5457b0834f93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Thu, 7 Nov 2024 17:47:31 +0000 Subject: [PATCH 2/2] CA-401652: stunnel_cache: set stunnel size limit based on host role MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce separate coordinator_max_stunnel_cache and member_max_stunnel_cache settings, and set these on XAPI startup based on host role. No functional change, the defaults for both match the previous hardcoded value (70). Signed-off-by: Edwin Török --- ocaml/libs/stunnel/stunnel_cache.ml | 9 +++++++-- ocaml/libs/stunnel/stunnel_cache.mli | 5 +++++ ocaml/xapi/xapi.ml | 3 +++ ocaml/xapi/xapi_globs.ml | 6 ++++++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ocaml/libs/stunnel/stunnel_cache.ml b/ocaml/libs/stunnel/stunnel_cache.ml index 54ebe9c53df..be865a216dc 100644 --- a/ocaml/libs/stunnel/stunnel_cache.ml +++ b/ocaml/libs/stunnel/stunnel_cache.ml @@ -40,7 +40,11 @@ let debug = if debug_enabled then debug else ignore_log type endpoint = {host: string; port: int} (* Need to limit the absolute number of stunnels as well as the maximum age *) -let max_stunnel = 70 +let max_stunnel = Atomic.make 70 + +let set_max_stunnel n = + D.info "Setting max_stunnel = %d" n ; + Atomic.set max_stunnel n let max_age = ref (180. *. 60.) (* seconds *) @@ -48,7 +52,7 @@ let max_idle = ref (5. *. 60.) (* seconds *) (* The add function adds the new stunnel before doing gc, so the cache *) (* can briefly contain one more than maximum. *) -let capacity = max_stunnel + 1 +let capacity = Atomic.get max_stunnel + 1 (** An index of endpoints to stunnel IDs *) let index : (endpoint, int list) Hashtbl.t ref = ref (Hashtbl.create capacity) @@ -123,6 +127,7 @@ let unlocked_gc () = debug "%s: found no entry for idx=%d" __FUNCTION__ idx ) ; let num_remaining = List.length all_ids - List.length !to_gc in + let max_stunnel = Atomic.get max_stunnel in if num_remaining > max_stunnel then ( let times' = Hashtbl.fold (fun k v acc -> (k, v) :: acc) !times [] in let times' = diff --git a/ocaml/libs/stunnel/stunnel_cache.mli b/ocaml/libs/stunnel/stunnel_cache.mli index 6c581c422ff..724642d1dc0 100644 --- a/ocaml/libs/stunnel/stunnel_cache.mli +++ b/ocaml/libs/stunnel/stunnel_cache.mli @@ -19,6 +19,11 @@ HTTP 1.1 should be used and the connection should be kept-alive. *) +val set_max_stunnel : int -> unit +(** [set_max_stunnel] set the maximum number of unusued, but cached client stunnel connections. + This should be a low number on pool members, to avoid hitting limits on the coordinator with large pools. + *) + val with_connect : ?use_fork_exec_helper:bool -> ?write_to_log:(string -> unit) diff --git a/ocaml/xapi/xapi.ml b/ocaml/xapi/xapi.ml index 9b87f1de6b5..d1784b50776 100644 --- a/ocaml/xapi/xapi.ml +++ b/ocaml/xapi/xapi.ml @@ -1143,6 +1143,8 @@ let server_init () = ] ; ( match Pool_role.get_role () with | Pool_role.Master -> + Stunnel_cache.set_max_stunnel + !Xapi_globs.coordinator_max_stunnel_cache ; () | Pool_role.Broken -> info "This node is broken; moving straight to emergency mode" ; @@ -1151,6 +1153,7 @@ let server_init () = server_run_in_emergency_mode () | Pool_role.Slave _ -> info "Running in 'Pool Slave' mode" ; + Stunnel_cache.set_max_stunnel !Xapi_globs.member_max_stunnel_cache ; (* Set emergency mode until we actually talk to the master *) Xapi_globs.slave_emergency_mode := true ; (* signal the init script that it should succeed even though we're bust *) diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index 6a8e7bb21cd..af1e06fe7e1 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -1011,6 +1011,10 @@ let header_total_timeout_tcp = ref 60. let max_header_length_tcp = ref 1024 (* Maximum accepted size of HTTP headers in bytes (on TCP only) *) +let coordinator_max_stunnel_cache = ref 70 + +let member_max_stunnel_cache = ref 70 + let conn_limit_tcp = ref 800 let conn_limit_unix = ref 1024 @@ -1142,6 +1146,8 @@ let xapi_globs_spec = ; ("header_read_timeout_tcp", Float header_read_timeout_tcp) ; ("header_total_timeout_tcp", Float header_total_timeout_tcp) ; ("max_header_length_tcp", Int max_header_length_tcp) + ; ("coordinator_max_stunnel_cache", Int coordinator_max_stunnel_cache) + ; ("member_max_stunnel_cache", Int member_max_stunnel_cache) ; ("conn_limit_tcp", Int conn_limit_tcp) ; ("conn_limit_unix", Int conn_limit_unix) ; ("conn_limit_clientcert", Int conn_limit_clientcert)