Skip to content

CA-401650: reduce open connections between pool members and the coordinator #6110

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions ocaml/libs/stunnel/stunnel_cache.ml
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,19 @@ let debug = if debug_enabled then debug else ignore_log
type endpoint = {host: string; port: int}

(* Need to limit the absolute number of stunnels as well as the maximum age *)
let max_stunnel = 70
let max_stunnel = Atomic.make 70

let max_age = 180. *. 60. (* seconds *)
(* Replace the limit on the absolute number of cached stunnels. The new value
   is logged before it is stored in the [max_stunnel] atomic. *)
let set_max_stunnel limit =
  D.info "Setting max_stunnel = %d" limit ;
  Atomic.set max_stunnel limit

let max_idle = 5. *. 60. (* seconds *)
let max_age = ref (180. *. 60.) (* seconds *)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More victims for my mtime changes :)


let max_idle = ref (5. *. 60.) (* seconds *)

(* The add function adds the new stunnel before doing gc, so the cache *)
(* can briefly contain one more than maximum. *)
let capacity = max_stunnel + 1
let capacity = Atomic.get max_stunnel + 1

(** An index of endpoints to stunnel IDs *)
let index : (endpoint, int list) Hashtbl.t ref = ref (Hashtbl.create capacity)
Expand Down Expand Up @@ -104,6 +108,7 @@ let unlocked_gc () =
let to_gc = ref [] in
(* Find the ones which are too old *)
let now = Unix.gettimeofday () in
let max_age = !max_age and max_idle = !max_idle in
Tbl.iter !stunnels (fun idx stunnel ->
match Hashtbl.find_opt !times idx with
| Some time ->
Expand All @@ -122,6 +127,7 @@ let unlocked_gc () =
debug "%s: found no entry for idx=%d" __FUNCTION__ idx
) ;
let num_remaining = List.length all_ids - List.length !to_gc in
let max_stunnel = Atomic.get max_stunnel in
if num_remaining > max_stunnel then (
let times' = Hashtbl.fold (fun k v acc -> (k, v) :: acc) !times [] in
let times' =
Expand Down
11 changes: 11 additions & 0 deletions ocaml/libs/stunnel/stunnel_cache.mli
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
HTTP 1.1 should be used and the connection should be kept-alive.
*)

val set_max_stunnel : int -> unit
(** [set_max_stunnel n] sets the maximum number of unused, but cached, client
    stunnel connections to [n]. This should be a low number on pool members,
    to avoid hitting limits on the coordinator with large pools. *)

val with_connect :
?use_fork_exec_helper:bool
-> ?write_to_log:(string -> unit)
Expand Down Expand Up @@ -46,3 +51,9 @@ val flush : unit -> unit

val gc : unit -> unit
(** GCs old stunnels *)

val max_age : float ref
(** Maximum time, in seconds, a connection is kept in the stunnel cache,
    counted from the time it was initially added to the cache. *)

val max_idle : float ref
(** Maximum time, in seconds, a connection is kept in the stunnel cache,
    counted from the most recent time it was (re)added to the cache. *)
3 changes: 3 additions & 0 deletions ocaml/xapi/xapi.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1143,6 +1143,8 @@ let server_init () =
] ;
( match Pool_role.get_role () with
| Pool_role.Master ->
Stunnel_cache.set_max_stunnel
!Xapi_globs.coordinator_max_stunnel_cache ;
()
| Pool_role.Broken ->
info "This node is broken; moving straight to emergency mode" ;
Expand All @@ -1151,6 +1153,7 @@ let server_init () =
server_run_in_emergency_mode ()
| Pool_role.Slave _ ->
info "Running in 'Pool Slave' mode" ;
Stunnel_cache.set_max_stunnel !Xapi_globs.member_max_stunnel_cache ;
(* Set emergency mode until we actually talk to the master *)
Xapi_globs.slave_emergency_mode := true ;
(* signal the init script that it should succeed even though we're bust *)
Expand Down
8 changes: 8 additions & 0 deletions ocaml/xapi/xapi_globs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,10 @@ let header_total_timeout_tcp = ref 60.
let max_header_length_tcp = ref 1024
(* Maximum accepted size of HTTP headers in bytes (on TCP only) *)

let coordinator_max_stunnel_cache = ref 70
(* Value passed to Stunnel_cache.set_max_stunnel during server_init when this
   host has the Pool_role.Master role. Registered in xapi_globs_spec under the
   key "coordinator_max_stunnel_cache". *)

let member_max_stunnel_cache = ref 70
(* Value passed to Stunnel_cache.set_max_stunnel during server_init when this
   host has the Pool_role.Slave role. Registered in xapi_globs_spec under the
   key "member_max_stunnel_cache". *)

let conn_limit_tcp = ref 800

let conn_limit_unix = ref 1024
Expand Down Expand Up @@ -1142,9 +1146,13 @@ let xapi_globs_spec =
; ("header_read_timeout_tcp", Float header_read_timeout_tcp)
; ("header_total_timeout_tcp", Float header_total_timeout_tcp)
; ("max_header_length_tcp", Int max_header_length_tcp)
; ("coordinator_max_stunnel_cache", Int coordinator_max_stunnel_cache)
; ("member_max_stunnel_cache", Int member_max_stunnel_cache)
; ("conn_limit_tcp", Int conn_limit_tcp)
; ("conn_limit_unix", Int conn_limit_unix)
; ("conn_limit_clientcert", Int conn_limit_clientcert)
; ("stunnel_cache_max_age", Float Stunnel_cache.max_age)
; ("stunnel_cache_max_idle", Float Stunnel_cache.max_idle)
; ("export_interval", Float export_interval)
; ("max_spans", Int max_spans)
; ("max_traces", Int max_traces)
Expand Down
4 changes: 4 additions & 0 deletions ocaml/xapi/xapi_periodic_scheduler_init.ml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ let register ~__context =
Xapi_host.alert_if_tls_verification_was_emergency_disabled ~__context
)
) ;
let stunnel_period = !Stunnel_cache.max_idle /. 2. in
Xapi_periodic_scheduler.add_to_queue "Check stunnel cache expiry"
(Xapi_periodic_scheduler.Periodic stunnel_period) stunnel_period
Stunnel_cache.gc ;
if
master
&& Db.Pool.get_update_sync_enabled ~__context
Expand Down
Loading