Skip to content

Commit 195f932

Browse files
authored
Merge pull request #5871 from Vincent-lau/private/shul2/health-poll
CA-395789: Add polling to cluster health state update
2 parents 3c6d064 + 58a1420 commit 195f932

File tree

1 file changed

+24
-13
lines changed

1 file changed

+24
-13
lines changed

ocaml/xapi/xapi_clustering.ml

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -427,11 +427,16 @@ let compute_corosync_max_host_failures ~__context =
427427
corosync_ha_max_hosts
428428

429429
module Watcher = struct
430+
(* Sentinel update passed (as a singleton list) to [on_corosync_update] when
   the cluster change watcher refreshes state after an UPDATES.get timeout,
   rather than in response to updates reported by corosync-notifyd. It only
   changes which debug message [on_corosync_update] logs. *)
let routine_updates = "routine updates"
431+
430432
let on_corosync_update ~__context ~cluster updates =
431-
debug
432-
"%s: Received %d updates from corosync_notifyd, run diagnostics to get \
433-
new state"
434-
__FUNCTION__ (List.length updates) ;
433+
if updates = [routine_updates] then
434+
debug "%s: Perform routine updates" __FUNCTION__
435+
else
436+
debug
437+
"%s: Received %d updates from corosync_notifyd, run diagnostics to get \
438+
new state"
439+
__FUNCTION__ (List.length updates) ;
435440
let m =
436441
Cluster_client.LocalClient.diagnostics (rpc ~__context)
437442
"update quorum api fields with diagnostics"
@@ -535,10 +540,10 @@ module Watcher = struct
535540

536541
let cluster_change_watcher : bool Atomic.t = Atomic.make false
537542

538-
(* this is the time it takes for the update request to time out. It is ok to set
543+
(* This is the time it takes for the update request to time out. It is ok to set
539544
it to a relatively long value since the call will return immediately if there
540-
is an update *)
541-
let cluster_change_interval = Mtime.Span.min
545+
is an update. *)
546+
(* Five minutes. The change watcher passes this as the timeout of UPDATES.get
   and, on timeout, still runs a routine state refresh (CA-395789), so this
   value is also the longest gap between two refreshes when no update
   arrives. *)
let cluster_change_interval = Mtime.Span.(5 * min)
542547

543548
let cluster_stack_watcher : bool Atomic.t = Atomic.make false
544549

@@ -550,21 +555,27 @@ module Watcher = struct
550555
while !Daemon.enabled do
551556
let m =
552557
Cluster_client.LocalClient.UPDATES.get (rpc ~__context)
553-
"call cluster watcher"
558+
"cluster change watcher call"
554559
(Clock.Timer.span_to_s cluster_change_interval)
555560
in
556-
match Idl.IdM.run @@ Cluster_client.IDL.T.get m with
557-
| Ok updates -> (
561+
let find_cluster_and_update updates =
558562
match find_cluster_host ~__context ~host with
559563
| Some ch ->
560564
let cluster = Db.Cluster_host.get_cluster ~__context ~self:ch in
561565
on_corosync_update ~__context ~cluster updates
562566
| None ->
563567
()
564-
)
568+
in
569+
match Idl.IdM.run @@ Cluster_client.IDL.T.get m with
570+
| Ok updates ->
571+
(* Received updates from corosync-notifyd *)
572+
find_cluster_and_update updates
565573
| Error (InternalError "UPDATES.Timeout") ->
566-
(* UPDATES.get timed out, this is normal, now retry *)
567-
()
574+
(* UPDATES.get timed out, this is normal. *)
575+
(* CA-395789: We send a query to xapi-clusterd to fetch the latest state
576+
anyway in case there is a race and the previous update did not give the
577+
most up-to-date information *)
578+
find_cluster_and_update [routine_updates]
568579
| Error (InternalError message) | Error (Unix_error message) ->
569580
warn "%s: Cannot query cluster host updates with error %s"
570581
__FUNCTION__ message

0 commit comments

Comments
 (0)