Skip to content

CA-394343: After clock jump the xapi assumed the host is HOST_OFFLINE #5700

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 21 additions & 35 deletions ocaml/xapi/db_gc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ let use_host_heartbeat_for_liveness = ref true

let use_host_heartbeat_for_liveness_m = Mutex.create ()

let host_heartbeat_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16
let host_heartbeat_table : (API.ref_host, Clock.Timer.t) Hashtbl.t =
Hashtbl.create 16

let host_skew_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16

Expand Down Expand Up @@ -77,45 +78,24 @@ let detect_clock_skew ~__context host skew =
(* Master compares the database with the in-memory host heartbeat table and sets the live flag accordingly.
Called with the use_host_heartbeat_for_liveness_m and use_host_heartbeat_for_liveness is true (ie non-HA mode) *)
let check_host_liveness ~__context =
(* Check for rolling upgrade mode - if so, use host metrics for liveness else use hashtbl *)
let rum =
try Helpers.rolling_upgrade_in_progress ~__context with _ -> false
in
(* CA-16351: when performing the initial GC pass on first boot there won't be a localhost *)
let localhost = try Helpers.get_localhost ~__context with _ -> Ref.null in
(* Look for "true->false" transition on Host_metrics.live *)
let check_host host =
if host <> localhost then
try
let hmetric = Db.Host.get_metrics ~__context ~self:host in
let live = Db.Host_metrics.get_live ~__context ~self:hmetric in
(* See if the host is using the new HB mechanism, if so we'll use that *)
let new_heartbeat_time =
let timer =
with_lock host_table_m (fun () ->
Option.value
(Hashtbl.find_opt host_heartbeat_table host)
~default:Clock.Date.(epoch |> to_unix_time)
match Hashtbl.find_opt host_heartbeat_table host with
| Some x ->
x
| None ->
Clock.Timer.start
~duration:!Xapi_globs.host_assumed_dead_interval
Comment on lines +93 to +95
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in case of None, it means the host has never tickle heartbeat? Then we should consider it as expired?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

287 let start_db_gc_thread () =
288   Thread.create
289     (fun () ->
290       Debug.with_thread_named "db_gc"
291         (fun () ->
292           while true do
293             try Thread.delay db_GC_TIMER ; single_pass ()
294             with e ->
295               debug "Exception in DB GC thread: %s" (ExnHelper.string_of_exn e)
296           done
297         )
298         ()
299     )
300     ()
301

For
293 try Thread.delay db_GC_TIMER ; single_pass ()
As we use a timer to check heartbeat and a default timer for each host, I think here we should start single_pass immediately.

)
in
let old_heartbeat_time =
if
rum
&& Xapi_version.platform_version ()
<> Helpers.version_string_of ~__context (Helpers.LocalObject host)
then (
debug
"Host %s considering using metrics last update time as heartbeat"
(Ref.string_of host) ;
Date.to_float
(Db.Host_metrics.get_last_updated ~__context ~self:hmetric)
) else
0.0
in
(* Use whichever value is the most recent to determine host liveness *)
let host_time = max old_heartbeat_time new_heartbeat_time in
let now = Unix.gettimeofday () in
(* we can now compare 'host_time' with 'now' *)
if now -. host_time < !Xapi_globs.host_assumed_dead_interval then
if not (Clock.Timer.has_expired timer) then
(* From the heartbeat PoV the host looks alive. We try to (i) minimise database sets; and (ii)
avoid toggling the host back to live if it has been marked as shutting_down. *)
with_lock Xapi_globs.hosts_which_are_shutting_down_m (fun () ->
Expand All @@ -131,10 +111,14 @@ let check_host_liveness ~__context =
)
)
else if live then (
let host_name_label = Db.Host.get_name_label ~__context ~self:host in
let host_uuid = Db.Host.get_uuid ~__context ~self:host in
let elapsed = Clock.Timer.elapsed timer in
debug
"Assuming host is offline since the heartbeat/metrics haven't been \
updated for %.2f seconds; setting live to false"
(now -. host_time) ;
"Assuming host '%s' (%s) is offline since the heartbeat hasn't \
been updated for %s seconds; setting live to false"
host_name_label host_uuid
(Clock.Timer.span_to_s elapsed |> string_of_float) ;
Db.Host_metrics.set_live ~__context ~self:hmetric ~value:false ;
Xapi_host_helpers.update_allowed_operations ~__context ~self:host
) ;
Expand Down Expand Up @@ -252,16 +236,18 @@ let tickle_heartbeat ~__context host stuff =
let reason = Xapi_hooks.reason__clean_shutdown in
if use_host_heartbeat_for_liveness then
Xapi_host_helpers.mark_host_as_dead ~__context ~host ~reason
) else
) else (
Hashtbl.replace host_heartbeat_table host
(Clock.Timer.start ~duration:!Xapi_globs.host_assumed_dead_interval) ;
let now = Unix.gettimeofday () in
Hashtbl.replace host_heartbeat_table host now ;
(* compute the clock skew for later analysis *)
if List.mem_assoc _time stuff then
try
let slave = float_of_string (List.assoc _time stuff) in
let skew = abs_float (now -. slave) in
Hashtbl.replace host_skew_table host skew
with _ -> ()
)
) ;
[]

Expand Down
6 changes: 4 additions & 2 deletions ocaml/xapi/xapi_globs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ let snapshot_with_quiesce_timeout = ref 600.
let host_heartbeat_interval = ref 30.

(* If we haven't heard a heartbeat from a host for this interval then the host is assumed dead *)
let host_assumed_dead_interval = ref 600.0
let host_assumed_dead_interval = ref Mtime.Span.(10 * min)

(* If a session has a last_active older than this we delete it *)
let inactive_session_timeout = ref 86400. (* 24 hrs in seconds *)
Expand Down Expand Up @@ -1070,7 +1070,9 @@ let xapi_globs_spec =
; ("wait_memory_target_timeout", Float wait_memory_target_timeout)
; ("snapshot_with_quiesce_timeout", Float snapshot_with_quiesce_timeout)
; ("host_heartbeat_interval", Float host_heartbeat_interval)
; ("host_assumed_dead_interval", Float host_assumed_dead_interval)
; ( "host_assumed_dead_interval"
, LongDurationFromSeconds host_assumed_dead_interval
)
; ("fuse_time", Float Constants.fuse_time)
; ("db_restore_fuse_time", Float Constants.db_restore_fuse_time)
; ("inactive_session_timeout", Float inactive_session_timeout)
Expand Down
2 changes: 1 addition & 1 deletion ocaml/xapi/xapi_ha.ml
Original file line number Diff line number Diff line change
Expand Up @@ -837,7 +837,7 @@ module Monitor = struct
(ExnHelper.string_of_exn e) ;
Thread.delay !Xapi_globs.ha_monitor_interval
done ;
debug "Re-enabling old Host_metrics.live heartbeat" ;
debug "Re-enabling host heartbeat" ;
with_lock Db_gc.use_host_heartbeat_for_liveness_m (fun () ->
Db_gc.use_host_heartbeat_for_liveness := true
) ;
Expand Down
Loading