Skip to content

Commit 7279089

Browse files
authored
Merge pull request #5696 from Vincent-lau/private/shul2/less-alert
CA-394109: Reduce number of alerts
2 parents b0e0bab + cca43a4 commit 7279089

File tree

8 files changed

+218
-151
lines changed

8 files changed

+218
-151
lines changed

ocaml/idl/datamodel_cluster.ml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,16 @@ let pool_resync =
169169
~params:[(Ref _cluster, "self", "The cluster to resync")]
170170
~lifecycle ~allowed_roles:_R_POOL_OP ~errs:[] ()
171171

172+
(* Datamodel declaration of the [cstack_sync] call. It takes a single cluster
   reference parameter named "self", is hidden from the generated docs, is
   flagged pool-internal, is allowed for the _R_POOL_OP role, and declares no
   errors. *)
let cstack_sync =
173+
call ~name:"cstack_sync"
174+
~doc:
175+
"Sync xapi db with the cluster stack synchronously, and generate alerts \
176+
as needed. Only happens on the coordinator as this is where the cluster \
177+
watcher performs updates."
178+
~params:[(Ref _cluster, "self", "The cluster to sync")]
179+
~hide_from_docs:true ~pool_internal:true ~lifecycle
180+
~allowed_roles:_R_POOL_OP ~errs:[] ()
181+
172182
let t =
173183
create_obj ~name:_cluster ~descr:"Cluster-wide Cluster metadata"
174184
~doccomments:[] ~gen_constructor_destructor:false ~gen_events:true
@@ -245,5 +255,6 @@ let t =
245255
; pool_force_destroy
246256
; pool_destroy
247257
; pool_resync
258+
; cstack_sync
248259
]
249260
()

ocaml/tests/test_cluster.ml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ let test_rpc ~__context call =
9595
Rpc.{success= true; contents= Rpc.String ""; is_notification= false}
9696
| "Cluster_host.get_cluster_config", _ ->
9797
Rpc.{success= true; contents= Rpc.String ""; is_notification= false}
98+
| "Cluster.cstack_sync", [_session; self] ->
99+
let open API in
100+
Xapi_cluster.cstack_sync ~__context ~self:(ref_Cluster_of_rpc self) ;
101+
Rpc.{success= true; contents= Rpc.String ""; is_notification= false}
98102
| name, params ->
99103
Alcotest.failf "Unexpected RPC: %s(%s)" name
100104
(String.concat " " (List.map Rpc.to_string params))

ocaml/xapi/message_forwarding.ml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6419,6 +6419,14 @@ functor
64196419
) ;
64206420
debug "Cluster.pool_resync for host %s" (Ref.string_of host)
64216421
)
6422+
6423+
(* Forwarding wrapper for Cluster.cstack_sync. Logs the call, then hands it to
   [do_op_on] targeting the host returned by [Helpers.get_master], with
   [Local.Cluster.cstack_sync] as the local function and the client call as
   the remote one.
   NOTE(review): presumably [do_op_on] runs [local_fn] directly when this host
   is already the target -- confirm against its definition. *)
let cstack_sync ~__context ~self =
6424+
info "Cluster.cstack_sync cluster %s" (Ref.string_of self) ;
6425+
let local_fn = Local.Cluster.cstack_sync ~self in
6426+
let coor = Helpers.get_master ~__context in
6427+
do_op_on ~local_fn ~__context ~host:coor (fun session_id rpc ->
6428+
Client.Cluster.cstack_sync ~rpc ~session_id ~self
6429+
)
64226430
end
64236431

64246432
module Cluster_host = struct

ocaml/xapi/xapi_cluster.ml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ let create ~__context ~pIF ~cluster_stack ~pool_auto_join ~token_timeout
115115
~verify ;
116116
(* Create the watcher here in addition to resync_host since pool_create
117117
in resync_host only calls cluster_host.create for pool member nodes *)
118-
create_cluster_watcher_on_master ~__context ~host ;
118+
Watcher.create_as_necessary ~__context ~host ;
119119
Xapi_cluster_host_helpers.update_allowed_operations ~__context
120120
~self:cluster_host_ref ;
121121
D.debug "Created Cluster: %s and Cluster_host: %s"
@@ -294,3 +294,10 @@ let pool_resync ~__context ~self:_ =
294294
)
295295
(* If host.clustering_enabled then resync_host should successfully
296296
find or create a matching cluster_host which is also enabled *)
297+
298+
(* Implementation of Cluster.cstack_sync. Does nothing unless
   [Xapi_cluster_helpers.cluster_health_enabled] holds; otherwise logs a debug
   line and calls [Watcher.on_corosync_update] for cluster [self] with a
   single-element list describing the reason for the update. *)
let cstack_sync ~__context ~self =
299+
if Xapi_cluster_helpers.cluster_health_enabled ~__context then (
300+
debug "%s: sync db data with cluster stack" __FUNCTION__ ;
301+
Watcher.on_corosync_update ~__context ~cluster:self
302+
["Updates due to cluster api calls"]
303+
)

ocaml/xapi/xapi_cluster.mli

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,13 @@ val pool_resync : __context:Context.t -> self:API.ref_Cluster -> unit
7474
Cluster_host objects (i.e., one for each host in the pool if the Cluster
7575
has [pool_auto_join] set). If there is a failure, this function must return
7676
an error that enables the administrator to fix the problem. *)
77+
78+
val cstack_sync : __context:Context.t -> self:API.ref_Cluster -> unit
79+
(** [cstack_sync ~__context ~self] is the implementation of the internal XenAPI method
80+
that synchronously performs a diagnostics call to xapi-clusterd and updates the
81+
xapi db according to the result. It is used internally by cluster-host-create/destroy
82+
to generate the correct alerts as a result of those API calls. Alerts caused by
83+
network failures (e.g. a host leaving because its network is down)
84+
are handled separately by the cluster watcher. This call only runs on the coordinator, as that
85+
is where the cluster watcher performs its updates; the watcher shares its update code with
86+
this function. *)

ocaml/xapi/xapi_cluster_helpers.ml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ let corosync3_enabled ~__context =
114114
let restrictions = Db.Pool.get_restrictions ~__context ~self:pool in
115115
List.assoc_opt "restrict_corosync3" restrictions = Some "false"
116116

117-
let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum
117+
let maybe_generate_alert ~__context ~num_hosts ~hosts_left ~hosts_joined ~quorum
118118
=
119119
let generate_alert join cluster_host =
120120
let host = Db.Cluster_host.get_host ~__context ~self:cluster_host in
@@ -148,10 +148,10 @@ let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum
148148
)
149149
in
150150
if cluster_health_enabled ~__context then (
151-
List.iter (generate_alert false) missing_hosts ;
152-
List.iter (generate_alert true) new_hosts ;
151+
List.iter (generate_alert false) hosts_left ;
152+
List.iter (generate_alert true) hosts_joined ;
153153
(* only generate this alert when the number of hosts is decreasing *)
154-
if missing_hosts <> [] && num_hosts <= quorum then
154+
if hosts_left <> [] && num_hosts <= quorum then
155155
let pool = Helpers.get_pool ~__context in
156156
let pool_uuid = Db.Pool.get_uuid ~__context ~self:pool in
157157
let name, priority = Api_messages.cluster_quorum_approaching_lost in

ocaml/xapi/xapi_cluster_host.ml

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
*)
1414

1515
open Xapi_clustering
16-
open Xapi_cluster_helpers
1716
open Ipaddr_rpc_type
1817

1918
module D = Debug.Make (struct let name = "xapi_cluster_host" end)
@@ -55,20 +54,6 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body
5554
raise err
5655
)
5756

58-
let alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts =
59-
let num_hosts = Db.Cluster_host.get_all ~__context |> List.length in
60-
let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in
61-
let quorum = Db.Cluster.get_quorum ~__context ~self:cluster |> Int64.to_int in
62-
maybe_generate_alert ~__context ~missing_hosts ~new_hosts ~num_hosts ~quorum
63-
64-
let alert_for_cluster_host_leave ~__context ~cluster_host =
65-
alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[cluster_host]
66-
~new_hosts:[]
67-
68-
let alert_for_cluster_host_join ~__context ~cluster_host =
69-
alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[]
70-
~new_hosts:[cluster_host]
71-
7257
(* Create xapi db object for cluster_host, resync_host calls clusterd *)
7358
let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
7459
with_clustering_lock __LOC__ (fun () ->
@@ -81,7 +66,6 @@ let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
8166
~enabled:false ~current_operations:[] ~allowed_operations:[]
8267
~other_config:[] ~joined:false ~live:false
8368
~last_update_live:API.Date.epoch ;
84-
alert_for_cluster_host_join ~__context ~cluster_host:ref ;
8569
ref
8670
)
8771

@@ -232,7 +216,7 @@ let resync_host ~__context ~host =
232216
(* If we have just joined, enable will prevent concurrent clustering ops *)
233217
if not (Db.Cluster_host.get_joined ~__context ~self) then (
234218
join_internal ~__context ~self ;
235-
create_cluster_watcher_on_master ~__context ~host ;
219+
Watcher.create_as_necessary ~__context ~host ;
236220
Xapi_observer.initialise_observer ~__context
237221
Xapi_observer_components.Xapi_clusterd
238222
) else if Db.Cluster_host.get_enabled ~__context ~self then (
@@ -269,16 +253,21 @@ let destroy_op ~__context ~self ~force =
269253
(Cluster_client.LocalClient.leave, "destroy")
270254
in
271255
let result = local_fn (rpc ~__context) dbg in
256+
let cluster = Db.Cluster_host.get_cluster ~__context ~self in
272257
match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
273258
| Ok () ->
274-
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
259+
Helpers.call_api_functions ~__context (fun rpc session_id ->
260+
Client.Client.Cluster.cstack_sync ~rpc ~session_id ~self:cluster
261+
) ;
275262
Db.Cluster_host.destroy ~__context ~self ;
276263
debug "Cluster_host.%s was successful" fn_str ;
277264
Xapi_clustering.Daemon.disable ~__context
278265
| Error error ->
279266
warn "Error occurred during Cluster_host.%s" fn_str ;
280267
if force then (
281-
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
268+
Helpers.call_api_functions ~__context (fun rpc session_id ->
269+
Client.Client.Cluster.cstack_sync ~rpc ~session_id ~self:cluster
270+
) ;
282271
let ref_str = Ref.string_of self in
283272
Db.Cluster_host.destroy ~__context ~self ;
284273
debug "Cluster_host %s force destroyed." ref_str
@@ -326,7 +315,9 @@ let forget ~__context ~self =
326315
Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:[] ;
327316
(* must not disable the daemon here, because we declared another unreachable node dead,
328317
* not the current one *)
329-
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
318+
Helpers.call_api_functions ~__context (fun rpc session_id ->
319+
Client.Client.Cluster.cstack_sync ~rpc ~session_id ~self:cluster
320+
) ;
330321
Db.Cluster_host.destroy ~__context ~self ;
331322
debug "Cluster_host.forget was successful"
332323
| Error error ->
@@ -375,7 +366,7 @@ let enable ~__context ~self =
375366
"Cluster_host.enable: xapi-clusterd not running - attempting to start" ;
376367
Xapi_clustering.Daemon.enable ~__context
377368
) ;
378-
create_cluster_watcher_on_master ~__context ~host ;
369+
Watcher.create_as_necessary ~__context ~host ;
379370
Xapi_observer.initialise_observer ~__context
380371
Xapi_observer_components.Xapi_clusterd ;
381372
let verify = Stunnel_client.get_verify_by_default () in

0 commit comments

Comments
 (0)