Skip to content

Commit 9913e47

Browse files
authored
CA-403379: pre-flight cluster_host state before pool-ha-enable (#7130)
When the chosen HA cluster_stack is corosync (i.e. for a gfs2 heartbeat SR), every pool host must have an enabled, joined cluster_host on the matching cluster stack, and the local (coordinator) host must currently be quorate. Without this preflight, a violation of these conditions only surfaces much later, inside Xha_statefile.check_sr_can_host_statefile, as the misleading SR_NO_PBDS error from pool-ha-enable. This change adds a per-host preflight in `Xapi_ha.enable` that reuses the existing `NO_COMPATIBLE_CLUSTER_HOST`, `CLUSTERING_DISABLED` and `CLUSTER_HOST_NOT_JOINED` errors so the caller can pinpoint exactly which host is the problem. The preflight runs BEFORE the cluster_stack is persisted to the pool DB and the local db, matching the pattern of the existing host_offline check, so a failed precondition does not leak ha_cluster_stack into the pool state. The final assert_cluster_host_quorate call queries xapi-clusterd diagnostics directly rather than reading the Cluster_host.live DB field, which the corosync_notifyd watcher only updates asynchronously and which is reset to false for all hosts on any transient quorum blip.
2 parents 242f6db + e68fa90 commit 9913e47

3 files changed

Lines changed: 51 additions & 0 deletions

File tree

ocaml/xapi/xapi_clustering.ml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,52 @@ let assert_cluster_host_quorate ~__context ~self =
363363
warn "Cannot query cluster host quorate status" ;
364364
handle_error error
365365

366+
(* Pre-flight check for pool-ha-enable when the chosen [cluster_stack] is
   corosync.

   Each host in the pool is checked in turn:
   - it must have a cluster_host, otherwise NO_COMPATIBLE_CLUSTER_HOST is
     raised with the host reference;
   - the cluster of that cluster_host must use [cluster_stack], otherwise
     NO_COMPATIBLE_CLUSTER_HOST is raised with the host reference;
   - it must pass [assert_cluster_host_enabled ~expected:true];
   - it must be joined, otherwise CLUSTER_HOST_NOT_JOINED is raised with the
     cluster_host reference.

   Afterwards quorum is asserted once, for the cluster_host of the local
   coordinator, via [assert_cluster_host_quorate]. If no cluster_host is found
   for the local host at that point, a warning is logged and the quorum check
   is skipped.

   Why: without these conditions the gfs2 heartbeat SR PBD cannot plug, and
   pool-ha-enable would fail later with the misleading SR_NO_PBDS from
   check_sr_can_host_statefile. This is intended to run before
   ha_cluster_stack is persisted, so that a failed precondition does not leak
   it into the pool DB. *)
let assert_pool_ready_for_corosync_ha ~__context ~cluster_stack =
  let localhost = Helpers.get_localhost ~__context in
  (* Error shared by the "no cluster_host" and "wrong stack" cases. *)
  let incompatible host =
    Api_errors.Server_error
      (Api_errors.no_compatible_cluster_host, [Ref.string_of host])
  in
  let check_host host =
    let self =
      match find_cluster_host ~__context ~host with
      | Some cluster_host ->
          cluster_host
      | None ->
          raise (incompatible host)
    in
    let host_stack =
      let cluster = Db.Cluster_host.get_cluster ~__context ~self in
      Db.Cluster.get_cluster_stack ~__context ~self:cluster
    in
    if host_stack <> cluster_stack then raise (incompatible host) ;
    assert_cluster_host_enabled ~__context ~self ~expected:true ;
    let joined = Db.Cluster_host.get_joined ~__context ~self in
    if not joined then
      raise
        (Api_errors.Server_error
           (Api_errors.cluster_host_not_joined, [Ref.string_of self])
        )
  in
  Db.Host.get_all ~__context |> List.iter check_host ;
  (* Quorum is asserted only for the local coordinator. The check goes through
     assert_cluster_host_quorate instead of the Cluster_host.live DB field,
     which the corosync_notifyd watcher only updates asynchronously. The warn
     stays in this function body so that __FUNCTION__ names this function. *)
  match find_cluster_host ~__context ~host:localhost with
  | None ->
      warn "%s: coordinator %s has no cluster_host; skipping quorum check"
        __FUNCTION__ (Ref.string_of localhost)
  | Some self ->
      assert_cluster_host_quorate ~__context ~self
411+
366412
let assert_cluster_host_is_enabled_for_matching_sms ~__context ~host ~sr_sm_type
367413
=
368414
match get_required_cluster_stacks ~__context ~sr_sm_type with

ocaml/xapi/xapi_clustering.mli

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ val get_network_internal :
5252
val assert_cluster_host_enabled :
5353
__context:Context.t -> self:[`Cluster_host] Ref.t -> expected:bool -> unit
5454

55+
(** [assert_pool_ready_for_corosync_ha ~__context ~cluster_stack] is the
    pre-flight check run by pool-ha-enable when the chosen [cluster_stack] is
    corosync. It checks that every host in the pool has a cluster_host whose
    cluster uses [cluster_stack], that this cluster_host is enabled and joined,
    and then asserts quorum for the cluster_host of the local host.

    @raise Api_errors.Server_error
      with [no_compatible_cluster_host] (host reference) when a host has no
      cluster_host or its cluster uses a different stack, and with
      [cluster_host_not_joined] (cluster_host reference) when a cluster_host is
      not joined. Errors raised by [assert_cluster_host_enabled] and
      [assert_cluster_host_quorate] propagate unchanged. *)
val assert_pool_ready_for_corosync_ha :
56+
__context:Context.t -> cluster_stack:string -> unit
57+
5558
val assert_operation_host_target_is_localhost :
5659
__context:Context.t -> host:[`host] Ref.t -> unit
5760

ocaml/xapi/xapi_ha.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,6 +1908,8 @@ let enable __context heartbeat_srs configuration =
19081908
let cluster_stack =
19091909
Cluster_stack_constraints.choose_cluster_stack ~__context
19101910
in
1911+
if cluster_stack = Constants.Ha_cluster_stack.(to_string Corosync) then
1912+
Xapi_clustering.assert_pool_ready_for_corosync_ha ~__context ~cluster_stack ;
19111913
Db.Pool.set_ha_cluster_stack ~__context ~self:pool ~value:cluster_stack ;
19121914
Localdb.put Constants.ha_cluster_stack cluster_stack ;
19131915
(* Steps from 8.7 Enabling HA in Marathon spec:

0 commit comments

Comments
 (0)