Skip to content

Commit 32d2521

Browse files
committed
CA-403379: pre-flight cluster_host state before pool-ha-enable
When the chosen HA cluster_stack is corosync (i.e. for a gfs2 heartbeat SR) every pool host must have an enabled, joined cluster_host on the matching cluster stack, and this host must currently be quorate. Without this preflight, a missing, disabled, unjoined or non-quorate cluster_host only surfaces much later inside Xha_statefile.check_sr_can_host_statefile, as the misleading SR_NO_PBDS error from pool-ha-enable (CA-417077 / TC7509). This change adds a per-host preflight in Xapi_ha.enable that reuses the existing NO_COMPATIBLE_CLUSTER_HOST, CLUSTERING_DISABLED and CLUSTER_HOST_NOT_JOINED errors so the caller can pinpoint exactly which host is the problem. The preflight runs BEFORE the cluster_stack is persisted to the pool DB and localdb, matching the pattern of the existing host_offline check, so a failed precondition does not leak ha_cluster_stack into the pool state. The final assert_cluster_host_quorate call queries xapi-clusterd diagnostics directly rather than reading the Cluster_host.live DB field, which the corosync_notifyd watcher only updates asynchronously and which is reset to false for all hosts on any transient quorum blip. Signed-off-by: Lunfan Zhang[Lunfan.Zhang] <Lunfan.Zhang@cloud.com>
1 parent 5f6406f commit 32d2521

2 files changed

Lines changed: 49 additions & 0 deletions

File tree

ocaml/xapi/xapi_clustering.mli

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ val get_network_internal :
5252
(** [assert_cluster_host_enabled ~__context ~self ~expected] takes the
    cluster host [self] and the enabled state the caller requires
    ([expected]).
    NOTE(review): presumably raises an API error when the cluster host's
    enabled state differs from [expected] -- the implementation is not
    visible here; confirm against xapi_clustering.ml. *)
val assert_cluster_host_enabled :
5353
__context:Context.t -> self:[`Cluster_host] Ref.t -> expected:bool -> unit
5454

55+
(** [assert_cluster_host_quorate ~__context ~self] is a live quorum check
    for the cluster host [self]. According to the accompanying commit
    description it queries xapi-clusterd diagnostics directly rather than
    reading the [Cluster_host.live] DB field, which is only updated
    asynchronously.
    NOTE(review): presumably raises an API error when the host is not
    quorate -- the exact error is not visible here; confirm against
    xapi_clustering.ml. *)
val assert_cluster_host_quorate :
56+
__context:Context.t -> self:[`Cluster_host] Ref.t -> unit
57+
5558
(** [assert_operation_host_target_is_localhost ~__context ~host] takes the
    host an operation is targeted at.
    NOTE(review): judging by the name, presumably raises unless [host] is
    the local host -- neither the implementation nor a caller is visible
    here; confirm against xapi_clustering.ml. *)
val assert_operation_host_target_is_localhost :
5659
__context:Context.t -> host:[`host] Ref.t -> unit
5760

ocaml/xapi/xapi_ha.ml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,6 +1908,52 @@ let enable __context heartbeat_srs configuration =
19081908
let cluster_stack =
19091909
Cluster_stack_constraints.choose_cluster_stack ~__context
19101910
in
1911+
(* When HA uses corosync, every pool host must have an
1912+
enabled, joined cluster_host on the chosen stack and this host
1913+
must currently be quorate; otherwise the gfs2 heartbeat SR's PBD
1914+
cannot plug and pool-ha-enable would later fail with the
1915+
misleading SR_NO_PBDS from check_sr_can_host_statefile. Run this
1916+
before persisting cluster_stack so a failed precondition does not
1917+
leak ha_cluster_stack into the pool DB. *)
1918+
if cluster_stack = Constants.Ha_cluster_stack.(to_string Corosync) then (
1919+
List.iter
1920+
(fun host ->
1921+
match Xapi_clustering.find_cluster_host ~__context ~host with
1922+
| None ->
1923+
raise
1924+
Api_errors.(
1925+
Server_error (no_compatible_cluster_host, [Ref.string_of host])
1926+
)
1927+
| Some self ->
1928+
let cluster = Db.Cluster_host.get_cluster ~__context ~self in
1929+
let ch_stack =
1930+
Db.Cluster.get_cluster_stack ~__context ~self:cluster
1931+
in
1932+
if ch_stack <> cluster_stack then
1933+
raise
1934+
Api_errors.(
1935+
Server_error (no_compatible_cluster_host, [Ref.string_of host])
1936+
) ;
1937+
Xapi_clustering.assert_cluster_host_enabled ~__context ~self
1938+
~expected:true ;
1939+
if not (Db.Cluster_host.get_joined ~__context ~self) then
1940+
raise
1941+
Api_errors.(
1942+
Server_error (cluster_host_not_joined, [Ref.string_of self])
1943+
)
1944+
)
1945+
(Db.Host.get_all ~__context) ;
1946+
(* Live quorum check on this (master) host: queries xapi-clusterd
1947+
diagnostics directly *)
1948+
match
1949+
Xapi_clustering.find_cluster_host ~__context
1950+
~host:(Helpers.get_localhost ~__context)
1951+
with
1952+
| None ->
1953+
() (* unreachable: covered by the iter above *)
1954+
| Some self ->
1955+
Xapi_clustering.assert_cluster_host_quorate ~__context ~self
1956+
) ;
19111957
Db.Pool.set_ha_cluster_stack ~__context ~self:pool ~value:cluster_stack ;
19121958
Localdb.put Constants.ha_cluster_stack cluster_stack ;
19131959
(* Steps from 8.7 Enabling HA in Marathon spec:

0 commit comments

Comments
 (0)