Skip to content

Commit 285d31d

Browse files
committed
CA-403379: pre-flight cluster_host state before pool-ha-enable
When the chosen HA cluster_stack is corosync (i.e. for a gfs2 heartbeat SR), every pool host must have an enabled, joined cluster_host on the matching cluster stack, and this host must currently be quorate. Without this preflight, an unmet precondition only surfaces much later inside Xha_statefile.check_sr_can_host_statefile, where pool-ha-enable fails with the misleading SR_NO_PBDS error (CA-417077 / TC7509).

This change adds a per-host preflight in Xapi_ha.enable that reuses the existing NO_COMPATIBLE_CLUSTER_HOST, CLUSTERING_DISABLED and CLUSTER_HOST_NOT_JOINED errors so the caller can pinpoint exactly which host is the problem. The preflight runs BEFORE the cluster_stack is persisted to the pool DB and localdb, matching the pattern of the existing host_offline check, so a failed precondition does not leak ha_cluster_stack into the pool state.

The final assert_cluster_host_quorate call queries xapi-clusterd diagnostics directly rather than reading the Cluster_host.live DB field, which the corosync_notifyd watcher only updates asynchronously and which is reset to false for all hosts on any transient quorum blip.

Signed-off-by: Lunfan Zhang[Lunfan.Zhang] <Lunfan.Zhang@cloud.com>
1 parent 5f6406f commit 285d31d

2 files changed

Lines changed: 47 additions & 0 deletions

File tree

ocaml/xapi/xapi_clustering.mli

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ val get_network_internal :
5252
val assert_cluster_host_enabled :
5353
__context:Context.t -> self:[`Cluster_host] Ref.t -> expected:bool -> unit
5454

55+
val assert_cluster_host_quorate :
56+
__context:Context.t -> self:[`Cluster_host] Ref.t -> unit
57+
5558
val assert_operation_host_target_is_localhost :
5659
__context:Context.t -> host:[`host] Ref.t -> unit
5760

ocaml/xapi/xapi_ha.ml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,6 +1908,50 @@ let enable __context heartbeat_srs configuration =
19081908
let cluster_stack =
19091909
Cluster_stack_constraints.choose_cluster_stack ~__context
19101910
in
1911+
(* when HA uses corosync, every pool host must have an
1912+
enabled, joined cluster_host on the chosen stack and this host
1913+
must currently be quorate; otherwise the gfs2 heartbeat SR's PBD
1914+
cannot plug and pool-ha-enable would later fail with the
1915+
misleading SR_NO_PBDS from check_sr_can_host_statefile. Run this
1916+
before persisting cluster_stack so a failed precondition does not
1917+
leak ha_cluster_stack into the pool DB. *)
1918+
( if cluster_stack = Constants.Ha_cluster_stack.(to_string Corosync) then
1919+
let localhost = Helpers.get_localhost ~__context in
1920+
List.iter
1921+
(fun host ->
1922+
match Xapi_clustering.find_cluster_host ~__context ~host with
1923+
| None ->
1924+
raise
1925+
Api_errors.(
1926+
Server_error (no_compatible_cluster_host, [Ref.string_of host])
1927+
)
1928+
| Some self ->
1929+
let cluster = Db.Cluster_host.get_cluster ~__context ~self in
1930+
let ch_stack =
1931+
Db.Cluster.get_cluster_stack ~__context ~self:cluster
1932+
in
1933+
if ch_stack <> cluster_stack then
1934+
raise
1935+
Api_errors.(
1936+
Server_error
1937+
(no_compatible_cluster_host, [Ref.string_of host])
1938+
) ;
1939+
Xapi_clustering.assert_cluster_host_enabled ~__context ~self
1940+
~expected:true ;
1941+
if not (Db.Cluster_host.get_joined ~__context ~self) then
1942+
raise
1943+
Api_errors.(
1944+
Server_error (cluster_host_not_joined, [Ref.string_of self])
1945+
) ;
1946+
(* On the master: also check live quorum directly via
1947+
xapi-clusterd diagnostics, sidestepping the
1948+
Cluster_host.live DB field which the corosync_notifyd
1949+
watcher only updates asynchronously. *)
1950+
if host = localhost then
1951+
Xapi_clustering.assert_cluster_host_quorate ~__context ~self
1952+
)
1953+
(Db.Host.get_all ~__context)
1954+
) ;
19111955
Db.Pool.set_ha_cluster_stack ~__context ~self:pool ~value:cluster_stack ;
19121956
Localdb.put Constants.ha_cluster_stack cluster_stack ;
19131957
(* Steps from 8.7 Enabling HA in Marathon spec:

0 commit comments

Comments
 (0)