6161#include "cluster/cluster_inject.h" /* CLUSTER_INJECTION_POINT (D12) */
6262#include "cluster/cluster_pcm_lock.h" /* PCM release-all-self + no-leftover verify (D5) */
6363#include "cluster/cluster_qvotec.h" /* cluster_qvotec_in_quorum (request gate) */
64- #include "cluster/cluster_reconfig.h" /* apply_clean_leave + record/is_clean_departed */
64+ #include "cluster/cluster_reconfig.h" /* apply_clean_leave + record/is_clean_departed + join_in_progress */
65+ #include "cluster/cluster_membership.h" /* v1.0.4 — cluster_membership_is_member (P1-2 INV-J8) */
6566#include "cluster/cluster_shmem.h"
6667#include "cluster/cluster_voting_disk_io.h" /* leave-slot raw I/O + CLUSTER_VOTING_SLOT_BYTES */
6768
@@ -243,6 +244,30 @@ cluster_clean_leave_node_refuses_writes(void)
243244 return phase >= CLUSTER_LEAVE_REQUESTED && phase <= CLUSTER_LEAVE_COMMITTED ;
244245}
245246
247+ /*
248+ * cluster_clean_leave_in_progress -- is THIS node participating in any clean
249+ * leave right now (as the leaver OR as a survivor tracking someone else's
250+ * leave, OR mid-request before it has bound its slot)? Hardening v1.0.4
251+ * (P1-1/P2): the spec-5.15 online-join driver consults this to enforce "one
252+ * membership reconfig at a time" — it must NOT start or commit a join (which
253+ * bumps the epoch with dead_gen unchanged, indistinguishable from a clean-leave
254+ * commit, CL barrier-observe hang) while a clean leave is active anywhere this
255+ * node can see it. Cheap: a few unlocked atomic reads; the authoritative race
256+ * resolution is the re-check the join driver does under the reconfig lock at its
257+ * own commit point, plumbed symmetrically with cluster_reconfig_join_in_progress.
258+ */
259+ bool
260+ cluster_clean_leave_in_progress (void )
261+ {
262+ if (cl_state == NULL )
263+ return false;
264+ if (pg_atomic_read_u32 (& cl_state -> request_in_progress ) != 0 )
265+ return true; /* mid-request: reserved before phase/leaving_node_id are set */
266+ if (cl_state -> leaving_node_id != -1 )
267+ return true; /* leaver or survivor tracking a leave */
268+ return pg_atomic_read_u32 (& cl_state -> phase ) != CLUSTER_LEAVE_IDLE ;
269+ }
270+
246271/*
247272 * cluster_clean_leave_survivor_ack -- a survivor signals readiness (§3.2 step 3,
248273 * F1 PRE-epoch). Sent to the leaving node AFTER this survivor has dropped all
@@ -382,6 +407,30 @@ cl_announce_handler(const ClusterICEnvelope *env, const void *payload)
382407 return ;
383408 }
384409
410+ /*
411+ * Hardening v1.0.4 (P1-1 + P1-2 + P2) — decided BEFORE the preflight/real split
412+ * so it gates BOTH; gating only the real announce is NOT enough (the leaver would
413+ * pass preflight, broadcast the real announce, and OTHER survivors would accept +
414+ * track it before this node's NAK arrived, then the leaver clean-aborts, leaving
415+ * those survivors tracking an aborted leave).
416+ * - Non-MEMBER (INV-J8): silently ignore. A JOINING/ABSENT node is not a
417+ * survivor, must not ACK/track/drop — and must NOT NAK either, because the
418+ * leaver's cl_all_survivors_acked already excludes it; a NAK would wrongly
419+ * abort the leave.
420+ * - MEMBER but busy (this node is itself mid-request to leave, or a membership
421+ * join is in flight): NAK LEAVE_IN_PROGRESS at preflight so the leaver rejects
422+ * before any survivor enters tracking (one membership reconfig at a time).
423+ */
424+ if (!cluster_membership_is_member (cluster_node_id ))
425+ return ;
426+ if (pg_atomic_read_u32 (& cl_state -> request_in_progress ) != 0
427+ || cluster_reconfig_join_in_progress ()) {
428+ cluster_clean_leave_ic_send_ack (env -> source_node_id , leaving , p -> leave_epoch ,
429+ p -> leave_nonce , true,
430+ (uint8 )CLUSTER_LEAVE_NAK_LEAVE_IN_PROGRESS );
431+ return ;
432+ }
433+
385434 /* preflight probe (F6 layer-1): we are enabled → ACK (enablement only, NO state
386435 * change, NO GRD/PCM cleanup). This is the side-effect-free half of the
387436 * two-layer mixed-mode defense — the leaver gathers these ACKs before it ever
@@ -423,6 +472,19 @@ cl_announce_handler(const ClusterICEnvelope *env, const void *payload)
423472 leaving , cl_state -> leaving_node_id )));
424473 return ;
425474 }
475+ /*
476+ * Hardening v1.0.4 (P1-1, TOCTOU re-check under the lock): the membership/busy
477+ * gate before the preflight split was lock-free; if this node reserved its OWN
478+ * request between that gate and here, NAK now rather than become a survivor
479+ * (the requester's self-bind re-check is the symmetric guard).
480+ */
481+ if (pg_atomic_read_u32 (& cl_state -> request_in_progress ) != 0 ) {
482+ LWLockRelease (& cl_state -> lock );
483+ cluster_clean_leave_ic_send_ack (env -> source_node_id , leaving , p -> leave_epoch ,
484+ p -> leave_nonce , true,
485+ (uint8 )CLUSTER_LEAVE_NAK_LEAVE_IN_PROGRESS );
486+ return ;
487+ }
426488
427489 /*
428490 * Real announce: enter leave-aware reconfig (record the leaving node + bound
@@ -797,50 +859,82 @@ cluster_clean_leave_rebuild_from_disks(const int *fds, int n_disks)
797859 if (cl_state == NULL || fds == NULL || n_disks <= 0 )
798860 return ;
799861
862+ if (n_disks > CLUSTER_MAX_VOTING_DISKS )
863+ n_disks = CLUSTER_MAX_VOTING_DISKS ; /* defensive clamp (fixed-array bound, no VLA) */
800864 majority = ((uint32 )n_disks / 2u ) + 1u ;
801865
802866 for (s = 0 ; s < CLUSTER_MAX_NODES ; s ++ ) {
803- union {
804- uint8 bytes [CLUSTER_VOTING_SLOT_BYTES ];
805- uint64 _align ;
806- } slot ;
807- ClusterLeaveIntentMarker m ;
808- int32 departed = -1 ;
809- uint64 epoch = 0 ;
810- uint32 agree = 0 ;
811- int d ;
867+ ClusterLeaveIntentMarker markers [CLUSTER_MAX_VOTING_DISKS ];
868+ bool valid [CLUSTER_MAX_VOTING_DISKS ];
869+ int n_read = 0 ;
870+ int d , e ;
812871
872+ /*
873+ * Read this slot from every disk, keeping each disk's COMMITTED-basis marker
874+ * for a declared peer. Hardening v1.0.4 (finding 4): the majority must be of
875+ * the SAME durable proof. The earlier code counted agreement by
876+ * leaving_node_id and took max(leave_epoch); a stale ghost leave-slot is NOT
877+ * zeroed on rejoin (qvotec read-only ghost mitigation), so an OLD attempt's
878+ * COMMITTED marker can persist on most disks while a NEWER attempt's COMMITTED
879+ * marker reached only a minority — node-id counting then synthesized a
880+ * "majority" and rebuilt at the newer epoch that itself never reached majority
881+ * (trusting a non-majority durable fact, 8.A-adjacent). Keep the markers and
882+ * count by full identity below instead.
883+ */
813884 for (d = 0 ; d < n_disks ; d ++ ) {
885+ union {
886+ uint8 bytes [CLUSTER_VOTING_SLOT_BYTES ];
887+ uint64 _align ;
888+ } slot ;
889+
890+ valid [d ] = false;
814891 if (cluster_voting_disk_read_leave_slot (fds [d ], (uint32 )s , slot .bytes )
815892 != CLUSTER_VOTING_DISK_IO_OK )
816893 continue ;
817- memcpy (& m , slot .bytes , sizeof (m ));
818-
894+ memcpy (& markers [d ], slot .bytes , sizeof (markers [d ]));
819895 /* COMMITTED + magic/version/CRC/dead_bitmap valid for its own
820- * leaving_node_id, which must be a declared peer (defence vs a
821- * marker naming a node not in cluster.conf). */
822- if (! cluster_clean_leave_marker_is_committed_basis ( & m , m .leaving_node_id ))
896+ * leaving_node_id, which must be a declared peer. */
897+ if (! cluster_clean_leave_marker_is_committed_basis ( & markers [ d ],
898+ markers [ d ] .leaving_node_id ))
823899 continue ;
824- if (cluster_conf_lookup_node (m .leaving_node_id ) == NULL )
900+ if (cluster_conf_lookup_node (markers [ d ] .leaving_node_id ) == NULL )
825901 continue ;
826-
827- if (departed < 0 ) {
828- departed = m .leaving_node_id ;
829- epoch = m .leave_epoch ;
830- agree = 1 ;
831- } else if (m .leaving_node_id == departed ) {
832- agree ++ ;
833- if (m .leave_epoch > epoch )
834- epoch = m .leave_epoch ;
835- }
902+ valid [d ] = true;
903+ n_read ++ ;
836904 }
905+ if (n_read == 0 )
906+ continue ;
907+
908+ /*
909+ * Find a marker IDENTITY {leaving_node_id, leave_epoch, event_id,
910+ * cssd_dead_generation} present on a quorum-majority of disks — THAT is the
911+ * durable proof. At most one identity can hold a majority, so the first
912+ * match is unique; rebuild at its own leave_epoch (never a different
913+ * attempt's higher epoch).
914+ */
915+ for (d = 0 ; d < n_disks ; d ++ ) {
916+ uint32 agree = 0 ;
837917
838- if (departed >= 0 && agree >= majority ) {
839- cluster_reconfig_record_clean_departed (departed , epoch , /*raise_epoch_floor*/ true);
840- ereport (LOG ,
841- (errmsg ("cluster clean-leave: rebuilt clean-departed node %d at epoch %llu "
842- "from %u/%d durable COMMITTED marker(s)" ,
843- departed , (unsigned long long )epoch , agree , n_disks )));
918+ if (!valid [d ])
919+ continue ;
920+ for (e = 0 ; e < n_disks ; e ++ ) {
921+ if (valid [e ] && markers [e ].leaving_node_id == markers [d ].leaving_node_id
922+ && markers [e ].leave_epoch == markers [d ].leave_epoch
923+ && markers [e ].event_id == markers [d ].event_id
924+ && markers [e ].cssd_dead_generation == markers [d ].cssd_dead_generation )
925+ agree ++ ;
926+ }
927+ if (agree >= majority ) {
928+ cluster_reconfig_record_clean_departed (markers [d ].leaving_node_id ,
929+ markers [d ].leave_epoch ,
930+ /*raise_epoch_floor*/ true);
931+ ereport (LOG ,
932+ (errmsg ("cluster clean-leave: rebuilt clean-departed node %d at epoch %llu "
933+ "from %u/%d durable COMMITTED marker(s) of one identity" ,
934+ markers [d ].leaving_node_id ,
935+ (unsigned long long )markers [d ].leave_epoch , agree , n_disks )));
936+ break ; /* one identity per slot can reach majority */
937+ }
844938 }
845939 }
846940}
@@ -877,7 +971,7 @@ cl_set_phase(ClusterLeavePhase to)
877971 pg_atomic_write_u32 (& cl_state -> phase , to );
878972}
879973
880- /* min declared node that is alive (CSSD not DEAD) and != leaving; -1 if none. */
974+ /* min MEMBER node that is alive (CSSD not DEAD) and != leaving; -1 if none. */
881975static int32
882976cl_compute_coordinator (int32 leaving )
883977{
@@ -886,8 +980,15 @@ cl_compute_coordinator(int32 leaving)
886980 for (i = 0 ; i < CLUSTER_MAX_NODES ; i ++ ) {
887981 if (i == leaving )
888982 continue ;
889- if (cluster_conf_lookup_node (i ) == NULL )
890- continue ; /* not a declared peer */
983+ /*
984+ * Hardening v1.0.4 (P1-2, INV-J8): only a MEMBER may be elected clean-leave
985+ * coordinator (it drives the two-phase commit + epoch bump). A JOINING /
986+ * ABSENT (but CSSD-alive) declared node must never be chosen — that would let
987+ * a non-member bump the membership epoch, conflicting with safe-publication.
988+ * Subsumes the old declared-peer check (a member is a declared peer).
989+ */
990+ if (!cluster_membership_is_member (i ))
991+ continue ;
891992 if (i == cluster_node_id )
892993 return i ; /* self is alive by definition */
893994 if (cluster_cssd_get_peer_state (i ) != CLUSTER_CSSD_PEER_DEAD )
@@ -896,7 +997,7 @@ cl_compute_coordinator(int32 leaving)
896997 return -1 ;
897998}
898999
899- /* every alive declared survivor (!= leaving) has set its ack bit? */
1000+ /* every alive MEMBER survivor (!= leaving) has set its ack bit? */
9001001static bool
9011002cl_all_survivors_acked (int32 leaving )
9021003{
@@ -907,7 +1008,14 @@ cl_all_survivors_acked(int32 leaving)
9071008 for (i = 0 ; i < CLUSTER_MAX_NODES ; i ++ ) {
9081009 if (i == leaving )
9091010 continue ;
910- if (cluster_conf_lookup_node (i ) == NULL )
1011+ /*
1012+ * Hardening v1.0.4 (P1-2, INV-J8): only MEMBERs are expected to ACK a clean
1013+ * leave. A JOINING / ABSENT (but CSSD-alive) node is NOT a survivor — it
1014+ * ignores the announce (mirror gate in the handler), so counting it here
1015+ * would wait forever for an ACK it must never send. Subsumes the old
1016+ * declared-peer check (a member is a declared peer).
1017+ */
1018+ if (!cluster_membership_is_member (i ))
9111019 continue ;
9121020 if (cluster_cssd_get_peer_state (i ) == CLUSTER_CSSD_PEER_DEAD )
9131021 continue ; /* a dead survivor is not expected to ack (CL-I7 escalate handles it) */
@@ -1061,6 +1169,15 @@ cl_request_body(void)
10611169 */
10621170 if (cl_state -> leaving_node_id != -1 && cl_state -> leaving_node_id != cluster_node_id )
10631171 return CLUSTER_LEAVE_REQ_REJECTED_IN_PROGRESS ;
1172+ /*
1173+ * Hardening v1.0.4 (P2 serialization): one membership reconfig at a time. If a
1174+ * spec-5.15 online JOIN is in its pending window, do NOT start a clean leave —
1175+ * the join bumps the membership epoch with dead_gen unchanged, which the leaving
1176+ * node would mis-observe as its own commit and wedge in BARRIER_WAIT. The join
1177+ * driver defers symmetrically while a clean leave is active.
1178+ */
1179+ if (cluster_reconfig_join_in_progress ())
1180+ return CLUSTER_LEAVE_REQ_REJECTED_IN_PROGRESS ;
10641181 if (!cluster_qvotec_in_quorum ())
10651182 return CLUSTER_LEAVE_REQ_REJECTED_NOT_IN_QUORUM ;
10661183 coordinator = cl_compute_coordinator (cluster_node_id );
@@ -1086,6 +1203,16 @@ cl_request_body(void)
10861203 int i ;
10871204
10881205 LWLockAcquire (& cl_state -> lock , LW_EXCLUSIVE );
1206+ /*
1207+ * Hardening v1.0.4 (P1-1): re-check leaving_node_id UNDER the lock. The
1208+ * test above was unlocked; in between, an incoming real announce could have
1209+ * made this node a survivor of someone else's leave (leaving_node_id :=
1210+ * other). Bail before we stamp our own nonce/bitmap over that tracking.
1211+ */
1212+ if (cl_state -> leaving_node_id != -1 && cl_state -> leaving_node_id != cluster_node_id ) {
1213+ LWLockRelease (& cl_state -> lock );
1214+ return CLUSTER_LEAVE_REQ_REJECTED_IN_PROGRESS ;
1215+ }
10891216 pg_atomic_write_u64 (& cl_state -> leave_attempt_nonce , nonce );
10901217 memset (cl_state -> ack_bitmap , 0 , sizeof (cl_state -> ack_bitmap ));
10911218 pg_atomic_write_u32 (& cl_state -> nak_received , 0 );
@@ -1135,6 +1262,18 @@ cl_request_body(void)
11351262 baseline_epoch = cluster_epoch_get_current ();
11361263
11371264 LWLockAcquire (& cl_state -> lock , LW_EXCLUSIVE );
1265+ /*
1266+ * Hardening v1.0.4 (P1-1, final recheck): the ~5s preflight wait above ran
1267+ * with the lock released, so an incoming real announce could have made this
1268+ * node a survivor (leaving_node_id := other) during it. Binding self here
1269+ * would silently drop that other leave's serve-gate protection (false-visible,
1270+ * 8.A). Re-check under the lock and bail if so; the announce handler is the
1271+ * symmetric enforcement point (it NAKs while our request is in progress).
1272+ */
1273+ if (cl_state -> leaving_node_id != -1 && cl_state -> leaving_node_id != cluster_node_id ) {
1274+ LWLockRelease (& cl_state -> lock );
1275+ return CLUSTER_LEAVE_REQ_REJECTED_IN_PROGRESS ;
1276+ }
11381277 cl_state -> leaving_node_id = cluster_node_id ;
11391278 cl_state -> leave_epoch = baseline_epoch ;
11401279 cl_state -> leave_baseline_dead_gen = cluster_cssd_get_dead_generation ();
@@ -1423,6 +1562,18 @@ cl_leaving_barrier_tick(void)
14231562 * CSSD dead_generation: a cooperative leave does NOT mark anyone CSSD-dead, so
14241563 * an unchanged dead_generation at the bump means OUR clean-leave committed; a
14251564 * changed one means a real death (escalate).
1565+ *
1566+ * Hardening v1.0.4 (P2): this "epoch bump + dead_gen unchanged == OUR commit"
1567+ * inference is sound ONLY because there is no OTHER dead_gen-unchanged reconfig
1568+ * in flight. A spec-5.15 online JOIN also bumps the epoch with dead_gen
1569+ * unchanged and would be mis-observed here as the leave's commit (then the node
1570+ * stops escalating but never gets the real LEAVE_COMMITTED -> BARRIER_WAIT
1571+ * hang). That collision is removed STRUCTURALLY by the one-membership-reconfig-
1572+ * at-a-time serialization: the join driver does not bump the epoch while a clean
1573+ * leave is active (cluster_clean_leave_in_progress gates drive_joins +
1574+ * commit_member), and the leave does not start while a join is pending
1575+ * (cluster_reconfig_join_in_progress gates the request). Under that invariant a
1576+ * dead_gen-unchanged bump during a leave can only be the leave's own commit.
14261577 */
14271578 if (cluster_epoch_get_current () > baseline_epoch ) {
14281579 if (cluster_cssd_get_dead_generation () == cl_state -> leave_baseline_dead_gen ) {
0 commit comments