@@ -270,14 +270,21 @@ cluster_node_remove_self_is_removed(void)
270270 if (!cluster_enabled || cluster_node_id < 0 )
271271 return false;
272272 /*
273- * Lock-free hot-path check (called at every writable-xid assignment): the
274- * membership_state[self] byte is the naturally-atomic SSOT. REMOVED is published
275- * into this node's own table by the startup durable-marker rebuild
273+ * Lock-free hot-path check (called at every writable-xid assignment). REMOVED
274+ * is published into this node's own table by the startup durable-marker rebuild
276275 * (rebuild_from_disks) and, for a still-running removed node, by the
277- * NODE_REMOVE_ANNOUNCE handler (nr_announce_handler self-demote). No reconfig
278- * lock is taken here (mirrors the clean-leave refuse-writes gate).
276+ * NODE_REMOVE_ANNOUNCE handler (nr_announce_handler self-demote).
277+ *
278+ * HF-2: consult the durable removed_bitmap (lock-free, monotonic) IN ADDITION to
279+ * the membership_state[self] byte. membership_state[self] is rewritten every
280+ * LMON tick by the joiner / self-state maintenance paths; although those now
281+ * carry a REMOVED terminal guard, the durable bitmap is the authoritative floor
282+ * and closes any residual window where a stale self-state write could transiently
283+ * un-REMOVE this node and open the 53R64 write gate. Either signal alone means removed.
284+ * No reconfig lock is taken (mirrors the clean-leave refuse-writes gate).
279285 */
280- return cluster_membership_get_state (cluster_node_id ) == CLUSTER_MEMBER_REMOVED ;
286+ return cluster_membership_get_state (cluster_node_id ) == CLUSTER_MEMBER_REMOVED
287+ || cluster_reconfig_is_removed_unlocked (cluster_node_id );
281288}
282289
283290
@@ -441,6 +448,16 @@ cluster_node_remove_request(int32 node_id)
441448 pg_atomic_write_u32 (& nr_state -> phase , CLUSTER_REMOVE_REQUESTED );
442449 pg_atomic_fetch_add_u64 (& nr_state -> removal_request_count , 1 );
443450 }
451+ /*
452+ * HF-5: the verdict was ACCEPTED at the lock-free precheck, but the phase left
453+ * the reservable set between the precheck and this exclusive section — a concurrent
454+ * request reserved first. Do NOT return a stale ACCEPTED without actually
455+ * reserving (the caller would believe its target is being removed when it is
456+ * not); downgrade to CLUSTER_REMOVE_REQ_IN_PROGRESS so the caller retries.
457+ */
458+ else if (verdict == CLUSTER_REMOVE_REQ_ACCEPTED ) {
459+ verdict = CLUSTER_REMOVE_REQ_IN_PROGRESS ;
460+ }
444461 /* RESUME: keep the existing target/epoch; just re-arm the drive from SHRUNK by
445462 * moving CLEANUP_BLOCKED back to CLEANUP (the lmon_tick retries the cleanup). */
446463 else if (verdict == CLUSTER_REMOVE_REQ_RESUME && cur_phase == CLUSTER_REMOVE_CLEANUP_BLOCKED ) {
@@ -656,16 +673,50 @@ cluster_node_remove_drive(void)
656673void
657674cluster_node_remove_survivor_ack (int32 target_node_id , uint64 remove_epoch )
658675{
659- /* a non-coordinator survivor drops its local refs to the removed node and
660- * records its ACK so the coordinator's barrier can complete. */
676+ uint64 removal_event_id ;
677+ uint64 removed_incarnation ;
678+ int32 coordinator ;
679+
680+ /*
681+ * HF-1/HF-3 (INV-LF11): a non-coordinator survivor APPLIES the removal
682+ * locally — not just drops its N-refs. It seeds the durable removed set +
683+ * membership_state[N]=REMOVED (so its own removed_bitmap carries N for any
684+ * fence baseline it later publishes, INV-LF10), permanently remasters
685+ * N-mastered shards onto a MEMBER survivor, clears N's GES/PCM, and PROVES
686+ * zero leftover — and only ACKs once verify passes. The coordinator's final
687+ * REMOVED marker (the trust source) is built on "local verify + all-survivor
688+ * ACK", so an ACK must mean THIS survivor is genuinely clean, not merely
689+ * "dropped some refs". Idempotent: the survivor lmon_tick re-runs it every
690+ * tick until it converges (the announce is one-shot, so a transient leftover
691+ * must be retried locally, not re-announced).
692+ */
661693 if (nr_state == NULL || target_node_id < 0 || target_node_id >= CLUSTER_MAX_NODES )
662694 return ;
663695
664- (void )cluster_grd_cleanup_on_node_dead (target_node_id );
665- (void )cluster_pcm_lock_clear_pending_x_for_node (target_node_id );
696+ /* identity for the seed + ACK = THIS attempt (recorded by the announce
697+ * handler / lmon_tick adopt path, HF-4). */
698+ LWLockAcquire (& nr_state -> lock , LW_SHARED );
699+ removal_event_id = nr_state -> removal_event_id ;
700+ removed_incarnation = nr_state -> target_last_incarnation ;
701+ coordinator = nr_state -> coordinator_node_id ;
702+ LWLockRelease (& nr_state -> lock );
703+
704+ /* seed the durable removed set + membership REMOVED with the coordinator's
705+ * pinned incarnation floor (carried in the announce, HF-1). */
706+ cluster_reconfig_seed_removed_membership (target_node_id , remove_epoch , removed_incarnation ,
707+ /*raise_epoch_floor*/ true);
708+
709+ /* full cluster-wide cleanup on THIS survivor + zero-leftover proof (HF-3).
710+ * run_cleanup bumps leftover_detected_count + returns false when not clean. */
711+ if (!cluster_node_remove_run_cleanup (target_node_id , remove_epoch )) {
712+ pg_atomic_write_u32 (& nr_state -> survivor_acked , 0 );
713+ return ; /* leftover -> retry next survivor tick, do NOT ACK */
714+ }
666715
667- cluster_node_remove_ic_send_ack (nr_state -> coordinator_node_id , target_node_id , remove_epoch ,
668- nr_state -> removal_event_id );
716+ /* clean: ACK with THIS attempt's identity so the coordinator's barrier keys on
717+ * this removal, not a stale prior one. */
718+ cluster_node_remove_ic_send_ack (coordinator , target_node_id , remove_epoch , removal_event_id );
719+ pg_atomic_write_u32 (& nr_state -> survivor_acked , 1 );
669720}
670721
671722void
@@ -674,6 +725,7 @@ cluster_node_remove_lmon_tick(void)
674725 ClusterRemovePhase phase ;
675726 int32 node_id ;
676727 int32 coordinator ;
728+ uint64 remove_epoch ;
677729
678730 if (nr_state == NULL || !cluster_enabled || !cluster_online_node_removal )
679731 return ;
@@ -684,6 +736,7 @@ cluster_node_remove_lmon_tick(void)
684736 phase = (ClusterRemovePhase )pg_atomic_read_u32 (& nr_state -> phase );
685737 node_id = nr_state -> target_node_id ;
686738 coordinator = nr_state -> coordinator_node_id ;
739+ remove_epoch = nr_state -> remove_epoch ;
687740 LWLockRelease (& nr_state -> lock );
688741
689742 if (node_id < 0 )
@@ -695,10 +748,20 @@ cluster_node_remove_lmon_tick(void)
695748 if (phase >= CLUSTER_REMOVE_CLEANUP && phase <= CLUSTER_REMOVE_CLEANUP_BLOCKED
696749 && pg_atomic_read_u32 (& nr_state -> announce_sent ) == 0 ) {
697750 cluster_node_remove_ic_broadcast_announce (node_id , nr_state -> remove_epoch ,
698- nr_state -> removal_event_id );
751+ nr_state -> removal_event_id ,
752+ nr_state -> target_last_incarnation );
699753 pg_atomic_write_u32 (& nr_state -> announce_sent , 1 );
700754 }
701755 cluster_node_remove_drive ();
756+ } else {
757+ /*
758+ * HF-1/HF-3 (INV-LF11): survivor side — (re)apply the recorded removal
759+ * locally + ACK; retry each tick until verify converges. The announce is
760+ * one-shot, so a transient leftover (or an announce that arrived before the
761+ * GRD/PCM state could settle) must be retried here, not re-announced.
762+ */
763+ if (pg_atomic_read_u32 (& nr_state -> survivor_acked ) == 0 )
764+ cluster_node_remove_survivor_ack (node_id , remove_epoch );
702765 }
703766}
704767
@@ -707,7 +770,7 @@ cluster_node_remove_lmon_tick(void)
707770 * IC wire (D10): NODE_REMOVE_ANNOUNCE (broadcast) + REMOVE_CLEANUP_ACK (p2p).
708771 * ============================================================ */
709772
710- /* survivor side: a coordinator announced a removal — drop refs + ACK. */
773+ /* survivor side: a coordinator announced a removal — apply it locally + ACK. */
711774static void
712775nr_announce_handler (const ClusterICEnvelope * env , const void * payload )
713776{
@@ -721,24 +784,38 @@ nr_announce_handler(const ClusterICEnvelope *env, const void *payload)
721784 * into our own membership table (lock-free SSOT for the self-demote write
722785 * gate) + the durable removed set so a still-running removed node fail-closes
723786 * new writable transactions (53R64) instead of serving as a phantom member.
724- * Do NOT send a cleanup ACK — a removed node is not a survivor.
787+ * HF-1: pin the coordinator's incarnation floor from the announce so a future
788+ * re-admit must present a strictly newer incarnation. Do NOT send a cleanup
789+ * ACK — a removed node is not a survivor.
725790 */
726791 cluster_reconfig_seed_removed_membership (cluster_node_id , p -> remove_epoch ,
727- 0 /* incarnation floor set by coordinator */ ,
792+ p -> removed_incarnation ,
728793 /*raise_epoch_floor*/ false);
729794 return ;
730795 }
731796
732- /* record the attempt so our ACK echoes the right identity. */
797+ /*
798+ * HF-4: adopt THIS removal attempt's identity when our recorded one is absent,
799+ * terminal, or a different attempt — a survivor never runs the driver's
800+ * abort/commit resets, so a prior attempt's identity would otherwise linger and
801+ * get this attempt's ACK rejected by the coordinator (event_id mismatch),
802+ * wedging the next removal's cleanup barrier. Same event_id = an idempotent
803+ * re-announce: keep progress (do not reset survivor_acked).
804+ */
733805 LWLockAcquire (& nr_state -> lock , LW_EXCLUSIVE );
734- if (nr_state -> target_node_id < 0 ) {
806+ if (nr_state -> removal_event_id != p -> removal_event_id
807+ || nr_state -> target_node_id != p -> target_node_id ) {
735808 nr_state -> target_node_id = p -> target_node_id ;
736809 nr_state -> coordinator_node_id = p -> coordinator_node_id ;
737810 nr_state -> remove_epoch = p -> remove_epoch ;
738811 nr_state -> removal_event_id = p -> removal_event_id ;
812+ nr_state -> target_last_incarnation = p -> removed_incarnation ;
813+ pg_atomic_write_u32 (& nr_state -> survivor_acked , 0 ); /* re-apply for the new attempt */
739814 }
740815 LWLockRelease (& nr_state -> lock );
741816
817+ /* INV-LF11: apply the removal locally + ACK when clean (the survivor lmon_tick
818+ * retries until verify converges). */
742819 cluster_node_remove_survivor_ack (p -> target_node_id , p -> remove_epoch );
743820}
744821
@@ -753,11 +830,18 @@ nr_ack_handler(const ClusterICEnvelope *env, const void *payload)
753830 return ;
754831 if (p -> survivor_node_id < 0 || p -> survivor_node_id >= CLUSTER_NODE_REMOVE_ACK_BITMAP_BYTES * 8 )
755832 return ;
756- if (p -> removal_event_id != nr_state -> removal_event_id )
757- return ; /* stale ACK from a prior attempt */
758833
834+ /*
835+ * HF-4: an ACK counts toward the barrier only for THIS exact removal attempt —
836+ * validate the full identity tuple (target, epoch, event_id) under the lock, not
837+ * just event_id, so a stale ACK from a prior attempt can never satisfy the
838+ * current barrier (and the check happens under the same lock hold as the bitmap write).
839+ */
759840 LWLockAcquire (& nr_state -> lock , LW_EXCLUSIVE );
760- nr_state -> ack_bitmap [p -> survivor_node_id / 8 ] |= (uint8 )(1u << (p -> survivor_node_id % 8 ));
841+ if (p -> removal_event_id == nr_state -> removal_event_id
842+ && p -> target_node_id == nr_state -> target_node_id
843+ && p -> remove_epoch == nr_state -> remove_epoch )
844+ nr_state -> ack_bitmap [p -> survivor_node_id / 8 ] |= (uint8 )(1u << (p -> survivor_node_id % 8 ));
761845 LWLockRelease (& nr_state -> lock );
762846}
763847
@@ -785,7 +869,7 @@ cluster_node_remove_register_ic_msg_types(void)
785869
786870void
787871cluster_node_remove_ic_broadcast_announce (int32 target_node_id , uint64 remove_epoch ,
788- uint64 removal_event_id )
872+ uint64 removal_event_id , uint64 removed_incarnation )
789873{
790874 ClusterNodeRemoveAnnouncePayload p ;
791875 ClusterICFanoutResult per_peer [CLUSTER_MAX_NODES ];
@@ -797,6 +881,7 @@ cluster_node_remove_ic_broadcast_announce(int32 target_node_id, uint64 remove_ep
797881 p .target_node_id = target_node_id ;
798882 p .remove_epoch = remove_epoch ;
799883 p .removal_event_id = removal_event_id ;
884+ p .removed_incarnation = removed_incarnation ; /* HF-1: incarnation floor for survivor seed */
800885 cluster_node_remove_announce_compute_crc (& p );
801886
802887 cluster_ic_send_envelope_fanout (PGRAC_IC_MSG_NODE_REMOVE_ANNOUNCE , & p , (uint32 )sizeof (p ),
0 commit comments