@@ -632,8 +632,8 @@ cluster_grd_shmem_init(void)
632632
633633 /* spec-5.16 D2/D3b/D5 — online-join remaster fence + counters. */
634634 pg_atomic_init_u64 (& cluster_grd_state -> join_pcm_fence_epoch , 0 );
635- for (i = 0 ; i < ( CLUSTER_MAX_NODES + 63 ) / 64 ; i ++ )
636- pg_atomic_init_u64 (& cluster_grd_state -> join_pcm_fenced_member [i ], 0 );
635+ for (i = 0 ; i < CLUSTER_MAX_NODES ; i ++ )
636+ pg_atomic_init_u64 (& cluster_grd_state -> join_pcm_fence_member_epoch [i ], 0 );
637637 pg_atomic_init_u32 (& cluster_grd_state -> recovery_direction , (uint32 )GRD_REMASTER_DIR_NONE );
638638 pg_atomic_init_u64 (& cluster_grd_state -> join_remaster_started_count , 0 );
639639 pg_atomic_init_u64 (& cluster_grd_state -> join_remaster_done_count , 0 );
@@ -1221,46 +1221,55 @@ cluster_grd_master_map_recompute_for_membership(const uint8 *active_member, uint
12211221 * Arm the joiner-home PCM block fence SYNCHRONOUSLY (NEVER from the LMON
12221222 * tick — the async tick is structurally later than the 5.15 write-gate open,
12231223 * so a fence armed there would leave a "MEMBER-writable but unfenced" window).
1224- * Sets join_pcm_fenced_member (the rejoining set) THEN raises join_pcm_fence_
1225- * epoch (a write barrier between, so a reader that sees the raised epoch sees
1226- * the member set). Monotonic-max: a later join re-arms higher; re-arm of
1227- * the same epoch is an idempotent no-op (INV-R12).
1224+ * Stamps each rejoining recipient's join_pcm_fence_member_epoch with THIS
1225+ * episode's epoch THEN raises join_pcm_fence_epoch (a write barrier between, so
1226+ * a reader that sees the raised epoch sees the recipient stamps). Monotonic-
1227+ * max: a later join re-arms higher; re-arm of the same epoch is an idempotent
1228+ * no-op (INV-R12).
12281229 *
1229- * spec-5.16 Hardening v1.3 (Rule 8.A) — the member set is OR-accumulated, never
1230- * overwritten. Two arms race on the rejoining node: qvotec (note_self_admitted,
1231- * {self}) and the LMON tick ({evt.join_bitmap} for a multi-joiner episode). A
1232- * plain write would let the {self} arm zero a co-joiner's bit, UNDER-fencing its
1233- * home block -> cold-serve -> 8.A double-grant. OR makes concurrent arms
1234- * accumulate the union regardless of interleaving (no under-fence). A stale bit
1235- * carried from a completed prior episode only OVER-fences (the block waits for
1236- * this epoch's view_rebuilt barrier, then lifts) — benign liveness, never a
1237- * correctness fault, so no per-bit clear is required here.
1230+ * spec-5.16 Hardening v1.4 (Rule 8.A) — the recipient set is keyed PER NODE by
1231+ * the arming epoch, not an OR-accumulated bitmap. Two arms race on the
1232+ * rejoining node: qvotec (note_self_admitted, {self}) and the LMON tick
1233+ * ({evt.join_bitmap} for a multi-joiner episode). Both stamp the SAME epoch on
1234+ * their respective nodes, so they union with no lost update and no under-fence.
1235+ * Crucially, a node armed in a COMPLETED prior episode keeps its lower stamp,
1236+ * which is < the current join_pcm_fence_epoch, so the recipient test (used by
1237+ * active_for_shard AND the re-declare barriers) excludes it from THIS episode
1238+ * automatically. The previous v1.3 bitmap was never cleared, so a prior
1239+ * rejoiner — now a steady survivor that may hold X on the new joiner's home
1240+ * block — was wrongly skipped by the barrier -> premature fence lift -> cold-
1241+ * serve -> 8.A double-grant / false-visible (reviewer P1 #1). Per-node epoch
1242+ * keying fixes both the union (no under-fence) and the staleness (no cross-
1243+ * episode under-wait) with no reset race.
12381244 */
12391245void
12401246cluster_grd_arm_join_pcm_fence (const uint8 * rejoining_set )
12411247{
12421248 uint64 epoch ;
12431249 uint64 prev ;
1244- int w ;
1250+ int node ;
12451251
12461252 if (cluster_grd_state == NULL || rejoining_set == NULL )
12471253 return ;
12481254
12491255 epoch = cluster_epoch_get_current ();
12501256
1251- for (w = 0 ; w < (CLUSTER_MAX_NODES + 63 ) / 64 ; w ++ ) {
1252- uint64 word = 0 ;
1253- int j ;
1254-
1255- for (j = 0 ; j < 8 ; j ++ ) {
1256- int byte_idx = w * 8 + j ;
1257+ for (node = 0 ; node < CLUSTER_MAX_NODES ; node ++ ) {
1258+ uint64 cur ;
12571259
1258- if (byte_idx < CLUSTER_RECONFIG_DEAD_BITMAP_BYTES )
1259- word |= ((uint64 )rejoining_set [byte_idx ]) << (8 * j );
1260+ if (node >= CLUSTER_RECONFIG_DEAD_BITMAP_BYTES * 8 )
1261+ break ; /* rejoining_set bitmap is exhausted */
1262+ if (((rejoining_set [node >> 3 ] >> (node & 7 )) & 1 ) == 0 )
1263+ continue ;
1264+ /* monotonic-max per node — concurrent same-epoch arms union safely */
1265+ cur = pg_atomic_read_u64 (& cluster_grd_state -> join_pcm_fence_member_epoch [node ]);
1266+ while (epoch > cur ) {
1267+ if (pg_atomic_compare_exchange_u64 (
1268+ & cluster_grd_state -> join_pcm_fence_member_epoch [node ], & cur , epoch ))
1269+ break ;
12601270 }
1261- pg_atomic_fetch_or_u64 (& cluster_grd_state -> join_pcm_fenced_member [w ], word );
12621271 }
1263- pg_write_barrier (); /* member set visible before the epoch is raised */
1272+ pg_write_barrier (); /* recipient stamps visible before the epoch is raised */
12641273
12651274 prev = pg_atomic_read_u64 (& cluster_grd_state -> join_pcm_fence_epoch );
12661275 while (epoch > prev ) {
@@ -1286,18 +1295,19 @@ cluster_grd_join_remaster_in_progress(void)
12861295/*
12871296 * cluster_grd_join_remaster_active_for_shard -- spec-5.16 D3 (INV-R8).
12881297 *
1289- * True iff the block's STATIC PCM home (cluster_gcs_lookup_master_static) is
1290- * in the armed join_pcm_fenced_member set. Bound to online_join (the fence
1291- * epoch is armed by note_self_admitted / LMON P0-accept), INDEPENDENT of any
1292- * GRD master[] movement — so join_remaster_enabled=off still fences (r2 P1-①,
1293- * P1-A closure). false when the fence is not armed.
1298+ * True iff the block's STATIC PCM home (cluster_gcs_lookup_master_static) is a
1299+ * rejoining RECIPIENT of the CURRENT fence episode (member_epoch[home] ==
1300+ * join_pcm_fence_epoch). Bound to online_join (the fence epoch is armed by
1301+ * note_self_admitted / LMON P0-accept), INDEPENDENT of any GRD master[]
1302+ * movement — so join_remaster_enabled=off still fences (r2 P1-①, P1-A
1303+ * closure). false when the fence is not armed or the home is a steady member
1304+ * (incl. a prior rejoiner whose stamp is from a completed earlier episode).
12941305 */
12951306bool
12961307cluster_grd_join_remaster_active_for_shard (BufferTag tag )
12971308{
12981309 uint64 fence_epoch ;
12991310 int home ;
1300- int w , b ;
13011311
13021312 if (cluster_grd_state == NULL )
13031313 return false;
@@ -1309,11 +1319,7 @@ cluster_grd_join_remaster_active_for_shard(BufferTag tag)
13091319 home = cluster_gcs_lookup_master_static (tag );
13101320 if (home < 0 || home >= CLUSTER_MAX_NODES )
13111321 return false;
1312- w = home >> 6 ;
1313- b = home & 63 ;
1314- return (pg_atomic_read_u64 (& cluster_grd_state -> join_pcm_fenced_member [w ])
1315- & (UINT64CONST (1 ) << b ))
1316- != 0 ;
1322+ return pg_atomic_read_u64 (& cluster_grd_state -> join_pcm_fence_member_epoch [home ]) == fence_epoch ;
13171323}
13181324
13191325/*
@@ -1340,19 +1346,21 @@ cluster_grd_join_remaster_active_for_shard(BufferTag tag)
13401346 * gate on the joiner is the authoritative backstop (INV-R8/R14).
13411347 */
13421348
1343- /* Test whether node_id is in the armed join_pcm_fenced_member (rejoining) set. */
1349+ /*
1350+ * True iff node_id is a rejoining RECIPIENT of fence episode ref_epoch
1351+ * (member_epoch[node_id] == ref_epoch). ref_epoch 0 never matches: stamps start
1352+ * at 0, so it would select never-armed nodes. A stale prior stamp (< ref_epoch) is
1353+ * false, so re-declare barriers still wait for a now-steady survivor (v1.4 P1 #1).
1354+ */
13441355static inline bool
1345- join_fenced_member_test (int32 node_id )
1356+ join_fence_is_recipient_for (int32 node_id , uint64 ref_epoch )
13461357{
1347- int w , b ;
1348-
13491358 if (cluster_grd_state == NULL || node_id < 0 || node_id >= CLUSTER_MAX_NODES )
13501359 return false;
1351- w = node_id >> 6 ;
1352- b = node_id & 63 ;
1353- return (pg_atomic_read_u64 (& cluster_grd_state -> join_pcm_fenced_member [w ])
1354- & (UINT64CONST (1 ) << b ))
1355- != 0 ;
1360+ if (ref_epoch == 0 )
1361+ return false;
1362+ return pg_atomic_read_u64 (& cluster_grd_state -> join_pcm_fence_member_epoch [node_id ])
1363+ == ref_epoch ;
13561364}
13571365
13581366bool
@@ -1379,10 +1387,12 @@ cluster_grd_block_view_rebuilt(BufferTag tag)
13791387 * JOIN_COMMITTED event as a reconfig episode (it is published coordinator-
13801388 * side only), so it never announces REDECLARE_DONE. The binding safety
13811389 * condition is "every SURVIVOR finished re-declaring its held joiner-home
1382- * blocks" — exclude the fenced set so view_rebuilt converges on the
1383- * survivors' done (Hardening v1.1 + D8 fix).
1390+ * blocks" — exclude only THIS episode's recipients so view_rebuilt
1391+ * converges on the survivors' done (Hardening v1.1 + D8 fix). Keyed on
1392+ * fence_epoch so a prior rejoiner (now a steady survivor) is NOT skipped
1393+ * (Hardening v1.4, reviewer P1 #1: cross-episode under-wait -> 8.A).
13841394 */
1385- if (join_fenced_member_test ( i ))
1395+ if (join_fence_is_recipient_for ( i , fence_epoch ))
13861396 continue ;
13871397 if (pg_atomic_read_u64 (& cluster_grd_state -> recovery_done_epoch [i ]) < fence_epoch )
13881398 return false;
@@ -2074,7 +2084,7 @@ cluster_grd_recovery_lmon_tick(void)
20742084 * (P5-P7 below); for JOIN that rebuilds the joiner's PCM block view even
20752085 * when join_remaster_enabled is off. The two scopes are independent:
20762086 * grd_moved_shards = `affected` here (GRD, GUC-gated); pcm_fenced_home_set
2077- * lives in join_pcm_fenced_member (PCM, online_join-gated, armed at P0).
2087+ * lives in join_pcm_fence_member_epoch (PCM, online_join-gated, armed at P0).
20782088 */
20792089 memset (affected , 0 , sizeof (affected ));
20802090 if (pg_atomic_read_u32 (& cluster_grd_state -> recovery_direction )
@@ -2307,10 +2317,14 @@ cluster_grd_recovery_lmon_tick(void)
23072317 * and never observes the JOIN_COMMITTED event as its own reconfig
23082318 * episode (it is published coordinator-side only), so it never
23092319 * announces REDECLARE_DONE. Waiting for it would wedge the survivor's
2310- * barrier forever. The survivors (everyone outside the fenced set)
2311- * ARE the re-declarers and must converge.
2320+ * barrier forever. The survivors (everyone outside THIS episode's
2321+ * recipient set) ARE the re-declarers and must converge. Keyed on
2322+ * episode_epoch so a prior rejoiner (now a steady survivor) is still
2323+ * waited for (Hardening v1.4, reviewer P1 #1: a stale cross-episode
2324+ * exclusion would skip a survivor holding X on the joiner's home
2325+ * block -> premature unfreeze -> 8.A double-grant).
23122326 */
2313- if (is_join && join_fenced_member_test ( i ))
2327+ if (is_join && join_fence_is_recipient_for ( i , episode_epoch ))
23142328 continue ;
23152329 if (pg_atomic_read_u64 (& cluster_grd_state -> recovery_done_epoch [i ])
23162330 < episode_epoch ) {
0 commit comments