Skip to content

Commit 4c45b40

Browse files
author
SqlRush
committed
fix(cluster): spec-5.15 Hardening v1.3 — cold-bootstrap proof requires fresh-alive, not just a valid slot
v1.2 anchored the co-boot quorum on a valid voting-disk slot (generation > 0 at epoch INITIAL), but a generation > 0 slot alone is not liveness: a CRASHED peer leaves a stale leftover slot at epoch INITIAL. decide_quorum_view's P2.1 heartbeat-freshness gate already excludes such stale slots from the alive_bitmap, but the v1.2 bootstrap proof read the raw observed slot (no freshness) — so a node could reach quorum with self + a stale peer slot and fail-open (latch BOOTSTRAP without a live co-boot quorum). Fix: publish the per-node FRESH-ALIVE signal (decide_quorum_view's alive_bitmap) into the reconfig region, and require it in the cold-bootstrap proof: fresh-alive AND generation > 0 AND epoch INITIAL. Anchored on the durable voting-disk heartbeat (not live CSSD), so it rejects stale slots WITHOUT reintroducing the v1.2 IC-churn race (the disk heartbeat keeps flowing while CSSD/tier1 churns). Quorum threshold and rejoiner predicate unchanged. New ReconfigShmem field observed_fresh_alive[CLUSTER_MAX_NODES] (atomic, default 0 = fail-closed); qvotec publishes it each poll from decide_quorum_view. Unit: test_cluster_reconfig U21 (stale slot must not count, TDD red->green) + U19/U20 updated to fresh-alive semantics; UT_PLAN 43->45 (also fixes a pre-existing plan/count mismatch). No on-disk/wire/catalog/GUC change; no catversion bump. Spec: spec-5.15-online-declared-node-join-membership.md (Hardening v1.3)
1 parent 758f403 commit 4c45b40

5 files changed

Lines changed: 147 additions & 13 deletions

File tree

src/backend/cluster/cluster_qvotec.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,25 @@ qvotec_poll_once(void)
969969
CLUSTER_MAX_NODES, (uint32)cluster_node_id, qvotec_self_incarnation,
970970
now_us, heartbeat_timeout_us, &decision);
971971

972+
/*
973+
* spec-5.15 Hardening v1.3 (INV-J14 stale-slot fail-open) — publish the
974+
* per-node FRESH-ALIVE liveness from decide_quorum_view's alive_bitmap (the
975+
* P2.1 heartbeat-freshness gate that already excludes a crashed peer's stale
976+
* leftover slot) into the reconfig region. The cold-bootstrap proof reads it
977+
* so it counts only genuinely live co-booting peers, never a stale gen > 0
978+
* leftover (which would fail-open). Anchored on the durable voting-disk
979+
* heartbeat, so it is robust to CSSD / tier1 churn — the v1.2 race fix stands.
980+
*/
981+
{
982+
uint32 node;
983+
984+
for (node = 0; node < CLUSTER_MAX_NODES; node++) {
985+
bool fresh = (decision.alive_bitmap[node / 8] & (uint8)(1u << (node % 8))) != 0;
986+
987+
cluster_reconfig_record_observed_fresh_alive((int32)node, fresh);
988+
}
989+
}
990+
972991
/*
973992
* Hardening v0.4 P1.1: Q6 v0.2 newer-self-FATAL. decide_quorum_
974993
* view observed an OK-disk fresh slot at our node_id offset with

src/backend/cluster/cluster_reconfig.c

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,30 @@ cluster_reconfig_get_observed_slot(int32 node_id, uint64 *incarnation, uint64 *g
666666
return gen > 0;
667667
}
668668

669+
/*
670+
* spec-5.15 Hardening v1.3 (INV-J14 stale-slot fail-open) — publish / read the
671+
* per-node FRESH-ALIVE liveness qvotec derived from decide_quorum_view's
672+
* heartbeat-freshness gate (P2.1). The cold-bootstrap proof counts a peer only
673+
* when it is fresh-alive at epoch INITIAL — a generation > 0 slot alone may be a
674+
* crashed peer's stale leftover. Anchored on the durable voting-disk heartbeat,
675+
* not live CSSD, so the v1.2 IC-churn race fix is preserved.
676+
*/
677+
void
678+
cluster_reconfig_record_observed_fresh_alive(int32 node_id, bool fresh_alive)
679+
{
680+
/* Silently ignore the call when ReconfigShmem is NULL or node_id is outside [0, CLUSTER_MAX_NODES). */
if (ReconfigShmem == NULL || node_id < 0 || node_id >= CLUSTER_MAX_NODES)
681+
return;
682+
/* Store the flag as 1 (fresh-alive) or 0 (not fresh-alive) in this node's atomic entry. */
pg_atomic_write_u64(&ReconfigShmem->observed_fresh_alive[node_id], fresh_alive ? 1 : 0);
683+
}
684+
685+
bool
686+
cluster_reconfig_get_observed_fresh_alive(int32 node_id)
687+
{
688+
/* Fail closed: report "not fresh-alive" when ReconfigShmem is NULL or node_id is outside [0, CLUSTER_MAX_NODES). */
if (ReconfigShmem == NULL || node_id < 0 || node_id >= CLUSTER_MAX_NODES)
689+
return false;
690+
/* Any non-zero stored value counts as fresh-alive. */
return pg_atomic_read_u64(&ReconfigShmem->observed_fresh_alive[node_id]) != 0;
691+
}
692+
669693

670694
/*
671695
* Read snapshot of last_applied.event_id under shared lock. Used by
@@ -1252,12 +1276,20 @@ cluster_reconfig_bootstrap_quorum_at_initial(void)
12521276
if (ep > CLUSTER_EPOCH_INITIAL)
12531277
return false;
12541278
/*
1255-
* Count a peer only on a VALID durable co-boot slot: a real observed
1256-
* voting-disk slot (generation > 0) at epoch INITIAL. Never count a
1257-
* default-0 placeholder (generation 0) nor rely on live CSSD state.
1279+
* Count a peer only on a FRESH-ALIVE co-boot slot: a real observed
1280+
* voting-disk slot (generation > 0) that qvotec's decide_quorum_view saw
1281+
* FRESH-ALIVE in its most recent poll (heartbeat_ts_us recent, the P2.1 freshness gate),
1282+
* at epoch INITIAL. Hardening v1.3: the generation > 0 test alone is NOT
1283+
* liveness — a CRASHED peer leaves a stale leftover slot (gen > 0, epoch
1284+
* INITIAL) that v1.2 wrongly counted, letting a node fail-open (latch
1285+
* BOOTSTRAP on self + a stale peer slot, with no live co-boot quorum). The
1286+
* fresh-alive signal is anchored on the durable voting-disk heartbeat (NOT
1287+
* live CSSD), so it rejects stale slots WITHOUT reintroducing the v1.2
1288+
* IC-churn race (the disk heartbeat keeps flowing while CSSD/tier1 churns).
1289+
* A default-0 placeholder (generation 0) never counts either.
12581290
*/
12591291
if (cluster_reconfig_get_observed_slot(i, &inc, &gen) && gen > 0
1260-
&& ep == CLUSTER_EPOCH_INITIAL)
1292+
&& cluster_reconfig_get_observed_fresh_alive(i) && ep == CLUSTER_EPOCH_INITIAL)
12611293
proven_at_initial++;
12621294
}
12631295
if (declared == 0)
@@ -2869,6 +2901,17 @@ cluster_reconfig_get_observed_epoch(int32 node_id pg_attribute_unused())
28692901
return 0;
28702902
}
28712903

2904+
/*
 * No-op variant of the fresh-alive publisher: both arguments are marked unused
 * and nothing is stored. NOTE(review): presumably the stub compiled when the
 * real reconfig implementation is disabled — confirm against the enclosing
 * preprocessor conditional, which is not visible in this hunk.
 */
void
2905+
cluster_reconfig_record_observed_fresh_alive(int32 node_id pg_attribute_unused(),
2906+
bool fresh_alive pg_attribute_unused())
2907+
{}
2908+
2909+
/*
 * Stub reader: ignores node_id and always reports "not fresh-alive" (false),
 * so any caller that gates on this value stays fail-closed.
 */
bool
2910+
cluster_reconfig_get_observed_fresh_alive(int32 node_id pg_attribute_unused())
2911+
{
2912+
return false;
2913+
}
2914+
28722915
ClusterJoinMarkerSubmitResult
28732916
cluster_reconfig_submit_join_marker(int32 target_node pg_attribute_unused(),
28742917
const ClusterJoinCommitMarker *m pg_attribute_unused())

src/include/cluster/cluster_reconfig.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,20 @@ typedef struct ClusterReconfigState {
338338
*/
339339
pg_atomic_uint64 observed_epoch[CLUSTER_MAX_NODES];
340340

341+
/*
342+
* spec-5.15 Hardening v1.3 (INV-J14 stale-slot fail-open) — per declared node,
343+
* whether qvotec's decide_quorum_view saw that node FRESH-ALIVE this poll (its
344+
* voting-disk heartbeat_ts_us recent, per the P2.1 freshness gate). This is the
345+
* liveness signal the cold-bootstrap proof needs: a generation > 0 slot alone
346+
* may be a CRASHED peer's stale leftover at epoch INITIAL — counting it would
347+
* fail-open (latch BOOTSTRAP without a live co-boot quorum). Anchored on the
348+
* durable voting-disk heartbeat (NOT live CSSD), so it is robust to IC/tier1
349+
* heartbeat churn — the v1.2 race fix is preserved. 1 = fresh-alive, 0 = stale /
350+
* absent (default 0 = fail-closed). pg_atomic — qvotec (writer) and LMON (reader)
351+
* are different processes.
352+
*/
353+
pg_atomic_uint64 observed_fresh_alive[CLUSTER_MAX_NODES];
354+
341355
/*
342356
* spec-5.15 D4 — join-commit-marker submit mailbox (§2.6). The coordinator
343357
* stages a marker for the joiner (join_marker_target_node_id), bumps
@@ -482,6 +496,16 @@ extern bool cluster_reconfig_get_observed_slot(int32 node_id, uint64 *incarnatio
482496
uint64 *generation);
483497
extern uint64 cluster_reconfig_get_observed_epoch(int32 node_id);
484498

499+
/*
500+
* spec-5.15 Hardening v1.3 — publish / read the per-node FRESH-ALIVE liveness
501+
* qvotec's decide_quorum_view derived from the durable voting-disk heartbeat
502+
* (the P2.1 freshness gate). The cold-bootstrap proof counts a peer only when
503+
* it is fresh-alive AND has a generation > 0 slot AND is at epoch INITIAL — never on a generation > 0 slot alone
504+
* (a crashed peer's stale leftover). get returns false when absent (fail-closed).
505+
*/
506+
extern void cluster_reconfig_record_observed_fresh_alive(int32 node_id, bool fresh_alive);
507+
extern bool cluster_reconfig_get_observed_fresh_alive(int32 node_id);
508+
485509
/*
486510
* spec-5.15 Hardening v1.1 (HF-1 / INV-J9): true iff a majority of the current
487511
* MEMBER survivors have advanced their durable observed epoch to >=

src/test/cluster_unit/test_cluster_qvotec.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,12 @@ cluster_reconfig_record_observed_slot(int32 node_id pg_attribute_unused(),
447447
uint64 generation pg_attribute_unused(),
448448
uint64 epoch pg_attribute_unused())
449449
{}
450+
/* spec-5.15 Hardening v1.3: qvotec.o now also publishes per-node fresh-alive. */
451+
/*
 * Test double for the fresh-alive publisher: declares the prototype, then
 * defines a body that discards both arguments. NOTE(review): presumably here
 * so the qvotec unit test links without the real reconfig module, and the
 * separate prototype is to satisfy missing-prototype warnings — confirm.
 */
void cluster_reconfig_record_observed_fresh_alive(int32 node_id, bool fresh_alive);
452+
void
453+
cluster_reconfig_record_observed_fresh_alive(int32 node_id pg_attribute_unused(),
454+
bool fresh_alive pg_attribute_unused())
455+
{}
450456
bool cluster_reconfig_join_qvotec_poll_pending(int32 *out_target_node, void *out_slot512);
451457
bool
452458
cluster_reconfig_join_qvotec_poll_pending(int32 *out_target_node,

src/test/cluster_unit/test_cluster_reconfig.c

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1414,10 +1414,12 @@ UT_TEST(test_reconfig_bootstrap_quorum_epoch_proof)
14141414
ut_declared_set[1] = true; /* 3 declared nodes */
14151415
ut_declared_set[2] = true;
14161416

1417-
/* quorum of declared on VALID co-boot slots at INITIAL -> bootstrap proven
1418-
* (v1.2: anchored on the durable voting-disk slot, not live CSSD). */
1417+
/* quorum of declared FRESH-ALIVE co-boot slots at INITIAL -> bootstrap proven
1418+
* (v1.3: durable voting-disk heartbeat freshness + valid slot, not live CSSD). */
14191419
cluster_reconfig_record_observed_slot(1, 1, 1, 0);
14201420
cluster_reconfig_record_observed_slot(2, 1, 1, 0);
1421+
cluster_reconfig_record_observed_fresh_alive(1, true);
1422+
cluster_reconfig_record_observed_fresh_alive(2, true);
14211423
UT_ASSERT(cluster_reconfig_bootstrap_quorum_at_initial());
14221424

14231425
/* a peer past INITIAL (running cluster) -> NOT a bootstrap (fail-closed). */
@@ -1448,15 +1450,18 @@ UT_TEST(test_reconfig_bootstrap_quorum_epoch_proof)
14481450
* ====================================================================== */
14491451
UT_TEST(test_reconfig_bootstrap_proof_valid_slot_not_cssd)
14501452
{
1451-
/* --- A. valid co-boot slots at INITIAL but peers CSSD-DEAD -> still a
1452-
* proven bootstrap (durable slot, not live CSSD). v1.1 returned
1453-
* false here (the race window); v1.2 returns true. --- */
1453+
/* --- A. FRESH-ALIVE co-boot slots at INITIAL but peers CSSD-DEAD -> still a
1454+
* proven bootstrap (durable voting-disk heartbeat, not live CSSD). v1.1
1455+
* returned false here (the race window); v1.2 fixed that and v1.3 still returns true because the
1456+
* liveness is the voting-disk fresh-alive signal, immune to CSSD churn. --- */
14541457
ut_join_setup(); /* self = node 0 */
14551458
ut_declared_set[1] = true; /* 3 declared nodes */
14561459
ut_declared_set[2] = true;
1457-
cluster_reconfig_record_observed_slot(1, 7, 1, 0); /* valid slot, INITIAL */
1458-
cluster_reconfig_record_observed_slot(2, 7, 1, 0); /* valid slot, INITIAL */
1459-
ut_peer_state[1] = CLUSTER_CSSD_PEER_DEAD; /* live CSSD churned down */
1460+
cluster_reconfig_record_observed_slot(1, 7, 1, 0); /* valid slot, INITIAL */
1461+
cluster_reconfig_record_observed_slot(2, 7, 1, 0); /* valid slot, INITIAL */
1462+
cluster_reconfig_record_observed_fresh_alive(1, true); /* voting-disk fresh */
1463+
cluster_reconfig_record_observed_fresh_alive(2, true);
1464+
ut_peer_state[1] = CLUSTER_CSSD_PEER_DEAD; /* live CSSD churned down */
14601465
ut_peer_state[2] = CLUSTER_CSSD_PEER_DEAD;
14611466
UT_ASSERT(cluster_reconfig_bootstrap_quorum_at_initial());
14621467

@@ -1481,14 +1486,50 @@ UT_TEST(test_reconfig_bootstrap_proof_valid_slot_not_cssd)
14811486
}
14821487

14831488

1489+
/* ======================================================================
1490+
* U21 (spec-5.15 Hardening v1.3 / INV-J14 stale-slot fail-open) -- a valid
1491+
* generation > 0 slot at epoch INITIAL is NOT proof of co-booting: it may be a
1492+
* CRASHED peer's stale leftover (decide_quorum_view's P2.1 freshness gate marks
1493+
* it not-fresh). The cold-bootstrap proof must additionally require the per-node
1494+
* FRESH-ALIVE signal (durable voting-disk heartbeat), not slot existence alone —
1495+
* else a node with self + a stale peer slot fail-opens (latches BOOTSTRAP without
1496+
* a live co-boot quorum). v1.2 counted such a stale slot (the regression this
1497+
* guards); v1.3 fail-closes on it.
1498+
* ====================================================================== */
1499+
UT_TEST(test_reconfig_bootstrap_proof_stale_slot_failclosed)
1500+
{
1501+
/* --- A. valid slots at INITIAL but STALE heartbeat (fresh_alive=false) must
1502+
* NOT count -> only self proven -> below quorum -> false. v1.2 (no
1503+
* freshness) counted them = fail-open; v1.3 fail-closes. --- */
1504+
ut_join_setup(); /* self = node 0 */
1505+
ut_declared_set[1] = true; /* 3 declared nodes */
1506+
ut_declared_set[2] = true;
1507+
cluster_reconfig_record_observed_slot(1, 7, 1, 0); /* valid slot, INITIAL */
1508+
cluster_reconfig_record_observed_slot(2, 7, 1, 0); /* valid slot, INITIAL */
1509+
cluster_reconfig_record_observed_fresh_alive(1, false); /* crashed peer: stale hb */
1510+
cluster_reconfig_record_observed_fresh_alive(2, false); /* crashed peer: stale hb */
1511+
UT_ASSERT(!cluster_reconfig_bootstrap_quorum_at_initial());
1512+
1513+
/* --- B. the same slots but FRESH-ALIVE -> genuine co-boot -> proven. --- */
1514+
/* Only the fresh-alive flags are rewritten; the slots recorded in part A are left untouched. */
cluster_reconfig_record_observed_fresh_alive(1, true);
1515+
cluster_reconfig_record_observed_fresh_alive(2, true);
1516+
UT_ASSERT(cluster_reconfig_bootstrap_quorum_at_initial());
1517+
1518+
/* --- C. one fresh + one stale -> self + the one fresh = quorum (3-node) ->
1519+
* proven; the stale peer simply does not contribute (no over-reject). --- */
1520+
/* Node 1 keeps the fresh-alive=true flag set in part B; only node 2 is flipped back to stale. */
cluster_reconfig_record_observed_fresh_alive(2, false);
1521+
UT_ASSERT(cluster_reconfig_bootstrap_quorum_at_initial());
1522+
}
1523+
1524+
14841525
/* ============================================================
14851526
* Main — register + run all tests.
14861527
* ============================================================ */
14871528

14881529
int
14891530
main(void)
14901531
{
1491-
UT_PLAN(43);
1532+
UT_PLAN(45);
14921533

14931534
/* T-reconfig-1 */
14941535
UT_RUN(test_reconfig_dead_bitmap_bytes_eq_16);
@@ -1559,6 +1600,7 @@ main(void)
15591600
UT_RUN(test_reconfig_join_publish_proven_no_member_failclosed);
15601601
UT_RUN(test_reconfig_bootstrap_quorum_epoch_proof);
15611602
UT_RUN(test_reconfig_bootstrap_proof_valid_slot_not_cssd);
1603+
UT_RUN(test_reconfig_bootstrap_proof_stale_slot_failclosed);
15621604

15631605
UT_DONE();
15641606
return ut_failed_count == 0 ? 0 : 1;

0 commit comments

Comments
 (0)