From b64e8c60a2286119f9d758ed67fce8842212d9ed Mon Sep 17 00:00:00 2001 From: SqlRush Date: Mon, 29 Jun 2026 20:51:43 +0800 Subject: [PATCH 1/3] =?UTF-8?q?test(cluster):=20spec-5.19=20MG-B=20?= =?UTF-8?q?=E2=80=94=20t/328=202-node=20report-only=20leg=20+=20always-log?= =?UTF-8?q?ged=20median?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - M3: two-node peer-online single-writer write-tax measurement (real ClusterPair, strict quorum + shared_data; node0 writes while node1 is in quorum). REPORT ONLY: never asserts a threshold and never fails the single-node hard gate -- if the ClusterPair cannot boot/quorum/measure this run it passes with an explicit "unavailable" note. Addresses the 2-node write-path question without weakening the M1 single-node gate. - M1: emit the measured median write tax via diag() (reaches the CI log even on PASS; note() is swallowed by non-verbose prove) so the gate's headroom is visible without re-running the shard verbose. The HARD gate stays the single-node M1 tax <= 10% (rule 8.B). --- .../t/328_stage5_multinode_write_perf.pl | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/src/test/cluster_tap/t/328_stage5_multinode_write_perf.pl b/src/test/cluster_tap/t/328_stage5_multinode_write_perf.pl index d42a656701..143ad08d2f 100644 --- a/src/test/cluster_tap/t/328_stage5_multinode_write_perf.pl +++ b/src/test/cluster_tap/t/328_stage5_multinode_write_perf.pl @@ -18,6 +18,13 @@ # fsync/shared_buffers, pgbench TPC-B write workload, best-of-N for # stability, ratio-based so the runner's absolute speed cancels. # +# Additional report-only measurement: +# two-node peer-online single-writer write tax +# +# This boots a real ClusterPair (strict quorum + shared_data) and measures +# TPC-B writes on node0 while node1 is connected/in quorum. It is a value +# report only: no percentage threshold is asserted here. 
+# # SEPARATE capability limitation (does NOT cover or excuse the gate above): # true concurrent multi-node shared-block write competition is bounded by # cross-node holder migration (DRM = Stage 6; spec-5.57 cross-instance @@ -39,6 +46,7 @@ use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; +use PostgreSQL::Test::ClusterPair; use PostgreSQL::Test::Stage5IntegratedAcceptanceReport; use Test::More; @@ -50,6 +58,7 @@ my $SECS = $ENV{PGRAC_PGBENCH_SECS} // 8; my $CLIENTS = 4; my $ROUNDS = $ENV{PGRAC_PGBENCH_ROUNDS} // 7; # interleaved rounds +my $TWO_NODE_ROUNDS = $ENV{PGRAC_2NODE_PGBENCH_ROUNDS} // $ROUNDS; # The hard gate: cluster write tax must not exceed this percentage. my $GATE_PCT = $ENV{PGRAC_WRITE_TAX_GATE_PCT} // 10.0; @@ -79,6 +88,23 @@ sub median return $s[int((@s) / 2)]; } +sub poll_sql_eq +{ + my ($node, $sql, $want, $timeout_s) = @_; + $timeout_s //= 15; + my $deadline = time + $timeout_s; + my $last = '(never-queried)'; + while (time < $deadline) + { + my $got = eval { $node->safe_psql('postgres', $sql); }; + $last = defined $got ? $got : '(undef)'; + return 1 if defined $got && $got eq $want; + select(undef, undef, undef, 0.25); + } + diag("poll_sql_eq timeout after ${timeout_s}s: want='$want' last='$last' sql=$sql"); + return 0; +} + # Common perf-isolation knobs (CPU-overhead measurement: fsync off removes # disk variance so the gate measures the cluster machinery's added work). my @perf_conf = ( @@ -149,6 +175,16 @@ sub median note(" cluster TPC-B median tps = " . (defined $tps_cluster ? sprintf('%.0f', $tps_cluster) : 'n/a')); note(" write tax % (median) = $tax_s (gate: <= $GATE_PCT%)"); +# Surface the measured median to the captured CI log unconditionally: note() +# is swallowed by non-verbose prove, but diag() reaches the log even on PASS. +# This makes the gate's headroom (e.g. spec-5.19 MG-D v3 WAL delta effect) +# visible without re-running the shard verbose. 
+diag(sprintf("MG-B single-node write tax (median of %d rounds) = %s%% " + . "(gate <= %s%%; native=%s cluster=%s median tps)", + scalar(@taxes), $tax_s, $GATE_PCT, + (defined $tps_native ? sprintf('%.0f', $tps_native) : 'n/a'), + (defined $tps_cluster ? sprintf('%.0f', $tps_cluster) : 'n/a'))); + ok($have_both, "M0 native + cluster single-node throughput measured over " . scalar(@taxes) . " interleaved rounds"); @@ -181,6 +217,94 @@ sub median ok($wait_events_present > 0, "M2 cluster write-path wait-event surface present ($wait_events_present rows)"); +# --------------------------------------------------------------------- +# M3: two-node peer-online write tax — REPORT ONLY. +# +# This is deliberately NOT a hard gate. It measures the current two-node +# online shape that Stage 5 can soundly run: strict-quorum ClusterPair with +# shared_data, node0 executing TPC-B writes while node1 is connected and in +# quorum. True concurrent dual-writer shared-block competition remains the +# separate DRM/Stage-6 limitation recorded below. +# --------------------------------------------------------------------- +my @two_node_tps; +my $two_node_started = 0; +my $two_node_ready = 0; +eval { + my @pair_perf_conf = map { my $line = $_; chomp $line; $line } @perf_conf; + my $pair = PostgreSQL::Test::ClusterPair->new_pair( + 'mnw_pair', + quorum_voting_disks => 3, + shared_data => 1, + extra_conf => [ + @pair_perf_conf, + 'cluster.quorum_poll_interval_ms = 500', + 'cluster.cssd_heartbeat_interval_ms = 2000', + 'cluster.cssd_dead_deadband_factor = 10', + ]); + $pair->start_pair; + $two_node_started = 1; + $two_node_ready = + $pair->wait_for_peer_state(0, 1, 'connected', 30) + && $pair->wait_for_peer_state(1, 0, 'connected', 30) + && poll_sql_eq($pair->node0, 'SELECT in_quorum FROM pg_cluster_quorum_state', 't', 20) + && poll_sql_eq($pair->node1, 'SELECT in_quorum FROM pg_cluster_quorum_state', 't', 20); + + if ($two_node_ready && pgbench_init($pair->node0)) + { + for my $r (1 .. 
$TWO_NODE_ROUNDS) + { + my $t = pgbench_one($pair->node0); + next unless defined $t && $t > 0; + push @two_node_tps, $t; + note(sprintf(" two-node round %d: node0 tps=%.0f", $r, $t)); + } + } + $pair->stop_pair; + 1; +} or do { + my $err = $@ || 'unknown error'; + diag("M3 two-node report-only measurement failed before completion: $err"); +}; + +my $two_have = ($two_node_started && $two_node_ready && scalar(@two_node_tps) > 0 + && defined $tps_native && $tps_native > 0); +my $two_tps = scalar(@two_node_tps) ? median(@two_node_tps) : undef; +my $two_tax = ($two_have && defined $two_tps) + ? 100.0 * (1.0 - $two_tps / $tps_native) : undef; +my $two_tax_s = defined $two_tax ? sprintf('%.2f', $two_tax) : 'n/a'; + +note("MG-B two-node peer-online write-path REPORT-ONLY measurement:"); +note(" native single-node TPC-B median tps = " + . (defined $tps_native ? sprintf('%.0f', $tps_native) : 'n/a')); +note(" two-node peer-online node0 TPC-B median tps = " + . (defined $two_tps ? sprintf('%.0f', $two_tps) : 'n/a')); +note(" two-node write tax % (report-only) = $two_tax_s"); + +# REPORT ONLY: this leg must never fail the single-node hard gate. If the +# 2-node ClusterPair could not boot / reach quorum / produce a number this run +# (transient runner shmem pressure, etc.), pass with an explicit unavailable +# note rather than failing -- the HARD gate is the single-node M1 tax only. +if ($two_have) +{ + diag("MG-B two-node peer-online write tax (report-only) = ${two_tax_s}%"); + ok(1, + "M3 two-node peer-online single-writer write tax measured: ${two_tax_s}% " + . "(REPORT ONLY; no threshold asserted)"); +} +else +{ + ok(1, + "M3 two-node peer-online write tax unavailable this run " + . "(REPORT ONLY; never fails the single-node hard gate)"); +} +$report->record_multinode_write_value(2, 'tpcb-peer-online-single-writer', + tps_native => (defined $tps_native ? $tps_native : 0), + tps_cluster => (defined $two_tps ? 
$two_tps : 0), + write_tax_pct => $two_tax_s, + gate => 'REPORT-ONLY', + note => 'ClusterPair strict-quorum + shared_data; node0 writes while node1 ' + . 'is connected/in quorum. No threshold asserted.'); + # --------------------------------------------------------------------- # SOUNDNESS — the single-node tax above is REAL + gated. The TRUE concurrent # multi-node shared-block write limit is a SEPARATE capability limitation that From 48086011393d81d57264462eba4104f97d628749 Mon Sep 17 00:00:00 2001 From: SqlRush Date: Mon, 29 Jun 2026 20:52:06 +0800 Subject: [PATCH 2/3] =?UTF-8?q?perf(cluster):=20spec-5.19=20MG-D=20?= =?UTF-8?q?=E2=80=94=20heap-ITL=20WAL=20delta=20v3=20(drop=20always-Invali?= =?UTF-8?q?d=20commit=5Fscn,=2048->40B/record)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the MG-B single-node write-tax blocker (rule 8.B). CI nightly measured the v2 (48B/record) tax at 10.62% > the 10% hard gate; an A/B that registered the delta 8B shorter passed the gate, confirming the 8B drop is sufficient. The write-time commit_scn (8B) is ALWAYS InvalidScn at every write-path ITL emit site: heap_insert / multi_insert / delete / lock / lock-chain / update old+new only ever stamp ITL_FLAG_ACTIVE / ITL_FLAG_LOCK_ONLY_ACTIVE transitions (the slot is not yet committed; COMMITTED stamping happens via the later commit-time / delayed-cleanout page mutation, FPI-logged, not via a write-path delta). Dropping an always-Invalid field is lossless. - heapam_xlog.h: new xl_heap_itl_delta_v3 (32B) + CLUSTER_ITL_DELTA_FORMAT_V3; keeps UBA (undo_segment_head moves 24->16), only commit_scn elided. StaticAsserts pin sizeof==32 and all field offsets. - cluster_itl.c: redo dispatches v1/v2/v3 by block format_version; v3 reconstructs commit_scn=InvalidScn. v1/v2 branches retained for backward WAL replay. consumed-bytes helper extended for v3. 
- heapam.c: all 8 write-path emit sites switch v2->v3 (drop the commit_scn assignment; register sizeof(v3)=32). - The COMMITTED-requires-valid-SCN redo guard still fires for any v3 delta that carries ITL_FLAG_COMMITTED -> fails closed (PANIC), so v3 can never silently install a committed slot with InvalidScn (8.A). - catversion 202606330 -> 202606340: fences an old binary from replaying v3-format WAL (unknown format_version -> redo PANIC). - Per mutating heap record: 8 + 40 == 48B -> 8 + 32 == 40B. - Tests: test_cluster_itl_wal v3 layout (T30-T33); D8 L6 invariant -> 40B (v2 40B retained for backward replay); t/329 MG-D model -> 40B + decision framing (v3 GO part shipped; same-block coalesce remains a follow-up). --- src/backend/access/heap/heapam.c | 48 ++++++++---------- src/backend/cluster/cluster_itl.c | 24 ++++++++- src/include/access/heapam_xlog.h | 49 +++++++++++++++++++ src/include/catalog/catversion.h | 9 +++- .../t/329_stage5_heap_itl_wal_measure.pl | 27 ++++++---- src/test/cluster_unit/test_cluster_itl_wal.c | 39 +++++++++++++++ ...est_cluster_stage5_integrated_acceptance.c | 30 +++++++----- 7 files changed, 174 insertions(+), 52 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ed7453c6d0..fceec3e062 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2185,7 +2185,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, int bufflags = 0; #ifdef USE_PGRAC_CLUSTER xl_heap_itl_delta_block cluster_itl_hdr; - xl_heap_itl_delta_v2 cluster_itl_delta; /* spec-3.4b D6 F9 — v2 40B */ + xl_heap_itl_delta_v3 cluster_itl_delta; /* spec-5.19 MG-D — v3 32B (commit_scn dropped) */ #endif /* @@ -2241,12 +2241,11 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, { cluster_itl_hdr.ndeltas = 1; cluster_itl_hdr.reserved = 0; - cluster_itl_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V2; + cluster_itl_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V3; 
cluster_itl_delta.slot_idx = cluster_itl_slot; cluster_itl_delta.flags_after = ITL_FLAG_ACTIVE; cluster_itl_delta.xid = xid; cluster_itl_delta.write_scn = ClusterPageGetItlSlots(BufferGetPage(buffer))[cluster_itl_slot].write_scn; - cluster_itl_delta.commit_scn = InvalidScn; cluster_itl_delta.undo_segment_head = cluster_itl_uba; xlrec.flags |= XLH_INSERT_ITL_DELTA; @@ -2777,16 +2776,15 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (cluster_mi_active) { xl_heap_itl_delta_block mi_hdr; - xl_heap_itl_delta_v2 mi_delta; /* spec-3.4b D6 F9 — v2 40B */ + xl_heap_itl_delta_v3 mi_delta; /* spec-5.19 MG-D — v3 32B (commit_scn dropped) */ mi_hdr.ndeltas = 1; mi_hdr.reserved = 0; - mi_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V2; + mi_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V3; mi_delta.slot_idx = cluster_mi_slot; mi_delta.flags_after = ITL_FLAG_ACTIVE; mi_delta.xid = xid; mi_delta.write_scn = ClusterPageGetItlSlots(page)[cluster_mi_slot].write_scn; - mi_delta.commit_scn = InvalidScn; mi_delta.undo_segment_head = cluster_mi_uba; XLogRegisterData((char *) &mi_hdr, @@ -3603,7 +3601,7 @@ heap_delete(Relation relation, ItemPointer tid, XLogRecPtr recptr; #ifdef USE_PGRAC_CLUSTER xl_heap_itl_delta_block cluster_itl_hdr; - xl_heap_itl_delta_v2 cluster_itl_delta; /* spec-3.4b D6 F9 — v2 40B */ + xl_heap_itl_delta_v3 cluster_itl_delta; /* spec-5.19 MG-D — v3 32B (commit_scn dropped) */ #endif /* @@ -3638,12 +3636,11 @@ heap_delete(Relation relation, ItemPointer tid, { cluster_itl_hdr.ndeltas = 1; cluster_itl_hdr.reserved = 0; - cluster_itl_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V2; + cluster_itl_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V3; cluster_itl_delta.slot_idx = cluster_itl_slot; cluster_itl_delta.flags_after = ITL_FLAG_ACTIVE; cluster_itl_delta.xid = xid; cluster_itl_delta.write_scn = ClusterPageGetItlSlots(page)[cluster_itl_slot].write_scn; - cluster_itl_delta.commit_scn = InvalidScn; cluster_itl_delta.undo_segment_head = 
cluster_itl_uba; xlrec.flags |= XLH_DELETE_ITL_DELTA; } @@ -6464,7 +6461,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, XLogRecPtr recptr; #ifdef USE_PGRAC_CLUSTER xl_heap_itl_delta_block hdr; - xl_heap_itl_delta_v2 delta; + xl_heap_itl_delta_v3 delta; #endif XLogBeginInsert(); @@ -6485,7 +6482,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, #ifdef USE_PGRAC_CLUSTER /* - * PGRAC (spec-3.4d D4 WAL emit / Q4 A2): append v2 40B ITL delta + * PGRAC (spec-3.4d D4 WAL emit / Q4 A2): append v3 32B ITL delta (spec-5.19 MG-D) * + 4B block header inside same xlrec so heap_xlog_lock can replay * the lock-only ITL slot stamp on standbys. Layout mirrors * spec-3.4b D6 single-block delta WAL ABI. @@ -6493,7 +6490,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (cluster_did_lock_stamp) { memset(&hdr, 0, sizeof(hdr)); - hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V2; + hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V3; hdr.ndeltas = 1; hdr.reserved = 0; XLogRegisterData((char *) &hdr, offsetof(xl_heap_itl_delta_block, deltas)); @@ -6503,7 +6500,6 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, delta.flags_after = ITL_FLAG_LOCK_ONLY_ACTIVE; delta.xid = xid; delta.write_scn = cluster_lock_write_scn; - delta.commit_scn = InvalidScn; delta.undo_segment_head = cluster_lock_uba; XLogRegisterData((char *) &delta, sizeof(delta)); } @@ -7366,7 +7362,7 @@ heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, Page page = BufferGetPage(buf); #ifdef USE_PGRAC_CLUSTER xl_heap_itl_delta_block chain_hdr; - xl_heap_itl_delta_v2 chain_delta; + xl_heap_itl_delta_v3 chain_delta; #endif XLogBeginInsert(); @@ -7388,7 +7384,7 @@ heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, if (cluster_chain_lock_stamp) { memset(&chain_hdr, 0, sizeof(chain_hdr)); - chain_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V2; + chain_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V3; chain_hdr.ndeltas = 1; chain_hdr.reserved = 0; XLogRegisterData((char *) 
&chain_hdr, @@ -7399,7 +7395,6 @@ heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, chain_delta.flags_after = ITL_FLAG_LOCK_ONLY_ACTIVE; chain_delta.xid = xid; chain_delta.write_scn = cluster_chain_write_scn; - chain_delta.commit_scn = InvalidScn; chain_delta.undo_segment_head = cluster_chain_uba; XLogRegisterData((char *) &chain_delta, sizeof(chain_delta)); } @@ -10590,9 +10585,9 @@ log_heap_update(Relation reln, Buffer oldbuf, int bufflags; #ifdef USE_PGRAC_CLUSTER xl_heap_itl_delta_block cluster_itl_old_hdr; - xl_heap_itl_delta_v2 cluster_itl_old_delta; /* spec-3.4b D6 F9 — v2 40B */ + xl_heap_itl_delta_v3 cluster_itl_old_delta; /* spec-5.19 MG-D — v3 32B (commit_scn dropped) */ xl_heap_itl_delta_block cluster_itl_new_hdr; - xl_heap_itl_delta_v2 cluster_itl_new_delta; /* spec-3.4b D6 F9 — v2 40B */ + xl_heap_itl_delta_v3 cluster_itl_new_delta; /* spec-5.19 MG-D — v3 32B (commit_scn dropped) */ #endif /* Caller should not call me on a non-WAL-logged relation */ @@ -10695,24 +10690,22 @@ log_heap_update(Relation reln, Buffer oldbuf, { cluster_itl_new_hdr.ndeltas = 1; cluster_itl_new_hdr.reserved = 0; - cluster_itl_new_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V2; + cluster_itl_new_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V3; cluster_itl_new_delta.slot_idx = cluster_itl_new_slot; cluster_itl_new_delta.flags_after = ITL_FLAG_ACTIVE; cluster_itl_new_delta.xid = cluster_itl_xid; cluster_itl_new_delta.write_scn = ClusterPageGetItlSlots(BufferGetPage(newbuf))[cluster_itl_new_slot].write_scn; - cluster_itl_new_delta.commit_scn = InvalidScn; cluster_itl_new_delta.undo_segment_head = cluster_itl_uba; } if (cluster_itl_old_active && oldbuf != newbuf) { cluster_itl_old_hdr.ndeltas = 1; cluster_itl_old_hdr.reserved = 0; - cluster_itl_old_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V2; + cluster_itl_old_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V3; cluster_itl_old_delta.slot_idx = cluster_itl_old_slot; cluster_itl_old_delta.flags_after = 
ITL_FLAG_ACTIVE; cluster_itl_old_delta.xid = cluster_itl_xid; cluster_itl_old_delta.write_scn = ClusterPageGetItlSlots(BufferGetPage(oldbuf))[cluster_itl_old_slot].write_scn; - cluster_itl_old_delta.commit_scn = InvalidScn; cluster_itl_old_delta.undo_segment_head = cluster_itl_uba; } else if (cluster_itl_old_active) @@ -10724,12 +10717,11 @@ log_heap_update(Relation reln, Buffer oldbuf, */ cluster_itl_new_hdr.ndeltas = 1; cluster_itl_new_hdr.reserved = 0; - cluster_itl_new_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V2; + cluster_itl_new_hdr.format_version = CLUSTER_ITL_DELTA_FORMAT_V3; cluster_itl_new_delta.slot_idx = cluster_itl_old_slot; cluster_itl_new_delta.flags_after = ITL_FLAG_ACTIVE; cluster_itl_new_delta.xid = cluster_itl_xid; cluster_itl_new_delta.write_scn = ClusterPageGetItlSlots(BufferGetPage(newbuf))[cluster_itl_old_slot].write_scn; - cluster_itl_new_delta.commit_scn = InvalidScn; cluster_itl_new_delta.undo_segment_head = cluster_itl_uba; } } @@ -11524,7 +11516,7 @@ heap_xlog_delete(XLogReaderState *record) /* * PGRAC (spec-3.4a D9 / spec-3.4b D6): replay block-local ITL * delta when XLH_DELETE_ITL_DELTA is set. The helper dispatches - * by format_version (v1 24B legacy / v2 40B with UBA). + * by format_version (v1 24B legacy; v2 40B and v3 32B carry the UBA). */ if (xlrec->flags & XLH_DELETE_ITL_DELTA) { @@ -11666,7 +11658,7 @@ heap_xlog_insert(XLogReaderState *record) /* * PGRAC (spec-3.4a D9 / spec-3.4b D6): replay block-local ITL * delta array when XLH_INSERT_ITL_DELTA is set. The helper - * dispatches by format_version (v1 24B legacy / v2 40B with UBA). + * dispatches by format_version (v1 24B legacy; v2 40B and v3 32B carry the UBA). */ if (xlrec->flags & XLH_INSERT_ITL_DELTA) { @@ -12343,7 +12335,7 @@ heap_xlog_lock(XLogReaderState *record) #ifdef USE_PGRAC_CLUSTER /* * PGRAC (spec-3.4d D6 redo / Q4 A2): replay lock-only ITL slot - * stamp from v2 40B delta appended after xlrec.
See spec-3.4b D6 + * stamp from the dispatched ITL delta (v1 24B / v2 40B / v3 32B) appended after xlrec. See spec-3.4b D6 * for delta block layout. htup tuple header has no * t_lock_itl_slot_idx field (per F2 raw_xmax scan derivation), so * we do not patch any tuple header field; the slot stamp itself @@ -12419,7 +12411,7 @@ heap_xlog_lock_updated(XLogReaderState *record) #ifdef USE_PGRAC_CLUSTER /* PGRAC (spec-3.4d D6 redo / follow_updates): replay lock-only - * ITL slot for successor tuple from v2 40B delta. */ + * ITL slot for successor tuple from the dispatched ITL delta (v2 40B / v3 32B). */ if (xlrec->flags & XLH_LOCK_UPDATED_ITL_DELTA) { const char *delta_start = ((const char *) xlrec) + SizeOfHeapLockUpdated; diff --git a/src/backend/cluster/cluster_itl.c b/src/backend/cluster/cluster_itl.c index 3c440d425a..64b0c068c1 100644 --- a/src/backend/cluster/cluster_itl.c +++ b/src/backend/cluster/cluster_itl.c @@ -968,6 +968,8 @@ cluster_itl_redo_apply_block_local_delta(Page page, HeapTupleHeader htup, delta_size = sizeof(xl_heap_itl_delta); else if (hdr.format_version == CLUSTER_ITL_DELTA_FORMAT_V2) delta_size = sizeof(xl_heap_itl_delta_v2); + else if (hdr.format_version == CLUSTER_ITL_DELTA_FORMAT_V3) + delta_size = sizeof(xl_heap_itl_delta_v3); else elog(PANIC, "spec-3.4b D6: unknown xl_heap_itl_delta_block.format_version %u", (unsigned)hdr.format_version); @@ -996,7 +998,7 @@ cluster_itl_redo_apply_block_local_delta(Page page, HeapTupleHeader htup, * slot's existing UBA on page is preserved. Legacy ACTIVE * stamps wrote InvalidUba to the page anyway, so reader * 3-branch (D7) will fall back to zero triple. 
*/ - } else { + } else if (hdr.format_version == CLUSTER_ITL_DELTA_FORMAT_V2) { xl_heap_itl_delta_v2 d; memcpy(&d, p, sizeof(d)); @@ -1006,6 +1008,24 @@ cluster_itl_redo_apply_block_local_delta(Page page, HeapTupleHeader htup, d_write_scn = d.write_scn; d_commit_scn = d.commit_scn; d_uba = d.undo_segment_head; + } else { + /* CLUSTER_ITL_DELTA_FORMAT_V3 (delta_size dispatch above already + * PANICked on any other value). */ + xl_heap_itl_delta_v3 d; + + memcpy(&d, p, sizeof(d)); + slot_idx = d.slot_idx; + flags_after = d.flags_after; + d_xid = d.xid; + d_write_scn = d.write_scn; + /* spec-5.19 MG-D: v3 elides the write-time commit_scn (it is + * always InvalidScn for the ACTIVE / LOCK_ONLY_ACTIVE transitions + * the write path emits). Reconstruct it as InvalidScn. If a v3 + * delta ever carries ITL_FLAG_COMMITTED, the COMMITTED-requires- + * valid-SCN guard below fails closed (PANIC) -- v3 must never be + * used for a COMMITTED transition. */ + d_commit_scn = InvalidScn; + d_uba = d.undo_segment_head; } if (flags_after == ITL_FLAG_COMMITTED && !SCN_VALID(d_commit_scn)) @@ -1085,6 +1105,8 @@ cluster_itl_wal_block_consumed_bytes(const char *itl_block_start) delta_size = sizeof(xl_heap_itl_delta); else if (hdr.format_version == CLUSTER_ITL_DELTA_FORMAT_V2) delta_size = sizeof(xl_heap_itl_delta_v2); + else if (hdr.format_version == CLUSTER_ITL_DELTA_FORMAT_V3) + delta_size = sizeof(xl_heap_itl_delta_v3); else elog(PANIC, "spec-3.4b D6: unknown xl_heap_itl_delta_block.format_version %u", (unsigned)hdr.format_version); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index f17ffc61d2..5171ca25a9 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -545,8 +545,34 @@ StaticAssertDecl(offsetof(xl_heap_itl_delta_block, deltas) == 8, * falls back to zero triple → PG-native). * xl_heap_itl_delta_block.format_version == 1 → v2 (40B deltas; * UBA bytes restored from delta). 
+ * xl_heap_itl_delta_block.format_version == 2 → v3 (32B deltas; + * UBA bytes restored from delta; commit_scn elided, see below). * Other values → PANIC (corruption). * + * PGRAC (spec-5.19 MG-D): v3 ITL delta drops the write-time commit_scn + * field (8B). Every write-path emit site (heap_insert / multi_insert / + * delete / lock / lock-chain / update old+new) only ever stamps an + * ITL_FLAG_ACTIVE / ITL_FLAG_LOCK_ONLY_ACTIVE transition, for which + * commit_scn is *always* InvalidScn at write time (the slot is not yet + * committed -- COMMITTED stamping happens later via the commit-time / + * delayed-cleanout page mutation, which is FPI-logged, not via a write-path + * delta). Dropping an always-Invalid field is therefore lossless: redo + * reconstructs commit_scn = InvalidScn for every v3 delta. This shrinks + * the per-mutating-record heap-ITL WAL footprint from 8 + 40 == 48 B to + * 8 + 32 == 40 B (MG-D measure baseline). The COMMITTED-requires-valid-SCN + * redo guard (heap_redo) still fires for any v3 delta that somehow carries + * ITL_FLAG_COMMITTED, so a mis-emitted COMMITTED v3 delta fails closed + * (PANIC) rather than installing a committed slot with InvalidScn. 
+ * + * Wire-stable layout (cluster_unit test_cluster_itl_wal enforces): + * xl_heap_itl_delta_v3 (32 bytes): + * offset 0, 2B : slot_idx + * offset 2, 2B : flags_after + * offset 4, 4B : xid + * offset 8, 8B : write_scn + * offset 16, 16B : undo_segment_head (UBA; InvalidUba on finish + * deltas that do not re-bind -- same semantic as v2) + * * Wire-stable layout (cluster_unit test_cluster_itl_wal enforces): * xl_heap_itl_delta_v2 (40 bytes): * offset 0, 2B : slot_idx @@ -567,6 +593,7 @@ StaticAssertDecl(offsetof(xl_heap_itl_delta_block, deltas) == 8, */ #define CLUSTER_ITL_DELTA_FORMAT_V1 ((uint32) 0) #define CLUSTER_ITL_DELTA_FORMAT_V2 ((uint32) 1) +#define CLUSTER_ITL_DELTA_FORMAT_V3 ((uint32) 2) typedef struct xl_heap_itl_delta_v2 { @@ -593,6 +620,28 @@ StaticAssertDecl(offsetof(xl_heap_itl_delta_v2, commit_scn) == 16, StaticAssertDecl(offsetof(xl_heap_itl_delta_v2, undo_segment_head) == 24, "spec-3.4b D6 — undo_segment_head at offset 24"); +typedef struct xl_heap_itl_delta_v3 +{ + uint16 slot_idx; /* offset 0, 2B */ + uint16 flags_after; /* offset 2, 2B (ClusterItlFlags) */ + TransactionId xid; /* offset 4, 4B */ + SCN write_scn; /* offset 8, 8B */ + UBA undo_segment_head; /* offset 16, 16B (commit_scn elided) */ +} xl_heap_itl_delta_v3; + +StaticAssertDecl(sizeof(xl_heap_itl_delta_v3) == 32, + "spec-5.19 MG-D — xl_heap_itl_delta_v3 must be 32 bytes (v2 40B minus the always-Invalid 8B commit_scn)"); +StaticAssertDecl(offsetof(xl_heap_itl_delta_v3, slot_idx) == 0, + "spec-5.19 MG-D — slot_idx at offset 0"); +StaticAssertDecl(offsetof(xl_heap_itl_delta_v3, flags_after) == 2, + "spec-5.19 MG-D — flags_after at offset 2"); +StaticAssertDecl(offsetof(xl_heap_itl_delta_v3, xid) == 4, + "spec-5.19 MG-D — xid at offset 4"); +StaticAssertDecl(offsetof(xl_heap_itl_delta_v3, write_scn) == 8, + "spec-5.19 MG-D — write_scn at offset 8"); +StaticAssertDecl(offsetof(xl_heap_itl_delta_v3, undo_segment_head) == 16, + "spec-5.19 MG-D — undo_segment_head at offset 16 
(commit_scn dropped vs v2)"); + #endif /* USE_PGRAC_CLUSTER */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 84a293bf75..2101479e99 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -706,7 +706,14 @@ /* spec-5.18: permanent node removal — pg_cluster_node_removal_state SRF (oid 8963) * + pg_cluster_remove_node UDF (oid 8964) + pg_cluster_membership +2 cols * (removed/removed_epoch) + 53R63/53R64 SQLSTATE. Bump 202606320 -> 202606330. */ -#define CATALOG_VERSION_NO 202606330 +/* spec-5.19 MG-D (2026-06-29): heap-ITL WAL delta v3 — new xl_heap_itl_delta_v3 + * (32B) + CLUSTER_ITL_DELTA_FORMAT_V3; the always-Invalid write-time commit_scn + * (8B) is dropped from every write-path ITL delta, shrinking the per-mutating- + * record footprint 8+40==48B -> 8+32==40B. Redo keeps v1/v2 branches for + * backward replay and reconstructs commit_scn=InvalidScn for v3. No catalog + * surface change; the bump fences an old binary from replaying v3-format WAL + * (unknown format_version -> redo PANIC). Bump 202606330 -> 202606340. 
*/ +#define CATALOG_VERSION_NO 202606340 /* spec-5.13 (2026-06-27): clean-leave catalog surface — cluster_get_clean_leave_state * SRF (oid 8960) + pg_cluster_clean_leave_state view + pg_cluster_clean_leave_request diff --git a/src/test/cluster_tap/t/329_stage5_heap_itl_wal_measure.pl b/src/test/cluster_tap/t/329_stage5_heap_itl_wal_measure.pl index 7110b538f5..d3a992dfbb 100644 --- a/src/test/cluster_tap/t/329_stage5_heap_itl_wal_measure.pl +++ b/src/test/cluster_tap/t/329_stage5_heap_itl_wal_measure.pl @@ -7,12 +7,19 @@ # # Every mutating heap WAL record (INSERT / UPDATE / DELETE / LOCK / # LOCK_UPDATED) carries a fixed 8-byte block header (xl_heap_itl_delta_ -# block) + 40-byte v2 delta (xl_heap_itl_delta_v2) == 48 bytes, with +# block) + 32-byte v3 delta (xl_heap_itl_delta_v3) == 40 bytes, with # ndeltas == 1 (NOT coalesced — N single-row mutations to the same block -# emit N separate 48-byte deltas). This test MEASURES that overhead under +# emit N separate 40-byte deltas). This test MEASURES that overhead under # a hot-block workload and emits a GO/NO-GO decision on whether to -# implement WAL-delta compaction (array-pack multiple deltas / elide -# repeated ACTIVE stamps). +# implement the REMAINING WAL-delta compaction (array-pack multiple deltas +# into one record / elide repeated ACTIVE stamps / dedup the 8-byte block +# header across same-block deltas). +# +# MG-D GO part already shipped: the per-delta commit_scn (always Invalid at +# write time) was dropped in spec-5.19 (v2 40B -> v3 32B; per-record +# 48 -> 40 B), to close the MG-B single-node write-tax blocker. This test +# now measures the per-record footprint at the post-v3 40 B and evaluates +# the FURTHER same-block header-dedup / coalesce opportunity. # # Measure-and-decide (L257): the metric is report-only. NO-GO is a legal # outcome (small overhead / non-blocker — mirrors the spec-5.53 / 5.55 @@ -22,7 +29,7 @@ # Method: single cluster-enabled node, autovacuum off. 
A hot-block # workload performs many single-row UPDATEs concentrated on a few heap # pages, then pg_waldump over the workload LSN window counts the mutating -# heap records (each == one 48-byte ITL delta) and the same-block grouping +# heap records (each == one 40-byte ITL delta) and the same-block grouping # (the array-pack coalesce opportunity). # # Author: SqlRush @@ -42,7 +49,7 @@ use PostgreSQL::Test::Stage5IntegratedAcceptanceReport; use Test::More; -my $ITL_DELTA_BYTES = 48; # 8 (block header) + 40 (v2 delta) — D8 L6 invariant +my $ITL_DELTA_BYTES = 40; # 8 (block header) + 32 (v3 delta) — D8 L6 invariant, post-MG-D-GO my $report = PostgreSQL::Test::Stage5IntegratedAcceptanceReport->new( tag => $ENV{PGRAC_TAG} // 'unknown'); @@ -120,7 +127,7 @@ next unless $line =~ /rmgr:\s+Heap2?\s/; next unless $line =~ /desc:\s+(INSERT|UPDATE|HOT_UPDATE|DELETE|LOCK|LOCK_UPDATED|MULTI_INSERT)/; my $op = $1; - # Only the per-record mutations carry one 48-byte ITL delta. + # Only the per-record mutations carry one 40-byte ITL delta (v3). $heap_mut_records++; if ($line =~ m{len \(rec/tot\):\s*\d+/\s*(\d+)}) { @@ -161,7 +168,7 @@ note(" mutating heap records = $heap_mut_records"); note(" heap WAL total bytes = $heap_total_bytes"); note(" total WAL bytes (window) = $total_wal_bytes"); -note(" ITL delta bytes (48 * recs) = $itl_delta_bytes"); +note(" ITL delta bytes (40 * recs) = $itl_delta_bytes"); note(" ITL delta share of heap WAL = $itl_share_pct %"); note(" coalescible (same-block) recs = $coalescible_records ($coalesce_rate_pct %)"); note(" header-dedup saving (lower b.) = $header_dedup_saving_bytes B ($header_dedup_saving_pct % of heap WAL)"); @@ -191,7 +198,7 @@ }, blocker => $blocker, threshold_pct => $GO_THRESHOLD_PCT, - note => "fixed 48 B/record ITL delta, ndeltas==1 (not coalesced); " + note => "fixed 40 B/record ITL delta (v3; commit_scn dropped), ndeltas==1 (not coalesced); " . "$decision per header-dedup lower bound vs ${GO_THRESHOLD_PCT}% " . 
"threshold; non-blocker (shipped correct cost, not a regression)"); @@ -200,7 +207,7 @@ "MG-D measured $heap_mut_records mutating heap records under the hot-block " . "workload"); ok($itl_delta_bytes == $heap_mut_records * $ITL_DELTA_BYTES, - "MG-D ITL delta overhead == 48 B * mutating records (D8 L6 invariant)"); + "MG-D ITL delta overhead == 40 B * mutating records (D8 L6 invariant)"); ok($decision eq 'GO' || $decision eq 'NO-GO', "MG-D decision recorded: $decision (header-dedup saving $header_dedup_saving_pct% " . "vs ${GO_THRESHOLD_PCT}% threshold; non-blocker)"); diff --git a/src/test/cluster_unit/test_cluster_itl_wal.c b/src/test/cluster_unit/test_cluster_itl_wal.c index ee3d5a412e..bdb3faa715 100644 --- a/src/test/cluster_unit/test_cluster_itl_wal.c +++ b/src/test/cluster_unit/test_cluster_itl_wal.c @@ -264,6 +264,41 @@ UT_TEST(test_t29_v1_v2_slot_idx_same_offset) (int)offsetof(xl_heap_itl_delta_v2, slot_idx)); } +/* ---------- spec-5.19 MG-D v3 32B ABI tests (commit_scn dropped) ---------- */ + +UT_TEST(test_t30_delta_v3_sizeof_32) +{ + /* v2 40B minus the always-Invalid 8B commit_scn == 32B. */ + UT_ASSERT_EQ((int)sizeof(xl_heap_itl_delta_v3), 32); +} + +UT_TEST(test_t31_delta_v3_field_offsets) +{ + UT_ASSERT_EQ((int)offsetof(xl_heap_itl_delta_v3, slot_idx), 0); + UT_ASSERT_EQ((int)offsetof(xl_heap_itl_delta_v3, flags_after), 2); + UT_ASSERT_EQ((int)offsetof(xl_heap_itl_delta_v3, xid), 4); + UT_ASSERT_EQ((int)offsetof(xl_heap_itl_delta_v3, write_scn), 8); + /* commit_scn is gone; undo_segment_head moves up from 24 (v2) to 16. 
*/ + UT_ASSERT_EQ((int)offsetof(xl_heap_itl_delta_v3, undo_segment_head), 16); +} + +UT_TEST(test_t32_format_version_v3_distinct) +{ + UT_ASSERT_EQ((int)CLUSTER_ITL_DELTA_FORMAT_V3, 2); + UT_ASSERT_NE((int)CLUSTER_ITL_DELTA_FORMAT_V3, (int)CLUSTER_ITL_DELTA_FORMAT_V1); + UT_ASSERT_NE((int)CLUSTER_ITL_DELTA_FORMAT_V3, (int)CLUSTER_ITL_DELTA_FORMAT_V2); +} + +UT_TEST(test_t33_v1_v2_v3_slot_idx_same_offset) +{ + /* cluster_itl_wal_block_first_slot_idx reads slot_idx at offset 0 + * version-agnostically -- must hold for v3 too. */ + UT_ASSERT_EQ((int)offsetof(xl_heap_itl_delta, slot_idx), + (int)offsetof(xl_heap_itl_delta_v3, slot_idx)); + UT_ASSERT_EQ((int)offsetof(xl_heap_itl_delta_v2, slot_idx), + (int)offsetof(xl_heap_itl_delta_v3, slot_idx)); +} + int main(void) @@ -297,5 +332,9 @@ main(void) UT_RUN(test_t27_block_format_version_repurposed_pad); UT_RUN(test_t28_block_deltas_offset_still_8); UT_RUN(test_t29_v1_v2_slot_idx_same_offset); + UT_RUN(test_t30_delta_v3_sizeof_32); + UT_RUN(test_t31_delta_v3_field_offsets); + UT_RUN(test_t32_format_version_v3_distinct); + UT_RUN(test_t33_v1_v2_v3_slot_idx_same_offset); UT_DONE(); } diff --git a/src/test/cluster_unit/test_cluster_stage5_integrated_acceptance.c b/src/test/cluster_unit/test_cluster_stage5_integrated_acceptance.c index 850db6421f..e3f6f28480 100644 --- a/src/test/cluster_unit/test_cluster_stage5_integrated_acceptance.c +++ b/src/test/cluster_unit/test_cluster_stage5_integrated_acceptance.c @@ -31,11 +31,13 @@ * ship value; update-required contract) + the multi-node write-path * wait events present and pairwise distinct (GES_S4 / GES_REPLY / * CF_ENQUEUE / CR_CONSTRUCT / REL_EXTEND_WAIT — the MG-B M2 share). 
- * L6 heap-ITL WAL delta width invariant (MG-D measure baseline): - * sizeof(xl_heap_itl_delta_v2) == 40 and + * L6 heap-ITL WAL delta width invariant (MG-D decided GO): + * sizeof(xl_heap_itl_delta_v3) == 32 and * offsetof(xl_heap_itl_delta_block, deltas) == 8 — every mutating - * heap record carries a fixed 8 + 40 == 48-byte ITL delta; a layout - * change would invalidate the MG-D 48B/record measurement basis. + * heap record now carries a fixed 8 + 32 == 40-byte ITL delta (was + * 8 + 40 == 48 with v2; commit_scn dropped). v2 (40B) retained for + * backward replay. A layout change would invalidate the MG-D + * 40B/record measurement basis. * * Static contract assertions only. Behavioral coverage in cluster_tap * t/32x (reconfig matrix), the HW/extend workload, and the production- @@ -206,15 +208,19 @@ UT_TEST(test_stage5_wait_events_count_and_multinode_set) UT_TEST(test_stage5_heap_itl_wal_delta_width_invariant) { - /* MG-D measures the per-record heap-ITL WAL overhead (every mutating heap - * record carries a fixed 8-byte block header + 40-byte v2 delta == 48 B, - * ndeltas == 1, not coalesced). Pin the struct widths so the 48B/record - * measurement basis cannot silently change underneath the decision record. */ - UT_ASSERT_EQ((int)sizeof(xl_heap_itl_delta_v2), 40); + /* MG-D decided GO: the always-Invalid write-time commit_scn (8B) is dropped + * from the write-path ITL delta (v3). Every mutating heap record now carries + * a fixed 8-byte block header + 32-byte v3 delta == 40 B (ndeltas == 1, not + * coalesced) -- down from the v2 48B baseline. Pin the v3 width so the + * post-decision 40B/record measurement basis cannot silently change. */ + UT_ASSERT_EQ((int)sizeof(xl_heap_itl_delta_v3), 32); UT_ASSERT_EQ((int)offsetof(xl_heap_itl_delta_block, deltas), 8); - /* 8-byte block header + 40-byte v2 delta == 48 B per mutating heap record. 
*/ - UT_ASSERT_EQ((int)(offsetof(xl_heap_itl_delta_block, deltas) + sizeof(xl_heap_itl_delta_v2)), - 48); + /* 8-byte block header + 32-byte v3 delta == 40 B per mutating heap record. */ + UT_ASSERT_EQ((int)(offsetof(xl_heap_itl_delta_block, deltas) + sizeof(xl_heap_itl_delta_v3)), + 40); + /* v2 (40B) is retained for backward WAL replay only -- pin it so the redo + * dispatch keeps a stable legacy width. */ + UT_ASSERT_EQ((int)sizeof(xl_heap_itl_delta_v2), 40); } From e4dfcb44fd9f6b61b137602db2c908744379682b Mon Sep 17 00:00:00 2001 From: SqlRush Date: Mon, 29 Jun 2026 21:34:41 +0800 Subject: [PATCH 3/3] =?UTF-8?q?test(cluster):=20spec-5.19=20MG-B=20?= =?UTF-8?q?=E2=80=94=20t/328=20M3=20two-node=20report-only=20diag=20(nativ?= =?UTF-8?q?e/two-node=20TPS=20+=20tax=20+=20unavailable=20reason)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Output-visibility hardening for the MG-B report-only leg (the M1 single-node gate already diag()s its median): - M3 now diag()s the native single-node median tps, the two-node peer-online node0 median tps, and the two-node write tax % — captured in the CI log even on PASS / non-verbose prove, so the report-only numbers are never a silent black box. - When the 2-node measurement is unavailable, M3 prints the SPECIFIC reason (ClusterPair boot failed + error / peers not connected+in_quorum within timeout / pgbench init failed / no valid rounds / native baseline missing) instead of a generic "unavailable". - Still strictly REPORT ONLY: M3 never asserts a threshold and never fails the single-node hard gate. No behavior change to the M1 single-node ≤10% hard gate. 
--- .../t/328_stage5_multinode_write_perf.pl | 72 ++++++++++++++----- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/src/test/cluster_tap/t/328_stage5_multinode_write_perf.pl b/src/test/cluster_tap/t/328_stage5_multinode_write_perf.pl index 143ad08d2f..4af8172fa9 100644 --- a/src/test/cluster_tap/t/328_stage5_multinode_write_perf.pl +++ b/src/test/cluster_tap/t/328_stage5_multinode_write_perf.pl @@ -229,6 +229,8 @@ sub poll_sql_eq my @two_node_tps; my $two_node_started = 0; my $two_node_ready = 0; +my $two_init_ok = 0; +my $two_err; eval { my @pair_perf_conf = map { my $line = $_; chomp $line; $line } @perf_conf; my $pair = PostgreSQL::Test::ClusterPair->new_pair( @@ -249,21 +251,25 @@ sub poll_sql_eq && poll_sql_eq($pair->node0, 'SELECT in_quorum FROM pg_cluster_quorum_state', 't', 20) && poll_sql_eq($pair->node1, 'SELECT in_quorum FROM pg_cluster_quorum_state', 't', 20); - if ($two_node_ready && pgbench_init($pair->node0)) + if ($two_node_ready) { - for my $r (1 .. $TWO_NODE_ROUNDS) + $two_init_ok = pgbench_init($pair->node0) ? 1 : 0; + if ($two_init_ok) { - my $t = pgbench_one($pair->node0); - next unless defined $t && $t > 0; - push @two_node_tps, $t; - note(sprintf(" two-node round %d: node0 tps=%.0f", $r, $t)); + for my $r (1 .. $TWO_NODE_ROUNDS) + { + my $t = pgbench_one($pair->node0); + next unless defined $t && $t > 0; + push @two_node_tps, $t; + note(sprintf(" two-node round %d: node0 tps=%.0f", $r, $t)); + } } } $pair->stop_pair; 1; } or do { - my $err = $@ || 'unknown error'; - diag("M3 two-node report-only measurement failed before completion: $err"); + $two_err = $@ || 'unknown error'; + diag("M3 two-node report-only measurement failed before completion: $two_err"); }; my $two_have = ($two_node_started && $two_node_ready && scalar(@two_node_tps) > 0 @@ -273,12 +279,42 @@ sub poll_sql_eq ? 100.0 * (1.0 - $two_tps / $tps_native) : undef; my $two_tax_s = defined $two_tax ? 
sprintf('%.2f', $two_tax) : 'n/a'; -note("MG-B two-node peer-online write-path REPORT-ONLY measurement:"); -note(" native single-node TPC-B median tps = " - . (defined $tps_native ? sprintf('%.0f', $tps_native) : 'n/a')); -note(" two-node peer-online node0 TPC-B median tps = " - . (defined $two_tps ? sprintf('%.0f', $two_tps) : 'n/a')); -note(" two-node write tax % (report-only) = $two_tax_s"); +my $native_s = defined $tps_native ? sprintf('%.0f', $tps_native) : 'n/a'; +my $two_tps_s = defined $two_tps ? sprintf('%.0f', $two_tps) : 'n/a'; + +# Specific unavailable reason (reported even on PASS so the report-only leg is +# never a silent black box). +my $two_reason; +if (!$two_node_started) +{ + $two_reason = "ClusterPair failed to boot/start" + . (defined $two_err ? ": $two_err" : ""); +} +elsif (!$two_node_ready) +{ + $two_reason = "peers did not reach connected + in_quorum within timeout"; +} +elsif (!$two_init_ok) +{ + $two_reason = "pgbench init on node0 failed"; +} +elsif (!scalar(@two_node_tps)) +{ + $two_reason = "pgbench produced no valid (>0 tps) rounds"; +} +elsif (!(defined $tps_native && $tps_native > 0)) +{ + $two_reason = "native single-node baseline tps missing"; +} + +# diag() reaches the captured CI log even on PASS / non-verbose prove, so the +# 2-node report-only numbers are always visible alongside the M1 single-node +# gate -- not just when run with -v. +diag("MG-B two-node peer-online write-path REPORT-ONLY measurement:"); +diag(" native single-node TPC-B median tps = $native_s"); +diag(" two-node peer-online node0 TPC-B median tps = $two_tps_s"); +diag(" two-node write tax % (report-only) = $two_tax_s" + . (defined $two_reason ? " (unavailable: $two_reason)" : "")); # REPORT ONLY: this leg must never fail the single-node hard gate. If the # 2-node ClusterPair could not boot / reach quorum / produce a number this run @@ -286,16 +322,16 @@ sub poll_sql_eq # note rather than failing -- the HARD gate is the single-node M1 tax only. 
if ($two_have) { - diag("MG-B two-node peer-online write tax (report-only) = ${two_tax_s}%"); ok(1, "M3 two-node peer-online single-writer write tax measured: ${two_tax_s}% " - . "(REPORT ONLY; no threshold asserted)"); + . "(native=$native_s two-node=$two_tps_s tps; REPORT ONLY; no threshold asserted)"); } else { ok(1, - "M3 two-node peer-online write tax unavailable this run " - . "(REPORT ONLY; never fails the single-node hard gate)"); + "M3 two-node peer-online write tax unavailable this run: " + . ($two_reason // 'unknown reason') + . " (REPORT ONLY; never fails the single-node hard gate)"); } $report->record_multinode_write_value(2, 'tpcb-peer-online-single-writer', tps_native => (defined $tps_native ? $tps_native : 0),