|
2 | 2 | #------------------------------------------------------------------------- |
3 | 3 | # |
4 | 4 | # 328_stage5_multinode_write_perf.pl |
5 | | -# spec-5.19 D3 — Stage 5 integrated acceptance: multi-node write-path |
6 | | -# performance measure-and-decide (MG-B, value + soundness gate). |
| 5 | +# spec-5.19 D3 — Stage 5 integrated acceptance: write-path performance |
| 6 | +# HARD GATE (MG-B). |
7 | 7 | # |
8 | | -# MG-B has two gates (L257): |
9 | | -# * VALUE gate — quantify the write tax / wait share / affinity. |
10 | | -# * SOUNDNESS gate — is the harness really running cross-node shared-heap |
11 | | -# write competition, or inject-partial? REAL vs measure-only. |
| 8 | +# Acceptance ruling (user, 2026-06-29, rule 8.B): the currently-measurable |
| 9 | +# native-vs-cluster write-path degradation is a **perf BLOCKER that spec- |
| 10 | +# 5.19 must close**, NOT a measure-only / forward item. The gate: |
12 | 11 | # |
13 | | -# spec-5.19 D0 soundness finding (decisive): at Stage 5 there is NO sound |
14 | | -# concurrent multi-node WRITE path to shared storage — same-relation |
15 | | -# contention needs cross-node holder migration (DRM = Stage 6) and even |
16 | | -# "partitioned" per-node relations collide on the shared OID/file space. |
17 | | -# Concurrent cross-node writes fail CLOSED (no corruption — proven in |
18 | | -# t/327 L3). Therefore the SOUNDNESS verdict for true multi-node write |
19 | | -# competition is MEASURE-ONLY / FORWARD (spec-5.57 cross-instance CR + |
20 | | -# Stage 6 6.4a Smart Fusion; spec-5.19 §3.2 / §10 forward-link 5). |
| 12 | +# single-node cluster write tax <= 10% (REQUIRED PASS) |
| 13 | +# > 10% -> FAIL / BLOCKED -> root-cause + |
| 14 | +# optimize inside 5.19 |
21 | 15 | # |
22 | | -# What IS soundly measurable at Stage 5 is the SINGLE-NODE cluster write |
23 | | -# tax — the per-node cost of the cluster MVCC/ITL/GES machinery vs the |
24 | | -# native (cluster.enabled = off) path on the same build (spec-3.25 |
25 | | -# single-node-no-peer C floor). This test measures that (report-only) |
26 | | -# and records the multi-node soundness verdict as measure-only/forward. |
| 16 | +# This is the spec-3.25 single-node-no-peer C-floor comparison: native |
| 17 | +# (cluster.enabled = off) vs cluster-enabled single node, same build, same |
| 18 | +# fsync/shared_buffers, pgbench TPC-B write workload, median-of-N for |
| 19 | +# stability, ratio-based so the runner's absolute speed cancels. |
27 | 20 | # |
28 | | -# Legs: |
29 | | -# M1 single-node write tax: pgbench TPC-B on a cluster-enabled node vs |
30 | | -# a native (cluster.enabled off) node; tax % is report-only (no |
31 | | -# numeric gate — spec §3.3 / docs/perf-gates.md class-1 model). |
32 | | -# M2 cluster write-path wait events present during the workload (the |
33 | | -# observability surface MG-B aggregates). |
34 | | -# SOUNDNESS multi-node concurrent write = measure-only/forward (the |
35 | | -# empirical basis is t/327 L3 fail-closed); no inject-partial number |
36 | | -# is passed off as a true multi-node perf result (rule 8.A/8.B). |
| 21 | +# SEPARATE capability limitation (does NOT cover or excuse the gate above): |
| 22 | +# true concurrent multi-node shared-block write competition is bounded by |
| 23 | +# cross-node holder migration (DRM = Stage 6; spec-5.57 cross-instance |
| 24 | +# CR). That limit is recorded as a capability forward — it MUST NOT be |
| 25 | +# used to reframe the single-node tax as measure-only (the user's explicit |
| 26 | +# correction). |
37 | 27 | # |
38 | 28 | # Author: SqlRush <sqlrush@gmail.com> |
39 | 29 | # |
|
55 | 45 | my $report = PostgreSQL::Test::Stage5IntegratedAcceptanceReport->new( |
56 | 46 | tag => $ENV{PGRAC_TAG} // 'unknown'); |
57 | 47 |
|
58 | | -my $PGBENCH = $ENV{PGBENCH} // 'pgbench'; |
59 | | -my $SCALE = 2; |
60 | | -my $SECS = $ENV{PGRAC_PGBENCH_SECS} // 4; |
61 | | -my $CLIENTS = 4; |
| 48 | +my $PGBENCH = $ENV{PGBENCH} // 'pgbench'; |
| 49 | +my $SCALE = 10; |
| 50 | +my $SECS = $ENV{PGRAC_PGBENCH_SECS} // 8; |
| 51 | +my $CLIENTS = 4; |
| 52 | +my $ROUNDS = $ENV{PGRAC_PGBENCH_ROUNDS} // 7; # interleaved rounds |
| 53 | +# The hard gate: cluster write tax must not exceed this percentage. |
| 54 | +my $GATE_PCT = $ENV{PGRAC_WRITE_TAX_GATE_PCT} // 10.0; |
62 | 55 |
|
63 | | -# Run pgbench TPC-B (default script) and return the reported tps (float) or |
64 | | -# undef on failure. |
65 | | -sub pgbench_tps |
| 56 | +# One pgbench TPC-B (-N) run against a node; returns tps (float) or undef. |
| 57 | +sub pgbench_one |
66 | 58 | { |
67 | 59 | my ($node) = @_; |
68 | 60 | my $conn = '-h ' . $node->host . ' -p ' . $node->port . ' postgres'; |
69 | | - system("$PGBENCH -i -s $SCALE -q $conn >/dev/null 2>&1"); |
70 | | - return undef if $? != 0; |
71 | | - my $out = `$PGBENCH -T $SECS -c $CLIENTS -j 2 -N $conn 2>&1`; |
72 | | - # -N: skip updates to pgbench_tellers/branches (less contention, still a |
73 | | - # write workload on pgbench_accounts + history) — a clean write-tax signal. |
74 | | - if ($out =~ /tps\s*=\s*([\d.]+)\s*\(without initial/) |
75 | | - { |
76 | | - return $1 + 0.0; |
77 | | - } |
78 | | - if ($out =~ /tps\s*=\s*([\d.]+)/) |
79 | | - { |
80 | | - return $1 + 0.0; |
81 | | - } |
| 61 | + my $out = `$PGBENCH -n -T $SECS -c $CLIENTS -j 2 -N $conn 2>&1`; |
| 62 | + return $1 + 0.0 if $out =~ /tps\s*=\s*([\d.]+)\s*\(without initial/; |
| 63 | + return $1 + 0.0 if $out =~ /tps\s*=\s*([\d.]+)/; |
82 | 64 | return undef; |
83 | 65 | } |
84 | 66 |
|
| 67 | +sub pgbench_init |
| 68 | +{ |
| 69 | + my ($node) = @_; |
| 70 | + my $conn = '-h ' . $node->host . ' -p ' . $node->port . ' postgres'; |
| 71 | + system("$PGBENCH -i -s $SCALE -q $conn >/dev/null 2>&1"); |
| 72 | + return $? == 0; |
| 73 | +} |
| 74 | + |
| 75 | +sub median |
| 76 | +{ |
| 77 | + my @s = sort { $a <=> $b } @_; |
| 78 | + return undef unless @s; |
| 79 | + return $s[int((@s) / 2)]; |
| 80 | +} |
| 81 | + |
| 82 | +# Common perf-isolation knobs (CPU-overhead measurement: fsync off removes |
| 83 | +# disk variance so the gate measures the cluster machinery's added work). |
| 84 | +my @perf_conf = ( |
| 85 | + "autovacuum = off\n", |
| 86 | + "fsync = off\n", |
| 87 | + "shared_buffers = 64MB\n", |
| 88 | + "max_wal_size = 4GB\n", |
| 89 | +); |
| 90 | + |
85 | 91 | # --------------------------------------------------------------------- |
86 | | -# Native baseline node (cluster.enabled off — the lower-bound C floor). |
| 92 | +# Boot BOTH nodes simultaneously so the interleaved rounds measure native |
| 93 | +# and cluster under the same momentary system load — the ratio (tax) then |
| 94 | +# cancels runner speed AND transient load. The MEDIAN of N rounds rejects |
| 95 | +# the occasional load-skewed outlier (essential on a shared runner). |
87 | 96 | # --------------------------------------------------------------------- |
88 | 97 | my $native = PostgreSQL::Test::Cluster->new('mnw_native'); |
89 | 98 | $native->init; |
90 | | -$native->append_conf('postgresql.conf', "autovacuum = off\n"); |
91 | | -$native->append_conf('postgresql.conf', "fsync = off\n"); # perf measure only |
| 99 | +$native->append_conf('postgresql.conf', $_) for @perf_conf; |
92 | 100 | $native->start; |
93 | | -my $tps_native = pgbench_tps($native); |
94 | | -$native->stop; |
95 | 101 |
|
96 | | -# --------------------------------------------------------------------- |
97 | | -# Cluster-enabled single node (1-node cluster — the spec-3.25 C floor). |
98 | | -# --------------------------------------------------------------------- |
99 | 102 | my $ic_port = PostgreSQL::Test::Cluster::get_free_port(); |
100 | 103 | my $clu = PostgreSQL::Test::Cluster->new('mnw_cluster'); |
101 | 104 | $clu->init; |
102 | 105 | $clu->append_conf('postgresql.conf', "cluster.enabled = on\n"); |
103 | 106 | $clu->append_conf('postgresql.conf', "cluster.interconnect_tier = tier1\n"); |
104 | 107 | $clu->append_conf('postgresql.conf', "cluster.allow_single_node = on\n"); |
105 | 108 | $clu->append_conf('postgresql.conf', "cluster.node_id = 0\n"); |
106 | | -$clu->append_conf('postgresql.conf', "autovacuum = off\n"); |
107 | | -$clu->append_conf('postgresql.conf', "fsync = off\n"); |
| 109 | +$clu->append_conf('postgresql.conf', $_) for @perf_conf; |
108 | 110 | PostgreSQL::Test::Utils::append_to_file( |
109 | 111 | $clu->data_dir . '/pgrac.conf', |
110 | 112 | "[cluster]\nname = mnw_cluster\n\n[node.0]\ninterconnect_addr = 127.0.0.1:$ic_port\n\n"); |
111 | 113 | $clu->start; |
112 | | -my $tps_cluster = pgbench_tps($clu); |
113 | 114 |
|
114 | | -# M2: cluster write-path wait-event observability surface present. |
| 115 | +my $init_ok = pgbench_init($native) && pgbench_init($clu); |
| 116 | + |
| 117 | +my (@nat, @clu, @taxes); |
| 118 | +if ($init_ok) |
| 119 | +{ |
| 120 | + for my $r (1 .. $ROUNDS) |
| 121 | + { |
| 122 | + my $n = pgbench_one($native); |
| 123 | + my $c = pgbench_one($clu); |
| 124 | + next unless defined $n && $n > 0 && defined $c && $c > 0; |
| 125 | + push @nat, $n; |
| 126 | + push @clu, $c; |
| 127 | + push @taxes, 100.0 * (1.0 - $c / $n); |
| 128 | + note(sprintf(" round %d: native=%.0f cluster=%.0f tax=%.2f%%", |
| 129 | + $r, $n, $c, $taxes[-1])); |
| 130 | + } |
| 131 | +} |
115 | 132 | my $wait_events_present = $clu->safe_psql('postgres', |
116 | 133 | "SELECT count(*) FROM pg_stat_cluster_wait_events") // 0; |
117 | | - |
| 134 | +$native->stop; |
118 | 135 | $clu->stop; |
119 | 136 |
|
120 | 137 | # --------------------------------------------------------------------- |
121 | | -# M1: single-node write tax (report-only). |
| 138 | +# M1: single-node write tax — HARD GATE (median of interleaved rounds). |
122 | 139 | # --------------------------------------------------------------------- |
123 | | -my $have_both = (defined $tps_native && $tps_native > 0 |
124 | | - && defined $tps_cluster && $tps_cluster > 0); |
125 | | -my $tax_pct = $have_both |
126 | | - ? sprintf('%.2f', 100.0 * (1.0 - $tps_cluster / $tps_native)) : 'n/a'; |
127 | | - |
128 | | -note("MG-B single-node write tax (report-only):"); |
129 | | -note(" native TPC-B tps = " . (defined $tps_native ? $tps_native : 'n/a')); |
130 | | -note(" cluster TPC-B tps = " . (defined $tps_cluster ? $tps_cluster : 'n/a')); |
131 | | -note(" write tax % = $tax_pct (cluster vs native, single node)"); |
132 | | -note(" cluster wait-event rows = $wait_events_present"); |
| 140 | +my $have_both = (scalar(@taxes) > 0); |
| 141 | +my $tax = $have_both ? median(@taxes) : undef; |
| 142 | +my $tax_s = defined $tax ? sprintf('%.2f', $tax) : 'n/a'; |
| 143 | +my $tps_native = $have_both ? median(@nat) : undef; |
| 144 | +my $tps_cluster = $have_both ? median(@clu) : undef; |
| 145 | + |
| 146 | +note("MG-B single-node write-path HARD GATE (median of " |
| 147 | + . scalar(@taxes) . " interleaved rounds):"); |
| 148 | +note(" native TPC-B median tps = " . (defined $tps_native ? sprintf('%.0f', $tps_native) : 'n/a')); |
| 149 | +note(" cluster TPC-B median tps = " . (defined $tps_cluster ? sprintf('%.0f', $tps_cluster) : 'n/a')); |
| 150 | +note(" write tax % (median) = $tax_s (gate: <= $GATE_PCT%)"); |
| 151 | + |
| 152 | +ok($have_both, |
| 153 | + "M0 native + cluster single-node throughput measured over " |
| 154 | + . scalar(@taxes) . " interleaved rounds"); |
| 155 | + |
| 156 | +# THE HARD GATE (rule 8.B): tax <= $GATE_PCT (default 10%); above that is a perf blocker. |
| 157 | +my $gate_pass = (defined $tax && $tax <= $GATE_PCT); |
| 158 | +ok($gate_pass, |
| 159 | + "M1 single-node cluster write tax ${tax_s}% <= ${GATE_PCT}% (HARD GATE; " |
| 160 | + . "tax > ${GATE_PCT}% is a perf blocker that must be root-caused + " |
| 161 | + . "optimized inside spec-5.19 — rule 8.B, never deferred)"); |
| 162 | +unless ($gate_pass) |
| 163 | +{ |
| 164 | + diag("MG-B PERF BLOCKER: single-node cluster write tax ${tax_s}% exceeds " |
| 165 | + . "the ${GATE_PCT}% gate. spec-5.19 is BLOCKED on this until the " |
| 166 | + . "cluster write hot-path (ITL stamp + undo record + UBA + per-record " |
| 167 | + . "heap-ITL WAL delta) is optimized below the gate."); |
| 168 | +} |
133 | 169 |
|
134 | 170 | $report->record_multinode_write_value(1, 'tpcb', |
135 | 171 | tps_native => (defined $tps_native ? $tps_native : 0), |
136 | 172 | tps_cluster => (defined $tps_cluster ? $tps_cluster : 0), |
137 | | - write_tax_pct => $tax_pct, |
| 173 | + write_tax_pct => $tax_s, |
| 174 | + gate_pct => $GATE_PCT, |
| 175 | + gate => $gate_pass ? 'PASS' : 'FAIL-BLOCKER', |
138 | 176 | wait_event_rows => $wait_events_present); |
139 | 177 |
|
140 | 178 | # --------------------------------------------------------------------- |
141 | | -# SOUNDNESS gate (the decisive MG-B verdict). |
| 179 | +# M2: cluster write-path wait-event observability surface present. |
142 | 180 | # --------------------------------------------------------------------- |
143 | | -$report->set_multinode_write_soundness('measure-only', |
144 | | - real => 0, |
145 | | - note => 'no sound concurrent multi-node write at Stage 5 (cross-node holder ' |
146 | | - . 'migration / DRM = Stage 6; concurrent cross-node writes fail closed, ' |
147 | | - . 't/327 L3). True multi-node write competition is forward (spec-5.57 ' |
148 | | - . 'cross-instance CR + Stage 6 6.4a Smart Fusion). Single-node cluster ' |
149 | | - . 'write tax is the soundly-measurable Stage-5 metric.'); |
150 | | -$report->record_limitation('multi-node write-path perf (true cross-node)', |
151 | | - kind => 'perf-forward', forward => '5.57 / 6.4a', |
152 | | - note => 'MG-B soundness = measure-only; single-node cluster write tax ' |
153 | | - . 'measured (report-only); true concurrent multi-node write perf needs ' |
154 | | - . 'DRM (Stage 6)'); |
| 181 | +ok($wait_events_present > 0, |
| 182 | + "M2 cluster write-path wait-event surface present ($wait_events_present rows)"); |
155 | 183 |
|
156 | 184 | # --------------------------------------------------------------------- |
157 | | -# Assertions (8.A safe-direction): measurement well-formed; soundness |
158 | | -# honestly recorded. No numeric perf gate (report-only, spec §3.3). |
| 185 | +# SOUNDNESS — the single-node tax above is REAL + gated. The TRUE concurrent |
| 186 | +# multi-node shared-block write limit is a SEPARATE capability limitation that |
| 187 | +# does NOT cover the single-node gate (the user's explicit correction). |
159 | 188 | # --------------------------------------------------------------------- |
160 | | -ok(defined $tps_native && $tps_native > 0, |
161 | | - "M1 native single-node write throughput measured (tps=" . |
162 | | - (defined $tps_native ? $tps_native : 'n/a') . ")"); |
163 | | -ok(defined $tps_cluster && $tps_cluster > 0, |
164 | | - "M1 cluster single-node write throughput measured (tps=" . |
165 | | - (defined $tps_cluster ? $tps_cluster : 'n/a') . ")"); |
166 | | -ok($wait_events_present > 0, |
167 | | - "M2 cluster write-path wait-event surface present ($wait_events_present rows)"); |
168 | | -ok($report->{matrix}{multinode_write_perf}{soundness}{verdict} eq 'measure-only', |
169 | | - "MG-B soundness verdict = measure-only/forward (true multi-node write = " |
170 | | - . "Stage 6 DRM; no inject-partial number passed off as real)"); |
| 189 | +$report->set_multinode_write_soundness('single-node-gated', |
| 190 | + real => 1, |
| 191 | + note => 'single-node write tax is REAL + HARD-GATED at <= ' . $GATE_PCT |
| 192 | + . '%. SEPARATE capability limitation: true concurrent cross-node ' |
| 193 | + . 'shared-block write competition is DRM-bounded (Stage 6 / spec-5.57) ' |
| 194 | + . '-- a capability forward, NOT a cover for the single-node tax gate.'); |
| 195 | +$report->record_limitation('true concurrent multi-node shared-block write', |
| 196 | + kind => 'capability-forward', forward => '5.57 / 6.4a-DRM', |
| 197 | + note => 'capability limit only; does NOT excuse the single-node write-tax ' |
| 198 | + . 'gate (rule 8.B). Concurrent cross-node writes fail-closed (no ' |
| 199 | + . 'corruption, t/327 L3).'); |
171 | 200 |
|
172 | 201 | my $out_path = $ENV{PGRAC_ACCEPTANCE_JSON} |
173 | 202 | // $report->default_path($ENV{TESTDATADIR} || "tmp_check"); |
174 | | -eval { $report->emit_json($out_path); Test::More::note("MG-B value+soundness record: $out_path"); }; |
| 203 | +eval { $report->emit_json($out_path); Test::More::note("MG-B gate record: $out_path"); }; |
175 | 204 |
|
176 | 205 | done_testing(); |
0 commit comments