#!/usr/bin/env perl
#-------------------------------------------------------------------------
#
# 327_stage5_hw_extend_workload.pl
#    spec-5.19 D4 — Stage 5 integrated acceptance: HW/relation-extend
#    multi-node workload (Hard gate #4, N-14 workload validation of the
#    spec-5.7 HW authority that shipped in v0.101.0).
#
# spec-5.7 proved the HW (relation-extend) block-number authority on a
# 2-node single-extend path (t/292).  This test validates it under a
# multi-node workload, HONESTLY bounded by the Stage 5 cross-node write
# model (spec-5.19 D0 soundness finding):
#
#   At Stage 5 there is NO sound concurrent multi-node WRITE path to
#   shared storage — neither same-relation (cross-node holder migration /
#   DRM is Stage 6) nor partitioned (the per-node catalog + shared-file
#   space is not independently partitionable).  Concurrent cross-node
#   writes fail CLOSED (cluster_gcs_block "master does not hold tag" /
#   cluster_shared_fs "could not open file"), never corrupt.  The
#   cross-node positive data service is forward (spec-5.57 + Stage 6
#   6.4a Smart Fusion).
#
# So HG#4 is validated on the SOUND Stage-5 surface: single-node HW
# extend correctness, the supported 2-node sequential one-hop handoff
# (t/292 L6), concurrent-contention fail-closed SAFETY, and crash/restart
# HWM rebuild — with the concurrent cross-node write boundary registered
# as a forward limitation (rule 8.A/8.B: honest, not faked).
#
# Legs (real multi-node, shared storage, cluster_fs):
#   L1  single-node HW multi-block extend correctness: node0 bulk-inserts
#       across many blocks in two batches; count == rows AND distinct
#       ctid == count (HW authority hands out disjoint blocks, no lost/dup).
#   L2  2-node sequential cross-node disjoint extend (t/292 L6 handoff):
#       node0 fills + checkpoints, node1 extends PAST node0's blocks;
#       total == sum AND distinct ctid == total (the supported one-hop
#       cross-node disjoint guarantee).  The CF X-transfer ship-timeout is
#       transient -> bounded retry, then SKIP-with-reason (not an HW fault).
#   L3  concurrent same-relation write fail-closed SAFETY (8.A): three
#       nodes write the SAME relation at once; the relation must stay
#       consistent (no overlapping ctid).  A writer that cannot proceed
#       fails CLOSED (the cross-node boundary is enforced, not bypassed);
#       the fail-closed / ok split is recorded in the report.  The
#       DRM/Stage-6 boundary is registered as a limitation.
#   L4  extend-during-crash/restart HWM rebuild: node0 extends a batch and
#       restarts; committed rows survive the durable HW snapshot recovery
#       and a subsequent extend lands PAST the rebuilt HWM with no
#       lost/dup blocks.
#   L5  authority fail-closed contract (53RA6): the encodable SQLSTATE is
#       pinned in test_cluster_stage5_integrated_acceptance.c L4 and the
#       allocate-path fail-closed is unit-proven (test_cluster_hw /
#       test_cluster_extend_gate); deterministic e2e injection of an
#       unreachable authority needs a fault hook not present this stage —
#       registered as a limitation, not faked.
#
# Harness: ClusterQuad shared_data + 3 voting disks + autovacuum off +
# cluster.relation_extend_lock_enabled = on (default).  Per spec-5.19 D0
# the single-machine 4-node ClusterQuad is stable (12/12 spike).
#
# Author: SqlRush <sqlrush@gmail.com>
#
# Portions Copyright (c) 2026, pgrac contributors
#
# IDENTIFICATION
#    src/test/cluster_tap/t/327_stage5_hw_extend_workload.pl
#
#-------------------------------------------------------------------------

use strict;
use warnings;

use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use PostgreSQL::Test::ClusterQuad;
use PostgreSQL::Test::Stage5IntegratedAcceptanceReport;
use Test::More;
use IPC::Run qw(start finish);

# Error texts that the retry loops in this file treat as retryable: a psql
# failure whose stderr matches is retried after a pause, any other failure
# ends the loop immediately.
my $CF_RETRY = qr/could not obtain X transfer|did not ship a current image|ship timeout|master does not hold tag|could not open/;

# Acceptance-report collector shared by every leg; it is tagged with
# $ENV{PGRAC_TAG} when that is set and with 'unknown' otherwise.
my $report = PostgreSQL::Test::Stage5IntegratedAcceptanceReport->new(
	tag => defined $ENV{PGRAC_TAG} ? $ENV{PGRAC_TAG} : 'unknown');

# Fire one SQL statement per job at the same time, then harvest them all.
#
# Each job is an array ref [ $node, $sql ]; $node must provide connstr().
# Every psql child is started before any of them is waited on, so the
# statements overlap in time.
#
# Returns an array ref holding one hash per job, in job order:
#   ok  => 1 if psql exited 0, otherwise 0
#   out => captured stdout
#   err => captured stderr, followed by the harness exception text when
#          finish() threw (for example when the per-child timeout fired)
#
# Dies if a psql child cannot be started; children started earlier in the
# same call are killed first so they are not left running.
sub concurrent_psql
{
	my (@jobs) = @_;

	# Same bound as the sequential psql calls in this file, so a writer that
	# blocks forever cannot hang the whole test.
	my $timeout_s = 90;

	my @running;
	for my $job (@jobs)
	{
		my ($node, $sql) = @$job;
		my %slot = (out => '', err => '');
		my @argv = ('psql', '-X', '-q', '-v', 'ON_ERROR_STOP=1',
			'-d', $node->connstr('postgres'), '-c', $sql);

		$slot{h} = eval {
			start(\@argv, '<', \undef, '>', \$slot{out}, '2>', \$slot{err},
				IPC::Run::timeout($timeout_s));
		};
		if (!$slot{h})
		{
			# Save the reason before the cleanup evals below clobber $@.
			my $why = $@ || 'start() returned no harness';
			eval { $_->{h}->kill_kill } for @running;
			die "concurrent_psql: could not start psql: $why";
		}
		push @running, \%slot;
	}

	my @results;
	for my $slot (@running)
	{
		my $exit_ok;
		my $finished = eval { $exit_ok = finish($slot->{h}); 1; };
		if (!$finished)
		{
			# finish() threw: keep the reason and reap the child.
			my $why = $@ || 'unknown harness error';
			eval { $slot->{h}->kill_kill };
			$slot->{err} .= "concurrent_psql: $why";
		}
		push @results,
		  {
			ok => ($finished && $exit_ok ? 1 : 0),
			out => $slot->{out},
			err => $slot->{err},
		  };
	}
	return \@results;
}

# Scalar read with bounded retry to absorb the transient CF X-transfer ship-
# timeout (not an HW fault — t/292 L6).
#
#   $node  - node object providing psql()
#   $sql   - query to run against the 'postgres' database
#   $tries - maximum number of attempts (default 12)
#
# Returns psql's stdout from the first successful attempt.  Returns undef
# (an empty list in list context) when the client-side 90 s timeout fires,
# when a failure does not match $CF_RETRY, or when every attempt failed.
sub read_scalar_retry
{
	my ($node, $sql, $tries) = @_;
	$tries //= 12;

	for my $attempt (1 .. $tries)
	{
		# Without timed_out, psql() raises an exception on timeout and the
		# whole test script would abort instead of reporting a failed read.
		my $timed_out;
		my ($rc, $out, $err) = $node->psql('postgres', $sql,
			timeout => 90, timed_out => \$timed_out);
		return $out if !$timed_out && defined $rc && $rc == 0;

		# A client-side timeout is not one of the retryable error texts.
		last if $timed_out;
		last unless ($err // '') =~ $CF_RETRY;

		# No point pausing after the final attempt.
		last if $attempt == $tries;
		select(undef, undef, undef, 1.0);
	}
	return;
}

# ---------------------------------------------------------------------
# S0: strict 4-node cluster (shared data + 3 voting disks).
# ---------------------------------------------------------------------
my @quad_conf = (
	'autovacuum = off',
	'cluster.relation_extend_lock_enabled = on',
	'cluster.quorum_poll_interval_ms = 500',
	'cluster.cssd_heartbeat_interval_ms = 2000',
	'cluster.cssd_dead_deadband_factor = 10',
);
my $quad = PostgreSQL::Test::ClusterQuad->new_quad(
	'hw_extend',
	quorum_voting_disks => 3,
	shared_data => 1,
	extra_conf => \@quad_conf);

$quad->start_quad;

# Fixed settle pause before polling peer state.
select(undef, undef, undef, 3.0);

# node0 must see each of the other three nodes connected (the call is given
# a limit of 30); otherwise the whole run is abandoned.
foreach my $peer (1, 2, 3)
{
	next if $quad->wait_for_peer_state(0, $peer, 'connected', 30);
	BAIL_OUT("node0 never saw node$peer connected");
}

my ($n0, $n1) = ($quad->node0, $quad->node1);

# Cross-node SHARED relations must be created on ALL nodes FIRST, before any
# node-specific DDL, so the OID counters are still aligned -> the same OID ->
# the same shared relfilenode (t/292 discipline).  hw2 (L2 sequential handoff)
# and hwc (L3 concurrent contention) are the shared relations; the node0-only
# single-node tables (hw1, hwr) are created afterwards and advance node0's OID
# counter past the others without colliding.
my @shared_ddl = (
	'CREATE TABLE hw2 (node int, id int)',
	'CREATE TABLE hwc (node int, id int)',
);
for my $idx (0 .. 3)
{
	# Same order on every node: hw2 first, then hwc.
	for my $ddl (@shared_ddl)
	{
		$quad->node($idx)->safe_psql('postgres', $ddl);
	}
}

# ---------------------------------------------------------------------
# L1: single-node HW multi-block extend correctness (deterministic).
# ---------------------------------------------------------------------
$n0->safe_psql('postgres', 'CREATE TABLE hw1 (id int, pad int)');

# Two separate bulk inserts on node0.
for my $batch_sql (
	'INSERT INTO hw1 SELECT g, g FROM generate_series(1,5000) g',
	'INSERT INTO hw1 SELECT g, g FROM generate_series(5001,9000) g')
{
	$n0->safe_psql('postgres', $batch_sql);
}

my $hw1_rows = $n0->safe_psql('postgres', 'SELECT count(*) FROM hw1');
my $hw1_ctids =
  $n0->safe_psql('postgres', 'SELECT count(DISTINCT ctid) FROM hw1');

# Both the row count and the number of distinct ctids must be exactly 9000.
my $l1_ok = 0;
$l1_ok = 1 if $hw1_rows eq '9000' && $hw1_ctids eq '9000';

diag("L1 count=$hw1_rows distinct_ctid=$hw1_ctids")
  unless ok($l1_ok,
	"L1 single-node HW multi-block extend: 9000 rows, distinct ctid == count "
	  . "(no lost/dup blocks)");

$report->record_hw_extend(
	'HG#4a', 'single-node multi-block extend correctness',
	status => ($l1_ok ? 'PASS' : 'FAIL'),
	required => 1);

# ---------------------------------------------------------------------
# L2: 2-node sequential cross-node disjoint extend (t/292 L6 handoff).
# hw2 is the aligned-OID shared relation (created on all nodes above).
# ---------------------------------------------------------------------
$n0->safe_psql('postgres',
	'INSERT INTO hw2 SELECT 0, g FROM generate_series(1,5000) g');
$n0->safe_psql('postgres', 'CHECKPOINT');
select(undef, undef, undef, 1.5);

my $hop_title = '2-node sequential cross-node disjoint extend';
my $hop_ok = 0;
my $hop_err = '';

# Set from the LAST failed attempt: true only when that failure matched
# $CF_RETRY.  Only such a failure may downgrade this leg to SKIP; anything
# else (unknown error, client timeout) has to surface as a FAIL.
my $hop_transient = 0;
for my $attempt (1 .. 10)
{
	# timed_out makes psql() return instead of raising on a client timeout.
	my $timed_out;
	my ($rc, $out, $err) = $n1->psql('postgres',
		'INSERT INTO hw2 SELECT 1, g FROM generate_series(5001,7000) g',
		timeout => 90, timed_out => \$timed_out);
	if (!$timed_out && defined $rc && $rc == 0) { $hop_ok = 1; last; }

	$hop_err = $timed_out ? 'psql timed out after 90s' : ($err // '');
	$hop_transient = (!$timed_out && $hop_err =~ $CF_RETRY) ? 1 : 0;
	last unless $hop_transient;
	select(undef, undef, undef, 1.0);
}

if ($hop_ok)
{
	$n1->safe_psql('postgres', 'CHECKPOINT');
	my $total = read_scalar_retry($n0, 'SELECT count(*) FROM hw2');

	# Only ask for the distinct-ctid count once the row count is right.
	my $dctid = (defined $total && $total == 7000)
	  ? read_scalar_retry($n0, 'SELECT count(DISTINCT ctid) FROM hw2')
	  : undef;
	my $l2_ok = (defined $total && $total == 7000
		  && defined $dctid && $dctid == 7000);
	ok($l2_ok,
		"L2 2-node sequential cross-node extend is disjoint: total == 7000 "
		  . "AND distinct ctid == total (node1 landed past node0)")
	  or diag("L2 total=" . (defined $total ? $total : '(timeout)')
		  . " distinct_ctid=" . (defined $dctid ? $dctid : '(n/a)'));
	$report->record_hw_extend('HG#4b', $hop_title,
		status => $l2_ok ? 'PASS' : 'FAIL', required => 1);
}
elsif ($hop_transient)
{
	# All ten attempts hit a retryable error: skip with a reason.
  SKIP:
	{
		skip "L2 node1 cross-node extend hit the transient CF X-transfer timeout "
		  . "(not an HW fault)", 1;
	}
	$report->record_hw_extend('HG#4b', $hop_title,
		status => 'SKIP', required => 0,
		note => "transient CF X-transfer timeout: $hop_err");
}
else
{
	# The hop failed for a reason that is NOT a recognised transient; do not
	# hide it behind a SKIP.
	fail("L2 node1 cross-node extend failed with a non-transient error");
	diag("L2 error: $hop_err");
	$report->record_hw_extend('HG#4b', $hop_title,
		status => 'FAIL', required => 1,
		note => "non-transient error: $hop_err");
}

# ---------------------------------------------------------------------
# L3: concurrent same-relation write fail-closed SAFETY (DRM/Stage-6).
# hwc is the aligned-OID shared relation (created on all nodes above).
# ---------------------------------------------------------------------
$n0->safe_psql('postgres',
	'INSERT INTO hwc SELECT 0, g FROM generate_series(1,1000) g');
$n0->safe_psql('postgres', 'CHECKPOINT');
select(undef, undef, undef, 1.0);

# Writers are listed in node order, so result index N belongs to nodeN.
my $cres = concurrent_psql(
	[ $quad->node0, 'INSERT INTO hwc SELECT 0, g FROM generate_series(1001,3000) g' ],
	[ $quad->node1, 'INSERT INTO hwc SELECT 1, g FROM generate_series(1,2000) g' ],
	[ $quad->node2, 'INSERT INTO hwc SELECT 2, g FROM generate_series(1,2000) g' ]);
my $n_failed_closed = grep { !$_->{ok} } @$cres;
my $n_ok = grep { $_->{ok} } @$cres;

# Show why each failed writer stopped.  A reason matching $CF_RETRY goes to
# the TAP notes; anything else is raised as a diag so it is not overlooked.
# This is informational only and does not change the verdict.
for my $w (0 .. $#$cres)
{
	next if $cres->[$w]{ok};
	my $why = $cres->[$w]{err} // '';
	if ($why =~ $CF_RETRY)
	{
		Test::More::note("L3 writer node$w failed closed: $why");
	}
	else
	{
		diag("L3 writer node$w failed with an unrecognised error: $why");
	}
}

my $consistent = read_scalar_retry($n0,
	'SELECT count(DISTINCT ctid) = count(*) FROM hwc');

# Safe-direction: the relation must stay consistent (no overlapping ctid).
# The verdict used to be and-ed with "a writer failed closed OR all writers
# succeeded", which holds for every possible outcome split, so the gate is
# the consistency check alone; the split is still recorded in the report.
my $l3_safe = (defined $consistent && $consistent eq 't') ? 1 : 0;
ok($l3_safe,
	"L3 concurrent same-relation write is safe: $n_failed_closed fail-closed / "
	  . "$n_ok ok, relation stays consistent (no overlapping ctid) — cross-node "
	  . "holder migration / DRM is Stage 6")
  or diag("L3 consistent=" . (defined $consistent ? $consistent : '(timeout)'));
$report->record_hw_extend('HG#4c', 'concurrent same-relation write fail-closed safety',
	status => $l3_safe ? 'PASS' : 'FAIL', required => 1,
	fail_closed => $n_failed_closed, ok => $n_ok);
$report->record_limitation('concurrent cross-node multi-node write',
	kind => 'correctness-forward', forward => '5.57 / 6.x-DRM',
	note => 'no sound concurrent cross-node write at Stage 5 (DRM = Stage 6); '
	  . 'same-block contention fails closed (no corruption)');

# ---------------------------------------------------------------------
# L4: extend-during-restart HWM rebuild (durable HW snapshot recovery).
# Clean restart (t/325 pattern) — proven to work within a live cluster.
# WAL-tail crash recovery of the HWM is unit-covered (test_cluster_hw_snapshot).
# ---------------------------------------------------------------------
$n0->safe_psql('postgres', 'CREATE TABLE hwr (id int, pad int)');
$n0->safe_psql('postgres',
	'INSERT INTO hwr SELECT g, g FROM generate_series(1,4000) g');
$n0->safe_psql('postgres', 'CHECKPOINT');    # durable HW snapshot written
my $pre = $n0->safe_psql('postgres', 'SELECT count(*) FROM hwr');

# A failed restart must not abort the script; keep the reason for the
# failure diagnostics below.
my $restarted = eval { $n0->restart; 1; };
my $restart_err = $restarted ? '' : ($@ || 'unknown error');
select(undef, undef, undef, 2.0);

# The committed rows must read back unchanged after the restart.
my $post = $restarted ? read_scalar_retry($n0, 'SELECT count(*) FROM hwr') : undef;
my $recovered = ($restarted && defined $post && $post eq $pre);

my $ext_ok = 0;
my $ext_err = '';
if ($recovered)
{
	for my $attempt (1 .. 10)
	{
		# timed_out makes psql() return instead of raising on a client timeout.
		my $timed_out;
		my ($rc, $out, $err) = $n0->psql('postgres',
			'INSERT INTO hwr SELECT g, g FROM generate_series(4001,6000) g',
			timeout => 90, timed_out => \$timed_out);
		if (!$timed_out && defined $rc && $rc == 0) { $ext_ok = 1; last; }

		$ext_err = $timed_out ? 'psql timed out after 90s' : ($err // '');
		last if $timed_out;
		last unless $ext_err =~ $CF_RETRY;
		select(undef, undef, undef, 1.0);
	}
}
my $final = $ext_ok ? read_scalar_retry($n0, 'SELECT count(*) FROM hwr') : undef;
my $dfinal = $ext_ok ? read_scalar_retry($n0, 'SELECT count(DISTINCT ctid) FROM hwr') : undef;
my $l4_ok = ($recovered && $ext_ok
	  && defined $final && $final == 6000
	  && defined $dfinal && $dfinal == 6000);
ok($l4_ok,
	"L4 extend-during-restart HWM rebuild: committed rows survive (pre=$pre "
	  . "post=" . (defined $post ? $post : '(unrecovered)') . ") and the post-"
	  . "restart extend lands past the rebuilt HWM (count==6000, distinct ctid==count)")
  or diag("L4 restarted=" . ($restarted ? 1 : 0)
	  . " ext_ok=$ext_ok"
	  . " final=" . (defined $final ? $final : '(n/a)')
	  . " distinct_ctid=" . (defined $dfinal ? $dfinal : '(n/a)')
	  . ($restart_err ne '' ? " restart_err=$restart_err" : '')
	  . ($ext_err ne '' ? " ext_err=$ext_err" : ''));
$report->record_hw_extend('HG#4d', 'extend-during-restart HWM rebuild',
	status => $l4_ok ? 'PASS' : 'FAIL', required => 1);

# ---------------------------------------------------------------------
# L5: authority fail-closed contract (53RA6).
# ---------------------------------------------------------------------
# Nothing is executed against the cluster in this leg: it records a
# non-required SKIP entry and a limitation, then logs an unconditional pass.
my @hg4e_opts = (
	status => 'SKIP',
	required => 0,
	note => 'encodable contract pinned (D8 L4) + unit-proven (test_cluster_hw); '
	  . 'e2e unreachable-authority injection forward (no fault hook this stage)',
);
$report->record_hw_extend('HG#4e', 'authority-unreachable fail-closed 53RA6',
	@hg4e_opts);

my @hg4e_limit = (
	kind => 'substrate',
	forward => '5.19-followup-or-6.0a',
	note => '53RA6 fail-closed unit-proven; deterministic e2e needs an '
	  . 'authority-unreachable fault hook',
);
$report->record_limitation('HW authority-unreachable e2e injection',
	@hg4e_limit);

pass('L5 53RA6 fail-closed contract registered (unit-proven + encodable, '
	  . 'e2e injection limitation logged)');

# ---------------------------------------------------------------------
# Emit the acceptance report fragment.
# ---------------------------------------------------------------------
# Destination: $ENV{PGRAC_ACCEPTANCE_JSON} when set, otherwise the report's
# default path under $ENV{TESTDATADIR} (falling back to "tmp_check").
my $out_path = $ENV{PGRAC_ACCEPTANCE_JSON}
  // $report->default_path($ENV{TESTDATADIR} || "tmp_check");

# Writing the fragment stays best-effort (it never fails the test), but a
# failure is now reported instead of being silently swallowed.
if (eval { $report->emit_json($out_path); 1; })
{
	Test::More::note("acceptance report: $out_path");
}
else
{
	diag("could not write acceptance report to $out_path: "
		  . ($@ || 'unknown error'));
}

$quad->stop_quad;

done_testing();
# (page-scrape residue, not part of the script: "0 commit comments")