Skip to content

Commit fa961fd

Browse files
committed
fix(operator): targeted reindex after pg_resetwal via amcheck
pg_resetwal bypasses WAL replay — it tells postgres to act as if the data dir is consistent without replaying anything. Any index update in flight at snapshot time can leave torn pages, which postgres later surfaces as 'unexpected zero page at block N' when a query happens to hit the corrupted index. Downstream users reported exactly this against a primary key, with the hint 'Please REINDEX it.' A blind REINDEX DATABASE takes hours on the prod-sized DBs (known from prior experience). Instead, use postgres's amcheck contrib extension to do a structural scan: read each valid btree index, verify its invariants, queue only the failing ones for REINDEX. Plus a blind REINDEX of every non-btree index (amcheck only covers btree, non-btree are usually a tiny fraction of total index size). Reuses the existing post-restore reindex infrastructure (the /pgdata/needs-reindex marker, the background hook in the postgres container startup, the readiness probe gate). Adds a second marker /pgdata/needs-reindex-all set by every pg_resetwal -f invocation in the operator's two-stage fallback path. The runtime hook handles both flags, with the -all branch supplanting the narrow locale-only branch when both are set.
1 parent 2e7c7bc commit fa961fd

3 files changed

Lines changed: 260 additions & 24 deletions

File tree

_typos.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[default.extend-words]
2+
# BRIN: postgres "Block Range Index" type.
3+
BRIN = "BRIN"

src/controllers/restore/builders.rs

Lines changed: 113 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,14 @@ fi
922922
# enough for read-only analytics, and the alternative is a permanently
923923
# unrecoverable replica.
924924
#
925+
# Every pg_resetwal -f invocation also touches /pgdata/needs-reindex-all:
926+
# pg_resetwal bypasses WAL replay, so any index update that was in flight
927+
# at snapshot time may have left torn pages ("unexpected zero page at
928+
# block N" surfaces in queries later, which is hard to debug after the
929+
# fact). The main container's startup hook picks up that flag and runs
930+
# REINDEX DATABASE on every user database before marking the replica
931+
# Ready.
932+
#
925933
# Stage 1: if the first attempt fails with a WAL-recovery signature,
926934
# short-circuit straight to pg_resetwal + retry. Retrying the same
927935
# command won't help when recovery itself is the blocker.
@@ -946,6 +954,7 @@ postgres_single_or_resetwal() {{
946954
echo "WAL recovery failed (snapshot likely captured mid-online-backup) — running pg_resetwal -f and retrying" >&2
947955
rm -f "$logfile"
948956
pg_resetwal -f "$PGDATA"
957+
touch /pgdata/needs-reindex-all
949958
echo "$sql_input" | postgres --single -D "$PGDATA" postgres
950959
return $?
951960
fi
@@ -966,6 +975,7 @@ postgres_single_or_resetwal() {{
966975
echo "second attempt also failed — running pg_resetwal -f as a last resort and retrying" >&2
967976
rm -f "$logfile"
968977
pg_resetwal -f "$PGDATA"
978+
touch /pgdata/needs-reindex-all
969979
echo "$sql_input" | postgres --single -D "$PGDATA" postgres
970980
}}
971981
@@ -1061,7 +1071,7 @@ ALTER ROLE ${{ANALYTICS_USERNAME}} WITH SUPERUSER;
10611071
SQLEOF
10621072
fi
10631073
1064-
if [ -f /pgdata/needs-reindex ]; then
1074+
if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
10651075
PGRO_STAGE=restored
10661076
else
10671077
PGRO_STAGE=ready
@@ -1235,33 +1245,112 @@ echo "Auth setup complete"
12351245
image: Some(pg_image),
12361246
command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]),
12371247
args: Some(vec![r#"
1238-
if [ -f /pgdata/needs-reindex ]; then
1248+
if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
12391249
PG_MAJOR=$(cat /pgdata/pgdata/PG_VERSION)
12401250
(
12411251
while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
12421252
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
1243-
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1244-
INDEXES=$(psql -U postgres -d "$db" -At -c "
1245-
SELECT DISTINCT indexrelid::regclass::text
1246-
FROM pg_index i
1247-
JOIN pg_attribute a ON a.attrelid = i.indexrelid
1248-
WHERE a.attcollation <> 0 AND i.indisvalid;
1249-
")
1250-
COUNT=$(echo "$INDEXES" | grep -c . || true)
1251-
echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)"
1252-
N=0
1253-
echo "$INDEXES" | while IFS= read -r idx; do
1254-
[ -z "$idx" ] && continue
1255-
N=$((N + 1))
1256-
echo " [$N/$COUNT] $db: $idx"
1257-
if [ "$PG_MAJOR" -ge 14 ]; then
1258-
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1259-
else
1260-
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1261-
fi
1253+
# needs-reindex-all (pg_resetwal aftermath) can leave torn pages in
1254+
# ANY index, not just collation-dependent ones. A blind REINDEX
1255+
# DATABASE takes hours on the prod DBs; instead do a two-step
1256+
# smart pass:
1257+
#
1258+
# 1. Use the amcheck contrib extension to read each valid btree
1259+
# index and verify its structural invariants. Indexes that
1260+
# fail the check (including "unexpected zero page" corruption)
1261+
# get queued for REINDEX. Indexes that pass are left alone.
1262+
# For a healthy snapshot this finds nothing and reads
1263+
# ~index-size of disk instead of ~table-size to rewrite.
1264+
#
1265+
# 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST,
1266+
# BRIN, hash). amcheck only covers btree; non-btree indexes
1267+
# are typically a small fraction of total index size so the
1268+
# cost is bounded, and skipping them risks the same
1269+
# post-resetwal corruption sneaking through.
1270+
#
1271+
# CONCURRENTLY when available (PG ≥ 14) so the work overlaps with
1272+
# whatever clients hit the pod after the readiness gate lifts.
1273+
if [ -f /pgdata/needs-reindex-all ]; then
1274+
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1275+
echo "Reindex after pg_resetwal: $db (smart pass via amcheck)"
1276+
psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true
1277+
1278+
# Step 1: scan btree indexes; collect those that fail amcheck.
1279+
BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
1280+
SELECT c.oid::regclass::text
1281+
FROM pg_class c
1282+
JOIN pg_am a ON a.oid = c.relam
1283+
JOIN pg_index i ON i.indexrelid = c.oid
1284+
WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid;
1285+
")
1286+
BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true)
1287+
echo " amcheck scanning $BTREE_COUNT btree indexes in $db"
1288+
CORRUPT_BTREE=""
1289+
N=0
1290+
for idx in $BTREE_INDEXES; do
1291+
[ -z "$idx" ] && continue
1292+
N=$((N + 1))
1293+
# bt_index_check raises an error if the index is corrupt.
1294+
# Suppress its output and check exit code; an error → queue it.
1295+
if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then
1296+
echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx"
1297+
CORRUPT_BTREE="$CORRUPT_BTREE $idx"
1298+
fi
1299+
done
1300+
1301+
# Step 2: list non-btree indexes for blind reindex.
1302+
NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
1303+
SELECT c.oid::regclass::text
1304+
FROM pg_class c
1305+
JOIN pg_am a ON a.oid = c.relam
1306+
JOIN pg_index i ON i.indexrelid = c.oid
1307+
WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid;
1308+
")
1309+
NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true)
1310+
1311+
TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES"
1312+
TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true)
1313+
echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)"
1314+
N=0
1315+
for idx in $TO_REINDEX; do
1316+
[ -z "$idx" ] && continue
1317+
N=$((N + 1))
1318+
echo " [$N/$TOTAL] $db: REINDEX $idx"
1319+
if [ "$PG_MAJOR" -ge 14 ]; then
1320+
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1321+
else
1322+
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1323+
fi
1324+
done
1325+
done
1326+
rm -f /pgdata/needs-reindex-all
1327+
# needs-reindex (collation-dependent only) is a strict subset of
1328+
# what we just did, so clear it too.
1329+
rm -f /pgdata/needs-reindex
1330+
elif [ -f /pgdata/needs-reindex ]; then
1331+
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1332+
INDEXES=$(psql -U postgres -d "$db" -At -c "
1333+
SELECT DISTINCT indexrelid::regclass::text
1334+
FROM pg_index i
1335+
JOIN pg_attribute a ON a.attrelid = i.indexrelid
1336+
WHERE a.attcollation <> 0 AND i.indisvalid;
1337+
")
1338+
COUNT=$(echo "$INDEXES" | grep -c . || true)
1339+
echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)"
1340+
N=0
1341+
echo "$INDEXES" | while IFS= read -r idx; do
1342+
[ -z "$idx" ] && continue
1343+
N=$((N + 1))
1344+
echo " [$N/$COUNT] $db: $idx"
1345+
if [ "$PG_MAJOR" -ge 14 ]; then
1346+
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1347+
else
1348+
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1349+
fi
1350+
done
12621351
done
1263-
done
1264-
rm -f /pgdata/needs-reindex
1352+
rm -f /pgdata/needs-reindex
1353+
fi
12651354
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'ready', last_transition_time = now() WHERE id = 1;"
12661355
echo "Background reindex complete"
12671356
) &
@@ -1308,7 +1397,7 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
13081397
command: Some(vec![
13091398
"/bin/sh".to_string(),
13101399
"-c".to_string(),
1311-
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(),
1400+
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(),
13121401
]),
13131402
}),
13141403
initial_delay_seconds: Some(5),

src/controllers/restore/tests.rs

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,150 @@ fn deployment_init_script_two_stage_pg_resetwal_fallback() {
691691
);
692692
}
693693

694+
#[test]
695+
fn deployment_init_script_flags_full_reindex_after_resetwal() {
696+
// pg_resetwal bypasses WAL replay, so any in-flight index write at
697+
// snapshot time can leave torn pages (postgres later surfaces these
698+
// as "unexpected zero page at block N" when queries hit the index).
699+
// Every pg_resetwal call must therefore touch /pgdata/needs-reindex-all
700+
// so the main container's startup hook runs REINDEX DATABASE on every
701+
// user database before the readiness probe lets traffic in.
702+
let (mut restore, replica) = test_restore_and_replica();
703+
restore.status = Some(PostgresPhysicalRestoreStatus {
704+
postgres_version: Some("16".to_string()),
705+
..Default::default()
706+
});
707+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
708+
let setup_auth = deploy
709+
.spec
710+
.unwrap()
711+
.template
712+
.spec
713+
.unwrap()
714+
.init_containers
715+
.unwrap()
716+
.into_iter()
717+
.find(|c| c.name == "setup-auth")
718+
.expect("setup-auth init container must exist");
719+
let script = setup_auth.args.unwrap().remove(0);
720+
721+
// The flag must be set every time pg_resetwal -f runs (currently
722+
// twice — detected-signature branch and last-resort branch). The
723+
// quick check: at least one `touch /pgdata/needs-reindex-all`
724+
// follows each pg_resetwal invocation, and the count of the touch
725+
// matches the count of resets.
726+
// Count actual invocations (the literal command line) — using just
727+
// "pg_resetwal -f" picks up comments and echo messages too.
728+
let resetwal_calls = script.matches("pg_resetwal -f \"$PGDATA\"").count();
729+
let touch_calls = script.matches("touch /pgdata/needs-reindex-all").count();
730+
assert_eq!(
731+
touch_calls, resetwal_calls,
732+
"every pg_resetwal -f invocation must be paired with `touch /pgdata/needs-reindex-all` (got {touch_calls} touches for {resetwal_calls} resets)"
733+
);
734+
assert!(
735+
resetwal_calls >= 2,
736+
"expected at least two pg_resetwal invocation sites; got {resetwal_calls}"
737+
);
738+
}
739+
740+
#[test]
741+
fn deployment_runtime_reindex_handles_full_database_flag() {
742+
// The main container's startup hook handles the broad flag
743+
// (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX
744+
// only those that fail the structural check (caught the
745+
// "unexpected zero page" failure mode reported in prod) plus all
746+
// non-btree indexes (amcheck only covers btree, so non-btree get
747+
// a blind REINDEX). A blind REINDEX DATABASE would take hours on
748+
// the prod size; the smart pass reads index-size and rewrites
749+
// only the corrupt ones.
750+
let (mut restore, replica) = test_restore_and_replica();
751+
restore.status = Some(PostgresPhysicalRestoreStatus {
752+
postgres_version: Some("16".to_string()),
753+
..Default::default()
754+
});
755+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
756+
let postgres = deploy
757+
.spec
758+
.unwrap()
759+
.template
760+
.spec
761+
.unwrap()
762+
.containers
763+
.into_iter()
764+
.find(|c| c.name == "postgres")
765+
.expect("postgres container must exist");
766+
let script = postgres.args.unwrap().remove(0);
767+
768+
assert!(
769+
script.contains("/pgdata/needs-reindex-all"),
770+
"runtime startup hook must check the needs-reindex-all flag"
771+
);
772+
assert!(
773+
script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"),
774+
"needs-reindex-all branch must enable amcheck for the structural scan"
775+
);
776+
assert!(
777+
script.contains("bt_index_check("),
778+
"needs-reindex-all branch must call bt_index_check on each btree index"
779+
);
780+
assert!(
781+
script.contains("a.amname <> 'btree'"),
782+
"needs-reindex-all branch must collect non-btree indexes for blind REINDEX"
783+
);
784+
assert!(
785+
script.contains("REINDEX INDEX"),
786+
"needs-reindex-all branch must REINDEX (targeted, not the whole DB)"
787+
);
788+
assert!(
789+
!script.contains("REINDEX DATABASE"),
790+
"needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids"
791+
);
792+
assert!(
793+
script.contains("rm -f /pgdata/needs-reindex-all"),
794+
"runtime hook must clear the needs-reindex-all flag after the reindex"
795+
);
796+
}
797+
798+
#[test]
799+
fn deployment_readiness_probe_waits_for_full_reindex() {
800+
// The readiness probe gates traffic on both reindex flags being
801+
// absent, otherwise queries see torn-page indexes between pod-start
802+
// and reindex-complete. The probe used to check only the narrow
803+
// needs-reindex flag.
804+
let (mut restore, replica) = test_restore_and_replica();
805+
restore.status = Some(PostgresPhysicalRestoreStatus {
806+
postgres_version: Some("16".to_string()),
807+
..Default::default()
808+
});
809+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
810+
let postgres = deploy
811+
.spec
812+
.unwrap()
813+
.template
814+
.spec
815+
.unwrap()
816+
.containers
817+
.into_iter()
818+
.find(|c| c.name == "postgres")
819+
.expect("postgres container must exist");
820+
let probe_cmd = postgres
821+
.readiness_probe
822+
.expect("readiness probe must exist")
823+
.exec
824+
.expect("readiness probe must be an exec probe")
825+
.command
826+
.expect("exec probe must have a command");
827+
let probe_script = probe_cmd.join(" ");
828+
assert!(
829+
probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"),
830+
"readiness probe must wait for needs-reindex-all to clear; got: {probe_script}"
831+
);
832+
assert!(
833+
probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
834+
"readiness probe must still wait for needs-reindex; got: {probe_script}"
835+
);
836+
}
837+
694838
#[test]
695839
fn deployment_init_script_overrides_listen_addresses() {
696840
// Some source backups carry `listen_addresses = 'localhost'` in

0 commit comments

Comments
 (0)