Skip to content

Commit d9c6e95

Browse files
authored
fix(operator): targeted reindex after pg_resetwal via amcheck (#68)
2 parents 2e7c7bc + fa961fd commit d9c6e95

3 files changed

Lines changed: 260 additions & 24 deletions

File tree

_typos.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[default.extend-words]
2+
# BRIN: postgres "Block Range Index" type.
3+
BRIN = "BRIN"

src/controllers/restore/builders.rs

Lines changed: 113 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,14 @@ fi
922922
# enough for read-only analytics, and the alternative is a permanently
923923
# unrecoverable replica.
924924
#
925+
# Every pg_resetwal -f invocation also touches /pgdata/needs-reindex-all:
926+
# pg_resetwal bypasses WAL replay, so any index update that was in flight
927+
# at snapshot time may have left torn pages ("unexpected zero page at
928+
# block N" surfaces in queries later, which is hard to debug after the
929+
# fact). The main container's startup hook picks up that flag and runs
930+
# REINDEX DATABASE on every user database before marking the replica
931+
# Ready.
932+
#
925933
# Stage 1: if the first attempt fails with a WAL-recovery signature,
926934
# short-circuit straight to pg_resetwal + retry. Retrying the same
927935
# command won't help when recovery itself is the blocker.
@@ -946,6 +954,7 @@ postgres_single_or_resetwal() {{
946954
echo "WAL recovery failed (snapshot likely captured mid-online-backup) — running pg_resetwal -f and retrying" >&2
947955
rm -f "$logfile"
948956
pg_resetwal -f "$PGDATA"
957+
touch /pgdata/needs-reindex-all
949958
echo "$sql_input" | postgres --single -D "$PGDATA" postgres
950959
return $?
951960
fi
@@ -966,6 +975,7 @@ postgres_single_or_resetwal() {{
966975
echo "second attempt also failed — running pg_resetwal -f as a last resort and retrying" >&2
967976
rm -f "$logfile"
968977
pg_resetwal -f "$PGDATA"
978+
touch /pgdata/needs-reindex-all
969979
echo "$sql_input" | postgres --single -D "$PGDATA" postgres
970980
}}
971981
@@ -1061,7 +1071,7 @@ ALTER ROLE ${{ANALYTICS_USERNAME}} WITH SUPERUSER;
10611071
SQLEOF
10621072
fi
10631073
1064-
if [ -f /pgdata/needs-reindex ]; then
1074+
if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
10651075
PGRO_STAGE=restored
10661076
else
10671077
PGRO_STAGE=ready
@@ -1235,33 +1245,112 @@ echo "Auth setup complete"
12351245
image: Some(pg_image),
12361246
command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]),
12371247
args: Some(vec![r#"
1238-
if [ -f /pgdata/needs-reindex ]; then
1248+
if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
12391249
PG_MAJOR=$(cat /pgdata/pgdata/PG_VERSION)
12401250
(
12411251
while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
12421252
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
1243-
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1244-
INDEXES=$(psql -U postgres -d "$db" -At -c "
1245-
SELECT DISTINCT indexrelid::regclass::text
1246-
FROM pg_index i
1247-
JOIN pg_attribute a ON a.attrelid = i.indexrelid
1248-
WHERE a.attcollation <> 0 AND i.indisvalid;
1249-
")
1250-
COUNT=$(echo "$INDEXES" | grep -c . || true)
1251-
echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)"
1252-
N=0
1253-
echo "$INDEXES" | while IFS= read -r idx; do
1254-
[ -z "$idx" ] && continue
1255-
N=$((N + 1))
1256-
echo " [$N/$COUNT] $db: $idx"
1257-
if [ "$PG_MAJOR" -ge 14 ]; then
1258-
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1259-
else
1260-
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1261-
fi
1253+
# needs-reindex-all (pg_resetwal aftermath) can leave torn pages in
1254+
# ANY index, not just collation-dependent ones. A blind REINDEX
1255+
# DATABASE takes hours on the prod DBs; instead do a two-step
1256+
# smart pass:
1257+
#
1258+
# 1. Use the amcheck contrib extension to read each valid btree
1259+
# index and verify its structural invariants. Indexes that
1260+
# fail the check (including "unexpected zero page" corruption)
1261+
# get queued for REINDEX. Indexes that pass are left alone.
1262+
# For a healthy snapshot this finds nothing and reads
1263+
# ~index-size of disk instead of ~table-size to rewrite.
1264+
#
1265+
# 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST,
1266+
# BRIN, hash). amcheck only covers btree; non-btree indexes
1267+
# are typically a small fraction of total index size so the
1268+
# cost is bounded, and skipping them risks the same
1269+
# post-resetwal corruption sneaking through.
1270+
#
1271+
# CONCURRENTLY when available (PG ≥ 14) so the work overlaps with
1272+
# whatever clients hit the pod after the readiness gate lifts.
1273+
if [ -f /pgdata/needs-reindex-all ]; then
1274+
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1275+
echo "Reindex after pg_resetwal: $db (smart pass via amcheck)"
1276+
psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true
1277+
1278+
# Step 1: scan btree indexes; collect those that fail amcheck.
1279+
BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
1280+
SELECT c.oid::regclass::text
1281+
FROM pg_class c
1282+
JOIN pg_am a ON a.oid = c.relam
1283+
JOIN pg_index i ON i.indexrelid = c.oid
1284+
WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid;
1285+
")
1286+
BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true)
1287+
echo " amcheck scanning $BTREE_COUNT btree indexes in $db"
1288+
CORRUPT_BTREE=""
1289+
N=0
1290+
for idx in $BTREE_INDEXES; do
1291+
[ -z "$idx" ] && continue
1292+
N=$((N + 1))
1293+
# bt_index_check raises an error if the index is corrupt.
1294+
# Suppress its output and check exit code; an error → queue it.
1295+
if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then
1296+
echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx"
1297+
CORRUPT_BTREE="$CORRUPT_BTREE $idx"
1298+
fi
1299+
done
1300+
1301+
# Step 2: list non-btree indexes for blind reindex.
1302+
NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
1303+
SELECT c.oid::regclass::text
1304+
FROM pg_class c
1305+
JOIN pg_am a ON a.oid = c.relam
1306+
JOIN pg_index i ON i.indexrelid = c.oid
1307+
WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid;
1308+
")
1309+
NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true)
1310+
1311+
TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES"
1312+
TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true)
1313+
echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)"
1314+
N=0
1315+
for idx in $TO_REINDEX; do
1316+
[ -z "$idx" ] && continue
1317+
N=$((N + 1))
1318+
echo " [$N/$TOTAL] $db: REINDEX $idx"
1319+
if [ "$PG_MAJOR" -ge 14 ]; then
1320+
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1321+
else
1322+
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1323+
fi
1324+
done
1325+
done
1326+
rm -f /pgdata/needs-reindex-all
1327+
# needs-reindex (collation-dependent only) is a strict subset of
1328+
# what we just did, so clear it too.
1329+
rm -f /pgdata/needs-reindex
1330+
elif [ -f /pgdata/needs-reindex ]; then
1331+
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1332+
INDEXES=$(psql -U postgres -d "$db" -At -c "
1333+
SELECT DISTINCT indexrelid::regclass::text
1334+
FROM pg_index i
1335+
JOIN pg_attribute a ON a.attrelid = i.indexrelid
1336+
WHERE a.attcollation <> 0 AND i.indisvalid;
1337+
")
1338+
COUNT=$(echo "$INDEXES" | grep -c . || true)
1339+
echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)"
1340+
N=0
1341+
echo "$INDEXES" | while IFS= read -r idx; do
1342+
[ -z "$idx" ] && continue
1343+
N=$((N + 1))
1344+
echo " [$N/$COUNT] $db: $idx"
1345+
if [ "$PG_MAJOR" -ge 14 ]; then
1346+
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1347+
else
1348+
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1349+
fi
1350+
done
12621351
done
1263-
done
1264-
rm -f /pgdata/needs-reindex
1352+
rm -f /pgdata/needs-reindex
1353+
fi
12651354
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'ready', last_transition_time = now() WHERE id = 1;"
12661355
echo "Background reindex complete"
12671356
) &
@@ -1308,7 +1397,7 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
13081397
command: Some(vec![
13091398
"/bin/sh".to_string(),
13101399
"-c".to_string(),
1311-
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(),
1400+
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(),
13121401
]),
13131402
}),
13141403
initial_delay_seconds: Some(5),

src/controllers/restore/tests.rs

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,150 @@ fn deployment_init_script_two_stage_pg_resetwal_fallback() {
691691
);
692692
}
693693

694+
#[test]
695+
fn deployment_init_script_flags_full_reindex_after_resetwal() {
696+
// pg_resetwal bypasses WAL replay, so any in-flight index write at
697+
// snapshot time can leave torn pages (postgres later surfaces these
698+
// as "unexpected zero page at block N" when queries hit the index).
699+
// Every pg_resetwal call must therefore touch /pgdata/needs-reindex-all
700+
// so the main container's startup hook runs REINDEX DATABASE on every
701+
// user database before the readiness probe lets traffic in.
702+
let (mut restore, replica) = test_restore_and_replica();
703+
restore.status = Some(PostgresPhysicalRestoreStatus {
704+
postgres_version: Some("16".to_string()),
705+
..Default::default()
706+
});
707+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
708+
let setup_auth = deploy
709+
.spec
710+
.unwrap()
711+
.template
712+
.spec
713+
.unwrap()
714+
.init_containers
715+
.unwrap()
716+
.into_iter()
717+
.find(|c| c.name == "setup-auth")
718+
.expect("setup-auth init container must exist");
719+
let script = setup_auth.args.unwrap().remove(0);
720+
721+
// The flag must be set every time pg_resetwal -f runs (currently
722+
// twice — detected-signature branch and last-resort branch). The
723+
// quick check: at least one `touch /pgdata/needs-reindex-all`
724+
// follows each pg_resetwal invocation, and the count of the touch
725+
// matches the count of resets.
726+
// Count actual invocations (the literal command line) — using just
727+
// "pg_resetwal -f" picks up comments and echo messages too.
728+
let resetwal_calls = script.matches("pg_resetwal -f \"$PGDATA\"").count();
729+
let touch_calls = script.matches("touch /pgdata/needs-reindex-all").count();
730+
assert_eq!(
731+
touch_calls, resetwal_calls,
732+
"every pg_resetwal -f invocation must be paired with `touch /pgdata/needs-reindex-all` (got {touch_calls} touches for {resetwal_calls} resets)"
733+
);
734+
assert!(
735+
resetwal_calls >= 2,
736+
"expected at least two pg_resetwal invocation sites; got {resetwal_calls}"
737+
);
738+
}
739+
740+
#[test]
741+
fn deployment_runtime_reindex_handles_full_database_flag() {
742+
// The main container's startup hook handles the broad flag
743+
// (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX
744+
// only those that fail the structural check (caught the
745+
// "unexpected zero page" failure mode reported in prod) plus all
746+
// non-btree indexes (amcheck only covers btree, so non-btree get
747+
// a blind REINDEX). A blind REINDEX DATABASE would take hours on
748+
// the prod size; the smart pass reads index-size and rewrites
749+
// only the corrupt ones.
750+
let (mut restore, replica) = test_restore_and_replica();
751+
restore.status = Some(PostgresPhysicalRestoreStatus {
752+
postgres_version: Some("16".to_string()),
753+
..Default::default()
754+
});
755+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
756+
let postgres = deploy
757+
.spec
758+
.unwrap()
759+
.template
760+
.spec
761+
.unwrap()
762+
.containers
763+
.into_iter()
764+
.find(|c| c.name == "postgres")
765+
.expect("postgres container must exist");
766+
let script = postgres.args.unwrap().remove(0);
767+
768+
assert!(
769+
script.contains("/pgdata/needs-reindex-all"),
770+
"runtime startup hook must check the needs-reindex-all flag"
771+
);
772+
assert!(
773+
script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"),
774+
"needs-reindex-all branch must enable amcheck for the structural scan"
775+
);
776+
assert!(
777+
script.contains("bt_index_check("),
778+
"needs-reindex-all branch must call bt_index_check on each btree index"
779+
);
780+
assert!(
781+
script.contains("a.amname <> 'btree'"),
782+
"needs-reindex-all branch must collect non-btree indexes for blind REINDEX"
783+
);
784+
assert!(
785+
script.contains("REINDEX INDEX"),
786+
"needs-reindex-all branch must REINDEX (targeted, not the whole DB)"
787+
);
788+
assert!(
789+
!script.contains("REINDEX DATABASE"),
790+
"needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids"
791+
);
792+
assert!(
793+
script.contains("rm -f /pgdata/needs-reindex-all"),
794+
"runtime hook must clear the needs-reindex-all flag after the reindex"
795+
);
796+
}
797+
798+
#[test]
799+
fn deployment_readiness_probe_waits_for_full_reindex() {
800+
// The readiness probe gates traffic on both reindex flags being
801+
// absent, otherwise queries see torn-page indexes between pod-start
802+
// and reindex-complete. The probe used to check only the narrow
803+
// needs-reindex flag.
804+
let (mut restore, replica) = test_restore_and_replica();
805+
restore.status = Some(PostgresPhysicalRestoreStatus {
806+
postgres_version: Some("16".to_string()),
807+
..Default::default()
808+
});
809+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
810+
let postgres = deploy
811+
.spec
812+
.unwrap()
813+
.template
814+
.spec
815+
.unwrap()
816+
.containers
817+
.into_iter()
818+
.find(|c| c.name == "postgres")
819+
.expect("postgres container must exist");
820+
let probe_cmd = postgres
821+
.readiness_probe
822+
.expect("readiness probe must exist")
823+
.exec
824+
.expect("readiness probe must be an exec probe")
825+
.command
826+
.expect("exec probe must have a command");
827+
let probe_script = probe_cmd.join(" ");
828+
assert!(
829+
probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"),
830+
"readiness probe must wait for needs-reindex-all to clear; got: {probe_script}"
831+
);
832+
assert!(
833+
probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
834+
"readiness probe must still wait for needs-reindex; got: {probe_script}"
835+
);
836+
}
837+
694838
#[test]
695839
fn deployment_init_script_overrides_listen_addresses() {
696840
// Some source backups carry `listen_addresses = 'localhost'` in

0 commit comments

Comments
 (0)