Skip to content

Commit dfcf7d2

Browse files
committed
fix(operator): REINDEX DATABASE after pg_resetwal fallback
pg_resetwal bypasses WAL replay — it tells postgres to pretend the data dir is consistent without replaying anything. Any index update that was in flight at snapshot time can leave torn pages, which postgres later surfaces as 'unexpected zero page at block N' when a query happens to hit the corrupted page. The corruption is silent until then, so it surfaces well after the restore completes — long after debugging context is gone. Reuse the existing post-restore reindex infrastructure (the /pgdata/needs-reindex marker, the background hook in the postgres container's startup script, the readiness probe that waits for the marker to clear). Add a second marker file /pgdata/needs-reindex-all that the pg_resetwal fallback paths touch every time they run. The runtime hook now handles both flags: - needs-reindex-all (set by pg_resetwal): REINDEX DATABASE on every user database. Supersedes needs-reindex (its set of indexes is a strict subset). - needs-reindex (set by locale change): existing collation-dependent REINDEX INDEX CONCURRENTLY loop. The readiness probe gates pod-Ready on BOTH flags being absent, so clients don't see corrupted indexes during the window between pod startup and reindex completion. Trade-off: REINDEX DATABASE isn't CONCURRENT so the reindex blocks writes to each table in turn. For a read-only analytics replica the operator's other pre-restore work has already taken the table to single-user mode anyway, and the alternative is letting corrupt indexes reach clients. REINDEX runs in the background after startup, not in the foreground init script, so it doesn't extend the pod-creation timeout.
1 parent 2e7c7bc commit dfcf7d2

2 files changed

Lines changed: 175 additions & 24 deletions

File tree

src/controllers/restore/builders.rs

Lines changed: 51 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,14 @@ fi
922922
# enough for read-only analytics, and the alternative is a permanently
923923
# unrecoverable replica.
924924
#
925+
# Every pg_resetwal -f invocation also touches /pgdata/needs-reindex-all:
926+
# pg_resetwal bypasses WAL replay, so any index update that was in flight
927+
# at snapshot time may have left torn pages ("unexpected zero page at
928+
# block N" surfaces in queries later, which is hard to debug after the
929+
# fact). The main container's startup hook picks up that flag and runs
930+
# REINDEX DATABASE on every user database before marking the replica
931+
# Ready.
932+
#
925933
# Stage 1: if the first attempt fails with a WAL-recovery signature,
926934
# short-circuit straight to pg_resetwal + retry. Retrying the same
927935
# command won't help when recovery itself is the blocker.
@@ -946,6 +954,7 @@ postgres_single_or_resetwal() {{
946954
echo "WAL recovery failed (snapshot likely captured mid-online-backup) — running pg_resetwal -f and retrying" >&2
947955
rm -f "$logfile"
948956
pg_resetwal -f "$PGDATA"
957+
touch /pgdata/needs-reindex-all
949958
echo "$sql_input" | postgres --single -D "$PGDATA" postgres
950959
return $?
951960
fi
@@ -966,6 +975,7 @@ postgres_single_or_resetwal() {{
966975
echo "second attempt also failed — running pg_resetwal -f as a last resort and retrying" >&2
967976
rm -f "$logfile"
968977
pg_resetwal -f "$PGDATA"
978+
touch /pgdata/needs-reindex-all
969979
echo "$sql_input" | postgres --single -D "$PGDATA" postgres
970980
}}
971981
@@ -1061,7 +1071,7 @@ ALTER ROLE ${{ANALYTICS_USERNAME}} WITH SUPERUSER;
10611071
SQLEOF
10621072
fi
10631073
1064-
if [ -f /pgdata/needs-reindex ]; then
1074+
if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
10651075
PGRO_STAGE=restored
10661076
else
10671077
PGRO_STAGE=ready
@@ -1235,33 +1245,50 @@ echo "Auth setup complete"
12351245
image: Some(pg_image),
12361246
command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]),
12371247
args: Some(vec![r#"
1238-
if [ -f /pgdata/needs-reindex ]; then
1248+
if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
12391249
PG_MAJOR=$(cat /pgdata/pgdata/PG_VERSION)
12401250
(
12411251
while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
12421252
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
1243-
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1244-
INDEXES=$(psql -U postgres -d "$db" -At -c "
1245-
SELECT DISTINCT indexrelid::regclass::text
1246-
FROM pg_index i
1247-
JOIN pg_attribute a ON a.attrelid = i.indexrelid
1248-
WHERE a.attcollation <> 0 AND i.indisvalid;
1249-
")
1250-
COUNT=$(echo "$INDEXES" | grep -c . || true)
1251-
echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)"
1252-
N=0
1253-
echo "$INDEXES" | while IFS= read -r idx; do
1254-
[ -z "$idx" ] && continue
1255-
N=$((N + 1))
1256-
echo " [$N/$COUNT] $db: $idx"
1257-
if [ "$PG_MAJOR" -ge 14 ]; then
1258-
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1259-
else
1260-
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1261-
fi
1253+
# needs-reindex-all supersedes needs-reindex: pg_resetwal can leave
1254+
# torn pages in ANY index (not just collation-dependent ones), so
1255+
# we reindex everything via REINDEX DATABASE. CONCURRENTLY isn't
1256+
# available at the DATABASE level, but the script runs in the
1257+
# background after startup so readiness gating (below) hides the
1258+
# work from clients until it's done.
1259+
if [ -f /pgdata/needs-reindex-all ]; then
1260+
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1261+
echo "Reindex after pg_resetwal: $db (REINDEX DATABASE)"
1262+
psql -U postgres -d "$db" -c "REINDEX DATABASE \"$db\";" 2>&1 || true
1263+
done
1264+
rm -f /pgdata/needs-reindex-all
1265+
# needs-reindex (collation-dependent only) is a strict subset of
1266+
# what we just did, so clear it too.
1267+
rm -f /pgdata/needs-reindex
1268+
elif [ -f /pgdata/needs-reindex ]; then
1269+
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1270+
INDEXES=$(psql -U postgres -d "$db" -At -c "
1271+
SELECT DISTINCT indexrelid::regclass::text
1272+
FROM pg_index i
1273+
JOIN pg_attribute a ON a.attrelid = i.indexrelid
1274+
WHERE a.attcollation <> 0 AND i.indisvalid;
1275+
")
1276+
COUNT=$(echo "$INDEXES" | grep -c . || true)
1277+
echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)"
1278+
N=0
1279+
echo "$INDEXES" | while IFS= read -r idx; do
1280+
[ -z "$idx" ] && continue
1281+
N=$((N + 1))
1282+
echo " [$N/$COUNT] $db: $idx"
1283+
if [ "$PG_MAJOR" -ge 14 ]; then
1284+
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1285+
else
1286+
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1287+
fi
1288+
done
12621289
done
1263-
done
1264-
rm -f /pgdata/needs-reindex
1290+
rm -f /pgdata/needs-reindex
1291+
fi
12651292
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'ready', last_transition_time = now() WHERE id = 1;"
12661293
echo "Background reindex complete"
12671294
) &
@@ -1308,7 +1335,7 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
13081335
command: Some(vec![
13091336
"/bin/sh".to_string(),
13101337
"-c".to_string(),
1311-
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(),
1338+
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(),
13121339
]),
13131340
}),
13141341
initial_delay_seconds: Some(5),

src/controllers/restore/tests.rs

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,130 @@ fn deployment_init_script_two_stage_pg_resetwal_fallback() {
691691
);
692692
}
693693

694+
#[test]
695+
fn deployment_init_script_flags_full_reindex_after_resetwal() {
696+
// pg_resetwal bypasses WAL replay, so any in-flight index write at
697+
// snapshot time can leave torn pages (postgres later surfaces these
698+
// as "unexpected zero page at block N" when queries hit the index).
699+
// Every pg_resetwal call must therefore touch /pgdata/needs-reindex-all
700+
// so the main container's startup hook runs REINDEX DATABASE on every
701+
// user database before the readiness probe lets traffic in.
702+
let (mut restore, replica) = test_restore_and_replica();
703+
restore.status = Some(PostgresPhysicalRestoreStatus {
704+
postgres_version: Some("16".to_string()),
705+
..Default::default()
706+
});
707+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
708+
let setup_auth = deploy
709+
.spec
710+
.unwrap()
711+
.template
712+
.spec
713+
.unwrap()
714+
.init_containers
715+
.unwrap()
716+
.into_iter()
717+
.find(|c| c.name == "setup-auth")
718+
.expect("setup-auth init container must exist");
719+
let script = setup_auth.args.unwrap().remove(0);
720+
721+
// The flag must be set every time pg_resetwal -f runs (currently
722+
// twice — detected-signature branch and last-resort branch). The
723+
// quick check: at least one `touch /pgdata/needs-reindex-all`
724+
// follows each pg_resetwal invocation, and the count of the touch
725+
// matches the count of resets.
726+
// Count actual invocations (the literal command line) — using just
727+
// "pg_resetwal -f" picks up comments and echo messages too.
728+
let resetwal_calls = script.matches("pg_resetwal -f \"$PGDATA\"").count();
729+
let touch_calls = script.matches("touch /pgdata/needs-reindex-all").count();
730+
assert_eq!(
731+
touch_calls, resetwal_calls,
732+
"every pg_resetwal -f invocation must be paired with `touch /pgdata/needs-reindex-all` (got {touch_calls} touches for {resetwal_calls} resets)"
733+
);
734+
assert!(
735+
resetwal_calls >= 2,
736+
"expected at least two pg_resetwal invocation sites; got {resetwal_calls}"
737+
);
738+
}
739+
740+
#[test]
741+
fn deployment_runtime_reindex_handles_full_database_flag() {
742+
// The main container's startup hook needs to handle the broad flag
743+
// (needs-reindex-all) by running REINDEX DATABASE on every user
744+
// database — not just the collation-dependent subset the existing
745+
// needs-reindex flag triggers.
746+
let (mut restore, replica) = test_restore_and_replica();
747+
restore.status = Some(PostgresPhysicalRestoreStatus {
748+
postgres_version: Some("16".to_string()),
749+
..Default::default()
750+
});
751+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
752+
let postgres = deploy
753+
.spec
754+
.unwrap()
755+
.template
756+
.spec
757+
.unwrap()
758+
.containers
759+
.into_iter()
760+
.find(|c| c.name == "postgres")
761+
.expect("postgres container must exist");
762+
let script = postgres.args.unwrap().remove(0);
763+
764+
assert!(
765+
script.contains("/pgdata/needs-reindex-all"),
766+
"runtime startup hook must check the needs-reindex-all flag"
767+
);
768+
assert!(
769+
script.contains("REINDEX DATABASE"),
770+
"needs-reindex-all branch must run REINDEX DATABASE"
771+
);
772+
assert!(
773+
script.contains("rm -f /pgdata/needs-reindex-all"),
774+
"runtime hook must clear the needs-reindex-all flag after the reindex"
775+
);
776+
}
777+
778+
#[test]
779+
fn deployment_readiness_probe_waits_for_full_reindex() {
780+
// The readiness probe gates traffic on both reindex flags being
781+
// absent, otherwise queries see torn-page indexes between pod-start
782+
// and reindex-complete. The probe used to check only the narrow
783+
// needs-reindex flag.
784+
let (mut restore, replica) = test_restore_and_replica();
785+
restore.status = Some(PostgresPhysicalRestoreStatus {
786+
postgres_version: Some("16".to_string()),
787+
..Default::default()
788+
});
789+
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
790+
let postgres = deploy
791+
.spec
792+
.unwrap()
793+
.template
794+
.spec
795+
.unwrap()
796+
.containers
797+
.into_iter()
798+
.find(|c| c.name == "postgres")
799+
.expect("postgres container must exist");
800+
let probe_cmd = postgres
801+
.readiness_probe
802+
.expect("readiness probe must exist")
803+
.exec
804+
.expect("readiness probe must be an exec probe")
805+
.command
806+
.expect("exec probe must have a command");
807+
let probe_script = probe_cmd.join(" ");
808+
assert!(
809+
probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"),
810+
"readiness probe must wait for needs-reindex-all to clear; got: {probe_script}"
811+
);
812+
assert!(
813+
probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
814+
"readiness probe must still wait for needs-reindex; got: {probe_script}"
815+
);
816+
}
817+
694818
#[test]
695819
fn deployment_init_script_overrides_listen_addresses() {
696820
// Some source backups carry `listen_addresses = 'localhost'` in

0 commit comments

Comments
 (0)