Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions _typos.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[default.extend-words]
# BRIN: postgres "Block Range Index" type.
BRIN = "BRIN"
137 changes: 113 additions & 24 deletions src/controllers/restore/builders.rs
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,14 @@ fi
# enough for read-only analytics, and the alternative is a permanently
# unrecoverable replica.
#
# Every pg_resetwal -f invocation also touches /pgdata/needs-reindex-all:
# pg_resetwal bypasses WAL replay, so any index update that was in flight
# at snapshot time may have left torn pages ("unexpected zero page at
# block N" surfaces in queries later, which is hard to debug after the
# fact). The main container's startup hook picks up that flag and runs
# REINDEX DATABASE on every user database before marking the replica
# Ready.
#
# Stage 1: if the first attempt fails with a WAL-recovery signature,
# short-circuit straight to pg_resetwal + retry. Retrying the same
# command won't help when recovery itself is the blocker.
Expand All @@ -946,6 +954,7 @@ postgres_single_or_resetwal() {{
echo "WAL recovery failed (snapshot likely captured mid-online-backup) — running pg_resetwal -f and retrying" >&2
rm -f "$logfile"
pg_resetwal -f "$PGDATA"
touch /pgdata/needs-reindex-all
echo "$sql_input" | postgres --single -D "$PGDATA" postgres
return $?
fi
Expand All @@ -966,6 +975,7 @@ postgres_single_or_resetwal() {{
echo "second attempt also failed — running pg_resetwal -f as a last resort and retrying" >&2
rm -f "$logfile"
pg_resetwal -f "$PGDATA"
touch /pgdata/needs-reindex-all
echo "$sql_input" | postgres --single -D "$PGDATA" postgres
}}

Expand Down Expand Up @@ -1061,7 +1071,7 @@ ALTER ROLE ${{ANALYTICS_USERNAME}} WITH SUPERUSER;
SQLEOF
fi

if [ -f /pgdata/needs-reindex ]; then
if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
PGRO_STAGE=restored
else
PGRO_STAGE=ready
Expand Down Expand Up @@ -1235,33 +1245,112 @@ echo "Auth setup complete"
image: Some(pg_image),
command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]),
args: Some(vec![r#"
if [ -f /pgdata/needs-reindex ]; then
if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
PG_MAJOR=$(cat /pgdata/pgdata/PG_VERSION)
(
while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
INDEXES=$(psql -U postgres -d "$db" -At -c "
SELECT DISTINCT indexrelid::regclass::text
FROM pg_index i
JOIN pg_attribute a ON a.attrelid = i.indexrelid
WHERE a.attcollation <> 0 AND i.indisvalid;
")
COUNT=$(echo "$INDEXES" | grep -c . || true)
echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)"
N=0
echo "$INDEXES" | while IFS= read -r idx; do
[ -z "$idx" ] && continue
N=$((N + 1))
echo " [$N/$COUNT] $db: $idx"
if [ "$PG_MAJOR" -ge 14 ]; then
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
else
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
fi
# needs-reindex-all (pg_resetwal aftermath) can leave torn pages in
# ANY index, not just collation-dependent ones. A blind REINDEX
# DATABASE takes hours on the prod DBs; instead do a two-step
# smart pass:
#
# 1. Use the amcheck contrib extension to read each valid btree
# index and verify its structural invariants. Indexes that
# fail the check (including "unexpected zero page" corruption)
# get queued for REINDEX. Indexes that pass are left alone.
# For a healthy snapshot this finds nothing and reads
# ~index-size of disk instead of ~table-size to rewrite.
#
# 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST,
# BRIN, hash). amcheck only covers btree; non-btree indexes
# are typically a small fraction of total index size so the
# cost is bounded, and skipping them risks the same
# post-resetwal corruption sneaking through.
#
# CONCURRENTLY when available (PG ≥ 14) so the work overlaps with
# whatever clients hit the pod after the readiness gate lifts.
if [ -f /pgdata/needs-reindex-all ]; then
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
echo "Reindex after pg_resetwal: $db (smart pass via amcheck)"
psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true

# Step 1: scan btree indexes; collect those that fail amcheck.
BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
SELECT c.oid::regclass::text
FROM pg_class c
JOIN pg_am a ON a.oid = c.relam
JOIN pg_index i ON i.indexrelid = c.oid
WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid;
")
BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true)
echo " amcheck scanning $BTREE_COUNT btree indexes in $db"
CORRUPT_BTREE=""
N=0
for idx in $BTREE_INDEXES; do
[ -z "$idx" ] && continue
N=$((N + 1))
# bt_index_check raises an error if the index is corrupt.
# Suppress its output and check exit code; an error → queue it.
if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then
echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx"
CORRUPT_BTREE="$CORRUPT_BTREE $idx"
fi
done

# Step 2: list non-btree indexes for blind reindex.
NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
SELECT c.oid::regclass::text
FROM pg_class c
JOIN pg_am a ON a.oid = c.relam
JOIN pg_index i ON i.indexrelid = c.oid
WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid;
")
NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true)

TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES"
TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true)
echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)"
N=0
for idx in $TO_REINDEX; do
[ -z "$idx" ] && continue
N=$((N + 1))
echo " [$N/$TOTAL] $db: REINDEX $idx"
if [ "$PG_MAJOR" -ge 14 ]; then
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
else
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
fi
done
done
rm -f /pgdata/needs-reindex-all
# needs-reindex (collation-dependent only) is a strict subset of
# what we just did, so clear it too.
rm -f /pgdata/needs-reindex
elif [ -f /pgdata/needs-reindex ]; then
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
INDEXES=$(psql -U postgres -d "$db" -At -c "
SELECT DISTINCT indexrelid::regclass::text
FROM pg_index i
JOIN pg_attribute a ON a.attrelid = i.indexrelid
WHERE a.attcollation <> 0 AND i.indisvalid;
")
COUNT=$(echo "$INDEXES" | grep -c . || true)
echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)"
N=0
echo "$INDEXES" | while IFS= read -r idx; do
[ -z "$idx" ] && continue
N=$((N + 1))
echo " [$N/$COUNT] $db: $idx"
if [ "$PG_MAJOR" -ge 14 ]; then
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
else
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
fi
done
done
done
rm -f /pgdata/needs-reindex
rm -f /pgdata/needs-reindex
fi
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'ready', last_transition_time = now() WHERE id = 1;"
echo "Background reindex complete"
) &
Expand Down Expand Up @@ -1308,7 +1397,7 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
command: Some(vec![
"/bin/sh".to_string(),
"-c".to_string(),
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(),
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(),
]),
}),
initial_delay_seconds: Some(5),
Expand Down
144 changes: 144 additions & 0 deletions src/controllers/restore/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,150 @@ fn deployment_init_script_two_stage_pg_resetwal_fallback() {
);
}

#[test]
fn deployment_init_script_flags_full_reindex_after_resetwal() {
// pg_resetwal bypasses WAL replay, so any in-flight index write at
// snapshot time can leave torn pages (postgres later surfaces these
// as "unexpected zero page at block N" when queries hit the index).
// Every pg_resetwal call must therefore touch /pgdata/needs-reindex-all
// so the main container's startup hook runs REINDEX DATABASE on every
// user database before the readiness probe lets traffic in.
let (mut restore, replica) = test_restore_and_replica();
restore.status = Some(PostgresPhysicalRestoreStatus {
postgres_version: Some("16".to_string()),
..Default::default()
});
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
let setup_auth = deploy
.spec
.unwrap()
.template
.spec
.unwrap()
.init_containers
.unwrap()
.into_iter()
.find(|c| c.name == "setup-auth")
.expect("setup-auth init container must exist");
let script = setup_auth.args.unwrap().remove(0);

// The flag must be set every time pg_resetwal -f runs (currently
// twice — detected-signature branch and last-resort branch). The
// quick check: at least one `touch /pgdata/needs-reindex-all`
// follows each pg_resetwal invocation, and the count of the touch
// matches the count of resets.
// Count actual invocations (the literal command line) — using just
// "pg_resetwal -f" picks up comments and echo messages too.
let resetwal_calls = script.matches("pg_resetwal -f \"$PGDATA\"").count();
let touch_calls = script.matches("touch /pgdata/needs-reindex-all").count();
assert_eq!(
touch_calls, resetwal_calls,
"every pg_resetwal -f invocation must be paired with `touch /pgdata/needs-reindex-all` (got {touch_calls} touches for {resetwal_calls} resets)"
);
assert!(
resetwal_calls >= 2,
"expected at least two pg_resetwal invocation sites; got {resetwal_calls}"
);
}

#[test]
fn deployment_runtime_reindex_handles_full_database_flag() {
// The main container's startup hook handles the broad flag
// (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX
// only those that fail the structural check (caught the
// "unexpected zero page" failure mode reported in prod) plus all
// non-btree indexes (amcheck only covers btree, so non-btree get
// a blind REINDEX). A blind REINDEX DATABASE would take hours on
// the prod size; the smart pass reads index-size and rewrites
// only the corrupt ones.
let (mut restore, replica) = test_restore_and_replica();
restore.status = Some(PostgresPhysicalRestoreStatus {
postgres_version: Some("16".to_string()),
..Default::default()
});
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
let postgres = deploy
.spec
.unwrap()
.template
.spec
.unwrap()
.containers
.into_iter()
.find(|c| c.name == "postgres")
.expect("postgres container must exist");
let script = postgres.args.unwrap().remove(0);

assert!(
script.contains("/pgdata/needs-reindex-all"),
"runtime startup hook must check the needs-reindex-all flag"
);
assert!(
script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"),
"needs-reindex-all branch must enable amcheck for the structural scan"
);
assert!(
script.contains("bt_index_check("),
"needs-reindex-all branch must call bt_index_check on each btree index"
);
assert!(
script.contains("a.amname <> 'btree'"),
"needs-reindex-all branch must collect non-btree indexes for blind REINDEX"
);
assert!(
script.contains("REINDEX INDEX"),
"needs-reindex-all branch must REINDEX (targeted, not the whole DB)"
);
assert!(
!script.contains("REINDEX DATABASE"),
"needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids"
);
assert!(
script.contains("rm -f /pgdata/needs-reindex-all"),
"runtime hook must clear the needs-reindex-all flag after the reindex"
);
}

#[test]
fn deployment_readiness_probe_waits_for_full_reindex() {
// The readiness probe gates traffic on both reindex flags being
// absent, otherwise queries see torn-page indexes between pod-start
// and reindex-complete. The probe used to check only the narrow
// needs-reindex flag.
let (mut restore, replica) = test_restore_and_replica();
restore.status = Some(PostgresPhysicalRestoreStatus {
postgres_version: Some("16".to_string()),
..Default::default()
});
let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap();
let postgres = deploy
.spec
.unwrap()
.template
.spec
.unwrap()
.containers
.into_iter()
.find(|c| c.name == "postgres")
.expect("postgres container must exist");
let probe_cmd = postgres
.readiness_probe
.expect("readiness probe must exist")
.exec
.expect("readiness probe must be an exec probe")
.command
.expect("exec probe must have a command");
let probe_script = probe_cmd.join(" ");
assert!(
probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"),
"readiness probe must wait for needs-reindex-all to clear; got: {probe_script}"
);
assert!(
probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
"readiness probe must still wait for needs-reindex; got: {probe_script}"
);
}

#[test]
fn deployment_init_script_overrides_listen_addresses() {
// Some source backups carry `listen_addresses = 'localhost'` in
Expand Down