Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 45 additions & 68 deletions src/controllers/restore/builders.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1251,77 +1251,46 @@ if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
# needs-reindex-all (pg_resetwal aftermath) can leave torn pages in
# ANY index, not just collation-dependent ones. A blind REINDEX
# DATABASE takes hours on the prod DBs; instead do a two-step
# smart pass:
# ANY index, not just collation-dependent ones. We tried a "smart
# pass" using the amcheck contrib extension (scan each btree, queue
# only the corrupt ones for REINDEX) β€” empirically that hits the
# same postgres-internal pathology that wedges other vanilla DDL on
# this dataset: bt_index_check itself burns 100% CPU forever on
# specific indexes with no visible progress, blocking the whole
# reindex behind it.
#
# 1. Use the amcheck contrib extension to read each valid btree
# index and verify its structural invariants. Indexes that
# fail the check (including "unexpected zero page" corruption)
# get queued for REINDEX. Indexes that pass are left alone.
# For a healthy snapshot this finds nothing and reads
# ~index-size of disk instead of ~table-size to rewrite.
# Fall back to blind REINDEX DATABASE. REINDEX reads the heap and
# rebuilds the index from scratch (different code path from amcheck,
# which reads the corrupt index pages directly) and so isn't subject
# to the same wedge. Slow on prod-sized DBs but it makes progress;
# the alternative was a permanently-stuck restore.
#
# 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST,
# BRIN, hash). amcheck only covers btree; non-btree indexes
# are typically a small fraction of total index size so the
# cost is bounded, and skipping them risks the same
# post-resetwal corruption sneaking through.
#
# CONCURRENTLY when available (PG β‰₯ 14) so the work overlaps with
# whatever clients hit the pod after the readiness gate lifts.
# Crucially this branch does NOT remove needs-reindex-all at the
# top of the work β€” the readiness probe ignores -all for exactly
# this reason (see the probe spec below). The pod becomes Ready as
# soon as postgres accepts connections; clients hitting a not-yet-
# reindexed corrupt index get the explicit "unexpected zero page"
# error, retry, succeed once the rebuild lands. After REINDEX
# DATABASE completes for every user db, the flag is cleared and
# _pgro.restore_info.stage flips to ready.
if [ -f /pgdata/needs-reindex-all ]; then
# CONCURRENTLY (PG β‰₯ 12) builds replacement indexes alongside the
# existing ones and atomically swaps. Clients can keep using the
# old indexes during the rebuild β€” they'll see "unexpected zero
# page" only if a query happens to hit a corrupt page on the old
# side; once the swap lands the corruption is gone.
#
# REINDEX DATABASE CONCURRENTLY skips system catalogs (PG won't
# CONCURRENTLY them). For an analytics replica that's the right
# trade: user-data indexes matter for client queries, system
# catalog corruption shows up as different errors and is rare.
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
echo "Reindex after pg_resetwal: $db (smart pass via amcheck)"
psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true

# Step 1: scan btree indexes; collect those that fail amcheck.
BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
SELECT c.oid::regclass::text
FROM pg_class c
JOIN pg_am a ON a.oid = c.relam
JOIN pg_index i ON i.indexrelid = c.oid
WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid;
")
BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true)
echo " amcheck scanning $BTREE_COUNT btree indexes in $db"
CORRUPT_BTREE=""
N=0
for idx in $BTREE_INDEXES; do
[ -z "$idx" ] && continue
N=$((N + 1))
# bt_index_check raises an error if the index is corrupt.
# Suppress its output and check exit code; an error β†’ queue it.
if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then
echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx"
CORRUPT_BTREE="$CORRUPT_BTREE $idx"
fi
done

# Step 2: list non-btree indexes for blind reindex.
NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
SELECT c.oid::regclass::text
FROM pg_class c
JOIN pg_am a ON a.oid = c.relam
JOIN pg_index i ON i.indexrelid = c.oid
WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid;
")
NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true)

TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES"
TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true)
echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)"
N=0
for idx in $TO_REINDEX; do
[ -z "$idx" ] && continue
N=$((N + 1))
echo " [$N/$TOTAL] $db: REINDEX $idx"
if [ "$PG_MAJOR" -ge 14 ]; then
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
else
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
fi
done
echo "Reindex after pg_resetwal: $db (REINDEX DATABASE CONCURRENTLY)"
if [ "$PG_MAJOR" -ge 12 ]; then
psql -U postgres -d "$db" -c "REINDEX DATABASE CONCURRENTLY \"$db\";" 2>&1 || true
else
psql -U postgres -d "$db" -c "REINDEX DATABASE \"$db\";" 2>&1 || true
fi
done
rm -f /pgdata/needs-reindex-all
# needs-reindex (collation-dependent only) is a strict subset of
Expand Down Expand Up @@ -1397,7 +1366,15 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
command: Some(vec![
"/bin/sh".to_string(),
"-c".to_string(),
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(),
// Gate readiness on the locale-only needs-reindex flag
// (small, fast, finishes in seconds-to-minutes) but NOT on
// needs-reindex-all (post-pg_resetwal blind REINDEX DATABASE
// β€” takes hours on prod-sized indexes; gating here would
// trip the operator's deployment_ready_timeout). The -all
// reindex runs in the background; clients hitting a
// not-yet-reindexed corrupt index see the explicit
// "unexpected zero page" error and retry.
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(),
]),
}),
initial_delay_seconds: Some(5),
Expand Down
59 changes: 29 additions & 30 deletions src/controllers/restore/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -740,13 +740,12 @@ fn deployment_init_script_flags_full_reindex_after_resetwal() {
#[test]
fn deployment_runtime_reindex_handles_full_database_flag() {
// The main container's startup hook handles the broad flag
// (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX
// only those that fail the structural check (caught the
// "unexpected zero page" failure mode reported in prod) plus all
// non-btree indexes (amcheck only covers btree, so non-btree get
// a blind REINDEX). A blind REINDEX DATABASE would take hours on
// the prod size; the smart pass reads index-size and rewrites
// only the corrupt ones.
// (needs-reindex-all) via blind REINDEX DATABASE on every user DB.
// The earlier amcheck-driven smart pass turned out to hit the same
// postgres-internal pathology that wedges other vanilla DDL on
// the prod data β€” bt_index_check burned 100% CPU forever on
// individual indexes. REINDEX uses a different code path (reads
// the heap, rebuilds from scratch) and doesn't trip that wedge.
let (mut restore, replica) = test_restore_and_replica();
restore.status = Some(PostgresPhysicalRestoreStatus {
postgres_version: Some("16".to_string()),
Expand All @@ -770,24 +769,20 @@ fn deployment_runtime_reindex_handles_full_database_flag() {
"runtime startup hook must check the needs-reindex-all flag"
);
assert!(
script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"),
"needs-reindex-all branch must enable amcheck for the structural scan"
script.contains("REINDEX DATABASE CONCURRENTLY"),
"needs-reindex-all branch must run REINDEX DATABASE CONCURRENTLY on PG β‰₯ 12 so clients can keep querying the old indexes during the rebuild"
);
// The PG < 12 fallback uses plain REINDEX DATABASE β€” verify the
// version gate exists in the script. (We can't string-match on the
// literal SQL because the shell-quoted `\"` form differs from a
// Rust string literal.)
assert!(
script.contains("bt_index_check("),
"needs-reindex-all branch must call bt_index_check on each btree index"
script.contains(r#""$PG_MAJOR" -ge 12"#),
"needs-reindex-all branch must gate CONCURRENTLY behind a PG β‰₯ 12 check"
);
assert!(
script.contains("a.amname <> 'btree'"),
"needs-reindex-all branch must collect non-btree indexes for blind REINDEX"
);
assert!(
script.contains("REINDEX INDEX"),
"needs-reindex-all branch must REINDEX (targeted, not the whole DB)"
);
assert!(
!script.contains("REINDEX DATABASE"),
"needs-reindex-all must not REINDEX DATABASE β€” it's the expensive hammer the smart pass avoids"
!script.contains("bt_index_check("),
"runtime hook must not call bt_index_check β€” amcheck wedges on the prod data"
);
assert!(
script.contains("rm -f /pgdata/needs-reindex-all"),
Expand All @@ -796,11 +791,15 @@ fn deployment_runtime_reindex_handles_full_database_flag() {
}

#[test]
fn deployment_readiness_probe_waits_for_full_reindex() {
// The readiness probe gates traffic on both reindex flags being
// absent, otherwise queries see torn-page indexes between pod-start
// and reindex-complete. The probe used to check only the narrow
// needs-reindex flag.
fn deployment_readiness_probe_only_gates_on_locale_reindex() {
// The readiness probe waits for the locale-only `needs-reindex`
// flag to clear (small, fast, finishes in seconds) but NOT for
// `needs-reindex-all` (the post-pg_resetwal blind REINDEX DATABASE
// β€” takes hours on prod-sized indexes; gating here would trip the
// operator's deployment_ready_timeout and block restores
// indefinitely). The -all reindex runs in the background; clients
// hitting a not-yet-reindexed corrupt index see the explicit
// "unexpected zero page" error and can retry.
let (mut restore, replica) = test_restore_and_replica();
restore.status = Some(PostgresPhysicalRestoreStatus {
postgres_version: Some("16".to_string()),
Expand All @@ -826,12 +825,12 @@ fn deployment_readiness_probe_waits_for_full_reindex() {
.expect("exec probe must have a command");
let probe_script = probe_cmd.join(" ");
assert!(
probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"),
"readiness probe must wait for needs-reindex-all to clear; got: {probe_script}"
probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
"readiness probe must still wait for the locale-only needs-reindex flag; got: {probe_script}"
);
assert!(
probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
"readiness probe must still wait for needs-reindex; got: {probe_script}"
!probe_script.contains("needs-reindex-all"),
"readiness probe must NOT gate on needs-reindex-all (the long-running post-resetwal reindex); got: {probe_script}"
);
}

Expand Down