From fa961fdbb8af02f20a5f8e783cf614bdbea8e39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Tue, 9 Jun 2026 18:52:24 +1200 Subject: [PATCH] fix(operator): targeted reindex after pg_resetwal via amcheck MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pg_resetwal bypasses WAL replay — it tells postgres to act as if the data dir is consistent without replaying anything. Any index update in flight at snapshot time can leave torn pages, which postgres later surfaces as 'unexpected zero page at block N' when a query happens to hit the corrupted index. Downstream users reported exactly this against a primary key, with the hint 'Please REINDEX it.' A blind REINDEX DATABASE takes hours on the prod-sized DBs (known from prior experience). Instead, use postgres's amcheck contrib extension to do a structural scan: read each valid btree index, verify its invariants, queue only the failing ones for REINDEX. Plus a blind REINDEX of every non-btree index (amcheck only covers btree, non-btree are usually a tiny fraction of total index size). Reuses the existing post-restore reindex infrastructure (the /pgdata/needs-reindex marker, the background hook in the postgres container startup, the readiness probe gate). Adds a second marker /pgdata/needs-reindex-all set by every pg_resetwal -f invocation in the operator's two-stage fallback path. The runtime hook handles both flags, with the -all branch supplanting the narrow locale-only branch when both are set. --- _typos.toml | 3 + src/controllers/restore/builders.rs | 137 +++++++++++++++++++++----- src/controllers/restore/tests.rs | 144 ++++++++++++++++++++++++++++ 3 files changed, 260 insertions(+), 24 deletions(-) create mode 100644 _typos.toml diff --git a/_typos.toml b/_typos.toml new file mode 100644 index 0000000..b67b8b7 --- /dev/null +++ b/_typos.toml @@ -0,0 +1,3 @@ +[default.extend-words] +# BRIN: postgres "Block Range Index" type. +BRIN = "BRIN" diff --git a/src/controllers/restore/builders.rs b/src/controllers/restore/builders.rs index 58d7fd6..82c0953 100644 --- a/src/controllers/restore/builders.rs +++ b/src/controllers/restore/builders.rs @@ -922,6 +922,14 @@ fi # enough for read-only analytics, and the alternative is a permanently # unrecoverable replica. # +# Every pg_resetwal -f invocation also touches /pgdata/needs-reindex-all: +# pg_resetwal bypasses WAL replay, so any index update that was in flight +# at snapshot time may have left torn pages ("unexpected zero page at +# block N" surfaces in queries later, which is hard to debug after the +# fact). The main container's startup hook picks up that flag and runs +# REINDEX DATABASE on every user database before marking the replica +# Ready. +# # Stage 1: if the first attempt fails with a WAL-recovery signature, # short-circuit straight to pg_resetwal + retry. Retrying the same # command won't help when recovery itself is the blocker. @@ -946,6 +954,7 @@ postgres_single_or_resetwal() {{ echo "WAL recovery failed (snapshot likely captured mid-online-backup) — running pg_resetwal -f and retrying" >&2 rm -f "$logfile" pg_resetwal -f "$PGDATA" + touch /pgdata/needs-reindex-all echo "$sql_input" | postgres --single -D "$PGDATA" postgres return $? fi @@ -966,6 +975,7 @@ postgres_single_or_resetwal() {{ echo "second attempt also failed — running pg_resetwal -f as a last resort and retrying" >&2 rm -f "$logfile" pg_resetwal -f "$PGDATA" + touch /pgdata/needs-reindex-all echo "$sql_input" | postgres --single -D "$PGDATA" postgres }} @@ -1061,7 +1071,7 @@ ALTER ROLE ${{ANALYTICS_USERNAME}} WITH SUPERUSER; SQLEOF fi -if [ -f /pgdata/needs-reindex ]; then +if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then PGRO_STAGE=restored else PGRO_STAGE=ready @@ -1235,33 +1245,112 @@ echo "Auth setup complete" image: Some(pg_image), command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]), args: Some(vec![r#" -if [ -f /pgdata/needs-reindex ]; then +if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then PG_MAJOR=$(cat /pgdata/pgdata/PG_VERSION) ( while ! pg_isready -q -U postgres -d postgres; do sleep 2; done psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;" - for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do - INDEXES=$(psql -U postgres -d "$db" -At -c " - SELECT DISTINCT indexrelid::regclass::text - FROM pg_index i - JOIN pg_attribute a ON a.attrelid = i.indexrelid - WHERE a.attcollation <> 0 AND i.indisvalid; - ") - COUNT=$(echo "$INDEXES" | grep -c . || true) - echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)" - N=0 - echo "$INDEXES" | while IFS= read -r idx; do - [ -z "$idx" ] && continue - N=$((N + 1)) - echo " [$N/$COUNT] $db: $idx" - if [ "$PG_MAJOR" -ge 14 ]; then - psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true - else - psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true - fi + # needs-reindex-all (pg_resetwal aftermath) can leave torn pages in + # ANY index, not just collation-dependent ones. A blind REINDEX + # DATABASE takes hours on the prod DBs; instead do a two-step + # smart pass: + # + # 1. Use the amcheck contrib extension to read each valid btree + # index and verify its structural invariants. Indexes that + # fail the check (including "unexpected zero page" corruption) + # get queued for REINDEX. Indexes that pass are left alone. + # For a healthy snapshot this finds nothing and reads + # ~index-size of disk instead of ~table-size to rewrite. + # + # 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST, + # BRIN, hash). amcheck only covers btree; non-btree indexes + # are typically a small fraction of total index size so the + # cost is bounded, and skipping them risks the same + # post-resetwal corruption sneaking through. + # + # CONCURRENTLY when available (PG ≥ 14) so the work overlaps with + # whatever clients hit the pod after the readiness gate lifts. + if [ -f /pgdata/needs-reindex-all ]; then + for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do + echo "Reindex after pg_resetwal: $db (smart pass via amcheck)" + psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true + + # Step 1: scan btree indexes; collect those that fail amcheck. + BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c " + SELECT c.oid::regclass::text + FROM pg_class c + JOIN pg_am a ON a.oid = c.relam + JOIN pg_index i ON i.indexrelid = c.oid + WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid; + ") + BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true) + echo " amcheck scanning $BTREE_COUNT btree indexes in $db" + CORRUPT_BTREE="" + N=0 + for idx in $BTREE_INDEXES; do + [ -z "$idx" ] && continue + N=$((N + 1)) + # bt_index_check raises an error if the index is corrupt. + # Suppress its output and check exit code; an error → queue it. + if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then + echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx" + CORRUPT_BTREE="$CORRUPT_BTREE $idx" + fi + done + + # Step 2: list non-btree indexes for blind reindex. + NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c " + SELECT c.oid::regclass::text + FROM pg_class c + JOIN pg_am a ON a.oid = c.relam + JOIN pg_index i ON i.indexrelid = c.oid + WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid; + ") + NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true) + + TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES" + TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true) + echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)" + N=0 + for idx in $TO_REINDEX; do + [ -z "$idx" ] && continue + N=$((N + 1)) + echo " [$N/$TOTAL] $db: REINDEX $idx" + if [ "$PG_MAJOR" -ge 14 ]; then + psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true + else + psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true + fi + done + done + rm -f /pgdata/needs-reindex-all + # needs-reindex (collation-dependent only) is a strict subset of + # what we just did, so clear it too. + rm -f /pgdata/needs-reindex + elif [ -f /pgdata/needs-reindex ]; then + for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do + INDEXES=$(psql -U postgres -d "$db" -At -c " + SELECT DISTINCT indexrelid::regclass::text + FROM pg_index i + JOIN pg_attribute a ON a.attrelid = i.indexrelid + WHERE a.attcollation <> 0 AND i.indisvalid; + ") + COUNT=$(echo "$INDEXES" | grep -c . || true) + echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)" + N=0 + echo "$INDEXES" | while IFS= read -r idx; do + [ -z "$idx" ] && continue + N=$((N + 1)) + echo " [$N/$COUNT] $db: $idx" + if [ "$PG_MAJOR" -ge 14 ]; then + psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true + else + psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true + fi + done done - done - rm -f /pgdata/needs-reindex + rm -f /pgdata/needs-reindex + fi psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'ready', last_transition_time = now() WHERE id = 1;" echo "Background reindex complete" ) & @@ -1308,7 +1397,7 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_ command: Some(vec![ "/bin/sh".to_string(), "-c".to_string(), - "pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(), + "pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(), ]), }), initial_delay_seconds: Some(5), diff --git a/src/controllers/restore/tests.rs b/src/controllers/restore/tests.rs index 6046cf2..75f84af 100644 --- a/src/controllers/restore/tests.rs +++ b/src/controllers/restore/tests.rs @@ -691,6 +691,150 @@ fn deployment_init_script_two_stage_pg_resetwal_fallback() { ); } +#[test] +fn deployment_init_script_flags_full_reindex_after_resetwal() { + // pg_resetwal bypasses WAL replay, so any in-flight index write at + // snapshot time can leave torn pages (postgres later surfaces these + // as "unexpected zero page at block N" when queries hit the index). + // Every pg_resetwal call must therefore touch /pgdata/needs-reindex-all + // so the main container's startup hook runs REINDEX DATABASE on every + // user database before the readiness probe lets traffic in. + let (mut restore, replica) = test_restore_and_replica(); + restore.status = Some(PostgresPhysicalRestoreStatus { + postgres_version: Some("16".to_string()), + ..Default::default() + }); + let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap(); + let setup_auth = deploy + .spec + .unwrap() + .template + .spec + .unwrap() + .init_containers + .unwrap() + .into_iter() + .find(|c| c.name == "setup-auth") + .expect("setup-auth init container must exist"); + let script = setup_auth.args.unwrap().remove(0); + + // The flag must be set every time pg_resetwal -f runs (currently + // twice — detected-signature branch and last-resort branch). The + // quick check: at least one `touch /pgdata/needs-reindex-all` + // follows each pg_resetwal invocation, and the count of the touch + // matches the count of resets. + // Count actual invocations (the literal command line) — using just + // "pg_resetwal -f" picks up comments and echo messages too. + let resetwal_calls = script.matches("pg_resetwal -f \"$PGDATA\"").count(); + let touch_calls = script.matches("touch /pgdata/needs-reindex-all").count(); + assert_eq!( + touch_calls, resetwal_calls, + "every pg_resetwal -f invocation must be paired with `touch /pgdata/needs-reindex-all` (got {touch_calls} touches for {resetwal_calls} resets)" + ); + assert!( + resetwal_calls >= 2, + "expected at least two pg_resetwal invocation sites; got {resetwal_calls}" + ); +} + +#[test] +fn deployment_runtime_reindex_handles_full_database_flag() { + // The main container's startup hook handles the broad flag + // (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX + // only those that fail the structural check (caught the + // "unexpected zero page" failure mode reported in prod) plus all + // non-btree indexes (amcheck only covers btree, so non-btree get + // a blind REINDEX). A blind REINDEX DATABASE would take hours on + // the prod size; the smart pass reads index-size and rewrites + // only the corrupt ones. + let (mut restore, replica) = test_restore_and_replica(); + restore.status = Some(PostgresPhysicalRestoreStatus { + postgres_version: Some("16".to_string()), + ..Default::default() + }); + let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap(); + let postgres = deploy + .spec + .unwrap() + .template + .spec + .unwrap() + .containers + .into_iter() + .find(|c| c.name == "postgres") + .expect("postgres container must exist"); + let script = postgres.args.unwrap().remove(0); + + assert!( + script.contains("/pgdata/needs-reindex-all"), + "runtime startup hook must check the needs-reindex-all flag" + ); + assert!( + script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"), + "needs-reindex-all branch must enable amcheck for the structural scan" + ); + assert!( + script.contains("bt_index_check("), + "needs-reindex-all branch must call bt_index_check on each btree index" + ); + assert!( + script.contains("a.amname <> 'btree'"), + "needs-reindex-all branch must collect non-btree indexes for blind REINDEX" + ); + assert!( + script.contains("REINDEX INDEX"), + "needs-reindex-all branch must REINDEX (targeted, not the whole DB)" + ); + assert!( + !script.contains("REINDEX DATABASE"), + "needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids" + ); + assert!( + script.contains("rm -f /pgdata/needs-reindex-all"), + "runtime hook must clear the needs-reindex-all flag after the reindex" + ); +} + +#[test] +fn deployment_readiness_probe_waits_for_full_reindex() { + // The readiness probe gates traffic on both reindex flags being + // absent, otherwise queries see torn-page indexes between pod-start + // and reindex-complete. The probe used to check only the narrow + // needs-reindex flag. + let (mut restore, replica) = test_restore_and_replica(); + restore.status = Some(PostgresPhysicalRestoreStatus { + postgres_version: Some("16".to_string()), + ..Default::default() + }); + let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap(); + let postgres = deploy + .spec + .unwrap() + .template + .spec + .unwrap() + .containers + .into_iter() + .find(|c| c.name == "postgres") + .expect("postgres container must exist"); + let probe_cmd = postgres + .readiness_probe + .expect("readiness probe must exist") + .exec + .expect("readiness probe must be an exec probe") + .command + .expect("exec probe must have a command"); + let probe_script = probe_cmd.join(" "); + assert!( + probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"), + "readiness probe must wait for needs-reindex-all to clear; got: {probe_script}" + ); + assert!( + probe_script.contains("[ ! -f /pgdata/needs-reindex ]"), + "readiness probe must still wait for needs-reindex; got: {probe_script}" + ); +} + #[test] fn deployment_init_script_overrides_listen_addresses() { // Some source backups carry `listen_addresses = 'localhost'` in