diff --git a/_typos.toml b/_typos.toml new file mode 100644 index 0000000..b67b8b7 --- /dev/null +++ b/_typos.toml @@ -0,0 +1,3 @@ +[default.extend-words] +# BRIN: postgres "Block Range Index" type. +BRIN = "BRIN" diff --git a/src/controllers/restore/builders.rs b/src/controllers/restore/builders.rs index 58d7fd6..82c0953 100644 --- a/src/controllers/restore/builders.rs +++ b/src/controllers/restore/builders.rs @@ -922,6 +922,14 @@ fi # enough for read-only analytics, and the alternative is a permanently # unrecoverable replica. # +# Every pg_resetwal -f invocation also touches /pgdata/needs-reindex-all: +# pg_resetwal bypasses WAL replay, so any index update that was in flight +# at snapshot time may have left torn pages ("unexpected zero page at +# block N" surfaces in queries later, which is hard to debug after the +# fact). The main container's startup hook picks up that flag and runs +# REINDEX DATABASE on every user database before marking the replica +# Ready. +# # Stage 1: if the first attempt fails with a WAL-recovery signature, # short-circuit straight to pg_resetwal + retry. Retrying the same # command won't help when recovery itself is the blocker. @@ -946,6 +954,7 @@ postgres_single_or_resetwal() {{ echo "WAL recovery failed (snapshot likely captured mid-online-backup) — running pg_resetwal -f and retrying" >&2 rm -f "$logfile" pg_resetwal -f "$PGDATA" + touch /pgdata/needs-reindex-all echo "$sql_input" | postgres --single -D "$PGDATA" postgres return $? fi @@ -966,6 +975,7 @@ postgres_single_or_resetwal() {{ echo "second attempt also failed — running pg_resetwal -f as a last resort and retrying" >&2 rm -f "$logfile" pg_resetwal -f "$PGDATA" + touch /pgdata/needs-reindex-all echo "$sql_input" | postgres --single -D "$PGDATA" postgres }} @@ -1061,7 +1071,7 @@ ALTER ROLE ${{ANALYTICS_USERNAME}} WITH SUPERUSER; SQLEOF fi -if [ -f /pgdata/needs-reindex ]; then +if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then PGRO_STAGE=restored else PGRO_STAGE=ready @@ -1235,33 +1245,112 @@ echo "Auth setup complete" image: Some(pg_image), command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]), args: Some(vec![r#" -if [ -f /pgdata/needs-reindex ]; then +if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then PG_MAJOR=$(cat /pgdata/pgdata/PG_VERSION) ( while ! pg_isready -q -U postgres -d postgres; do sleep 2; done psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;" - for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do - INDEXES=$(psql -U postgres -d "$db" -At -c " - SELECT DISTINCT indexrelid::regclass::text - FROM pg_index i - JOIN pg_attribute a ON a.attrelid = i.indexrelid - WHERE a.attcollation <> 0 AND i.indisvalid; - ") - COUNT=$(echo "$INDEXES" | grep -c . || true) - echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)" - N=0 - echo "$INDEXES" | while IFS= read -r idx; do - [ -z "$idx" ] && continue - N=$((N + 1)) - echo " [$N/$COUNT] $db: $idx" - if [ "$PG_MAJOR" -ge 14 ]; then - psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true - else - psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true - fi + # needs-reindex-all (pg_resetwal aftermath) can leave torn pages in + # ANY index, not just collation-dependent ones. A blind REINDEX + # DATABASE takes hours on the prod DBs; instead do a two-step + # smart pass: + # + # 1. Use the amcheck contrib extension to read each valid btree + # index and verify its structural invariants. Indexes that + # fail the check (including "unexpected zero page" corruption) + # get queued for REINDEX. Indexes that pass are left alone. + # For a healthy snapshot this finds nothing and reads + # ~index-size of disk instead of ~table-size to rewrite. + # + # 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST, + # BRIN, hash). amcheck only covers btree; non-btree indexes + # are typically a small fraction of total index size so the + # cost is bounded, and skipping them risks the same + # post-resetwal corruption sneaking through. + # + # CONCURRENTLY when available (PG ≥ 14) so the work overlaps with + # whatever clients hit the pod after the readiness gate lifts. + if [ -f /pgdata/needs-reindex-all ]; then + for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do + echo "Reindex after pg_resetwal: $db (smart pass via amcheck)" + psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true + + # Step 1: scan btree indexes; collect those that fail amcheck. + BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c " + SELECT c.oid::regclass::text + FROM pg_class c + JOIN pg_am a ON a.oid = c.relam + JOIN pg_index i ON i.indexrelid = c.oid + WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid; + ") + BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true) + echo " amcheck scanning $BTREE_COUNT btree indexes in $db" + CORRUPT_BTREE="" + N=0 + for idx in $BTREE_INDEXES; do + [ -z "$idx" ] && continue + N=$((N + 1)) + # bt_index_check raises an error if the index is corrupt. + # Suppress its output and check exit code; an error → queue it. + if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then + echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx" + CORRUPT_BTREE="$CORRUPT_BTREE $idx" + fi + done + + # Step 2: list non-btree indexes for blind reindex. + NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c " + SELECT c.oid::regclass::text + FROM pg_class c + JOIN pg_am a ON a.oid = c.relam + JOIN pg_index i ON i.indexrelid = c.oid + WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid; + ") + NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true) + + TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES" + TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true) + echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)" + N=0 + for idx in $TO_REINDEX; do + [ -z "$idx" ] && continue + N=$((N + 1)) + echo " [$N/$TOTAL] $db: REINDEX $idx" + if [ "$PG_MAJOR" -ge 14 ]; then + psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true + else + psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true + fi + done + done + rm -f /pgdata/needs-reindex-all + # needs-reindex (collation-dependent only) is a strict subset of + # what we just did, so clear it too. + rm -f /pgdata/needs-reindex + elif [ -f /pgdata/needs-reindex ]; then + for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do + INDEXES=$(psql -U postgres -d "$db" -At -c " + SELECT DISTINCT indexrelid::regclass::text + FROM pg_index i + JOIN pg_attribute a ON a.attrelid = i.indexrelid + WHERE a.attcollation <> 0 AND i.indisvalid; + ") + COUNT=$(echo "$INDEXES" | grep -c . || true) + echo "Reindex after locale change: $db ($COUNT collation-dependent indexes)" + N=0 + echo "$INDEXES" | while IFS= read -r idx; do + [ -z "$idx" ] && continue + N=$((N + 1)) + echo " [$N/$COUNT] $db: $idx" + if [ "$PG_MAJOR" -ge 14 ]; then + psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true + else + psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true + fi + done done - done - rm -f /pgdata/needs-reindex + rm -f /pgdata/needs-reindex + fi psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'ready', last_transition_time = now() WHERE id = 1;" echo "Background reindex complete" ) & @@ -1308,7 +1397,7 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_ command: Some(vec![ "/bin/sh".to_string(), "-c".to_string(), - "pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(), + "pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(), ]), }), initial_delay_seconds: Some(5), diff --git a/src/controllers/restore/tests.rs b/src/controllers/restore/tests.rs index 6046cf2..75f84af 100644 --- a/src/controllers/restore/tests.rs +++ b/src/controllers/restore/tests.rs @@ -691,6 +691,150 @@ fn deployment_init_script_two_stage_pg_resetwal_fallback() { ); } +#[test] +fn deployment_init_script_flags_full_reindex_after_resetwal() { + // pg_resetwal bypasses WAL replay, so any in-flight index write at + // snapshot time can leave torn pages (postgres later surfaces these + // as "unexpected zero page at block N" when queries hit the index). + // Every pg_resetwal call must therefore touch /pgdata/needs-reindex-all + // so the main container's startup hook runs REINDEX DATABASE on every + // user database before the readiness probe lets traffic in. + let (mut restore, replica) = test_restore_and_replica(); + restore.status = Some(PostgresPhysicalRestoreStatus { + postgres_version: Some("16".to_string()), + ..Default::default() + }); + let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap(); + let setup_auth = deploy + .spec + .unwrap() + .template + .spec + .unwrap() + .init_containers + .unwrap() + .into_iter() + .find(|c| c.name == "setup-auth") + .expect("setup-auth init container must exist"); + let script = setup_auth.args.unwrap().remove(0); + + // The flag must be set every time pg_resetwal -f runs (currently + // twice — detected-signature branch and last-resort branch). The + // quick check: at least one `touch /pgdata/needs-reindex-all` + // follows each pg_resetwal invocation, and the count of the touch + // matches the count of resets. + // Count actual invocations (the literal command line) — using just + // "pg_resetwal -f" picks up comments and echo messages too. + let resetwal_calls = script.matches("pg_resetwal -f \"$PGDATA\"").count(); + let touch_calls = script.matches("touch /pgdata/needs-reindex-all").count(); + assert_eq!( + touch_calls, resetwal_calls, + "every pg_resetwal -f invocation must be paired with `touch /pgdata/needs-reindex-all` (got {touch_calls} touches for {resetwal_calls} resets)" + ); + assert!( + resetwal_calls >= 2, + "expected at least two pg_resetwal invocation sites; got {resetwal_calls}" + ); +} + +#[test] +fn deployment_runtime_reindex_handles_full_database_flag() { + // The main container's startup hook handles the broad flag + // (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX + // only those that fail the structural check (caught the + // "unexpected zero page" failure mode reported in prod) plus all + // non-btree indexes (amcheck only covers btree, so non-btree get + // a blind REINDEX). A blind REINDEX DATABASE would take hours on + // the prod size; the smart pass reads index-size and rewrites + // only the corrupt ones. + let (mut restore, replica) = test_restore_and_replica(); + restore.status = Some(PostgresPhysicalRestoreStatus { + postgres_version: Some("16".to_string()), + ..Default::default() + }); + let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap(); + let postgres = deploy + .spec + .unwrap() + .template + .spec + .unwrap() + .containers + .into_iter() + .find(|c| c.name == "postgres") + .expect("postgres container must exist"); + let script = postgres.args.unwrap().remove(0); + + assert!( + script.contains("/pgdata/needs-reindex-all"), + "runtime startup hook must check the needs-reindex-all flag" + ); + assert!( + script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"), + "needs-reindex-all branch must enable amcheck for the structural scan" + ); + assert!( + script.contains("bt_index_check("), + "needs-reindex-all branch must call bt_index_check on each btree index" + ); + assert!( + script.contains("a.amname <> 'btree'"), + "needs-reindex-all branch must collect non-btree indexes for blind REINDEX" + ); + assert!( + script.contains("REINDEX INDEX"), + "needs-reindex-all branch must REINDEX (targeted, not the whole DB)" + ); + assert!( + !script.contains("REINDEX DATABASE"), + "needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids" + ); + assert!( + script.contains("rm -f /pgdata/needs-reindex-all"), + "runtime hook must clear the needs-reindex-all flag after the reindex" + ); +} + +#[test] +fn deployment_readiness_probe_waits_for_full_reindex() { + // The readiness probe gates traffic on both reindex flags being + // absent, otherwise queries see torn-page indexes between pod-start + // and reindex-complete. The probe used to check only the narrow + // needs-reindex flag. + let (mut restore, replica) = test_restore_and_replica(); + restore.status = Some(PostgresPhysicalRestoreStatus { + postgres_version: Some("16".to_string()), + ..Default::default() + }); + let deploy = build_deployment(&restore, "test-restore", "default", &replica).unwrap(); + let postgres = deploy + .spec + .unwrap() + .template + .spec + .unwrap() + .containers + .into_iter() + .find(|c| c.name == "postgres") + .expect("postgres container must exist"); + let probe_cmd = postgres + .readiness_probe + .expect("readiness probe must exist") + .exec + .expect("readiness probe must be an exec probe") + .command + .expect("exec probe must have a command"); + let probe_script = probe_cmd.join(" "); + assert!( + probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"), + "readiness probe must wait for needs-reindex-all to clear; got: {probe_script}" + ); + assert!( + probe_script.contains("[ ! -f /pgdata/needs-reindex ]"), + "readiness probe must still wait for needs-reindex; got: {probe_script}" + ); +} + #[test] fn deployment_init_script_overrides_listen_addresses() { // Some source backups carry `listen_addresses = 'localhost'` in