diff --git a/src/controllers/restore/builders.rs b/src/controllers/restore/builders.rs index 82c0953..ddc95cb 100644 --- a/src/controllers/restore/builders.rs +++ b/src/controllers/restore/builders.rs @@ -1251,77 +1251,46 @@ if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then while ! pg_isready -q -U postgres -d postgres; do sleep 2; done psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;" # needs-reindex-all (pg_resetwal aftermath) can leave torn pages in - # ANY index, not just collation-dependent ones. A blind REINDEX - # DATABASE takes hours on the prod DBs; instead do a two-step - # smart pass: + # ANY index, not just collation-dependent ones. We tried a "smart + # pass" using the amcheck contrib extension (scan each btree, queue + # only the corrupt ones for REINDEX) — empirically that hits the + # same postgres-internal pathology that wedges other vanilla DDL on + # this dataset: bt_index_check itself burns 100% CPU forever on + # specific indexes with no visible progress, blocking the whole + # reindex behind it. # - # 1. Use the amcheck contrib extension to read each valid btree - # index and verify its structural invariants. Indexes that - # fail the check (including "unexpected zero page" corruption) - # get queued for REINDEX. Indexes that pass are left alone. - # For a healthy snapshot this finds nothing and reads - # ~index-size of disk instead of ~table-size to rewrite. + # Fall back to blind REINDEX DATABASE. REINDEX reads the heap and + # rebuilds the index from scratch (different code path from amcheck, + # which reads the corrupt index pages directly) and so isn't subject + # to the same wedge. Slow on prod-sized DBs but it makes progress; + # the alternative was a permanently-stuck restore. # - # 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST, - # BRIN, hash). amcheck only covers btree; non-btree indexes - # are typically a small fraction of total index size so the - # cost is bounded, and skipping them risks the same - # post-resetwal corruption sneaking through. - # - # CONCURRENTLY when available (PG ≥ 14) so the work overlaps with - # whatever clients hit the pod after the readiness gate lifts. + # Crucially this branch does NOT remove needs-reindex-all at the + # top of the work — the readiness probe ignores -all for exactly + # this reason (see the probe spec below). The pod becomes Ready as + # soon as postgres accepts connections; clients hitting a not-yet- + # reindexed corrupt index get the explicit "unexpected zero page" + # error, retry, succeed once the rebuild lands. After REINDEX + # DATABASE completes for every user db, the flag is cleared and + # _pgro.restore_info.stage flips to ready. if [ -f /pgdata/needs-reindex-all ]; then + # CONCURRENTLY (PG ≥ 12) builds replacement indexes alongside the + # existing ones and atomically swaps. Clients can keep using the + # old indexes during the rebuild — they'll see "unexpected zero + # page" only if a query happens to hit a corrupt page on the old + # side; once the swap lands the corruption is gone. + # + # REINDEX DATABASE CONCURRENTLY skips system catalogs (PG won't + # CONCURRENTLY them). For an analytics replica that's the right + # trade: user-data indexes matter for client queries, system + # catalog corruption shows up as different errors and is rare. for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do - echo "Reindex after pg_resetwal: $db (smart pass via amcheck)" - psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true - - # Step 1: scan btree indexes; collect those that fail amcheck. - BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c " - SELECT c.oid::regclass::text - FROM pg_class c - JOIN pg_am a ON a.oid = c.relam - JOIN pg_index i ON i.indexrelid = c.oid - WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid; - ") - BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true) - echo " amcheck scanning $BTREE_COUNT btree indexes in $db" - CORRUPT_BTREE="" - N=0 - for idx in $BTREE_INDEXES; do - [ -z "$idx" ] && continue - N=$((N + 1)) - # bt_index_check raises an error if the index is corrupt. - # Suppress its output and check exit code; an error → queue it. - if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then - echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx" - CORRUPT_BTREE="$CORRUPT_BTREE $idx" - fi - done - - # Step 2: list non-btree indexes for blind reindex. - NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c " - SELECT c.oid::regclass::text - FROM pg_class c - JOIN pg_am a ON a.oid = c.relam - JOIN pg_index i ON i.indexrelid = c.oid - WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid; - ") - NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true) - - TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES" - TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true) - echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)" - N=0 - for idx in $TO_REINDEX; do - [ -z "$idx" ] && continue - N=$((N + 1)) - echo " [$N/$TOTAL] $db: REINDEX $idx" - if [ "$PG_MAJOR" -ge 14 ]; then - psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true - else - psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true - fi - done + echo "Reindex after pg_resetwal: $db (REINDEX DATABASE CONCURRENTLY)" + if [ "$PG_MAJOR" -ge 12 ]; then + psql -U postgres -d "$db" -c "REINDEX DATABASE CONCURRENTLY \"$db\";" 2>&1 || true + else + psql -U postgres -d "$db" -c "REINDEX DATABASE \"$db\";" 2>&1 || true + fi done rm -f /pgdata/needs-reindex-all # needs-reindex (collation-dependent only) is a strict subset of @@ -1397,7 +1366,15 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_ command: Some(vec![ "/bin/sh".to_string(), "-c".to_string(), - "pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(), + // Gate readiness on the locale-only needs-reindex flag + // (small, fast, finishes in seconds-to-minutes) but NOT on + // needs-reindex-all (post-pg_resetwal blind REINDEX DATABASE + // — takes hours on prod-sized indexes; gating here would + // trip the operator's deployment_ready_timeout). The -all + // reindex runs in the background; clients hitting a + // not-yet-reindexed corrupt index see the explicit + // "unexpected zero page" error and retry. + "pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(), ]), }), initial_delay_seconds: Some(5), diff --git a/src/controllers/restore/tests.rs b/src/controllers/restore/tests.rs index 75f84af..5589e7f 100644 --- a/src/controllers/restore/tests.rs +++ b/src/controllers/restore/tests.rs @@ -740,13 +740,12 @@ fn deployment_init_script_flags_full_reindex_after_resetwal() { #[test] fn deployment_runtime_reindex_handles_full_database_flag() { // The main container's startup hook handles the broad flag - // (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX - // only those that fail the structural check (caught the - // "unexpected zero page" failure mode reported in prod) plus all - // non-btree indexes (amcheck only covers btree, so non-btree get - // a blind REINDEX). A blind REINDEX DATABASE would take hours on - // the prod size; the smart pass reads index-size and rewrites - // only the corrupt ones. + // (needs-reindex-all) via blind REINDEX DATABASE on every user DB. + // The earlier amcheck-driven smart pass turned out to hit the same + // postgres-internal pathology that wedges other vanilla DDL on + // the prod data — bt_index_check burned 100% CPU forever on + // individual indexes. REINDEX uses a different code path (reads + // the heap, rebuilds from scratch) and doesn't trip that wedge. let (mut restore, replica) = test_restore_and_replica(); restore.status = Some(PostgresPhysicalRestoreStatus { postgres_version: Some("16".to_string()), @@ -770,24 +769,20 @@ fn deployment_runtime_reindex_handles_full_database_flag() { "runtime startup hook must check the needs-reindex-all flag" ); assert!( - script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"), - "needs-reindex-all branch must enable amcheck for the structural scan" + script.contains("REINDEX DATABASE CONCURRENTLY"), + "needs-reindex-all branch must run REINDEX DATABASE CONCURRENTLY on PG ≥ 12 so clients can keep querying the old indexes during the rebuild" ); + // The PG < 12 fallback uses plain REINDEX DATABASE — verify the + // version gate exists in the script. (We can't string-match on the + // literal SQL because the shell-quoted `\"` form differs from a + // Rust string literal.) assert!( - script.contains("bt_index_check("), - "needs-reindex-all branch must call bt_index_check on each btree index" + script.contains(r#""$PG_MAJOR" -ge 12"#), + "needs-reindex-all branch must gate CONCURRENTLY behind a PG ≥ 12 check" ); assert!( - script.contains("a.amname <> 'btree'"), - "needs-reindex-all branch must collect non-btree indexes for blind REINDEX" - ); - assert!( - script.contains("REINDEX INDEX"), - "needs-reindex-all branch must REINDEX (targeted, not the whole DB)" - ); - assert!( - !script.contains("REINDEX DATABASE"), - "needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids" + !script.contains("bt_index_check("), + "runtime hook must not call bt_index_check — amcheck wedges on the prod data" ); assert!( script.contains("rm -f /pgdata/needs-reindex-all"), @@ -796,11 +791,15 @@ fn deployment_runtime_reindex_handles_full_database_flag() { } #[test] -fn deployment_readiness_probe_waits_for_full_reindex() { - // The readiness probe gates traffic on both reindex flags being - // absent, otherwise queries see torn-page indexes between pod-start - // and reindex-complete. The probe used to check only the narrow - // needs-reindex flag. +fn deployment_readiness_probe_only_gates_on_locale_reindex() { + // The readiness probe waits for the locale-only `needs-reindex` + // flag to clear (small, fast, finishes in seconds) but NOT for + // `needs-reindex-all` (the post-pg_resetwal blind REINDEX DATABASE + // — takes hours on prod-sized indexes; gating here would trip the + // operator's deployment_ready_timeout and block restores + // indefinitely). The -all reindex runs in the background; clients + // hitting a not-yet-reindexed corrupt index see the explicit + // "unexpected zero page" error and can retry. let (mut restore, replica) = test_restore_and_replica(); restore.status = Some(PostgresPhysicalRestoreStatus { postgres_version: Some("16".to_string()), @@ -826,12 +825,12 @@ fn deployment_readiness_probe_waits_for_full_reindex() { .expect("exec probe must have a command"); let probe_script = probe_cmd.join(" "); assert!( - probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"), - "readiness probe must wait for needs-reindex-all to clear; got: {probe_script}" + probe_script.contains("[ ! -f /pgdata/needs-reindex ]"), + "readiness probe must still wait for the locale-only needs-reindex flag; got: {probe_script}" ); assert!( - probe_script.contains("[ ! -f /pgdata/needs-reindex ]"), - "readiness probe must still wait for needs-reindex; got: {probe_script}" + !probe_script.contains("needs-reindex-all"), + "readiness probe must NOT gate on needs-reindex-all (the long-running post-resetwal reindex); got: {probe_script}" ); }