fix(operator): un-gate readiness on needs-reindex-all, REINDEX DATABASE CONCURRENTLY

passcod · passcod · commit 6987f640304c · 2026-06-10T15:53:02.000+12:00
The amcheck-driven smart pass introduced in #68 hits the same family of postgres-internal pathology that wedges other vanilla DDL on the prod dataset — bt_index_check itself burns 100% CPU forever on specific indexes (observed via pg_stat_activity: state=active, empty wait_event, query_start growing linearly with wall clock for 4+ minutes on a tiny system catalog index). The smart pass is unusable on this data. Two changes: 1. Drop bt_index_check; revert the needs-reindex-all branch to a blind REINDEX DATABASE per user DB, CONCURRENTLY on PG ≥ 12 so clients can keep using the old indexes during the rebuild and the atomic swap makes corruption disappear without downtime. REINDEX reads the heap and rebuilds the index from scratch — a different code path from amcheck (which reads corrupt index pages directly) — so it isn't subject to the same wedge. Slow on prod-sized DBs (hours) but makes progress. 2. Drop the readiness probe's gate on /pgdata/needs-reindex-all. With the reindex taking hours, gating readiness here trips the operator's 30-minute deployment_ready_timeout and the restore is marked Failed before postgres even has a chance to come up. The probe still gates on /pgdata/needs-reindex (locale-only, finishes in seconds) since that's small and proven. Trade-off: clients hitting a not-yet-reindexed corrupt index in the window between pod-Ready and reindex-complete get the explicit "unexpected zero page" error from postgres. With CONCURRENTLY the window is narrow (queries hit the old index until the atomic swap) and clients can retry. Strictly better than the alternative (permanently failed restore, replica stuck indefinitely on the previous Active). The pre-#68 behaviour for the locale-only path is unchanged.
diff --git a/src/controllers/restore/builders.rs b/src/controllers/restore/builders.rs
@@ -1251,77 +1251,46 @@ if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
     while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
     psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
     # needs-reindex-all (pg_resetwal aftermath) can leave torn pages in
-    # ANY index, not just collation-dependent ones. A blind REINDEX
-    # DATABASE takes hours on the prod DBs; instead do a two-step
-    # smart pass:
+    # ANY index, not just collation-dependent ones. We tried a "smart
+    # pass" using the amcheck contrib extension (scan each btree, queue
+    # only the corrupt ones for REINDEX) — empirically that hits the
+    # same postgres-internal pathology that wedges other vanilla DDL on
+    # this dataset: bt_index_check itself burns 100% CPU forever on
+    # specific indexes with no visible progress, blocking the whole
+    # reindex behind it.
     #
-    #  1. Use the amcheck contrib extension to read each valid btree
-    #     index and verify its structural invariants. Indexes that
-    #     fail the check (including "unexpected zero page" corruption)
-    #     get queued for REINDEX. Indexes that pass are left alone.
-    #     For a healthy snapshot this finds nothing and reads
-    #     ~index-size of disk instead of ~table-size to rewrite.
+    # Fall back to blind REINDEX DATABASE. REINDEX reads the heap and
+    # rebuilds the index from scratch (different code path from amcheck,
+    # which reads the corrupt index pages directly) and so isn't subject
+    # to the same wedge. Slow on prod-sized DBs but it makes progress;
+    # the alternative was a permanently-stuck restore.
     #
-    #  2. Blindly REINDEX every non-btree index in the DB (GIN, GiST,
-    #     BRIN, hash). amcheck only covers btree; non-btree indexes
-    #     are typically a small fraction of total index size so the
-    #     cost is bounded, and skipping them risks the same
-    #     post-resetwal corruption sneaking through.
-    #
-    # CONCURRENTLY when available (PG ≥ 14) so the work overlaps with
-    # whatever clients hit the pod after the readiness gate lifts.
+    # Crucially this branch does NOT remove needs-reindex-all at the
+    # top of the work — the readiness probe ignores -all for exactly
+    # this reason (see the probe spec below). The pod becomes Ready as
+    # soon as postgres accepts connections; clients hitting a not-yet-
+    # reindexed corrupt index get the explicit "unexpected zero page"
+    # error, retry, succeed once the rebuild lands. After REINDEX
+    # DATABASE completes for every user db, the flag is cleared and
+    # _pgro.restore_info.stage flips to ready.
     if [ -f /pgdata/needs-reindex-all ]; then
+      # CONCURRENTLY (PG ≥ 12) builds replacement indexes alongside the
+      # existing ones and atomically swaps. Clients can keep using the
+      # old indexes during the rebuild — they'll see "unexpected zero
+      # page" only if a query happens to hit a corrupt page on the old
+      # side; once the swap lands the corruption is gone.
+      #
+      # REINDEX DATABASE CONCURRENTLY skips system catalogs (PG won't
+      # CONCURRENTLY them). For an analytics replica that's the right
+      # trade: user-data indexes matter for client queries, system
+      # catalog corruption shows up as different errors and is rare.
       for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
-        echo "Reindex after pg_resetwal: $db (smart pass via amcheck)"
-        psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true
-
-        # Step 1: scan btree indexes; collect those that fail amcheck.
-        BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
-          SELECT c.oid::regclass::text
-          FROM pg_class c
-          JOIN pg_am a ON a.oid = c.relam
-          JOIN pg_index i ON i.indexrelid = c.oid
-          WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid;
-        ")
-        BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true)
-        echo "  amcheck scanning $BTREE_COUNT btree indexes in $db"
-        CORRUPT_BTREE=""
-        N=0
-        for idx in $BTREE_INDEXES; do
-          [ -z "$idx" ] && continue
-          N=$((N + 1))
-          # bt_index_check raises an error if the index is corrupt.
-          # Suppress its output and check exit code; an error → queue it.
-          if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then
-            echo "  [$N/$BTREE_COUNT] CORRUPT: $db: $idx"
-            CORRUPT_BTREE="$CORRUPT_BTREE $idx"
-          fi
-        done
-
-        # Step 2: list non-btree indexes for blind reindex.
-        NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
-          SELECT c.oid::regclass::text
-          FROM pg_class c
-          JOIN pg_am a ON a.oid = c.relam
-          JOIN pg_index i ON i.indexrelid = c.oid
-          WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid;
-        ")
-        NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true)
-
-        TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES"
-        TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true)
-        echo "  $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)"
-        N=0
-        for idx in $TO_REINDEX; do
-          [ -z "$idx" ] && continue
-          N=$((N + 1))
-          echo "  [$N/$TOTAL] $db: REINDEX $idx"
-          if [ "$PG_MAJOR" -ge 14 ]; then
-            psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
-          else
-            psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
-          fi
-        done
+        echo "Reindex after pg_resetwal: $db (REINDEX DATABASE CONCURRENTLY)"
+        if [ "$PG_MAJOR" -ge 12 ]; then
+          psql -U postgres -d "$db" -c "REINDEX DATABASE CONCURRENTLY \"$db\";" 2>&1 || true
+        else
+          psql -U postgres -d "$db" -c "REINDEX DATABASE \"$db\";" 2>&1 || true
+        fi
       done
       rm -f /pgdata/needs-reindex-all
       # needs-reindex (collation-dependent only) is a strict subset of
@@ -1397,7 +1366,15 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
 								command: Some(vec![
 									"/bin/sh".to_string(),
 									"-c".to_string(),
-									"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(),
+									// Gate readiness on the locale-only needs-reindex flag
+								// (small, fast, finishes in seconds-to-minutes) but NOT on
+								// needs-reindex-all (post-pg_resetwal blind REINDEX DATABASE
+								// — takes hours on prod-sized indexes; gating here would
+								// trip the operator's deployment_ready_timeout). The -all
+								// reindex runs in the background; clients hitting a
+								// not-yet-reindexed corrupt index see the explicit
+								// "unexpected zero page" error and retry.
+								"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(),
 								]),
 							}),
 							initial_delay_seconds: Some(5),
diff --git a/src/controllers/restore/tests.rs b/src/controllers/restore/tests.rs
@@ -740,13 +740,12 @@ fn deployment_init_script_flags_full_reindex_after_resetwal() {
 #[test]
 fn deployment_runtime_reindex_handles_full_database_flag() {
 	// The main container's startup hook handles the broad flag
-	// (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX
-	// only those that fail the structural check (caught the
-	// "unexpected zero page" failure mode reported in prod) plus all
-	// non-btree indexes (amcheck only covers btree, so non-btree get
-	// a blind REINDEX). A blind REINDEX DATABASE would take hours on
-	// the prod size; the smart pass reads index-size and rewrites
-	// only the corrupt ones.
+	// (needs-reindex-all) via blind REINDEX DATABASE on every user DB.
+	// The earlier amcheck-driven smart pass turned out to hit the same
+	// postgres-internal pathology that wedges other vanilla DDL on
+	// the prod data — bt_index_check burned 100% CPU forever on
+	// individual indexes. REINDEX uses a different code path (reads
+	// the heap, rebuilds from scratch) and doesn't trip that wedge.
 	let (mut restore, replica) = test_restore_and_replica();
 	restore.status = Some(PostgresPhysicalRestoreStatus {
 		postgres_version: Some("16".to_string()),
@@ -770,24 +769,20 @@ fn deployment_runtime_reindex_handles_full_database_flag() {
 		"runtime startup hook must check the needs-reindex-all flag"
 	);
 	assert!(
-		script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"),
-		"needs-reindex-all branch must enable amcheck for the structural scan"
+		script.contains("REINDEX DATABASE CONCURRENTLY"),
+		"needs-reindex-all branch must run REINDEX DATABASE CONCURRENTLY on PG ≥ 12 so clients can keep querying the old indexes during the rebuild"
 	);
+	// The PG < 12 fallback uses plain REINDEX DATABASE — verify the
+	// version gate exists in the script. (We can't string-match on the
+	// literal SQL because the shell-quoted `\"` form differs from a
+	// Rust string literal.)
 	assert!(
-		script.contains("bt_index_check("),
-		"needs-reindex-all branch must call bt_index_check on each btree index"
+		script.contains(r#""$PG_MAJOR" -ge 12"#),
+		"needs-reindex-all branch must gate CONCURRENTLY behind a PG ≥ 12 check"
 	);
 	assert!(
-		script.contains("a.amname <> 'btree'"),
-		"needs-reindex-all branch must collect non-btree indexes for blind REINDEX"
-	);
-	assert!(
-		script.contains("REINDEX INDEX"),
-		"needs-reindex-all branch must REINDEX (targeted, not the whole DB)"
-	);
-	assert!(
-		!script.contains("REINDEX DATABASE"),
-		"needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids"
+		!script.contains("bt_index_check("),
+		"runtime hook must not call bt_index_check — amcheck wedges on the prod data"
 	);
 	assert!(
 		script.contains("rm -f /pgdata/needs-reindex-all"),
@@ -796,11 +791,15 @@ fn deployment_runtime_reindex_handles_full_database_flag() {
 }
 
 #[test]
-fn deployment_readiness_probe_waits_for_full_reindex() {
-	// The readiness probe gates traffic on both reindex flags being
-	// absent, otherwise queries see torn-page indexes between pod-start
-	// and reindex-complete. The probe used to check only the narrow
-	// needs-reindex flag.
+fn deployment_readiness_probe_only_gates_on_locale_reindex() {
+	// The readiness probe waits for the locale-only `needs-reindex`
+	// flag to clear (small, fast, finishes in seconds) but NOT for
+	// `needs-reindex-all` (the post-pg_resetwal blind REINDEX DATABASE
+	// — takes hours on prod-sized indexes; gating here would trip the
+	// operator's deployment_ready_timeout and block restores
+	// indefinitely). The -all reindex runs in the background; clients
+	// hitting a not-yet-reindexed corrupt index see the explicit
+	// "unexpected zero page" error and can retry.
 	let (mut restore, replica) = test_restore_and_replica();
 	restore.status = Some(PostgresPhysicalRestoreStatus {
 		postgres_version: Some("16".to_string()),
@@ -826,12 +825,12 @@ fn deployment_readiness_probe_waits_for_full_reindex() {
 		.expect("exec probe must have a command");
 	let probe_script = probe_cmd.join(" ");
 	assert!(
-		probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"),
-		"readiness probe must wait for needs-reindex-all to clear; got: {probe_script}"
+		probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
+		"readiness probe must still wait for the locale-only needs-reindex flag; got: {probe_script}"
 	);
 	assert!(
-		probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
-		"readiness probe must still wait for needs-reindex; got: {probe_script}"
+		!probe_script.contains("needs-reindex-all"),
+		"readiness probe must NOT gate on needs-reindex-all (the long-running post-resetwal reindex); got: {probe_script}"
 	);
 }