fix(operator): un-gate readiness on needs-reindex-all, REINDEX DATABASE CONCURRENTLY (#69)

passcod · web-flow · commit e9e0b9f7dd9b · 2026-06-10T16:02:45.000+12:00
diff --git a/src/controllers/restore/builders.rs b/src/controllers/restore/builders.rs
@@ -1251,77 +1251,46 @@ if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
     while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
     psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
     # needs-reindex-all (pg_resetwal aftermath) can leave torn pages in
-    # ANY index, not just collation-dependent ones. A blind REINDEX
-    # DATABASE takes hours on the prod DBs; instead do a two-step
-    # smart pass:
+    # ANY index, not just collation-dependent ones. We tried a "smart
+    # pass" using the amcheck contrib extension (scan each btree, queue
+    # only the corrupt ones for REINDEX) — empirically that hits the
+    # same postgres-internal pathology that wedges other vanilla DDL on
+    # this dataset: bt_index_check itself burns 100% CPU forever on
+    # specific indexes with no visible progress, blocking the whole
+    # reindex behind it.
     #
-    #  1. Use the amcheck contrib extension to read each valid btree
-    #     index and verify its structural invariants. Indexes that
-    #     fail the check (including "unexpected zero page" corruption)
-    #     get queued for REINDEX. Indexes that pass are left alone.
-    #     For a healthy snapshot this finds nothing and reads
-    #     ~index-size of disk instead of ~table-size to rewrite.
+    # Fall back to blind REINDEX DATABASE. REINDEX reads the heap and
+    # rebuilds the index from scratch (different code path from amcheck,
+    # which reads the corrupt index pages directly) and so isn't subject
+    # to the same wedge. Slow on prod-sized DBs but it makes progress;
+    # the alternative was a permanently-stuck restore.
     #
-    #  2. Blindly REINDEX every non-btree index in the DB (GIN, GiST,
-    #     BRIN, hash). amcheck only covers btree; non-btree indexes
-    #     are typically a small fraction of total index size so the
-    #     cost is bounded, and skipping them risks the same
-    #     post-resetwal corruption sneaking through.
-    #
-    # CONCURRENTLY when available (PG ≥ 14) so the work overlaps with
-    # whatever clients hit the pod after the readiness gate lifts.
+    # Crucially this branch does NOT remove needs-reindex-all at the
+    # top of the work — the readiness probe ignores -all for exactly
+    # this reason (see the probe spec below). The pod becomes Ready as
+    # soon as postgres accepts connections; clients hitting a not-yet-
+    # reindexed corrupt index get the explicit "unexpected zero page"
+    # error, retry, succeed once the rebuild lands. After REINDEX
+    # DATABASE completes for every user db, the flag is cleared and
+    # _pgro.restore_info.stage flips to ready.
     if [ -f /pgdata/needs-reindex-all ]; then
+      # CONCURRENTLY (PG ≥ 12) builds replacement indexes alongside the
+      # existing ones and atomically swaps. Clients can keep using the
+      # old indexes during the rebuild — they'll see "unexpected zero
+      # page" only if a query happens to hit a corrupt page on the old
+      # side; once the swap lands the corruption is gone.
+      #
+      # REINDEX DATABASE CONCURRENTLY skips system catalogs (PG won't
+      # CONCURRENTLY them). For an analytics replica that's the right
+      # trade: user-data indexes matter for client queries, system
+      # catalog corruption shows up as different errors and is rare.
       for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
-        echo "Reindex after pg_resetwal: $db (smart pass via amcheck)"
-        psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true
-
-        # Step 1: scan btree indexes; collect those that fail amcheck.
-        BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
-          SELECT c.oid::regclass::text
-          FROM pg_class c
-          JOIN pg_am a ON a.oid = c.relam
-          JOIN pg_index i ON i.indexrelid = c.oid
-          WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid;
-        ")
-        BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true)
-        echo "  amcheck scanning $BTREE_COUNT btree indexes in $db"
-        CORRUPT_BTREE=""
-        N=0
-        for idx in $BTREE_INDEXES; do
-          [ -z "$idx" ] && continue
-          N=$((N + 1))
-          # bt_index_check raises an error if the index is corrupt.
-          # Suppress its output and check exit code; an error → queue it.
-          if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then
-            echo "  [$N/$BTREE_COUNT] CORRUPT: $db: $idx"
-            CORRUPT_BTREE="$CORRUPT_BTREE $idx"
-          fi
-        done
-
-        # Step 2: list non-btree indexes for blind reindex.
-        NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
-          SELECT c.oid::regclass::text
-          FROM pg_class c
-          JOIN pg_am a ON a.oid = c.relam
-          JOIN pg_index i ON i.indexrelid = c.oid
-          WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid;
-        ")
-        NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true)
-
-        TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES"
-        TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true)
-        echo "  $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)"
-        N=0
-        for idx in $TO_REINDEX; do
-          [ -z "$idx" ] && continue
-          N=$((N + 1))
-          echo "  [$N/$TOTAL] $db: REINDEX $idx"
-          if [ "$PG_MAJOR" -ge 14 ]; then
-            psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
-          else
-            psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
-          fi
-        done
+        echo "Reindex after pg_resetwal: $db (REINDEX DATABASE CONCURRENTLY)"
+        if [ "$PG_MAJOR" -ge 12 ]; then
+          psql -U postgres -d "$db" -c "REINDEX DATABASE CONCURRENTLY \"$db\";" 2>&1 || true
+        else
+          psql -U postgres -d "$db" -c "REINDEX DATABASE \"$db\";" 2>&1 || true
+        fi
       done
       rm -f /pgdata/needs-reindex-all
       # needs-reindex (collation-dependent only) is a strict subset of
@@ -1397,7 +1366,15 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
 								command: Some(vec![
 									"/bin/sh".to_string(),
 									"-c".to_string(),
-									"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(),
+									// Gate readiness on the locale-only needs-reindex flag
+								// (small, fast, finishes in seconds-to-minutes) but NOT on
+								// needs-reindex-all (post-pg_resetwal blind REINDEX DATABASE
+								// — takes hours on prod-sized indexes; gating here would
+								// trip the operator's deployment_ready_timeout). The -all
+								// reindex runs in the background; clients hitting a
+								// not-yet-reindexed corrupt index see the explicit
+								// "unexpected zero page" error and retry.
+								"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(),
 								]),
 							}),
 							initial_delay_seconds: Some(5),
diff --git a/src/controllers/restore/tests.rs b/src/controllers/restore/tests.rs
@@ -740,13 +740,12 @@ fn deployment_init_script_flags_full_reindex_after_resetwal() {
 #[test]
 fn deployment_runtime_reindex_handles_full_database_flag() {
 	// The main container's startup hook handles the broad flag
-	// (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX
-	// only those that fail the structural check (caught the
-	// "unexpected zero page" failure mode reported in prod) plus all
-	// non-btree indexes (amcheck only covers btree, so non-btree get
-	// a blind REINDEX). A blind REINDEX DATABASE would take hours on
-	// the prod size; the smart pass reads index-size and rewrites
-	// only the corrupt ones.
+	// (needs-reindex-all) via blind REINDEX DATABASE on every user DB.
+	// The earlier amcheck-driven smart pass turned out to hit the same
+	// postgres-internal pathology that wedges other vanilla DDL on
+	// the prod data — bt_index_check burned 100% CPU forever on
+	// individual indexes. REINDEX uses a different code path (reads
+	// the heap, rebuilds from scratch) and doesn't trip that wedge.
 	let (mut restore, replica) = test_restore_and_replica();
 	restore.status = Some(PostgresPhysicalRestoreStatus {
 		postgres_version: Some("16".to_string()),
@@ -770,24 +769,20 @@ fn deployment_runtime_reindex_handles_full_database_flag() {
 		"runtime startup hook must check the needs-reindex-all flag"
 	);
 	assert!(
-		script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"),
-		"needs-reindex-all branch must enable amcheck for the structural scan"
+		script.contains("REINDEX DATABASE CONCURRENTLY"),
+		"needs-reindex-all branch must run REINDEX DATABASE CONCURRENTLY on PG ≥ 12 so clients can keep querying the old indexes during the rebuild"
 	);
+	// The PG < 12 fallback uses plain REINDEX DATABASE — verify the
+	// version gate exists in the script. (We can't string-match on the
+	// literal SQL because the shell-quoted `\"` form differs from a
+	// Rust string literal.)
 	assert!(
-		script.contains("bt_index_check("),
-		"needs-reindex-all branch must call bt_index_check on each btree index"
+		script.contains(r#""$PG_MAJOR" -ge 12"#),
+		"needs-reindex-all branch must gate CONCURRENTLY behind a PG ≥ 12 check"
 	);
 	assert!(
-		script.contains("a.amname <> 'btree'"),
-		"needs-reindex-all branch must collect non-btree indexes for blind REINDEX"
-	);
-	assert!(
-		script.contains("REINDEX INDEX"),
-		"needs-reindex-all branch must REINDEX (targeted, not the whole DB)"
-	);
-	assert!(
-		!script.contains("REINDEX DATABASE"),
-		"needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids"
+		!script.contains("bt_index_check("),
+		"runtime hook must not call bt_index_check — amcheck wedges on the prod data"
 	);
 	assert!(
 		script.contains("rm -f /pgdata/needs-reindex-all"),
@@ -796,11 +791,15 @@ fn deployment_runtime_reindex_handles_full_database_flag() {
 }
 
 #[test]
-fn deployment_readiness_probe_waits_for_full_reindex() {
-	// The readiness probe gates traffic on both reindex flags being
-	// absent, otherwise queries see torn-page indexes between pod-start
-	// and reindex-complete. The probe used to check only the narrow
-	// needs-reindex flag.
+fn deployment_readiness_probe_only_gates_on_locale_reindex() {
+	// The readiness probe waits for the locale-only `needs-reindex`
+	// flag to clear (small, fast, finishes in seconds) but NOT for
+	// `needs-reindex-all` (the post-pg_resetwal blind REINDEX DATABASE
+	// — takes hours on prod-sized indexes; gating here would trip the
+	// operator's deployment_ready_timeout and block restores
+	// indefinitely). The -all reindex runs in the background; clients
+	// hitting a not-yet-reindexed corrupt index see the explicit
+	// "unexpected zero page" error and can retry.
 	let (mut restore, replica) = test_restore_and_replica();
 	restore.status = Some(PostgresPhysicalRestoreStatus {
 		postgres_version: Some("16".to_string()),
@@ -826,12 +825,12 @@ fn deployment_readiness_probe_waits_for_full_reindex() {
 		.expect("exec probe must have a command");
 	let probe_script = probe_cmd.join(" ");
 	assert!(
-		probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"),
-		"readiness probe must wait for needs-reindex-all to clear; got: {probe_script}"
+		probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
+		"readiness probe must still wait for the locale-only needs-reindex flag; got: {probe_script}"
 	);
 	assert!(
-		probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
-		"readiness probe must still wait for needs-reindex; got: {probe_script}"
+		!probe_script.contains("needs-reindex-all"),
+		"readiness probe must NOT gate on needs-reindex-all (the long-running post-resetwal reindex); got: {probe_script}"
 	);
 }