@@ -1251,77 +1251,46 @@ if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
12511251 while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
12521252 psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
12531253 # needs-reindex-all (pg_resetwal aftermath) can leave torn pages in
1254- # ANY index, not just collation-dependent ones. A blind REINDEX
1255- # DATABASE takes hours on the prod DBs; instead do a two-step
1256- # smart pass:
1254+ # ANY index, not just collation-dependent ones. We tried a "smart
1255+ # pass" using the amcheck contrib extension (scan each btree, queue
1256+ # only the corrupt ones for REINDEX) — empirically that hits the
1257+ # same postgres-internal pathology that wedges other vanilla DDL on
1258+ # this dataset: bt_index_check itself burns 100% CPU forever on
1259+ # specific indexes with no visible progress, blocking the whole
1260+ # reindex behind it.
12571261 #
1258- # 1. Use the amcheck contrib extension to read each valid btree
1259- # index and verify its structural invariants. Indexes that
1260- # fail the check (including "unexpected zero page" corruption)
1261- # get queued for REINDEX. Indexes that pass are left alone.
1262- # For a healthy snapshot this finds nothing and reads
1263- # ~index-size of disk instead of ~table-size to rewrite.
1262+ # Fall back to blind REINDEX DATABASE. REINDEX reads the heap and
1263+ # rebuilds the index from scratch (different code path from amcheck,
1264+ # which reads the corrupt index pages directly) and so isn't subject
1265+ # to the same wedge. Slow on prod-sized DBs but it makes progress;
1266+ # the alternative was a permanently-stuck restore.
12641267 #
1265- # 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST,
1266- # BRIN, hash). amcheck only covers btree; non-btree indexes
1267- # are typically a small fraction of total index size so the
1268- # cost is bounded, and skipping them risks the same
1269- # post-resetwal corruption sneaking through.
1270- #
1271- # CONCURRENTLY when available (PG ≥ 14) so the work overlaps with
1272- # whatever clients hit the pod after the readiness gate lifts.
1268+ # Crucially this branch does NOT remove needs-reindex-all at the
1269+ # top of the work — the readiness probe ignores -all for exactly
1270+ # this reason (see the probe spec below). The pod becomes Ready as
1271+ # soon as postgres accepts connections; clients hitting a not-yet-
1272+ # reindexed corrupt index get the explicit "unexpected zero page"
1273+ # error, retry, and succeed once the rebuild lands. After REINDEX
1274+ # DATABASE has run for every connectable db, the flag is cleared and
1275+ # _pgro.restore_info.stage flips to ready.
12731276 if [ -f /pgdata/needs-reindex-all ]; then
1277+ # CONCURRENTLY (PG ≥ 12) builds replacement indexes alongside the
1278+ # existing ones and atomically swaps. Clients can keep using the
1279+ # old indexes during the rebuild — they'll see "unexpected zero
1280+ # page" only if a query happens to hit a corrupt page on the old
1281+ # side; once the swap lands the corruption is gone.
1282+ #
1283+ # REINDEX DATABASE CONCURRENTLY skips system catalogs (PG cannot
1284+ # reindex them concurrently). For an analytics replica that's the right
1285+ # trade: user-data indexes matter for client queries, system
1286+ # catalog corruption shows up as different errors and is rare.
12741287 for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1275- echo "Reindex after pg_resetwal: $db (smart pass via amcheck)"
1276- psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true
1277-
1278- # Step 1: scan btree indexes; collect those that fail amcheck.
1279- BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
1280- SELECT c.oid::regclass::text
1281- FROM pg_class c
1282- JOIN pg_am a ON a.oid = c.relam
1283- JOIN pg_index i ON i.indexrelid = c.oid
1284- WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid;
1285- ")
1286- BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true)
1287- echo " amcheck scanning $BTREE_COUNT btree indexes in $db"
1288- CORRUPT_BTREE=""
1289- N=0
1290- for idx in $BTREE_INDEXES; do
1291- [ -z "$idx" ] && continue
1292- N=$((N + 1))
1293- # bt_index_check raises an error if the index is corrupt.
1294- # Suppress its output and check exit code; an error → queue it.
1295- if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then
1296- echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx"
1297- CORRUPT_BTREE="$CORRUPT_BTREE $idx"
1298- fi
1299- done
1300-
1301- # Step 2: list non-btree indexes for blind reindex.
1302- NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
1303- SELECT c.oid::regclass::text
1304- FROM pg_class c
1305- JOIN pg_am a ON a.oid = c.relam
1306- JOIN pg_index i ON i.indexrelid = c.oid
1307- WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid;
1308- ")
1309- NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true)
1310-
1311- TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES"
1312- TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true)
1313- echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)"
1314- N=0
1315- for idx in $TO_REINDEX; do
1316- [ -z "$idx" ] && continue
1317- N=$((N + 1))
1318- echo " [$N/$TOTAL] $db: REINDEX $idx"
1319- if [ "$PG_MAJOR" -ge 14 ]; then
1320- psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1321- else
1322- psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1323- fi
1324- done
1288+ echo "Reindex after pg_resetwal: $db (REINDEX DATABASE CONCURRENTLY)"
1289+ if [ "$PG_MAJOR" -ge 12 ]; then
1290+ psql -U postgres -d "$db" -c "REINDEX DATABASE CONCURRENTLY \"$db\";" 2>&1 || true
1291+ else
1292+ psql -U postgres -d "$db" -c "REINDEX DATABASE \"$db\";" 2>&1 || true
1293+ fi
13251294 done
13261295 rm -f /pgdata/needs-reindex-all
13271296 # needs-reindex (collation-dependent only) is a strict subset of
@@ -1397,7 +1366,15 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
13971366 command: Some ( vec![
13981367 "/bin/sh" . to_string( ) ,
13991368 "-c" . to_string( ) ,
1400- "pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]" . to_string( ) ,
1369+ // Gate readiness on the locale-only needs-reindex flag
1370+ // (small, fast, finishes in seconds-to-minutes) but NOT on
1371+ // needs-reindex-all (post-pg_resetwal blind REINDEX DATABASE
1372+ // — takes hours on prod-sized indexes; gating here would
1373+ // trip the operator's deployment_ready_timeout). The -all
1374+ // reindex runs in the background; clients hitting a
1375+ // not-yet-reindexed corrupt index see the explicit
1376+ // "unexpected zero page" error and retry.
1377+ "pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]" . to_string( ) ,
14011378 ] ) ,
14021379 } ) ,
14031380 initial_delay_seconds: Some ( 5 ) ,
0 commit comments