Skip to content

Commit e9e0b9f

Browse files
authored
fix(operator): un-gate readiness on needs-reindex-all, REINDEX DATABASE CONCURRENTLY (#69)
2 parents 98e1000 + 6987f64 commit e9e0b9f

2 files changed

Lines changed: 74 additions & 98 deletions

File tree

src/controllers/restore/builders.rs

Lines changed: 45 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1251,77 +1251,46 @@ if [ -f /pgdata/needs-reindex ] || [ -f /pgdata/needs-reindex-all ]; then
12511251
while ! pg_isready -q -U postgres -d postgres; do sleep 2; done
12521252
psql -U postgres -d postgres -c "UPDATE _pgro.restore_info SET stage = 'reindexing', last_transition_time = now() WHERE id = 1;"
12531253
# needs-reindex-all (pg_resetwal aftermath) can leave torn pages in
1254-
# ANY index, not just collation-dependent ones. A blind REINDEX
1255-
# DATABASE takes hours on the prod DBs; instead do a two-step
1256-
# smart pass:
1254+
# ANY index, not just collation-dependent ones. We tried a "smart
1255+
# pass" using the amcheck contrib extension (scan each btree, queue
1256+
# only the corrupt ones for REINDEX) — empirically that hits the
1257+
# same postgres-internal pathology that wedges other vanilla DDL on
1258+
# this dataset: bt_index_check itself burns 100% CPU forever on
1259+
# specific indexes with no visible progress, blocking the whole
1260+
# reindex behind it.
12571261
#
1258-
# 1. Use the amcheck contrib extension to read each valid btree
1259-
# index and verify its structural invariants. Indexes that
1260-
# fail the check (including "unexpected zero page" corruption)
1261-
# get queued for REINDEX. Indexes that pass are left alone.
1262-
# For a healthy snapshot this finds nothing and reads
1263-
# ~index-size of disk instead of ~table-size to rewrite.
1262+
# Fall back to blind REINDEX DATABASE. REINDEX reads the heap and
1263+
# rebuilds the index from scratch (different code path from amcheck,
1264+
# which reads the corrupt index pages directly) and so isn't subject
1265+
# to the same wedge. Slow on prod-sized DBs but it makes progress;
1266+
# the alternative was a permanently-stuck restore.
12641267
#
1265-
# 2. Blindly REINDEX every non-btree index in the DB (GIN, GiST,
1266-
# BRIN, hash). amcheck only covers btree; non-btree indexes
1267-
# are typically a small fraction of total index size so the
1268-
# cost is bounded, and skipping them risks the same
1269-
# post-resetwal corruption sneaking through.
1270-
#
1271-
# CONCURRENTLY when available (PG ≥ 14) so the work overlaps with
1272-
# whatever clients hit the pod after the readiness gate lifts.
1268+
# Crucially this branch does NOT remove needs-reindex-all at the
1269+
# top of the work — the readiness probe ignores -all for exactly
1270+
# this reason (see the probe spec below). The pod becomes Ready as
1271+
# soon as postgres accepts connections; clients hitting a not-yet-
1272+
# reindexed corrupt index get the explicit "unexpected zero page"
1273+
# error, retry, succeed once the rebuild lands. After REINDEX
1274+
# DATABASE completes for every user db, the flag is cleared and
1275+
# _pgro.restore_info.stage flips to ready.
12731276
if [ -f /pgdata/needs-reindex-all ]; then
1277+
# CONCURRENTLY (PG ≥ 12) builds replacement indexes alongside the
1278+
# existing ones and atomically swaps. Clients can keep using the
1279+
# old indexes during the rebuild — they'll see "unexpected zero
1280+
# page" only if a query happens to hit a corrupt page on the old
1281+
# side; once the swap lands the corruption is gone.
1282+
#
1283+
# REINDEX DATABASE CONCURRENTLY skips system catalogs (PG won't
1284+
# CONCURRENTLY them). For an analytics replica that's the right
1285+
# trade: user-data indexes matter for client queries, system
1286+
# catalog corruption shows up as different errors and is rare.
12741287
for db in $(psql -U postgres -d postgres -At -c "SELECT datname FROM pg_database WHERE datallowconn AND datname <> 'template0'"); do
1275-
echo "Reindex after pg_resetwal: $db (smart pass via amcheck)"
1276-
psql -U postgres -d "$db" -c "CREATE EXTENSION IF NOT EXISTS amcheck;" 2>&1 || true
1277-
1278-
# Step 1: scan btree indexes; collect those that fail amcheck.
1279-
BTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
1280-
SELECT c.oid::regclass::text
1281-
FROM pg_class c
1282-
JOIN pg_am a ON a.oid = c.relam
1283-
JOIN pg_index i ON i.indexrelid = c.oid
1284-
WHERE c.relkind = 'i' AND a.amname = 'btree' AND i.indisvalid;
1285-
")
1286-
BTREE_COUNT=$(echo "$BTREE_INDEXES" | grep -c . || true)
1287-
echo " amcheck scanning $BTREE_COUNT btree indexes in $db"
1288-
CORRUPT_BTREE=""
1289-
N=0
1290-
for idx in $BTREE_INDEXES; do
1291-
[ -z "$idx" ] && continue
1292-
N=$((N + 1))
1293-
# bt_index_check raises an error if the index is corrupt.
1294-
# Suppress its output and check exit code; an error → queue it.
1295-
if ! psql -U postgres -d "$db" -At -c "SELECT bt_index_check('$idx'::regclass);" > /dev/null 2>&1; then
1296-
echo " [$N/$BTREE_COUNT] CORRUPT: $db: $idx"
1297-
CORRUPT_BTREE="$CORRUPT_BTREE $idx"
1298-
fi
1299-
done
1300-
1301-
# Step 2: list non-btree indexes for blind reindex.
1302-
NONBTREE_INDEXES=$(psql -U postgres -d "$db" -At -c "
1303-
SELECT c.oid::regclass::text
1304-
FROM pg_class c
1305-
JOIN pg_am a ON a.oid = c.relam
1306-
JOIN pg_index i ON i.indexrelid = c.oid
1307-
WHERE c.relkind = 'i' AND a.amname <> 'btree' AND i.indisvalid;
1308-
")
1309-
NONBTREE_COUNT=$(echo "$NONBTREE_INDEXES" | grep -c . || true)
1310-
1311-
TO_REINDEX="$CORRUPT_BTREE $NONBTREE_INDEXES"
1312-
TOTAL=$(echo "$TO_REINDEX" | tr ' ' '\n' | grep -c . || true)
1313-
echo " $db: $TOTAL indexes to REINDEX (corrupt btree + all non-btree=$NONBTREE_COUNT)"
1314-
N=0
1315-
for idx in $TO_REINDEX; do
1316-
[ -z "$idx" ] && continue
1317-
N=$((N + 1))
1318-
echo " [$N/$TOTAL] $db: REINDEX $idx"
1319-
if [ "$PG_MAJOR" -ge 14 ]; then
1320-
psql -U postgres -d "$db" -c "REINDEX INDEX CONCURRENTLY $idx;" 2>&1 || true
1321-
else
1322-
psql -U postgres -d "$db" -c "REINDEX INDEX $idx;" 2>&1 || true
1323-
fi
1324-
done
1288+
echo "Reindex after pg_resetwal: $db (REINDEX DATABASE CONCURRENTLY)"
1289+
if [ "$PG_MAJOR" -ge 12 ]; then
1290+
psql -U postgres -d "$db" -c "REINDEX DATABASE CONCURRENTLY \"$db\";" 2>&1 || true
1291+
else
1292+
psql -U postgres -d "$db" -c "REINDEX DATABASE \"$db\";" 2>&1 || true
1293+
fi
13251294
done
13261295
rm -f /pgdata/needs-reindex-all
13271296
# needs-reindex (collation-dependent only) is a strict subset of
@@ -1397,7 +1366,15 @@ exec postgres -D /pgdata/pgdata ${PGRO_LOG_LEVEL:+-c log_min_messages=$PGRO_LOG_
13971366
command: Some(vec![
13981367
"/bin/sh".to_string(),
13991368
"-c".to_string(),
1400-
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ] && [ ! -f /pgdata/needs-reindex-all ]".to_string(),
1369+
// Gate readiness on the locale-only needs-reindex flag
1370+
// (small, fast, finishes in seconds-to-minutes) but NOT on
1371+
// needs-reindex-all (post-pg_resetwal blind REINDEX DATABASE
1372+
// — takes hours on prod-sized indexes; gating here would
1373+
// trip the operator's deployment_ready_timeout). The -all
1374+
// reindex runs in the background; clients hitting a
1375+
// not-yet-reindexed corrupt index see the explicit
1376+
// "unexpected zero page" error and retry.
1377+
"pg_isready -U postgres -d postgres && [ ! -f /pgdata/needs-reindex ]".to_string(),
14011378
]),
14021379
}),
14031380
initial_delay_seconds: Some(5),

src/controllers/restore/tests.rs

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -740,13 +740,12 @@ fn deployment_init_script_flags_full_reindex_after_resetwal() {
740740
#[test]
741741
fn deployment_runtime_reindex_handles_full_database_flag() {
742742
// The main container's startup hook handles the broad flag
743-
// (needs-reindex-all) via amcheck: scan all btree indexes, REINDEX
744-
// only those that fail the structural check (caught the
745-
// "unexpected zero page" failure mode reported in prod) plus all
746-
// non-btree indexes (amcheck only covers btree, so non-btree get
747-
// a blind REINDEX). A blind REINDEX DATABASE would take hours on
748-
// the prod size; the smart pass reads index-size and rewrites
749-
// only the corrupt ones.
743+
// (needs-reindex-all) via blind REINDEX DATABASE on every user DB.
744+
// The earlier amcheck-driven smart pass turned out to hit the same
745+
// postgres-internal pathology that wedges other vanilla DDL on
746+
// the prod data — bt_index_check burned 100% CPU forever on
747+
// individual indexes. REINDEX uses a different code path (reads
748+
// the heap, rebuilds from scratch) and doesn't trip that wedge.
750749
let (mut restore, replica) = test_restore_and_replica();
751750
restore.status = Some(PostgresPhysicalRestoreStatus {
752751
postgres_version: Some("16".to_string()),
@@ -770,24 +769,20 @@ fn deployment_runtime_reindex_handles_full_database_flag() {
770769
"runtime startup hook must check the needs-reindex-all flag"
771770
);
772771
assert!(
773-
script.contains("CREATE EXTENSION IF NOT EXISTS amcheck"),
774-
"needs-reindex-all branch must enable amcheck for the structural scan"
772+
script.contains("REINDEX DATABASE CONCURRENTLY"),
773+
"needs-reindex-all branch must run REINDEX DATABASE CONCURRENTLY on PG ≥ 12 so clients can keep querying the old indexes during the rebuild"
775774
);
775+
// The PG < 12 fallback uses plain REINDEX DATABASE — verify the
776+
// version gate exists in the script. (We can't string-match on the
777+
// literal SQL because the shell-quoted `\"` form differs from a
778+
// Rust string literal.)
776779
assert!(
777-
script.contains("bt_index_check("),
778-
"needs-reindex-all branch must call bt_index_check on each btree index"
780+
script.contains(r#""$PG_MAJOR" -ge 12"#),
781+
"needs-reindex-all branch must gate CONCURRENTLY behind a PG ≥ 12 check"
779782
);
780783
assert!(
781-
script.contains("a.amname <> 'btree'"),
782-
"needs-reindex-all branch must collect non-btree indexes for blind REINDEX"
783-
);
784-
assert!(
785-
script.contains("REINDEX INDEX"),
786-
"needs-reindex-all branch must REINDEX (targeted, not the whole DB)"
787-
);
788-
assert!(
789-
!script.contains("REINDEX DATABASE"),
790-
"needs-reindex-all must not REINDEX DATABASE — it's the expensive hammer the smart pass avoids"
784+
!script.contains("bt_index_check("),
785+
"runtime hook must not call bt_index_check — amcheck wedges on the prod data"
791786
);
792787
assert!(
793788
script.contains("rm -f /pgdata/needs-reindex-all"),
@@ -796,11 +791,15 @@ fn deployment_runtime_reindex_handles_full_database_flag() {
796791
}
797792

798793
#[test]
799-
fn deployment_readiness_probe_waits_for_full_reindex() {
800-
// The readiness probe gates traffic on both reindex flags being
801-
// absent, otherwise queries see torn-page indexes between pod-start
802-
// and reindex-complete. The probe used to check only the narrow
803-
// needs-reindex flag.
794+
fn deployment_readiness_probe_only_gates_on_locale_reindex() {
795+
// The readiness probe waits for the locale-only `needs-reindex`
796+
// flag to clear (small, fast, finishes in seconds) but NOT for
797+
// `needs-reindex-all` (the post-pg_resetwal blind REINDEX DATABASE
798+
// — takes hours on prod-sized indexes; gating here would trip the
799+
// operator's deployment_ready_timeout and block restores
800+
// indefinitely). The -all reindex runs in the background; clients
801+
// hitting a not-yet-reindexed corrupt index see the explicit
802+
// "unexpected zero page" error and can retry.
804803
let (mut restore, replica) = test_restore_and_replica();
805804
restore.status = Some(PostgresPhysicalRestoreStatus {
806805
postgres_version: Some("16".to_string()),
@@ -826,12 +825,12 @@ fn deployment_readiness_probe_waits_for_full_reindex() {
826825
.expect("exec probe must have a command");
827826
let probe_script = probe_cmd.join(" ");
828827
assert!(
829-
probe_script.contains("[ ! -f /pgdata/needs-reindex-all ]"),
830-
"readiness probe must wait for needs-reindex-all to clear; got: {probe_script}"
828+
probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
829+
"readiness probe must still wait for the locale-only needs-reindex flag; got: {probe_script}"
831830
);
832831
assert!(
833-
probe_script.contains("[ ! -f /pgdata/needs-reindex ]"),
834-
"readiness probe must still wait for needs-reindex; got: {probe_script}"
832+
!probe_script.contains("needs-reindex-all"),
833+
"readiness probe must NOT gate on needs-reindex-all (the long-running post-resetwal reindex); got: {probe_script}"
835834
);
836835
}
837836

0 commit comments

Comments
 (0)