From aaa480c62828149b47a3748749915a0132162f9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Sat, 6 Jun 2026 01:16:37 +1200
Subject: [PATCH] fix(operator): bound the schema-migration timeout cleanup to
 60s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The timeout path added in v0.3.22 calls DROP SCHEMA … CASCADE on the new
restore to clear stale persistent_schemas before switchover. But that
DROP can hang indefinitely if a backend on the target has the schema's
namespace locked — which is precisely the failure mode that tripped
the budget in the first place (e.g. a CREATE TABLE spinning at 100%
CPU and ignoring SIGTERM). The cleanup queued behind that lock,
wedging the switchover the cleanup was meant to enable.

Wrap the DROP cleanup in tokio::time::timeout (60s). If it doesn't
complete in time (either errors or times out), log it and proceed with
the switchover anyway — leftover schemas in the target are strictly
better than the replica never coming up. The next restore cycle will
re-attempt migration with a fresh target PVC and (typically) clear
the leftovers as part of pre-migration prep.

Restructures the cleanup into a separate drop_persistent_schemas_on_target
helper so the timeout wraps a single async future.
---
 src/controllers/replica.rs | 96 ++++++++++++++++++++++++++++----------
 1 file changed, 71 insertions(+), 25 deletions(-)
diff --git a/src/controllers/replica.rs b/src/controllers/replica.rs
index ca2653e..366bc2b 100644
--- a/src/controllers/replica.rs
+++ b/src/controllers/replica.rs
@@ -1099,6 +1099,46 @@ fn migration_exceeded_budget(replica: &PostgresPhysicalReplica, job: &Job) -> bo
 /// schema-less) database instead of being blocked indefinitely. The
 /// next restore cycle re-attempts migration if the schemas reappear on
 /// the source.
+/// Connect to the new restore and `DROP SCHEMA … CASCADE` each given
+/// schema. Separate function so the caller can wrap it in
+/// `tokio::time::timeout`; the operation has no internal deadline and
+/// can hang indefinitely if a backend on the target has a lock on the
+/// schema's namespace.
+async fn drop_persistent_schemas_on_target(
+	client: &Client,
+	ctx: &Arc<Context>,
+	replica: &PostgresPhysicalReplica,
+	namespace: &str,
+	new_restore_name: &str,
+	schemas: &[String],
+) -> Result<()> {
+	let reader_secret_name = replica.creds_secret_name();
+	let secrets: Api<Secret> = Api::namespaced(client.clone(), namespace);
+	let reader_secret = secrets.get(&reader_secret_name).await?;
+	let reader_user = postgres::read_secret_field(&reader_secret, "username")?;
+	let reader_password = postgres::read_secret_field(&reader_secret, "password")?;
+	let target_dbname = postgres::discover_restore_database(
+		client,
+		namespace,
+		new_restore_name,
+		&reader_user,
+		&reader_password,
+		ctx.use_port_forward(),
+	)
+	.await?;
+	let conn = postgres::connect_to_restore(
+		client,
+		namespace,
+		new_restore_name,
+		&target_dbname,
+		&reader_user,
+		&reader_password,
+		ctx.use_port_forward(),
+	)
+	.await?;
+	postgres::drop_schemas_on(&conn.client, schemas).await
+}
+
 async fn timeout_schema_migration(
 	client: &Client,
 	ctx: &Arc<Context>,
@@ -1126,35 +1166,41 @@ async fn timeout_schema_migration(
 		warn!(job = %job_name, error = %e, "failed to delete timed-out migration Job");
 	}
 
-	// DROP SCHEMA … CASCADE the persistent_schemas on the new restore so
-	// the operator's "owned" schemas don't carry stale leftovers from the
-	// restored data. Best-effort per-schema via IF EXISTS.
+	// Opportunistically DROP SCHEMA … CASCADE the persistent_schemas on
+	// the new restore so the operator's "owned" schemas don't carry stale
+	// leftovers from the restored data. Bounded at 60 seconds: if a
+	// backend in the target restore is itself stuck (the exact failure
+	// mode that tripped the migration budget in the first place — e.g.
+	// CREATE TABLE spinning at 100% CPU and ignoring SIGTERM), our DROP
+	// queues on its lock and never completes. Better to leave leftover
+	// schemas than to wedge the switchover indefinitely on the cleanup.
+	const CLEANUP_TIMEOUT: Duration = Duration::from_secs(60);
 	if !schemas.is_empty() {
-		let reader_secret_name = replica.creds_secret_name();
-		let secrets: Api<Secret> = Api::namespaced(client.clone(), namespace);
-		let reader_secret = secrets.get(&reader_secret_name).await?;
-		let reader_user = postgres::read_secret_field(&reader_secret, "username")?;
-		let reader_password = postgres::read_secret_field(&reader_secret, "password")?;
-		let target_dbname = postgres::discover_restore_database(
+		let cleanup = drop_persistent_schemas_on_target(
 			client,
+			ctx,
+			replica,
 			namespace,
 			&new_restore_name,
-			&reader_user,
-			&reader_password,
-			ctx.use_port_forward(),
-		)
-		.await?;
-		let conn = postgres::connect_to_restore(
-			client,
-			namespace,
-			&new_restore_name,
-			&target_dbname,
-			&reader_user,
-			&reader_password,
-			ctx.use_port_forward(),
-		)
-		.await?;
-		postgres::drop_schemas_on(&conn.client, &schemas).await?;
+			&schemas,
+		);
+		match tokio::time::timeout(CLEANUP_TIMEOUT, cleanup).await {
+			Ok(Ok(())) => {}
+			Ok(Err(e)) => {
+				warn!(
+					replica = %replica_name,
+					error = %e,
+					"DROP SCHEMA cleanup errored in timeout-skip path; proceeding to switchover with leftover schemas"
+				);
+			}
+			Err(_) => {
+				warn!(
+					replica = %replica_name,
+					timeout = ?CLEANUP_TIMEOUT,
+					"DROP SCHEMA cleanup itself timed out (target postgres backend likely stuck); proceeding to switchover with leftover schemas"
+				);
+			}
+		}
 	}
 
 	// Surface as a Warning event so this is visible on the replica CR.