feat(operator): bound schema migration at 20% of cron interval

passcod · passcod · commit 5da48ccc0d85 · 2026-06-06T00:53:26.000+12:00
The schema migration step inside switchover used to wait indefinitely
for its Job to complete. A genuinely-stuck postgres backend on the
target restore (e.g. a CREATE TABLE spinning at 100% CPU forever)
therefore wedged the entire replica: switchover blocked, replica stuck
in Switching, the old Active restore serving increasingly stale data,
and every subsequent scheduled restore queued behind the wedge.

Cap the migration at 20% of the time between consecutive cron firings
(72 min for a 6-hourly schedule, 4.8h for daily). On timeout the
operator drops the persistent_schemas on the new restore via
DROP SCHEMA … CASCADE, records a SchemaMigrationTimedOut Warning
event, sets schemaMigrationPhase = timeout-skipped, and proceeds with
the switchover. The replica comes up serving snapshot contents
without the persistent schemas; the next cycle re-attempts migration
if the schemas have regenerated on the source. A usable replica
beats waiting indefinitely to carry the schema through.

Document the budget + timeout behaviour in the README's new
"Persistent schemas" section.
diff --git a/README.md b/README.md
@@ -102,7 +102,7 @@ Defines a continuously-refreshed replica of a PostgreSQL database restored from
 | `readOnly` | `bool` | No | `true` | Set the restored database to read-only mode. |
 | `postgresExtraConfig` | `string` | No | — | Extra lines appended to `postgresql.conf` (e.g. `shared_preload_libraries`). |
 | `notifications` | `[]NotificationConfig` | No | `[]` | Notification targets called on restore events. |
-| `persistentSchemas` | `[]string` | No | — | List of schema names to migrate from the previous restore to the new restore on each switchover. |
+| `persistentSchemas` | `[]string` | No | — | List of schema names to migrate from the previous restore to the new restore on each switchover. See [Persistent schemas](#persistent-schemas) below for the migration time budget and what happens on timeout. |
 
 The cron expression is parsed using the [cronexpr](https://docs.rs/cronexpr) crate.
 It has two interesting features:
@@ -114,6 +114,24 @@ The jitter is a random duration between -time/2 and +time/2.
 For example, `10m` will result in a jitter between -5m and 5m.
 When using `H` in the cron expression, you might want to set the jitter to zero to properly take advantage of the spread-but-stable behaviour.
 
+#### Persistent schemas
+
+Each switchover normally just swaps in the new restore (so the replica carries only what was in the snapshot) and is fast.
+The `persistentSchemas` field opts a schema (e.g. `dbt`) into being **carried across restores** via a `pg_dump | psql` migration Job that runs between the previous restore and the new one.
+A healthy migration takes seconds.
+
+The migration has a **hard time budget of 20% of the cron interval** (e.g. ~72 min on a 6-hourly schedule, ~5 h on a daily one).
+If the budget is exceeded — most realistically because some external upstream condition wedges postgres mid-migration — the operator:
+
+1. Cancels the migration Job.
+2. Runs `DROP SCHEMA <name> CASCADE` for each persistent schema on the new restore.
+3. Records a `SchemaMigrationTimedOut` Warning event on the replica.
+4. Sets `status.schemaMigrationPhase = "timeout-skipped"`.
+5. Proceeds with the switchover.
+
+The intent is that **a usable replica beats carrying the schema through**.
+The next restore cycle will re-attempt the migration if the schemas have been regenerated on the source in the meantime; until then the replica is up and serving the snapshot contents.
+
 #### SnapshotFilter
 
 | Field | Type | Required | Description |
@@ -163,7 +181,7 @@ Additional fields for `target: graphQL`:
 | `notifications` | `[]NotificationStatus` | Status of each configured notification target. |
 | `conditions` | `[]Condition` | Standard Kubernetes conditions. |
 | `schemaMigrationJob` | `string` | Name of the active schema migration Job (set while migration is in progress). |
-| `schemaMigrationPhase` | `string` | Phase of the schema migration (`active`, `complete`, or `failed: <reason>`). |
+| `schemaMigrationPhase` | `string` | Phase of the schema migration (`active`, `complete`, `partial`, `timeout-skipped`, or `failed: <reason>`). See [Persistent schemas](#persistent-schemas). |
 | `persistentSchemaDataSize` | `Quantity` | Measured size of persistent schema data from the last successful migration. Used to size the next restore PVC. |
 | `consecutiveRestoreFailures` | `uint32` | Number of consecutive restore failures. Reset to 0 on success. After 3 consecutive failures the operator stops scheduling new restores until the counter is reset (automatically on next successful restore, or manually via `kubectl patch --subresource=status`). |
 
diff --git a/src/controllers/replica.rs b/src/controllers/replica.rs
@@ -1074,6 +1074,133 @@ async fn mark_schema_migration_complete(
 	Ok(())
 }
 
+/// True if the running schema-migration Job has been alive longer than the
+/// replica's per-cycle migration budget (see
+/// [`PostgresPhysicalReplica::schema_migration_timeout`]). Age is measured
+/// from the Job's `creationTimestamp` to the current time and compared
+/// strictly (`>`); a Job with no creation timestamp is "not exceeded".
+fn migration_exceeded_budget(replica: &PostgresPhysicalReplica, job: &Job) -> bool {
+	let Some(created) = job
+		.metadata
+		.creation_timestamp
+		.as_ref()
+		.map(|t| Timestamp::from(t.0))
+	else {
+		return false; // no start time to measure from, so never time out
+	};
+	let elapsed = Timestamp::now().duration_since(created);
+	elapsed > replica.schema_migration_timeout() // budget is recomputed on every call
+}
+
+/// Abandon a stuck schema migration so switchover can proceed. In order:
+/// delete the Job (failure is only logged), DROP SCHEMA … CASCADE the
+/// configured `persistent_schemas` on the new restore (failure returns
+/// `Err` and skips the steps below), publish a `SchemaMigrationTimedOut`
+/// Warning event (failure is only logged), then patch status to clear
+/// `schemaMigrationJob` and set `schemaMigrationPhase = timeout-skipped`.
+/// Next cycle re-attempts migration if the schemas reappear on the source.
+async fn timeout_schema_migration(
+	client: &Client,
+	ctx: &Arc<Context>,
+	replica: &PostgresPhysicalReplica,
+	namespace: &str,
+	new_restore: &PostgresPhysicalRestore,
+	job_name: &str,
+) -> Result<()> {
+	let replica_name = replica.name_any();
+	let new_restore_name = new_restore.name_any();
+	let schemas: Vec<String> = replica.spec.persistent_schemas.clone().unwrap_or_default();
+
+	warn!(
+		replica = %replica_name,
+		restore = %new_restore_name,
+		schemas = ?schemas,
+		timeout = ?replica.schema_migration_timeout(),
+		"schema migration exceeded budget; dropping persistent schemas on new restore and proceeding to switchover"
+	);
+
+	// Cancel the Job (background propagation so its pods are GC'd too).
+	let jobs: Api<Job> = Api::namespaced(client.clone(), namespace);
+	let dp = kube::api::DeleteParams::background();
+	if let Err(e) = jobs.delete(job_name, &dp).await {
+		warn!(job = %job_name, error = %e, "failed to delete timed-out migration Job");
+	}
+
+	// DROP SCHEMA … CASCADE the persistent_schemas on the new restore so
+	// they don't carry stale leftovers (original note: per-schema IF EXISTS).
+	// NOTE(review): every `?` below returns after the Job is already deleted.
+	if !schemas.is_empty() {
+		let reader_secret_name = replica.creds_secret_name();
+		let secrets: Api<Secret> = Api::namespaced(client.clone(), namespace);
+		let reader_secret = secrets.get(&reader_secret_name).await?;
+		let reader_user = postgres::read_secret_field(&reader_secret, "username")?;
+		let reader_password = postgres::read_secret_field(&reader_secret, "password")?;
+		let target_dbname = postgres::discover_restore_database(
+			client,
+			namespace,
+			&new_restore_name,
+			&reader_user,
+			&reader_password,
+			ctx.use_port_forward(),
+		)
+		.await?;
+		let conn = postgres::connect_to_restore(
+			client,
+			namespace,
+			&new_restore_name,
+			&target_dbname,
+			&reader_user,
+			&reader_password,
+			ctx.use_port_forward(),
+		)
+		.await?;
+		postgres::drop_schemas_on(&conn.client, &schemas).await?;
+	}
+
+	// Surface as a Warning event so this is visible on the replica CR.
+	let note = format!(
+		"Schema migration exceeded its time budget (20% of cron interval). \
+		 Persistent schemas [{}] were dropped on the new restore so it can come up. \
+		 The next restore cycle will reattempt migration if the schemas have been regenerated upstream.",
+		schemas.join(", ")
+	);
+	if let Err(e) = ctx
+		.recorder
+		.publish(
+			&Event {
+				type_: EventType::Warning,
+				reason: "SchemaMigrationTimedOut".into(),
+				note: Some(note),
+				action: "Restore".into(),
+				secondary: Some(new_restore.object_ref(&())),
+			},
+			&replica.object_ref(&()),
+		)
+		.await
+	{
+		warn!(replica = %replica_name, error = %e, "failed to publish SchemaMigrationTimedOut event");
+	}
+
+	// Status: clear schemaMigrationJob and set phase = timeout-skipped so
+	// it's distinguishable from complete/partial/failed.
+	let replicas: Api<PostgresPhysicalReplica> = Api::namespaced(client.clone(), namespace);
+	let patch = serde_json::json!({
+		"status": {
+			"schemaMigrationJob": null,
+			"schemaMigrationPhase": "timeout-skipped",
+		}
+	});
+	replicas
+		.patch_status(
+			&replica_name,
+			&PatchParams::apply("postgres-restore-operator"),
+			&Patch::Merge(&patch),
+		)
+		.await?;
+
+	Ok(())
+}
+
 async fn reconcile_schema_migration(
 	client: &Client,
 	ctx: &Arc<Context>,
@@ -1131,6 +1258,18 @@ async fn reconcile_schema_migration(
 	if let Some(job) = jobs.get_opt(&job_name).await? {
 		match classify_job(&job) {
 			JobStatus::Active => {
+				if migration_exceeded_budget(replica, &job) {
+					timeout_schema_migration(
+						client,
+						ctx,
+						replica,
+						namespace,
+						new_restore,
+						&job_name,
+					)
+					.await?;
+					return Ok(true);
+				}
 				debug!(replica = %replica_name, job = %job_name, "migration Job still running");
 				return Ok(false);
 			}
diff --git a/src/controllers/replica/scheduling.rs b/src/controllers/replica/scheduling.rs
@@ -53,6 +53,46 @@ impl PostgresPhysicalReplica {
 		format!("{:016x}", hasher.finish())
 	}
 
+	/// Wall-clock budget for the schema migration step inside a single
+	/// restore cycle: one fifth (20%) of the interval between the next two
+	/// cron firings after the current time, in whole seconds (integer
+	/// division). E.g. a `0 */6 * * *` schedule (every 6h) gets 72 min.
+	/// A healthy migration completes in seconds, so this is a generous
+	/// backstop, not a tight SLA — it keeps a pathological migration
+	/// (e.g. a postgres backend stuck on a single DDL) from blocking the
+	/// replica from coming up at all. Falls back to 1h when
+	/// `cron_interval` returns `None`. The interval is sampled at call
+	/// time, so unevenly spaced schedules yield a varying budget.
+	/// On timeout the operator drops the `persistent_schemas` on the new
+	/// restore (DROP SCHEMA … CASCADE) and proceeds to switchover.
+	pub fn schema_migration_timeout(&self) -> SignedDuration {
+		const FALLBACK: SignedDuration = SignedDuration::from_secs(3600);
+		const BUDGET_FRACTION_DENOMINATOR: i64 = 5; // 1/5 == 20%
+		let Some(interval) = self.cron_interval(Timestamp::now()) else {
+			return FALLBACK;
+		};
+		SignedDuration::from_secs(interval.as_secs() / BUDGET_FRACTION_DENOMINATOR)
+	}
+
+	/// Spacing between the next firing of this replica's schedule found
+	/// from `now` and the firing after it. Returns `None` when the
+	/// schedule can't be parsed or either lookup fails.
+	fn cron_interval(&self, now: Timestamp) -> Option<SignedDuration> {
+		let schedule = &self.spec.schedule;
+		let cron = parse_crontab_with(schedule, {
+			let mut options = ParseOptions::default();
+			options.fallback_timezone_option = cronexpr::FallbackTimezoneOption::UTC;
+			options // NOTE(review): confirm `H` schedules parse with only this option set
+		})
+		.ok()?;
+		let next = cron.find_next(now).ok()?;
+		let next_ts = next.timestamp();
+		let after = cron
+			.find_next(next_ts + SignedDuration::from_secs(1)) // search from 1s past the first firing
+			.ok()?;
+		Some(after.timestamp().duration_since(next_ts))
+	}
+
 	pub fn compute_next_scheduled_restore(&self, now: Timestamp) -> Option<Timestamp> {
 		let schedule = &self.spec.schedule;
 
@@ -348,6 +388,29 @@ mod tests {
 		assert!(next.unwrap() > now);
 	}
 
+	#[test]
+	fn schema_migration_timeout_six_hourly_cron_is_twenty_percent() {
+		// `0 */6 * * *` fires every 6h → 21600s → 20% = 4320s = 72min.
+		let replica = make_replica("0 */6 * * *", None, None, None);
+		let timeout = replica.schema_migration_timeout(); // samples the current time; firings are evenly spaced
+		assert_eq!(timeout, SignedDuration::from_secs(4320)); // 21600 / 5
+	}
+
+	#[test]
+	fn schema_migration_timeout_daily_cron_is_twenty_percent() {
+		// Daily at midnight → 86400s → 20% = 17280s = 288min.
+		let replica = make_replica("0 0 * * *", None, None, None);
+		let timeout = replica.schema_migration_timeout(); // samples the current time; firings are evenly spaced
+		assert_eq!(timeout, SignedDuration::from_secs(17280)); // 86400 / 5
+	}
+
+	#[test]
+	fn schema_migration_timeout_falls_back_on_invalid_cron() {
+		let replica = make_replica("not a cron", None, None, None); // unparseable schedule
+		let timeout = replica.schema_migration_timeout();
+		assert_eq!(timeout, SignedDuration::from_secs(3600)); // the 1h FALLBACK constant
+	}
+
 	#[test]
 	fn compute_next_scheduled_restore_invalid_cron() {
 		let replica = make_replica("not a cron", None, None, None);