adapter: keep catalog upper up-to-date with read ts (#35402)

teskje · web-flow · commit af5783aa4fe8 · 2026-03-16T10:45:03.000+01:00
This PR introduces a way to advance the frontier of the catalog shard
without appending any new updates and then uses that to keep the catalog
shard's frontier up to date with the oracle read ts. This ensures that
`mz_catalog_raw` is always readable with strict serializable isolation,
since under that isolation level, the oracle read ts will be used as the
query timestamp.

This increases the number of CRDB queries we make: instead of bumping
only the txn-wal, we now additionally need to bump the catalog shard.
But note that in two of the three instances the `advance_upper` call
replaces a `confirm_leadership` call, which also did a CRDB query, so
one can hope that the resulting number of CRDB calls is similar.

This could potentially be further improved by moving the catalog shard
into txn-wal, but that's a larger lift so we leave it as future work.

### Motivation

Closes SQL-117
diff --git a/misc/python/materialize/cli/ci_annotate_errors.py b/misc/python/materialize/cli/ci_annotate_errors.py
@@ -180,20 +180,20 @@
     # For tests we purposely trigger this error
     | skip-version-upgrade-materialized.* \| .* incompatible\ persist\ version\ \d+\.\d+\.\d+(-dev)?,\ current:\ \d+\.\d+\.\d+(-dev\.\d+)?,\ make\ sure\ to\ upgrade\ the\ catalog\ one\ version\ forward\ at\ a\ time
     # For 0dt upgrades
-    | halting\ process:\ (unable\ to\ confirm\ leadership|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out|dependency\ since\ frontier\ is\ empty\ while\ dependent\ upper\ is\ not\ empty|code\ at\ version\ .*\ cannot\ read\ data\ with\ version)
+    | halting\ process:\ (unable\ to\ advance\ catalog\ upper|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out|dependency\ since\ frontier\ is\ empty\ while\ dependent\ upper\ is\ not\ empty|code\ at\ version\ .*\ cannot\ read\ data\ with\ version)
     | zippy-materialized.* \| .* halting\ process:\ Server\ started\ with\ requested\ generation
     | there\ have\ been\ DDL\ that\ we\ need\ to\ react\ to;\ rebooting\ in\ read-only\ mode
     # Don't care for ssh problems
     | fatal:\ userauth_pubkey
     # Expected in Terraform tests if something else failed during the setup
     | mz-debug:\ fatal:\ failed\ to\ read\ kubeconfig
     # Fences without incrementing deploy generation
-    | txn-wal-fencing-mz_first-.* \| .* unable\ to\ confirm\ leadership
+    | txn-wal-fencing-mz_first-.* \| .* unable\ to\ advance\ catalog\ upper
     | txn-wal-fencing-mz_first-.* \| .* fenced\ by\ envd
     # 0dt platform-checks have two envds running in parallel, thus high load, tests still succeed, so ignore noise
     | platform-checks-mz_.* \| .* was\ expired\ due\ to\ inactivity\.\ Did\ the\ machine\ go\ to\ sleep\?
     # This can happen in "K8s recovery: envd on failing node", but the test still succeeds, old environmentd will just be crashed, see database-issues#8749
-    | \[pod/environmentd-0/environmentd\]\ .*\ (unable\ to\ confirm\ leadership|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out)
+    | \[pod/environmentd-0/environmentd\]\ .*\ (unable\ to\ advance\ catalog\ upper|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out)
     | cannot\ load\ unknown\ system\ parameter
     # Occurs in Orchestratord test when restarting
     | comm="containerd"\ exe="/usr/local/bin/containerd"\ sig=11
diff --git a/src/adapter/src/catalog.rs b/src/adapter/src/catalog.rs
@@ -1147,8 +1147,8 @@ impl Catalog {
     }
 
     #[mz_ore::instrument(level = "debug")]
-    pub async fn confirm_leadership(&self) -> Result<(), AdapterError> {
-        Ok(self.storage().await.confirm_leadership().await?)
+    pub async fn advance_upper(&self, new_upper: mz_repr::Timestamp) -> Result<(), AdapterError> {
+        Ok(self.storage().await.advance_upper(new_upper).await?)
     }
 
     /// Return the ids of all log sources the given object depends on.
diff --git a/src/adapter/src/coord/appends.rs b/src/adapter/src/coord/appends.rs
@@ -445,22 +445,17 @@ impl Coordinator {
             advance_to,
         } = self.get_local_write_ts().await;
 
-        // While we're flipping on the feature flags for txn-wal tables and
-        // the separated Postgres timestamp oracle, we also need to confirm
-        // leadership on writes _after_ getting the timestamp and _before_
-        // writing anything to table shards.
-        //
-        // TODO: Remove this after both (either?) of the above features are on
-        // for good and no possibility of running the old code.
-        let confirm_leadership_start = Instant::now();
-        let () = self
-            .catalog
-            .confirm_leadership()
+        // Advance the catalog shard's upper to keep it in sync with the oracle
+        // timestamp. This ensures that reads of mz_catalog_raw at the oracle's
+        // read_ts do not block waiting for the catalog shard's upper to advance.
+        let catalog_upper_start = Instant::now();
+        self.catalog
+            .advance_upper(advance_to)
             .await
-            .unwrap_or_terminate("unable to confirm leadership");
+            .unwrap_or_terminate("unable to advance catalog upper");
         self.metrics
-            .group_commit_confirm_leadership_seconds
-            .observe(confirm_leadership_start.elapsed().as_secs_f64());
+            .group_commit_catalog_upper_seconds
+            .observe(catalog_upper_start.elapsed().as_secs_f64());
 
         let mut appends: BTreeMap<CatalogItemId, SmallVec<[TableData; 1]>> = BTreeMap::new();
         let mut responses = Vec::with_capacity(validated_writes.len());
diff --git a/src/adapter/src/coord/catalog_implications.rs b/src/adapter/src/coord/catalog_implications.rs
@@ -921,18 +921,22 @@ impl Coordinator {
         execution_timestamps_to_set: BTreeSet<StatementLoggingId>,
     ) -> Result<(), AdapterError> {
         // If we have tables, determine the initial validity for the table.
-        let register_ts = self.get_local_write_ts().await.timestamp;
+        let write_ts = self.get_local_write_ts().await;
+        let register_ts = write_ts.timestamp;
 
         // After acquiring `register_ts` but before using it, we need to
         // be sure we're still the leader. Otherwise a new generation
         // may also be trying to use `register_ts` for a different
-        // purpose.
+        // purpose. See materialize#28216.
         //
-        // See #28216.
+        // We also should advance the upper of the catalog shard, to ensure it
+        // is readable at the oracle read ts after we bump it to the
+        // `register_ts` below. Both of these needs are served by calling
+        // `advance_upper`.
         self.catalog
-            .confirm_leadership()
+            .advance_upper(write_ts.advance_to)
             .await
-            .unwrap_or_terminate("unable to confirm leadership");
+            .unwrap_or_terminate("unable to advance catalog upper");
 
         for id in execution_timestamps_to_set {
             self.set_statement_execution_timestamp(id, register_ts);
@@ -1170,7 +1174,15 @@ impl Coordinator {
             .desc
             .at_version(RelationVersionSelector::Specific(new_version));
 
-        let register_ts = self.get_local_write_ts().await.timestamp;
+        let write_ts = self.get_local_write_ts().await;
+        let register_ts = write_ts.timestamp;
+
+        // Ensure the catalog will be immediately readable at the read ts we're
+        // about to bump.
+        self.catalog
+            .advance_upper(write_ts.advance_to)
+            .await
+            .unwrap_or_terminate("unable to advance catalog upper");
 
         // Alter the table description, creating a "new" collection.
         self.controller
diff --git a/src/adapter/src/metrics.rs b/src/adapter/src/metrics.rs
@@ -52,7 +52,7 @@ pub struct Metrics {
     pub pgwire_recv_scheduling_delay_ms: HistogramVec,
     pub catalog_transact_seconds: HistogramVec,
     pub apply_catalog_implications_seconds: Histogram,
-    pub group_commit_confirm_leadership_seconds: Histogram,
+    pub group_commit_catalog_upper_seconds: Histogram,
     pub group_commit_table_advancement_seconds: Histogram,
 }
 
@@ -237,9 +237,9 @@ impl Metrics {
                 help: "The time it takes to apply catalog implications.",
                 buckets: histogram_seconds_buckets(0.001, 32.0),
             )),
-            group_commit_confirm_leadership_seconds: registry.register(metric!(
-                name: "mz_group_commit_confirm_leadership_seconds",
-                help: "The time it takes to confirm leadership during group commit.",
+            group_commit_catalog_upper_seconds: registry.register(metric!(
+                name: "mz_group_commit_catalog_upper_seconds",
+                help: "The time it takes to advance the catalog shard upper during group commit.",
                 buckets: histogram_seconds_buckets(0.001, 32.0),
             )),
             group_commit_table_advancement_seconds: registry.register(metric!(
diff --git a/src/catalog/src/durable.rs b/src/catalog/src/durable.rs
@@ -323,10 +323,11 @@ pub trait DurableCatalogState: ReadOnlyDurableCatalogState {
         commit_ts: Timestamp,
     ) -> Result<Timestamp, CatalogError>;
 
-    /// Confirms that this catalog is connected as the current leader.
+    /// Advances the upper of the catalog shard to `new_upper`.
     ///
-    /// NB: We may remove this in later iterations of Pv2.
-    async fn confirm_leadership(&mut self) -> Result<(), CatalogError>;
+    /// This implicitly confirms leadership, as attempting to advance the catalog frontier will
+    /// fail if the writer has been fenced out.
+    async fn advance_upper(&mut self, new_upper: Timestamp) -> Result<(), CatalogError>;
 
     /// Allocates and returns `amount` IDs of `id_type`.
     ///
diff --git a/src/catalog/src/durable/persist.rs b/src/catalog/src/durable/persist.rs
@@ -391,29 +391,15 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
         updates: Vec<(S, Diff)>,
         commit_ts: Timestamp,
     ) -> Result<Timestamp, CompareAndAppendError> {
-        assert_eq!(self.mode, Mode::Writable);
-        assert!(
-            commit_ts >= self.upper,
-            "expected commit ts, {}, to be greater than or equal to upper, {}",
-            commit_ts,
-            self.upper
-        );
-
-        // This awkward code allows us to perform an expensive soft assert that requires cloning
-        // `updates` twice, after `updates` has been consumed.
+        // The fencing check is expensive, so run it only with soft assertions enabled.
         let contains_fence = if mz_ore::assert::soft_assertions_enabled() {
-            let updates: Vec<_> = updates.clone();
             let parsed_updates: Vec<_> = updates
                 .clone()
                 .into_iter()
-                .map(|(update, diff)| {
-                    let update: StateUpdateKindJson = update.into();
-                    (update, diff)
-                })
                 .filter_map(|(update, diff)| {
-                    <StateUpdateKindJson as TryIntoStateUpdateKind>::try_into(update)
-                        .ok()
-                        .map(|update| (update, diff))
+                    let update: StateUpdateKindJson = update.into();
+                    let update = TryIntoStateUpdateKind::try_into(update).ok()?;
+                    Some((update, diff))
                 })
                 .collect();
             let contains_retraction = parsed_updates.iter().any(|(update, diff)| {
@@ -422,10 +408,9 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
             let contains_addition = parsed_updates.iter().any(|(update, diff)| {
                 matches!(update, StateUpdateKind::FenceToken(..)) && *diff == Diff::ONE
             });
-            let contains_fence = contains_retraction && contains_addition;
-            Some((contains_fence, updates))
+            contains_retraction && contains_addition
         } else {
-            None
+            false
         };
 
         let updates = updates.into_iter().map(|(kind, diff)| {
@@ -437,6 +422,44 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
             )
         });
         let next_upper = commit_ts.step_forward();
+        self.compare_and_append_inner(updates, next_upper)
+            .await
+            .inspect_err(|e| {
+                // A compare-and-append failure means someone else must have written to the
+                // catalog. We expect to have been fenced out, since writing to the catalog without
+                // fencing other catalogs should be impossible. The one exception is if we are
+                // trying to fence other catalogs with this write.
+                soft_assert_or_log!(
+                    matches!(e, CompareAndAppendError::Fence(_)) || contains_fence,
+                    "encountered an upper mismatch on a non-fencing write"
+                );
+            })?;
+
+        self.sync(next_upper).await?;
+        Ok(next_upper)
+    }
+
+    /// Compare-and-append `updates` to the catalog shard, advancing the upper to `next_upper`.
+    ///
+    /// On success, updating `self.upper` is left to the caller. The caller can thus decide whether
+    /// or not it needs to sync the catalog.
+    ///
+    /// # Panics
+    ///
+    /// Panics if not in `Writable` mode.
+    /// Panics if `next_upper` is not greater than `self.upper`.
+    async fn compare_and_append_inner(
+        &mut self,
+        updates: impl IntoIterator<Item = ((SourceData, ()), Timestamp, StorageDiff)>,
+        next_upper: Timestamp,
+    ) -> Result<(), CompareAndAppendError> {
+        assert_eq!(self.mode, Mode::Writable);
+        assert!(
+            next_upper > self.upper,
+            "next_upper ({next_upper}) not greater than current upper ({})",
+            self.upper,
+        );
+
         let res = self
             .write_handle
             .compare_and_append(
@@ -447,18 +470,10 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
             .await
             .expect("invalid usage");
 
-        // There was an upper mismatch which means something else must have written to the catalog.
-        // Syncing to the current upper should result in a fence error since writing to the catalog
-        // without fencing other catalogs should be impossible. The one exception is if we are
-        // trying to fence other catalogs with this write, in which case we won't see a fence error.
         if let Err(e @ UpperMismatch { .. }) = res {
+            // Most likely we were fenced out.
+            // Sync to the current upper to detect that.
             self.sync_to_current_upper().await?;
-            if let Some((contains_fence, updates)) = contains_fence {
-                assert!(
-                    contains_fence,
-                    "updates were neither fenced nor fencing and encountered an upper mismatch: {updates:#?}"
-                )
-            }
             return Err(e.into());
         }
 
@@ -483,8 +498,8 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
                 "updated bound should match expected"
             ),
         }
-        self.sync(next_upper).await?;
-        Ok(next_upper)
+
+        Ok(())
     }
 
     /// Generates an iterator of [`StateUpdate`] that contain all unconsolidated updates to the
@@ -1791,12 +1806,39 @@ impl DurableCatalogState for PersistCatalogState {
     }
 
     #[mz_ore::instrument(level = "debug")]
-    async fn confirm_leadership(&mut self) -> Result<(), CatalogError> {
-        // Read only catalog does not care about leadership.
-        if self.is_read_only() {
+    async fn advance_upper(&mut self, new_upper: Timestamp) -> Result<(), CatalogError> {
+        if self.upper >= new_upper {
+            // We don't expect a no-op advancement, but if we are wrong we'd crash the process.
+            // Seems safer to only soft-assert and return gracefully in production. If we get here
+            // that means we tried to make the catalog shard readable at a time it was already
+            // readable, which likely means we are violating linearizability. That's not great, but
+            // crashing (or even crash-looping) is worse.
+            //
+            // TODO: Consider removing this once we have built some confidence.
+            soft_panic_or_log!(
+                "new_upper ({new_upper}) not greater than current upper ({})",
+                self.upper
+            );
             return Ok(());
         }
-        self.sync_to_current_upper().await?;
+
+        match self.mode {
+            Mode::Writable => self
+                .compare_and_append_inner([], new_upper)
+                .await
+                .map_err(|e| e.unwrap_fence_error())?,
+            Mode::Savepoint => (),
+            Mode::Readonly => {
+                return Err(DurableCatalogError::NotWritable(
+                    "cannot advance upper of a read-only catalog".into(),
+                )
+                .into());
+            }
+        }
+
+        self.upper = new_upper;
+        // No sync needed since no data was written.
+
         Ok(())
     }
 
diff --git a/src/catalog/tests/read-write.rs b/src/catalog/tests/read-write.rs
@@ -32,13 +32,13 @@ use mz_sql::names::{DatabaseId, ResolvedDatabaseSpecifier, SchemaId};
 
 #[mz_ore::test(tokio::test)]
 #[cfg_attr(miri, ignore)] //  unsupported operation: can't call foreign function `TLS_client_method` on OS `linux`
-async fn test_persist_confirm_leadership() {
+async fn test_persist_advance_upper_fencing() {
     let persist_client = PersistClient::new_for_tests().await;
     let state_builder = TestCatalogStateBuilder::new(persist_client);
-    test_confirm_leadership(state_builder).await;
+    test_advance_upper_fencing(state_builder).await;
 }
 
-async fn test_confirm_leadership(state_builder: TestCatalogStateBuilder) {
+async fn test_advance_upper_fencing(state_builder: TestCatalogStateBuilder) {
     let state_builder = state_builder.with_default_deploy_generation();
     let mut state1 = state_builder
         .clone()
@@ -48,7 +48,8 @@ async fn test_confirm_leadership(state_builder: TestCatalogStateBuilder) {
         .await
         .unwrap()
         .0;
-    assert_ok!(state1.confirm_leadership().await);
+    let ts = state1.current_upper().await.step_forward();
+    assert_ok!(state1.advance_upper(ts).await);
 
     let mut state2 = state_builder
         .unwrap_build()
@@ -57,9 +58,11 @@ async fn test_confirm_leadership(state_builder: TestCatalogStateBuilder) {
         .await
         .unwrap()
         .0;
-    assert_ok!(state2.confirm_leadership().await);
+    let ts = state2.current_upper().await.step_forward();
+    assert_ok!(state2.advance_upper(ts).await);
 
-    let err = state1.confirm_leadership().await.unwrap_err();
+    let ts = ts.step_forward();
+    let err = state1.advance_upper(ts).await.unwrap_err();
     assert!(matches!(
         err,
         CatalogError::Durable(DurableCatalogError::Fence(FenceError::Epoch { .. }))
diff --git a/test/sqllogictest/mz_catalog_raw.slt b/test/sqllogictest/mz_catalog_raw.slt
@@ -19,10 +19,8 @@ SELECT * FROM mz_internal.mz_catalog_raw
 # The mz_system user can query it.
 
 simple conn=mz_system,user=mz_system
-SET transaction_isolation = serializable;
 SELECT count(*) > 0 FROM mz_internal.mz_catalog_raw;
 ----
-COMPLETE 0
 t
 COMPLETE 1
 
@@ -32,8 +30,9 @@ statement ok
 CREATE TABLE t (a INT, b TEXT)
 
 simple conn=mz_system,user=mz_system
-SET transaction_isolation = serializable;
-SELECT data->'value'->>'name' FROM mz_internal.mz_catalog_raw WHERE data->>'kind' = 'Item' AND data->'value'->>'name' = 't';
+SELECT data->'value'->>'name'
+FROM mz_internal.mz_catalog_raw
+WHERE data->>'kind' = 'Item' AND data->'value'->>'name' = 't';
 ----
-COMPLETE 0
-COMPLETE 0
+t
+COMPLETE 1
diff --git a/test/txn-wal-fencing/mzcompose.py b/test/txn-wal-fencing/mzcompose.py
@@ -246,7 +246,7 @@ def run_workload(c: Composition, workload: Workload, args: argparse.Namespace) -
         # Confirm that the first Mz has properly given up the ghost
         mz_first_log = c.invoke("logs", "mz_first", capture=True)
         assert (
-            "unable to confirm leadership" in mz_first_log.stdout
+            "unable to advance catalog upper" in mz_first_log.stdout
             or "unexpected fence epoch" in mz_first_log.stdout
             or "fenced by new catalog upper" in mz_first_log.stdout
             or "fenced by envd" in mz_first_log.stdout

Original file line number	Diff line number	Diff line change
`@@ -1147,8 +1147,8 @@ impl Catalog {`
`1147`	`1147`	`}`
`1148`	`1148`
`1149`	`1149`	`#[mz_ore::instrument(level = "debug")]`
`1150`		`- pub async fn confirm_leadership(&self) -> Result<(), AdapterError> {`
`1151`		`- Ok(self.storage().await.confirm_leadership().await?)`
	`1150`	`+ pub async fn advance_upper(&self, new_upper: mz_repr::Timestamp) -> Result<(), AdapterError> {`
	`1151`	`+ Ok(self.storage().await.advance_upper(new_upper).await?)`
`1152`	`1152`	`}`
`1153`	`1153`
`1154`	`1154`	`/// Return the ids of all log sources the given object depends on.`