Skip to content

Commit af5783a

Browse files
authored
adapter: keep catalog upper up-to-date with read ts (#35402)
This PR introduces a way to advance the frontier of the catalog shard without appending any new updates and then uses that to keep the catalog shard's frontier up to date with the oracle read ts. This ensures that `mz_catalog_raw` is always readable with strict serializable isolation, since under that isolation level, the oracle read ts will be used as the query timestamp. This increases the amount of CRDB queries we make, instead of bumping only the txn-wal, we now additionally need to bump the catalog shard. But note that in two of the three instances the `advance_upper` call replaces a `confirm_leadership` call, which also did a CRDB query, so one can hope that the resulting number of CRDB calls is similar. This could potentially be further improved by moving the catalog shard into txn-wal, but that's a larger lift so we leave it as future work. ### Motivation Closes SQL-117
1 parent b4a938c commit af5783a

10 files changed

Lines changed: 134 additions & 82 deletions

File tree

misc/python/materialize/cli/ci_annotate_errors.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,20 +180,20 @@
180180
# For tests we purposely trigger this error
181181
| skip-version-upgrade-materialized.* \| .* incompatible\ persist\ version\ \d+\.\d+\.\d+(-dev)?,\ current:\ \d+\.\d+\.\d+(-dev\.\d+)?,\ make\ sure\ to\ upgrade\ the\ catalog\ one\ version\ forward\ at\ a\ time
182182
# For 0dt upgrades
183-
| halting\ process:\ (unable\ to\ confirm\ leadership|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out|dependency\ since\ frontier\ is\ empty\ while\ dependent\ upper\ is\ not\ empty|code\ at\ version\ .*\ cannot\ read\ data\ with\ version)
183+
| halting\ process:\ (unable\ to\ advance\ catalog\ upper|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out|dependency\ since\ frontier\ is\ empty\ while\ dependent\ upper\ is\ not\ empty|code\ at\ version\ .*\ cannot\ read\ data\ with\ version)
184184
| zippy-materialized.* \| .* halting\ process:\ Server\ started\ with\ requested\ generation
185185
| there\ have\ been\ DDL\ that\ we\ need\ to\ react\ to;\ rebooting\ in\ read-only\ mode
186186
# Don't care for ssh problems
187187
| fatal:\ userauth_pubkey
188188
# Expected in Terraform tests if something else failed during the setup
189189
| mz-debug:\ fatal:\ failed\ to\ read\ kubeconfig
190190
# Fences without incrementing deploy generation
191-
| txn-wal-fencing-mz_first-.* \| .* unable\ to\ confirm\ leadership
191+
| txn-wal-fencing-mz_first-.* \| .* unable\ to\ advance\ catalog\ upper
192192
| txn-wal-fencing-mz_first-.* \| .* fenced\ by\ envd
193193
# 0dt platform-checks have two envds running in parallel, thus high load, tests still succeed, so ignore noise
194194
| platform-checks-mz_.* \| .* was\ expired\ due\ to\ inactivity\.\ Did\ the\ machine\ go\ to\ sleep\?
195195
# This can happen in "K8s recovery: envd on failing node", but the test still succeeds, old environmentd will just be crashed, see database-issues#8749
196-
| \[pod/environmentd-0/environmentd\]\ .*\ (unable\ to\ confirm\ leadership|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out)
196+
| \[pod/environmentd-0/environmentd\]\ .*\ (unable\ to\ advance\ catalog\ upper|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out)
197197
| cannot\ load\ unknown\ system\ parameter
198198
# Occurs in Orchestratord test when restarting
199199
| comm="containerd"\ exe="/usr/local/bin/containerd"\ sig=11

src/adapter/src/catalog.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,8 +1147,8 @@ impl Catalog {
11471147
}
11481148

11491149
#[mz_ore::instrument(level = "debug")]
1150-
pub async fn confirm_leadership(&self) -> Result<(), AdapterError> {
1151-
Ok(self.storage().await.confirm_leadership().await?)
1150+
pub async fn advance_upper(&self, new_upper: mz_repr::Timestamp) -> Result<(), AdapterError> {
1151+
Ok(self.storage().await.advance_upper(new_upper).await?)
11521152
}
11531153

11541154
/// Return the ids of all log sources the given object depends on.

src/adapter/src/coord/appends.rs

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -445,22 +445,17 @@ impl Coordinator {
445445
advance_to,
446446
} = self.get_local_write_ts().await;
447447

448-
// While we're flipping on the feature flags for txn-wal tables and
449-
// the separated Postgres timestamp oracle, we also need to confirm
450-
// leadership on writes _after_ getting the timestamp and _before_
451-
// writing anything to table shards.
452-
//
453-
// TODO: Remove this after both (either?) of the above features are on
454-
// for good and no possibility of running the old code.
455-
let confirm_leadership_start = Instant::now();
456-
let () = self
457-
.catalog
458-
.confirm_leadership()
448+
// Advance the catalog shard's upper to keep it in sync with the oracle
449+
// timestamp. This ensures that reads of mz_catalog_raw at the oracle's
450+
// read_ts do not block waiting for the catalog shard's upper to advance.
451+
let catalog_upper_start = Instant::now();
452+
self.catalog
453+
.advance_upper(advance_to)
459454
.await
460-
.unwrap_or_terminate("unable to confirm leadership");
455+
.unwrap_or_terminate("unable to advance catalog upper");
461456
self.metrics
462-
.group_commit_confirm_leadership_seconds
463-
.observe(confirm_leadership_start.elapsed().as_secs_f64());
457+
.group_commit_catalog_upper_seconds
458+
.observe(catalog_upper_start.elapsed().as_secs_f64());
464459

465460
let mut appends: BTreeMap<CatalogItemId, SmallVec<[TableData; 1]>> = BTreeMap::new();
466461
let mut responses = Vec::with_capacity(validated_writes.len());

src/adapter/src/coord/catalog_implications.rs

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -921,18 +921,22 @@ impl Coordinator {
921921
execution_timestamps_to_set: BTreeSet<StatementLoggingId>,
922922
) -> Result<(), AdapterError> {
923923
// If we have tables, determine the initial validity for the table.
924-
let register_ts = self.get_local_write_ts().await.timestamp;
924+
let write_ts = self.get_local_write_ts().await;
925+
let register_ts = write_ts.timestamp;
925926

926927
// After acquiring `register_ts` but before using it, we need to
927928
// be sure we're still the leader. Otherwise a new generation
928929
// may also be trying to use `register_ts` for a different
929-
// purpose.
930+
// purpose. See materialize#28216.
930931
//
931-
// See #28216.
932+
// We also should advance the upper of the catalog shard, to ensure it
933+
// is readable at the oracle read ts after we bump it to the
934+
// `register_ts` below. Both of these needs are served by calling
935+
// `advance_upper`.
932936
self.catalog
933-
.confirm_leadership()
937+
.advance_upper(write_ts.advance_to)
934938
.await
935-
.unwrap_or_terminate("unable to confirm leadership");
939+
.unwrap_or_terminate("unable to advance catalog upper");
936940

937941
for id in execution_timestamps_to_set {
938942
self.set_statement_execution_timestamp(id, register_ts);
@@ -1170,7 +1174,15 @@ impl Coordinator {
11701174
.desc
11711175
.at_version(RelationVersionSelector::Specific(new_version));
11721176

1173-
let register_ts = self.get_local_write_ts().await.timestamp;
1177+
let write_ts = self.get_local_write_ts().await;
1178+
let register_ts = write_ts.timestamp;
1179+
1180+
// Ensure the catalog will be immediately readable at the read ts we're
1181+
// about to bump.
1182+
self.catalog
1183+
.advance_upper(write_ts.advance_to)
1184+
.await
1185+
.unwrap_or_terminate("unable to advance catalog upper");
11741186

11751187
// Alter the table description, creating a "new" collection.
11761188
self.controller

src/adapter/src/metrics.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ pub struct Metrics {
5252
pub pgwire_recv_scheduling_delay_ms: HistogramVec,
5353
pub catalog_transact_seconds: HistogramVec,
5454
pub apply_catalog_implications_seconds: Histogram,
55-
pub group_commit_confirm_leadership_seconds: Histogram,
55+
pub group_commit_catalog_upper_seconds: Histogram,
5656
pub group_commit_table_advancement_seconds: Histogram,
5757
}
5858

@@ -237,9 +237,9 @@ impl Metrics {
237237
help: "The time it takes to apply catalog implications.",
238238
buckets: histogram_seconds_buckets(0.001, 32.0),
239239
)),
240-
group_commit_confirm_leadership_seconds: registry.register(metric!(
241-
name: "mz_group_commit_confirm_leadership_seconds",
242-
help: "The time it takes to confirm leadership during group commit.",
240+
group_commit_catalog_upper_seconds: registry.register(metric!(
241+
name: "mz_group_commit_catalog_upper_seconds",
242+
help: "The time it takes to advance the catalog shard upper during group commit.",
243243
buckets: histogram_seconds_buckets(0.001, 32.0),
244244
)),
245245
group_commit_table_advancement_seconds: registry.register(metric!(

src/catalog/src/durable.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -323,10 +323,11 @@ pub trait DurableCatalogState: ReadOnlyDurableCatalogState {
323323
commit_ts: Timestamp,
324324
) -> Result<Timestamp, CatalogError>;
325325

326-
/// Confirms that this catalog is connected as the current leader.
326+
/// Advances the upper of the catalog shard to `new_upper`.
327327
///
328-
/// NB: We may remove this in later iterations of Pv2.
329-
async fn confirm_leadership(&mut self) -> Result<(), CatalogError>;
328+
/// This implicitly confirms leadership, as attempting to advance the catalog frontier will
329+
/// fail if the writer has been fenced out.
330+
async fn advance_upper(&mut self, new_upper: Timestamp) -> Result<(), CatalogError>;
330331

331332
/// Allocates and returns `amount` IDs of `id_type`.
332333
///

src/catalog/src/durable/persist.rs

Lines changed: 79 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -391,29 +391,15 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
391391
updates: Vec<(S, Diff)>,
392392
commit_ts: Timestamp,
393393
) -> Result<Timestamp, CompareAndAppendError> {
394-
assert_eq!(self.mode, Mode::Writable);
395-
assert!(
396-
commit_ts >= self.upper,
397-
"expected commit ts, {}, to be greater than or equal to upper, {}",
398-
commit_ts,
399-
self.upper
400-
);
401-
402-
// This awkward code allows us to perform an expensive soft assert that requires cloning
403-
// `updates` twice, after `updates` has been consumed.
394+
// The fencing check is expensive, so run it only with soft assertions enabled.
404395
let contains_fence = if mz_ore::assert::soft_assertions_enabled() {
405-
let updates: Vec<_> = updates.clone();
406396
let parsed_updates: Vec<_> = updates
407397
.clone()
408398
.into_iter()
409-
.map(|(update, diff)| {
410-
let update: StateUpdateKindJson = update.into();
411-
(update, diff)
412-
})
413399
.filter_map(|(update, diff)| {
414-
<StateUpdateKindJson as TryIntoStateUpdateKind>::try_into(update)
415-
.ok()
416-
.map(|update| (update, diff))
400+
let update: StateUpdateKindJson = update.into();
401+
let update = TryIntoStateUpdateKind::try_into(update).ok()?;
402+
Some((update, diff))
417403
})
418404
.collect();
419405
let contains_retraction = parsed_updates.iter().any(|(update, diff)| {
@@ -422,10 +408,9 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
422408
let contains_addition = parsed_updates.iter().any(|(update, diff)| {
423409
matches!(update, StateUpdateKind::FenceToken(..)) && *diff == Diff::ONE
424410
});
425-
let contains_fence = contains_retraction && contains_addition;
426-
Some((contains_fence, updates))
411+
contains_retraction && contains_addition
427412
} else {
428-
None
413+
false
429414
};
430415

431416
let updates = updates.into_iter().map(|(kind, diff)| {
@@ -437,6 +422,44 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
437422
)
438423
});
439424
let next_upper = commit_ts.step_forward();
425+
self.compare_and_append_inner(updates, next_upper)
426+
.await
427+
.inspect_err(|e| {
428+
// A compare-and-append failure means someone else must have written to the
429+
// catalog. We expect to have been fenced out, since writing to the catalog without
430+
// fencing other catalogs should be impossible. The one exception is if we are
431+
// trying to fence other catalogs with this write.
432+
soft_assert_or_log!(
433+
matches!(e, CompareAndAppendError::Fence(_)) || contains_fence,
434+
"encountered an upper mismatch on a non-fencing write"
435+
);
436+
})?;
437+
438+
self.sync(next_upper).await?;
439+
Ok(next_upper)
440+
}
441+
442+
/// Compare-and-append `updates` to the catalog shard, advancing the upper to `next_upper`.
443+
///
444+
/// On success, updating `self.upper` is left to the caller. The caller can thus decide whether
445+
/// or not it needs to sync the catalog.
446+
///
447+
/// # Panics
448+
///
449+
/// Panics if not in `Writable` mode.
450+
/// Panics if `next_upper` is not greater than `self.upper`.
451+
async fn compare_and_append_inner(
452+
&mut self,
453+
updates: impl IntoIterator<Item = ((SourceData, ()), Timestamp, StorageDiff)>,
454+
next_upper: Timestamp,
455+
) -> Result<(), CompareAndAppendError> {
456+
assert_eq!(self.mode, Mode::Writable);
457+
assert!(
458+
next_upper > self.upper,
459+
"next_upper ({next_upper}) not greater than current upper ({})",
460+
self.upper,
461+
);
462+
440463
let res = self
441464
.write_handle
442465
.compare_and_append(
@@ -447,18 +470,10 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
447470
.await
448471
.expect("invalid usage");
449472

450-
// There was an upper mismatch which means something else must have written to the catalog.
451-
// Syncing to the current upper should result in a fence error since writing to the catalog
452-
// without fencing other catalogs should be impossible. The one exception is if we are
453-
// trying to fence other catalogs with this write, in which case we won't see a fence error.
454473
if let Err(e @ UpperMismatch { .. }) = res {
474+
// Most likely we were fenced out.
475+
// Sync to the current upper to detect that.
455476
self.sync_to_current_upper().await?;
456-
if let Some((contains_fence, updates)) = contains_fence {
457-
assert!(
458-
contains_fence,
459-
"updates were neither fenced nor fencing and encountered an upper mismatch: {updates:#?}"
460-
)
461-
}
462477
return Err(e.into());
463478
}
464479

@@ -483,8 +498,8 @@ impl<T: TryIntoStateUpdateKind, U: ApplyUpdate<T>> PersistHandle<T, U> {
483498
"updated bound should match expected"
484499
),
485500
}
486-
self.sync(next_upper).await?;
487-
Ok(next_upper)
501+
502+
Ok(())
488503
}
489504

490505
/// Generates an iterator of [`StateUpdate`] that contain all unconsolidated updates to the
@@ -1791,12 +1806,39 @@ impl DurableCatalogState for PersistCatalogState {
17911806
}
17921807

17931808
#[mz_ore::instrument(level = "debug")]
1794-
async fn confirm_leadership(&mut self) -> Result<(), CatalogError> {
1795-
// Read only catalog does not care about leadership.
1796-
if self.is_read_only() {
1809+
async fn advance_upper(&mut self, new_upper: Timestamp) -> Result<(), CatalogError> {
1810+
if self.upper >= new_upper {
1811+
// We don't expect a no-op advancement, but if we are wrong we'd crash the process.
1812+
// Seems safer to only soft-assert and return gracefully in production. If we get here
1813+
// that means we tried to make the catalog shard readable at a time it was already
1814+
// readable, which likely means we are violating linearizability. That's not great, but
1815+
// crashing (or even crash-looping) is worse.
1816+
//
1817+
// TODO: Consider removing this once we have built some confidence.
1818+
soft_panic_or_log!(
1819+
"new_upper ({new_upper}) not greater than current upper ({})",
1820+
self.upper
1821+
);
17971822
return Ok(());
17981823
}
1799-
self.sync_to_current_upper().await?;
1824+
1825+
match self.mode {
1826+
Mode::Writable => self
1827+
.compare_and_append_inner([], new_upper)
1828+
.await
1829+
.map_err(|e| e.unwrap_fence_error())?,
1830+
Mode::Savepoint => (),
1831+
Mode::Readonly => {
1832+
return Err(DurableCatalogError::NotWritable(
1833+
"cannot advance upper of a read-only catalog".into(),
1834+
)
1835+
.into());
1836+
}
1837+
}
1838+
1839+
self.upper = new_upper;
1840+
// No sync needed since no data was written.
1841+
18001842
Ok(())
18011843
}
18021844

src/catalog/tests/read-write.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,13 @@ use mz_sql::names::{DatabaseId, ResolvedDatabaseSpecifier, SchemaId};
3232

3333
#[mz_ore::test(tokio::test)]
3434
#[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function `TLS_client_method` on OS `linux`
35-
async fn test_persist_confirm_leadership() {
35+
async fn test_persist_advance_upper_fencing() {
3636
let persist_client = PersistClient::new_for_tests().await;
3737
let state_builder = TestCatalogStateBuilder::new(persist_client);
38-
test_confirm_leadership(state_builder).await;
38+
test_advance_upper_fencing(state_builder).await;
3939
}
4040

41-
async fn test_confirm_leadership(state_builder: TestCatalogStateBuilder) {
41+
async fn test_advance_upper_fencing(state_builder: TestCatalogStateBuilder) {
4242
let state_builder = state_builder.with_default_deploy_generation();
4343
let mut state1 = state_builder
4444
.clone()
@@ -48,7 +48,8 @@ async fn test_confirm_leadership(state_builder: TestCatalogStateBuilder) {
4848
.await
4949
.unwrap()
5050
.0;
51-
assert_ok!(state1.confirm_leadership().await);
51+
let ts = state1.current_upper().await.step_forward();
52+
assert_ok!(state1.advance_upper(ts).await);
5253

5354
let mut state2 = state_builder
5455
.unwrap_build()
@@ -57,9 +58,11 @@ async fn test_confirm_leadership(state_builder: TestCatalogStateBuilder) {
5758
.await
5859
.unwrap()
5960
.0;
60-
assert_ok!(state2.confirm_leadership().await);
61+
let ts = state2.current_upper().await.step_forward();
62+
assert_ok!(state2.advance_upper(ts).await);
6163

62-
let err = state1.confirm_leadership().await.unwrap_err();
64+
let ts = ts.step_forward();
65+
let err = state1.advance_upper(ts).await.unwrap_err();
6366
assert!(matches!(
6467
err,
6568
CatalogError::Durable(DurableCatalogError::Fence(FenceError::Epoch { .. }))

test/sqllogictest/mz_catalog_raw.slt

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,8 @@ SELECT * FROM mz_internal.mz_catalog_raw
1919
# The mz_system user can query it.
2020

2121
simple conn=mz_system,user=mz_system
22-
SET transaction_isolation = serializable;
2322
SELECT count(*) > 0 FROM mz_internal.mz_catalog_raw;
2423
----
25-
COMPLETE 0
2624
t
2725
COMPLETE 1
2826

@@ -32,8 +30,9 @@ statement ok
3230
CREATE TABLE t (a INT, b TEXT)
3331

3432
simple conn=mz_system,user=mz_system
35-
SET transaction_isolation = serializable;
36-
SELECT data->'value'->>'name' FROM mz_internal.mz_catalog_raw WHERE data->>'kind' = 'Item' AND data->'value'->>'name' = 't';
33+
SELECT data->'value'->>'name'
34+
FROM mz_internal.mz_catalog_raw
35+
WHERE data->>'kind' = 'Item' AND data->'value'->>'name' = 't';
3736
----
38-
COMPLETE 0
39-
COMPLETE 0
37+
t
38+
COMPLETE 1

test/txn-wal-fencing/mzcompose.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def run_workload(c: Composition, workload: Workload, args: argparse.Namespace) -
246246
# Confirm that the first Mz has properly given up the ghost
247247
mz_first_log = c.invoke("logs", "mz_first", capture=True)
248248
assert (
249-
"unable to confirm leadership" in mz_first_log.stdout
249+
"unable to advance catalog upper" in mz_first_log.stdout
250250
or "unexpected fence epoch" in mz_first_log.stdout
251251
or "fenced by new catalog upper" in mz_first_log.stdout
252252
or "fenced by envd" in mz_first_log.stdout

0 commit comments

Comments
 (0)