Skip to content

Commit 9eeb694

Browse files
committed
Add mz_object_arrangement_size_history with periodic collection
Introduces a builtin history table that records periodic snapshots of mz_object_arrangement_sizes so users can see arrangement memory usage over time. What's added: * The history table and two indexes (on object_id and collection_timestamp). * Dyncfgs for the collection interval (default 1h) and the retention period (default 7d). * A coordinator task that snapshots the live relation every interval and appends rows, plus a startup pruner that retracts rows older than the retention period. * Metrics for collection latency and rows written. Notes for review: * Collection timing is offset by a seed derived from organization_id so environments don't all collect at the same instant. * A schema-migration guard prevents the history table from being migrated (and thus truncated), which would break the pruner's assumption that it is the only source of retractions.
1 parent ad86390 commit 9eeb694

7 files changed

Lines changed: 394 additions & 4 deletions

File tree

src/adapter-types/src/dyncfgs.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,20 @@ pub const CONSOLE_OIDC_SCOPES: Config<&'static str> = Config::new(
214214
"Space-separated OIDC scopes requested by the web console.",
215215
);
216216

217+
/// Interval at which to collect per-object arrangement size snapshots for the history table.
218+
pub const ARRANGEMENT_SIZE_COLLECTION_INTERVAL: Config<Duration> = Config::new(
219+
"arrangement_size_collection_interval",
220+
Duration::from_secs(60 * 60),
221+
"Interval at which to collect and snapshot per-object arrangement sizes.",
222+
);
223+
224+
/// How long to retain per-object arrangement size history.
225+
pub const ARRANGEMENT_SIZE_RETENTION_PERIOD: Config<Duration> = Config::new(
226+
"arrangement_size_retention_period",
227+
Duration::from_secs(7 * 24 * 60 * 60),
228+
"How long to retain per-object arrangement size history.",
229+
);
230+
217231
/// Adds the full set of all adapter `Config`s.
218232
pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet {
219233
configs
@@ -245,4 +259,6 @@ pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet {
245259
.add(&USER_ID_POOL_BATCH_SIZE)
246260
.add(&CONSOLE_OIDC_CLIENT_ID)
247261
.add(&CONSOLE_OIDC_SCOPES)
262+
.add(&ARRANGEMENT_SIZE_COLLECTION_INTERVAL)
263+
.add(&ARRANGEMENT_SIZE_RETENTION_PERIOD)
248264
}

src/adapter/src/catalog/open/builtin_schema_migration.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ use futures::future::BoxFuture;
3737
use mz_build_info::{BuildInfo, DUMMY_BUILD_INFO};
3838
use mz_catalog::builtin::{
3939
BUILTIN_LOOKUP, Builtin, Fingerprint, MZ_CATALOG_RAW, MZ_CATALOG_RAW_DESCRIPTION,
40-
MZ_STORAGE_USAGE_BY_SHARD_DESCRIPTION, RUNTIME_ALTERABLE_FINGERPRINT_SENTINEL,
40+
MZ_OBJECT_ARRANGEMENT_SIZE_HISTORY_DESCRIPTION, MZ_STORAGE_USAGE_BY_SHARD_DESCRIPTION,
41+
RUNTIME_ALTERABLE_FINGERPRINT_SENTINEL,
4142
};
4243
use mz_catalog::config::BuiltinItemMigrationConfig;
4344
use mz_catalog::durable::objects::SystemObjectUniqueIdentifier;
@@ -486,6 +487,14 @@ impl Migration {
486487
"mz_storage_usage_by_shard cannot be migrated or else the table will be truncated"
487488
);
488489

490+
// Same hazard as `mz_storage_usage_by_shard`: the startup pruner
491+
// (`Coordinator::prune_arrangement_sizes_history_on_startup`) assumes it is
492+
// the only source of retractions, so migration-driven truncation would break it.
493+
assert_ne!(
494+
&*MZ_OBJECT_ARRANGEMENT_SIZE_HISTORY_DESCRIPTION, object,
495+
"mz_object_arrangement_size_history cannot be migrated or else the table will be truncated"
496+
);
497+
489498
// `mz_catalog_raw` cannot be migrated because it contains the durable catalog and it
490499
// wouldn't be very durable if we allowed it to be truncated.
491500
assert_ne!(

src/adapter/src/coord.rs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,9 @@ pub enum Message {
347347
StorageUsageFetch,
348348
StorageUsageUpdate(ShardsUsageReferenced),
349349
StorageUsagePrune(Vec<BuiltinTableUpdate>),
350+
ArrangementSizesSchedule,
351+
ArrangementSizesSnapshot,
352+
ArrangementSizesPrune(Vec<BuiltinTableUpdate>),
350353
/// Performs any cleanup and logging actions necessary for
351354
/// finalizing a statement execution.
352355
RetireExecute {
@@ -483,6 +486,9 @@ impl Message {
483486
Message::StorageUsageFetch => "storage_usage_fetch",
484487
Message::StorageUsageUpdate(_) => "storage_usage_update",
485488
Message::StorageUsagePrune(_) => "storage_usage_prune",
489+
Message::ArrangementSizesSchedule => "arrangement_sizes_schedule",
490+
Message::ArrangementSizesSnapshot => "arrangement_sizes_snapshot",
491+
Message::ArrangementSizesPrune(_) => "arrangement_sizes_prune",
486492
Message::RetireExecute { .. } => "retire_execute",
487493
Message::ExecuteSingleStatementTransaction { .. } => {
488494
"execute_single_statement_transaction"
@@ -3618,6 +3624,7 @@ impl Coordinator {
36183624
});
36193625

36203626
self.schedule_storage_usage_collection().await;
3627+
self.schedule_arrangement_sizes_collection().await;
36213628
self.spawn_privatelink_vpc_endpoints_watch_task();
36223629
self.spawn_statement_logging_task();
36233630
flags::tracing_config(self.catalog.system_config()).apply(&self.tracing_handle);
@@ -4234,6 +4241,68 @@ impl Coordinator {
42344241
});
42354242
}
42364243

4244+
/// Retracts `mz_object_arrangement_size_history` rows older than the
4245+
/// `arrangement_size_retention_period` dyncfg.
4246+
///
4247+
/// Must only run at startup: it reads at the oracle read timestamp and
4248+
/// writes retractions at the current write timestamp, which is only safe
4249+
/// when no other writes are in flight. See [the equivalent storage-usage
4250+
/// pruner](Self::prune_storage_usage_events_on_startup) for the same
4251+
/// reasoning.
4252+
async fn prune_arrangement_sizes_history_on_startup(&self) {
4253+
// The catalog server is not writable in read-only mode.
4254+
if self.controller.read_only() {
4255+
return;
4256+
}
4257+
4258+
let retention_period = mz_adapter_types::dyncfgs::ARRANGEMENT_SIZE_RETENTION_PERIOD
4259+
.get(self.catalog().system_config().dyncfgs());
4260+
let item_id = self
4261+
.catalog()
4262+
.resolve_builtin_table(&mz_catalog::builtin::MZ_OBJECT_ARRANGEMENT_SIZE_HISTORY);
4263+
let global_id = self.catalog.get_entry(&item_id).latest_global_id();
4264+
let read_ts = self.get_local_read_ts().await;
4265+
let current_contents_fut = self
4266+
.controller
4267+
.storage_collections
4268+
.snapshot(global_id, read_ts);
4269+
let internal_cmd_tx = self.internal_cmd_tx.clone();
4270+
spawn(|| "arrangement_sizes_history_prune", async move {
4271+
let mut current_contents = current_contents_fut
4272+
.await
4273+
.unwrap_or_terminate("cannot fail to fetch snapshot");
4274+
differential_dataflow::consolidation::consolidate(&mut current_contents);
4275+
4276+
let cutoff_ts = u128::from(read_ts).saturating_sub(retention_period.as_millis());
4277+
let mut expired = Vec::new();
4278+
for (row, diff) in current_contents {
4279+
assert_eq!(
4280+
diff, 1,
4281+
"consolidated contents should not contain retractions: ({row:#?}, {diff:#?})"
4282+
);
4283+
// Column 3 is `collection_timestamp`.
4284+
let collection_timestamp = row
4285+
.unpack()
4286+
.get(3)
4287+
.expect("definition of mz_object_arrangement_size_history changed")
4288+
.unwrap_timestamptz();
4289+
let collection_timestamp = collection_timestamp.timestamp_millis();
4290+
let collection_timestamp: u128 = collection_timestamp
4291+
.try_into()
4292+
.expect("all collections happen after Jan 1 1970");
4293+
if collection_timestamp < cutoff_ts {
4294+
let builtin_update = BuiltinTableUpdate::row(item_id, row, Diff::MINUS_ONE);
4295+
expired.push(builtin_update);
4296+
}
4297+
}
4298+
4299+
// TODO(arrangement-sizes): when the writeable-catalog-server
4300+
// plumbing in https://github.com/MaterializeInc/materialize/pull/35436
4301+
// lands, retract directly on `mz_catalog_server`.
4302+
let _ = internal_cmd_tx.send(Message::ArrangementSizesPrune(expired));
4303+
});
4304+
}
4305+
42374306
fn current_credit_consumption_rate(&self) -> Numeric {
42384307
self.catalog()
42394308
.user_cluster_replicas()
@@ -4760,6 +4829,8 @@ pub fn serve(
47604829
.await;
47614830
}
47624831

4832+
coord.prune_arrangement_sizes_history_on_startup().await;
4833+
47634834
Ok(())
47644835
});
47654836
let ok = bootstrap.is_ok();

src/adapter/src/coord/message_handler.rs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use mz_catalog::memory::objects::ClusterReplicaProcessStatus;
1919
use mz_controller::ControllerResponse;
2020
use mz_controller::clusters::{ClusterEvent, ClusterStatus};
2121
use mz_ore::instrument;
22+
use mz_ore::cast::CastFrom;
2223
use mz_ore::now::EpochMillis;
2324
use mz_ore::option::OptionExt;
2425
use mz_ore::tracing::OpenTelemetryContext;
@@ -130,6 +131,17 @@ impl Coordinator {
130131
Message::StorageUsagePrune(expired) => {
131132
self.storage_usage_prune(expired).boxed_local().await;
132133
}
134+
Message::ArrangementSizesSchedule => {
135+
self.schedule_arrangement_sizes_collection()
136+
.boxed_local()
137+
.await;
138+
}
139+
Message::ArrangementSizesSnapshot => {
140+
self.arrangement_sizes_snapshot().boxed_local().await;
141+
}
142+
Message::ArrangementSizesPrune(expired) => {
143+
self.arrangement_sizes_prune(expired).boxed_local().await;
144+
}
133145
Message::RetireExecute {
134146
otel_ctx,
135147
data,
@@ -340,6 +352,201 @@ impl Coordinator {
340352
});
341353
}
342354

355+
/// Schedules the next per-object arrangement sizes snapshot.
356+
///
357+
/// Re-reads the interval dyncfg on every call so operators can retune
358+
/// cadence without a restart. Aligns to an `organization_id`-seeded offset
359+
/// within the interval so collections stay consistent across restarts and
360+
/// don't synchronize across environments.
361+
pub async fn schedule_arrangement_sizes_collection(&self) {
362+
let interval_duration = mz_adapter_types::dyncfgs::ARRANGEMENT_SIZE_COLLECTION_INTERVAL
363+
.get(self.catalog().system_config().dyncfgs());
364+
365+
const SEED_LEN: usize = 32;
366+
let mut seed = [0; SEED_LEN];
367+
for (i, byte) in self
368+
.catalog()
369+
.state()
370+
.config()
371+
.environment_id
372+
.organization_id()
373+
.as_bytes()
374+
.into_iter()
375+
.take(SEED_LEN)
376+
.enumerate()
377+
{
378+
seed[i] = *byte;
379+
}
380+
let interval_ms: EpochMillis = EpochMillis::try_from(interval_duration.as_millis())
381+
.expect("arrangement_size_collection_interval must fit into u64");
382+
// `rand::random_range` panics on an empty range.
383+
let interval_ms = interval_ms.max(1);
384+
let offset = rngs::SmallRng::from_seed(seed).random_range(0..interval_ms);
385+
let now_ts: EpochMillis = self.peek_local_write_ts().await.into();
386+
387+
let previous_collection_ts = (now_ts - (now_ts % interval_ms)) + offset;
388+
let next_collection_ts = if previous_collection_ts > now_ts {
389+
previous_collection_ts
390+
} else {
391+
previous_collection_ts + interval_ms
392+
};
393+
let sleep_for = Duration::from_millis(next_collection_ts - now_ts);
394+
395+
let internal_cmd_tx = self.internal_cmd_tx.clone();
396+
task::spawn(|| "arrangement_sizes_collection", async move {
397+
tokio::time::sleep(sleep_for).await;
398+
// Best-effort: if the coordinator is shutting down, just drop.
399+
let _ = internal_cmd_tx.send(Message::ArrangementSizesSnapshot);
400+
});
401+
}
402+
403+
/// Snapshots the current contents of `mz_object_arrangement_sizes` and
404+
/// appends them to `mz_object_arrangement_size_history`, tagged with a
405+
/// shared `collection_timestamp`. Reschedules on completion.
406+
#[mz_ore::instrument(level = "debug")]
407+
async fn arrangement_sizes_snapshot(&mut self) {
408+
// The catalog server is not writable in read-only mode.
409+
if self.controller.read_only() {
410+
self.schedule_arrangement_sizes_collection().await;
411+
return;
412+
}
413+
414+
let collection_timer = self
415+
.metrics
416+
.arrangement_sizes_collection_time_seconds
417+
.start_timer();
418+
419+
let live_item_id = self
420+
.catalog()
421+
.resolve_builtin_storage_collection(
422+
&mz_catalog::builtin::MZ_OBJECT_ARRANGEMENT_SIZES_UNIFIED,
423+
);
424+
let live_global_id = self.catalog.get_entry(&live_item_id).latest_global_id();
425+
let history_item_id = self
426+
.catalog()
427+
.resolve_builtin_table(&mz_catalog::builtin::MZ_OBJECT_ARRANGEMENT_SIZE_HISTORY);
428+
429+
let read_ts = self.get_local_read_ts().await;
430+
let snapshot = match self
431+
.controller
432+
.storage_collections
433+
.snapshot(live_global_id, read_ts)
434+
.await
435+
{
436+
Ok(s) => s,
437+
Err(e) => {
438+
tracing::warn!("arrangement sizes snapshot failed: {e:?}");
439+
drop(collection_timer);
440+
self.schedule_arrangement_sizes_collection().await;
441+
return;
442+
}
443+
};
444+
445+
// `collection_ts` is stamped after the snapshot so it's always >= the
446+
// state the rows describe, and monotone across restarts. The snapshot
447+
// read and this stamp aren't atomic, but the resulting skew is bounded
448+
// by snapshot latency and negligible at this cadence.
449+
let collection_ts: EpochMillis = self.get_local_write_ts().await.timestamp.into();
450+
let collection_datum = Datum::TimestampTz(
451+
mz_ore::now::to_datetime(collection_ts)
452+
.try_into()
453+
.expect("collection_timestamp must fit into TimestampTz"),
454+
);
455+
456+
let mut consolidated = snapshot;
457+
differential_dataflow::consolidation::consolidate(&mut consolidated);
458+
459+
// Column positions in `mz_object_arrangement_sizes`.
460+
const LIVE_COL_REPLICA_ID: usize = 0;
461+
const LIVE_COL_OBJECT_ID: usize = 1;
462+
const LIVE_COL_SIZE: usize = 2;
463+
const LIVE_COL_COUNT: usize = 3;
464+
465+
let mut skipped_malformed: u64 = 0;
466+
let mut skipped_null_size: u64 = 0;
467+
let updates: Vec<BuiltinTableUpdate> = consolidated
468+
.into_iter()
469+
.filter_map(|(row, diff)| {
470+
if diff != 1 {
471+
return None;
472+
}
473+
let datums = row.unpack();
474+
// Surface schema drift via a warn log below rather than silently
475+
// skipping entire snapshots.
476+
if datums.len() != LIVE_COL_COUNT {
477+
skipped_malformed += 1;
478+
return None;
479+
}
480+
let replica_id = datums[LIVE_COL_REPLICA_ID].unwrap_str();
481+
let object_id = datums[LIVE_COL_OBJECT_ID].unwrap_str();
482+
let size_datum = datums[LIVE_COL_SIZE];
483+
// The history table's `size` is non-null; fabricating zero would
484+
// be misleading, so drop.
485+
if size_datum.is_null() {
486+
skipped_null_size += 1;
487+
return None;
488+
}
489+
let size = size_datum.unwrap_int64();
490+
let new_row = Row::pack_slice(&[
491+
Datum::String(replica_id),
492+
Datum::String(object_id),
493+
Datum::Int64(size),
494+
collection_datum,
495+
]);
496+
Some(BuiltinTableUpdate::row(history_item_id, new_row, Diff::ONE))
497+
})
498+
.collect();
499+
if skipped_malformed > 0 {
500+
warn!(
501+
"mz_object_arrangement_sizes schema drift: skipped {skipped_malformed} rows \
502+
with unexpected arity"
503+
);
504+
}
505+
if skipped_null_size > 0 {
506+
tracing::debug!("skipped {skipped_null_size} live rows with null size");
507+
}
508+
509+
let row_count = updates.len();
510+
// Captures snapshot + row construction. The async table-apply below
511+
// is captured separately by `mz_append_table_duration_seconds`.
512+
collection_timer.observe_duration();
513+
514+
if !updates.is_empty() {
515+
self.metrics
516+
.arrangement_sizes_rows_written
517+
.inc_by(u64::cast_from(row_count));
518+
// TODO(arrangement-sizes): when the writeable-catalog-server plumbing
519+
// in https://github.com/MaterializeInc/materialize/pull/35436 lands,
520+
// append directly on `mz_catalog_server` instead of going through
521+
// the environmentd builtin-table-update path.
522+
let (fut, _) = self.builtin_table_update().execute(updates).await;
523+
let internal_cmd_tx = self.internal_cmd_tx.clone();
524+
let task_span =
525+
info_span!(parent: None, "coord::arrangement_sizes_snapshot::table_updates");
526+
OpenTelemetryContext::obtain().attach_as_parent_to(&task_span);
527+
task::spawn(|| "arrangement_sizes_snapshot_apply", async move {
528+
fut.instrument(task_span).await;
529+
if let Err(e) = internal_cmd_tx.send(Message::ArrangementSizesSchedule) {
530+
warn!("internal_cmd_rx dropped before we could send: {e:?}");
531+
}
532+
});
533+
} else {
534+
self.schedule_arrangement_sizes_collection().await;
535+
}
536+
537+
tracing::debug!(
538+
"appended {row_count} rows to mz_object_arrangement_size_history at ts {collection_ts}"
539+
);
540+
}
541+
542+
#[mz_ore::instrument(level = "debug")]
543+
async fn arrangement_sizes_prune(&mut self, expired: Vec<BuiltinTableUpdate>) {
544+
let (fut, _) = self.builtin_table_update().execute(expired).await;
545+
task::spawn(|| "arrangement_sizes_pruning_apply", async move {
546+
fut.await;
547+
});
548+
}
549+
343550
#[mz_ore::instrument(level = "debug")]
344551
async fn message_command(&mut self, cmd: Command) {
345552
self.handle_command(cmd).await;

0 commit comments

Comments (0)