Skip to content

Commit dd4ac18

Browse files
authored
V8 heap metrics: Track instance memory usage for procedure workers too (#5122)
# Description of Changes

Prior to this PR, the `V8HeapMetrics` were tracked only for the "main" instance of a database, i.e. the reducer worker. This meant that we had little to no visibility into memory usage by procedures. In this PR, we also track values for the procedure workers.

We considered tracking each instance's usage separately with a unique integer `instance_id` label, but were concerned about cardinality (see discussion), so decided instead to track only two sets of label values per database: `JsWorkerKind::Main` and `JsWorkerKind::Procedure`. The entries for `JsWorkerKind::Procedure` store the sum of the values for all procedure workers for that database.

I also moved the logic for calling `remove_label_values` into an associated function on `V8HeapMetrics`, rather than listing them all in `remove_database_gauges`. This hides the fact that we have label values for both `JsWorkerKind` variants.

# API and ABI breaking changes

We don't use any of these metrics for billing, and otherwise do not consider our metrics a stable API.

# Expected complexity level and risk

2: it would be unfortunate if we reported incorrect values for these metrics, though (as mentioned above) they are not used for billing, only diagnostics.

# Testing

I do not know how to test metrics.
1 parent 0305a24 commit dd4ac18

3 files changed

Lines changed: 83 additions & 63 deletions

File tree

crates/core/src/host/host_controller.rs

Lines changed: 4 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
use super::module_host::{EventStatus, ModuleHost, ModuleInfo, NoSuchModule};
22
use super::scheduler::SchedulerStarter;
3+
use super::v8::V8HeapMetrics;
34
use super::wasmtime::WasmtimeRuntime;
45
use super::{Scheduler, UpdateDatabaseResult};
56
use crate::client::{ClientActorId, ClientName};
@@ -1411,26 +1412,8 @@ where
14111412
.data_size_blob_store_bytes_used_by_blobs
14121413
.remove_label_values(db);
14131414
let _ = WORKER_METRICS.wasm_memory_bytes.remove_label_values(db);
1414-
let worker_kind = crate::host::v8::V8_WORKER_KIND_MAIN;
1415-
let _ = WORKER_METRICS
1416-
.v8_total_heap_size_bytes
1417-
.remove_label_values(db, worker_kind);
1418-
let _ = WORKER_METRICS
1419-
.v8_total_physical_size_bytes
1420-
.remove_label_values(db, worker_kind);
1421-
let _ = WORKER_METRICS
1422-
.v8_used_global_handles_size_bytes
1423-
.remove_label_values(db, worker_kind);
1424-
let _ = WORKER_METRICS
1425-
.v8_used_heap_size_bytes
1426-
.remove_label_values(db, worker_kind);
1427-
let _ = WORKER_METRICS
1428-
.v8_heap_size_limit_bytes
1429-
.remove_label_values(db, worker_kind);
1430-
let _ = WORKER_METRICS
1431-
.v8_external_memory_bytes
1432-
.remove_label_values(db, worker_kind);
1433-
let _ = WORKER_METRICS.v8_native_contexts.remove_label_values(db, worker_kind);
1434-
let _ = WORKER_METRICS.v8_detached_contexts.remove_label_values(db, worker_kind);
1415+
1416+
V8HeapMetrics::remove_all_metric_label_values_for_database(db);
1417+
14351418
let _ = WORKER_METRICS.v8_request_queue_length.remove_label_values(db);
14361419
}

crates/core/src/host/v8/mod.rs

Lines changed: 62 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -169,17 +169,19 @@ impl V8Runtime {
169169
static V8_RUNTIME_GLOBAL: LazyLock<V8RuntimeInner> = LazyLock::new(V8RuntimeInner::init);
170170
const REDUCER_ARGS_BUFFER_SIZE: usize = 4_096;
171171
const JS_PROCEDURE_INSTANCE_QUEUE_CAPACITY: usize = 1;
172-
pub(crate) const V8_WORKER_KIND_MAIN: &str = "main";
173172

174173
#[derive(Copy, Clone)]
175-
enum JsWorkerKind {
174+
pub(crate) enum JsWorkerKind {
176175
Main,
177176
Procedure,
178177
}
179178

180-
impl JsWorkerKind {
181-
const fn checks_heap(self) -> bool {
182-
matches!(self, Self::Main)
179+
impl AsRef<str> for JsWorkerKind {
180+
fn as_ref(&self) -> &str {
181+
match self {
182+
Self::Main => "main",
183+
Self::Procedure => "procedure",
184+
}
183185
}
184186
}
185187

@@ -946,7 +948,7 @@ fn handle_detached_worker_request(
946948
}
947949
}
948950

949-
struct V8HeapMetrics {
951+
pub(in crate::host) struct V8HeapMetrics {
950952
total_heap_size_bytes: IntGauge,
951953
total_physical_size_bytes: IntGauge,
952954
used_global_handles_size_bytes: IntGauge,
@@ -955,6 +957,16 @@ struct V8HeapMetrics {
955957
external_memory_bytes: IntGauge,
956958
native_contexts: IntGauge,
957959
detached_contexts: IntGauge,
960+
961+
/// Previous values observed by this instance.
962+
///
963+
/// In [`Self::observe`], we use this to compute deltas against the new instance's values,
964+
/// then increment/decrement the metric values by those deltas.
965+
/// We do this rather than `set`ting the metric values as multiple instances may coexist
966+
/// and share the same metric label values.
967+
/// This happens when a database has multiple procedure workers running,
968+
/// and during a module update, as there is a period when the new version has already been created
969+
/// but the old version has not yet shut down.
958970
last_observed: V8HeapSnapshot,
959971
}
960972

@@ -986,32 +998,61 @@ impl V8HeapSnapshot {
986998
}
987999

9881000
impl V8HeapMetrics {
989-
fn new(database_identity: &Identity) -> Self {
1001+
pub(in crate::host) fn remove_all_metric_label_values_for_database(database_identity: &Identity) {
1002+
for worker_kind in [JsWorkerKind::Main, JsWorkerKind::Procedure] {
1003+
let _ = WORKER_METRICS
1004+
.v8_total_heap_size_bytes
1005+
.remove_label_values(database_identity, &worker_kind);
1006+
let _ = WORKER_METRICS
1007+
.v8_total_physical_size_bytes
1008+
.remove_label_values(database_identity, &worker_kind);
1009+
let _ = WORKER_METRICS
1010+
.v8_used_global_handles_size_bytes
1011+
.remove_label_values(database_identity, &worker_kind);
1012+
let _ = WORKER_METRICS
1013+
.v8_used_heap_size_bytes
1014+
.remove_label_values(database_identity, &worker_kind);
1015+
let _ = WORKER_METRICS
1016+
.v8_heap_size_limit_bytes
1017+
.remove_label_values(database_identity, &worker_kind);
1018+
let _ = WORKER_METRICS
1019+
.v8_external_memory_bytes
1020+
.remove_label_values(database_identity, &worker_kind);
1021+
let _ = WORKER_METRICS
1022+
.v8_native_contexts
1023+
.remove_label_values(database_identity, &worker_kind);
1024+
let _ = WORKER_METRICS
1025+
.v8_detached_contexts
1026+
.remove_label_values(database_identity, &worker_kind);
1027+
}
1028+
}
1029+
1030+
fn new(database_identity: &Identity, worker_kind: JsWorkerKind) -> Self {
9901031
Self {
9911032
total_heap_size_bytes: WORKER_METRICS
9921033
.v8_total_heap_size_bytes
993-
.with_label_values(database_identity, V8_WORKER_KIND_MAIN),
1034+
.with_label_values(database_identity, &worker_kind),
9941035
total_physical_size_bytes: WORKER_METRICS
9951036
.v8_total_physical_size_bytes
996-
.with_label_values(database_identity, V8_WORKER_KIND_MAIN),
1037+
.with_label_values(database_identity, &worker_kind),
9971038
used_global_handles_size_bytes: WORKER_METRICS
9981039
.v8_used_global_handles_size_bytes
999-
.with_label_values(database_identity, V8_WORKER_KIND_MAIN),
1040+
.with_label_values(database_identity, &worker_kind),
10001041
used_heap_size_bytes: WORKER_METRICS
10011042
.v8_used_heap_size_bytes
1002-
.with_label_values(database_identity, V8_WORKER_KIND_MAIN),
1043+
.with_label_values(database_identity, &worker_kind),
10031044
heap_size_limit_bytes: WORKER_METRICS
10041045
.v8_heap_size_limit_bytes
1005-
.with_label_values(database_identity, V8_WORKER_KIND_MAIN),
1046+
.with_label_values(database_identity, &worker_kind),
10061047
external_memory_bytes: WORKER_METRICS
10071048
.v8_external_memory_bytes
1008-
.with_label_values(database_identity, V8_WORKER_KIND_MAIN),
1049+
.with_label_values(database_identity, &worker_kind),
10091050
native_contexts: WORKER_METRICS
10101051
.v8_native_contexts
1011-
.with_label_values(database_identity, V8_WORKER_KIND_MAIN),
1052+
.with_label_values(database_identity, &worker_kind),
10121053
detached_contexts: WORKER_METRICS
10131054
.v8_detached_contexts
1014-
.with_label_values(database_identity, V8_WORKER_KIND_MAIN),
1055+
.with_label_values(database_identity, &worker_kind),
10151056
last_observed: V8HeapSnapshot::default(),
10161057
}
10171058
}
@@ -1031,6 +1072,8 @@ impl V8HeapMetrics {
10311072
}
10321073

10331074
fn observe(&mut self, stats: &v8::HeapStatistics) {
1075+
// See doc comment on `Self::last_observed` for why we compute a delta and apply it to the metrics value
1076+
// rather than directly calling `set`.
10341077
let next = V8HeapSnapshot::from_stats(stats);
10351078
self.adjust_by(V8HeapSnapshot {
10361079
total_heap_size_bytes: next.total_heap_size_bytes - self.last_observed.total_heap_size_bytes,
@@ -1625,9 +1668,7 @@ where
16251668
let info = &module_common.info();
16261669
let mut instance_common = InstanceCommon::new(&module_common);
16271670
let replica_ctx: &Arc<ReplicaContext> = module_common.replica_ctx();
1628-
let mut heap_metrics = worker_kind
1629-
.checks_heap()
1630-
.then(|| V8HeapMetrics::new(&info.database_identity));
1671+
let mut heap_metrics = V8HeapMetrics::new(&info.database_identity, worker_kind);
16311672

16321673
let mut inst = V8Instance {
16331674
scope,
@@ -1639,9 +1680,7 @@ where
16391680
.with_label_values(&info.database_identity),
16401681
initial_heap_limit: heap_policy.heap_limit_bytes,
16411682
};
1642-
if let Some(heap_metrics) = heap_metrics.as_mut() {
1643-
let _initial_heap_stats = sample_heap_stats(inst.scope, heap_metrics);
1644-
}
1683+
let _initial_heap_stats = sample_heap_stats(inst.scope, &mut heap_metrics);
16451684

16461685
// Process requests to the worker.
16471686
//
@@ -1655,9 +1694,7 @@ where
16551694
let mut outcome =
16561695
W::handle_request(request, &mut instance_common, &mut inst, &module_common, replica_ctx);
16571696

1658-
if let WorkerRequestOutcome::Continue = outcome
1659-
&& let Some(heap_metrics) = heap_metrics.as_mut()
1660-
{
1697+
if let WorkerRequestOutcome::Continue = outcome {
16611698
let request_check_due = heap_policy.heap_check_request_interval.is_some_and(|interval| {
16621699
requests_since_heap_check += 1;
16631700
requests_since_heap_check >= interval
@@ -1669,7 +1706,7 @@ where
16691706
requests_since_heap_check = 0;
16701707
last_heap_check_at = Instant::now();
16711708
if let Some((used, limit)) =
1672-
should_retire_worker_for_heap(inst.scope, heap_metrics, heap_policy)
1709+
should_retire_worker_for_heap(inst.scope, &mut heap_metrics, heap_policy)
16731710
{
16741711
outcome = outcome.recreate_instance();
16751712
log::warn!(

crates/core/src/worker_metrics/mod.rs

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
1-
use crate::hash::Hash;
21
use crate::messages::control_db::HostType;
32
use crate::subscription::row_list_builder_pool::BsatnRowListBuilderPool;
3+
use crate::{hash::Hash, host::v8::JsWorkerKind};
44
use once_cell::sync::Lazy;
55
use prometheus::{GaugeVec, HistogramVec, IntCounterVec, IntGaugeVec};
66
use spacetimedb_datastore::execution_context::WorkloadType;
@@ -288,28 +288,28 @@ metrics_group!(
288288
pub wasm_memory_bytes: IntGaugeVec,
289289

290290
#[name = spacetime_worker_v8_total_heap_size_bytes]
291-
#[help = "The total size of the V8 heap for a database's tracked JS worker kind (currently main only)"]
292-
#[labels(database_identity: Identity, worker_kind: str)]
291+
#[help = "The total size of the V8 heap for a database's JS workers"]
292+
#[labels(database_identity: Identity, worker_kind: JsWorkerKind)]
293293
pub v8_total_heap_size_bytes: IntGaugeVec,
294294

295295
#[name = spacetime_worker_v8_total_physical_size_bytes]
296-
#[help = "The total committed physical V8 heap memory for a database's tracked JS worker kind (currently main only)"]
297-
#[labels(database_identity: Identity, worker_kind: str)]
296+
#[help = "The total committed physical V8 heap memory for a database's JS workers"]
297+
#[labels(database_identity: Identity, worker_kind: JsWorkerKind)]
298298
pub v8_total_physical_size_bytes: IntGaugeVec,
299299

300300
#[name = spacetime_worker_v8_used_global_handles_size_bytes]
301-
#[help = "The used size of V8 global handles for a database's tracked JS worker kind (currently main only)"]
302-
#[labels(database_identity: Identity, worker_kind: str)]
301+
#[help = "The used size of V8 global handles for a database's JS workers"]
302+
#[labels(database_identity: Identity, worker_kind: JsWorkerKind)]
303303
pub v8_used_global_handles_size_bytes: IntGaugeVec,
304304

305305
#[name = spacetime_worker_v8_used_heap_size_bytes]
306-
#[help = "The live V8 heap size for a database's tracked JS worker kind (currently main only)"]
307-
#[labels(database_identity: Identity, worker_kind: str)]
306+
#[help = "The live V8 heap size for a database's JS workers"]
307+
#[labels(database_identity: Identity, worker_kind: JsWorkerKind)]
308308
pub v8_used_heap_size_bytes: IntGaugeVec,
309309

310310
#[name = spacetime_worker_v8_heap_size_limit_bytes]
311-
#[help = "The V8 heap size limit for a database's tracked JS worker kind (currently main only)"]
312-
#[labels(database_identity: Identity, worker_kind: str)]
311+
#[help = "The V8 heap size limit for a database's JS workers"]
312+
#[labels(database_identity: Identity, worker_kind: JsWorkerKind)]
313313
pub v8_heap_size_limit_bytes: IntGaugeVec,
314314

315315
#[name = spacetime_worker_v8_heap_limit_hit]
@@ -318,18 +318,18 @@ metrics_group!(
318318
pub v8_heap_limit_hit: IntCounterVec,
319319

320320
#[name = spacetime_worker_v8_external_memory_bytes]
321-
#[help = "The external memory tracked by V8 for a database's tracked JS worker kind (currently main only)"]
322-
#[labels(database_identity: Identity, worker_kind: str)]
321+
#[help = "The external memory tracked by V8 for a database's JS workers"]
322+
#[labels(database_identity: Identity, worker_kind: JsWorkerKind)]
323323
pub v8_external_memory_bytes: IntGaugeVec,
324324

325325
#[name = spacetime_worker_v8_native_contexts]
326-
#[help = "The number of native V8 contexts for a database's tracked JS worker kind (currently main only)"]
327-
#[labels(database_identity: Identity, worker_kind: str)]
326+
#[help = "The number of native V8 contexts for a database's JS workers"]
327+
#[labels(database_identity: Identity, worker_kind: JsWorkerKind)]
328328
pub v8_native_contexts: IntGaugeVec,
329329

330330
#[name = spacetime_worker_v8_detached_contexts]
331-
#[help = "The number of detached V8 contexts for a database's tracked JS worker kind (currently main only)"]
332-
#[labels(database_identity: Identity, worker_kind: str)]
331+
#[help = "The number of detached V8 contexts for a database's JS workers"]
332+
#[labels(database_identity: Identity, worker_kind: JsWorkerKind)]
333333
pub v8_detached_contexts: IntGaugeVec,
334334

335335
#[name = spacetime_worker_v8_request_queue_length]

0 commit comments

Comments
 (0)