Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8e664fc
WIP: counter latency monitor
NathanFlurry May 10, 2026
c5e2cf1
feat(rivetkit-core): expose metrics endpoint
NathanFlurry May 11, 2026
1455f67
fix(sqlite): fence remote requests by generation
NathanFlurry May 11, 2026
88102f1
fix(runtime): avoid bounded dispatch channel stalls
NathanFlurry May 11, 2026
a8c93fa
fix(rivetkit): expose client actor metadata
NathanFlurry May 11, 2026
12f4efc
feat(kitchen-sink): add load testing harness
NathanFlurry May 11, 2026
063f2e1
docs(serverless): document timeout tuning
NathanFlurry May 11, 2026
9b85cf3
feat(rivetkit): expose low-cardinality metrics routes
NathanFlurry May 11, 2026
5bed104
feat(rivetkit): add current actor metrics
NathanFlurry May 11, 2026
dc3f075
fix(rivetkit): require engine ping for health
NathanFlurry May 11, 2026
b717224
fix(wasm): point wasm-pack build to new wasm-bindgen repo
abcxff May 11, 2026
04d580f
fix(envoy-client): dont abort drain after 20s
MasterPtato May 12, 2026
909edbc
fix(rivetkit-core): decrement active actor metrics
NathanFlurry May 12, 2026
ad401cc
fix(rivetkit): exit pid1 after signal shutdown
NathanFlurry May 12, 2026
1debedc
fix(rivetkit): use engine actor stop threshold for shutdown
NathanFlurry May 12, 2026
a777712
feat(kitchen-sink): rust counter-latency harness
NathanFlurry May 12, 2026
c73cd5d
chore(kitchen-sink): refresh bench + smoke scripts
NathanFlurry May 12, 2026
407633a
chore(kitchen-sink): counter actor + sigterm probe tweaks
NathanFlurry May 12, 2026
58f88e0
feat(kitchen-sink): ws-ping fast-path on tunnel-stress + load-test-agent
NathanFlurry May 18, 2026
e902ea3
test(depot-client): stale vfs cache reads fail closed
NathanFlurry May 20, 2026
b62b00b
test(depot-client): head fence read poisons vfs
NathanFlurry May 20, 2026
989dc51
test(depot-client): vfs stale page cache writer
NathanFlurry May 20, 2026
955920a
test(depot-client): delayed read ahead stale pages
NathanFlurry May 20, 2026
3162460
test(depot-client): startup preload stale pages
NathanFlurry May 20, 2026
ab06f95
test(rivetkit-core): sqlite lifecycle fuzz harness
NathanFlurry May 20, 2026
260927e
chore(kitchen-sink): agent load test
NathanFlurry May 20, 2026
d1e97b6
test(depot-client): batch atomic cap repro
NathanFlurry May 20, 2026
6cebc46
test(depot-client): warm pidx stale read rmw repro
NathanFlurry May 20, 2026
fab8ec5
test(depot-client): natural warm pidx repro
NathanFlurry May 20, 2026
27cbff1
test(depot-client): natural reopen warm pidx repro
NathanFlurry May 20, 2026
49e17ec
fix(rivetkit): Check for runner config before upserting
MasterPtato May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .agent/notes/driver-test-progress.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,11 @@ Scope: DB driver tests only
- 2026-05-08 16:52 PDT raw-websocket [native/local/bare]: PASS (16 passed, 14.0s). Covers raw websocket callback tracking.
- 2026-05-08 16:52 PDT actor-conn-state [native/local/bare]: PASS (11 passed, 10.9s). Covers connection lifecycle and disconnect accounting through the work registry.
- 2026-05-08 16:52 PDT actor-sleep-db [native/local/bare]: PASS (26 passed, 70.9s). Covers DB close timing during sleep shutdown and waitUntil state persistence.
- 2026-05-09 04:48 PDT DB TESTS RERUN STARTED [native only] - validating async websocket close handler behavior.
- 2026-05-09 04:48 PDT actor-db rerun [native]: PASS (13 passed, 104 skipped, 18.3s).
- 2026-05-09 04:48 PDT actor-db-raw rerun [native]: PASS (5 passed, 40 skipped, 4.9s).
- 2026-05-09 04:48 PDT actor-db-pragma-migration rerun [native]: PASS (4 passed, 32 skipped, 4.3s).
- 2026-05-09 04:48 PDT actor-sleep-db rerun [native]: PASS (26 passed, 208 skipped, 63.8s). Includes `async websocket close handler can use c.db before sleep completes` and `async websocket addEventListener close handler can use c.db before sleep completes`.
- 2026-05-09 04:48 PDT actor-db-stress rerun [native]: PASS (5 passed, 40 skipped, 31.3s).
- 2026-05-09 04:48 PDT actor-db-init-order rerun [native]: PASS (6 passed, 48 skipped, 6.3s).
- 2026-05-09 04:48 PDT DB TESTS RERUN COMPLETE [native only] - 6/6 DB file groups passed. Async close handler tests included.
16 changes: 2 additions & 14 deletions .github/workflows/publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ jobs:
if-no-files-found: error

# ---------------------------------------------------------------------------
# build-wasm — wasm package artifact built in parallel with native artifacts
# build-wasm — disabled
# ---------------------------------------------------------------------------
build-wasm:
needs: [context]
Expand Down Expand Up @@ -319,12 +319,11 @@ jobs:
# publish — npm publish + R2 upload + Docker manifest + release tail
# ---------------------------------------------------------------------------
publish:
needs: [context, build, build-wasm, docker-images]
needs: [context, build, docker-images]
name: "Publish"
if: |
!cancelled() &&
needs.build.result == 'success' &&
needs.build-wasm.result == 'success' &&
needs.docker-images.result == 'success'
runs-on: depot-ubuntu-24.04-8
permissions:
Expand Down Expand Up @@ -364,17 +363,6 @@ jobs:
path: engine-artifacts
pattern: engine-*
merge-multiple: true
- name: Download wasm package artifact
uses: actions/download-artifact@v4
with:
name: wasm-package
path: rivetkit-typescript/packages/rivetkit-wasm/pkg
- name: Validate wasm package artifact
run: |
test -f rivetkit-typescript/packages/rivetkit-wasm/pkg/rivetkit_wasm.js
test -f rivetkit-typescript/packages/rivetkit-wasm/pkg/rivetkit_wasm.d.ts
test -f rivetkit-typescript/packages/rivetkit-wasm/pkg/rivetkit_wasm_bg.wasm

- name: Place native binaries in platform packages
run: |
NATIVE_DIR=rivetkit-typescript/packages/rivetkit-napi
Expand Down
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Design constraints, invariants, and reference commands for the Rivet monorepo. F

- Avoid raw `f64` fields in vbare protocol schemas that use hashable maps; generated Rust derives `Eq`/`Hash`, so encode floats as fixed bytes or an ordered wrapper.
- Version converters must manually map fields between versions; never use serialize-deserialize round trips such as `transcode_version` or `serde_bare::to_vec` plus `from_slice`.
- RivetKit client/server protocol compatibility assumes the server/runtime is newer than the client; clients send their latest request protocol version, and servers handle older-client compatibility and negotiation.

When talking about "Rivet Actors" make sure to capitalize "Rivet Actor" as a proper noun and lowercase "actor" as a generic noun.

Expand Down
8 changes: 4 additions & 4 deletions docs-internal/engine/SQLITE_METRICS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
- `sqlite_commit_dirty_page_count{path}`: Histogram of dirty page counts per commit path.
- `sqlite_commit_dirty_bytes{path}`: Histogram of raw dirty-page bytes per commit path.
- `sqlite_udb_ops_per_commit{path}`: Histogram of UniversalDB operations per commit path.
- `sqlite_commit_envoy_dispatch_duration_seconds`: Pegboard-envoy histogram for websocket frame arrival to `depot` dispatch.
- `sqlite_commit_envoy_response_duration_seconds`: Pegboard-envoy histogram for `depot` return to websocket response send.
- `envoy_sqlite_commit_dispatch_duration_seconds`: Pegboard-envoy histogram for websocket frame arrival to `depot` dispatch.
- `envoy_sqlite_commit_response_duration_seconds`: Pegboard-envoy histogram for `depot` return to websocket response send.
- `sqlite_commit_phases`: Actor inspector labeled timing metric exposed from `/inspector/metrics`. Values are `request_build`, `serialize`, `transport`, and `state_update`.

## Scrape Points
Expand All @@ -23,8 +23,8 @@

## Diagnosis

- High `decode_request` or `sqlite_commit_envoy_dispatch_duration_seconds` usually means envoy-side validation or actor lookup is slow before storage work starts.
- High `decode_request` or `envoy_sqlite_commit_dispatch_duration_seconds` usually means envoy-side validation or actor lookup is slow before storage work starts.
- High `meta_read` or `pidx_read` points at UniversalDB read pressure or cache misses.
- High `ltx_encode` means commit encoding and compression are doing real work. Check dirty page counts and raw dirty bytes together.
- High `udb_write`, `meta_write`, or `sqlite_commit_envoy_response_duration_seconds` points at write-path latency after encode.
- High `udb_write`, `meta_write`, or `envoy_sqlite_commit_response_duration_seconds` points at write-path latency after encode.
- A healthy actor should show non-zero `sqlite_commit_phases` totals after commits in `/inspector/metrics`. If SQL runs but those timings stay zero, the native VFS metrics path is broken.
8 changes: 4 additions & 4 deletions engine/packages/config/src/config/pegboard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ pub struct Pegboard {

impl Pegboard {
pub fn base_retry_timeout(&self) -> usize {
self.base_retry_timeout.unwrap_or(2000)
self.base_retry_timeout.unwrap_or(2_000)
}

pub fn actor_allocation_threshold(&self) -> i64 {
Expand All @@ -177,7 +177,7 @@ impl Pegboard {
}

pub fn actor_retry_duration_threshold(&self) -> i64 {
self.actor_retry_duration_threshold.unwrap_or(300_000)
self.actor_retry_duration_threshold.unwrap_or(5 * 60 * 1000)
}

pub fn retry_reset_duration(&self) -> i64 {
Expand All @@ -202,7 +202,7 @@ impl Pegboard {
}

pub fn serverless_base_retry_timeout(&self) -> usize {
self.serverless_base_retry_timeout.unwrap_or(2000)
self.serverless_base_retry_timeout.unwrap_or(2_000)
}

pub fn serverless_retry_reset_duration(&self) -> i64 {
Expand Down Expand Up @@ -237,7 +237,7 @@ impl Pegboard {

pub fn gateway_response_start_timeout_ms(&self) -> u64 {
self.gateway_response_start_timeout_ms
.unwrap_or(5 * 60 * 1000) // 5 minutes
.unwrap_or(5 * 60 * 1000)
}

pub fn gateway_update_ping_interval_ms(&self) -> u64 {
Expand Down
37 changes: 34 additions & 3 deletions engine/packages/depot-client/src/database.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::sync::Arc;

use anyhow::{Result, anyhow};
use async_trait::async_trait;
use rivet_envoy_protocol as protocol;
use tokio::runtime::Handle;

use crate::{
Expand All @@ -23,6 +25,30 @@ pub fn vfs_name_for_actor_database(actor_id: &str, generation: u64) -> String {
format!("envoy-sqlite-{actor_id}-g{generation}")
}

/// Transport wrapper that pairs an inner SQLite transport with a fixed
/// generation number. Its `SqliteTransport` impl fills that generation into
/// outgoing requests that do not already carry an `expected_generation`.
struct GenerationFencedTransport {
// Underlying transport that actually performs `get_pages` / `commit`.
inner: SqliteTransportHandle,
// Generation used as the default `expected_generation` for requests.
generation: u64,
}

#[async_trait]
impl crate::vfs::SqliteTransport for GenerationFencedTransport {
    /// Forwards a page read to the inner transport, defaulting the request's
    /// `expected_generation` to this wrapper's generation when it is unset.
    async fn get_pages(
        &self,
        mut request: protocol::SqliteGetPagesRequest,
    ) -> Result<protocol::SqliteGetPagesResponse> {
        // An explicitly provided generation is left untouched.
        if request.expected_generation.is_none() {
            request.expected_generation = Some(self.generation);
        }

        let response = self.inner.get_pages(request).await;
        response
    }

    /// Forwards a commit to the inner transport, defaulting the request's
    /// `expected_generation` to this wrapper's generation when it is unset.
    async fn commit(
        &self,
        mut request: protocol::SqliteCommitRequest,
    ) -> Result<protocol::SqliteCommitResponse> {
        // An explicitly provided generation is left untouched.
        if request.expected_generation.is_none() {
            request.expected_generation = Some(self.generation);
        }

        let response = self.inner.commit(request).await;
        response
    }
}

pub async fn open_database_from_transport(
transport: SqliteTransportHandle,
actor_id: String,
Expand All @@ -32,9 +58,14 @@ pub async fn open_database_from_transport(
) -> Result<NativeDatabaseHandle> {
let vfs_name = vfs_name_for_actor_database(&actor_id, generation);
let config = VfsConfig::default();
let initial_pages = fetch_initial_pages_for_registration(transport.clone(), &actor_id, &config)
.await
.map_err(|e| anyhow!("failed to preload sqlite pages: {e}"))?;
let transport: SqliteTransportHandle = Arc::new(GenerationFencedTransport {
inner: transport,
generation,
});
let initial_pages =
fetch_initial_pages_for_registration(transport.clone(), &actor_id, generation, &config)
.await
.map_err(|e| anyhow!("failed to preload sqlite pages: {e}"))?;
let vfs = Arc::new(
SqliteVfs::register_with_transport_and_initial_pages(
&vfs_name,
Expand Down
26 changes: 24 additions & 2 deletions engine/packages/depot-client/src/optimization_flags.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ pub const VFS_PAGE_CACHE_CAPACITY_PAGES_ENV: &str =
"RIVETKIT_SQLITE_OPT_VFS_PAGE_CACHE_CAPACITY_PAGES";
pub const VFS_PROTECTED_CACHE_PAGES_ENV: &str = "RIVETKIT_SQLITE_OPT_VFS_PROTECTED_CACHE_PAGES";
pub const VFS_STAGING_CACHE_TTL_MS_ENV: &str = "RIVETKIT_SQLITE_OPT_VFS_STAGING_CACHE_TTL_MS";
pub const VFS_RETAIN_READ_CACHE_ENV: &str = "RIVETKIT_SQLITE_OPT_VFS_RETAIN_READ_CACHE";

pub const DEFAULT_STARTUP_PRELOAD_MAX_BYTES: usize = 1024 * 1024;
pub const MAX_STARTUP_PRELOAD_MAX_BYTES: usize = 8 * 1024 * 1024;
pub const MAX_STARTUP_PRELOAD_MAX_BYTES: usize = 64 * 1024 * 1024;
pub const DEFAULT_STARTUP_PRELOAD_FIRST_PAGE_COUNT: u32 = 1;
pub const MAX_STARTUP_PRELOAD_FIRST_PAGE_COUNT: u32 = 256;
pub const MAX_STARTUP_PRELOAD_FIRST_PAGE_COUNT: u32 = 16_384;
pub const DEFAULT_VFS_PAGE_CACHE_CAPACITY_PAGES: u64 = 50_000;
pub const MAX_VFS_PAGE_CACHE_CAPACITY_PAGES: u64 = 500_000;
pub const DEFAULT_VFS_PROTECTED_CACHE_PAGES: usize = 512;
Expand Down Expand Up @@ -106,6 +107,7 @@ pub struct SqliteOptimizationFlags {
pub vfs_page_cache_capacity_pages: u64,
pub vfs_protected_cache_pages: usize,
pub vfs_staging_cache_ttl_ms: u64,
pub vfs_retain_read_cache: bool,
}

impl Default for SqliteOptimizationFlags {
Expand Down Expand Up @@ -133,6 +135,7 @@ impl Default for SqliteOptimizationFlags {
vfs_page_cache_capacity_pages: DEFAULT_VFS_PAGE_CACHE_CAPACITY_PAGES,
vfs_protected_cache_pages: DEFAULT_VFS_PROTECTED_CACHE_PAGES,
vfs_staging_cache_ttl_ms: DEFAULT_VFS_STAGING_CACHE_TTL_MS,
vfs_retain_read_cache: false,
}
}
}
Expand Down Expand Up @@ -206,6 +209,9 @@ impl SqliteOptimizationFlags {
DEFAULT_VFS_STAGING_CACHE_TTL_MS,
MAX_VFS_STAGING_CACHE_TTL_MS,
),
vfs_retain_read_cache: disabled_by_default(
read_env(VFS_RETAIN_READ_CACHE_ENV).as_deref(),
),
}
}
}
Expand All @@ -228,6 +234,20 @@ fn enabled_by_default(value: Option<&str>) -> bool {
_ => true,
}
}

/// Interprets an optional flag string as an opt-in boolean.
///
/// Returns `true` only when `value` is present and, after trimming
/// surrounding whitespace, matches one of the accepted "on" spellings
/// (ASCII case-insensitive). A missing value or any other text yields `false`.
fn disabled_by_default(value: Option<&str>) -> bool {
    // Spellings that switch the flag on; everything else leaves it off.
    const TRUTHY: [&str; 6] = ["1", "true", "on", "yes", "enabled", "enable"];

    match value {
        None => false,
        Some(raw) => {
            let candidate = raw.trim();
            TRUTHY
                .iter()
                .any(|accepted| candidate.eq_ignore_ascii_case(accepted))
        }
    }
}
fn usize_bounded_by_default(value: Option<&str>, default: usize, max: usize) -> usize {
value
.and_then(|value| value.trim().parse::<usize>().ok())
Expand Down Expand Up @@ -318,6 +338,7 @@ mod tests {
VFS_PAGE_CACHE_CAPACITY_PAGES_ENV => Some("0".to_string()),
VFS_PROTECTED_CACHE_PAGES_ENV => Some("0".to_string()),
VFS_STAGING_CACHE_TTL_MS_ENV => Some("0".to_string()),
VFS_RETAIN_READ_CACHE_ENV => Some("true".to_string()),
_ => None,
});

Expand All @@ -339,6 +360,7 @@ mod tests {
assert_eq!(flags.vfs_page_cache_capacity_pages, 0);
assert_eq!(flags.vfs_protected_cache_pages, 0);
assert_eq!(flags.vfs_staging_cache_ttl_ms, 0);
assert!(flags.vfs_retain_read_cache);
}

#[test]
Expand Down
Loading
Loading