From 59858bf2b2065ac74b081ce13a2f1fe5ed0b8d82 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Tue, 21 Apr 2026 20:38:21 -0700 Subject: [PATCH] chore: sqlite v1 to v2 data migration --- Cargo.lock | 46 +- Cargo.toml | 1 + engine/packages/pegboard-envoy/Cargo.toml | 2 + engine/packages/pegboard-envoy/src/metrics.rs | 33 + .../pegboard-envoy/src/sqlite_runtime.rs | 1108 ++- .../pegboard-envoy/src/ws_to_tunnel_task.rs | 1 + .../sqlite-storage/examples/bench_rtt.rs | 1 + engine/packages/sqlite-storage/src/commit.rs | 17 +- .../sqlite-storage/src/compaction/shard.rs | 9 +- .../sqlite-storage/src/compaction/worker.rs | 3 +- engine/packages/sqlite-storage/src/engine.rs | 27 +- engine/packages/sqlite-storage/src/keys.rs | 7 + engine/packages/sqlite-storage/src/quota.rs | 2 + engine/packages/sqlite-storage/src/read.rs | 9 +- .../packages/sqlite-storage/src/takeover.rs | 160 +- engine/packages/sqlite-storage/src/types.rs | 83 + engine/packages/sqlite-storage/src/udb.rs | 63 + .../packages/rivetkit-core/src/registry.rs | 5 +- .../packages/rivetkit-core/src/sqlite.rs | 34 +- .../packages/rivetkit-sqlite/Cargo.toml | 3 +- .../examples/v1_baseline_bench.rs | 400 -- .../packages/rivetkit-sqlite/src/database.rs | 176 +- .../packages/rivetkit-sqlite/src/kv.rs | 198 - .../packages/rivetkit-sqlite/src/lib.rs | 26 +- .../packages/rivetkit-sqlite/src/query.rs | 2 +- .../packages/rivetkit-sqlite/src/sqlite_kv.rs | 114 - .../packages/rivetkit-sqlite/src/v2/mod.rs | 1 - .../packages/rivetkit-sqlite/src/v2/vfs.rs | 5077 -------------- .../packages/rivetkit-sqlite/src/vfs.rs | 6167 ++++++++++++----- .../packages/rivetkit-napi/index.d.ts | 2 +- .../rivetkit-napi/src/bridge_actor.rs | 16 - .../packages/rivetkit-napi/src/database.rs | 24 +- .../rivetkit-napi/src/envoy_handle.rs | 11 +- .../packages/rivetkit-napi/src/lib.rs | 6 +- .../packages/rivetkit-napi/src/sqlite_db.rs | 1 - .../packages/rivetkit-napi/wrapper.d.ts | 1 - .../packages/rivetkit-napi/wrapper.js | 14 +- 37 files changed, 6125 insertions(+), 7725 deletions(-) delete mode 100644 rivetkit-rust/packages/rivetkit-sqlite/examples/v1_baseline_bench.rs delete mode 100644 rivetkit-rust/packages/rivetkit-sqlite/src/kv.rs delete mode 100644 rivetkit-rust/packages/rivetkit-sqlite/src/sqlite_kv.rs delete mode 100644 rivetkit-rust/packages/rivetkit-sqlite/src/v2/mod.rs delete mode 100644 rivetkit-rust/packages/rivetkit-sqlite/src/v2/vfs.rs diff --git a/Cargo.lock b/Cargo.lock index 9fae14b8cb..1359af98f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1594,6 +1594,18 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fastrand" version = "2.3.0" @@ -2004,6 +2016,15 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "hashlink" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "hdrhistogram" version = "7.5.4" @@ -3520,11 +3541,13 @@ 
dependencies = [ "rivet-metrics", "rivet-runtime", "rivet-types", + "rusqlite", "scc", "serde", "serde_bare", "serde_json", "sqlite-storage", + "tempfile", "tokio", "tokio-tungstenite", "tracing", @@ -3840,7 +3863,7 @@ dependencies = [ "base64 0.22.1", "byteorder", "bytes", - "fallible-iterator", + "fallible-iterator 0.2.0", "hmac", "md-5", "memchr", @@ -3857,7 +3880,7 @@ checksum = "ef4605b7c057056dd35baeb6ac0c0338e4975b1f2bef0f65da953285eb007095" dependencies = [ "bytes", "chrono", - "fallible-iterator", + "fallible-iterator 0.2.0", "postgres-protocol", ] @@ -5273,7 +5296,6 @@ name = "rivetkit-sqlite" version = "2.3.0-rc.4" dependencies = [ "anyhow", - "async-trait", "getrandom 0.2.16", "libsqlite3-sys", "moka", @@ -5326,6 +5348,20 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "rusqlite" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e" +dependencies = [ + "bitflags", + "fallible-iterator 0.3.0", + "fallible-streaming-iterator", + "hashlink 0.9.1", + "libsqlite3-sys", + "smallvec", +] + [[package]] name = "rust-multipart-rfc7578_2" version = "0.8.0" @@ -6583,7 +6619,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "fallible-iterator", + "fallible-iterator 0.2.0", "futures-channel", "futures-util", "log", @@ -7983,7 +8019,7 @@ checksum = "8902160c4e6f2fb145dbe9d6760a75e3c9522d8bf796ed7047c85919ac7115f8" dependencies = [ "arraydeque", "encoding_rs", - "hashlink", + "hashlink 0.8.4", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 196698ce0c..2c64cb6c3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -120,6 +120,7 @@ members = [ rand = "0.8" regex = "1.4" rstest = "0.26.1" + rusqlite = { version = "0.32.1", features = [ "bundled" ] } rustls-pemfile = "2.2.0" rustyline = "15.0.0" scc = "3.6.12" diff --git a/engine/packages/pegboard-envoy/Cargo.toml b/engine/packages/pegboard-envoy/Cargo.toml index 05e7685a0a..3be93a750f 100644 --- a/engine/packages/pegboard-envoy/Cargo.toml +++ b/engine/packages/pegboard-envoy/Cargo.toml @@ -32,6 +32,7 @@ serde_bare.workspace = true serde_json.workspace = true serde.workspace = true sqlite-storage.workspace = true +tempfile.workspace = true tokio-tungstenite.workspace = true tokio.workspace = true tracing.workspace = true @@ -39,6 +40,7 @@ universaldb.workspace = true universalpubsub.workspace = true url.workspace = true vbare.workspace = true +rusqlite.workspace = true pegboard.workspace = true namespace.workspace = true diff --git a/engine/packages/pegboard-envoy/src/metrics.rs b/engine/packages/pegboard-envoy/src/metrics.rs index 563e1de769..829a2d6ec6 100644 --- a/engine/packages/pegboard-envoy/src/metrics.rs +++ b/engine/packages/pegboard-envoy/src/metrics.rs @@ -55,4 +55,37 @@ lazy_static::lazy_static! 
{
 		BUCKETS.to_vec(),
 		*REGISTRY
 	).unwrap();
+
+	pub static ref SQLITE_MIGRATION_ATTEMPTS_TOTAL: IntCounter = register_int_counter_with_registry!(
+		"pegboard_envoy_sqlite_migration_attempts_total",
+		"Total number of sqlite v1 to v2 migration attempts.",
+		*REGISTRY
+	).unwrap();
+
+	pub static ref SQLITE_MIGRATION_SUCCESSES_TOTAL: IntCounter = register_int_counter_with_registry!(
+		"pegboard_envoy_sqlite_migration_successes_total",
+		"Total number of sqlite v1 to v2 migrations that completed successfully.",
+		*REGISTRY
+	).unwrap();
+
+	pub static ref SQLITE_MIGRATION_FAILURES_TOTAL: IntCounterVec = register_int_counter_vec_with_registry!(
+		"pegboard_envoy_sqlite_migration_failures_total",
+		"Total number of sqlite v1 to v2 migration failures by phase.",
+		&["phase"],
+		*REGISTRY
+	).unwrap();
+
+	pub static ref SQLITE_MIGRATION_DURATION: Histogram = register_histogram_with_registry!(
+		"pegboard_envoy_sqlite_migration_duration_seconds",
+		"Duration of sqlite v1 to v2 migrations.",
+		BUCKETS.to_vec(),
+		*REGISTRY
+	).unwrap();
+
+	pub static ref SQLITE_MIGRATION_PAGES: Histogram = register_histogram_with_registry!(
+		"pegboard_envoy_sqlite_migration_pages",
+		"Number of pages imported during sqlite v1 to v2 migration.",
+		BUCKETS.to_vec(),
+		*REGISTRY
+	).unwrap();
 }
diff --git a/engine/packages/pegboard-envoy/src/sqlite_runtime.rs b/engine/packages/pegboard-envoy/src/sqlite_runtime.rs
index 49c8067a42..4d3ecfc383 100644
--- a/engine/packages/pegboard-envoy/src/sqlite_runtime.rs
+++ b/engine/packages/pegboard-envoy/src/sqlite_runtime.rs
@@ -1,16 +1,43 @@
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
+use std::time::Instant;
 
-use anyhow::Result;
+use anyhow::{Context, Result, ensure};
 use gas::prelude::{Id, StandaloneCtx, util::timestamp};
+use pegboard::actor_kv::Recipient;
 use rivet_envoy_protocol::{self as protocol, PROTOCOL_VERSION};
+use rusqlite::Connection;
+use scc::{HashMap, hash_map::Entry};
 use sqlite_storage::{
-	compaction::CompactionCoordinator, engine::SqliteEngine, takeover::TakeoverConfig,
-	types::SQLITE_VFS_V2_SCHEMA_VERSION,
+	commit::{CommitFinalizeRequest, CommitStageBeginRequest, CommitStageRequest},
+	compaction::CompactionCoordinator,
+	engine::SqliteEngine,
+	ltx::{LtxHeader, encode_ltx_v3},
+	takeover::TakeoverConfig,
+	types::{DirtyPage, SqliteOrigin, SQLITE_PAGE_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION},
 };
-use tokio::sync::OnceCell;
+use tempfile::tempdir;
+use tokio::sync::{Mutex, OnceCell};
 use universaldb::Subspace;
 
+use crate::metrics;
+
 static SQLITE_ENGINE: OnceCell<Arc<SqliteEngine>> = OnceCell::const_new();
+static SQLITE_MIGRATION_LOCKS: OnceLock<HashMap<String, Arc<Mutex<()>>>> = OnceLock::new();
+
+const SQLITE_V1_PREFIX: u8 = 0x08;
+const SQLITE_V1_SCHEMA_VERSION: u8 = 0x01;
+const SQLITE_V1_META_PREFIX: u8 = 0x00;
+const SQLITE_V1_CHUNK_PREFIX: u8 = 0x01;
+const SQLITE_V1_META_VERSION: u16 = 1;
+const SQLITE_V1_META_LEN: usize = 10;
+const SQLITE_V1_CHUNK_SIZE: usize = 4096;
+const SQLITE_V1_MAX_MIGRATION_BYTES: u64 = 128 * 1024 * 1024;
+const SQLITE_V1_MIGRATION_LEASE_MS: i64 = 5 * 60 * 1000;
+const FILE_TAG_MAIN: u8 = 0x00;
+const FILE_TAG_JOURNAL: u8 = 0x01;
+const FILE_TAG_WAL: u8 = 0x02;
+const FILE_TAG_SHM: u8 = 0x03;
+const SQLITE_MAGIC: &[u8; 16] = b"SQLite format 3\0";
 
 pub async fn shared_engine(ctx: &StandaloneCtx) -> Result<Arc<SqliteEngine>> {
 	let db = Arc::new((*ctx.udb()?).clone());
@@ -37,6 +64,18 @@ fn sqlite_subspace() -> Subspace {
 	pegboard::keys::subspace().subspace(&("sqlite-storage",))
 }
 
+fn migration_locks() -> &'static HashMap<String, Arc<Mutex<()>>> {
+	SQLITE_MIGRATION_LOCKS.get_or_init(HashMap::default)
+}
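+
+/// Returns the per-actor migration mutex, creating it on first use. Concurrent
+/// start commands for the same actor serialize on this lock so at most one
+/// v1-to-v2 import runs at a time in this process.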
+async fn actor_migration_lock(actor_id: &str) -> Arc<Mutex<()>> {
+	let actor_id = actor_id.to_string();
+	match migration_locks().entry_async(actor_id).await {
+		Entry::Occupied(entry) => Arc::clone(entry.get()),
+		Entry::Vacant(entry) => Arc::clone(entry.insert_entry(Arc::new(Mutex::new(()))).get()),
+	}
+}
+
 pub async fn populate_start_command(
 	ctx: &StandaloneCtx,
 	sqlite_engine: &SqliteEngine,
@@ -57,12 +96,25 @@ pub async fn populate_start_command(
 		.await?;
 	}
 
-	start.sqlite_schema_version =
-		if pegboard::actor_kv::sqlite_v1_data_exists(&*ctx.udb()?, actor_id).await? {
-			pegboard::workflows::actor2::SQLITE_SCHEMA_VERSION_V1
-		} else {
-			SQLITE_VFS_V2_SCHEMA_VERSION
-		};
+	let db = ctx.udb()?;
+	let recipient = Recipient {
+		actor_id,
+		namespace_id,
+		name: start.config.name.clone(),
+	};
+	if protocol_version >= PROTOCOL_VERSION {
+		maybe_migrate_v1_to_v2(&db, sqlite_engine, &recipient).await?;
+	}
+
+	let actor_id_str = actor_id.to_string();
+	let has_v2_meta = sqlite_engine.try_load_meta(&actor_id_str).await?.is_some();
+	start.sqlite_schema_version = if has_v2_meta {
+		SQLITE_VFS_V2_SCHEMA_VERSION
+	} else if pegboard::actor_kv::sqlite_v1_data_exists(&db, actor_id).await? {
+		pegboard::workflows::actor2::SQLITE_SCHEMA_VERSION_V1
+	} else {
+		SQLITE_VFS_V2_SCHEMA_VERSION
+	};
 	start.sqlite_startup_data = maybe_load_sqlite_startup_data(
 		sqlite_engine,
 		protocol_version,
@@ -74,6 +126,449 @@ pub async fn populate_start_command(
 	Ok(())
 }
 
+async fn maybe_migrate_v1_to_v2(
+	db: &universaldb::Database,
+	sqlite_engine: &SqliteEngine,
+	recipient: &Recipient,
+) -> Result<bool> {
+	if !pegboard::actor_kv::sqlite_v1_data_exists(db, recipient.actor_id).await? {
+		return Ok(false);
+	}
+
+	let actor_id = recipient.actor_id.to_string();
+	let migration_lock = actor_migration_lock(&actor_id).await;
+	let _guard = migration_lock.lock().await;
+
+	if !pegboard::actor_kv::sqlite_v1_data_exists(db, recipient.actor_id).await? {
+		return Ok(false);
+	}
+
+	if let Some(head) = sqlite_engine.try_load_head(&actor_id).await? {
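+		// A previous owner may have died mid-import. A MigratingFromV1 head
+		// with staged-but-unfinalized txids is retried only once its lease has
+		// expired; a fresh in-flight import is rejected instead.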
+		match head.origin {
+			SqliteOrigin::Native | SqliteOrigin::MigratedFromV1 => return Ok(false),
+			SqliteOrigin::MigratingFromV1 => {
+				let migration_started_at = head.creation_ts_ms;
+				let lease_expires_at =
+					migration_started_at.saturating_add(SQLITE_V1_MIGRATION_LEASE_MS);
+				let stage_in_progress = head.next_txid > head.head_txid.saturating_add(1);
+				ensure!(
+					!stage_in_progress || lease_expires_at <= timestamp::now(),
+					"sqlite v1 migration for actor {actor_id} is already in progress"
+				);
+			}
+		}
+	}
+
+	metrics::SQLITE_MIGRATION_ATTEMPTS_TOTAL.inc();
+	let start = Instant::now();
+
+	let snapshot = read_v1_snapshot(db, recipient)
+		.await
+		.map_err(|err| migration_error(&actor_id, "read_v1", err))?;
+	let recovered = recover_v1_snapshot(&actor_id, snapshot)
+		.map_err(|err| migration_error(&actor_id, "validate", err))?;
+	metrics::SQLITE_MIGRATION_PAGES.observe(recovered.total_pages as f64);
+	tracing::info!(
+		actor_id = %actor_id,
+		pages = recovered.total_pages,
+		size_bytes = recovered.bytes.len(),
+		has_journal = recovered.had_journal,
+		"starting v1→v2 migration"
+	);
+
+	let prepared = sqlite_engine
+		.prepare_v1_migration(&actor_id, timestamp::now())
+		.await
+		.map_err(|err| migration_error(&actor_id, "takeover", err))?;
+	let stage_begin = sqlite_engine
+		.commit_stage_begin(
+			&actor_id,
+			CommitStageBeginRequest {
+				generation: prepared.meta.generation,
+			},
+		)
+		.await
+		.map_err(|err| migration_error(&actor_id, "stage", err))?;
+	let dirty_pages = recovered
+		.bytes
+		.chunks(SQLITE_PAGE_SIZE as usize)
+		.enumerate()
+		.map(|(idx, bytes)| DirtyPage {
+			pgno: idx as u32 + 1,
+			bytes: bytes.to_vec(),
+		})
+		.collect::<Vec<_>>();
+	let encoded_delta = encode_ltx_v3(
+		LtxHeader::delta(stage_begin.txid, recovered.total_pages, timestamp::now()),
+		&dirty_pages,
+	)
+	.map_err(|err| migration_error(&actor_id, "stage", err.into()))?;
+	let staged_chunks = split_bytes(
+		&encoded_delta,
+		prepared
+			.meta
+			.max_delta_bytes
+			.try_into()
+			.context("sqlite max_delta_bytes exceeded usize")
+			.map_err(|err| migration_error(&actor_id, "stage", err))?,
+	);
+	for (chunk_idx, chunk) in staged_chunks.iter().enumerate() {
+		sqlite_engine
+			.commit_stage(
+				&actor_id,
+				CommitStageRequest {
+					generation: prepared.meta.generation,
+					txid: stage_begin.txid,
+					chunk_idx: chunk_idx as u32,
+					bytes: chunk.clone(),
+					is_last: chunk_idx + 1 == staged_chunks.len(),
+				},
+			)
+			.await
+			.map_err(|err| migration_error(&actor_id, "stage", err))?;
+	}
+	sqlite_engine
+		.commit_finalize(
+			&actor_id,
+			CommitFinalizeRequest {
+				generation: prepared.meta.generation,
+				expected_head_txid: prepared.meta.head_txid,
+				txid: stage_begin.txid,
+				new_db_size_pages: recovered.total_pages,
+				now_ms: timestamp::now(),
+				origin_override: Some(SqliteOrigin::MigratedFromV1),
+			},
+		)
+		.await
+		.map_err(|err| migration_error(&actor_id, "finalize", err))?;
+
+	metrics::SQLITE_MIGRATION_SUCCESSES_TOTAL.inc();
+	metrics::SQLITE_MIGRATION_DURATION.observe(start.elapsed().as_secs_f64());
+	tracing::info!(
+		actor_id = %actor_id,
+		pages = recovered.total_pages,
+		duration_ms = start.elapsed().as_millis(),
+		"v1→v2 migration complete"
+	);
+
+	Ok(true)
+}
+
+fn migration_error(actor_id: &str, phase: &'static str, err: anyhow::Error) -> anyhow::Error {
+	metrics::SQLITE_MIGRATION_FAILURES_TOTAL
+		.with_label_values(&[phase])
+		.inc();
+	tracing::error!(actor_id = %actor_id, phase, ?err, "v1→v2 migration failed");
+	err
+}
+
+async fn read_v1_snapshot(
+	db: &universaldb::Database,
+	recipient: &Recipient,
+) -> Result<RecoveredV1Snapshot> {
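+	// Only MAIN and JOURNAL files are expected from the v1 VFS (rollback-journal
+	// mode); a WAL or SHM sidecar means state this import does not understand.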
+	ensure!(
+		!v1_file_exists(db, recipient, FILE_TAG_WAL).await?,
+		"unexpected sqlite v1 WAL sidecar present"
+	);
+	ensure!(
+		!v1_file_exists(db, recipient, FILE_TAG_SHM).await?,
+		"unexpected sqlite v1 SHM sidecar present"
+	);
+
+	let main = read_v1_file(db, recipient, FILE_TAG_MAIN)
+		.await?
+		.context("sqlite v1 main file missing metadata")?;
+	let journal = read_v1_file(db, recipient, FILE_TAG_JOURNAL).await?;
+	let had_journal = journal.is_some();
+
+	Ok(RecoveredV1Snapshot {
+		main,
+		journal,
+		had_journal,
+	})
+}
+
+async fn v1_file_exists(
+	db: &universaldb::Database,
+	recipient: &Recipient,
+	file_tag: u8,
+) -> Result<bool> {
+	let (keys, _, _) = pegboard::actor_kv::list(
+		db,
+		recipient,
+		protocol::KvListQuery::KvListPrefixQuery(protocol::KvListPrefixQuery {
+			key: v1_chunk_prefix(file_tag).to_vec(),
+		}),
+		false,
+		Some(1),
+	)
+	.await?;
+
+	Ok(!keys.is_empty())
+}
+
+async fn read_v1_file(
+	db: &universaldb::Database,
+	recipient: &Recipient,
+	file_tag: u8,
+) -> Result<Option<V1File>> {
+	let meta_key = v1_meta_key(file_tag).to_vec();
+	let (meta_keys, meta_values, _) =
+		pegboard::actor_kv::get(db, recipient, vec![meta_key.clone()]).await?;
+
+	if meta_keys.is_empty() && !v1_file_exists(db, recipient, file_tag).await? {
+		return Ok(None);
+	}
+	ensure!(
+		!meta_keys.is_empty(),
+		"sqlite v1 file tag {file_tag} has chunks but no metadata"
+	);
+	ensure!(
+		meta_keys.len() == 1 && meta_keys[0] == meta_key,
+		"unexpected sqlite v1 metadata layout for file tag {file_tag}"
+	);
+
+	let size_bytes = decode_v1_meta(&meta_values[0])
+		.with_context(|| format!("decode sqlite v1 metadata for file tag {file_tag}"))?;
+	ensure!(
+		size_bytes <= SQLITE_V1_MAX_MIGRATION_BYTES,
+		"sqlite v1 file tag {file_tag} exceeded migration limit of {} bytes",
+		SQLITE_V1_MAX_MIGRATION_BYTES
+	);
+	let expected_chunks = size_bytes.div_ceil(SQLITE_V1_CHUNK_SIZE as u64);
+	let chunk_limit = usize::try_from(expected_chunks)
+		.context("sqlite v1 expected chunk count exceeded usize")?
+		.checked_add(1)
+		.context("sqlite v1 chunk limit overflow")?
+		.max(1);
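+	// Over-fetch by one chunk so a store holding more chunks than the metadata
+	// implies fails the strict count check in rebuild_v1_file instead of being
+	// silently truncated.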
+	let (chunk_keys, chunk_values, _) = pegboard::actor_kv::list(
+		db,
+		recipient,
+		protocol::KvListQuery::KvListPrefixQuery(protocol::KvListPrefixQuery {
+			key: v1_chunk_prefix(file_tag).to_vec(),
+		}),
+		false,
+		Some(chunk_limit),
+	)
+	.await?;
+	let mut chunks = chunk_keys
+		.into_iter()
+		.zip(chunk_values.into_iter())
+		.map(|(key, value)| {
+			let chunk_idx = decode_v1_chunk_index(file_tag, &key)?;
+			Ok((chunk_idx, value))
+		})
+		.collect::<Result<Vec<_>>>()?;
+	chunks.sort_by_key(|(chunk_idx, _)| *chunk_idx);
+
+	let bytes = rebuild_v1_file(
+		size_bytes,
+		expected_chunks
+			.try_into()
+			.context("sqlite v1 expected chunk count exceeded usize")?,
+		&chunks,
+	)
+	.with_context(|| format!("rebuild sqlite v1 file tag {file_tag}"))?;
+
+	Ok(Some(V1File { size_bytes, bytes }))
+}
+
+fn recover_v1_snapshot(actor_id: &str, snapshot: RecoveredV1Snapshot) -> Result<RecoveredDb> {
+	if snapshot.main.size_bytes == 0 {
+		return Ok(RecoveredDb {
+			bytes: Vec::new(),
+			total_pages: 0,
+			had_journal: snapshot.had_journal,
+		});
+	}
+
+	let tmp = tempdir().context("create sqlite v1 migration tempdir")?;
+	let db_path = tmp.path().join("migration.db");
+	std::fs::write(&db_path, &snapshot.main.bytes)
+		.with_context(|| format!("write sqlite v1 main temp file for actor {actor_id}"))?;
+	if let Some(journal) = snapshot.journal {
+		std::fs::write(tmp.path().join("migration.db-journal"), &journal.bytes)
+			.with_context(|| format!("write sqlite v1 journal temp file for actor {actor_id}"))?;
+	}
+
+	let conn = Connection::open(&db_path)
+		.with_context(|| format!("open sqlite v1 temp db for actor {actor_id}"))?;
+	conn.pragma_update(None, "journal_mode", "DELETE")
+		.context("set sqlite journal_mode during v1 recovery")?;
+	let integrity: String = conn
+		.query_row("PRAGMA quick_check(1)", [], |row| row.get(0))
+		.context("run sqlite quick_check during v1 recovery")?;
+	ensure!(
+		integrity == "ok",
+		"sqlite integrity check failed after v1 recovery: {integrity}"
+	);
+	drop(conn);
+
+	let recovered = std::fs::read(&db_path)
+		.with_context(|| format!("read recovered sqlite db for actor {actor_id}"))?;
+	ensure!(
+		recovered.len() >= SQLITE_MAGIC.len() + 2,
+		"sqlite v1 database too small after recovery"
+	);
+	ensure!(
+		&recovered[..SQLITE_MAGIC.len()] == SQLITE_MAGIC,
+		"sqlite magic bytes mismatch after v1 recovery"
+	);
+	let raw_page_size = u16::from_be_bytes([recovered[16], recovered[17]]);
+	let page_size = if raw_page_size == 1 {
+		65_536_u32
+	} else {
+		u32::from(raw_page_size)
+	};
+	ensure!(
+		(512..=65_536).contains(&page_size),
+		"sqlite page size {page_size} is outside the supported range"
+	);
+	ensure!(
+		page_size == SQLITE_PAGE_SIZE,
+		"sqlite page size {page_size} is not supported by sqlite v2"
+	);
+	ensure!(
+		recovered.len() % page_size as usize == 0,
+		"sqlite v1 database size {} is not page aligned to {}",
+		recovered.len(),
+		page_size
+	);
+
+	Ok(RecoveredDb {
+		total_pages: (recovered.len() / page_size as usize) as u32,
+		bytes: recovered,
+		had_journal: snapshot.had_journal,
+	})
+}
+
+fn decode_v1_meta(bytes: &[u8]) -> Result<u64> {
+	ensure!(
+		bytes.len() == SQLITE_V1_META_LEN,
+		"sqlite v1 metadata had invalid length {}",
+		bytes.len()
+	);
+	let version = u16::from_le_bytes(
+		bytes[..2]
+			.try_into()
+			.expect("sqlite v1 metadata version bytes should exist"),
+	);
+	ensure!(
+		version == SQLITE_V1_META_VERSION,
+		"unsupported sqlite v1 metadata version {version}"
+	);
+	Ok(u64::from_le_bytes(
+		bytes[2..10]
+			.try_into()
+			.expect("sqlite v1 metadata size bytes should exist"),
+	))
+}
+
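+/// Reassembles a v1 file from its 4 KiB chunk rows. Chunk indexes must be
+/// contiguous from zero and every chunk must land inside the size declared by
+/// the metadata row; the buffer is pre-zeroed, so short chunks leave zero fill.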
+fn rebuild_v1_file(
+	size_bytes: u64,
+	expected_chunks: usize,
+	chunks: &[(u32, Vec<u8>)],
+) -> Result<Vec<u8>> {
+	let size_bytes: usize = size_bytes
+		.try_into()
+		.context("sqlite v1 file exceeded usize")?;
+	ensure!(
+		chunks.len() == expected_chunks,
+		"sqlite v1 file expected {expected_chunks} chunks for size {size_bytes}, found {}",
+		chunks.len()
+	);
+	let mut bytes = vec![0; size_bytes];
+
+	for (expected_chunk_idx, (chunk_idx, chunk)) in chunks.iter().enumerate() {
+		ensure!(
+			*chunk_idx == expected_chunk_idx as u32,
+			"sqlite v1 file missing or duplicated chunk at index {expected_chunk_idx}"
+		);
+		ensure!(
+			chunk.len() <= SQLITE_V1_CHUNK_SIZE,
+			"sqlite v1 chunk {chunk_idx} exceeded {} bytes",
+			SQLITE_V1_CHUNK_SIZE
+		);
+		let start = (*chunk_idx as usize)
+			.checked_mul(SQLITE_V1_CHUNK_SIZE)
+			.context("sqlite v1 chunk offset overflow")?;
+		let end = start
+			.checked_add(chunk.len())
+			.context("sqlite v1 chunk end overflow")?;
+		ensure!(
+			end <= bytes.len(),
+			"sqlite v1 chunk {chunk_idx} overflowed file size {}",
+			bytes.len()
+		);
+		bytes[start..end].copy_from_slice(chunk);
+	}
+
+	Ok(bytes)
+}
+
+fn decode_v1_chunk_index(file_tag: u8, key: &[u8]) -> Result<u32> {
+	let prefix = v1_chunk_prefix(file_tag);
+	ensure!(
+		key.starts_with(&prefix),
+		"sqlite v1 chunk key for file tag {file_tag} had the wrong prefix"
+	);
+	ensure!(
+		key.len() == prefix.len() + 4,
+		"sqlite v1 chunk key for file tag {file_tag} had invalid length {}",
+		key.len()
+	);
+
+	Ok(u32::from_be_bytes(
+		key[prefix.len()..]
+			.try_into()
+			.expect("sqlite v1 chunk key index bytes should exist"),
+	))
+}
+
+fn split_bytes(bytes: &[u8], max_chunk_bytes: usize) -> Vec<Vec<u8>> {
+	if bytes.is_empty() || max_chunk_bytes == 0 {
+		return vec![bytes.to_vec()];
+	}
+
+	bytes
+		.chunks(max_chunk_bytes)
+		.map(|chunk| chunk.to_vec())
+		.collect()
+}
+
+fn v1_meta_key(file_tag: u8) -> [u8; 4] {
+	[
+		SQLITE_V1_PREFIX,
+		SQLITE_V1_SCHEMA_VERSION,
+		SQLITE_V1_META_PREFIX,
+		file_tag,
+	]
+}
+
+fn v1_chunk_prefix(file_tag: u8) -> [u8; 4] {
+	[
+		SQLITE_V1_PREFIX,
+		SQLITE_V1_SCHEMA_VERSION,
+		SQLITE_V1_CHUNK_PREFIX,
+		file_tag,
+	]
+}
+
+#[cfg(test)]
+fn v1_chunk_key(file_tag: u8, chunk_idx: u32) -> [u8; 8] {
+	let chunk_idx = chunk_idx.to_be_bytes();
+	[
+		SQLITE_V1_PREFIX,
+		SQLITE_V1_SCHEMA_VERSION,
+		SQLITE_V1_CHUNK_PREFIX,
+		file_tag,
+		chunk_idx[0],
+		chunk_idx[1],
+		chunk_idx[2],
+		chunk_idx[3],
+	]
+}
+
 pub async fn maybe_load_sqlite_startup_data(
 	sqlite_engine: &SqliteEngine,
 	protocol_version: u16,
@@ -86,6 +581,12 @@ pub async fn maybe_load_sqlite_startup_data(
 	}
 
 	let actor_id = actor_id.to_string();
+	if let Some(meta) = sqlite_engine.try_load_meta(&actor_id).await? {
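+		// Refuse to serve startup data from a half-finished v1 import.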
+		ensure!(
+			!matches!(meta.origin, SqliteOrigin::MigratingFromV1),
+			"sqlite v1 migration for actor {actor_id} is incomplete"
+		);
+	}
 	let startup = sqlite_engine
 		.takeover(&actor_id, TakeoverConfig::new(timestamp::now()))
 		.await?;
@@ -122,3 +623,588 @@ pub fn protocol_sqlite_fetched_page(
 		bytes: page.bytes,
 	}
 }
+
+struct V1File {
+	size_bytes: u64,
+	bytes: Vec<u8>,
+}
+
+struct RecoveredV1Snapshot {
+	main: V1File,
+	journal: Option<V1File>,
+	had_journal: bool,
+}
+
+struct RecoveredDb {
+	bytes: Vec<u8>,
+	total_pages: u32,
+	had_journal: bool,
+}
+
+#[cfg(test)]
+mod tests {
+	use std::path::Path;
+	use std::sync::Arc;
+
+	use anyhow::Result;
+	use gas::prelude::{Id, util::timestamp};
+	use pegboard::actor_kv::Recipient;
+	use rusqlite::{Connection, params};
+	use sqlite_storage::{
+		commit::{CommitRequest, CommitStageRequest},
+		engine::SqliteEngine,
+		keys::meta_key,
+		ltx::{LtxHeader, encode_ltx_v3},
+		takeover::TakeoverConfig,
+		types::{DirtyPage, SqliteOrigin},
+		udb::{WriteOp, apply_write_ops},
+	};
+	use tempfile::tempdir;
+	use universaldb::driver::RocksDbDatabaseDriver;
+
+	use super::{
+		FILE_TAG_JOURNAL, FILE_TAG_MAIN, FILE_TAG_SHM, FILE_TAG_WAL, SQLITE_V1_CHUNK_SIZE,
+		SQLITE_V1_MAX_MIGRATION_BYTES, SQLITE_V1_MIGRATION_LEASE_MS, maybe_migrate_v1_to_v2,
+		read_v1_file, sqlite_subspace, v1_chunk_key, v1_meta_key,
+	};
+
+	fn recipient(actor_id: Id) -> Recipient {
+		Recipient {
+			actor_id,
+			namespace_id: Id::new_v1(1),
+			name: "test".to_string(),
+		}
+	}
+
+	async fn test_db() -> Result<Arc<universaldb::Database>> {
+		let path = tempdir()?.keep();
+		let driver = RocksDbDatabaseDriver::new(path).await?;
+		Ok(Arc::new(universaldb::Database::new(Arc::new(driver))))
+	}
+
+	fn sqlite_file_bytes(path: &Path) -> Result<Vec<u8>> {
+		Ok(std::fs::read(path)?)
+	}
+
+	fn configure_v1_pragmas_with_page_size(conn: &Connection, page_size: u32) -> Result<()> {
+		conn.pragma_update(None, "page_size", page_size)?;
+		conn.pragma_update(None, "journal_mode", "DELETE")?;
+		conn.pragma_update(None, "synchronous", "NORMAL")?;
+		conn.pragma_update(None, "temp_store", "MEMORY")?;
+		conn.pragma_update(None, "auto_vacuum", "NONE")?;
+		conn.pragma_update(None, "locking_mode", "EXCLUSIVE")?;
+		Ok(())
+	}
+
+	fn configure_v1_pragmas(conn: &Connection) -> Result<()> {
+		configure_v1_pragmas_with_page_size(conn, 4096)
+	}
+
+	fn encode_v1_meta(size: u64) -> [u8; 10] {
+		let mut bytes = [0_u8; 10];
+		bytes[..2].copy_from_slice(&1_u16.to_le_bytes());
+		bytes[2..].copy_from_slice(&size.to_le_bytes());
+		bytes
+	}
+
+	async fn seed_v1_file(
+		db: &universaldb::Database,
+		recipient: &Recipient,
+		file_tag: u8,
+		bytes: &[u8],
+	) -> Result<()> {
+		let mut keys = vec![v1_meta_key(file_tag).to_vec()];
+		let mut values = vec![encode_v1_meta(bytes.len() as u64).to_vec()];
+		for (chunk_idx, chunk) in bytes.chunks(SQLITE_V1_CHUNK_SIZE).enumerate() {
+			if keys.len() == 128 {
+				pegboard::actor_kv::put(
+					db,
+					recipient,
+					std::mem::take(&mut keys),
+					std::mem::take(&mut values),
+				)
+				.await?;
+			}
+			keys.push(v1_chunk_key(file_tag, chunk_idx as u32).to_vec());
+			values.push(chunk.to_vec());
+		}
+		pegboard::actor_kv::put(db, recipient, keys, values).await
+	}
+
+	async fn seed_v1_sparse_chunks(
+		db: &universaldb::Database,
+		recipient: &Recipient,
+		file_tag: u8,
+		size_bytes: u64,
+		chunk_count: u32,
+	) -> Result<()> {
+		let mut keys = vec![v1_meta_key(file_tag).to_vec()];
+		let mut values = vec![encode_v1_meta(size_bytes).to_vec()];
+		for chunk_idx in 0..chunk_count {
+			if keys.len() == 128 {
+				pegboard::actor_kv::put(
+					db,
+					recipient,
+					std::mem::take(&mut keys),
+					std::mem::take(&mut values),
+				)
+				.await?;
+			}
+			keys.push(v1_chunk_key(file_tag, chunk_idx).to_vec());
+			values.push(vec![(chunk_idx as u8).wrapping_add(1)]);
+		}
+		pegboard::actor_kv::put(db, recipient, keys, values).await
+	}
+
+	async fn age_v1_migration_head(
+		db: &universaldb::Database,
+		engine: &SqliteEngine,
+		actor_id: &str,
+	) -> Result<()> {
+		let mut head = engine.load_head(actor_id).await?;
+		head.creation_ts_ms -= SQLITE_V1_MIGRATION_LEASE_MS + 1;
+		apply_write_ops(
+			db,
+			&sqlite_subspace(),
+			engine.op_counter.as_ref(),
+			vec![WriteOp::put(meta_key(actor_id), serde_bare::to_vec(&head)?)],
+		)
+		.await
+	}
+
+	async fn load_v2_bytes(engine: &SqliteEngine, actor_id: &str) -> Result<Vec<u8>> {
+		let meta = engine.load_meta(actor_id).await?;
+		let pages = engine
+			.get_pages(actor_id, meta.generation, (1..=meta.db_size_pages).collect())
+			.await?;
+		let mut bytes = Vec::with_capacity(meta.db_size_pages as usize * meta.page_size as usize);
+		for page in pages {
+			bytes.extend_from_slice(
+				&page
+					.bytes
+					.unwrap_or_else(|| vec![0; meta.page_size as usize]),
+			);
+		}
+		Ok(bytes)
+	}
+
+	fn query_note_values(bytes: &[u8]) -> Result<Vec<String>> {
+		let tmp = tempdir()?;
+		let path = tmp.path().join("query.db");
+		std::fs::write(&path, bytes)?;
+		let conn = Connection::open(path)?;
+		let mut stmt = conn.prepare("SELECT note FROM items ORDER BY id")?;
+		let values = stmt
+			.query_map([], |row| row.get::<_, String>(0))?
+			.collect::<Result<Vec<_>, _>>()?;
+		let integrity: String = conn.query_row("PRAGMA integrity_check", [], |row| row.get(0))?;
+		assert_eq!(integrity, "ok");
+		Ok(values)
+	}
+
+	fn build_fixture_db(notes: &[&str]) -> Result<Vec<u8>> {
+		let tmp = tempdir()?;
+		let path = tmp.path().join("fixture.db");
+		let conn = Connection::open(&path)?;
+		configure_v1_pragmas(&conn)?;
+		conn.execute_batch(
+			"CREATE TABLE items (id INTEGER PRIMARY KEY, note TEXT NOT NULL);
+			CREATE INDEX idx_items_note ON items(note);",
+		)?;
+		let tx = conn.unchecked_transaction()?;
+		for note in notes {
+			tx.execute("INSERT INTO items(note) VALUES (?1)", params![note])?;
+		}
+		tx.commit()?;
+		drop(conn);
+		sqlite_file_bytes(&path)
+	}
+
+	fn build_fixture_db_with_page_size(notes: &[&str], page_size: u32) -> Result<Vec<u8>> {
+		let tmp = tempdir()?;
+		let path = tmp.path().join("fixture.db");
+		let conn = Connection::open(&path)?;
+		configure_v1_pragmas_with_page_size(&conn, page_size)?;
+		conn.execute_batch("CREATE TABLE items (id INTEGER PRIMARY KEY, note TEXT NOT NULL);")?;
+		let tx = conn.unchecked_transaction()?;
+		for note in notes {
+			tx.execute("INSERT INTO items(note) VALUES (?1)", params![note])?;
+		}
+		tx.commit()?;
+		drop(conn);
+		sqlite_file_bytes(&path)
+	}
+
+	fn build_open_tx_fixture() -> Result<(Vec<u8>, Vec<u8>)> {
+		let tmp = tempdir()?;
+		let path = tmp.path().join("fixture.db");
+		let conn = Connection::open(&path)?;
+		configure_v1_pragmas(&conn)?;
+		conn.execute_batch("CREATE TABLE items (id INTEGER PRIMARY KEY, note TEXT NOT NULL);")?;
+		conn.execute("INSERT INTO items(note) VALUES (?1)", params!["before"])?;
+		conn.execute_batch("BEGIN IMMEDIATE;")?;
+		conn.execute("INSERT INTO items(note) VALUES (?1)", params!["during"])?;
+		let main = sqlite_file_bytes(&path)?;
+		let journal = sqlite_file_bytes(&tmp.path().join("fixture.db-journal"))?;
+		Ok((main, journal))
+	}
+
+	#[tokio::test]
+	async fn migrates_v1_sqlite_into_v2_storage() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		let fixture = build_fixture_db(&["alpha", "beta", "gamma", "delta"])?;
+		seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &fixture).await?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace());
+
+		assert!(maybe_migrate_v1_to_v2(&db, &engine, &recipient).await?);
+
+		let meta = engine.load_meta(&actor_id.to_string()).await?;
+		assert!(meta.migrated_from_v1);
+		assert_eq!(meta.origin, SqliteOrigin::MigratedFromV1);
+		assert_eq!(
+			query_note_values(&load_v2_bytes(&engine, &actor_id.to_string()).await?)?,
+			vec!["alpha", "beta", "gamma", "delta"]
+		);
+
+		Ok(())
+	}
+
+	#[tokio::test]
+	async fn retries_cleanly_after_stale_partial_v1_import() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		let fixture = build_fixture_db(&["retry-a", "retry-b", "retry-c"])?;
+		seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &fixture).await?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace());
+		let actor_id_str = actor_id.to_string();
+
+		let prepared = engine
+			.prepare_v1_migration(&actor_id_str, timestamp::now())
+			.await?;
+		let stage = engine
+			.commit_stage_begin(
+				&actor_id_str,
+				sqlite_storage::commit::CommitStageBeginRequest {
+					generation: prepared.meta.generation,
+				},
+			)
+			.await?;
+		let dirty_pages = fixture
+			.chunks(super::SQLITE_V1_CHUNK_SIZE)
+			.enumerate()
+			.map(|(idx, bytes)| DirtyPage {
+				pgno: idx as u32 + 1,
+				bytes: bytes.to_vec(),
+			})
+			.collect::<Vec<_>>();
+		let encoded = encode_ltx_v3(
+			LtxHeader::delta(stage.txid, dirty_pages.len() as u32, timestamp::now()),
+			&dirty_pages,
+		)?;
+		engine
+			.commit_stage(
+				&actor_id_str,
+				CommitStageRequest {
+					generation: prepared.meta.generation,
+					txid: stage.txid,
+					chunk_idx: 0,
+					bytes: encoded,
+					is_last: true,
+				},
+			)
+			.await?;
+		age_v1_migration_head(&db, &engine, &actor_id_str).await?;
+
+		assert!(maybe_migrate_v1_to_v2(&db, &engine, &recipient).await?);
+		let meta = engine.load_meta(&actor_id_str).await?;
+		assert_eq!(meta.origin, SqliteOrigin::MigratedFromV1);
+		assert_eq!(
+			query_note_values(&load_v2_bytes(&engine, &actor_id_str).await?)?,
+			vec!["retry-a", "retry-b", "retry-c"]
+		);
+
+		Ok(())
+	}
+
+	#[tokio::test]
+	async fn rejects_fresh_in_progress_v1_migrations() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		let fixture = build_fixture_db(&["fresh-retry-a", "fresh-retry-b"])?;
+		seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &fixture).await?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace());
+		let actor_id_str = actor_id.to_string();
+
+		let prepared = engine
+			.prepare_v1_migration(&actor_id_str, timestamp::now())
+			.await?;
+		engine
+			.commit_stage_begin(
+				&actor_id_str,
+				sqlite_storage::commit::CommitStageBeginRequest {
+					generation: prepared.meta.generation,
+				},
+			)
+			.await?;
+
+		let err = maybe_migrate_v1_to_v2(&db, &engine, &recipient)
+			.await
+			.expect_err("fresh staged migration should not be retried");
+		assert!(
+			err.to_string().contains("already in progress"),
+			"unexpected error: {err:?}"
+		);
+
+		Ok(())
+	}
+
+	#[tokio::test]
+	async fn skips_native_v2_state_even_if_v1_tombstone_exists() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		let actor_id_str = actor_id.to_string();
+		let v1_fixture = build_fixture_db(&["legacy"])?;
+		seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &v1_fixture).await?;
+		let native_fixture = build_fixture_db(&["native"])?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace());
+		let takeover = engine
+			.takeover(&actor_id_str, TakeoverConfig::new(timestamp::now()))
+			.await?;
+		let dirty_pages = native_fixture
+			.chunks(super::SQLITE_V1_CHUNK_SIZE)
+			.enumerate()
+			.map(|(idx, bytes)| DirtyPage {
+				pgno: idx as u32 + 1,
+				bytes: bytes.to_vec(),
+			})
+			.collect::<Vec<_>>();
+		engine
+			.commit(
+				&actor_id_str,
+				CommitRequest {
+					generation: takeover.generation,
+					head_txid: takeover.meta.head_txid,
+					db_size_pages: dirty_pages.len() as u32,
+					dirty_pages,
+					now_ms: timestamp::now(),
+				},
+			)
+			.await?;
+
+		assert!(!maybe_migrate_v1_to_v2(&db, &engine, &recipient).await?);
+
+		let meta = engine.load_meta(&actor_id_str).await?;
+		assert_eq!(meta.origin, SqliteOrigin::Native);
+		assert_eq!(
+			query_note_values(&load_v2_bytes(&engine, &actor_id_str).await?)?,
+			vec!["native"]
+		);
+
+		Ok(())
+	}
+
+	#[tokio::test]
+	async fn bails_when_v2_meta_is_unreadable() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		let actor_id_str = actor_id.to_string();
+		let fixture = build_fixture_db(&["broken-meta"])?;
+		seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &fixture).await?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace());
+		apply_write_ops(
+			db.as_ref(),
+			&sqlite_subspace(),
+			engine.op_counter.as_ref(),
+			vec![WriteOp::put(
+				meta_key(&actor_id_str),
+				b"not-a-db-head".to_vec(),
+			)],
+		)
+		.await?;
+
+		let err = maybe_migrate_v1_to_v2(&db, &engine, &recipient)
+			.await
+			.expect_err("corrupt meta should fail migration");
+		assert!(
+			err.to_string().contains("decode sqlite db head"),
+			"unexpected error: {err:?}"
+		);
+
+		Ok(())
+	}
+
+	#[tokio::test]
+	async fn recovers_a_pending_v1_journal_before_import() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		let (main, journal) = build_open_tx_fixture()?;
+		seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &main).await?;
+		seed_v1_file(&db, &recipient, FILE_TAG_JOURNAL, &journal).await?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace());
+
+		assert!(maybe_migrate_v1_to_v2(&db, &engine, &recipient).await?);
+		assert_eq!(
+			query_note_values(&load_v2_bytes(&engine, &actor_id.to_string()).await?)?,
+			vec!["before"]
+		);
+
+		Ok(())
+	}
+
+	#[tokio::test]
+	async fn migrates_zero_size_v1_state_without_pages() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &[]).await?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace());
+
+		assert!(maybe_migrate_v1_to_v2(&db, &engine, &recipient).await?);
+
+		let meta = engine.load_meta(&actor_id.to_string()).await?;
+		assert_eq!(meta.origin, SqliteOrigin::MigratedFromV1);
+		assert_eq!(meta.db_size_pages, 0);
+		assert!(
+			load_v2_bytes(&engine, &actor_id.to_string())
+				.await?
+				.is_empty()
+		);
+
+		Ok(())
+	}
+
+	#[tokio::test]
+	async fn rejects_v1_databases_with_unsupported_page_size() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		let fixture = build_fixture_db_with_page_size(&["wrong-page-size"], 8192)?;
+		seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &fixture).await?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace());
+
+		let err = maybe_migrate_v1_to_v2(&db, &engine, &recipient)
+			.await
+			.expect_err("unsupported page size should 
fail migration"); + assert!( + err.to_string() + .contains("sqlite page size 8192 is not supported by sqlite v2"), + "unexpected error: {err:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn rejects_v1_wal_sidecars() -> Result<()> { + let db = test_db().await?; + let actor_id = Id::new_v1(1); + let recipient = recipient(actor_id); + let fixture = build_fixture_db(&["wal-sidecar"])?; + seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &fixture).await?; + seed_v1_file(&db, &recipient, FILE_TAG_WAL, b"unexpected wal bytes").await?; + let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace()); + + let err = maybe_migrate_v1_to_v2(&db, &engine, &recipient) + .await + .expect_err("wal sidecar should fail migration"); + assert!( + err.to_string() + .contains("unexpected sqlite v1 WAL sidecar present"), + "unexpected error: {err:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn rejects_v1_shm_sidecars() -> Result<()> { + let db = test_db().await?; + let actor_id = Id::new_v1(1); + let recipient = recipient(actor_id); + let fixture = build_fixture_db(&["shm-sidecar"])?; + seed_v1_file(&db, &recipient, FILE_TAG_MAIN, &fixture).await?; + seed_v1_file(&db, &recipient, FILE_TAG_SHM, b"unexpected shm bytes").await?; + let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace()); + + let err = maybe_migrate_v1_to_v2(&db, &engine, &recipient) + .await + .expect_err("shm sidecar should fail migration"); + assert!( + err.to_string() + .contains("unexpected sqlite v1 SHM sidecar present"), + "unexpected error: {err:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn rejects_v1_files_with_missing_chunks() -> Result<()> { + let db = test_db().await?; + let actor_id = Id::new_v1(1); + let recipient = recipient(actor_id); + let fixture = build_fixture_db(&["chunk-a", "chunk-b", "chunk-c", "chunk-d"])?; + let mut keys = vec![v1_meta_key(FILE_TAG_MAIN).to_vec()]; + let mut values = vec![encode_v1_meta(fixture.len() as u64).to_vec()]; + for (chunk_idx, chunk) in fixture.chunks(SQLITE_V1_CHUNK_SIZE).enumerate() { + if chunk_idx == 1 { + continue; + } + if keys.len() == 128 { + pegboard::actor_kv::put(&db, &recipient, std::mem::take(&mut keys), std::mem::take(&mut values)) + .await?; + } + keys.push(v1_chunk_key(FILE_TAG_MAIN, chunk_idx as u32).to_vec()); + values.push(chunk.to_vec()); + } + pegboard::actor_kv::put(&db, &recipient, keys, values).await?; + let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace()); + + let err = maybe_migrate_v1_to_v2(&db, &engine, &recipient) + .await + .expect_err("missing chunk should fail migration"); + let err_debug = format!("{err:?}"); + assert!( + err_debug.contains("sqlite v1 file expected") + || err_debug.contains("missing or duplicated chunk"), + "unexpected error: {err:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn rejects_v1_files_that_exceed_migration_limit() -> Result<()> { + let db = test_db().await?; + let actor_id = Id::new_v1(1); + let recipient = recipient(actor_id); + pegboard::actor_kv::put( + &db, + &recipient, + vec![v1_meta_key(FILE_TAG_MAIN).to_vec()], + vec![encode_v1_meta(SQLITE_V1_MAX_MIGRATION_BYTES + 1).to_vec()], + ) + .await?; + let (engine, _compaction_rx) = SqliteEngine::new(db.clone(), sqlite_subspace()); + + let err = maybe_migrate_v1_to_v2(&db, &engine, &recipient) + .await + .expect_err("oversized v1 file should fail migration"); + assert!( + err.to_string().contains("exceeded migration limit"), + "unexpected error: {err:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn 
reads_v1_files_beyond_the_default_kv_list_limit() -> Result<()> {
+		let db = test_db().await?;
+		let actor_id = Id::new_v1(1);
+		let recipient = recipient(actor_id);
+		let chunk_count = 16_385_u32;
+		let size_bytes = u64::from(chunk_count) * SQLITE_V1_CHUNK_SIZE as u64;
+		seed_v1_sparse_chunks(&db, &recipient, FILE_TAG_MAIN, size_bytes, chunk_count).await?;
+
+		let file = read_v1_file(&db, &recipient, FILE_TAG_MAIN)
+			.await?
+			.expect("sparse v1 file should exist");
+		assert_eq!(file.size_bytes, size_bytes);
+		assert_eq!(file.bytes.len(), size_bytes as usize);
+		assert_eq!(file.bytes[0], 1);
+		assert_eq!(file.bytes[SQLITE_V1_CHUNK_SIZE], 2);
+		assert_eq!(
+			file.bytes[(chunk_count as usize - 1) * SQLITE_V1_CHUNK_SIZE],
+			((chunk_count - 1) as u8).wrapping_add(1)
+		);
+
+		Ok(())
+	}
+}
diff --git a/engine/packages/pegboard-envoy/src/ws_to_tunnel_task.rs b/engine/packages/pegboard-envoy/src/ws_to_tunnel_task.rs
index 7c001287f6..4c1fe04e16 100644
--- a/engine/packages/pegboard-envoy/src/ws_to_tunnel_task.rs
+++ b/engine/packages/pegboard-envoy/src/ws_to_tunnel_task.rs
@@ -918,6 +918,7 @@ async fn handle_sqlite_commit_finalize(
 			txid: request.txid,
 			new_db_size_pages: request.new_db_size_pages,
 			now_ms: util::timestamp::now(),
+			origin_override: None,
 		},
 	)
 	.await;
diff --git a/engine/packages/sqlite-storage/examples/bench_rtt.rs b/engine/packages/sqlite-storage/examples/bench_rtt.rs
index 15b2046b81..3355568a08 100644
--- a/engine/packages/sqlite-storage/examples/bench_rtt.rs
+++ b/engine/packages/sqlite-storage/examples/bench_rtt.rs
@@ -206,6 +206,7 @@ async fn main() -> Result<()> {
 				txid: stage.txid,
 				new_db_size_pages: total_pages,
 				now_ms: 300,
+				origin_override: None,
 			},
 		)
 		.await
diff --git a/engine/packages/sqlite-storage/src/commit.rs b/engine/packages/sqlite-storage/src/commit.rs
index 8e9eb2711c..de6d9fd048 100644
--- a/engine/packages/sqlite-storage/src/commit.rs
+++ b/engine/packages/sqlite-storage/src/commit.rs
@@ -13,7 +13,9 @@ use crate::error::SqliteStorageError;
 use crate::keys::{delta_chunk_key, delta_chunk_prefix, meta_key, pidx_delta_key};
 use crate::ltx::{LtxHeader, decode_ltx_v3, encode_ltx_v3};
 use crate::quota::{encode_db_head_with_usage, tracked_storage_entry_size};
-use crate::types::{DBHead, DirtyPage, SQLITE_MAX_DELTA_BYTES, SqliteMeta};
+use crate::types::{
+	DirtyPage, SQLITE_MAX_DELTA_BYTES, SqliteMeta, SqliteOrigin, decode_db_head,
+};
 use crate::udb;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -63,6 +65,7 @@ pub struct CommitFinalizeRequest {
 	pub txid: u64,
 	pub new_db_size_pages: u32,
 	pub now_ms: i64,
+	/// When set, commit_finalize stamps this origin onto the head (used by the
+	/// v1 import to mark the result as MigratedFromV1).
+	pub origin_override: Option<SqliteOrigin>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -691,6 +694,9 @@ impl SqliteEngine {
 
 		head.head_txid = request.txid;
 		head.db_size_pages = request.new_db_size_pages;
+		if let Some(origin_override) = request.origin_override {
+			head.origin = origin_override;
+		}
 
 		let txid_bytes = request.txid.to_be_bytes();
 		let mut usage_without_meta = head.sqlite_storage_used.saturating_sub(
@@ -799,10 +805,6 @@ impl SqliteEngine {
 	}
 }
 
-fn decode_db_head(bytes: &[u8]) -> Result<DBHead> {
-	serde_bare::from_slice(bytes).context("decode sqlite db head")
-}
-
 fn dirty_pages_raw_bytes(dirty_pages: &[DirtyPage]) -> Result<u64> {
 	dirty_pages.iter().try_fold(0u64, |total, page| {
 		let page_bytes =
@@ -835,7 +837,7 @@ mod tests {
 	};
 	use crate::types::{
 		DBHead, DirtyPage, FetchedPage, SQLITE_DEFAULT_MAX_STORAGE_BYTES, SQLITE_PAGE_SIZE,
-		SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION,
+		SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION, SqliteOrigin,
 	};
 	use crate::udb::{WriteOp, apply_write_ops};
@@ -854,6 +856,7 @@ mod tests {
 			creation_ts_ms: 123,
 			sqlite_storage_used: 0,
 			sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES,
+			origin: SqliteOrigin::Native,
 		}
 	}
 
@@ -922,6 +925,7 @@ mod tests {
 				txid: stage_begin.txid,
 				new_db_size_pages,
 				now_ms,
+				origin_override: None,
 			},
 		)
 		.await?;
@@ -1513,6 +1517,7 @@ mod tests {
 				txid: 999,
 				new_db_size_pages: 1,
 				now_ms: 777,
+				origin_override: None,
 			},
 		)
 		.await
diff --git a/engine/packages/sqlite-storage/src/compaction/shard.rs b/engine/packages/sqlite-storage/src/compaction/shard.rs
index 9ea942d29e..d544d74f11 100644
--- a/engine/packages/sqlite-storage/src/compaction/shard.rs
+++ b/engine/packages/sqlite-storage/src/compaction/shard.rs
@@ -13,7 +13,7 @@ use crate::keys::{
 };
 use crate::ltx::{LtxHeader, decode_ltx_v3, encode_ltx_v3};
 use crate::quota::{encode_db_head_with_usage, tracked_storage_entry_size};
-use crate::types::{DBHead, DirtyPage, SQLITE_PAGE_SIZE};
+use crate::types::{DBHead, DirtyPage, SQLITE_PAGE_SIZE, decode_db_head};
 use crate::udb::{self, WriteOp};
 
 const PIDX_PGNO_BYTES: usize = std::mem::size_of::<u32>();
@@ -467,10 +467,6 @@ fn compute_materialized_txid(
 	}
 }
 
-fn decode_db_head(bytes: &[u8]) -> Result<DBHead> {
-	serde_bare::from_slice(bytes).context("decode sqlite db head")
-}
-
 fn decode_pidx_pgno(actor_id: &str, key: &[u8]) -> Result<u32> {
 	let prefix = pidx_delta_prefix(actor_id);
 	ensure!(
@@ -522,7 +518,7 @@ mod tests {
 	use crate::test_utils::{read_value, scan_prefix_values, test_db};
 	use crate::types::{
 		DBHead, DirtyPage, FetchedPage, SQLITE_DEFAULT_MAX_STORAGE_BYTES, SQLITE_PAGE_SIZE,
-		SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION,
+		SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION, SqliteOrigin,
 	};
 	use crate::udb::{WriteOp, apply_write_ops, test_hooks};
 
@@ -541,6 +537,7 @@ mod tests {
 			creation_ts_ms: 123,
 			sqlite_storage_used: 0,
 			sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES,
+			origin: SqliteOrigin::Native,
 		}
 	}
 
diff --git a/engine/packages/sqlite-storage/src/compaction/worker.rs b/engine/packages/sqlite-storage/src/compaction/worker.rs
index b4d27b7403..2e50e35dd5 100644
--- a/engine/packages/sqlite-storage/src/compaction/worker.rs
+++ b/engine/packages/sqlite-storage/src/compaction/worker.rs
@@ -68,7 +68,7 @@ mod tests {
 	use crate::test_utils::{clear_op_count, scan_prefix_values, test_db};
 	use crate::types::{
 		DBHead, DirtyPage, SQLITE_DEFAULT_MAX_STORAGE_BYTES, SQLITE_PAGE_SIZE, SQLITE_SHARD_SIZE,
-		SQLITE_VFS_V2_SCHEMA_VERSION,
+		SQLITE_VFS_V2_SCHEMA_VERSION, SqliteOrigin,
 	};
 	use crate::udb::{self, WriteOp, apply_write_ops};
 
@@ -87,6 +87,7 @@ mod tests {
 			creation_ts_ms: 123,
 			sqlite_storage_used: 0,
 			sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES,
+			origin: SqliteOrigin::Native,
 		}
 	}
 
diff --git a/engine/packages/sqlite-storage/src/engine.rs b/engine/packages/sqlite-storage/src/engine.rs
index d5732ba7f2..b2b38caf5f 100644
--- a/engine/packages/sqlite-storage/src/engine.rs
+++ b/engine/packages/sqlite-storage/src/engine.rs
@@ -12,7 +12,7 @@ use universaldb::Subspace;
 use crate::keys::{meta_key, pidx_delta_prefix};
 use crate::metrics::SqliteStorageMetrics;
 use crate::page_index::DeltaPageIndex;
-use crate::types::{DBHead, SQLITE_MAX_DELTA_BYTES, SqliteMeta};
+use crate::types::{DBHead, SQLITE_MAX_DELTA_BYTES, SqliteMeta, decode_db_head};
 use crate::udb;
 
 pub struct SqliteEngine {
@@ -56,23 +56,34 @@ impl SqliteEngine {
 	}
 
 	pub async fn load_head(&self, actor_id: &str) -> Result<DBHead> {
+		self.try_load_head(actor_id)
+			.await?
+			.context("sqlite meta missing")
+	}
+
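+	/// Like load_head, but returns Ok(None) instead of an error when the actor
+	/// has no v2 meta row yet.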
+	pub async fn try_load_head(&self, actor_id: &str) -> Result<Option<DBHead>> {
 		let meta_bytes = udb::get_value(
 			self.db.as_ref(),
 			&self.subspace,
 			self.op_counter.as_ref(),
 			meta_key(actor_id),
 		)
-		.await?
-		.context("sqlite meta missing")?;
+		.await?;
 
-		serde_bare::from_slice(&meta_bytes).context("decode sqlite db head")
+		meta_bytes
+			.map(|meta_bytes| decode_db_head(&meta_bytes))
+			.transpose()
 	}
 
 	pub async fn load_meta(&self, actor_id: &str) -> Result<SqliteMeta> {
-		Ok(SqliteMeta::from((
-			self.load_head(actor_id).await?,
-			SQLITE_MAX_DELTA_BYTES,
-		)))
+		self.try_load_meta(actor_id)
+			.await?
+			.context("sqlite meta missing")
+	}
+
+	pub async fn try_load_meta(&self, actor_id: &str) -> Result<Option<SqliteMeta>> {
+		Ok(self
+			.try_load_head(actor_id)
+			.await?
+			.map(|head| SqliteMeta::from((head, SQLITE_MAX_DELTA_BYTES))))
 	}
 
 	pub async fn get_or_load_pidx(
diff --git a/engine/packages/sqlite-storage/src/keys.rs b/engine/packages/sqlite-storage/src/keys.rs
index 4a1458886a..ddd9460a49 100644
--- a/engine/packages/sqlite-storage/src/keys.rs
+++ b/engine/packages/sqlite-storage/src/keys.rs
@@ -1,6 +1,7 @@
 //! Key builders for sqlite-storage blobs and indexes.
 
 use anyhow::{Context, Result, ensure};
+use universaldb::utils::end_of_key_range;
 
 pub const SQLITE_SUBSPACE_PREFIX: u8 = 0x02;
 
@@ -18,6 +19,12 @@ pub(crate) fn actor_prefix(actor_id: &str) -> Vec<u8> {
 	key
 }
 
+pub fn actor_range(actor_id: &str) -> (Vec<u8>, Vec<u8>) {
+	let start = actor_prefix(actor_id);
+	let end = end_of_key_range(&start);
+	(start, end)
+}
+
 pub fn meta_key(actor_id: &str) -> Vec<u8> {
 	let prefix = actor_prefix(actor_id);
 	let mut key = Vec::with_capacity(prefix.len() + META_PATH.len());
diff --git a/engine/packages/sqlite-storage/src/quota.rs b/engine/packages/sqlite-storage/src/quota.rs
index 0c87326cfe..d45973dcd5 100644
--- a/engine/packages/sqlite-storage/src/quota.rs
+++ b/engine/packages/sqlite-storage/src/quota.rs
@@ -63,6 +63,7 @@ mod tests {
 	use crate::keys::{delta_chunk_key, meta_key, pidx_delta_key, shard_key};
 	use crate::types::{
 		DBHead, SQLITE_DEFAULT_MAX_STORAGE_BYTES, SQLITE_PAGE_SIZE, SQLITE_SHARD_SIZE,
+		SqliteOrigin,
 	};
 
 	const TEST_ACTOR: &str = "test-actor";
@@ -97,6 +98,7 @@ mod tests {
 			creation_ts_ms: 123,
 			sqlite_storage_used: 0,
 			sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES,
+			origin: SqliteOrigin::Native,
 		};
 
 		let (encoded_head, encoded_bytes) = encode_db_head_with_usage(TEST_ACTOR, &head, 1_024)?;
diff --git a/engine/packages/sqlite-storage/src/read.rs b/engine/packages/sqlite-storage/src/read.rs
index 00ebeeda2c..865e54231b 100644
--- a/engine/packages/sqlite-storage/src/read.rs
+++ b/engine/packages/sqlite-storage/src/read.rs
@@ -14,7 +14,7 @@ use crate::keys::{
 };
 use crate::ltx::{DecodedLtx, decode_ltx_v3};
 use crate::page_index::DeltaPageIndex;
-use crate::types::{DBHead, FetchedPage};
+use crate::types::{DBHead, FetchedPage, decode_db_head};
 use crate::udb;
 
 const PIDX_PGNO_BYTES: usize = std::mem::size_of::<u32>();
@@ -346,10 +346,6 @@ struct GetPagesTxResult {
 	stale_pidx_pgnos: BTreeSet<u32>,
 }
 
-fn decode_db_head(bytes: &[u8]) -> Result<DBHead> {
-	serde_bare::from_slice(bytes).context("decode sqlite db head")
-}
-
 async fn load_delta_history_blobs(
 	engine: &SqliteEngine,
 	actor_id: &str,
@@ -466,7 +462,7 @@ mod tests {
 	use crate::test_utils::{assert_op_count, clear_op_count, read_value, test_db};
 	use crate::types::{
 		DBHead, DirtyPage, FetchedPage, SQLITE_DEFAULT_MAX_STORAGE_BYTES, SQLITE_PAGE_SIZE,
-		SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION,
+		SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION, SqliteOrigin,
 	};
 	use crate::udb::{WriteOp, apply_write_ops};
 
@@ -485,6 +481,7 @@ mod tests {
 			creation_ts_ms: 123,
 			sqlite_storage_used: 0,
 			sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES,
+			origin: SqliteOrigin::Native,
 		}
 	}
 
diff --git a/engine/packages/sqlite-storage/src/takeover.rs b/engine/packages/sqlite-storage/src/takeover.rs
index c61682ccb3..ba57f17650 100644
--- a/engine/packages/sqlite-storage/src/takeover.rs
+++ b/engine/packages/sqlite-storage/src/takeover.rs
@@ -9,11 +9,13 @@ use crate::engine::SqliteEngine;
 use crate::error::SqliteStorageError;
 use crate::keys::{
 	decode_delta_chunk_txid, delta_chunk_prefix, delta_prefix, meta_key, pidx_delta_prefix,
-	shard_key,
+	shard_key, shard_prefix,
 };
 use crate::ltx::decode_ltx_v3;
 use crate::quota::{encode_db_head_with_usage, tracked_storage_entry_size};
-use crate::types::{DBHead, FetchedPage, SQLITE_MAX_DELTA_BYTES, SqliteMeta};
+use crate::types::{
+	DBHead, FetchedPage, SQLITE_MAX_DELTA_BYTES, SqliteMeta, SqliteOrigin, decode_db_head,
+};
 use crate::udb::{self, WriteOp};
 
 pub const DEFAULT_PRELOAD_MAX_BYTES: usize = 1024 * 1024;
@@ -53,7 +55,66 @@ pub struct TakeoverResult {
 	pub preloaded_pages: Vec<FetchedPage>,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PrepareV1MigrationResult {
+	pub meta: SqliteMeta,
+}
+
 impl SqliteEngine {
+	pub async fn prepare_v1_migration(
+		&self,
+		actor_id: &str,
+		now_ms: i64,
+	) -> Result<PrepareV1MigrationResult> {
+		let actor_id = actor_id.to_string();
+		let actor_id_for_tx = actor_id.clone();
+		let subspace = self.subspace.clone();
+		let (head, _encoded_head) =
+			udb::run_db_op(self.db.as_ref(), self.op_counter.as_ref(), move |tx| {
+				let actor_id = actor_id_for_tx.clone();
+				let subspace = subspace.clone();
+				async move {
+					let meta_storage_key = meta_key(&actor_id);
+					if let Some(existing_meta) =
+						udb::tx_get_value_serializable(&tx, &subspace, &meta_storage_key).await?
+					{
+						let existing_head = decode_db_head(&existing_meta)?;
+						ensure!(
+							matches!(existing_head.origin, SqliteOrigin::MigratingFromV1),
+							SqliteStorageError::ConcurrentTakeover
+						);
+					}
+
+					udb::tx_delete_value_precise(&tx, &subspace, &meta_storage_key).await?;
+					for prefix in [
+						delta_prefix(&actor_id),
+						pidx_delta_prefix(&actor_id),
+						shard_prefix(&actor_id),
+					] {
+						for (key, _) in udb::tx_scan_prefix_values(&tx, &subspace, &prefix).await? {
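+							// tx_delete_value_precise also clears the chunk rows
+							// that back large values, so big blobs are fully wiped.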
+							udb::tx_delete_value_precise(&tx, &subspace, &key).await?;
+						}
+					}
+
+					let mut head = DBHead::new(now_ms);
+					head.origin = SqliteOrigin::MigratingFromV1;
+					let (head, encoded_head) = encode_db_head_with_usage(&actor_id, &head, 0)?;
+					udb::tx_write_value(&tx, &subspace, &meta_storage_key, &encoded_head)?;
+
+					Ok((head, encoded_head))
+				}
+			})
+			.await?;
+
+		self.page_indices.remove_async(&actor_id).await;
+		self.pending_stages
+			.retain_sync(|(pending_actor_id, _), _| pending_actor_id != &actor_id);
+
+		Ok(PrepareV1MigrationResult {
+			meta: SqliteMeta::from((head, SQLITE_MAX_DELTA_BYTES)),
+		})
+	}
+
 	pub async fn takeover(&self, actor_id: &str, config: TakeoverConfig) -> Result<TakeoverResult> {
 		let start = Instant::now();
 		let meta_bytes = udb::get_value(
@@ -343,10 +404,6 @@ fn collect_preload_pgnos(config: &TakeoverConfig) -> Vec<u32> {
 	requested.into_iter().collect()
 }
 
-fn decode_db_head(bytes: &[u8]) -> Result<DBHead> {
-	serde_bare::from_slice(bytes).context("decode sqlite db head")
-}
-
 fn decode_pidx_pgno(actor_id: &str, key: &[u8]) -> Result<u32> {
 	let prefix = pidx_delta_prefix(actor_id);
 	ensure!(
@@ -430,9 +487,12 @@ mod tests {
 	}
 	use crate::types::{
 		DBHead, DirtyPage, FetchedPage, SQLITE_DEFAULT_MAX_STORAGE_BYTES, SQLITE_MAX_DELTA_BYTES,
-		SQLITE_PAGE_SIZE, SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION,
+		SQLITE_PAGE_SIZE, SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION, SqliteOrigin,
+		decode_db_head,
+	};
+	use crate::udb::{
+		WriteOp, apply_write_ops, physical_chunk_key, raw_key_exists,
 	};
-	use crate::udb::{WriteOp, apply_write_ops};
 
 	const TEST_ACTOR: &str = "test-actor";
 
@@ -449,6 +509,7 @@ mod tests {
 			creation_ts_ms: 123,
 			sqlite_storage_used: 0,
 			sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES,
+			origin: SqliteOrigin::Native,
 		}
 	}
 
@@ -502,6 +563,89 @@ mod tests {
 		Ok(())
 	}
 
+	#[tokio::test]
+	async fn prepare_v1_migration_wipes_actor_rows_and_chunk_subkeys() -> Result<()> {
+		let (db, subspace) = test_db().await?;
+		let (engine, _compaction_rx) = SqliteEngine::new(db, subspace);
+		let large_orphan = vec![0x5a; 150_000];
+		let unrelated_key = meta_key("other-actor");
+		let orphan_key = delta_blob_key(TEST_ACTOR, 99);
+		let orphan_chunk_0 = physical_chunk_key(&engine.subspace, &orphan_key, 0);
+		let orphan_chunk_14 = physical_chunk_key(&engine.subspace, &orphan_key, 14);
+		apply_write_ops(
+			engine.db.as_ref(),
+			&engine.subspace,
+			engine.op_counter.as_ref(),
+			vec![
+				WriteOp::put(orphan_key.clone(), large_orphan),
+				WriteOp::put(pidx_delta_key(TEST_ACTOR, 1), 99_u64.to_be_bytes().to_vec()),
+				WriteOp::put(unrelated_key.clone(), vec![0x42]),
+			],
+		)
+		.await?;
+
+		assert!(
+			raw_key_exists(
+				engine.db.as_ref(),
+				engine.op_counter.as_ref(),
+				orphan_chunk_0.clone(),
+			)
+			.await?,
+			"chunked orphan should create physical chunk rows"
+		);
+		assert!(
+			raw_key_exists(
+				engine.db.as_ref(),
+				engine.op_counter.as_ref(),
+				orphan_chunk_14.clone(),
+			)
+			.await?,
+			"chunked orphan should create the tail chunk row too"
+		);
+
+		let prepared = engine.prepare_v1_migration(TEST_ACTOR, 4_242).await?;
+		assert_eq!(prepared.meta.origin, SqliteOrigin::MigratingFromV1);
+
+		assert!(read_value(&engine, orphan_key.clone()).await?.is_none());
+		assert!(
+			read_value(&engine, pidx_delta_key(TEST_ACTOR, 1))
+				.await?
+				.is_none()
+		);
+		let stored_meta = read_value(&engine, meta_key(TEST_ACTOR))
+			.await?
+ .expect("meta should be recreated"); + let head = decode_db_head(&stored_meta)?; + assert_eq!(head.origin, SqliteOrigin::MigratingFromV1); + assert_eq!(head.creation_ts_ms, 4_242); + assert!( + !raw_key_exists( + engine.db.as_ref(), + engine.op_counter.as_ref(), + orphan_chunk_0, + ) + .await?, + "orphaned chunk row 0 should be wiped" + ); + assert!( + !raw_key_exists( + engine.db.as_ref(), + engine.op_counter.as_ref(), + orphan_chunk_14, + ) + .await?, + "orphaned chunk subkeys should be wiped too" + ); + + assert_eq!( + read_value(&engine, unrelated_key.clone()).await?, + Some(vec![0x42]), + "cleanup should stay inside the actor prefix" + ); + + Ok(()) + } + #[tokio::test] async fn takeover_on_existing_meta_bumps_generation_and_preloads_page_one() -> Result<()> { let (db, subspace) = test_db().await?; diff --git a/engine/packages/sqlite-storage/src/types.rs b/engine/packages/sqlite-storage/src/types.rs index c49d5433a0..48b1d09cfe 100644 --- a/engine/packages/sqlite-storage/src/types.rs +++ b/engine/packages/sqlite-storage/src/types.rs @@ -1,5 +1,6 @@ //! Core storage types for the SQLite VFS v2 engine implementation. +use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; pub const SQLITE_VFS_V2_SCHEMA_VERSION: u32 = 2; @@ -20,6 +21,13 @@ pub const SQLITE_DEFAULT_MAX_STORAGE_BYTES: u64 = 10 * 1024 * 1024 * 1024; /// advances `materialized_txid` to the highest txid whose pages have all been merged. /// - `generation` is bumped by takeover. Every commit and compaction writes a fence check on /// `generation` so a takeover cleanly invalidates an in-flight commit from the previous owner. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum SqliteOrigin { + Native, + MigratedFromV1, + MigratingFromV1, +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct DBHead { pub schema_version: u32, @@ -33,6 +41,7 @@ pub struct DBHead { pub creation_ts_ms: i64, pub sqlite_storage_used: u64, pub sqlite_max_storage: u64, + pub origin: SqliteOrigin, } impl DBHead { @@ -49,6 +58,7 @@ impl DBHead { creation_ts_ms, sqlite_storage_used: 0, sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES, + origin: SqliteOrigin::Native, } } } @@ -77,6 +87,8 @@ pub struct SqliteMeta { pub max_delta_bytes: u64, pub sqlite_storage_used: u64, pub sqlite_max_storage: u64, + pub migrated_from_v1: bool, + pub origin: SqliteOrigin, } impl From<(DBHead, u64)> for SqliteMeta { @@ -92,6 +104,48 @@ impl From<(DBHead, u64)> for SqliteMeta { max_delta_bytes, sqlite_storage_used: head.sqlite_storage_used, sqlite_max_storage: head.sqlite_max_storage, + migrated_from_v1: matches!(head.origin, SqliteOrigin::MigratedFromV1), + origin: head.origin, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct LegacyDBHead { + schema_version: u32, + generation: u64, + head_txid: u64, + next_txid: u64, + materialized_txid: u64, + db_size_pages: u32, + page_size: u32, + shard_size: u32, + creation_ts_ms: i64, + sqlite_storage_used: u64, + sqlite_max_storage: u64, +} + +pub fn decode_db_head(bytes: &[u8]) -> Result { + match serde_bare::from_slice(bytes) { + Ok(head) => Ok(head), + Err(err) => { + let legacy: LegacyDBHead = + serde_bare::from_slice(bytes).context("decode sqlite db head")?; + tracing::debug!(?err, "decoded legacy sqlite db head without origin field"); + Ok(DBHead { + schema_version: legacy.schema_version, + generation: legacy.generation, + head_txid: legacy.head_txid, + next_txid: legacy.next_txid, + materialized_txid: legacy.materialized_txid, + 
+				db_size_pages: legacy.db_size_pages,
+				page_size: legacy.page_size,
+				shard_size: legacy.shard_size,
+				creation_ts_ms: legacy.creation_ts_ms,
+				sqlite_storage_used: legacy.sqlite_storage_used,
+				sqlite_max_storage: legacy.sqlite_max_storage,
+				origin: SqliteOrigin::Native,
+			})
 		}
 	}
 }

@@ -101,6 +155,7 @@ mod tests {
 	use super::{
 		DBHead, DirtyPage, FetchedPage, SQLITE_DEFAULT_MAX_STORAGE_BYTES, SQLITE_MAX_DELTA_BYTES,
 		SQLITE_PAGE_SIZE, SQLITE_SHARD_SIZE, SQLITE_VFS_V2_SCHEMA_VERSION, SqliteMeta,
+		SqliteOrigin, decode_db_head,
 	};

 	#[test]
@@ -118,6 +173,7 @@
 		assert_eq!(head.creation_ts_ms, 1_713_456_789_000);
 		assert_eq!(head.sqlite_storage_used, 0);
 		assert_eq!(head.sqlite_max_storage, SQLITE_DEFAULT_MAX_STORAGE_BYTES);
+		assert_eq!(head.origin, SqliteOrigin::Native);
 	}

 	#[test]
@@ -134,6 +190,7 @@
 			creation_ts_ms: 1_713_456_789_000,
 			sqlite_storage_used: 8_192,
 			sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES,
+			origin: SqliteOrigin::MigratedFromV1,
 		};

 		let encoded = serde_bare::to_vec(&head).expect("db head should serialize");
@@ -157,6 +214,7 @@
 				creation_ts_ms: 456,
 				sqlite_storage_used: 16_384,
 				sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES / 2,
+				origin: SqliteOrigin::MigratedFromV1,
 			},
 			SQLITE_MAX_DELTA_BYTES,
 		));
@@ -174,10 +232,35 @@
 				max_delta_bytes: SQLITE_MAX_DELTA_BYTES,
 				sqlite_storage_used: 16_384,
 				sqlite_max_storage: SQLITE_DEFAULT_MAX_STORAGE_BYTES / 2,
+				migrated_from_v1: true,
+				origin: SqliteOrigin::MigratedFromV1,
 			}
 		);
 	}

+	#[test]
+	fn decode_db_head_defaults_legacy_rows_to_native_origin() {
+		let legacy = (
+			SQLITE_VFS_V2_SCHEMA_VERSION,
+			7_u64,
+			9_u64,
+			10_u64,
+			5_u64,
+			321_u32,
+			SQLITE_PAGE_SIZE,
+			SQLITE_SHARD_SIZE,
+			1_713_456_789_000_i64,
+			8_192_u64,
+			SQLITE_DEFAULT_MAX_STORAGE_BYTES,
+		);
+		let encoded = serde_bare::to_vec(&legacy).expect("legacy head should serialize");
+		let decoded = decode_db_head(&encoded).expect("legacy head should decode");
+
+		assert_eq!(decoded.origin, SqliteOrigin::Native);
+		assert_eq!(decoded.generation, 7);
+		assert_eq!(decoded.db_size_pages, 321);
+	}
+
 	#[test]
 	fn page_types_preserve_payloads() {
 		let dirty = DirtyPage {
diff --git a/engine/packages/sqlite-storage/src/udb.rs b/engine/packages/sqlite-storage/src/udb.rs
index 7b63d6442f..8ba9a057f5 100644
--- a/engine/packages/sqlite-storage/src/udb.rs
+++ b/engine/packages/sqlite-storage/src/udb.rs
@@ -188,6 +188,49 @@ pub(crate) async fn tx_scan_prefix_values(
 	Ok(rows)
 }

+pub(crate) async fn tx_delete_value_precise(
+	tx: &universaldb::Transaction,
+	subspace: &Subspace,
+	key: &[u8],
+) -> Result<()> {
+	let metadata = tx.get(&physical_key(subspace, key), Snapshot).await?;
+	tx.clear(&physical_key(subspace, key));
+
+	if let Some(metadata) = metadata.as_ref() {
+		match metadata.first().copied() {
+			Some(INLINE_VALUE_MARKER) | None => {}
+			Some(CHUNKED_VALUE_MARKER) => {
+				ensure!(
+					metadata.len() == CHUNKED_METADATA_LEN,
+					"chunked metadata for key {:?} had invalid length {}",
+					key,
+					metadata.len()
+				);
+				let chunk_count = u32::from_be_bytes(
+					metadata[5..9]
+						.try_into()
+						.expect("chunked metadata count bytes should be present"),
+				);
+				for chunk_idx in 0..chunk_count {
+					tx.clear(&physical_key(subspace, &chunk_key(key, chunk_idx)));
+				}
+			}
+			Some(other) => {
+				return Err(anyhow::anyhow!(
+					"unknown sqlite-storage value marker {other} for key {:?}",
+					key
+				));
+			}
+		}
+	}
+
+	let prefix = chunk_key_prefix(key);
+	let physical_prefix = physical_key(subspace, &prefix);
+	tx.clear_range(&physical_prefix,
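+		// Byte 0 of the metadata is the marker and bytes 5..9 the big-endian
+		// chunk count; the enumerated clears above remove the counted chunks,
+		// while this range clear sweeps the whole chunk-key prefix so any
+		// chunk rows not covered by the count (orphaned subkeys) go too.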
&end_of_key_range(&physical_prefix)); + + Ok(()) +} + pub(crate) fn tx_write_value( tx: &universaldb::Transaction, subspace: &Subspace, @@ -312,6 +355,26 @@ fn physical_key(subspace: &Subspace, key: &[u8]) -> Vec { [subspace.bytes(), key].concat() } +#[cfg(test)] +pub fn physical_chunk_key(subspace: &Subspace, key: &[u8], chunk_idx: u32) -> Vec { + physical_key(subspace, &chunk_key(key, chunk_idx)) +} + +#[cfg(test)] +pub async fn raw_key_exists( + db: &universaldb::Database, + op_counter: &AtomicUsize, + key: Vec, +) -> Result { + run_db_op(db, op_counter, move |tx| { + let key = key.clone(); + async move { + Ok(tx.get(&key, Snapshot).await?.is_some()) + } + }) + .await +} + #[cfg(test)] pub mod test_hooks { use std::sync::Mutex; diff --git a/rivetkit-rust/packages/rivetkit-core/src/registry.rs b/rivetkit-rust/packages/rivetkit-core/src/registry.rs index 119ee7071e..b0b4c25275 100644 --- a/rivetkit-rust/packages/rivetkit-core/src/registry.rs +++ b/rivetkit-rust/packages/rivetkit-core/src/registry.rs @@ -2216,7 +2216,6 @@ impl RegistryDispatcher { generation: u32, actor_name: &str, key: ActorKey, - sqlite_schema_version: u32, sqlite_startup_data: Option, factory: &ActorFactory, ) -> ActorContext { @@ -2230,7 +2229,6 @@ impl RegistryDispatcher { SqliteDb::new( handle.clone(), actor_id.to_owned(), - sqlite_schema_version, sqlite_startup_data, ), ); @@ -2248,7 +2246,7 @@ impl EnvoyCallbacks for RegistryCallbacks { generation: u32, config: protocol::ActorConfig, preloaded_kv: Option, - sqlite_schema_version: u32, + _sqlite_schema_version: u32, sqlite_startup_data: Option, ) -> EnvoyBoxFuture> { let dispatcher = self.dispatcher.clone(); @@ -2267,7 +2265,6 @@ impl EnvoyCallbacks for RegistryCallbacks { generation, &actor_name, key, - sqlite_schema_version, sqlite_startup_data, factory.as_ref(), ); diff --git a/rivetkit-rust/packages/rivetkit-core/src/sqlite.rs b/rivetkit-rust/packages/rivetkit-core/src/sqlite.rs index 713a6862d1..486260fa17 100644 --- a/rivetkit-rust/packages/rivetkit-core/src/sqlite.rs +++ b/rivetkit-rust/packages/rivetkit-core/src/sqlite.rs @@ -13,7 +13,7 @@ pub use rivetkit_sqlite::query::{BindParam, ColumnValue, ExecResult, QueryResult use rivetkit_sqlite::{ database::{NativeDatabaseHandle, open_database_from_envoy}, query::{exec_statements, execute_statement, query_statement}, - v2::vfs::SqliteVfsMetricsSnapshot, + vfs::SqliteVfsMetricsSnapshot, }; #[cfg(not(feature = "sqlite"))] @@ -64,7 +64,6 @@ pub struct SqliteVfsMetricsSnapshot { pub struct SqliteRuntimeConfig { pub handle: EnvoyHandle, pub actor_id: String, - pub schema_version: u32, pub startup_data: Option, } @@ -72,7 +71,6 @@ pub struct SqliteRuntimeConfig { pub struct SqliteDb { handle: Option, actor_id: Option, - schema_version: Option, startup_data: Option, #[cfg(feature = "sqlite")] db: std::sync::Arc>>, @@ -82,13 +80,11 @@ impl SqliteDb { pub fn new( handle: EnvoyHandle, actor_id: impl Into, - schema_version: u32, startup_data: Option, ) -> Self { Self { handle: Some(handle), actor_id: Some(actor_id.into()), - schema_version: Some(schema_version), startup_data, #[cfg(feature = "sqlite")] db: Default::default(), @@ -137,7 +133,7 @@ impl SqliteDb { self.handle()?.sqlite_commit_finalize(request).await } - pub async fn open(&self, preloaded_entries: Vec<(Vec, Vec)>) -> Result<()> { + pub async fn open(&self) -> Result<()> { #[cfg(feature = "sqlite")] { let config = self.runtime_config()?; @@ -156,9 +152,7 @@ impl SqliteDb { let native_db = open_database_from_envoy( config.handle, config.actor_id, - 
config.schema_version, config.startup_data, - preloaded_entries, rt_handle, )?; *guard = Some(native_db); @@ -170,17 +164,14 @@ impl SqliteDb { #[cfg(not(feature = "sqlite"))] { - let _ = preloaded_entries; - bail!( - "actor database is not available because rivetkit-core was built without the `sqlite` feature" - ) + bail!("actor database is not available because rivetkit-core was built without the `sqlite` feature") } } pub async fn exec(&self, sql: impl Into) -> Result { #[cfg(feature = "sqlite")] { - self.open(Vec::new()).await?; + self.open().await?; let sql = sql.into(); let db = self.db.clone(); tokio::task::spawn_blocking(move || { @@ -212,7 +203,7 @@ impl SqliteDb { ) -> Result { #[cfg(feature = "sqlite")] { - self.open(Vec::new()).await?; + self.open().await?; let sql = sql.into(); let db = self.db.clone(); tokio::task::spawn_blocking(move || { @@ -244,7 +235,7 @@ impl SqliteDb { ) -> Result { #[cfg(feature = "sqlite")] { - self.open(Vec::new()).await?; + self.open().await?; let sql = sql.into(); let db = self.db.clone(); tokio::task::spawn_blocking(move || { @@ -313,11 +304,10 @@ impl SqliteDb { pub fn metrics(&self) -> Option { #[cfg(feature = "sqlite")] { - self.db.lock().ok().and_then(|guard| { - guard - .as_ref() - .and_then(NativeDatabaseHandle::sqlite_vfs_metrics) - }) + self.db + .lock() + .ok() + .and_then(|guard| guard.as_ref().map(NativeDatabaseHandle::sqlite_vfs_metrics)) } #[cfg(not(feature = "sqlite"))] @@ -333,9 +323,6 @@ impl SqliteDb { .actor_id .clone() .ok_or_else(|| anyhow!("sqlite actor id is not configured"))?, - schema_version: self - .schema_version - .ok_or_else(|| anyhow!("sqlite schema version is not configured"))?, startup_data: self.startup_data.clone(), }) } @@ -372,7 +359,6 @@ impl std::fmt::Debug for SqliteDb { f.debug_struct("SqliteDb") .field("configured", &self.handle.is_some()) .field("actor_id", &self.actor_id) - .field("schema_version", &self.schema_version) .finish() } } diff --git a/rivetkit-rust/packages/rivetkit-sqlite/Cargo.toml b/rivetkit-rust/packages/rivetkit-sqlite/Cargo.toml index cbd589c634..5d0fd18732 100644 --- a/rivetkit-rust/packages/rivetkit-sqlite/Cargo.toml +++ b/rivetkit-rust/packages/rivetkit-sqlite/Cargo.toml @@ -5,14 +5,13 @@ edition.workspace = true authors.workspace = true license.workspace = true workspace = "../../../" -description = "Native SQLite VFS for RivetKit backed by a transport-agnostic KV trait" +description = "Native SQLite VFS for RivetKit backed by sqlite-storage" [lib] crate-type = ["lib"] [dependencies] anyhow.workspace = true -async-trait.workspace = true libsqlite3-sys = { version = "0.30", features = ["bundled"] } rivet-envoy-client.workspace = true tokio.workspace = true diff --git a/rivetkit-rust/packages/rivetkit-sqlite/examples/v1_baseline_bench.rs b/rivetkit-rust/packages/rivetkit-sqlite/examples/v1_baseline_bench.rs deleted file mode 100644 index 96ba5a87c5..0000000000 --- a/rivetkit-rust/packages/rivetkit-sqlite/examples/v1_baseline_bench.rs +++ /dev/null @@ -1,400 +0,0 @@ -use std::collections::HashMap; -use std::ffi::{CStr, CString, c_void}; -use std::ptr; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::{Arc, Mutex}; -use std::time::Instant; - -use async_trait::async_trait; -use libsqlite3_sys::*; -use rivetkit_sqlite::sqlite_kv::{KvGetResult, SqliteKv, SqliteKvError}; -use rivetkit_sqlite::vfs::{KvVfs, open_database}; - -const PAGE_SIZE_BYTES: usize = 4096; - -#[derive(Clone, Copy, Default)] -struct OpTotals { - get: u64, - put: u64, - delete: u64, - delete_range: u64, -} - 
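
Worth pausing on the `decode_db_head` fallback added in `types.rs` above: `serde_bare` writes struct fields positionally with no per-field tags, which is exactly what makes a trailing-field upgrade decodable. A minimal standalone sketch of the same pattern (type names here are illustrative, not part of this patch; it assumes the `serde`, `serde_bare`, and `anyhow` crates already in this workspace):

    use serde::{Deserialize, Serialize};

    #[derive(Debug, PartialEq, Serialize, Deserialize)]
    enum Origin {
        Native,
        MigratedFromV1,
        MigratingFromV1,
    }

    // Old on-disk layout: no trailing `origin` field.
    #[derive(Serialize, Deserialize)]
    struct HeadV1 {
        generation: u64,
    }

    // New layout: the old fields plus one trailing enum.
    #[derive(Debug, Serialize, Deserialize)]
    struct HeadV2 {
        generation: u64,
        origin: Origin,
    }

    fn decode(bytes: &[u8]) -> anyhow::Result<HeadV2> {
        // New bytes decode directly; legacy bytes hit EOF on the missing
        // trailing field, so fall back to the old layout and default the origin.
        match serde_bare::from_slice::<HeadV2>(bytes) {
            Ok(head) => Ok(head),
            Err(_) => {
                let legacy: HeadV1 = serde_bare::from_slice(bytes)?;
                Ok(HeadV2 {
                    generation: legacy.generation,
                    origin: Origin::Native,
                })
            }
        }
    }

    fn main() -> anyhow::Result<()> {
        let old_bytes = serde_bare::to_vec(&HeadV1 { generation: 7 })?;
        assert_eq!(decode(&old_bytes)?.origin, Origin::Native);
        Ok(())
    }

The ordering matters: try the new layout first, because a BARE reader may leave trailing bytes unread, so a new-format head could otherwise silently decode as legacy.
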
-impl OpTotals { - fn round_trips(self) -> u64 { - self.get + self.put + self.delete + self.delete_range - } -} - -#[derive(Default)] -struct MemoryKv { - stores: Mutex, Vec>>>, - op_totals: Mutex>, -} - -impl MemoryKv { - fn record_get(&self, actor_id: &str) { - let mut totals = self.op_totals.lock().unwrap(); - totals.entry(actor_id.to_string()).or_default().get += 1; - } - - fn record_put(&self, actor_id: &str) { - let mut totals = self.op_totals.lock().unwrap(); - totals.entry(actor_id.to_string()).or_default().put += 1; - } - - fn record_delete(&self, actor_id: &str) { - let mut totals = self.op_totals.lock().unwrap(); - totals.entry(actor_id.to_string()).or_default().delete += 1; - } - - fn record_delete_range(&self, actor_id: &str) { - let mut totals = self.op_totals.lock().unwrap(); - totals.entry(actor_id.to_string()).or_default().delete_range += 1; - } - - fn totals_for(&self, actor_id: &str) -> OpTotals { - self.op_totals - .lock() - .unwrap() - .get(actor_id) - .copied() - .unwrap_or_default() - } -} - -#[async_trait] -impl SqliteKv for MemoryKv { - async fn batch_get( - &self, - actor_id: &str, - keys: Vec>, - ) -> Result { - self.record_get(actor_id); - - let store_guard = self.stores.lock().unwrap(); - let actor_store = store_guard.get(actor_id); - let mut found_keys = Vec::new(); - let mut found_values = Vec::new(); - - for key in keys { - if let Some(value) = actor_store.and_then(|store| store.get(&key)) { - found_keys.push(key); - found_values.push(value.clone()); - } - } - - Ok(KvGetResult { - keys: found_keys, - values: found_values, - }) - } - - async fn batch_put( - &self, - actor_id: &str, - keys: Vec>, - values: Vec>, - ) -> Result<(), SqliteKvError> { - if keys.len() != values.len() { - return Err(SqliteKvError::new("keys and values length mismatch")); - } - - self.record_put(actor_id); - - let mut stores = self.stores.lock().unwrap(); - let actor_store = stores.entry(actor_id.to_string()).or_default(); - for (key, value) in keys.into_iter().zip(values.into_iter()) { - actor_store.insert(key, value); - } - - Ok(()) - } - - async fn batch_delete(&self, actor_id: &str, keys: Vec>) -> Result<(), SqliteKvError> { - self.record_delete(actor_id); - - let mut stores = self.stores.lock().unwrap(); - let actor_store = stores.entry(actor_id.to_string()).or_default(); - for key in keys { - actor_store.remove(&key); - } - - Ok(()) - } - - async fn delete_range( - &self, - actor_id: &str, - start: Vec, - end: Vec, - ) -> Result<(), SqliteKvError> { - self.record_delete_range(actor_id); - - let mut stores = self.stores.lock().unwrap(); - let actor_store = stores.entry(actor_id.to_string()).or_default(); - actor_store.retain(|key, _| { - !(key.as_slice() >= start.as_slice() && key.as_slice() < end.as_slice()) - }); - - Ok(()) - } -} - -#[derive(Clone, Copy)] -struct WorkloadResult { - latency_ms: f64, - round_trips: u64, -} - -static NEXT_ID: AtomicU64 = AtomicU64::new(1); - -fn next_name(prefix: &str) -> String { - let id = NEXT_ID.fetch_add(1, Ordering::Relaxed); - format!("{prefix}-{id}") -} - -fn with_database( - kv: Arc, - actor_id: &str, - callback: impl FnOnce(*mut sqlite3) -> T, -) -> T { - let runtime = tokio::runtime::Builder::new_current_thread() - .build() - .unwrap(); - let vfs_name = next_name("sqlite-native-bench-vfs"); - let vfs = KvVfs::register( - &vfs_name, - kv, - actor_id.to_string(), - runtime.handle().clone(), - Vec::new(), - ) - .unwrap(); - let db = open_database(vfs, actor_id).unwrap(); - let output = callback(db.as_ptr()); - drop(db); - drop(runtime); - 
output -} - -fn exec_sql(db: *mut sqlite3, sql: &str) { - let c_sql = CString::new(sql).unwrap(); - let mut err_msg = ptr::null_mut(); - let rc = unsafe { sqlite3_exec(db, c_sql.as_ptr(), None, ptr::null_mut(), &mut err_msg) }; - if rc != SQLITE_OK { - let message = if err_msg.is_null() { - format!("sqlite error {rc}") - } else { - let message = unsafe { CStr::from_ptr(err_msg) } - .to_string_lossy() - .into_owned(); - unsafe { - sqlite3_free(err_msg as *mut c_void); - } - message - }; - panic!("sqlite3_exec failed for `{sql}`: {message}"); - } -} - -fn prepare_statement(db: *mut sqlite3, sql: &str) -> *mut sqlite3_stmt { - let c_sql = CString::new(sql).unwrap(); - let mut stmt = ptr::null_mut(); - let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) }; - assert_eq!(rc, SQLITE_OK, "failed to prepare `{sql}`"); - assert!( - !stmt.is_null(), - "sqlite returned null statement for `{sql}`" - ); - stmt -} - -fn finalize_statement(stmt: *mut sqlite3_stmt) { - let rc = unsafe { sqlite3_finalize(stmt) }; - assert_eq!(rc, SQLITE_OK, "failed to finalize statement"); -} - -fn insert_blob(db: *mut sqlite3, payload: &[u8]) { - let stmt = prepare_statement(db, "INSERT INTO payloads (body) VALUES (?1);"); - let bind_rc = unsafe { - sqlite3_bind_blob( - stmt, - 1, - payload.as_ptr() as *const c_void, - payload.len() as i32, - SQLITE_TRANSIENT(), - ) - }; - assert_eq!(bind_rc, SQLITE_OK, "failed to bind blob payload"); - - let step_rc = unsafe { sqlite3_step(stmt) }; - assert_eq!(step_rc, SQLITE_DONE, "failed to insert blob payload"); - finalize_statement(stmt); -} - -fn insert_page_rows(db: *mut sqlite3, rows: usize) { - let payload = vec![0x5au8; PAGE_SIZE_BYTES]; - let stmt = prepare_statement(db, "INSERT INTO payloads (body) VALUES (?1);"); - - for _ in 0..rows { - let clear_rc = unsafe { sqlite3_clear_bindings(stmt) }; - assert_eq!(clear_rc, SQLITE_OK, "failed to clear bindings"); - - let reset_rc = unsafe { sqlite3_reset(stmt) }; - assert_eq!(reset_rc, SQLITE_OK, "failed to reset statement"); - - let bind_rc = unsafe { - sqlite3_bind_blob( - stmt, - 1, - payload.as_ptr() as *const c_void, - payload.len() as i32, - SQLITE_TRANSIENT(), - ) - }; - assert_eq!(bind_rc, SQLITE_OK, "failed to bind page payload"); - - let step_rc = unsafe { sqlite3_step(stmt) }; - assert_eq!(step_rc, SQLITE_DONE, "failed to insert page payload"); - } - - finalize_statement(stmt); -} - -fn select_page_rows(db: *mut sqlite3) { - let stmt = prepare_statement(db, "SELECT body FROM payloads ORDER BY id;"); - let mut rows = 0usize; - - loop { - let step_rc = unsafe { sqlite3_step(stmt) }; - if step_rc == SQLITE_DONE { - break; - } - assert_eq!(step_rc, SQLITE_ROW, "expected row while reading payloads"); - let bytes = unsafe { sqlite3_column_bytes(stmt, 0) } as usize; - assert_eq!(bytes, PAGE_SIZE_BYTES, "expected one page per payload row"); - rows += 1; - } - - assert_eq!(rows, 100, "expected to read 100 payload rows"); - finalize_statement(stmt); -} - -fn run_workload(name: &str, callback: impl FnOnce(Arc, &str) -> ()) -> WorkloadResult { - let actor_id = next_name("sqlite-native-bench-actor"); - let kv = Arc::new(MemoryKv::default()); - let started_at = Instant::now(); - callback(kv.clone(), &actor_id); - let elapsed = started_at.elapsed(); - let totals = kv.totals_for(&actor_id); - - let result = WorkloadResult { - latency_ms: elapsed.as_secs_f64() * 1000.0, - round_trips: totals.round_trips(), - }; - - println!( - "RESULT\t{name}\t{:.3}\t{}", - result.latency_ms, result.round_trips - ); - 
result -} - -fn workload_one_mib_insert() -> WorkloadResult { - run_workload("1 MiB insert", |kv, actor_id| { - with_database(kv, actor_id, |db| { - exec_sql( - db, - "CREATE TABLE payloads (id INTEGER PRIMARY KEY, body BLOB NOT NULL);", - ); - let payload = vec![0x11u8; 1024 * 1024]; - insert_blob(db, &payload); - }); - }) -} - -fn workload_ten_mib_insert() -> WorkloadResult { - run_workload("10 MiB insert", |kv, actor_id| { - with_database(kv, actor_id, |db| { - exec_sql( - db, - "CREATE TABLE payloads (id INTEGER PRIMARY KEY, body BLOB NOT NULL);", - ); - let payload = vec![0x22u8; 10 * 1024 * 1024]; - insert_blob(db, &payload); - }); - }) -} - -fn workload_hot_row_update() -> WorkloadResult { - run_workload("hot-row update", |kv, actor_id| { - with_database(kv, actor_id, |db| { - exec_sql( - db, - "CREATE TABLE counters (id INTEGER PRIMARY KEY, value INTEGER NOT NULL);", - ); - exec_sql(db, "INSERT INTO counters (id, value) VALUES (1, 0);"); - for _ in 0..100 { - exec_sql(db, "UPDATE counters SET value = value + 1 WHERE id = 1;"); - } - }); - }) -} - -fn workload_cold_read() -> WorkloadResult { - run_workload("cold read", |kv, actor_id| { - with_database(kv.clone(), actor_id, |db| { - exec_sql( - db, - "CREATE TABLE payloads (id INTEGER PRIMARY KEY, body BLOB NOT NULL);", - ); - insert_page_rows(db, 100); - }); - - with_database(kv, actor_id, |db| { - select_page_rows(db); - }); - }) -} - -fn workload_mixed_read_write() -> WorkloadResult { - run_workload("mixed read/write", |kv, actor_id| { - with_database(kv, actor_id, |db| { - exec_sql( - db, - "CREATE TABLE items (id INTEGER PRIMARY KEY, value INTEGER NOT NULL);", - ); - exec_sql( - db, - "INSERT INTO items (id, value) VALUES - (1, 10), (2, 20), (3, 30), (4, 40), (5, 50);", - ); - for _ in 0..25 { - exec_sql(db, "SELECT value FROM items WHERE id = 3;"); - exec_sql(db, "UPDATE items SET value = value + 1 WHERE id = 3;"); - exec_sql(db, "INSERT INTO items (value) VALUES (99);"); - exec_sql( - db, - "DELETE FROM items WHERE id = (SELECT MIN(id) FROM items);", - ); - } - }); - }) -} - -fn main() { - let results = [ - workload_one_mib_insert(), - workload_ten_mib_insert(), - workload_hot_row_update(), - workload_cold_read(), - workload_mixed_read_write(), - ]; - - println!( - "SUMMARY\tpage_size_bytes={}\tworkloads={}", - PAGE_SIZE_BYTES, - results.len() - ); -} diff --git a/rivetkit-rust/packages/rivetkit-sqlite/src/database.rs b/rivetkit-rust/packages/rivetkit-sqlite/src/database.rs index f87456a1dd..b57187422a 100644 --- a/rivetkit-rust/packages/rivetkit-sqlite/src/database.rs +++ b/rivetkit-rust/packages/rivetkit-sqlite/src/database.rs @@ -1,173 +1,31 @@ -use std::sync::Arc; - use anyhow::{Result, anyhow}; -use async_trait::async_trait; use rivet_envoy_client::handle::EnvoyHandle; use rivet_envoy_protocol as protocol; use tokio::runtime::Handle; -use crate::sqlite_kv::{KvGetResult, SqliteKv, SqliteKvError}; -use crate::v2::vfs::{NativeDatabaseV2, SqliteVfsMetricsSnapshot, SqliteVfsV2, VfsV2Config}; -use crate::vfs::{KvVfs, NativeDatabase}; - -pub struct EnvoyKv { - handle: EnvoyHandle, - actor_id: String, -} - -impl EnvoyKv { - pub fn new(handle: EnvoyHandle, actor_id: String) -> Self { - Self { handle, actor_id } - } -} - -#[async_trait] -impl SqliteKv for EnvoyKv { - fn on_error(&self, actor_id: &str, error: &SqliteKvError) { - tracing::error!(%actor_id, %error, "native sqlite kv operation failed"); - } - - async fn on_open(&self, _actor_id: &str) -> Result<(), SqliteKvError> { - Ok(()) - } - - async fn on_close(&self, _actor_id: 
&str) -> Result<(), SqliteKvError> { - Ok(()) - } - - async fn batch_get( - &self, - _actor_id: &str, - keys: Vec>, - ) -> Result { - let result = self - .handle - .kv_get(self.actor_id.clone(), keys.clone()) - .await - .map_err(|e| SqliteKvError::new(e.to_string()))?; - - let mut out_keys = Vec::new(); - let mut out_values = Vec::new(); - for (i, val) in result.into_iter().enumerate() { - if let Some(v) = val { - out_keys.push(keys[i].clone()); - out_values.push(v); - } - } +use crate::vfs::{NativeDatabase, SqliteVfs, VfsConfig}; - Ok(KvGetResult { - keys: out_keys, - values: out_values, - }) - } - - async fn batch_put( - &self, - _actor_id: &str, - keys: Vec>, - values: Vec>, - ) -> Result<(), SqliteKvError> { - let entries: Vec<(Vec, Vec)> = keys.into_iter().zip(values).collect(); - self.handle - .kv_put(self.actor_id.clone(), entries) - .await - .map_err(|e| SqliteKvError::new(e.to_string())) - } - - async fn batch_delete(&self, _actor_id: &str, keys: Vec>) -> Result<(), SqliteKvError> { - self.handle - .kv_delete(self.actor_id.clone(), keys) - .await - .map_err(|e| SqliteKvError::new(e.to_string())) - } - - async fn delete_range( - &self, - _actor_id: &str, - start: Vec, - end: Vec, - ) -> Result<(), SqliteKvError> { - self.handle - .kv_delete_range(self.actor_id.clone(), start, end) - .await - .map_err(|e| SqliteKvError::new(e.to_string())) - } -} - -pub enum NativeDatabaseHandle { - V1(NativeDatabase), - V2(NativeDatabaseV2), -} - -impl NativeDatabaseHandle { - pub fn as_ptr(&self) -> *mut libsqlite3_sys::sqlite3 { - match self { - Self::V1(db) => db.as_ptr(), - Self::V2(db) => db.as_ptr(), - } - } - - pub fn take_last_kv_error(&self) -> Option { - match self { - Self::V1(db) => db.take_last_kv_error(), - Self::V2(db) => db.take_last_kv_error(), - } - } - - pub fn sqlite_vfs_metrics(&self) -> Option { - match self { - Self::V1(_) => None, - Self::V2(db) => Some(db.sqlite_vfs_metrics()), - } - } -} +pub type NativeDatabaseHandle = NativeDatabase; pub fn open_database_from_envoy( handle: EnvoyHandle, actor_id: String, - schema_version: u32, startup_data: Option, - preloaded_entries: Vec<(Vec, Vec)>, rt_handle: Handle, ) -> Result { - match schema_version { - 1 => { - let vfs_name = format!("envoy-kv-{actor_id}"); - let envoy_kv = Arc::new(EnvoyKv::new(handle, actor_id.clone())); - let vfs = KvVfs::register( - &vfs_name, - envoy_kv, - actor_id.clone(), - rt_handle, - preloaded_entries, - ) - .map_err(|e| anyhow!("failed to register VFS: {e}"))?; - - crate::vfs::open_database(vfs, &actor_id) - .map(NativeDatabaseHandle::V1) - .map_err(|e| anyhow!("failed to open database: {e}")) - } - 2 => { - let startup = startup_data.ok_or_else(|| { - anyhow!("missing sqlite startup data for actor {actor_id} using schema version 2") - })?; - let vfs_name = format!("envoy-sqlite-v2-{actor_id}"); - let vfs = SqliteVfsV2::register( - &vfs_name, - handle, - actor_id.clone(), - rt_handle, - startup, - VfsV2Config::default(), - ) - .map_err(|e| anyhow!("failed to register V2 VFS: {e}"))?; - - crate::v2::vfs::open_database(vfs, &actor_id) - .map(NativeDatabaseHandle::V2) - .map_err(|e| anyhow!("failed to open V2 database: {e}")) - } - version => Err(anyhow!( - "unsupported sqlite schema version {version} for actor {actor_id}" - )), - } + let startup = startup_data + .ok_or_else(|| anyhow!("missing sqlite startup data for actor {actor_id}"))?; + let vfs_name = format!("envoy-sqlite-{actor_id}"); + let vfs = SqliteVfs::register( + &vfs_name, + handle, + actor_id.clone(), + rt_handle, + startup, + 
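+		// Assumed here: VfsConfig carries the same tuning defaults (page-cache
+		// capacity, prefetch depth, stage size) as the VfsV2Config it replaces.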
VfsConfig::default(), + ) + .map_err(|e| anyhow!("failed to register sqlite VFS: {e}"))?; + + crate::vfs::open_database(vfs, &actor_id) + .map_err(|e| anyhow!("failed to open sqlite database: {e}")) } diff --git a/rivetkit-rust/packages/rivetkit-sqlite/src/kv.rs b/rivetkit-rust/packages/rivetkit-sqlite/src/kv.rs deleted file mode 100644 index ffd43f9151..0000000000 --- a/rivetkit-rust/packages/rivetkit-sqlite/src/kv.rs +++ /dev/null @@ -1,198 +0,0 @@ -//! KV key layout for SQLite-over-KV storage. -//! -//! Key layout: -//! Meta key: [SQLITE_PREFIX, SCHEMA_VERSION, META_PREFIX, file_tag] (4 bytes) -//! Chunk key: [SQLITE_PREFIX, SCHEMA_VERSION, CHUNK_PREFIX, file_tag, chunk_index_u32_be] (8 bytes) - -/// Size of each file chunk stored in KV. -pub const CHUNK_SIZE: usize = 4096; - -/// Top-level SQLite prefix byte. -pub const SQLITE_PREFIX: u8 = 0x08; - -/// Schema version namespace byte after SQLITE_PREFIX. -pub const SQLITE_SCHEMA_VERSION: u8 = 0x01; - -/// Key prefix byte for file metadata (after SQLITE_PREFIX + version). -pub const META_PREFIX: u8 = 0x00; - -/// Key prefix byte for file chunks (after SQLITE_PREFIX + version). -pub const CHUNK_PREFIX: u8 = 0x01; - -/// File kind tag for the actor's main database file. -pub const FILE_TAG_MAIN: u8 = 0x00; - -/// File kind tag for the actor's rollback journal sidecar. -pub const FILE_TAG_JOURNAL: u8 = 0x01; - -/// File kind tag for the actor's WAL sidecar. -pub const FILE_TAG_WAL: u8 = 0x02; - -/// File kind tag for the actor's SHM sidecar. -pub const FILE_TAG_SHM: u8 = 0x03; - -/// Returns the 4-byte metadata key for the given file tag. -/// -/// Format: `[SQLITE_PREFIX, SCHEMA_VERSION, META_PREFIX, file_tag]` -pub fn get_meta_key(file_tag: u8) -> [u8; 4] { - [SQLITE_PREFIX, SQLITE_SCHEMA_VERSION, META_PREFIX, file_tag] -} - -/// Returns the 8-byte chunk key for the given file tag and chunk index. -/// -/// Format: `[SQLITE_PREFIX, SCHEMA_VERSION, CHUNK_PREFIX, file_tag, chunk_index_u32_be]` -/// -/// The chunk index is derived from byte offset as `offset / CHUNK_SIZE`. -pub fn get_chunk_key(file_tag: u8, chunk_index: u32) -> [u8; 8] { - let ci = chunk_index.to_be_bytes(); - [ - SQLITE_PREFIX, - SQLITE_SCHEMA_VERSION, - CHUNK_PREFIX, - file_tag, - ci[0], - ci[1], - ci[2], - ci[3], - ] -} - -/// Maximum file size in bytes before chunk index overflow. -/// -/// Chunk indices are u32, so the maximum addressable byte is -/// (u32::MAX as u64 + 1) * CHUNK_SIZE. Writes or truncates beyond this would -/// wrap the chunk index. -pub const MAX_FILE_SIZE: u64 = (u32::MAX as u64 + 1) * CHUNK_SIZE as u64; - -/// Returns a 4-byte key that is lexicographically just past all chunk keys for -/// the given file tag. Useful as the exclusive end bound for deleteRange. -/// -/// Format: `[SQLITE_PREFIX, SCHEMA_VERSION, CHUNK_PREFIX, file_tag + 1]` -/// -/// This is shorter than a chunk key but lexicographically greater than any -/// 8-byte chunk key with the same file_tag prefix. 
-pub fn get_chunk_key_range_end(file_tag: u8) -> [u8; 4] { - [ - SQLITE_PREFIX, - SQLITE_SCHEMA_VERSION, - CHUNK_PREFIX, - file_tag + 1, - ] -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn constants_match_expected_layout() { - assert_eq!(CHUNK_SIZE, 4096); - assert_eq!(SQLITE_PREFIX, 8); - assert_eq!(SQLITE_SCHEMA_VERSION, 1); - assert_eq!(META_PREFIX, 0); - assert_eq!(CHUNK_PREFIX, 1); - assert_eq!(FILE_TAG_MAIN, 0); - assert_eq!(FILE_TAG_JOURNAL, 1); - assert_eq!(FILE_TAG_WAL, 2); - assert_eq!(FILE_TAG_SHM, 3); - } - - #[test] - fn meta_key_main() { - assert_eq!(get_meta_key(FILE_TAG_MAIN), [0x08, 0x01, 0x00, 0x00]); - } - - #[test] - fn meta_key_journal() { - assert_eq!(get_meta_key(FILE_TAG_JOURNAL), [0x08, 0x01, 0x00, 0x01]); - } - - #[test] - fn meta_key_wal() { - assert_eq!(get_meta_key(FILE_TAG_WAL), [0x08, 0x01, 0x00, 0x02]); - } - - #[test] - fn meta_key_shm() { - assert_eq!(get_meta_key(FILE_TAG_SHM), [0x08, 0x01, 0x00, 0x03]); - } - - #[test] - fn chunk_key_zero_index() { - assert_eq!( - get_chunk_key(FILE_TAG_MAIN, 0), - [0x08, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00] - ); - } - - #[test] - fn chunk_key_index_one() { - assert_eq!( - get_chunk_key(FILE_TAG_MAIN, 1), - [0x08, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01] - ); - } - - #[test] - fn chunk_key_large_index() { - // TypeScript: getChunkKey(FILE_TAG_MAIN, 256) => [8, 1, 1, 0, 0, 0, 1, 0] - assert_eq!( - get_chunk_key(FILE_TAG_MAIN, 256), - [0x08, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00] - ); - } - - #[test] - fn chunk_key_max_index() { - // TypeScript: getChunkKey(FILE_TAG_MAIN, 0xFFFFFFFF) => [8, 1, 1, 0, 255, 255, 255, 255] - assert_eq!( - get_chunk_key(FILE_TAG_MAIN, u32::MAX), - [0x08, 0x01, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF] - ); - } - - #[test] - fn chunk_key_journal_tag() { - assert_eq!( - get_chunk_key(FILE_TAG_JOURNAL, 42), - [0x08, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 42] - ); - } - - #[test] - fn chunk_key_big_endian_encoding() { - // 0x01020304 => bytes [1, 2, 3, 4] - assert_eq!( - get_chunk_key(FILE_TAG_MAIN, 0x01020304), - [0x08, 0x01, 0x01, 0x00, 0x01, 0x02, 0x03, 0x04] - ); - } - - #[test] - fn chunk_key_range_end_main() { - // TypeScript: getChunkKeyRangeEnd(FILE_TAG_MAIN) => [8, 1, 1, 1] - assert_eq!( - get_chunk_key_range_end(FILE_TAG_MAIN), - [0x08, 0x01, 0x01, 0x01] - ); - } - - #[test] - fn chunk_key_range_end_journal() { - // TypeScript: getChunkKeyRangeEnd(FILE_TAG_JOURNAL) => [8, 1, 1, 2] - assert_eq!( - get_chunk_key_range_end(FILE_TAG_JOURNAL), - [0x08, 0x01, 0x01, 0x02] - ); - } - - #[test] - fn range_end_is_past_all_chunk_keys() { - // The range end key must be lexicographically greater than any chunk key for the same tag. - let max_chunk = get_chunk_key(FILE_TAG_MAIN, u32::MAX); - let range_end = get_chunk_key_range_end(FILE_TAG_MAIN); - // Compare as slices. The range end [8,1,1,1] > [8,1,1,0,FF,FF,FF,FF] - // because at byte index 3, 1 > 0. - assert!(range_end.as_slice() > max_chunk.as_slice()); - } -} diff --git a/rivetkit-rust/packages/rivetkit-sqlite/src/lib.rs b/rivetkit-rust/packages/rivetkit-sqlite/src/lib.rs index ee518d9a96..acec7186bb 100644 --- a/rivetkit-rust/packages/rivetkit-sqlite/src/lib.rs +++ b/rivetkit-rust/packages/rivetkit-sqlite/src/lib.rs @@ -1,38 +1,24 @@ //! SQLite library for RivetKit. //! -//! Provides a custom SQLite VFS backed by a transport-agnostic KV trait. -//! Consumers supply a `SqliteKv` implementation and this crate handles -//! VFS registration, database open/close, and chunk-level I/O. +//! 
Provides the native SQLite VFS used by RivetKit actors. //! //! This is a pure Rust library. N-API bindings and transport clients //! live in separate crates that compose this one. //! -//! The KV-backed SQLite implementation used by `rivetkit-napi` is defined in -//! this crate. Keep its storage layout and behavior in sync with the internal -//! SQLite data-channel spec. +//! The SQLite implementation used by `rivetkit-napi` is defined in this crate. +//! Keep its storage layout and behavior in sync with the internal SQLite +//! data-channel spec. //! //! Key invariants: -//! - KV key layout and encoding (see `kv.rs`) -//! - Chunk size (4 KiB) //! - PRAGMA settings -//! - VFS callback-to-KV-operation mapping //! - Delete and truncate behavior //! - Journal and BATCH_ATOMIC behavior -/// KV key layout for the native SQLite VFS. -pub mod kv; - -/// Unified native database handles and envoy-backed KV adapters. +/// Unified native database handles and open helpers. pub mod database; /// SQLite query execution helpers. pub mod query; -/// Transport-agnostic KV trait for the SQLite VFS. -pub mod sqlite_kv; - -/// Custom SQLite VFS that maps VFS callbacks to KV operations via the trait. +/// Custom SQLite VFS for actor-side sqlite-storage transport. pub mod vfs; - -/// Building blocks for the upcoming SQLite v2 actor-side VFS. -pub mod v2; diff --git a/rivetkit-rust/packages/rivetkit-sqlite/src/query.rs b/rivetkit-rust/packages/rivetkit-sqlite/src/query.rs index 899cb5b18e..285eb36a7d 100644 --- a/rivetkit-rust/packages/rivetkit-sqlite/src/query.rs +++ b/rivetkit-rust/packages/rivetkit-sqlite/src/query.rs @@ -1,7 +1,7 @@ use std::ffi::{CStr, CString, c_char}; use std::ptr; -use anyhow::{Result, anyhow, bail}; +use anyhow::{Result, anyhow}; use libsqlite3_sys::{ SQLITE_BLOB, SQLITE_DONE, SQLITE_FLOAT, SQLITE_INTEGER, SQLITE_NULL, SQLITE_OK, SQLITE_ROW, SQLITE_TEXT, SQLITE_TRANSIENT, sqlite3, sqlite3_bind_blob, sqlite3_bind_double, diff --git a/rivetkit-rust/packages/rivetkit-sqlite/src/sqlite_kv.rs b/rivetkit-rust/packages/rivetkit-sqlite/src/sqlite_kv.rs deleted file mode 100644 index 6f7d75059c..0000000000 --- a/rivetkit-rust/packages/rivetkit-sqlite/src/sqlite_kv.rs +++ /dev/null @@ -1,114 +0,0 @@ -//! Transport-agnostic KV trait for the native SQLite VFS. -//! -//! Implementations provide the backing KV storage that the native SQLite VFS -//! reads and writes chunks through. The trait is object-safe and async so it -//! can be implemented over any transport (WebSocket channel, in-process engine, -//! etc.). - -use std::fmt; - -use async_trait::async_trait; - -// MARK: Error - -/// Error type for SqliteKv operations. -#[derive(Debug)] -pub struct SqliteKvError { - message: String, -} - -impl SqliteKvError { - pub fn new(message: impl Into) -> Self { - Self { - message: message.into(), - } - } -} - -impl fmt::Display for SqliteKvError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.message) - } -} - -impl std::error::Error for SqliteKvError {} - -impl From for SqliteKvError { - fn from(message: String) -> Self { - Self { message } - } -} - -impl From<&str> for SqliteKvError { - fn from(message: &str) -> Self { - Self { - message: message.to_string(), - } - } -} - -// MARK: Get result - -/// Result of a batch get operation. -/// -/// `keys` and `values` are parallel lists. Only keys that exist in the store -/// are returned; missing keys are omitted. 
-#[derive(Debug)] -pub struct KvGetResult { - pub keys: Vec>, - pub values: Vec>, -} - -// MARK: Trait - -/// Transport-agnostic KV trait consumed by the native SQLite VFS. -/// -/// All methods receive an `actor_id` to scope operations to a specific actor's -/// KV namespace. Implementations are free to ignore it if scoping is handled -/// at a higher level. -#[async_trait] -pub trait SqliteKv: Send + Sync { - /// Called when a KV operation fails inside a VFS callback before the - /// original error is collapsed into a generic SQLite IO error code. - fn on_error(&self, _actor_id: &str, _error: &SqliteKvError) {} - - /// Called when an actor's database is opened. - async fn on_open(&self, _actor_id: &str) -> Result<(), SqliteKvError> { - Ok(()) - } - - /// Called when an actor's database is closed. - async fn on_close(&self, _actor_id: &str) -> Result<(), SqliteKvError> { - Ok(()) - } - - /// Fetch multiple keys in one batch. - /// - /// Only existing keys are returned in the result. Missing keys are omitted. - async fn batch_get( - &self, - actor_id: &str, - keys: Vec>, - ) -> Result; - - /// Write multiple key-value pairs in one batch. - /// - /// `keys` and `values` must have the same length. - async fn batch_put( - &self, - actor_id: &str, - keys: Vec>, - values: Vec>, - ) -> Result<(), SqliteKvError>; - - /// Delete multiple keys in one batch. - async fn batch_delete(&self, actor_id: &str, keys: Vec>) -> Result<(), SqliteKvError>; - - /// Delete all keys in the half-open range `[start, end)`. - async fn delete_range( - &self, - actor_id: &str, - start: Vec, - end: Vec, - ) -> Result<(), SqliteKvError>; -} diff --git a/rivetkit-rust/packages/rivetkit-sqlite/src/v2/mod.rs b/rivetkit-rust/packages/rivetkit-sqlite/src/v2/mod.rs deleted file mode 100644 index 10d397e7d2..0000000000 --- a/rivetkit-rust/packages/rivetkit-sqlite/src/v2/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod vfs; diff --git a/rivetkit-rust/packages/rivetkit-sqlite/src/v2/vfs.rs b/rivetkit-rust/packages/rivetkit-sqlite/src/v2/vfs.rs deleted file mode 100644 index 687073c32e..0000000000 --- a/rivetkit-rust/packages/rivetkit-sqlite/src/v2/vfs.rs +++ /dev/null @@ -1,5077 +0,0 @@ -use std::collections::{BTreeMap, HashMap, HashSet}; -use std::ffi::{CStr, CString, c_char, c_int, c_void}; -use std::ptr; -use std::slice; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::Instant; - -use anyhow::Result; -use libsqlite3_sys::*; -use moka::sync::Cache; -use parking_lot::{Mutex, RwLock}; -use rivet_envoy_client::handle::EnvoyHandle; -use rivet_envoy_protocol as protocol; -use sqlite_storage::ltx::{LtxHeader, encode_ltx_v3}; -#[cfg(test)] -use sqlite_storage::{engine::SqliteEngine, error::SqliteStorageError}; -use tokio::runtime::Handle; -#[cfg(test)] -use tokio::sync::Notify; - -const DEFAULT_CACHE_CAPACITY_PAGES: u64 = 50_000; -const DEFAULT_PREFETCH_DEPTH: usize = 16; -const DEFAULT_MAX_PREFETCH_BYTES: usize = 256 * 1024; -const DEFAULT_MAX_PAGES_PER_STAGE: usize = 4_000; -const DEFAULT_PAGE_SIZE: usize = 4096; -const MAX_PATHNAME: c_int = 64; -const TEMP_AUX_PATH_PREFIX: &str = "__sqlite_v2_temp__"; -const EMPTY_DB_PAGE_HEADER_PREFIX: [u8; 108] = [ - 83, 81, 76, 105, 116, 101, 32, 102, 111, 114, 109, 97, 116, 32, 51, 0, 16, 0, 1, 1, 0, 64, 32, - 32, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 46, 138, 17, 13, 0, 0, 0, 0, 
16, 0, 0, -]; - -#[cfg(test)] -static NEXT_STAGE_ID: AtomicU64 = AtomicU64::new(1); -static NEXT_TEMP_AUX_ID: AtomicU64 = AtomicU64::new(1); - -unsafe extern "C" { - fn sqlite3_close_v2(db: *mut sqlite3) -> c_int; -} - -fn empty_db_page() -> Vec { - let mut page = vec![0u8; DEFAULT_PAGE_SIZE]; - page[..EMPTY_DB_PAGE_HEADER_PREFIX.len()].copy_from_slice(&EMPTY_DB_PAGE_HEADER_PREFIX); - page -} - -fn panic_message(payload: &Box) -> String { - if let Some(message) = payload.downcast_ref::<&str>() { - message.to_string() - } else if let Some(message) = payload.downcast_ref::() { - message.clone() - } else { - "unknown panic".to_string() - } -} - -macro_rules! vfs_catch_unwind { - ($err_val:expr, $body:expr) => { - match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| $body)) { - Ok(result) => result, - Err(panic) => { - tracing::error!( - message = panic_message(&panic), - "sqlite v2 callback panicked" - ); - $err_val - } - } - }; -} - -#[derive(Clone)] -struct SqliteTransport { - inner: Arc, -} - -enum SqliteTransportInner { - Envoy(EnvoyHandle), - #[cfg(test)] - Direct { - engine: Arc, - hooks: Arc, - }, - #[cfg(test)] - Test(Arc), -} - -impl SqliteTransport { - fn from_envoy(handle: EnvoyHandle) -> Self { - Self { - inner: Arc::new(SqliteTransportInner::Envoy(handle)), - } - } - - #[cfg(test)] - fn from_direct(engine: Arc) -> Self { - Self { - inner: Arc::new(SqliteTransportInner::Direct { - engine, - hooks: Arc::new(DirectTransportHooks::default()), - }), - } - } - - #[cfg(test)] - fn from_mock(protocol: Arc) -> Self { - Self { - inner: Arc::new(SqliteTransportInner::Test(protocol)), - } - } - - #[cfg(test)] - fn direct_hooks(&self) -> Option> { - match &*self.inner { - SqliteTransportInner::Direct { hooks, .. } => Some(Arc::clone(hooks)), - _ => None, - } - } - - async fn get_pages( - &self, - req: protocol::SqliteGetPagesRequest, - ) -> Result { - match &*self.inner { - SqliteTransportInner::Envoy(handle) => handle.sqlite_get_pages(req).await, - #[cfg(test)] - SqliteTransportInner::Direct { engine, .. 
} => { - let pgnos = req.pgnos.clone(); - match engine.get_pages(&req.actor_id, req.generation, pgnos).await { - Ok(pages) => Ok(protocol::SqliteGetPagesResponse::SqliteGetPagesOk( - protocol::SqliteGetPagesOk { - pages: pages.into_iter().map(protocol_fetched_page).collect(), - meta: protocol_sqlite_meta(engine.load_meta(&req.actor_id).await?), - }, - )), - Err(err) => { - if let Some(SqliteStorageError::FenceMismatch { reason }) = - sqlite_storage_error(&err) - { - Ok(protocol::SqliteGetPagesResponse::SqliteFenceMismatch( - protocol::SqliteFenceMismatch { - actual_meta: protocol_sqlite_meta( - engine.load_meta(&req.actor_id).await?, - ), - reason: reason.clone(), - }, - )) - } else if matches!( - sqlite_storage_error(&err), - Some(SqliteStorageError::MetaMissing { operation }) - if *operation == "get_pages" && req.generation == 1 - ) { - match engine - .takeover( - &req.actor_id, - sqlite_storage::takeover::TakeoverConfig::new(1), - ) - .await - { - Ok(_) => {} - Err(takeover_err) - if matches!( - sqlite_storage_error(&takeover_err), - Some(SqliteStorageError::ConcurrentTakeover) - ) => {} - Err(takeover_err) => { - return Ok( - protocol::SqliteGetPagesResponse::SqliteErrorResponse( - sqlite_error_response(&takeover_err), - ), - ); - } - } - - match engine - .get_pages(&req.actor_id, req.generation, req.pgnos) - .await - { - Ok(pages) => { - Ok(protocol::SqliteGetPagesResponse::SqliteGetPagesOk( - protocol::SqliteGetPagesOk { - pages: pages - .into_iter() - .map(protocol_fetched_page) - .collect(), - meta: protocol_sqlite_meta( - engine.load_meta(&req.actor_id).await?, - ), - }, - )) - } - Err(retry_err) => { - Ok(protocol::SqliteGetPagesResponse::SqliteErrorResponse( - sqlite_error_response(&retry_err), - )) - } - } - } else { - Ok(protocol::SqliteGetPagesResponse::SqliteErrorResponse( - sqlite_error_response(&err), - )) - } - } - } - } - #[cfg(test)] - SqliteTransportInner::Test(protocol) => protocol.get_pages(req).await, - } - } - - async fn commit( - &self, - req: protocol::SqliteCommitRequest, - ) -> Result { - match &*self.inner { - SqliteTransportInner::Envoy(handle) => handle.sqlite_commit(req).await, - #[cfg(test)] - SqliteTransportInner::Direct { engine, hooks } => { - if let Some(message) = hooks.take_commit_error() { - return Err(anyhow::anyhow!(message)); - } - - match engine - .commit( - &req.actor_id, - sqlite_storage::commit::CommitRequest { - generation: req.generation, - head_txid: req.expected_head_txid, - db_size_pages: req.new_db_size_pages, - dirty_pages: req - .dirty_pages - .into_iter() - .map(storage_dirty_page) - .collect(), - now_ms: sqlite_now_ms()?, - }, - ) - .await - { - Ok(result) => Ok(protocol::SqliteCommitResponse::SqliteCommitOk( - protocol::SqliteCommitOk { - new_head_txid: result.txid, - meta: protocol_sqlite_meta(result.meta), - }, - )), - Err(err) => { - if let Some(SqliteStorageError::FenceMismatch { reason }) = - sqlite_storage_error(&err) - { - Ok(protocol::SqliteCommitResponse::SqliteFenceMismatch( - protocol::SqliteFenceMismatch { - actual_meta: protocol_sqlite_meta( - engine.load_meta(&req.actor_id).await?, - ), - reason: reason.clone(), - }, - )) - } else if let Some(SqliteStorageError::CommitTooLarge { - actual_size_bytes, - max_size_bytes, - }) = sqlite_storage_error(&err) - { - Ok(protocol::SqliteCommitResponse::SqliteCommitTooLarge( - protocol::SqliteCommitTooLarge { - actual_size_bytes: *actual_size_bytes, - max_size_bytes: *max_size_bytes, - }, - )) - } else { - Ok(protocol::SqliteCommitResponse::SqliteErrorResponse( - 
sqlite_error_response(&err), - )) - } - } - } - } - #[cfg(test)] - SqliteTransportInner::Test(protocol) => protocol.commit(req).await, - } - } - - async fn commit_stage_begin( - &self, - req: protocol::SqliteCommitStageBeginRequest, - ) -> Result { - match &*self.inner { - SqliteTransportInner::Envoy(handle) => handle.sqlite_commit_stage_begin(req).await, - #[cfg(test)] - SqliteTransportInner::Direct { engine, .. } => { - match engine - .commit_stage_begin( - &req.actor_id, - sqlite_storage::commit::CommitStageBeginRequest { - generation: req.generation, - }, - ) - .await - { - Ok(result) => Ok( - protocol::SqliteCommitStageBeginResponse::SqliteCommitStageBeginOk( - protocol::SqliteCommitStageBeginOk { txid: result.txid }, - ), - ), - Err(err) => { - if let Some(SqliteStorageError::FenceMismatch { reason }) = - sqlite_storage_error(&err) - { - Ok( - protocol::SqliteCommitStageBeginResponse::SqliteFenceMismatch( - protocol::SqliteFenceMismatch { - actual_meta: protocol_sqlite_meta( - engine.load_meta(&req.actor_id).await?, - ), - reason: reason.clone(), - }, - ), - ) - } else { - Ok( - protocol::SqliteCommitStageBeginResponse::SqliteErrorResponse( - sqlite_error_response(&err), - ), - ) - } - } - } - } - #[cfg(test)] - SqliteTransportInner::Test(protocol) => protocol.commit_stage_begin(req).await, - } - } - - async fn commit_stage( - &self, - req: protocol::SqliteCommitStageRequest, - ) -> Result { - match &*self.inner { - SqliteTransportInner::Envoy(handle) => handle.sqlite_commit_stage(req).await, - #[cfg(test)] - SqliteTransportInner::Direct { engine, .. } => { - match engine - .commit_stage( - &req.actor_id, - sqlite_storage::commit::CommitStageRequest { - generation: req.generation, - txid: req.txid, - chunk_idx: req.chunk_idx, - bytes: req.bytes, - is_last: req.is_last, - }, - ) - .await - { - Ok(result) => Ok(protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: result.chunk_idx_committed, - }, - )), - Err(err) => { - if let Some(SqliteStorageError::FenceMismatch { reason }) = - sqlite_storage_error(&err) - { - Ok(protocol::SqliteCommitStageResponse::SqliteFenceMismatch( - protocol::SqliteFenceMismatch { - actual_meta: protocol_sqlite_meta( - engine.load_meta(&req.actor_id).await?, - ), - reason: reason.clone(), - }, - )) - } else { - Ok(protocol::SqliteCommitStageResponse::SqliteErrorResponse( - sqlite_error_response(&err), - )) - } - } - } - } - #[cfg(test)] - SqliteTransportInner::Test(protocol) => protocol.commit_stage(req).await, - } - } - - fn queue_commit_stage(&self, req: protocol::SqliteCommitStageRequest) -> Result { - match &*self.inner { - SqliteTransportInner::Envoy(handle) => { - handle.sqlite_commit_stage_fire_and_forget(req)?; - Ok(true) - } - #[cfg(test)] - SqliteTransportInner::Direct { .. } => Ok(false), - #[cfg(test)] - SqliteTransportInner::Test(protocol) => { - protocol.queue_commit_stage(req); - Ok(true) - } - } - } - - async fn commit_finalize( - &self, - req: protocol::SqliteCommitFinalizeRequest, - ) -> Result { - match &*self.inner { - SqliteTransportInner::Envoy(handle) => handle.sqlite_commit_finalize(req).await, - #[cfg(test)] - SqliteTransportInner::Direct { engine, .. 
} => { - match engine - .commit_finalize( - &req.actor_id, - sqlite_storage::commit::CommitFinalizeRequest { - generation: req.generation, - expected_head_txid: req.expected_head_txid, - txid: req.txid, - new_db_size_pages: req.new_db_size_pages, - now_ms: sqlite_now_ms()?, - }, - ) - .await - { - Ok(result) => Ok( - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: result.new_head_txid, - meta: protocol_sqlite_meta(result.meta), - }, - ), - ), - Err(err) => { - if let Some(SqliteStorageError::FenceMismatch { reason }) = - sqlite_storage_error(&err) - { - Ok(protocol::SqliteCommitFinalizeResponse::SqliteFenceMismatch( - protocol::SqliteFenceMismatch { - actual_meta: protocol_sqlite_meta( - engine.load_meta(&req.actor_id).await?, - ), - reason: reason.clone(), - }, - )) - } else if let Some(SqliteStorageError::StageNotFound { stage_id }) = - sqlite_storage_error(&err) - { - Ok(protocol::SqliteCommitFinalizeResponse::SqliteStageNotFound( - protocol::SqliteStageNotFound { - stage_id: *stage_id, - }, - )) - } else { - Ok(protocol::SqliteCommitFinalizeResponse::SqliteErrorResponse( - sqlite_error_response(&err), - )) - } - } - } - } - #[cfg(test)] - SqliteTransportInner::Test(protocol) => protocol.commit_finalize(req).await, - } - } -} - -#[cfg(test)] -#[derive(Default)] -struct DirectTransportHooks { - fail_next_commit: Mutex>, -} - -#[cfg(test)] -impl DirectTransportHooks { - fn fail_next_commit(&self, message: impl Into) { - *self.fail_next_commit.lock() = Some(message.into()); - } - - fn take_commit_error(&self) -> Option { - self.fail_next_commit.lock().take() - } -} - -#[cfg(test)] -fn protocol_sqlite_meta(meta: sqlite_storage::types::SqliteMeta) -> protocol::SqliteMeta { - protocol::SqliteMeta { - schema_version: meta.schema_version, - generation: meta.generation, - head_txid: meta.head_txid, - materialized_txid: meta.materialized_txid, - db_size_pages: meta.db_size_pages, - page_size: meta.page_size, - creation_ts_ms: meta.creation_ts_ms, - max_delta_bytes: meta.max_delta_bytes, - } -} - -#[cfg(test)] -fn protocol_fetched_page(page: sqlite_storage::types::FetchedPage) -> protocol::SqliteFetchedPage { - protocol::SqliteFetchedPage { - pgno: page.pgno, - bytes: page.bytes, - } -} - -#[cfg(test)] -fn storage_dirty_page(page: protocol::SqliteDirtyPage) -> sqlite_storage::types::DirtyPage { - sqlite_storage::types::DirtyPage { - pgno: page.pgno, - bytes: page.bytes, - } -} - -#[cfg(test)] -fn sqlite_storage_error(err: &anyhow::Error) -> Option<&SqliteStorageError> { - err.downcast_ref::() -} - -#[cfg(test)] -fn sqlite_error_reason(err: &anyhow::Error) -> String { - err.chain() - .map(ToString::to_string) - .collect::>() - .join(": ") -} - -#[cfg(test)] -fn sqlite_error_response(err: &anyhow::Error) -> protocol::SqliteErrorResponse { - protocol::SqliteErrorResponse { - message: sqlite_error_reason(err), - } -} - -fn sqlite_now_ms() -> Result { - use std::time::{SystemTime, UNIX_EPOCH}; - - Ok(SystemTime::now() - .duration_since(UNIX_EPOCH)? - .as_millis() - .try_into()?) 
-} - -#[cfg(test)] -struct MockProtocol { - commit_response: protocol::SqliteCommitResponse, - stage_response: protocol::SqliteCommitStageResponse, - finalize_response: protocol::SqliteCommitFinalizeResponse, - get_pages_response: protocol::SqliteGetPagesResponse, - mirror_commit_meta: Mutex, - commit_requests: Mutex>, - stage_requests: Mutex>, - awaited_stage_responses: Mutex, - finalize_requests: Mutex>, - get_pages_requests: Mutex>, - finalize_started: Notify, - release_finalize: Notify, -} - -#[cfg(test)] -impl MockProtocol { - fn new( - commit_response: protocol::SqliteCommitResponse, - stage_response: protocol::SqliteCommitStageResponse, - finalize_response: protocol::SqliteCommitFinalizeResponse, - ) -> Self { - Self { - commit_response, - stage_response, - finalize_response, - get_pages_response: protocol::SqliteGetPagesResponse::SqliteGetPagesOk( - protocol::SqliteGetPagesOk { - pages: vec![], - meta: sqlite_meta(8 * 1024 * 1024), - }, - ), - mirror_commit_meta: Mutex::new(false), - commit_requests: Mutex::new(Vec::new()), - stage_requests: Mutex::new(Vec::new()), - awaited_stage_responses: Mutex::new(0), - finalize_requests: Mutex::new(Vec::new()), - get_pages_requests: Mutex::new(Vec::new()), - finalize_started: Notify::new(), - release_finalize: Notify::new(), - } - } - - fn commit_requests(&self) -> parking_lot::MutexGuard<'_, Vec> { - self.commit_requests.lock() - } - - fn stage_requests( - &self, - ) -> parking_lot::MutexGuard<'_, Vec> { - self.stage_requests.lock() - } - - fn awaited_stage_responses(&self) -> usize { - *self.awaited_stage_responses.lock() - } - - fn finalize_requests( - &self, - ) -> parking_lot::MutexGuard<'_, Vec> { - self.finalize_requests.lock() - } - - fn get_pages_requests( - &self, - ) -> parking_lot::MutexGuard<'_, Vec> { - self.get_pages_requests.lock() - } - - fn set_mirror_commit_meta(&self, enabled: bool) { - *self.mirror_commit_meta.lock() = enabled; - } - - fn queue_commit_stage(&self, req: protocol::SqliteCommitStageRequest) { - self.stage_requests().push(req); - } - - async fn get_pages( - &self, - req: protocol::SqliteGetPagesRequest, - ) -> Result { - self.get_pages_requests().push(req); - Ok(self.get_pages_response.clone()) - } - - async fn commit( - &self, - req: protocol::SqliteCommitRequest, - ) -> Result { - let req = req.clone(); - self.commit_requests().push(req.clone()); - if *self.mirror_commit_meta.lock() { - if let protocol::SqliteCommitResponse::SqliteCommitOk(ok) = &self.commit_response { - let mut meta = ok.meta.clone(); - meta.head_txid = req.expected_head_txid + 1; - meta.db_size_pages = req.new_db_size_pages; - return Ok(protocol::SqliteCommitResponse::SqliteCommitOk( - protocol::SqliteCommitOk { - new_head_txid: req.expected_head_txid + 1, - meta, - }, - )); - } - } - Ok(self.commit_response.clone()) - } - - async fn commit_stage_begin( - &self, - _req: protocol::SqliteCommitStageBeginRequest, - ) -> Result { - Ok( - protocol::SqliteCommitStageBeginResponse::SqliteCommitStageBeginOk( - protocol::SqliteCommitStageBeginOk { - txid: next_stage_id(), - }, - ), - ) - } - - async fn commit_stage( - &self, - req: protocol::SqliteCommitStageRequest, - ) -> Result { - *self.awaited_stage_responses.lock() += 1; - self.stage_requests().push(req); - Ok(self.stage_response.clone()) - } - - async fn commit_finalize( - &self, - req: protocol::SqliteCommitFinalizeRequest, - ) -> Result { - let req = req.clone(); - self.finalize_requests().push(req.clone()); - self.finalize_started.notify_one(); - self.release_finalize.notified().await; - 
- if *self.mirror_commit_meta.lock() {
- if let protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk(ok) =
- &self.finalize_response
- {
- let mut meta = ok.meta.clone();
- meta.head_txid = req.expected_head_txid + 1;
- meta.db_size_pages = req.new_db_size_pages;
- return Ok(
- protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk(
- protocol::SqliteCommitFinalizeOk {
- new_head_txid: req.expected_head_txid + 1,
- meta,
- },
- ),
- );
- }
- }
- Ok(self.finalize_response.clone())
- }
-}
-
-#[cfg(test)]
-fn sqlite_meta(max_delta_bytes: u64) -> protocol::SqliteMeta {
- protocol::SqliteMeta {
- schema_version: 2,
- generation: 7,
- head_txid: 12,
- materialized_txid: 12,
- db_size_pages: 1,
- page_size: 4096,
- creation_ts_ms: 1_700_000_000_000,
- max_delta_bytes,
- }
-}
-
-#[derive(Debug, Clone)]
-pub struct VfsV2Config {
- pub cache_capacity_pages: u64,
- pub prefetch_depth: usize,
- pub max_prefetch_bytes: usize,
- pub max_pages_per_stage: usize,
-}
-
-impl Default for VfsV2Config {
- fn default() -> Self {
- Self {
- cache_capacity_pages: DEFAULT_CACHE_CAPACITY_PAGES,
- prefetch_depth: DEFAULT_PREFETCH_DEPTH,
- max_prefetch_bytes: DEFAULT_MAX_PREFETCH_BYTES,
- max_pages_per_stage: DEFAULT_MAX_PAGES_PER_STAGE,
- }
- }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum CommitPath {
- Fast,
- Slow,
-}
-
-#[derive(Debug, Clone)]
-pub struct BufferedCommitRequest {
- pub actor_id: String,
- pub generation: u64,
- pub expected_head_txid: u64,
- pub new_db_size_pages: u32,
- pub max_delta_bytes: u64,
- pub max_pages_per_stage: usize,
- pub dirty_pages: Vec<protocol::SqliteDirtyPage>,
-}
-
-#[derive(Debug, Clone)]
-pub struct BufferedCommitOutcome {
- pub path: CommitPath,
- pub new_head_txid: u64,
- pub meta: protocol::SqliteMeta,
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum CommitBufferError {
- FenceMismatch(String),
- StageNotFound(u64),
- Other(String),
-}
-
-#[derive(Debug, Clone, Copy, Default)]
-pub struct SqliteVfsMetricsSnapshot {
- pub request_build_ns: u64,
- pub serialize_ns: u64,
- pub transport_ns: u64,
- pub state_update_ns: u64,
- pub total_ns: u64,
- pub commit_count: u64,
-}
-
-#[derive(Debug, Clone, Copy, Default)]
-struct CommitTransportMetrics {
- serialize_ns: u64,
- transport_ns: u64,
-}
-
-pub struct VfsV2Context {
- actor_id: String,
- runtime: Handle,
- transport: SqliteTransport,
- config: VfsV2Config,
- state: RwLock<VfsV2State>,
- aux_files: RwLock<BTreeMap<String, Arc<AuxFileState>>>,
- last_error: Mutex<Option<String>>,
- commit_atomic_count: AtomicU64,
- io_methods: Box<sqlite3_io_methods>,
- // Performance counters
- pub resolve_pages_total: AtomicU64,
- pub resolve_pages_cache_hits: AtomicU64,
- pub resolve_pages_fetches: AtomicU64,
- pub pages_fetched_total: AtomicU64,
- pub prefetch_pages_total: AtomicU64,
- pub commit_total: AtomicU64,
- pub commit_request_build_ns: AtomicU64,
- pub commit_serialize_ns: AtomicU64,
- pub commit_transport_ns: AtomicU64,
- pub commit_state_update_ns: AtomicU64,
- pub commit_duration_ns_total: AtomicU64,
-}
-
-#[derive(Debug, Clone)]
-struct VfsV2State {
- generation: u64,
- head_txid: u64,
- db_size_pages: u32,
- page_size: usize,
- max_delta_bytes: u64,
- page_cache: Cache<u32, Vec<u8>>,
- write_buffer: WriteBuffer,
- predictor: PrefetchPredictor,
- dead: bool,
-}
-
-#[derive(Debug, Clone, Default)]
-struct WriteBuffer {
- in_atomic_write: bool,
- saved_db_size: u32,
- dirty: BTreeMap<u32, Vec<u8>>,
-}
-
-#[derive(Debug, Clone, Default)]
-struct PrefetchPredictor {
- last_pgno: Option<u32>,
- last_delta: Option<i64>,
- stride_run_len: usize,
- // Inspired by mvSQLite's Markov + stride predictor design (Apache-2.0).
- transitions: HashMap<i64, HashMap<i64, u64>>,
-}
-
-#[derive(Debug)]
-enum GetPagesError {
- FenceMismatch(String),
- Other(String),
-}
-
-#[repr(C)]
-struct VfsV2File {
- base: sqlite3_file,
- ctx: *const VfsV2Context,
- aux: *mut AuxFileHandle,
-}
-
-#[derive(Default)]
-struct AuxFileState {
- bytes: Mutex<Vec<u8>>,
-}
-
-struct AuxFileHandle {
- path: String,
- state: Arc<AuxFileState>,
- delete_on_close: bool,
-}
-
-unsafe impl Send for VfsV2Context {}
-unsafe impl Sync for VfsV2Context {}
-
-pub struct SqliteVfsV2 {
- vfs_ptr: *mut sqlite3_vfs,
- _name: CString,
- ctx_ptr: *mut VfsV2Context,
-}
-
-unsafe impl Send for SqliteVfsV2 {}
-unsafe impl Sync for SqliteVfsV2 {}
-
-pub struct NativeDatabaseV2 {
- db: *mut sqlite3,
- _vfs: SqliteVfsV2,
-}
-
-unsafe impl Send for NativeDatabaseV2 {}
-
-impl PrefetchPredictor {
- fn record(&mut self, pgno: u32) {
- if let Some(last_pgno) = self.last_pgno {
- let delta = pgno as i64 - last_pgno as i64;
- if let Some(last_delta) = self.last_delta {
- self.transitions
- .entry(last_delta)
- .or_default()
- .entry(delta)
- .and_modify(|count| *count += 1)
- .or_insert(1);
- if delta == last_delta {
- self.stride_run_len += 1;
- } else {
- self.stride_run_len = 1;
- }
- } else {
- self.stride_run_len = 1;
- }
- self.last_delta = Some(delta);
- }
- self.last_pgno = Some(pgno);
- }
-
- fn multi_predict(&self, from_pgno: u32, depth: usize, db_size_pages: u32) -> Vec<u32> {
- if depth == 0 || db_size_pages == 0 {
- return Vec::new();
- }
-
- let mut seen = HashSet::new();
- let mut predicted = Vec::with_capacity(depth);
-
- if let Some(delta) = self.last_delta {
- if self.stride_run_len >= 2 && delta > 0 {
- let mut current = from_pgno as i64;
- for _ in 0..depth {
- current += delta;
- if !(1..=db_size_pages as i64).contains(&current) {
- break;
- }
- let pgno = current as u32;
- if seen.insert(pgno) {
- predicted.push(pgno);
- }
- }
- if predicted.len() >= depth {
- return predicted;
- }
- }
-
- let mut current_delta = delta;
- let mut current_pgno = from_pgno as i64;
- for _ in predicted.len()..depth {
- let Some(next_delta) = self
- .transitions
- .get(&current_delta)
- .and_then(|counts| counts.iter().max_by_key(|(_, count)| *count))
- .map(|(delta, _)| *delta)
- else {
- break;
- };
-
- current_pgno += next_delta;
- if !(1..=db_size_pages as i64).contains(&current_pgno) {
- break;
- }
- let pgno = current_pgno as u32;
- if seen.insert(pgno) {
- predicted.push(pgno);
- }
- current_delta = next_delta;
- }
- }
-
- predicted
- }
-}
-
-impl VfsV2State {
- fn new(config: &VfsV2Config, startup: &protocol::SqliteStartupData) -> Self {
- let page_cache = Cache::builder()
- .max_capacity(config.cache_capacity_pages)
- .build();
- for page in &startup.preloaded_pages {
- if let Some(bytes) = &page.bytes {
- page_cache.insert(page.pgno, bytes.clone());
- }
- }
-
- let mut state = Self {
- generation: startup.generation,
- head_txid: startup.meta.head_txid,
- db_size_pages: startup.meta.db_size_pages,
- page_size: startup.meta.page_size as usize,
- max_delta_bytes: startup.meta.max_delta_bytes,
- page_cache,
- write_buffer: WriteBuffer::default(),
- predictor: PrefetchPredictor::default(),
- dead: false,
- };
- if state.db_size_pages == 0 && !state.page_cache.contains_key(&1) {
- state.page_cache.insert(1, empty_db_page());
- state.db_size_pages = 1;
- }
- state
- }
-
- fn update_meta(&mut self, meta: &protocol::SqliteMeta) {
- self.generation = meta.generation;
- self.head_txid = meta.head_txid;
- self.db_size_pages = meta.db_size_pages;
- self.page_size = meta.page_size as usize;
- self.max_delta_bytes = meta.max_delta_bytes;
- }
-
- fn update_read_meta(&mut self, meta: &protocol::SqliteMeta) {
- self.max_delta_bytes = meta.max_delta_bytes;
- }
-}
-
-impl VfsV2Context {
- fn new(
- actor_id: String,
- runtime: Handle,
- transport: SqliteTransport,
- startup: protocol::SqliteStartupData,
- config: VfsV2Config,
- io_methods: sqlite3_io_methods,
- ) -> Self {
- Self {
- actor_id,
- runtime,
- transport,
- config: config.clone(),
- state: RwLock::new(VfsV2State::new(&config, &startup)),
- aux_files: RwLock::new(BTreeMap::new()),
- last_error: Mutex::new(None),
- commit_atomic_count: AtomicU64::new(0),
- io_methods: Box::new(io_methods),
- resolve_pages_total: AtomicU64::new(0),
- resolve_pages_cache_hits: AtomicU64::new(0),
- resolve_pages_fetches: AtomicU64::new(0),
- pages_fetched_total: AtomicU64::new(0),
- prefetch_pages_total: AtomicU64::new(0),
- commit_total: AtomicU64::new(0),
- commit_request_build_ns: AtomicU64::new(0),
- commit_serialize_ns: AtomicU64::new(0),
- commit_transport_ns: AtomicU64::new(0),
- commit_state_update_ns: AtomicU64::new(0),
- commit_duration_ns_total: AtomicU64::new(0),
- }
- }
-
- fn clear_last_error(&self) {
- *self.last_error.lock() = None;
- }
-
- fn set_last_error(&self, message: String) {
- *self.last_error.lock() = Some(message);
- }
-
- fn clone_last_error(&self) -> Option<String> {
- self.last_error.lock().clone()
- }
-
- fn take_last_error(&self) -> Option<String> {
- self.last_error.lock().take()
- }
-
- fn add_commit_phase_metrics(
- &self,
- request_build_ns: u64,
- transport_metrics: CommitTransportMetrics,
- state_update_ns: u64,
- total_ns: u64,
- ) {
- self.commit_request_build_ns
- .fetch_add(request_build_ns, Ordering::Relaxed);
- self.commit_serialize_ns
- .fetch_add(transport_metrics.serialize_ns, Ordering::Relaxed);
- self.commit_transport_ns
- .fetch_add(transport_metrics.transport_ns, Ordering::Relaxed);
- self.commit_state_update_ns
- .fetch_add(state_update_ns, Ordering::Relaxed);
- self.commit_duration_ns_total
- .fetch_add(total_ns, Ordering::Relaxed);
- }
-
- fn sqlite_vfs_metrics(&self) -> SqliteVfsMetricsSnapshot {
- SqliteVfsMetricsSnapshot {
- request_build_ns: self.commit_request_build_ns.load(Ordering::Relaxed),
- serialize_ns: self.commit_serialize_ns.load(Ordering::Relaxed),
- transport_ns: self.commit_transport_ns.load(Ordering::Relaxed),
- state_update_ns: self.commit_state_update_ns.load(Ordering::Relaxed),
- total_ns: self.commit_duration_ns_total.load(Ordering::Relaxed),
- commit_count: self.commit_total.load(Ordering::Relaxed),
- }
- }
-
- fn page_size(&self) -> usize {
- self.state.read().page_size.max(DEFAULT_PAGE_SIZE)
- }
-
- fn open_aux_file(&self, path: &str) -> Arc<AuxFileState> {
- if let Some(state) = self.aux_files.read().get(path) {
- return state.clone();
- }
-
- let mut aux_files = self.aux_files.write();
- aux_files
- .entry(path.to_string())
- .or_insert_with(|| Arc::new(AuxFileState::default()))
- .clone()
- }
-
- fn aux_file_exists(&self, path: &str) -> bool {
- self.aux_files.read().contains_key(path)
- }
-
- fn delete_aux_file(&self, path: &str) {
- self.aux_files.write().remove(path);
- }
-
- fn is_dead(&self) -> bool {
- self.state.read().dead
- }
-
- fn mark_dead(&self, message: String) {
- self.set_last_error(message);
- self.state.write().dead = true;
- }
-
- fn resolve_pages(
- &self,
- target_pgnos: &[u32],
- prefetch: bool,
- ) -> std::result::Result<HashMap<u32, Option<Vec<u8>>>, GetPagesError> {
- use std::sync::atomic::Ordering::Relaxed;
- self.resolve_pages_total.fetch_add(1, Relaxed);
-
- let mut resolved = HashMap::new();
- let mut missing = Vec::new();
- let mut seen = HashSet::new();
-
- {
- let state = self.state.read();
- if state.dead {
- return Err(GetPagesError::Other(
- "sqlite v2 actor lost its fence".to_string(),
- ));
- }
-
- for pgno in target_pgnos.iter().copied() {
- if !seen.insert(pgno) {
- continue;
- }
- if let Some(bytes) = state.write_buffer.dirty.get(&pgno) {
- resolved.insert(pgno, Some(bytes.clone()));
- continue;
- }
- if let Some(bytes) = state.page_cache.get(&pgno) {
- resolved.insert(pgno, Some(bytes));
- continue;
- }
- missing.push(pgno);
- }
- }
-
- if missing.is_empty() {
- self.resolve_pages_cache_hits
- .fetch_add(target_pgnos.len() as u64, Relaxed);
- return Ok(resolved);
- }
- self.resolve_pages_cache_hits
- .fetch_add((seen.len() - missing.len()) as u64, Relaxed);
-
- let (generation, to_fetch) = {
- let mut state = self.state.write();
- for pgno in target_pgnos.iter().copied() {
- state.predictor.record(pgno);
- }
-
- let mut to_fetch = missing.clone();
- if prefetch {
- let page_budget = (self.config.max_prefetch_bytes / state.page_size.max(1)).max(1);
- let prediction_budget = page_budget.saturating_sub(to_fetch.len());
- let seed_pgno = target_pgnos.last().copied().unwrap_or_default();
- for predicted in state.predictor.multi_predict(
- seed_pgno,
- prediction_budget.min(self.config.prefetch_depth),
- state.db_size_pages.max(seed_pgno),
- ) {
- if resolved.contains_key(&predicted) || to_fetch.contains(&predicted) {
- continue;
- }
- to_fetch.push(predicted);
- }
- }
- (state.generation, to_fetch)
- };
-
- {
- let prefetch_count = to_fetch.len() - missing.len();
- self.resolve_pages_fetches.fetch_add(1, Relaxed);
- self.pages_fetched_total
- .fetch_add(to_fetch.len() as u64, Relaxed);
- self.prefetch_pages_total
- .fetch_add(prefetch_count as u64, Relaxed);
- tracing::debug!(
- missing = missing.len(),
- prefetch = prefetch_count,
- total_fetch = to_fetch.len(),
- "vfs get_pages fetch"
- );
- }
-
- let response = self
- .runtime
- .block_on(self.transport.get_pages(protocol::SqliteGetPagesRequest {
- actor_id: self.actor_id.clone(),
- generation,
- pgnos: to_fetch.clone(),
- }))
- .map_err(|err| GetPagesError::Other(err.to_string()))?;
-
- match response {
- protocol::SqliteGetPagesResponse::SqliteFenceMismatch(mismatch) => {
- Err(GetPagesError::FenceMismatch(mismatch.reason))
- }
- protocol::SqliteGetPagesResponse::SqliteGetPagesOk(ok) => {
- let mut state = self.state.write();
- state.update_read_meta(&ok.meta);
- for fetched in ok.pages {
- if let Some(bytes) = &fetched.bytes {
- state.page_cache.insert(fetched.pgno, bytes.clone());
- }
- resolved.insert(fetched.pgno, fetched.bytes);
- }
- for pgno in missing {
- resolved.entry(pgno).or_insert(None);
- }
- Ok(resolved)
- }
- protocol::SqliteGetPagesResponse::SqliteErrorResponse(error) => {
- Err(GetPagesError::Other(error.message))
- }
- }
- }
-
- fn flush_dirty_pages(
- &self,
- ) -> std::result::Result<Option<BufferedCommitOutcome>, CommitBufferError> {
- let total_start = Instant::now();
- let request_build_start = Instant::now();
- let request = {
- let state = self.state.read();
- if state.dead {
- return Err(CommitBufferError::Other(
- "sqlite v2 actor lost its fence".to_string(),
- ));
- }
- if state.write_buffer.in_atomic_write || state.write_buffer.dirty.is_empty() {
- return Ok(None);
- }
-
- BufferedCommitRequest {
- actor_id: self.actor_id.clone(),
- generation: state.generation,
- expected_head_txid: state.head_txid,
- new_db_size_pages: state.db_size_pages,
- max_delta_bytes: state.max_delta_bytes,
- max_pages_per_stage: self.config.max_pages_per_stage,
- dirty_pages:
state - .write_buffer - .dirty - .iter() - .map(|(pgno, bytes)| protocol::SqliteDirtyPage { - pgno: *pgno, - bytes: bytes.clone(), - }) - .collect(), - } - }; - let request_build_ns = request_build_start.elapsed().as_nanos() as u64; - - let (outcome, transport_metrics) = match self - .runtime - .block_on(commit_buffered_pages(&self.transport, request.clone())) - { - Ok(outcome) => outcome, - Err(err) => { - mark_dead_for_non_fence_commit_error(self, &err); - return Err(err); - } - }; - self.commit_total - .fetch_add(1, std::sync::atomic::Ordering::Relaxed); - tracing::debug!( - dirty_pages = request.dirty_pages.len(), - path = ?outcome.path, - new_head_txid = outcome.new_head_txid, - request_build_ns, - serialize_ns = transport_metrics.serialize_ns, - transport_ns = transport_metrics.transport_ns, - "vfs commit complete (flush)" - ); - let state_update_start = Instant::now(); - let mut state = self.state.write(); - state.update_meta(&outcome.meta); - state.db_size_pages = request.new_db_size_pages; - for dirty_page in &request.dirty_pages { - state - .page_cache - .insert(dirty_page.pgno, dirty_page.bytes.clone()); - } - state.write_buffer.dirty.clear(); - let state_update_ns = state_update_start.elapsed().as_nanos() as u64; - self.add_commit_phase_metrics( - request_build_ns, - transport_metrics, - state_update_ns, - total_start.elapsed().as_nanos() as u64, - ); - Ok(Some(outcome)) - } - - fn commit_atomic_write(&self) -> std::result::Result<(), CommitBufferError> { - let total_start = Instant::now(); - let request_build_start = Instant::now(); - let request = { - let mut state = self.state.write(); - if state.dead { - return Err(CommitBufferError::Other( - "sqlite v2 actor lost its fence".to_string(), - )); - } - if !state.write_buffer.in_atomic_write { - return Ok(()); - } - if state.write_buffer.dirty.is_empty() { - state.write_buffer.in_atomic_write = false; - return Ok(()); - } - - BufferedCommitRequest { - actor_id: self.actor_id.clone(), - generation: state.generation, - expected_head_txid: state.head_txid, - new_db_size_pages: state.db_size_pages, - max_delta_bytes: state.max_delta_bytes, - max_pages_per_stage: self.config.max_pages_per_stage, - dirty_pages: state - .write_buffer - .dirty - .iter() - .map(|(pgno, bytes)| protocol::SqliteDirtyPage { - pgno: *pgno, - bytes: bytes.clone(), - }) - .collect(), - } - }; - let request_build_ns = request_build_start.elapsed().as_nanos() as u64; - - let (outcome, transport_metrics) = match self - .runtime - .block_on(commit_buffered_pages(&self.transport, request.clone())) - { - Ok(outcome) => outcome, - Err(err) => { - mark_dead_for_non_fence_commit_error(self, &err); - return Err(err); - } - }; - self.commit_total - .fetch_add(1, std::sync::atomic::Ordering::Relaxed); - tracing::debug!( - dirty_pages = request.dirty_pages.len(), - path = ?outcome.path, - new_head_txid = outcome.new_head_txid, - request_build_ns, - serialize_ns = transport_metrics.serialize_ns, - transport_ns = transport_metrics.transport_ns, - "vfs commit complete (atomic)" - ); - self.set_last_error(format!( - "post-commit atomic write succeeded: requested_db_size_pages={}, returned_db_size_pages={}, returned_head_txid={}", - request.new_db_size_pages, - outcome.meta.db_size_pages, - outcome.meta.head_txid, - )); - let state_update_start = Instant::now(); - let mut state = self.state.write(); - state.update_meta(&outcome.meta); - state.db_size_pages = request.new_db_size_pages; - for dirty_page in &request.dirty_pages { - state - .page_cache - .insert(dirty_page.pgno, 
dirty_page.bytes.clone());
- }
- state.write_buffer.dirty.clear();
- state.write_buffer.in_atomic_write = false;
- let state_update_ns = state_update_start.elapsed().as_nanos() as u64;
- self.add_commit_phase_metrics(
- request_build_ns,
- transport_metrics,
- state_update_ns,
- total_start.elapsed().as_nanos() as u64,
- );
- Ok(())
- }
-
- fn truncate_main_file(&self, size: sqlite3_int64) {
- let page_size = self.page_size() as i64;
- let truncated_pages = ((size + page_size - 1) / page_size) as u32;
- let mut state = self.state.write();
- state.db_size_pages = truncated_pages;
- state
- .write_buffer
- .dirty
- .retain(|pgno, _| *pgno <= truncated_pages);
- state.page_cache.invalidate_all();
- }
-}
-
-fn cleanup_batch_atomic_probe(db: *mut sqlite3) {
- if let Err(err) = sqlite_exec(db, "DROP TABLE IF EXISTS __rivet_batch_probe;") {
- tracing::warn!(%err, "failed to clean up sqlite v2 batch atomic probe table");
- }
-}
-
-fn assert_batch_atomic_probe(
- db: *mut sqlite3,
- vfs: &SqliteVfsV2,
-) -> std::result::Result<(), String> {
- let commit_atomic_before = vfs.commit_atomic_count();
- let probe_sql = "\
- BEGIN IMMEDIATE;\
- CREATE TABLE IF NOT EXISTS __rivet_batch_probe(x INTEGER);\
- INSERT INTO __rivet_batch_probe VALUES(1);\
- DELETE FROM __rivet_batch_probe;\
- DROP TABLE IF EXISTS __rivet_batch_probe;\
- COMMIT;\
- ";
-
- if let Err(err) = sqlite_exec(db, probe_sql) {
- cleanup_batch_atomic_probe(db);
- return Err(format!("batch atomic probe failed: {err}"));
- }
-
- let commit_atomic_after = vfs.commit_atomic_count();
- if commit_atomic_after == commit_atomic_before {
- tracing::error!(
- "batch atomic writes not active for sqlite v2, SQLITE_ENABLE_BATCH_ATOMIC_WRITE may be missing"
- );
- cleanup_batch_atomic_probe(db);
- return Err(
- "batch atomic writes not active for sqlite v2, SQLITE_ENABLE_BATCH_ATOMIC_WRITE may be missing"
- .to_string(),
- );
- }
-
- Ok(())
-}
-
-fn mark_dead_for_non_fence_commit_error(ctx: &VfsV2Context, err: &CommitBufferError) {
- match err {
- CommitBufferError::FenceMismatch(_) => {}
- CommitBufferError::StageNotFound(stage_id) => {
- ctx.mark_dead(format!(
- "sqlite v2 stage {stage_id} missing during commit finalize"
- ));
- }
- CommitBufferError::Other(message) => ctx.mark_dead(message.clone()),
- }
-}
-
-fn mark_dead_from_fence_commit_error(ctx: &VfsV2Context, err: &CommitBufferError) {
- if let CommitBufferError::FenceMismatch(reason) = err {
- ctx.mark_dead(reason.clone());
- }
-}
-
-fn dirty_pages_raw_bytes(dirty_pages: &[protocol::SqliteDirtyPage]) -> Result<u64> {
- dirty_pages.iter().try_fold(0u64, |total, dirty_page| {
- let page_len = u64::try_from(dirty_page.bytes.len())?;
- Ok(total + page_len)
- })
-}
-
-fn split_bytes(bytes: &[u8], max_chunk_bytes: usize) -> Vec<Vec<u8>> {
- if bytes.is_empty() || max_chunk_bytes == 0 {
- return vec![bytes.to_vec()];
- }
-
- bytes
- .chunks(max_chunk_bytes)
- .map(|chunk| chunk.to_vec())
- .collect()
-}
-
-#[cfg(test)]
-fn next_stage_id() -> u64 {
- NEXT_STAGE_ID.fetch_add(1, Ordering::Relaxed)
-}
-
-fn next_temp_aux_path() -> String {
- format!(
- "{TEMP_AUX_PATH_PREFIX}-{}",
- NEXT_TEMP_AUX_ID.fetch_add(1, Ordering::Relaxed)
- )
-}
-
-unsafe fn get_aux_state(file: &VfsV2File) -> Option<&AuxFileHandle> {
- (!file.aux.is_null()).then(|| &*file.aux)
-}
-
-async fn commit_buffered_pages(
- transport: &SqliteTransport,
- request: BufferedCommitRequest,
-) -> std::result::Result<(BufferedCommitOutcome, CommitTransportMetrics), CommitBufferError> {
- let raw_dirty_bytes = dirty_pages_raw_bytes(&request.dirty_pages)
-
.map_err(|err| CommitBufferError::Other(err.to_string()))?; - let mut metrics = CommitTransportMetrics::default(); - - if raw_dirty_bytes <= request.max_delta_bytes { - let serialize_start = Instant::now(); - let fast_request = protocol::SqliteCommitRequest { - actor_id: request.actor_id.clone(), - generation: request.generation, - expected_head_txid: request.expected_head_txid, - dirty_pages: request.dirty_pages.clone(), - new_db_size_pages: request.new_db_size_pages, - }; - metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64; - let transport_start = Instant::now(); - match transport - .commit(fast_request) - .await - .map_err(|err| CommitBufferError::Other(err.to_string()))? - { - protocol::SqliteCommitResponse::SqliteCommitOk(ok) => { - metrics.transport_ns += transport_start.elapsed().as_nanos() as u64; - return Ok(( - BufferedCommitOutcome { - path: CommitPath::Fast, - new_head_txid: ok.new_head_txid, - meta: ok.meta, - }, - metrics, - )); - } - protocol::SqliteCommitResponse::SqliteFenceMismatch(mismatch) => { - return Err(CommitBufferError::FenceMismatch(mismatch.reason)); - } - protocol::SqliteCommitResponse::SqliteCommitTooLarge(_) => { - metrics.transport_ns += transport_start.elapsed().as_nanos() as u64; - } - protocol::SqliteCommitResponse::SqliteErrorResponse(error) => { - return Err(CommitBufferError::Other(error.message)); - } - } - } - - let serialize_start = Instant::now(); - let stage_begin_request = protocol::SqliteCommitStageBeginRequest { - actor_id: request.actor_id.clone(), - generation: request.generation, - }; - metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64; - let transport_start = Instant::now(); - let txid = match transport - .commit_stage_begin(stage_begin_request) - .await - .map_err(|err| CommitBufferError::Other(err.to_string()))? 
- {
- protocol::SqliteCommitStageBeginResponse::SqliteCommitStageBeginOk(ok) => {
- metrics.transport_ns += transport_start.elapsed().as_nanos() as u64;
- ok.txid
- }
- protocol::SqliteCommitStageBeginResponse::SqliteFenceMismatch(mismatch) => {
- return Err(CommitBufferError::FenceMismatch(mismatch.reason));
- }
- protocol::SqliteCommitStageBeginResponse::SqliteErrorResponse(error) => {
- return Err(CommitBufferError::Other(error.message));
- }
- };
-
- let serialize_start = Instant::now();
- let encoded_delta = encode_ltx_v3(
- LtxHeader::delta(
- txid,
- request.new_db_size_pages,
- sqlite_now_ms().map_err(|err| CommitBufferError::Other(err.to_string()))?,
- ),
- &request
- .dirty_pages
- .iter()
- .map(|dirty_page| sqlite_storage::types::DirtyPage {
- pgno: dirty_page.pgno,
- bytes: dirty_page.bytes.clone(),
- })
- .collect::<Vec<_>>(),
- )
- .map_err(|err| CommitBufferError::Other(err.to_string()))?;
- let staged_chunks = split_bytes(
- &encoded_delta,
- request.max_delta_bytes.try_into().map_err(|_| {
- CommitBufferError::Other("sqlite max_delta_bytes exceeded usize".to_string())
- })?,
- );
- metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64;
-
- for (chunk_idx, chunk_bytes) in staged_chunks.iter().enumerate() {
- let serialize_start = Instant::now();
- let stage_request = protocol::SqliteCommitStageRequest {
- actor_id: request.actor_id.clone(),
- generation: request.generation,
- txid,
- chunk_idx: chunk_idx as u32,
- bytes: chunk_bytes.clone(),
- is_last: chunk_idx + 1 == staged_chunks.len(),
- };
- metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64;
- if transport
- .queue_commit_stage(stage_request.clone())
- .map_err(|err| CommitBufferError::Other(err.to_string()))?
- {
- continue;
- }
-
- let transport_start = Instant::now();
- match transport
- .commit_stage(stage_request)
- .await
- .map_err(|err| CommitBufferError::Other(err.to_string()))?
- {
- protocol::SqliteCommitStageResponse::SqliteCommitStageOk(_) => {
- metrics.transport_ns += transport_start.elapsed().as_nanos() as u64;
- }
- protocol::SqliteCommitStageResponse::SqliteFenceMismatch(mismatch) => {
- return Err(CommitBufferError::FenceMismatch(mismatch.reason));
- }
- protocol::SqliteCommitStageResponse::SqliteErrorResponse(error) => {
- return Err(CommitBufferError::Other(error.message));
- }
- }
- }
-
- let serialize_start = Instant::now();
- let finalize_request = protocol::SqliteCommitFinalizeRequest {
- actor_id: request.actor_id,
- generation: request.generation,
- expected_head_txid: request.expected_head_txid,
- txid,
- new_db_size_pages: request.new_db_size_pages,
- };
- metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64;
- let transport_start = Instant::now();
- match transport
- .commit_finalize(finalize_request)
- .await
- .map_err(|err| CommitBufferError::Other(err.to_string()))?
- {
- protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk(ok) => {
- metrics.transport_ns += transport_start.elapsed().as_nanos() as u64;
- Ok((
- BufferedCommitOutcome {
- path: CommitPath::Slow,
- new_head_txid: ok.new_head_txid,
- meta: ok.meta,
- },
- metrics,
- ))
- }
- protocol::SqliteCommitFinalizeResponse::SqliteFenceMismatch(mismatch) => {
- Err(CommitBufferError::FenceMismatch(mismatch.reason))
- }
- protocol::SqliteCommitFinalizeResponse::SqliteStageNotFound(not_found) => {
- Err(CommitBufferError::StageNotFound(not_found.stage_id))
- }
- protocol::SqliteCommitFinalizeResponse::SqliteErrorResponse(error) => {
- Err(CommitBufferError::Other(error.message))
- }
- }
-}
-
-unsafe fn get_file(p: *mut sqlite3_file) -> &'static mut VfsV2File {
- &mut *(p as *mut VfsV2File)
-}
-
-unsafe fn get_vfs_ctx(p: *mut sqlite3_vfs) -> &'static VfsV2Context {
- &*((*p).pAppData as *const VfsV2Context)
-}
-
-fn sqlite_error_message(db: *mut sqlite3) -> String {
- unsafe {
- if db.is_null() {
- "unknown sqlite error".to_string()
- } else {
- CStr::from_ptr(sqlite3_errmsg(db))
- .to_string_lossy()
- .into_owned()
- }
- }
-}
-
-fn sqlite_exec(db: *mut sqlite3, sql: &str) -> std::result::Result<(), String> {
- let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
- let rc = unsafe { sqlite3_exec(db, c_sql.as_ptr(), None, ptr::null_mut(), ptr::null_mut()) };
- if rc != SQLITE_OK {
- return Err(format!(
- "`{sql}` failed with code {rc}: {}",
- sqlite_error_message(db)
- ));
- }
- Ok(())
-}
-
-#[cfg(test)]
-fn sqlite_step_statement(db: *mut sqlite3, sql: &str) -> std::result::Result<(), String> {
- let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
- let mut stmt = ptr::null_mut();
- let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) };
- if rc != SQLITE_OK {
- return Err(format!(
- "`{sql}` prepare failed with code {rc}: {}",
- sqlite_error_message(db)
- ));
- }
- if stmt.is_null() {
- return Ok(());
- }
-
- let result = loop {
- let step_rc = unsafe { sqlite3_step(stmt) };
- if step_rc == SQLITE_DONE {
- break Ok(());
- }
- if step_rc != SQLITE_ROW {
- break Err(format!(
- "`{sql}` step failed with code {step_rc}: {}",
- sqlite_error_message(db)
- ));
- }
- };
-
- unsafe {
- sqlite3_finalize(stmt);
- }
-
- result
-}
-
-fn page_span(offset: i64, length: usize, page_size: usize) -> std::result::Result<Vec<u32>, ()> {
- if offset < 0 {
- return Err(());
- }
- if length == 0 {
- return Ok(Vec::new());
- }
-
- let start = offset as usize / page_size + 1;
- let end = (offset as usize + length - 1) / page_size + 1;
- Ok((start as u32..=end as u32).collect())
-}
-
-unsafe extern "C" fn v2_io_close(p_file: *mut sqlite3_file) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR, {
- if p_file.is_null() {
- return SQLITE_OK;
- }
- let file = get_file(p_file);
- let result = if !file.aux.is_null() {
- let aux = Box::from_raw(file.aux);
- if aux.delete_on_close {
- let ctx = &*file.ctx;
- ctx.delete_aux_file(&aux.path);
- }
- file.aux = ptr::null_mut();
- Ok(())
- } else {
- let ctx = &*file.ctx;
- let should_flush = {
- let state = ctx.state.read();
- state.write_buffer.in_atomic_write || !state.write_buffer.dirty.is_empty()
- };
- if should_flush {
- if ctx.state.read().write_buffer.in_atomic_write {
- ctx.commit_atomic_write().map(|_| ())
- } else {
- ctx.flush_dirty_pages().map(|_| ())
- }
- } else {
- Ok(())
- }
- };
- file.base.pMethods = ptr::null();
- match result {
- Ok(()) => SQLITE_OK,
- Err(err) => {
- let ctx = &*file.ctx;
- mark_dead_from_fence_commit_error(ctx, &err);
- SQLITE_IOERR
- }
- }
- })
-}
-
-unsafe extern "C" fn v2_io_read(
- p_file: *mut sqlite3_file,
- p_buf: *mut c_void,
- i_amt: c_int,
- i_offset: sqlite3_int64,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_READ, {
- if i_amt <= 0 {
- return SQLITE_OK;
- }
-
- let file = get_file(p_file);
- if let Some(aux) = get_aux_state(file) {
- if i_offset < 0 {
- return SQLITE_IOERR_READ;
- }
-
- let offset = i_offset as usize;
- let requested = i_amt as usize;
- let buf = slice::from_raw_parts_mut(p_buf.cast::<u8>(), requested);
- buf.fill(0);
-
- let bytes = aux.state.bytes.lock();
- if offset >= bytes.len() {
- return SQLITE_IOERR_SHORT_READ;
- }
-
- let copy_len = requested.min(bytes.len() - offset);
- buf[..copy_len].copy_from_slice(&bytes[offset..offset + copy_len]);
- return if copy_len < requested {
- SQLITE_IOERR_SHORT_READ
- } else {
- SQLITE_OK
- };
- }
-
- let ctx = &*file.ctx;
- if ctx.is_dead() {
- return SQLITE_IOERR_READ;
- }
-
- let buf = slice::from_raw_parts_mut(p_buf.cast::<u8>(), i_amt as usize);
- let requested_pages = match page_span(i_offset, i_amt as usize, ctx.page_size()) {
- Ok(pages) => pages,
- Err(_) => return SQLITE_IOERR_READ,
- };
- let page_size = ctx.page_size();
- let file_size = {
- let state = ctx.state.read();
- state.db_size_pages as usize * state.page_size
- };
-
- let resolved = match ctx.resolve_pages(&requested_pages, true) {
- Ok(pages) => pages,
- Err(GetPagesError::FenceMismatch(reason)) => {
- ctx.mark_dead(reason);
- return SQLITE_IOERR_READ;
- }
- Err(GetPagesError::Other(message)) => {
- ctx.mark_dead(message);
- return SQLITE_IOERR_READ;
- }
- };
- ctx.clear_last_error();
-
- buf.fill(0);
- for pgno in requested_pages {
- let Some(Some(bytes)) = resolved.get(&pgno) else {
- continue;
- };
- let page_start = (pgno as usize - 1) * page_size;
- let copy_start = page_start.max(i_offset as usize);
- let copy_end = (page_start + page_size).min(i_offset as usize + i_amt as usize);
- if copy_start >= copy_end {
- continue;
- }
- let page_offset = copy_start - page_start;
- let dest_offset = copy_start - i_offset as usize;
- let copy_len = copy_end - copy_start;
- buf[dest_offset..dest_offset + copy_len]
- .copy_from_slice(&bytes[page_offset..page_offset + copy_len]);
- }
-
- if i_offset as usize + i_amt as usize > file_size {
- return SQLITE_IOERR_SHORT_READ;
- }
-
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_io_write(
- p_file: *mut sqlite3_file,
- p_buf: *const c_void,
- i_amt: c_int,
- i_offset: sqlite3_int64,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_WRITE, {
- if i_amt <= 0 {
- return SQLITE_OK;
- }
-
- let file = get_file(p_file);
- if let Some(aux) = get_aux_state(file) {
- if i_offset < 0 {
- return SQLITE_IOERR_WRITE;
- }
-
- let offset = i_offset as usize;
- let source = slice::from_raw_parts(p_buf.cast::<u8>(), i_amt as usize);
- let mut bytes = aux.state.bytes.lock();
- let end = offset + source.len();
- if bytes.len() < end {
- bytes.resize(end, 0);
- }
- bytes[offset..end].copy_from_slice(source);
- return SQLITE_OK;
- }
-
- let ctx = &*file.ctx;
- if ctx.is_dead() {
- return SQLITE_IOERR_WRITE;
- }
-
- let page_size = ctx.page_size();
- let source = slice::from_raw_parts(p_buf.cast::<u8>(), i_amt as usize);
- let target_pages = match page_span(i_offset, i_amt as usize, page_size) {
- Ok(pages) => pages,
- Err(_) => return SQLITE_IOERR_WRITE,
- };
-
- // Fast path: for full-page aligned writes we don't need the existing
- // page data because we're overwriting every byte.
- // Skip resolve_pages to eliminate a round trip to the engine per page.
- // Also, for pages beyond db_size_pages (new allocations), there's
- // nothing to fetch.
- let offset = i_offset as usize;
- let amt = i_amt as usize;
- let is_aligned_full_page = offset % page_size == 0 && amt % page_size == 0;
-
- let resolved = if is_aligned_full_page {
- HashMap::new()
- } else {
- let (db_size_pages, pages_to_resolve): (u32, Vec<u32>) = {
- let state = ctx.state.read();
- let known_max = state.db_size_pages;
- (
- known_max,
- target_pages
- .iter()
- .copied()
- .filter(|pgno| *pgno <= known_max)
- .collect(),
- )
- };
-
- let mut resolved = if pages_to_resolve.is_empty() {
- HashMap::new()
- } else {
- match ctx.resolve_pages(&pages_to_resolve, false) {
- Ok(pages) => pages,
- Err(GetPagesError::FenceMismatch(reason)) => {
- ctx.mark_dead(reason);
- return SQLITE_IOERR_WRITE;
- }
- Err(GetPagesError::Other(message)) => {
- ctx.mark_dead(message);
- return SQLITE_IOERR_WRITE;
- }
- }
- };
- for pgno in &target_pages {
- if *pgno > db_size_pages {
- resolved.entry(*pgno).or_insert(None);
- }
- }
- resolved
- };
-
- let mut dirty_pages = BTreeMap::new();
- for pgno in target_pages {
- let page_start = (pgno as usize - 1) * page_size;
- let patch_start = page_start.max(offset);
- let patch_end = (page_start + page_size).min(offset + amt);
- let Some(copy_len) = patch_end.checked_sub(patch_start) else {
- continue;
- };
- if copy_len == 0 {
- continue;
- }
-
- let mut page = if is_aligned_full_page {
- vec![0; page_size]
- } else {
- resolved
- .get(&pgno)
- .and_then(|bytes| bytes.clone())
- .unwrap_or_else(|| vec![0; page_size])
- };
- if page.len() < page_size {
- page.resize(page_size, 0);
- }
-
- let page_offset = patch_start - page_start;
- let source_offset = patch_start - offset;
- page[page_offset..page_offset + copy_len]
- .copy_from_slice(&source[source_offset..source_offset + copy_len]);
- dirty_pages.insert(pgno, page);
- }
-
- let mut state = ctx.state.write();
- for (pgno, bytes) in dirty_pages {
- state.write_buffer.dirty.insert(pgno, bytes);
- }
- let end_page = ((offset + amt) + page_size - 1) / page_size;
- state.db_size_pages = state.db_size_pages.max(end_page as u32);
- ctx.clear_last_error();
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_io_truncate(p_file: *mut sqlite3_file, size: sqlite3_int64) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_TRUNCATE, {
- if size < 0 {
- return SQLITE_IOERR_TRUNCATE;
- }
- let file = get_file(p_file);
- if let Some(aux) = get_aux_state(file) {
- aux.state.bytes.lock().truncate(size as usize);
- return SQLITE_OK;
- }
- let ctx = &*file.ctx;
- ctx.truncate_main_file(size);
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_io_sync(p_file: *mut sqlite3_file, _flags: c_int) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_FSYNC, {
- let file = get_file(p_file);
- if get_aux_state(file).is_some() {
- return SQLITE_OK;
- }
- let ctx = &*file.ctx;
- match ctx.flush_dirty_pages() {
- Ok(_) => SQLITE_OK,
- Err(err) => {
- mark_dead_from_fence_commit_error(ctx, &err);
- SQLITE_IOERR_FSYNC
- }
- }
- })
-}
-
-unsafe extern "C" fn v2_io_file_size(
- p_file: *mut sqlite3_file,
- p_size: *mut sqlite3_int64,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_FSTAT, {
- let file = get_file(p_file);
- if let Some(aux) = get_aux_state(file) {
- *p_size = aux.state.bytes.lock().len() as sqlite3_int64;
- return SQLITE_OK;
- }
- let ctx = &*file.ctx;
- let state = ctx.state.read();
- *p_size = (state.db_size_pages as usize * state.page_size) as sqlite3_int64;
- SQLITE_OK
- })
-}
-
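For reference, a minimal standalone sketch of the page arithmetic the write fast path above depends on: `page_span` is reproduced from this file, while the `main` driver and its sample offsets are illustrative only. A write qualifies for the fast path exactly when both its offset and length are multiples of the page size, in which case every covered page is fully overwritten and no existing page data needs to be resolved.

// page_span mirrors the helper defined earlier in this file: it maps a byte
// range onto the 1-based SQLite page numbers it touches.
fn page_span(offset: i64, length: usize, page_size: usize) -> Result<Vec<u32>, ()> {
    if offset < 0 {
        return Err(());
    }
    if length == 0 {
        return Ok(Vec::new());
    }
    let start = offset as usize / page_size + 1;
    let end = (offset as usize + length - 1) / page_size + 1;
    Ok((start as u32..=end as u32).collect())
}

fn main() {
    let page_size = 4096;
    // Aligned 4 KiB write at offset 8192: covers exactly page 3, every byte of
    // which is overwritten, so the cached copy is never consulted.
    assert_eq!(page_span(8192, 4096, page_size), Ok(vec![3]));
    assert!(8192 % page_size == 0 && 4096 % page_size == 0);
    // Unaligned 100-byte write at offset 4000: straddles pages 1 and 2, so both
    // pages must be resolved first to preserve the bytes the write leaves alone.
    assert_eq!(page_span(4000, 100, page_size), Ok(vec![1, 2]));
}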
-unsafe extern "C" fn v2_io_lock(_p_file: *mut sqlite3_file, _level: c_int) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_LOCK, SQLITE_OK)
-}
-
-unsafe extern "C" fn v2_io_unlock(_p_file: *mut sqlite3_file, _level: c_int) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_UNLOCK, SQLITE_OK)
-}
-
-unsafe extern "C" fn v2_io_check_reserved_lock(
- _p_file: *mut sqlite3_file,
- p_res_out: *mut c_int,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR, {
- *p_res_out = 0;
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_io_file_control(
- p_file: *mut sqlite3_file,
- op: c_int,
- _p_arg: *mut c_void,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR, {
- let file = get_file(p_file);
- if get_aux_state(file).is_some() {
- return SQLITE_NOTFOUND;
- }
- let ctx = &*file.ctx;
-
- match op {
- SQLITE_FCNTL_BEGIN_ATOMIC_WRITE => {
- let mut state = ctx.state.write();
- state.write_buffer.in_atomic_write = true;
- state.write_buffer.saved_db_size = state.db_size_pages;
- state.write_buffer.dirty.clear();
- SQLITE_OK
- }
- SQLITE_FCNTL_COMMIT_ATOMIC_WRITE => match ctx.commit_atomic_write() {
- Ok(()) => {
- ctx.commit_atomic_count.fetch_add(1, Ordering::Relaxed);
- SQLITE_OK
- }
- Err(err) => {
- mark_dead_from_fence_commit_error(ctx, &err);
- SQLITE_IOERR
- }
- },
- SQLITE_FCNTL_ROLLBACK_ATOMIC_WRITE => {
- let mut state = ctx.state.write();
- state.write_buffer.dirty.clear();
- state.write_buffer.in_atomic_write = false;
- state.db_size_pages = state.write_buffer.saved_db_size;
- SQLITE_OK
- }
- _ => SQLITE_NOTFOUND,
- }
- })
-}
-
-unsafe extern "C" fn v2_io_sector_size(_p_file: *mut sqlite3_file) -> c_int {
- vfs_catch_unwind!(DEFAULT_PAGE_SIZE as c_int, DEFAULT_PAGE_SIZE as c_int)
-}
-
-unsafe extern "C" fn v2_io_device_characteristics(p_file: *mut sqlite3_file) -> c_int {
- vfs_catch_unwind!(0, {
- let file = get_file(p_file);
- if get_aux_state(file).is_some() {
- 0
- } else {
- SQLITE_IOCAP_BATCH_ATOMIC
- }
- })
-}
-
-unsafe extern "C" fn v2_vfs_open(
- p_vfs: *mut sqlite3_vfs,
- z_name: *const c_char,
- p_file: *mut sqlite3_file,
- flags: c_int,
- p_out_flags: *mut c_int,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_CANTOPEN, {
- let ctx = get_vfs_ctx(p_vfs);
- let delete_on_close = (flags & SQLITE_OPEN_DELETEONCLOSE) != 0;
- let path = if z_name.is_null() {
- if delete_on_close {
- next_temp_aux_path()
- } else {
- return SQLITE_CANTOPEN;
- }
- } else {
- match CStr::from_ptr(z_name).to_str() {
- Ok(path) => path.to_string(),
- Err(_) => return SQLITE_CANTOPEN,
- }
- };
- let is_main =
- path == ctx.actor_id && !delete_on_close && (flags & SQLITE_OPEN_MAIN_DB) != 0;
-
- let base = sqlite3_file {
- pMethods: ctx.io_methods.as_ref(),
- };
- let aux = if is_main {
- ptr::null_mut()
- } else {
- Box::into_raw(Box::new(AuxFileHandle {
- path: path.clone(),
- state: ctx.open_aux_file(&path),
- delete_on_close,
- }))
- };
- ptr::write(
- p_file.cast::<VfsV2File>(),
- VfsV2File {
- base,
- ctx: ctx as *const VfsV2Context,
- aux,
- },
- );
-
- if !p_out_flags.is_null() {
- *p_out_flags = flags;
- }
-
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_vfs_delete(
- p_vfs: *mut sqlite3_vfs,
- z_name: *const c_char,
- _sync_dir: c_int,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_DELETE, {
- if z_name.is_null() {
- return SQLITE_OK;
- }
-
- let ctx = get_vfs_ctx(p_vfs);
- let path = match CStr::from_ptr(z_name).to_str() {
- Ok(path) => path,
- Err(_) => return SQLITE_OK,
- };
- if path != ctx.actor_id {
- ctx.delete_aux_file(path);
- }
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_vfs_access(
- p_vfs: *mut sqlite3_vfs,
- z_name: *const c_char,
- _flags: c_int,
- p_res_out: *mut c_int,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR_ACCESS, {
- if z_name.is_null() {
- *p_res_out = 0;
- return SQLITE_OK;
- }
-
- let ctx = get_vfs_ctx(p_vfs);
- let path = match CStr::from_ptr(z_name).to_str() {
- Ok(path) => path,
- Err(_) => {
- *p_res_out = 0;
- return SQLITE_OK;
- }
- };
-
- *p_res_out = if path == ctx.actor_id || ctx.aux_file_exists(path) {
- 1
- } else {
- 0
- };
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_vfs_full_pathname(
- _p_vfs: *mut sqlite3_vfs,
- z_name: *const c_char,
- n_out: c_int,
- z_out: *mut c_char,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR, {
- if z_name.is_null() || z_out.is_null() || n_out <= 0 {
- return SQLITE_IOERR;
- }
-
- let name = CStr::from_ptr(z_name);
- let bytes = name.to_bytes_with_nul();
- if bytes.len() >= n_out as usize {
- return SQLITE_IOERR;
- }
-
- ptr::copy_nonoverlapping(bytes.as_ptr().cast::<c_char>(), z_out, bytes.len());
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_vfs_randomness(
- _p_vfs: *mut sqlite3_vfs,
- n_byte: c_int,
- z_out: *mut c_char,
-) -> c_int {
- vfs_catch_unwind!(0, {
- let buf = slice::from_raw_parts_mut(z_out.cast::<u8>(), n_byte as usize);
- match getrandom::getrandom(buf) {
- Ok(()) => n_byte,
- Err(_) => 0,
- }
- })
-}
-
-unsafe extern "C" fn v2_vfs_sleep(_p_vfs: *mut sqlite3_vfs, microseconds: c_int) -> c_int {
- vfs_catch_unwind!(0, {
- std::thread::sleep(std::time::Duration::from_micros(microseconds as u64));
- microseconds
- })
-}
-
-unsafe extern "C" fn v2_vfs_current_time(_p_vfs: *mut sqlite3_vfs, p_time_out: *mut f64) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR, {
- let now = std::time::SystemTime::now()
- .duration_since(std::time::UNIX_EPOCH)
- .unwrap_or_default();
- *p_time_out = 2440587.5 + (now.as_secs_f64() / 86400.0);
- SQLITE_OK
- })
-}
-
-unsafe extern "C" fn v2_vfs_get_last_error(
- p_vfs: *mut sqlite3_vfs,
- n_byte: c_int,
- z_err_msg: *mut c_char,
-) -> c_int {
- vfs_catch_unwind!(SQLITE_IOERR, {
- if n_byte <= 0 || z_err_msg.is_null() {
- return 0;
- }
-
- let ctx = get_vfs_ctx(p_vfs);
- let Some(message) = ctx.clone_last_error() else {
- *z_err_msg = 0;
- return 0;
- };
-
- let bytes = message.as_bytes();
- let max_len = (n_byte as usize).saturating_sub(1);
- let copy_len = bytes.len().min(max_len);
- let dst = z_err_msg.cast::<u8>();
- ptr::copy_nonoverlapping(bytes.as_ptr(), dst, copy_len);
- *dst.add(copy_len) = 0;
- 0
- })
-}
-
-impl SqliteVfsV2 {
- pub fn register(
- name: &str,
- handle: EnvoyHandle,
- actor_id: String,
- runtime: Handle,
- startup: protocol::SqliteStartupData,
- config: VfsV2Config,
- ) -> std::result::Result<Self, String> {
- Self::register_with_transport(
- name,
- SqliteTransport::from_envoy(handle),
- actor_id,
- runtime,
- startup,
- config,
- )
- }
-
- fn take_last_error(&self) -> Option<String> {
- unsafe { (*self.ctx_ptr).take_last_error() }
- }
-
- fn register_with_transport(
- name: &str,
- transport: SqliteTransport,
- actor_id: String,
- runtime: Handle,
- startup: protocol::SqliteStartupData,
- config: VfsV2Config,
- ) -> std::result::Result<Self, String> {
- let mut io_methods: sqlite3_io_methods = unsafe { std::mem::zeroed() };
- io_methods.iVersion = 1;
- io_methods.xClose = Some(v2_io_close);
- io_methods.xRead = Some(v2_io_read);
- io_methods.xWrite = Some(v2_io_write);
- io_methods.xTruncate = Some(v2_io_truncate);
- io_methods.xSync = Some(v2_io_sync);
- io_methods.xFileSize = Some(v2_io_file_size);
- io_methods.xLock = Some(v2_io_lock);
- io_methods.xUnlock = Some(v2_io_unlock);
- io_methods.xCheckReservedLock = Some(v2_io_check_reserved_lock);
- io_methods.xFileControl = Some(v2_io_file_control);
- io_methods.xSectorSize = Some(v2_io_sector_size);
- io_methods.xDeviceCharacteristics = Some(v2_io_device_characteristics);
-
- let ctx = Box::new(VfsV2Context::new(
- actor_id, runtime, transport, startup, config, io_methods,
- ));
- let ctx_ptr = Box::into_raw(ctx);
- let name_cstring = CString::new(name).map_err(|err| err.to_string())?;
-
- let mut vfs: sqlite3_vfs = unsafe { std::mem::zeroed() };
- vfs.iVersion = 1;
- vfs.szOsFile = std::mem::size_of::<VfsV2File>() as c_int;
- vfs.mxPathname = MAX_PATHNAME;
- vfs.zName = name_cstring.as_ptr();
- vfs.pAppData = ctx_ptr.cast::<c_void>();
- vfs.xOpen = Some(v2_vfs_open);
- vfs.xDelete = Some(v2_vfs_delete);
- vfs.xAccess = Some(v2_vfs_access);
- vfs.xFullPathname = Some(v2_vfs_full_pathname);
- vfs.xRandomness = Some(v2_vfs_randomness);
- vfs.xSleep = Some(v2_vfs_sleep);
- vfs.xCurrentTime = Some(v2_vfs_current_time);
- vfs.xGetLastError = Some(v2_vfs_get_last_error);
-
- let vfs_ptr = Box::into_raw(Box::new(vfs));
- let rc = unsafe { sqlite3_vfs_register(vfs_ptr, 0) };
- if rc != SQLITE_OK {
- unsafe {
- drop(Box::from_raw(vfs_ptr));
- drop(Box::from_raw(ctx_ptr));
- }
- return Err(format!("sqlite3_vfs_register failed with code {rc}"));
- }
-
- Ok(Self {
- vfs_ptr,
- _name: name_cstring,
- ctx_ptr,
- })
- }
-
- pub fn name_ptr(&self) -> *const c_char {
- self._name.as_ptr()
- }
-
- fn commit_atomic_count(&self) -> u64 {
- unsafe { (*self.ctx_ptr).commit_atomic_count.load(Ordering::Relaxed) }
- }
-}
-
-impl Drop for SqliteVfsV2 {
- fn drop(&mut self) {
- unsafe {
- sqlite3_vfs_unregister(self.vfs_ptr);
- drop(Box::from_raw(self.vfs_ptr));
- drop(Box::from_raw(self.ctx_ptr));
- }
- }
-}
-
-impl NativeDatabaseV2 {
- pub fn as_ptr(&self) -> *mut sqlite3 {
- self.db
- }
-
- pub fn take_last_kv_error(&self) -> Option<String> {
- self._vfs.take_last_error()
- }
-
- pub fn sqlite_vfs_metrics(&self) -> SqliteVfsMetricsSnapshot {
- unsafe { (*self._vfs.ctx_ptr).sqlite_vfs_metrics() }
- }
-}
-
-impl Drop for NativeDatabaseV2 {
- fn drop(&mut self) {
- if !self.db.is_null() {
- let rc = unsafe { sqlite3_close_v2(self.db) };
- if rc != SQLITE_OK {
- tracing::warn!(
- rc,
- error = sqlite_error_message(self.db),
- "failed to close sqlite v2 database"
- );
- }
- self.db = ptr::null_mut();
- }
- }
-}
-
-pub fn open_database(
- vfs: SqliteVfsV2,
- file_name: &str,
-) -> std::result::Result<NativeDatabaseV2, String> {
- let c_name = CString::new(file_name).map_err(|err| err.to_string())?;
- let mut db: *mut sqlite3 = ptr::null_mut();
-
- let rc = unsafe {
- sqlite3_open_v2(
- c_name.as_ptr(),
- &mut db,
- SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE,
- vfs.name_ptr(),
- )
- };
- if rc != SQLITE_OK {
- let message = sqlite_error_message(db);
- if !db.is_null() {
- unsafe {
- sqlite3_close(db);
- }
- }
- return Err(format!("sqlite3_open_v2 failed with code {rc}: {message}"));
- }
-
- for pragma in &[
- "PRAGMA page_size = 4096;",
- "PRAGMA journal_mode = DELETE;",
- "PRAGMA synchronous = NORMAL;",
- "PRAGMA temp_store = MEMORY;",
- "PRAGMA auto_vacuum = NONE;",
- "PRAGMA locking_mode = EXCLUSIVE;",
- ] {
- if let Err(err) = sqlite_exec(db, pragma) {
- unsafe {
- sqlite3_close(db);
- }
- return Err(err);
- }
- }
-
- if let Err(err) = assert_batch_atomic_probe(db, &vfs) {
- unsafe {
- sqlite3_close(db);
- }
- return Err(err);
- }
-
- Ok(NativeDatabaseV2 { db, _vfs: vfs })
-}
-
-#[cfg(test)]
-mod tests {
- use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering};
- use std::sync::{Arc, Mutex as StdMutex};
- use std::thread;
-
- use tempfile::TempDir;
- use tokio::runtime::Builder;
- use universaldb::Subspace;
-
- use super::*;
-
- static TEST_ID: AtomicU64 = AtomicU64::new(1);
-
- fn dirty_pages(page_count: u32, fill: u8) -> Vec<protocol::SqliteDirtyPage> {
- (0..page_count)
- .map(|offset| protocol::SqliteDirtyPage {
- pgno: offset + 1,
- bytes: vec![fill; 4096],
- })
- .collect()
- }
-
- fn next_test_name(prefix: &str) -> String {
- let id = TEST_ID.fetch_add(1, Ordering::Relaxed);
- format!("{prefix}-{id}")
- }
-
- fn random_hex() -> String {
- let mut bytes = [0u8; 8];
- getrandom::getrandom(&mut bytes).expect("random bytes should be available");
- bytes.iter().map(|byte| format!("{byte:02x}")).collect()
- }
-
- struct DirectEngineHarness {
- actor_id: String,
- db_dir: TempDir,
- subspace: Subspace,
- }
-
- impl DirectEngineHarness {
- fn new() -> Self {
- Self {
- actor_id: next_test_name("sqlite-v2-direct-actor"),
- db_dir: tempfile::tempdir().expect("temp dir should build"),
- subspace: Subspace::new(&("sqlite-v2-direct", random_hex())),
- }
- }
-
- async fn open_engine(&self) -> Arc<SqliteEngine> {
- let driver =
- universaldb::driver::RocksDbDatabaseDriver::new(self.db_dir.path().to_path_buf())
- .await
- .expect("rocksdb driver should build");
- let db = Arc::new(universaldb::Database::new(Arc::new(driver)));
- let (engine, _compaction_rx) = SqliteEngine::new(db, self.subspace.clone());
-
- Arc::new(engine)
- }
-
- async fn startup_data_for(
- &self,
- actor_id: &str,
- engine: &SqliteEngine,
- ) -> protocol::SqliteStartupData {
- let takeover = engine
- .takeover(
- actor_id,
- sqlite_storage::takeover::TakeoverConfig::new(
- sqlite_now_ms().expect("startup time should resolve"),
- ),
- )
- .await
- .expect("takeover should succeed");
-
- protocol::SqliteStartupData {
- generation: takeover.generation,
- meta: protocol_sqlite_meta(takeover.meta),
- preloaded_pages: takeover
- .preloaded_pages
- .into_iter()
- .map(protocol_fetched_page)
- .collect(),
- }
- }
-
- async fn startup_data(&self, engine: &SqliteEngine) -> protocol::SqliteStartupData {
- self.startup_data_for(&self.actor_id, engine).await
- }
-
- fn open_db_on_engine(
- &self,
- runtime: &tokio::runtime::Runtime,
- engine: Arc<SqliteEngine>,
- actor_id: &str,
- config: VfsV2Config,
- ) -> NativeDatabaseV2 {
- let startup = runtime.block_on(self.startup_data_for(actor_id, &engine));
- let vfs = SqliteVfsV2::register_with_transport(
- &next_test_name("sqlite-v2-direct-vfs"),
- SqliteTransport::from_direct(engine),
- actor_id.to_string(),
- runtime.handle().clone(),
- startup,
- config,
- )
- .expect("v2 vfs should register");
-
- open_database(vfs, actor_id).expect("sqlite database should open")
- }
-
- fn open_db(&self, runtime: &tokio::runtime::Runtime) -> NativeDatabaseV2 {
- let engine = runtime.block_on(self.open_engine());
- self.open_db_on_engine(runtime, engine, &self.actor_id, VfsV2Config::default())
- }
- }
-
- fn direct_vfs_ctx(db: &NativeDatabaseV2) -> &VfsV2Context {
- unsafe { &*db._vfs.ctx_ptr }
- }
-
- fn sqlite_query_i64(db: *mut sqlite3, sql: &str) -> std::result::Result<i64, String> {
- let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
- let mut stmt = ptr::null_mut();
- let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) };
- if rc != SQLITE_OK {
- return Err(format!(
- "`{sql}` prepare failed with code {rc}: {}",
- sqlite_error_message(db)
- ));
- }
- if stmt.is_null() {
- return Err(format!("`{sql}` returned no statement"));
- }
-
- let result = match unsafe { sqlite3_step(stmt) } {
- SQLITE_ROW => Ok(unsafe { sqlite3_column_int64(stmt, 0) }),
- step_rc => Err(format!(
- "`{sql}` step failed with code {step_rc}: {}",
- sqlite_error_message(db)
- )),
- };
-
- unsafe {
- sqlite3_finalize(stmt);
- }
-
- result
- }
-
- fn sqlite_query_text(db: *mut sqlite3, sql: &str) -> std::result::Result<String, String> {
- let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
- let mut stmt = ptr::null_mut();
- let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) };
- if rc != SQLITE_OK {
- return Err(format!(
- "`{sql}` prepare failed with code {rc}: {}",
- sqlite_error_message(db)
- ));
- }
- if stmt.is_null() {
- return Err(format!("`{sql}` returned no statement"));
- }
-
- let result = match unsafe { sqlite3_step(stmt) } {
- SQLITE_ROW => {
- let text_ptr = unsafe { sqlite3_column_text(stmt, 0) };
- if text_ptr.is_null() {
- Ok(String::new())
- } else {
- Ok(unsafe { CStr::from_ptr(text_ptr.cast()) }
- .to_string_lossy()
- .into_owned())
- }
- }
- step_rc => Err(format!(
- "`{sql}` step failed with code {step_rc}: {}",
- sqlite_error_message(db)
- )),
- };
-
- unsafe {
- sqlite3_finalize(stmt);
- }
-
- result
- }
-
- fn sqlite_file_control(db: *mut sqlite3, op: c_int) -> std::result::Result<c_int, String> {
- let main = CString::new("main").map_err(|err| err.to_string())?;
- let rc = unsafe { sqlite3_file_control(db, main.as_ptr(), op, ptr::null_mut()) };
- if rc != SQLITE_OK {
- return Err(format!(
- "sqlite3_file_control op {op} failed with code {rc}: {}",
- sqlite_error_message(db)
- ));
- }
-
- Ok(rc)
- }
-
- fn direct_runtime() -> tokio::runtime::Runtime {
- Builder::new_multi_thread()
- .worker_threads(2)
- .enable_all()
- .build()
- .expect("runtime should build")
- }
-
- #[test]
- fn predictor_prefers_stride_after_repeated_reads() {
- let mut predictor = PrefetchPredictor::default();
- for pgno in [5, 8, 11, 14] {
- predictor.record(pgno);
- }
-
- assert_eq!(predictor.multi_predict(14, 3, 30), vec![17, 20, 23]);
- }
-
- #[test]
- fn startup_data_populates_cache_without_protocol_calls() {
- let runtime = Builder::new_current_thread()
- .enable_all()
- .build()
- .expect("runtime should build");
- let protocol = Arc::new(MockProtocol::new(
- protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk {
- new_head_txid: 13,
- meta: sqlite_meta(8 * 1024 * 1024),
- }),
- protocol::SqliteCommitStageResponse::SqliteCommitStageOk(
- protocol::SqliteCommitStageOk {
- chunk_idx_committed: 0,
- },
- ),
- protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk(
- protocol::SqliteCommitFinalizeOk {
- new_head_txid: 13,
- meta: sqlite_meta(8 * 1024 * 1024),
- },
- ),
- ));
- let startup = protocol::SqliteStartupData {
- generation: 3,
- meta: sqlite_meta(8 * 1024 * 1024),
- preloaded_pages: vec![protocol::SqliteFetchedPage {
- pgno: 1,
- bytes: Some(vec![7; 4096]),
- }],
- };
-
- let ctx = VfsV2Context::new(
- "actor".to_string(),
- runtime.handle().clone(),
- SqliteTransport::from_mock(protocol.clone()),
- startup,
- VfsV2Config::default(),
- unsafe { std::mem::zeroed() },
- );
-
- assert_eq!(ctx.state.read().page_cache.get(&1), Some(vec![7; 4096]));
- assert!(protocol.get_pages_requests().is_empty());
- }
-
- #[test]
- fn direct_engine_supports_create_insert_select_and_user_version() {
- let runtime = direct_runtime();
- let harness = DirectEngineHarness::new();
- let db = harness.open_db(&runtime);
-
- assert_eq!(
- sqlite_file_control(db.as_ptr(), SQLITE_FCNTL_BEGIN_ATOMIC_WRITE)
- .expect("batch atomic begin should succeed"),
- SQLITE_OK
- );
- assert_eq!(
-
sqlite_file_control(db.as_ptr(), SQLITE_FCNTL_COMMIT_ATOMIC_WRITE) - .expect("batch atomic commit should succeed"), - SQLITE_OK - ); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO items (id, value) VALUES (1, 'alpha');", - ) - .expect("insert should succeed"); - sqlite_exec(db.as_ptr(), "PRAGMA user_version = 42;") - .expect("user_version pragma should succeed"); - - assert_eq!( - sqlite_query_text(db.as_ptr(), "SELECT value FROM items WHERE id = 1;") - .expect("select should succeed"), - "alpha" - ); - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM items;") - .expect("count should succeed"), - 1 - ); - assert_eq!( - sqlite_query_i64(db.as_ptr(), "PRAGMA user_version;") - .expect("user_version read should succeed"), - 42 - ); - } - - #[test] - fn direct_engine_handles_large_rows_and_multi_page_growth() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = harness.open_db(&runtime); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE blobs (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);", - ) - .expect("create table should succeed"); - - for _ in 0..48 { - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO blobs (payload) VALUES (randomblob(3500));", - ) - .expect("seed insert should succeed"); - } - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO blobs (payload) VALUES (randomblob(9000));", - ) - .expect("large row insert should succeed"); - - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM blobs;") - .expect("count should succeed"), - 49 - ); - assert!( - sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;").expect("page_count should succeed") - > 20 - ); - assert!( - sqlite_query_i64(db.as_ptr(), "SELECT max(length(payload)) FROM blobs;") - .expect("max payload length should succeed") - >= 9000 - ); - } - - #[test] - fn direct_engine_persists_data_across_close_and_reopen() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - - { - let db = harness.open_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE events (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO events (id, value) VALUES (1, 'persisted');", - ) - .expect("insert should succeed"); - sqlite_exec(db.as_ptr(), "PRAGMA user_version = 7;") - .expect("user_version write should succeed"); - } - - let reopened = harness.open_db(&runtime); - assert_eq!( - sqlite_query_i64(reopened.as_ptr(), "SELECT COUNT(*) FROM events;") - .expect("count after reopen should succeed"), - 1 - ); - assert_eq!( - sqlite_query_text(reopened.as_ptr(), "SELECT value FROM events WHERE id = 1;") - .expect("value after reopen should succeed"), - "persisted" - ); - assert_eq!( - sqlite_query_i64(reopened.as_ptr(), "PRAGMA user_version;") - .expect("user_version after reopen should succeed"), - 7 - ); - } - - #[test] - fn direct_engine_handles_aux_files_and_truncate_then_regrow() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = harness.open_db(&runtime); - - sqlite_exec(db.as_ptr(), "PRAGMA temp_store = FILE;") - .expect("temp_store pragma should succeed"); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE blobs (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);", - ) - .expect("create table should succeed"); - - for _ in 0..32 { - sqlite_step_statement( - 
db.as_ptr(), - "INSERT INTO blobs (payload) VALUES (randomblob(8192));", - ) - .expect("growth insert should succeed"); - } - let grown_pages = sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;") - .expect("grown page_count should succeed"); - assert!(grown_pages > 40); - - sqlite_exec( - db.as_ptr(), - "CREATE TEMP TABLE scratch AS SELECT id FROM blobs ORDER BY id DESC;", - ) - .expect("temp table should succeed"); - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM scratch;") - .expect("temp table count should succeed"), - 32 - ); - - sqlite_exec(db.as_ptr(), "DELETE FROM blobs;").expect("delete should succeed"); - sqlite_exec(db.as_ptr(), "VACUUM;").expect("vacuum should succeed"); - let shrunk_pages = sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;") - .expect("shrunk page_count should succeed"); - assert!(shrunk_pages < grown_pages); - - for _ in 0..8 { - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO blobs (payload) VALUES (randomblob(8192));", - ) - .expect("regrow insert should succeed"); - } - let regrown_pages = sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;") - .expect("regrown page_count should succeed"); - assert!(regrown_pages > shrunk_pages); - } - - #[test] - fn direct_engine_batch_atomic_probe_runs_on_open() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = harness.open_db(&runtime); - - assert!( - db._vfs.commit_atomic_count() > 0, - "open_database should run the sqlite v2 batch-atomic probe", - ); - } - - #[test] - fn direct_engine_keeps_head_txid_after_cache_miss_reads_between_commits() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let db = harness.open_db_on_engine( - &runtime, - engine, - &harness.actor_id, - VfsV2Config { - cache_capacity_pages: 2, - prefetch_depth: 0, - max_prefetch_bytes: 0, - ..VfsV2Config::default() - }, - ); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_exec(db.as_ptr(), "CREATE INDEX items_value_idx ON items(value);") - .expect("create index should succeed"); - for i in 0..120 { - sqlite_step_statement( - db.as_ptr(), - &format!( - "INSERT INTO items (id, value) VALUES ({}, 'item-{i:03}');", - i + 1 - ), - ) - .expect("seed insert should succeed"); - } - - let ctx = direct_vfs_ctx(&db); - let head_after_first_phase = ctx.state.read().head_txid; - - ctx.state.write().page_cache.invalidate_all(); - assert_eq!( - sqlite_query_text( - db.as_ptr(), - "SELECT value FROM items WHERE value = 'item-091';", - ) - .expect("cache-miss read should succeed"), - "item-091" - ); - let head_after_cache_miss = ctx.state.read().head_txid; - assert_eq!( - head_after_cache_miss, head_after_first_phase, - "cache-miss reads must not rewind head_txid", - ); - - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO items (id, value) VALUES (1000, 'after-cache-miss');", - ) - .expect("commit after cache-miss read should succeed"); - assert!( - ctx.state.read().head_txid > head_after_cache_miss, - "head_txid should still advance after the follow-up commit", - ); - } - - #[test] - fn direct_engine_uses_slow_path_for_large_real_engine_commits() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let startup = runtime.block_on(harness.startup_data(&engine)); - let dirty_pages = (1..=2300u32) - .map(|pgno| 
protocol::SqliteDirtyPage { - pgno, - bytes: vec![(pgno % 251) as u8; 4096], - }) - .collect::<Vec<_>>(); - - let outcome = runtime - .block_on(commit_buffered_pages( - &SqliteTransport::from_direct(Arc::clone(&engine)), - BufferedCommitRequest { - actor_id: harness.actor_id.clone(), - generation: startup.generation, - expected_head_txid: startup.meta.head_txid, - new_db_size_pages: 2300, - max_delta_bytes: startup.meta.max_delta_bytes, - max_pages_per_stage: 256, - dirty_pages, - }, - )) - .expect("slow-path direct commit should succeed"); - let (outcome, metrics) = outcome; - - assert_eq!(outcome.path, CommitPath::Slow); - assert_eq!(outcome.new_head_txid, startup.meta.head_txid + 1); - assert!(metrics.serialize_ns > 0); - assert!(metrics.transport_ns > 0); - - let pages = runtime - .block_on(engine.get_pages(&harness.actor_id, startup.generation, vec![1, 1024, 2300])) - .expect("pages should read back after slow-path commit"); - let expected_page_1 = vec![1u8; 4096]; - let expected_page_1024 = vec![(1024 % 251) as u8; 4096]; - let expected_page_2300 = vec![(2300 % 251) as u8; 4096]; - assert_eq!(pages.len(), 3); - assert_eq!(pages[0].bytes.as_deref(), Some(expected_page_1.as_slice())); - assert_eq!( - pages[1].bytes.as_deref(), - Some(expected_page_1024.as_slice()) - ); - assert_eq!( - pages[2].bytes.as_deref(), - Some(expected_page_2300.as_slice()) - ); - } - - #[test] - fn direct_engine_marks_vfs_dead_after_transport_errors() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let startup = runtime.block_on(harness.startup_data(&engine)); - let transport = SqliteTransport::from_direct(engine); - let hooks = transport - .direct_hooks() - .expect("direct transport should expose test hooks"); - let vfs = SqliteVfsV2::register_with_transport( - &next_test_name("sqlite-v2-direct-vfs"), - transport, - harness.actor_id.clone(), - runtime.handle().clone(), - startup, - VfsV2Config::default(), - ) - .expect("v2 vfs should register"); - let db = open_database(vfs, &harness.actor_id).expect("sqlite database should open"); - - hooks.fail_next_commit("InjectedTransportError: commit transport dropped"); - let err = sqlite_exec( - db.as_ptr(), - "CREATE TABLE broken (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect_err("failing transport commit should surface as an IO error"); - assert!( - err.contains("I/O") || err.contains("disk I/O"), - "sqlite should surface transport failure as an IO error: {err}", - ); - assert!( - direct_vfs_ctx(&db).is_dead(), - "transport error should kill the v2 VFS" - ); - assert_eq!( - db.take_last_kv_error().as_deref(), - Some("InjectedTransportError: commit transport dropped"), - ); - assert!( - sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;").is_err(), - "subsequent reads should fail once the VFS is dead", - ); - } - - #[test] - fn flush_dirty_pages_marks_vfs_dead_after_transport_error() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let startup = runtime.block_on(harness.startup_data(&engine)); - let transport = SqliteTransport::from_direct(engine); - let hooks = transport - .direct_hooks() - .expect("direct transport should expose test hooks"); - let vfs = SqliteVfsV2::register_with_transport( - &next_test_name("sqlite-v2-direct-vfs"), - transport, - harness.actor_id.clone(), - runtime.handle().clone(), - startup, - VfsV2Config::default(), - ) - .expect("v2 vfs should register"); -
let db = open_database(vfs, &harness.actor_id).expect("sqlite database should open"); - let ctx = direct_vfs_ctx(&db); - - { - let mut state = ctx.state.write(); - state.write_buffer.dirty.insert(1, vec![0x7a; 4096]); - state.db_size_pages = 1; - } - - hooks.fail_next_commit("InjectedTransportError: flush transport dropped"); - let err = ctx - .flush_dirty_pages() - .expect_err("transport failure should bubble out of flush_dirty_pages"); - - assert!( - matches!(err, CommitBufferError::Other(ref message) if message.contains("InjectedTransportError")), - "flush failure should surface as a transport error: {err:?}", - ); - assert!( - ctx.is_dead(), - "flush transport failure should poison the VFS" - ); - assert_eq!( - db.take_last_kv_error().as_deref(), - Some("InjectedTransportError: flush transport dropped"), - ); - } - - #[test] - fn commit_atomic_write_marks_vfs_dead_after_transport_error() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let startup = runtime.block_on(harness.startup_data(&engine)); - let transport = SqliteTransport::from_direct(engine); - let hooks = transport - .direct_hooks() - .expect("direct transport should expose test hooks"); - let vfs = SqliteVfsV2::register_with_transport( - &next_test_name("sqlite-v2-direct-vfs"), - transport, - harness.actor_id.clone(), - runtime.handle().clone(), - startup, - VfsV2Config::default(), - ) - .expect("v2 vfs should register"); - let db = open_database(vfs, &harness.actor_id).expect("sqlite database should open"); - let ctx = direct_vfs_ctx(&db); - - { - let mut state = ctx.state.write(); - state.write_buffer.in_atomic_write = true; - state.write_buffer.saved_db_size = state.db_size_pages; - state.write_buffer.dirty.insert(1, vec![0x5c; 4096]); - state.db_size_pages = 1; - } - - hooks.fail_next_commit("InjectedTransportError: atomic transport dropped"); - let err = ctx - .commit_atomic_write() - .expect_err("transport failure should bubble out of commit_atomic_write"); - - assert!( - matches!(err, CommitBufferError::Other(ref message) if message.contains("InjectedTransportError")), - "atomic-write failure should surface as a transport error: {err:?}", - ); - assert!( - ctx.is_dead(), - "commit_atomic_write transport failure should poison the VFS", - ); - assert_eq!( - db.take_last_kv_error().as_deref(), - Some("InjectedTransportError: atomic transport dropped"), - ); - } - - #[test] - fn direct_engine_handles_multithreaded_statement_churn() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = Arc::new(StdMutex::new(harness.open_db(&runtime))); - - { - let db = db.lock().expect("db mutex should lock"); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY AUTOINCREMENT, value TEXT NOT NULL);", - ) - .expect("create table should succeed"); - } - - let mut workers = Vec::new(); - for worker_id in 0..4 { - let db = Arc::clone(&db); - workers.push(thread::spawn(move || { - for idx in 0..40 { - let db = db.lock().expect("db mutex should lock"); - sqlite_step_statement( - db.as_ptr(), - &format!( - "INSERT INTO items (value) VALUES ('worker-{worker_id}-row-{idx}');" - ), - ) - .expect("threaded insert should succeed"); - } - })); - } - for worker in workers { - worker.join().expect("worker thread should finish"); - } - - let db = db.lock().expect("db mutex should lock"); - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM items;") - .expect("threaded row count should 
succeed"), - 160 - ); - } - - #[test] - fn direct_engine_isolates_two_actors_on_one_shared_engine() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let actor_a = next_test_name("sqlite-v2-actor-a"); - let actor_b = next_test_name("sqlite-v2-actor-b"); - let db_a = harness.open_db_on_engine( - &runtime, - Arc::clone(&engine), - &actor_a, - VfsV2Config::default(), - ); - let db_b = harness.open_db_on_engine(&runtime, engine, &actor_b, VfsV2Config::default()); - - sqlite_exec( - db_a.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("actor A create table should succeed"); - sqlite_exec( - db_b.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("actor B create table should succeed"); - sqlite_step_statement( - db_a.as_ptr(), - "INSERT INTO items (id, value) VALUES (1, 'alpha');", - ) - .expect("actor A insert should succeed"); - sqlite_step_statement( - db_b.as_ptr(), - "INSERT INTO items (id, value) VALUES (1, 'beta');", - ) - .expect("actor B insert should succeed"); - - assert_eq!( - sqlite_query_text(db_a.as_ptr(), "SELECT value FROM items WHERE id = 1;") - .expect("actor A select should succeed"), - "alpha" - ); - assert_eq!( - sqlite_query_text(db_b.as_ptr(), "SELECT value FROM items WHERE id = 1;") - .expect("actor B select should succeed"), - "beta" - ); - } - - #[test] - fn direct_engine_hot_row_updates_survive_reopen() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - - { - let db = harness.open_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE counters (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO counters (id, value) VALUES (1, 'v-0');", - ) - .expect("seed row should succeed"); - for i in 1..=150 { - sqlite_step_statement( - db.as_ptr(), - &format!("UPDATE counters SET value = 'v-{i}' WHERE id = 1;"), - ) - .expect("hot-row update should succeed"); - } - } - - let reopened = harness.open_db(&runtime); - assert_eq!( - sqlite_query_text( - reopened.as_ptr(), - "SELECT value FROM counters WHERE id = 1;" - ) - .expect("final value should survive reopen"), - "v-150" - ); - } - - #[test] - fn direct_engine_preserves_mixed_workload_across_sleep_wake() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - - { - let db = harness.open_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL, status TEXT NOT NULL);", - ) - .expect("create table should succeed"); - for id in 1..=50 { - sqlite_step_statement( - db.as_ptr(), - &format!( - "INSERT INTO items (id, value, status) VALUES ({id}, 'item-{id}', 'new');" - ), - ) - .expect("seed insert should succeed"); - } - for id in 1..=20 { - sqlite_step_statement( - db.as_ptr(), - &format!( - "UPDATE items SET status = 'updated', value = 'item-{id}-updated' WHERE id = {id};" - ), - ) - .expect("update should succeed"); - } - for id in 41..=50 { - sqlite_step_statement(db.as_ptr(), &format!("DELETE FROM items WHERE id = {id};")) - .expect("delete should succeed"); - } - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO items (id, value, status) VALUES (1000, 'disconnect-write', 'new');", - ) - .expect("disconnect-style write before close should succeed"); - } - - let reopened = harness.open_db(&runtime); - assert_eq!( - 
sqlite_query_i64(reopened.as_ptr(), "SELECT COUNT(*) FROM items;") - .expect("row count after reopen should succeed"), - 41 - ); - assert_eq!( - sqlite_query_i64( - reopened.as_ptr(), - "SELECT COUNT(*) FROM items WHERE status = 'updated';", - ) - .expect("updated row count should succeed"), - 20 - ); - assert_eq!( - sqlite_query_text( - reopened.as_ptr(), - "SELECT value FROM items WHERE id = 1000;", - ) - .expect("disconnect write should survive reopen"), - "disconnect-write" - ); - } - - #[test] - fn direct_engine_reopens_cleanly_after_failed_migration() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - - { - let db = harness.open_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_exec(db.as_ptr(), "ALTER TABLE items ADD COLUMN;") - .expect_err("broken migration should fail"); - } - - let reopened = harness.open_db(&runtime); - sqlite_step_statement( - reopened.as_ptr(), - "INSERT INTO items (id, value) VALUES (1, 'still-alive');", - ) - .expect("reopened database should still accept writes after migration failure"); - assert_eq!( - sqlite_query_text(reopened.as_ptr(), "SELECT value FROM items WHERE id = 1;") - .expect("select after reopen should succeed"), - "still-alive" - ); - } - - #[test] - fn direct_engine_reads_continue_while_compaction_runs() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let db = Arc::new(StdMutex::new(harness.open_db_on_engine( - &runtime, - Arc::clone(&engine), - &harness.actor_id, - VfsV2Config::default(), - ))); - - { - let db = db.lock().expect("db mutex should lock"); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("create table should succeed"); - for id in 1..=48 { - sqlite_step_statement( - db.as_ptr(), - &format!("INSERT INTO items (id, value) VALUES ({id}, 'row-{id}');"), - ) - .expect("seed insert should succeed"); - } - } - - let keep_reading = Arc::new(AtomicBool::new(true)); - let read_error = Arc::new(StdMutex::new(None::<String>)); - let db_for_reader = Arc::clone(&db); - let keep_reading_for_thread = Arc::clone(&keep_reading); - let read_error_for_thread = Arc::clone(&read_error); - let reader = thread::spawn(move || { - while keep_reading_for_thread.load(AtomicOrdering::Relaxed) { - let db = db_for_reader.lock().expect("db mutex should lock"); - direct_vfs_ctx(&db) - .state - .write() - .page_cache - .invalidate_all(); - if let Err(err) = - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM items WHERE id >= 1;") - { - *read_error_for_thread - .lock() - .expect("read error mutex should lock") = Some(err); - break; - } - } - }); - - runtime - .block_on(engine.compact_worker(&harness.actor_id, 8)) - .expect("compaction should succeed"); - keep_reading.store(false, AtomicOrdering::Relaxed); - reader.join().expect("reader thread should finish"); - - assert!( - read_error - .lock() - .expect("read error mutex should lock") - .is_none(), - "reads should keep working while compaction folds deltas", - ); - let db = db.lock().expect("db mutex should lock"); - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM items;") - .expect("final row count should succeed"), - 48 - ); - } - - #[test] - fn open_database_supports_empty_db_schema_setup() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should
build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 2, - ..sqlite_meta(8 * 1024 * 1024) - }, - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 2, - ..sqlite_meta(8 * 1024 * 1024) - }, - }, - ), - )); - protocol.set_mirror_commit_meta(true); - - let vfs = SqliteVfsV2::register_with_transport( - "test-v2-empty-db", - SqliteTransport::from_mock(protocol.clone()), - "actor".to_string(), - runtime.handle().clone(), - protocol::SqliteStartupData { - generation: 7, - meta: protocol::SqliteMeta { - db_size_pages: 0, - ..sqlite_meta(8 * 1024 * 1024) - }, - preloaded_pages: Vec::new(), - }, - VfsV2Config::default(), - ) - .expect("vfs should register"); - let db = open_database(vfs, "actor").expect("db should open"); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE test (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("schema setup should succeed"); - } - - #[test] - fn open_database_supports_insert_after_pragma_migration() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 32, - ..sqlite_meta(8 * 1024 * 1024) - }, - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 32, - ..sqlite_meta(8 * 1024 * 1024) - }, - }, - ), - )); - - let vfs = SqliteVfsV2::register_with_transport( - "test-v2-pragma-migration", - SqliteTransport::from_mock(protocol.clone()), - "actor".to_string(), - runtime.handle().clone(), - protocol::SqliteStartupData { - generation: 7, - meta: protocol::SqliteMeta { - db_size_pages: 0, - ..sqlite_meta(8 * 1024 * 1024) - }, - preloaded_pages: Vec::new(), - }, - VfsV2Config::default(), - ) - .expect("vfs should register"); - let db = open_database(vfs, "actor").expect("db should open"); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_exec( - db.as_ptr(), - "ALTER TABLE items ADD COLUMN status TEXT NOT NULL DEFAULT 'active';", - ) - .expect("alter table should succeed"); - sqlite_exec(db.as_ptr(), "PRAGMA user_version = 2;").expect("pragma should succeed"); - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO items (name) VALUES ('test-item');", - ) - .expect("insert after pragma migration should succeed"); - } - - #[test] - fn open_database_supports_explicit_status_insert_after_pragma_migration() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 32, - ..sqlite_meta(8 * 1024 * 1024) - }, - }), - 
protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 32, - ..sqlite_meta(8 * 1024 * 1024) - }, - }, - ), - )); - protocol.set_mirror_commit_meta(true); - - let vfs = SqliteVfsV2::register_with_transport( - "test-v2-pragma-explicit", - SqliteTransport::from_mock(protocol), - "actor".to_string(), - runtime.handle().clone(), - protocol::SqliteStartupData { - generation: 7, - meta: protocol::SqliteMeta { - db_size_pages: 0, - ..sqlite_meta(8 * 1024 * 1024) - }, - preloaded_pages: Vec::new(), - }, - VfsV2Config::default(), - ) - .expect("vfs should register"); - let db = open_database(vfs, "actor").expect("db should open"); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_exec( - db.as_ptr(), - "ALTER TABLE items ADD COLUMN status TEXT NOT NULL DEFAULT 'active';", - ) - .expect("alter table should succeed"); - sqlite_exec(db.as_ptr(), "PRAGMA user_version = 2;").expect("pragma should succeed"); - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO items (name, status) VALUES ('done-item', 'completed');", - ) - .expect("explicit status insert should succeed"); - } - - #[test] - fn open_database_supports_hot_row_update_churn() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 128, - ..sqlite_meta(8 * 1024 * 1024) - }, - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 128, - ..sqlite_meta(8 * 1024 * 1024) - }, - }, - ), - )); - protocol.set_mirror_commit_meta(true); - - let vfs = SqliteVfsV2::register_with_transport( - "test-v2-hot-row-updates", - SqliteTransport::from_mock(protocol), - "actor".to_string(), - runtime.handle().clone(), - protocol::SqliteStartupData { - generation: 7, - meta: protocol::SqliteMeta { - db_size_pages: 0, - ..sqlite_meta(8 * 1024 * 1024) - }, - preloaded_pages: Vec::new(), - }, - VfsV2Config::default(), - ) - .expect("vfs should register"); - let db = open_database(vfs, "actor").expect("db should open"); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE test_data (id INTEGER PRIMARY KEY AUTOINCREMENT, value TEXT NOT NULL, payload TEXT NOT NULL DEFAULT '', created_at INTEGER NOT NULL);", - ) - .expect("create table should succeed"); - for i in 0..10 { - sqlite_step_statement( - db.as_ptr(), - &format!( - "INSERT INTO test_data (value, payload, created_at) VALUES ('init-{i}', '', 1);" - ), - ) - .expect("seed insert should succeed"); - } - for i in 0..240 { - let row_id = i % 10 + 1; - sqlite_step_statement( - db.as_ptr(), - &format!("UPDATE test_data SET value = 'v-{i}' WHERE id = {row_id};"), - ) - .expect("hot-row update should succeed"); - } - } - - #[test] - fn open_database_supports_cross_thread_exec_sequence() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime 
should build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 32, - ..sqlite_meta(8 * 1024 * 1024) - }, - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: protocol::SqliteMeta { - db_size_pages: 32, - ..sqlite_meta(8 * 1024 * 1024) - }, - }, - ), - )); - protocol.set_mirror_commit_meta(true); - - let vfs = SqliteVfsV2::register_with_transport( - "test-v2-cross-thread", - SqliteTransport::from_mock(protocol), - "actor".to_string(), - runtime.handle().clone(), - protocol::SqliteStartupData { - generation: 7, - meta: protocol::SqliteMeta { - db_size_pages: 0, - ..sqlite_meta(8 * 1024 * 1024) - }, - preloaded_pages: Vec::new(), - }, - VfsV2Config::default(), - ) - .expect("vfs should register"); - let db = Arc::new(StdMutex::new( - open_database(vfs, "actor").expect("db should open"), - )); - - { - let db = db.clone(); - thread::spawn(move || { - let db = db.lock().expect("db mutex should lock"); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE items (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_exec( - db.as_ptr(), - "ALTER TABLE items ADD COLUMN status TEXT NOT NULL DEFAULT 'active';", - ) - .expect("alter table should succeed"); - sqlite_exec(db.as_ptr(), "PRAGMA user_version = 2;") - .expect("pragma should succeed"); - }) - .join() - .expect("migration thread should finish"); - } - - thread::spawn(move || { - let db = db.lock().expect("db mutex should lock"); - sqlite_step_statement( - db.as_ptr(), - "INSERT INTO items (name) VALUES ('test-item');", - ) - .expect("cross-thread insert should succeed"); - }) - .join() - .expect("insert thread should finish"); - } - - #[test] - fn aux_files_are_shared_by_path_until_deleted() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }, - ), - )); - let ctx = VfsV2Context::new( - "actor".to_string(), - runtime.handle().clone(), - SqliteTransport::from_mock(protocol), - protocol::SqliteStartupData { - generation: 7, - meta: sqlite_meta(8 * 1024 * 1024), - preloaded_pages: Vec::new(), - }, - VfsV2Config::default(), - unsafe { std::mem::zeroed() }, - ); - - let first = ctx.open_aux_file("actor-journal"); - first.bytes.lock().extend_from_slice(&[1, 2, 3, 4]); - let second = ctx.open_aux_file("actor-journal"); - assert_eq!(*second.bytes.lock(), vec![1, 2, 3, 4]); - assert!(ctx.aux_file_exists("actor-journal")); - - ctx.delete_aux_file("actor-journal"); - assert!(!ctx.aux_file_exists("actor-journal")); - assert!(ctx.open_aux_file("actor-journal").bytes.lock().is_empty()); - } - - #[test] - fn truncate_main_file_discards_pages_beyond_eof() { - let runtime = Builder::new_current_thread() - .enable_all() - 
.build() - .expect("runtime should build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }, - ), - )); - let ctx = VfsV2Context::new( - "actor".to_string(), - runtime.handle().clone(), - SqliteTransport::from_mock(protocol), - protocol::SqliteStartupData { - generation: 7, - meta: protocol::SqliteMeta { - db_size_pages: 4, - ..sqlite_meta(8 * 1024 * 1024) - }, - preloaded_pages: vec![ - protocol::SqliteFetchedPage { - pgno: 1, - bytes: Some(vec![1; 4096]), - }, - protocol::SqliteFetchedPage { - pgno: 4, - bytes: Some(vec![4; 4096]), - }, - ], - }, - VfsV2Config::default(), - unsafe { std::mem::zeroed() }, - ); - { - let mut state = ctx.state.write(); - state.write_buffer.dirty.insert(3, vec![3; 4096]); - state.write_buffer.dirty.insert(4, vec![4; 4096]); - } - - ctx.truncate_main_file(2 * 4096); - - let state = ctx.state.read(); - assert_eq!(state.db_size_pages, 2); - assert!(!state.write_buffer.dirty.contains_key(&3)); - assert!(!state.write_buffer.dirty.contains_key(&4)); - assert!(state.page_cache.get(&4).is_none()); - } - - #[test] - fn resolve_pages_does_not_rewind_meta_on_stale_response() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build"); - let mut protocol = MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }, - ), - ); - protocol.get_pages_response = - protocol::SqliteGetPagesResponse::SqliteGetPagesOk(protocol::SqliteGetPagesOk { - pages: vec![protocol::SqliteFetchedPage { - pgno: 2, - bytes: Some(vec![2; 4096]), - }], - meta: protocol::SqliteMeta { - head_txid: 1, - db_size_pages: 1, - max_delta_bytes: 32 * 1024 * 1024, - ..sqlite_meta(8 * 1024 * 1024) - }, - }); - let ctx = VfsV2Context::new( - "actor".to_string(), - runtime.handle().clone(), - SqliteTransport::from_mock(Arc::new(protocol)), - protocol::SqliteStartupData { - generation: 7, - meta: protocol::SqliteMeta { - head_txid: 3, - db_size_pages: 3, - ..sqlite_meta(8 * 1024 * 1024) - }, - preloaded_pages: vec![protocol::SqliteFetchedPage { - pgno: 1, - bytes: Some(vec![1; 4096]), - }], - }, - VfsV2Config::default(), - unsafe { std::mem::zeroed() }, - ); - - let resolved = ctx - .resolve_pages(&[2], false) - .expect("missing page should resolve"); - - assert_eq!(resolved.get(&2), Some(&Some(vec![2; 4096]))); - let state = ctx.state.read(); - assert_eq!(state.head_txid, 3); - assert_eq!(state.db_size_pages, 3); - assert_eq!(state.max_delta_bytes, 32 * 1024 * 1024); - } - - #[test] - fn resolve_pages_does_not_shrink_db_size_pages_on_same_head_response() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build"); - let mut protocol = MockProtocol::new( - 
protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }, - ), - ); - protocol.get_pages_response = - protocol::SqliteGetPagesResponse::SqliteGetPagesOk(protocol::SqliteGetPagesOk { - pages: vec![protocol::SqliteFetchedPage { - pgno: 4, - bytes: Some(vec![4; 4096]), - }], - meta: protocol::SqliteMeta { - head_txid: 3, - db_size_pages: 1, - max_delta_bytes: 16 * 1024 * 1024, - ..sqlite_meta(8 * 1024 * 1024) - }, - }); - let ctx = VfsV2Context::new( - "actor".to_string(), - runtime.handle().clone(), - SqliteTransport::from_mock(Arc::new(protocol)), - protocol::SqliteStartupData { - generation: 7, - meta: protocol::SqliteMeta { - head_txid: 3, - db_size_pages: 4, - ..sqlite_meta(8 * 1024 * 1024) - }, - preloaded_pages: vec![protocol::SqliteFetchedPage { - pgno: 1, - bytes: Some(vec![1; 4096]), - }], - }, - VfsV2Config::default(), - unsafe { std::mem::zeroed() }, - ); - - let resolved = ctx - .resolve_pages(&[4], false) - .expect("missing page should resolve"); - - assert_eq!(resolved.get(&4), Some(&Some(vec![4; 4096]))); - let state = ctx.state.read(); - assert_eq!(state.head_txid, 3); - assert_eq!(state.db_size_pages, 4); - assert_eq!(state.max_delta_bytes, 16 * 1024 * 1024); - } - - #[test] - fn commit_buffered_pages_uses_fast_path() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { - new_head_txid: 13, - meta: sqlite_meta(8 * 1024 * 1024), - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - protocol::SqliteCommitFinalizeOk { - new_head_txid: 14, - meta: sqlite_meta(8 * 1024 * 1024), - }, - ), - )); - - let outcome = runtime - .block_on(commit_buffered_pages( - &SqliteTransport::from_mock(protocol.clone()), - BufferedCommitRequest { - actor_id: "actor".to_string(), - generation: 7, - expected_head_txid: 12, - new_db_size_pages: 1, - max_delta_bytes: 8 * 1024 * 1024, - max_pages_per_stage: 4_000, - dirty_pages: dirty_pages(1, 9), - }, - )) - .expect("fast-path commit should succeed"); - let (outcome, metrics) = outcome; - - assert_eq!(outcome.path, CommitPath::Fast); - assert_eq!(outcome.new_head_txid, 13); - assert!(metrics.serialize_ns > 0); - assert!(metrics.transport_ns > 0); - assert_eq!(protocol.commit_requests().len(), 1); - assert!(protocol.stage_requests().is_empty()); - assert!(protocol.finalize_requests().is_empty()); - } - - #[test] - fn commit_buffered_pages_falls_back_to_slow_path() { - let runtime = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build"); - let protocol = Arc::new(MockProtocol::new( - protocol::SqliteCommitResponse::SqliteCommitTooLarge(protocol::SqliteCommitTooLarge { - actual_size_bytes: 3 * 4096, - max_size_bytes: 4096, - }), - protocol::SqliteCommitStageResponse::SqliteCommitStageOk( - protocol::SqliteCommitStageOk { - chunk_idx_committed: 0, - }, - ), - protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( - 
protocol::SqliteCommitFinalizeOk { - new_head_txid: 14, - meta: sqlite_meta(4096), - }, - ), - )); - - let protocol_for_release = protocol.clone(); - let release = std::thread::spawn(move || { - runtime.block_on(async { - protocol_for_release.finalize_started.notified().await; - assert_eq!(protocol_for_release.awaited_stage_responses(), 0); - protocol_for_release.release_finalize.notify_one(); - }); - }); - - let outcome = Builder::new_current_thread() - .enable_all() - .build() - .expect("runtime should build") - .block_on(commit_buffered_pages( - &SqliteTransport::from_mock(protocol.clone()), - BufferedCommitRequest { - actor_id: "actor".to_string(), - generation: 7, - expected_head_txid: 12, - new_db_size_pages: 3, - max_delta_bytes: 4096, - max_pages_per_stage: 1, - dirty_pages: dirty_pages(3, 4), - }, - )) - .expect("slow-path commit should succeed"); - let (outcome, metrics) = outcome; - - release.join().expect("release thread should finish"); - - assert_eq!(outcome.path, CommitPath::Slow); - assert_eq!(outcome.new_head_txid, 14); - assert!(metrics.serialize_ns > 0); - assert!(metrics.transport_ns > 0); - assert!(protocol.commit_requests().is_empty()); - assert!(!protocol.stage_requests().is_empty()); - assert!( - protocol - .stage_requests() - .iter() - .enumerate() - .all(|(chunk_idx, request)| request.chunk_idx as usize == chunk_idx) - ); - assert!( - protocol - .stage_requests() - .last() - .is_some_and(|request| request.is_last) - ); - assert_eq!(protocol.awaited_stage_responses(), 0); - assert_eq!(protocol.finalize_requests().len(), 1); - } - - #[test] - fn vfs_records_commit_phase_durations() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = harness.open_db(&runtime); - let ctx = direct_vfs_ctx(&db); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE metrics_test (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", - ) - .expect("create table should succeed"); - - let relaxed = std::sync::atomic::Ordering::Relaxed; - ctx.commit_request_build_ns.store(0, relaxed); - ctx.commit_serialize_ns.store(0, relaxed); - ctx.commit_transport_ns.store(0, relaxed); - ctx.commit_state_update_ns.store(0, relaxed); - ctx.commit_duration_ns_total.store(0, relaxed); - ctx.commit_total.store(0, relaxed); - - sqlite_exec( - db.as_ptr(), - "INSERT INTO metrics_test (id, value) VALUES (1, 'hello');", - ) - .expect("insert should succeed"); - - let metrics = db.sqlite_vfs_metrics(); - assert_eq!(metrics.commit_count, 1); - assert!(metrics.request_build_ns > 0); - assert!(metrics.serialize_ns > 0); - assert!(metrics.transport_ns > 0); - assert!(metrics.state_update_ns > 0); - assert!(metrics.total_ns >= metrics.request_build_ns); - assert!(metrics.request_build_ns + metrics.transport_ns + metrics.state_update_ns > 0); - } - - #[test] - fn profile_large_tx_insert_5mb() { - // 5MB = 1280 rows x 4KB blobs in one transaction - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = harness.open_db(&runtime); - let ctx = direct_vfs_ctx(&db); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE bench (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);", - ) - .expect("create table should succeed"); - - let relaxed = std::sync::atomic::Ordering::Relaxed; - ctx.resolve_pages_total.store(0, relaxed); - ctx.resolve_pages_cache_hits.store(0, relaxed); - ctx.resolve_pages_fetches.store(0, relaxed); - ctx.pages_fetched_total.store(0, relaxed); - ctx.prefetch_pages_total.store(0, relaxed); - ctx.commit_total.store(0, relaxed); - - let start = 
std::time::Instant::now(); - sqlite_exec(db.as_ptr(), "BEGIN;").expect("begin"); - for i in 0..1280 { - sqlite_step_statement( - db.as_ptr(), - &format!( - "INSERT INTO bench (id, payload) VALUES ({}, randomblob(4096));", - i - ), - ) - .expect("insert should succeed"); - } - sqlite_exec(db.as_ptr(), "COMMIT;").expect("commit"); - let elapsed = start.elapsed(); - - let resolve_total = ctx.resolve_pages_total.load(relaxed); - let cache_hits = ctx.resolve_pages_cache_hits.load(relaxed); - let fetches = ctx.resolve_pages_fetches.load(relaxed); - let pages_fetched = ctx.pages_fetched_total.load(relaxed); - let prefetch = ctx.prefetch_pages_total.load(relaxed); - let commits = ctx.commit_total.load(relaxed); - - eprintln!("=== 5MB INSERT PROFILE (1280 rows x 4KB) ==="); - eprintln!(" wall clock: {:?}", elapsed); - eprintln!(" resolve_pages calls: {}", resolve_total); - eprintln!(" cache hits (pages): {}", cache_hits); - eprintln!(" engine fetches: {}", fetches); - eprintln!(" pages fetched total: {}", pages_fetched); - eprintln!(" prefetch pages: {}", prefetch); - eprintln!(" commits: {}", commits); - eprintln!("============================================"); - - // In a single transaction, all 1280 row writes are to new pages. - // Only the single commit at the end should hit the engine. - assert_eq!( - fetches, 0, - "expected 0 engine fetches during 5MB insert transaction" - ); - assert_eq!( - commits, 1, - "expected exactly 1 commit for transactional insert" - ); - - let count = sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM bench;") - .expect("count should succeed"); - assert_eq!(count, 1280); - } - - #[test] - fn profile_hot_row_updates() { - // 100 updates to the same row - this is the autocommit case - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = harness.open_db(&runtime); - let ctx = direct_vfs_ctx(&db); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE counter (id INTEGER PRIMARY KEY, value INTEGER NOT NULL);", - ) - .expect("create"); - sqlite_exec(db.as_ptr(), "INSERT INTO counter VALUES (1, 0);").expect("insert"); - - let relaxed = std::sync::atomic::Ordering::Relaxed; - ctx.resolve_pages_total.store(0, relaxed); - ctx.resolve_pages_cache_hits.store(0, relaxed); - ctx.resolve_pages_fetches.store(0, relaxed); - ctx.pages_fetched_total.store(0, relaxed); - ctx.prefetch_pages_total.store(0, relaxed); - ctx.commit_total.store(0, relaxed); - - let start = std::time::Instant::now(); - for _ in 0..100 { - sqlite_exec( - db.as_ptr(), - "UPDATE counter SET value = value + 1 WHERE id = 1;", - ) - .expect("update"); - } - let elapsed = start.elapsed(); - - let fetches = ctx.resolve_pages_fetches.load(relaxed); - let commits = ctx.commit_total.load(relaxed); - - eprintln!("=== 100 HOT ROW UPDATES (autocommit) ==="); - eprintln!(" wall clock: {:?}", elapsed); - eprintln!( - " resolve_pages calls: {}", - ctx.resolve_pages_total.load(relaxed) - ); - eprintln!( - " cache hits (pages): {}", - ctx.resolve_pages_cache_hits.load(relaxed) - ); - eprintln!(" engine fetches: {}", fetches); - eprintln!( - " pages fetched total: {}", - ctx.pages_fetched_total.load(relaxed) - ); - eprintln!( - " prefetch pages: {}", - ctx.prefetch_pages_total.load(relaxed) - ); - eprintln!(" commits: {}", commits); - eprintln!("========================================="); - - // Hot row updates: each update modifies the same page. Pages already - // in write_buffer or cache should not need re-fetching. 
With the - // counter's page(s) already warm, subsequent updates should be - // 100% cache hits (0 fetches). Autocommit means 100 separate commits. - assert_eq!( - fetches, 0, - "expected 0 engine fetches for 100 hot row updates" - ); - assert_eq!( - commits, 100, - "expected 100 commits (autocommit per statement)" - ); - } - - #[test] - fn profile_large_tx_insert_1mb_preloaded() { - // Same as the 1MB test but preload all pages first to see commit-only cost - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let actor_id = &harness.actor_id; - - // First pass: create and populate the table to generate pages - let db1 = - harness.open_db_on_engine(&runtime, engine.clone(), actor_id, VfsV2Config::default()); - sqlite_exec( - db1.as_ptr(), - "CREATE TABLE bench (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);", - ) - .expect("create table should succeed"); - sqlite_exec(db1.as_ptr(), "BEGIN;").expect("begin"); - for i in 0..256 { - sqlite_step_statement( - db1.as_ptr(), - &format!( - "INSERT INTO bench (id, payload) VALUES ({}, randomblob(4096));", - i - ), - ) - .expect("insert should succeed"); - } - sqlite_exec(db1.as_ptr(), "COMMIT;").expect("commit"); - drop(db1); - - // Second pass: reopen with warm cache (takeover preloads page 1, rest from reads) - let db2 = - harness.open_db_on_engine(&runtime, engine.clone(), actor_id, VfsV2Config::default()); - let ctx = direct_vfs_ctx(&db2); - - // Warm the cache by reading everything - sqlite_exec(db2.as_ptr(), "SELECT COUNT(*) FROM bench;").expect("count"); - - // Reset counters - let relaxed = std::sync::atomic::Ordering::Relaxed; - ctx.resolve_pages_total.store(0, relaxed); - ctx.resolve_pages_cache_hits.store(0, relaxed); - ctx.resolve_pages_fetches.store(0, relaxed); - ctx.pages_fetched_total.store(0, relaxed); - ctx.prefetch_pages_total.store(0, relaxed); - ctx.commit_total.store(0, relaxed); - - let start = std::time::Instant::now(); - sqlite_exec(db2.as_ptr(), "BEGIN;").expect("begin"); - for i in 256..512 { - sqlite_step_statement( - db2.as_ptr(), - &format!( - "INSERT INTO bench (id, payload) VALUES ({}, randomblob(4096));", - i - ), - ) - .expect("insert should succeed"); - } - sqlite_exec(db2.as_ptr(), "COMMIT;").expect("commit"); - let elapsed = start.elapsed(); - - let resolve_total = ctx.resolve_pages_total.load(relaxed); - let cache_hits = ctx.resolve_pages_cache_hits.load(relaxed); - let fetches = ctx.resolve_pages_fetches.load(relaxed); - let pages_fetched = ctx.pages_fetched_total.load(relaxed); - let prefetch = ctx.prefetch_pages_total.load(relaxed); - let commits = ctx.commit_total.load(relaxed); - - eprintln!("=== 1MB INSERT PROFILE (WARM CACHE) ==="); - eprintln!(" wall clock: {:?}", elapsed); - eprintln!(" resolve_pages calls: {}", resolve_total); - eprintln!(" cache hits (pages): {}", cache_hits); - eprintln!(" engine fetches: {}", fetches); - eprintln!(" pages fetched total: {}", pages_fetched); - eprintln!(" prefetch pages: {}", prefetch); - eprintln!(" commits: {}", commits); - eprintln!("========================================"); - - // Second 256-row transaction into the already-populated table. - // All new pages are beyond db_size_pages, so no engine fetches. 
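// NOTE(editor): the zero-fetch expectation asserted below follows from one rule,
// sketched here with hypothetical names (this is not the real VFS API): appends
// past the current end of the database are written whole, so only a write that
// lands on an existing page that is neither dirty nor cached must fetch that page
// from the engine first.
fn editorial_needs_engine_fetch(
    pgno: u32,
    db_size_pages: u32,
    dirty: &std::collections::BTreeMap<u32, Vec<u8>>,
    cached: &std::collections::HashSet<u32>,
) -> bool {
    // Pages beyond EOF are brand new; there is nothing to read back first.
    pgno <= db_size_pages && !dirty.contains_key(&pgno) && !cached.contains(&pgno)
}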
- assert_eq!( - fetches, 0, - "expected 0 engine fetches during warm 1MB insert" - ); - assert_eq!( - commits, 1, - "expected exactly 1 commit for transactional insert" - ); - - let count = sqlite_query_i64(db2.as_ptr(), "SELECT COUNT(*) FROM bench;") - .expect("count should succeed"); - assert_eq!(count, 512); - } - - #[test] - fn profile_large_tx_insert_1mb() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = harness.open_db(&runtime); - let ctx = direct_vfs_ctx(&db); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE bench (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);", - ) - .expect("create table should succeed"); - - // Reset counters after schema setup - ctx.resolve_pages_total - .store(0, std::sync::atomic::Ordering::Relaxed); - ctx.resolve_pages_cache_hits - .store(0, std::sync::atomic::Ordering::Relaxed); - ctx.resolve_pages_fetches - .store(0, std::sync::atomic::Ordering::Relaxed); - ctx.pages_fetched_total - .store(0, std::sync::atomic::Ordering::Relaxed); - ctx.prefetch_pages_total - .store(0, std::sync::atomic::Ordering::Relaxed); - ctx.commit_total - .store(0, std::sync::atomic::Ordering::Relaxed); - - let start = std::time::Instant::now(); - - sqlite_exec(db.as_ptr(), "BEGIN;").expect("begin should succeed"); - for i in 0..256 { - sqlite_step_statement( - db.as_ptr(), - &format!( - "INSERT INTO bench (id, payload) VALUES ({}, randomblob(4096));", - i - ), - ) - .expect("insert should succeed"); - } - sqlite_exec(db.as_ptr(), "COMMIT;").expect("commit should succeed"); - - let elapsed = start.elapsed(); - let relaxed = std::sync::atomic::Ordering::Relaxed; - - let resolve_total = ctx.resolve_pages_total.load(relaxed); - let cache_hits = ctx.resolve_pages_cache_hits.load(relaxed); - let fetches = ctx.resolve_pages_fetches.load(relaxed); - let pages_fetched = ctx.pages_fetched_total.load(relaxed); - let prefetch = ctx.prefetch_pages_total.load(relaxed); - let commits = ctx.commit_total.load(relaxed); - - eprintln!("=== 1MB INSERT PROFILE (256 rows x 4KB) ==="); - eprintln!(" wall clock: {:?}", elapsed); - eprintln!(" resolve_pages calls: {}", resolve_total); - eprintln!(" cache hits (pages): {}", cache_hits); - eprintln!(" engine fetches: {}", fetches); - eprintln!(" pages fetched total: {}", pages_fetched); - eprintln!(" prefetch pages: {}", prefetch); - eprintln!(" commits: {}", commits); - eprintln!("============================================"); - - // Assert expected zero-fetch behavior: in a single transaction, - // all writes are to new pages, so no engine fetches should happen. - // Only the single commit at the end should hit the engine. - assert_eq!( - fetches, 0, - "expected 0 engine fetches during 1MB insert transaction" - ); - assert_eq!( - commits, 1, - "expected exactly 1 commit for transactional insert" - ); - - let count = sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM bench;") - .expect("count should succeed"); - assert_eq!(count, 256); - } - - // Regression test for fence mismatch during rapid autocommit inserts. - // Each autocommit INSERT is its own transaction. This test drives many - // sequential commits through the VFS and verifies they all succeed. 
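// NOTE(editor): sketch of the fence this regression guards, with assumed names
// rather than the engine's real API: every commit carries the head_txid the writer
// last observed, and the engine applies it only when that fence still matches,
// advancing the head by one. The test below drives 100 such fences through real
// autocommits.
fn editorial_fence_sketch(
    head_txid: &mut u64,
    expected_head_txid: u64,
) -> std::result::Result<u64, String> {
    if expected_head_txid != *head_txid {
        // The "commit head_txid X did not match current head_txid X-1" failure mode.
        return Err(format!(
            "commit head_txid {expected_head_txid} did not match current head_txid {head_txid}"
        ));
    }
    *head_txid += 1;
    Ok(*head_txid)
}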
- #[test] - fn autocommit_inserts_maintain_head_txid_consistency() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let db = harness.open_db(&runtime); - let ctx = direct_vfs_ctx(&db); - - sqlite_exec( - db.as_ptr(), - "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);", - ) - .expect("create table should succeed"); - - let relaxed = std::sync::atomic::Ordering::Relaxed; - ctx.commit_total.store(0, relaxed); - - // 100 sequential autocommit inserts. If fence mismatch is the bug, - // this will fail partway through with "commit head_txid X did not - // match current head_txid X-1". - for i in 0..100 { - sqlite_exec( - db.as_ptr(), - &format!("INSERT INTO t (id, v) VALUES ({i}, {});", i * 2), - ) - .expect("autocommit insert should not fence-mismatch"); - } - - let commits = ctx.commit_total.load(relaxed); - // Each autocommit INSERT = 1 commit. CREATE TABLE was 1 more. - // We reset commit_total after CREATE, so expect 100. - assert_eq!(commits, 100, "expected exactly 100 commits"); - - let count = - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM t;").expect("count should succeed"); - assert_eq!(count, 100); - - // Verify the sum to make sure data is correct and not corrupted - let sum = - sqlite_query_i64(db.as_ptr(), "SELECT SUM(v) FROM t;").expect("sum should succeed"); - assert_eq!(sum, (0..100).map(|i| i * 2).sum::<i64>()); - } - - // Regression test: 5 actors run 200 autocommits each on the same engine. - // Compaction is triggered via the mpsc channel after each commit, so this - // also exercises the commit-vs-compaction race that caused fence rewinds - // before the tx_get_value_serializable fix. - #[test] - fn stress_concurrent_multi_actor_autocommits() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - - let mut dbs = Vec::new(); - for i in 0..5 { - let actor_id = format!("{}-stress-{}", harness.actor_id, i); - let db = harness.open_db_on_engine( - &runtime, - engine.clone(), - &actor_id, - VfsV2Config::default(), - ); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);", - ) - .expect("create"); - dbs.push(db); - } - - // Interleave 200 autocommit inserts across all 5 actors - for i in 0..200 { - for db in &dbs { - sqlite_exec( - db.as_ptr(), - &format!("INSERT INTO t (id, v) VALUES ({i}, {i});"), - ) - .expect("insert"); - } - } - - for db in &dbs { - let count = sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM t;").expect("count"); - assert_eq!(count, 200); - } - } - - // Regression test: two actors run autocommits concurrently on the same - // SqliteEngine. If anything in the engine (e.g., compaction) cross-contaminates - // actors or races on shared state, we'd see fence mismatches.
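// NOTE(editor): the isolation property reduced to a toy model (type and field
// names are editorial assumptions, not the engine's API): fences are keyed by
// actor_id, so one actor's commits can never observe or rewind another actor's head.
#[derive(Default)]
struct EditorialHeads(std::collections::HashMap<String, u64>);
impl EditorialHeads {
    fn commit(&mut self, actor_id: &str, expected: u64) -> std::result::Result<u64, String> {
        let head = self.0.entry(actor_id.to_string()).or_default();
        if *head != expected {
            return Err("fence mismatch".to_string());
        }
        *head += 1;
        Ok(*head)
    }
}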
- #[test] - fn concurrent_multi_actor_autocommits() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - - let actor_a = format!("{}-a", harness.actor_id); - let actor_b = format!("{}-b", harness.actor_id); - - let db_a = - harness.open_db_on_engine(&runtime, engine.clone(), &actor_a, VfsV2Config::default()); - let db_b = - harness.open_db_on_engine(&runtime, engine.clone(), &actor_b, VfsV2Config::default()); - - sqlite_exec( - db_a.as_ptr(), - "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);", - ) - .expect("create a"); - sqlite_exec( - db_b.as_ptr(), - "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);", - ) - .expect("create b"); - - // Run 100 autocommits on each actor, interleaved. - for i in 0..100 { - sqlite_exec( - db_a.as_ptr(), - &format!("INSERT INTO t (id, v) VALUES ({i}, {i});"), - ) - .expect("insert a"); - sqlite_exec( - db_b.as_ptr(), - &format!("INSERT INTO t (id, v) VALUES ({i}, {i});"), - ) - .expect("insert b"); - } - - let count_a = sqlite_query_i64(db_a.as_ptr(), "SELECT COUNT(*) FROM t;").expect("count a"); - assert_eq!(count_a, 100); - let count_b = sqlite_query_i64(db_b.as_ptr(), "SELECT COUNT(*) FROM t;").expect("count b"); - assert_eq!(count_b, 100); - } - - // Same as above but across a close/reopen cycle to exercise takeover. - #[test] - fn autocommit_survives_close_reopen() { - let runtime = direct_runtime(); - let harness = DirectEngineHarness::new(); - let engine = runtime.block_on(harness.open_engine()); - let actor_id = &harness.actor_id; - - { - let db = harness.open_db_on_engine( - &runtime, - engine.clone(), - actor_id, - VfsV2Config::default(), - ); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);", - ) - .expect("create table"); - for i in 0..50 { - sqlite_exec( - db.as_ptr(), - &format!("INSERT INTO t (id, v) VALUES ({i}, {});", i), - ) - .expect("insert"); - } - } - - // Reopen (triggers takeover which bumps generation) - let db2 = - harness.open_db_on_engine(&runtime, engine.clone(), actor_id, VfsV2Config::default()); - for i in 50..100 { - sqlite_exec( - db2.as_ptr(), - &format!("INSERT INTO t (id, v) VALUES ({i}, {});", i), - ) - .expect("insert after reopen"); - } - - let count = sqlite_query_i64(db2.as_ptr(), "SELECT COUNT(*) FROM t;") - .expect("count should succeed"); - assert_eq!(count, 100); - } - - // Bench-parity tests. Each mirrors a workload in - // examples/kitchen-sink/src/actors/testing/test-sqlite-bench.ts so - // storage-layer regressions surface here without needing the full stack. 
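// NOTE(editor): the `large_tx_insert` helper below sizes each workload by rounding
// the byte target up to whole 4 KiB rows (ceiling division), so 500 KiB becomes 125
// rows and 50 MiB becomes 12_800 rows, the size that forces the staged slow-path
// commit. Standalone editorial check of that arithmetic:
#[test]
fn editorial_row_sizing_sketch() {
    fn rows_for_target(target_bytes: usize, row_size: usize) -> usize {
        (target_bytes + row_size - 1) / row_size // ceiling division
    }
    assert_eq!(rows_for_target(500 * 1024, 4 * 1024), 125);
    assert_eq!(rows_for_target(50 * 1024 * 1024, 4 * 1024), 12_800);
}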
- - fn open_bench_db(runtime: &tokio::runtime::Runtime) -> NativeDatabaseV2 { - let harness = DirectEngineHarness::new(); - harness.open_db(runtime) - } - - #[test] - fn bench_insert_tx_x10000() { - let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER);", - ) - .unwrap(); - - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for i in 0..10_000 { - sqlite_exec( - db.as_ptr(), - &format!("INSERT INTO t (id, v) VALUES ({i}, {i});"), - ) - .unwrap(); - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM t;").unwrap(), - 10_000 - ); - } - - #[test] - fn bench_large_tx_insert_500kb() { - large_tx_insert(500 * 1024); - } - - #[test] - fn bench_large_tx_insert_10mb() { - large_tx_insert(10 * 1024 * 1024); - } - - #[test] - fn bench_large_tx_insert_50mb() { - // 50MB exercises the slow-path stage/finalize chunking that has - // historically hit decode errors under certain transports. - large_tx_insert(50 * 1024 * 1024); - } - - fn large_tx_insert(target_bytes: usize) { - let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE large_tx (id INTEGER PRIMARY KEY AUTOINCREMENT, payload BLOB NOT NULL);", - ) - .unwrap(); - - let row_size = 4 * 1024; - let rows = (target_bytes + row_size - 1) / row_size; - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for _ in 0..rows { - sqlite_exec( - db.as_ptr(), - &format!("INSERT INTO large_tx (payload) VALUES (randomblob({row_size}));"), - ) - .unwrap(); - } - if let Err(err) = sqlite_exec(db.as_ptr(), "COMMIT") { - let vfs_err = direct_vfs_ctx(&db).clone_last_error(); - panic!( - "COMMIT failed for {} MiB: sqlite={}, vfs_last_error={:?}", - target_bytes / (1024 * 1024), - err, - vfs_err, - ); - } - - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM large_tx;").unwrap(), - rows as i64 - ); - } - - #[test] - fn bench_churn_insert_delete_10x1000() { - // Tests freelist reuse / space reclamation under heavy churn. 
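// NOTE(editor): the behavior exercised here is standard SQLite: pages freed by
// DELETE land on the freelist and are reused by later inserts, so page_count
// should plateau across iterations instead of growing tenfold. A hypothetical
// spot-check (not part of the original test) could read:
//
//     let pages_after_first_cycle = sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;").unwrap();
//     // ... remaining churn cycles ...
//     let pages_after_last_cycle = sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;").unwrap();
//     assert!(pages_after_last_cycle <= pages_after_first_cycle * 2);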
- let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE churn (id INTEGER PRIMARY KEY AUTOINCREMENT, payload BLOB NOT NULL);", - ) - .unwrap(); - for _ in 0..10 { - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for _ in 0..1000 { - sqlite_exec( - db.as_ptr(), - "INSERT INTO churn (payload) VALUES (randomblob(1024));", - ) - .unwrap(); - } - sqlite_exec(db.as_ptr(), "DELETE FROM churn;").unwrap(); - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - } - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM churn;").unwrap(), - 0 - ); - } - - #[test] - fn bench_mixed_oltp_large() { - let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE mixed (id INTEGER PRIMARY KEY, v INTEGER NOT NULL, data BLOB NOT NULL);", - ) - .unwrap(); - - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for i in 0..500 { - sqlite_exec( - db.as_ptr(), - &format!( - "INSERT INTO mixed (id, v, data) VALUES ({i}, {}, randomblob(1024));", - i * 2 - ), - ) - .unwrap(); - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for i in 0..500 { - sqlite_exec( - db.as_ptr(), - &format!( - "INSERT INTO mixed (id, v, data) VALUES ({}, {}, randomblob(1024));", - 500 + i, - i * 3 - ), - ) - .unwrap(); - sqlite_exec( - db.as_ptr(), - &format!("UPDATE mixed SET v = v + 1 WHERE id = {i};"), - ) - .unwrap(); - if i % 5 == 0 && i >= 50 { - sqlite_exec( - db.as_ptr(), - &format!("DELETE FROM mixed WHERE id = {};", i - 50), - ) - .unwrap(); - } - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - - let count = sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM mixed;").unwrap(); - assert!(count > 900 && count < 1000); - } - - #[test] - fn bench_bulk_update_1000_rows() { - let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE bulk (id INTEGER PRIMARY KEY, v INTEGER);", - ) - .unwrap(); - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for i in 0..1000 { - sqlite_exec( - db.as_ptr(), - &format!("INSERT INTO bulk (id, v) VALUES ({i}, {i});"), - ) - .unwrap(); - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for i in 0..1000 { - sqlite_exec( - db.as_ptr(), - &format!("UPDATE bulk SET v = v + 1 WHERE id = {i};"), - ) - .unwrap(); - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT SUM(v) FROM bulk;").unwrap(), - (0..1000).map(|i| i + 1).sum::<i64>() - ); - } - - #[test] - fn bench_truncate_and_regrow() { - let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE regrow (id INTEGER PRIMARY KEY AUTOINCREMENT, payload BLOB NOT NULL);", - ) - .unwrap(); - for _ in 0..2 { - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for _ in 0..500 { - sqlite_exec( - db.as_ptr(), - "INSERT INTO regrow (payload) VALUES (randomblob(1024));", - ) - .unwrap(); - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - sqlite_exec(db.as_ptr(), "DELETE FROM regrow;").unwrap(); - } - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM regrow;").unwrap(), - 0 - ); - } - - #[test] - fn bench_many_small_tables() { - let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for i in 0..50 { - sqlite_exec( - db.as_ptr(), - &format!("CREATE TABLE t_{i} (id INTEGER PRIMARY KEY, v INTEGER);"), - ) - .unwrap(); - for j in 0..10 {
sqlite_exec( - db.as_ptr(), - &format!("INSERT INTO t_{i} (id, v) VALUES ({j}, {});", i * j), - ) - .unwrap(); - } - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - - let total: i64 = (0..50) - .map(|i| { - sqlite_query_i64(db.as_ptr(), &format!("SELECT COUNT(*) FROM t_{i};")).unwrap() - }) - .sum(); - assert_eq!(total, 500); - } - - #[test] - fn bench_index_creation_on_10k_rows() { - let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE idx_test (id INTEGER PRIMARY KEY AUTOINCREMENT, k TEXT NOT NULL, v INTEGER NOT NULL);", - ) - .unwrap(); - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for i in 0..10_000 { - sqlite_exec( - db.as_ptr(), - &format!( - "INSERT INTO idx_test (k, v) VALUES ('key-{}-{i}', {i});", - i % 1000 - ), - ) - .unwrap(); - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - - sqlite_exec(db.as_ptr(), "CREATE INDEX idx_test_k ON idx_test(k);").unwrap(); - - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM idx_test;").unwrap(), - 10_000 - ); - } - - #[test] - fn bench_growing_aggregation() { - let runtime = direct_runtime(); - let db = open_bench_db(&runtime); - sqlite_exec( - db.as_ptr(), - "CREATE TABLE agg (id INTEGER PRIMARY KEY AUTOINCREMENT, v INTEGER NOT NULL);", - ) - .unwrap(); - - let batches = 20; - let per_batch = 100; - for batch in 0..batches { - sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); - for i in 0..per_batch { - sqlite_exec( - db.as_ptr(), - &format!("INSERT INTO agg (v) VALUES ({});", batch * per_batch + i), - ) - .unwrap(); - } - sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - let expected_sum: i64 = (0..(batch + 1) * per_batch).map(|i| i as i64).sum(); - assert_eq!( - sqlite_query_i64(db.as_ptr(), "SELECT SUM(v) FROM agg;").unwrap(), - expected_sum - ); - } - } -} diff --git a/rivetkit-rust/packages/rivetkit-sqlite/src/vfs.rs b/rivetkit-rust/packages/rivetkit-sqlite/src/vfs.rs index cc7899701f..ee908bfd87 100644 --- a/rivetkit-rust/packages/rivetkit-sqlite/src/vfs.rs +++ b/rivetkit-rust/packages/rivetkit-sqlite/src/vfs.rs @@ -2,30 +2,60 @@ //! //! This crate now owns the KV-backed SQLite behavior used by `rivetkit-napi`. 
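+//!
+//! The v2 layout is page-granular rather than chunk-granular. `SqliteTransport`
+//! abstracts the wire: a live `EnvoyHandle` in production, or a direct
+//! `SqliteEngine` / `MockProtocol` under `#[cfg(test)]`. `VfsState` tracks the
+//! fenced `generation`/`head_txid` alongside a moka page cache, a `WriteBuffer`
+//! of dirty pages, and a stride/Markov `PrefetchPredictor`. Each commit reports
+//! a `CommitPath`: `Fast` (single request) or `Slow` (staged upload via
+//! `commit_stage_begin`/`commit_stage`/`commit_finalize`).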
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, HashMap, HashSet};
 use std::ffi::{CStr, CString, c_char, c_int, c_void};
 use std::ptr;
 use std::slice;
 use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::{Arc, Mutex, OnceLock};
+use std::sync::Arc;
+use std::time::Instant;
 
+use anyhow::Result;
 use libsqlite3_sys::*;
+use moka::sync::Cache;
+use parking_lot::{Mutex, RwLock};
+use rivet_envoy_client::handle::EnvoyHandle;
+use rivet_envoy_protocol as protocol;
+use sqlite_storage::ltx::{encode_ltx_v3, LtxHeader};
+#[cfg(test)]
+use sqlite_storage::{engine::SqliteEngine, error::SqliteStorageError};
 use tokio::runtime::Handle;
+#[cfg(test)]
+use tokio::sync::Notify;
+
+const DEFAULT_CACHE_CAPACITY_PAGES: u64 = 50_000;
+const DEFAULT_PREFETCH_DEPTH: usize = 16;
+const DEFAULT_MAX_PREFETCH_BYTES: usize = 256 * 1024;
+const DEFAULT_MAX_PAGES_PER_STAGE: usize = 4_000;
+const DEFAULT_PAGE_SIZE: usize = 4096;
+const MAX_PATHNAME: c_int = 64;
+const TEMP_AUX_PATH_PREFIX: &str = "__sqlite_temp__";
+const EMPTY_DB_PAGE_HEADER_PREFIX: [u8; 108] = [
+    83, 81, 76, 105, 116, 101, 32, 102, 111, 114, 109, 97, 116, 32, 51, 0, 16, 0, 1, 1, 0, 64, 32,
+    32, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 46, 138, 17, 13, 0, 0, 0, 0, 16, 0, 0,
+];
 
-use crate::kv;
-use crate::sqlite_kv::{KvGetResult, SqliteKv, SqliteKvError};
+#[cfg(test)]
+static NEXT_STAGE_ID: AtomicU64 = AtomicU64::new(1);
+static NEXT_TEMP_AUX_ID: AtomicU64 = AtomicU64::new(1);
 
 unsafe extern "C" {
     fn sqlite3_close_v2(db: *mut sqlite3) -> c_int;
 }
 
-// MARK: Panic Guard
+fn empty_db_page() -> Vec<u8> {
+    let mut page = vec![0u8; DEFAULT_PAGE_SIZE];
+    page[..EMPTY_DB_PAGE_HEADER_PREFIX.len()].copy_from_slice(&EMPTY_DB_PAGE_HEADER_PREFIX);
+    page
+}
 
 fn panic_message(payload: &Box<dyn std::any::Any + Send>) -> String {
-    if let Some(s) = payload.downcast_ref::<&str>() {
-        s.to_string()
-    } else if let Some(s) = payload.downcast_ref::<String>() {
-        s.clone()
+    if let Some(message) = payload.downcast_ref::<&str>() {
+        message.to_string()
+    } else if let Some(message) = payload.downcast_ref::<String>() {
+        message.clone()
     } else {
         "unknown panic".to_string()
     }
 }
@@ -36,1282 +66,2223 @@ macro_rules! vfs_catch_unwind {
         match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| $body)) {
             Ok(result) => result,
             Err(panic) => {
-                tracing::error!(message = panic_message(&panic), "vfs callback panicked");
+                tracing::error!(
+                    message = panic_message(&panic),
+                    "sqlite callback panicked"
+                );
                 $err_val
             }
         }
     };
 }
 
-// MARK: Constants
+#[derive(Clone)]
+struct SqliteTransport {
+    inner: Arc<SqliteTransportInner>,
+}
 
-/// File metadata version for KV-backed SQLite storage.
-const META_VERSION: u16 = 1;
+enum SqliteTransportInner {
+    Envoy(EnvoyHandle),
+    #[cfg(test)]
+    Direct {
+        engine: Arc<SqliteEngine>,
+        hooks: Arc<DirectTransportHooks>,
+    },
+    #[cfg(test)]
+    Test(Arc<MockProtocol>),
+}
 
-/// Encoded metadata size. This is 2 bytes of version plus 8 bytes of size.
-const META_ENCODED_SIZE: usize = 10;
+impl SqliteTransport {
+    fn from_envoy(handle: EnvoyHandle) -> Self {
+        Self {
+            inner: Arc::new(SqliteTransportInner::Envoy(handle)),
+        }
+    }
 
-/// Maximum pathname length reported to SQLite.
-const MAX_PATHNAME: c_int = 64; + #[cfg(test)] + fn from_direct(engine: Arc) -> Self { + Self { + inner: Arc::new(SqliteTransportInner::Direct { + engine, + hooks: Arc::new(DirectTransportHooks::default()), + }), + } + } -/// Maximum number of keys accepted by a single KV put or delete request. -const KV_MAX_BATCH_KEYS: usize = 128; + #[cfg(test)] + fn from_mock(protocol: Arc) -> Self { + Self { + inner: Arc::new(SqliteTransportInner::Test(protocol)), + } + } -/// Opt-in flag for the native read cache. Disabled by default to match the WASM VFS. -const READ_CACHE_ENV_VAR: &str = "RIVETKIT_SQLITE_NATIVE_READ_CACHE"; + #[cfg(test)] + fn direct_hooks(&self) -> Option> { + match &*self.inner { + SqliteTransportInner::Direct { hooks, .. } => Some(Arc::clone(hooks)), + _ => None, + } + } -/// First 108 bytes of a valid empty page-1 SQLite database. -/// -/// This is the canonical empty page-1 header for the KV-backed SQLite VFS. -const EMPTY_DB_PAGE_HEADER_PREFIX: [u8; 108] = [ - 83, 81, 76, 105, 116, 101, 32, 102, 111, 114, 109, 97, 116, 32, 51, 0, 16, 0, 1, 1, 0, 64, 32, - 32, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 46, 138, 17, 13, 0, 0, 0, 0, 16, 0, 0, -]; + async fn get_pages( + &self, + req: protocol::SqliteGetPagesRequest, + ) -> Result { + match &*self.inner { + SqliteTransportInner::Envoy(handle) => handle.sqlite_get_pages(req).await, + #[cfg(test)] + SqliteTransportInner::Direct { engine, .. } => { + let pgnos = req.pgnos.clone(); + match engine.get_pages(&req.actor_id, req.generation, pgnos).await { + Ok(pages) => Ok(protocol::SqliteGetPagesResponse::SqliteGetPagesOk( + protocol::SqliteGetPagesOk { + pages: pages.into_iter().map(protocol_fetched_page).collect(), + meta: protocol_sqlite_meta(engine.load_meta(&req.actor_id).await?), + }, + )), + Err(err) => { + if let Some(SqliteStorageError::FenceMismatch { reason }) = + sqlite_storage_error(&err) + { + Ok(protocol::SqliteGetPagesResponse::SqliteFenceMismatch( + protocol::SqliteFenceMismatch { + actual_meta: protocol_sqlite_meta( + engine.load_meta(&req.actor_id).await?, + ), + reason: reason.clone(), + }, + )) + } else if matches!( + sqlite_storage_error(&err), + Some(SqliteStorageError::MetaMissing { operation }) + if *operation == "get_pages" && req.generation == 1 + ) { + match engine + .takeover( + &req.actor_id, + sqlite_storage::takeover::TakeoverConfig::new(1), + ) + .await + { + Ok(_) => {} + Err(takeover_err) + if matches!( + sqlite_storage_error(&takeover_err), + Some(SqliteStorageError::ConcurrentTakeover) + ) => {} + Err(takeover_err) => { + return Ok( + protocol::SqliteGetPagesResponse::SqliteErrorResponse( + sqlite_error_response(&takeover_err), + ), + ); + } + } + + match engine + .get_pages(&req.actor_id, req.generation, req.pgnos) + .await + { + Ok(pages) => { + Ok(protocol::SqliteGetPagesResponse::SqliteGetPagesOk( + protocol::SqliteGetPagesOk { + pages: pages + .into_iter() + .map(protocol_fetched_page) + .collect(), + meta: protocol_sqlite_meta( + engine.load_meta(&req.actor_id).await?, + ), + }, + )) + } + Err(retry_err) => { + Ok(protocol::SqliteGetPagesResponse::SqliteErrorResponse( + sqlite_error_response(&retry_err), + )) + } + } + } else { + Ok(protocol::SqliteGetPagesResponse::SqliteErrorResponse( + sqlite_error_response(&err), + )) + } + } + } + } + #[cfg(test)] + SqliteTransportInner::Test(protocol) => 
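+                // Test-only path: the mock records the request and replies with
+                // the canned response configured by the test (see `MockProtocol`
+                // below).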
protocol.get_pages(req).await, + } + } -fn empty_db_page() -> Vec { - let mut page = vec![0u8; kv::CHUNK_SIZE]; - page[..EMPTY_DB_PAGE_HEADER_PREFIX.len()].copy_from_slice(&EMPTY_DB_PAGE_HEADER_PREFIX); - page -} + async fn commit( + &self, + req: protocol::SqliteCommitRequest, + ) -> Result { + match &*self.inner { + SqliteTransportInner::Envoy(handle) => handle.sqlite_commit(req).await, + #[cfg(test)] + SqliteTransportInner::Direct { engine, hooks } => { + if let Some(message) = hooks.take_commit_error() { + return Err(anyhow::anyhow!(message)); + } -// MARK: Metadata Encoding + match engine + .commit( + &req.actor_id, + sqlite_storage::commit::CommitRequest { + generation: req.generation, + head_txid: req.expected_head_txid, + db_size_pages: req.new_db_size_pages, + dirty_pages: req + .dirty_pages + .into_iter() + .map(storage_dirty_page) + .collect(), + now_ms: sqlite_now_ms()?, + }, + ) + .await + { + Ok(result) => Ok(protocol::SqliteCommitResponse::SqliteCommitOk( + protocol::SqliteCommitOk { + new_head_txid: result.txid, + meta: protocol_sqlite_meta(result.meta), + }, + )), + Err(err) => { + if let Some(SqliteStorageError::FenceMismatch { reason }) = + sqlite_storage_error(&err) + { + Ok(protocol::SqliteCommitResponse::SqliteFenceMismatch( + protocol::SqliteFenceMismatch { + actual_meta: protocol_sqlite_meta( + engine.load_meta(&req.actor_id).await?, + ), + reason: reason.clone(), + }, + )) + } else if let Some(SqliteStorageError::CommitTooLarge { + actual_size_bytes, + max_size_bytes, + }) = sqlite_storage_error(&err) + { + Ok(protocol::SqliteCommitResponse::SqliteCommitTooLarge( + protocol::SqliteCommitTooLarge { + actual_size_bytes: *actual_size_bytes, + max_size_bytes: *max_size_bytes, + }, + )) + } else { + Ok(protocol::SqliteCommitResponse::SqliteErrorResponse( + sqlite_error_response(&err), + )) + } + } + } + } + #[cfg(test)] + SqliteTransportInner::Test(protocol) => protocol.commit(req).await, + } + } -pub fn encode_file_meta(size: i64) -> Vec { - let mut buf = Vec::with_capacity(META_ENCODED_SIZE); - buf.extend_from_slice(&META_VERSION.to_le_bytes()); - buf.extend_from_slice(&(size as u64).to_le_bytes()); - buf -} + async fn commit_stage_begin( + &self, + req: protocol::SqliteCommitStageBeginRequest, + ) -> Result { + match &*self.inner { + SqliteTransportInner::Envoy(handle) => handle.sqlite_commit_stage_begin(req).await, + #[cfg(test)] + SqliteTransportInner::Direct { engine, .. 
} => { + match engine + .commit_stage_begin( + &req.actor_id, + sqlite_storage::commit::CommitStageBeginRequest { + generation: req.generation, + }, + ) + .await + { + Ok(result) => Ok( + protocol::SqliteCommitStageBeginResponse::SqliteCommitStageBeginOk( + protocol::SqliteCommitStageBeginOk { txid: result.txid }, + ), + ), + Err(err) => { + if let Some(SqliteStorageError::FenceMismatch { reason }) = + sqlite_storage_error(&err) + { + Ok( + protocol::SqliteCommitStageBeginResponse::SqliteFenceMismatch( + protocol::SqliteFenceMismatch { + actual_meta: protocol_sqlite_meta( + engine.load_meta(&req.actor_id).await?, + ), + reason: reason.clone(), + }, + ), + ) + } else { + Ok( + protocol::SqliteCommitStageBeginResponse::SqliteErrorResponse( + sqlite_error_response(&err), + ), + ) + } + } + } + } + #[cfg(test)] + SqliteTransportInner::Test(protocol) => protocol.commit_stage_begin(req).await, + } + } -pub fn decode_file_meta(data: &[u8]) -> Option { - if data.len() < META_ENCODED_SIZE { - return None; + async fn commit_stage( + &self, + req: protocol::SqliteCommitStageRequest, + ) -> Result { + match &*self.inner { + SqliteTransportInner::Envoy(handle) => handle.sqlite_commit_stage(req).await, + #[cfg(test)] + SqliteTransportInner::Direct { engine, .. } => { + match engine + .commit_stage( + &req.actor_id, + sqlite_storage::commit::CommitStageRequest { + generation: req.generation, + txid: req.txid, + chunk_idx: req.chunk_idx, + bytes: req.bytes, + is_last: req.is_last, + }, + ) + .await + { + Ok(result) => Ok(protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: result.chunk_idx_committed, + }, + )), + Err(err) => { + if let Some(SqliteStorageError::FenceMismatch { reason }) = + sqlite_storage_error(&err) + { + Ok(protocol::SqliteCommitStageResponse::SqliteFenceMismatch( + protocol::SqliteFenceMismatch { + actual_meta: protocol_sqlite_meta( + engine.load_meta(&req.actor_id).await?, + ), + reason: reason.clone(), + }, + )) + } else { + Ok(protocol::SqliteCommitStageResponse::SqliteErrorResponse( + sqlite_error_response(&err), + )) + } + } + } + } + #[cfg(test)] + SqliteTransportInner::Test(protocol) => protocol.commit_stage(req).await, + } } - let version_bytes: [u8; 2] = data[0..2].try_into().ok()?; - if u16::from_le_bytes(version_bytes) != META_VERSION { - return None; + + fn queue_commit_stage(&self, req: protocol::SqliteCommitStageRequest) -> Result { + match &*self.inner { + SqliteTransportInner::Envoy(handle) => { + handle.sqlite_commit_stage_fire_and_forget(req)?; + Ok(true) + } + #[cfg(test)] + SqliteTransportInner::Direct { .. } => Ok(false), + #[cfg(test)] + SqliteTransportInner::Test(protocol) => { + protocol.queue_commit_stage(req); + Ok(true) + } + } } - let size_bytes: [u8; 8] = data[2..10].try_into().ok()?; - let size = u64::from_le_bytes(size_bytes); - if size > i64::MAX as u64 { - return None; + + async fn commit_finalize( + &self, + req: protocol::SqliteCommitFinalizeRequest, + ) -> Result { + match &*self.inner { + SqliteTransportInner::Envoy(handle) => handle.sqlite_commit_finalize(req).await, + #[cfg(test)] + SqliteTransportInner::Direct { engine, .. 
} => { + match engine + .commit_finalize( + &req.actor_id, + sqlite_storage::commit::CommitFinalizeRequest { + generation: req.generation, + expected_head_txid: req.expected_head_txid, + txid: req.txid, + new_db_size_pages: req.new_db_size_pages, + now_ms: sqlite_now_ms()?, + origin_override: None, + }, + ) + .await + { + Ok(result) => Ok( + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: result.new_head_txid, + meta: protocol_sqlite_meta(result.meta), + }, + ), + ), + Err(err) => { + if let Some(SqliteStorageError::FenceMismatch { reason }) = + sqlite_storage_error(&err) + { + Ok(protocol::SqliteCommitFinalizeResponse::SqliteFenceMismatch( + protocol::SqliteFenceMismatch { + actual_meta: protocol_sqlite_meta( + engine.load_meta(&req.actor_id).await?, + ), + reason: reason.clone(), + }, + )) + } else if let Some(SqliteStorageError::StageNotFound { stage_id }) = + sqlite_storage_error(&err) + { + Ok(protocol::SqliteCommitFinalizeResponse::SqliteStageNotFound( + protocol::SqliteStageNotFound { + stage_id: *stage_id, + }, + )) + } else { + Ok(protocol::SqliteCommitFinalizeResponse::SqliteErrorResponse( + sqlite_error_response(&err), + )) + } + } + } + } + #[cfg(test)] + SqliteTransportInner::Test(protocol) => protocol.commit_finalize(req).await, + } } - Some(size as i64) } -fn is_valid_file_size(size: i64) -> bool { - size >= 0 && (size as u64) <= kv::MAX_FILE_SIZE +#[cfg(test)] +#[derive(Default)] +struct DirectTransportHooks { + fail_next_commit: Mutex>, } -fn read_cache_enabled() -> bool { - static READ_CACHE_ENABLED: OnceLock = OnceLock::new(); +#[cfg(test)] +impl DirectTransportHooks { + fn fail_next_commit(&self, message: impl Into) { + *self.fail_next_commit.lock() = Some(message.into()); + } - *READ_CACHE_ENABLED.get_or_init(|| { - std::env::var(READ_CACHE_ENV_VAR) - .map(|value| { - matches!( - value.to_ascii_lowercase().as_str(), - "1" | "true" | "yes" | "on" - ) - }) - .unwrap_or(false) - }) + fn take_commit_error(&self) -> Option { + self.fail_next_commit.lock().take() + } } -type StartupPreloadEntries = Vec<(Vec, Vec)>; +#[cfg(test)] +fn protocol_sqlite_meta(meta: sqlite_storage::types::SqliteMeta) -> protocol::SqliteMeta { + protocol::SqliteMeta { + schema_version: meta.schema_version, + generation: meta.generation, + head_txid: meta.head_txid, + materialized_txid: meta.materialized_txid, + db_size_pages: meta.db_size_pages, + page_size: meta.page_size, + creation_ts_ms: meta.creation_ts_ms, + max_delta_bytes: meta.max_delta_bytes, + } +} -fn sort_startup_preload(entries: &mut StartupPreloadEntries) { - entries.sort_by(|a, b| a.0.cmp(&b.0)); +#[cfg(test)] +fn protocol_fetched_page(page: sqlite_storage::types::FetchedPage) -> protocol::SqliteFetchedPage { + protocol::SqliteFetchedPage { + pgno: page.pgno, + bytes: page.bytes, + } } -fn startup_preload_search(entries: &StartupPreloadEntries, key: &[u8]) -> Result { - entries.binary_search_by(|(candidate, _)| candidate.as_slice().cmp(key)) +#[cfg(test)] +fn storage_dirty_page(page: protocol::SqliteDirtyPage) -> sqlite_storage::types::DirtyPage { + sqlite_storage::types::DirtyPage { + pgno: page.pgno, + bytes: page.bytes, + } } -fn startup_preload_get<'a>(entries: &'a StartupPreloadEntries, key: &[u8]) -> Option<&'a [u8]> { - startup_preload_search(entries, key) - .ok() - .map(|idx| entries[idx].1.as_slice()) +#[cfg(test)] +fn sqlite_storage_error(err: &anyhow::Error) -> Option<&SqliteStorageError> { + err.downcast_ref::() } -fn startup_preload_put(entries: &mut 
StartupPreloadEntries, key: &[u8], value: &[u8]) { - if let Ok(idx) = startup_preload_search(entries, key) { - entries[idx].1 = value.to_vec(); - } +#[cfg(test)] +fn sqlite_error_reason(err: &anyhow::Error) -> String { + err.chain() + .map(ToString::to_string) + .collect::>() + .join(": ") } -fn startup_preload_delete(entries: &mut StartupPreloadEntries, key: &[u8]) { - if let Ok(idx) = startup_preload_search(entries, key) { - entries.remove(idx); +#[cfg(test)] +fn sqlite_error_response(err: &anyhow::Error) -> protocol::SqliteErrorResponse { + protocol::SqliteErrorResponse { + message: sqlite_error_reason(err), } } -fn startup_preload_delete_range(entries: &mut StartupPreloadEntries, start: &[u8], end: &[u8]) { - entries.retain(|(key, _)| key.as_slice() < start || key.as_slice() >= end); -} +fn sqlite_now_ms() -> Result { + use std::time::{SystemTime, UNIX_EPOCH}; -// MARK: VFS Metrics + Ok(SystemTime::now() + .duration_since(UNIX_EPOCH)? + .as_millis() + .try_into()?) +} -/// Per-VFS-callback operation metrics for diagnosing native SQLite VFS performance. -pub struct VfsMetrics { - pub xread_count: AtomicU64, - pub xread_us: AtomicU64, - pub xwrite_count: AtomicU64, - pub xwrite_us: AtomicU64, - pub xwrite_buffered_count: AtomicU64, - pub xsync_count: AtomicU64, - pub xsync_us: AtomicU64, - pub commit_atomic_count: AtomicU64, - pub commit_atomic_us: AtomicU64, - pub commit_atomic_pages: AtomicU64, +#[cfg(test)] +struct MockProtocol { + commit_response: protocol::SqliteCommitResponse, + stage_response: protocol::SqliteCommitStageResponse, + finalize_response: protocol::SqliteCommitFinalizeResponse, + get_pages_response: protocol::SqliteGetPagesResponse, + mirror_commit_meta: Mutex, + commit_requests: Mutex>, + stage_requests: Mutex>, + awaited_stage_responses: Mutex, + finalize_requests: Mutex>, + get_pages_requests: Mutex>, + finalize_started: Notify, + release_finalize: Notify, } -impl VfsMetrics { - pub fn new() -> Self { +#[cfg(test)] +impl MockProtocol { + fn new( + commit_response: protocol::SqliteCommitResponse, + stage_response: protocol::SqliteCommitStageResponse, + finalize_response: protocol::SqliteCommitFinalizeResponse, + ) -> Self { Self { - xread_count: AtomicU64::new(0), - xread_us: AtomicU64::new(0), - xwrite_count: AtomicU64::new(0), - xwrite_us: AtomicU64::new(0), - xwrite_buffered_count: AtomicU64::new(0), - xsync_count: AtomicU64::new(0), - xsync_us: AtomicU64::new(0), - commit_atomic_count: AtomicU64::new(0), - commit_atomic_us: AtomicU64::new(0), - commit_atomic_pages: AtomicU64::new(0), + commit_response, + stage_response, + finalize_response, + get_pages_response: protocol::SqliteGetPagesResponse::SqliteGetPagesOk( + protocol::SqliteGetPagesOk { + pages: vec![], + meta: sqlite_meta(8 * 1024 * 1024), + }, + ), + mirror_commit_meta: Mutex::new(false), + commit_requests: Mutex::new(Vec::new()), + stage_requests: Mutex::new(Vec::new()), + awaited_stage_responses: Mutex::new(0), + finalize_requests: Mutex::new(Vec::new()), + get_pages_requests: Mutex::new(Vec::new()), + finalize_started: Notify::new(), + release_finalize: Notify::new(), } } -} - -// MARK: VFS Context - -struct VfsContext { - kv: Arc, - actor_id: String, - main_file_name: String, - // Bounded startup entries shipped with actor start. This is not the opt-in read cache. 
- startup_preload: Mutex>, - read_cache_enabled: bool, - last_error: Mutex>, - rt_handle: Handle, - io_methods: Box, - vfs_metrics: Arc, -} -impl VfsContext { - fn clear_last_error(&self) { - match self.last_error.lock() { - Ok(mut last_error) => { - *last_error = None; - } - Err(err) => { - tracing::warn!(%err, "native sqlite last_error mutex poisoned"); - } - } + fn commit_requests(&self) -> parking_lot::MutexGuard<'_, Vec> { + self.commit_requests.lock() } - fn set_last_error(&self, message: String) { - match self.last_error.lock() { - Ok(mut last_error) => { - *last_error = Some(message); - } - Err(err) => { - tracing::warn!(%err, "native sqlite last_error mutex poisoned"); - } - } + fn stage_requests( + &self, + ) -> parking_lot::MutexGuard<'_, Vec> { + self.stage_requests.lock() } - fn clone_last_error(&self) -> Option { - match self.last_error.lock() { - Ok(last_error) => last_error.clone(), - Err(err) => { - tracing::warn!(%err, "native sqlite last_error mutex poisoned"); - None - } - } + fn awaited_stage_responses(&self) -> usize { + *self.awaited_stage_responses.lock() } - fn take_last_error(&self) -> Option { - match self.last_error.lock() { - Ok(mut last_error) => last_error.take(), - Err(err) => { - tracing::warn!(%err, "native sqlite last_error mutex poisoned"); - None - } - } + fn finalize_requests( + &self, + ) -> parking_lot::MutexGuard<'_, Vec> { + self.finalize_requests.lock() } - fn report_kv_error(&self, err: SqliteKvError) -> String { - let message = err.to_string(); - self.set_last_error(message.clone()); - self.kv.on_error(&self.actor_id, &err); - message + fn get_pages_requests( + &self, + ) -> parking_lot::MutexGuard<'_, Vec> { + self.get_pages_requests.lock() } - fn resolve_file_tag(&self, path: &str) -> Option { - if path == self.main_file_name { - return Some(kv::FILE_TAG_MAIN); - } + fn set_mirror_commit_meta(&self, enabled: bool) { + *self.mirror_commit_meta.lock() = enabled; + } - if let Some(suffix) = path.strip_prefix(&self.main_file_name) { - match suffix { - "-journal" => Some(kv::FILE_TAG_JOURNAL), - "-wal" => Some(kv::FILE_TAG_WAL), - "-shm" => Some(kv::FILE_TAG_SHM), - _ => None, - } - } else { - None - } + fn queue_commit_stage(&self, req: protocol::SqliteCommitStageRequest) { + self.stage_requests().push(req); } - fn update_startup_preload(&self, f: impl FnOnce(&mut StartupPreloadEntries)) { - if let Ok(mut guard) = self.startup_preload.lock() { - if let Some(entries) = guard.as_mut() { - f(entries); - } - } + async fn get_pages( + &self, + req: protocol::SqliteGetPagesRequest, + ) -> Result { + self.get_pages_requests().push(req); + Ok(self.get_pages_response.clone()) } - fn kv_get(&self, keys: Vec>) -> Result { - let key_count = keys.len(); - let start = std::time::Instant::now(); - let (preloaded_keys, preloaded_values, miss_keys) = - if let Ok(guard) = self.startup_preload.lock() { - if let Some(entries) = guard.as_ref() { - let mut hit_keys = Vec::new(); - let mut hit_values = Vec::new(); - let mut misses = Vec::new(); - for key in keys { - if let Some(value) = startup_preload_get(entries, key.as_slice()) { - hit_keys.push(key); - hit_values.push(value.to_vec()); - } else { - misses.push(key); - } - } - (hit_keys, hit_values, misses) - } else { - (Vec::new(), Vec::new(), keys) - } - } else { - (Vec::new(), Vec::new(), keys) - }; - let result = if miss_keys.is_empty() { - Ok(KvGetResult { - keys: preloaded_keys, - values: preloaded_values, - }) - } else { - self.rt_handle - .block_on(self.kv.batch_get(&self.actor_id, miss_keys)) - .map(|mut 
result| { - result.keys.extend(preloaded_keys); - result.values.extend(preloaded_values); - result - }) - .map_err(|err| self.report_kv_error(err)) - }; - if result.is_ok() { - self.clear_last_error(); + async fn commit( + &self, + req: protocol::SqliteCommitRequest, + ) -> Result { + let req = req.clone(); + self.commit_requests().push(req.clone()); + if *self.mirror_commit_meta.lock() { + if let protocol::SqliteCommitResponse::SqliteCommitOk(ok) = &self.commit_response { + let mut meta = ok.meta.clone(); + meta.head_txid = req.expected_head_txid + 1; + meta.db_size_pages = req.new_db_size_pages; + return Ok(protocol::SqliteCommitResponse::SqliteCommitOk( + protocol::SqliteCommitOk { + new_head_txid: req.expected_head_txid + 1, + meta, + }, + )); + } } - let elapsed = start.elapsed(); - tracing::debug!( - op = %format_args!("get({key_count}keys)"), - duration_us = elapsed.as_micros() as u64, - "kv round-trip" - ); - result + Ok(self.commit_response.clone()) } - fn kv_put(&self, keys: Vec>, values: Vec>) -> Result<(), String> { - let key_count = keys.len(); - let start = std::time::Instant::now(); - let result = self - .rt_handle - .block_on( - self.kv - .batch_put(&self.actor_id, keys.clone(), values.clone()), - ) - .map_err(|err| self.report_kv_error(err)); - if result.is_ok() { - self.clear_last_error(); - self.update_startup_preload(|entries| { - for (key, value) in keys.iter().zip(values.iter()) { - startup_preload_put(entries, key.as_slice(), value.as_slice()); - } - }); - } - let elapsed = start.elapsed(); - tracing::debug!( - op = %format_args!("put({key_count}keys)"), - duration_us = elapsed.as_micros() as u64, - "kv round-trip" - ); - result + async fn commit_stage_begin( + &self, + _req: protocol::SqliteCommitStageBeginRequest, + ) -> Result { + Ok( + protocol::SqliteCommitStageBeginResponse::SqliteCommitStageBeginOk( + protocol::SqliteCommitStageBeginOk { + txid: next_stage_id(), + }, + ), + ) } - fn kv_delete(&self, keys: Vec>) -> Result<(), String> { - let key_count = keys.len(); - let start = std::time::Instant::now(); - let result = self - .rt_handle - .block_on(self.kv.batch_delete(&self.actor_id, keys.clone())) - .map_err(|err| self.report_kv_error(err)); - if result.is_ok() { - self.clear_last_error(); - self.update_startup_preload(|entries| { - for key in &keys { - startup_preload_delete(entries, key.as_slice()); - } - }); - } - let elapsed = start.elapsed(); - tracing::debug!( - op = %format_args!("del({key_count}keys)"), - duration_us = elapsed.as_micros() as u64, - "kv round-trip" - ); - result + async fn commit_stage( + &self, + req: protocol::SqliteCommitStageRequest, + ) -> Result { + *self.awaited_stage_responses.lock() += 1; + self.stage_requests().push(req); + Ok(self.stage_response.clone()) } - fn kv_delete_range(&self, start: Vec, end: Vec) -> Result<(), String> { - let start_time = std::time::Instant::now(); - let preload_start = start.clone(); - let preload_end = end.clone(); - let result = self - .rt_handle - .block_on(self.kv.delete_range(&self.actor_id, start, end)) - .map_err(|err| self.report_kv_error(err)); - if result.is_ok() { - self.clear_last_error(); - self.update_startup_preload(|entries| { - startup_preload_delete_range( - entries, - preload_start.as_slice(), - preload_end.as_slice(), + async fn commit_finalize( + &self, + req: protocol::SqliteCommitFinalizeRequest, + ) -> Result { + let req = req.clone(); + self.finalize_requests().push(req.clone()); + self.finalize_started.notify_one(); + self.release_finalize.notified().await; + if 
*self.mirror_commit_meta.lock() { + if let protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk(ok) = + &self.finalize_response + { + let mut meta = ok.meta.clone(); + meta.head_txid = req.expected_head_txid + 1; + meta.db_size_pages = req.new_db_size_pages; + return Ok( + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: req.expected_head_txid + 1, + meta, + }, + ), ); - }); + } } - let elapsed = start_time.elapsed(); - tracing::debug!( - op = "delRange", - duration_us = elapsed.as_micros() as u64, - "kv round-trip" - ); - result + Ok(self.finalize_response.clone()) } +} - fn delete_file(&self, file_tag: u8) -> Result<(), String> { - let meta_key = kv::get_meta_key(file_tag); - self.kv_delete(vec![meta_key.to_vec()])?; - self.kv_delete_range( - kv::get_chunk_key(file_tag, 0).to_vec(), - kv::get_chunk_key_range_end(file_tag).to_vec(), - ) +#[cfg(test)] +fn sqlite_meta(max_delta_bytes: u64) -> protocol::SqliteMeta { + protocol::SqliteMeta { + schema_version: 2, + generation: 7, + head_txid: 12, + materialized_txid: 12, + db_size_pages: 1, + page_size: 4096, + creation_ts_ms: 1_700_000_000_000, + max_delta_bytes, } } -// MARK: File State - -struct KvFileState { - batch_mode: bool, - dirty_buffer: BTreeMap>, - saved_file_size: i64, - /// Read cache: maps chunk keys to their data. Populated on KV gets, - /// updated on writes, cleared on truncate/delete. This avoids - /// redundant KV round-trips for pages SQLite reads multiple times. - read_cache: Option, Vec>>, +#[derive(Debug, Clone)] +pub struct VfsConfig { + pub cache_capacity_pages: u64, + pub prefetch_depth: usize, + pub max_prefetch_bytes: usize, + pub max_pages_per_stage: usize, } -impl KvFileState { - fn new(read_cache_enabled: bool) -> Self { +impl Default for VfsConfig { + fn default() -> Self { Self { - batch_mode: false, - dirty_buffer: BTreeMap::new(), - saved_file_size: 0, - read_cache: read_cache_enabled.then(HashMap::new), + cache_capacity_pages: DEFAULT_CACHE_CAPACITY_PAGES, + prefetch_depth: DEFAULT_PREFETCH_DEPTH, + max_prefetch_bytes: DEFAULT_MAX_PREFETCH_BYTES, + max_pages_per_stage: DEFAULT_MAX_PAGES_PER_STAGE, } } } -#[repr(C)] -struct KvFile { - base: sqlite3_file, - ctx: *const VfsContext, - state: *mut KvFileState, - file_tag: u8, - meta_key: [u8; 4], - size: i64, - meta_dirty: bool, - flags: c_int, +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CommitPath { + Fast, + Slow, } -// MARK: Helpers +#[derive(Debug, Clone)] +pub struct BufferedCommitRequest { + pub actor_id: String, + pub generation: u64, + pub expected_head_txid: u64, + pub new_db_size_pages: u32, + pub max_delta_bytes: u64, + pub max_pages_per_stage: usize, + pub dirty_pages: Vec, +} -unsafe fn get_file(p: *mut sqlite3_file) -> &'static mut KvFile { - &mut *(p as *mut KvFile) +#[derive(Debug, Clone)] +pub struct BufferedCommitOutcome { + pub path: CommitPath, + pub new_head_txid: u64, + pub meta: protocol::SqliteMeta, } -unsafe fn get_file_state(state: *mut KvFileState) -> &'static mut KvFileState { - &mut *state +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CommitBufferError { + FenceMismatch(String), + StageNotFound(u64), + Other(String), } -unsafe fn free_file_state(file: &mut KvFile) { - if !file.state.is_null() { - drop(Box::from_raw(file.state)); - file.state = ptr::null_mut(); - } +#[derive(Debug, Clone, Copy, Default)] +pub struct SqliteVfsMetricsSnapshot { + pub request_build_ns: u64, + pub serialize_ns: u64, + pub transport_ns: u64, + pub state_update_ns: u64, 
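+    // Cumulative nanosecond totals across all commits; divide by `commit_count`
+    // for per-commit averages.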
+    pub total_ns: u64,
+    pub commit_count: u64,
+}
 
-unsafe fn get_vfs_ctx(p: *mut sqlite3_vfs) -> &'static VfsContext {
-    &*((*p).pAppData as *const VfsContext)
-}
+#[derive(Debug, Clone, Copy, Default)]
+struct CommitTransportMetrics {
+    serialize_ns: u64,
+    transport_ns: u64,
+}
 
-fn build_value_map(resp: &KvGetResult) -> HashMap<&[u8], &[u8]> {
-    resp.keys
-        .iter()
-        .zip(resp.values.iter())
-        .filter(|(_, value)| !value.is_empty())
-        .map(|(key, value)| (key.as_slice(), value.as_slice()))
-        .collect()
-}
+pub struct VfsContext {
+    actor_id: String,
+    runtime: Handle,
+    transport: SqliteTransport,
+    config: VfsConfig,
+    state: RwLock<VfsState>,
+    aux_files: RwLock<BTreeMap<String, Arc<AuxFileState>>>,
+    last_error: Mutex<Option<String>>,
+    commit_atomic_count: AtomicU64,
+    io_methods: Box<sqlite3_io_methods>,
+    // Performance counters
+    pub resolve_pages_total: AtomicU64,
+    pub resolve_pages_cache_hits: AtomicU64,
+    pub resolve_pages_fetches: AtomicU64,
+    pub pages_fetched_total: AtomicU64,
+    pub prefetch_pages_total: AtomicU64,
+    pub commit_total: AtomicU64,
+    pub commit_request_build_ns: AtomicU64,
+    pub commit_serialize_ns: AtomicU64,
+    pub commit_transport_ns: AtomicU64,
+    pub commit_state_update_ns: AtomicU64,
+    pub commit_duration_ns_total: AtomicU64,
+}
 
-fn split_entries(entries: Vec<(Vec<u8>, Vec<u8>)>) -> (Vec<Vec<u8>>, Vec<Vec<u8>>) {
-    let mut keys = Vec::with_capacity(entries.len());
-    let mut values = Vec::with_capacity(entries.len());
-    for (key, value) in entries {
-        keys.push(key);
-        values.push(value);
-    }
-    (keys, values)
-}
+#[derive(Debug, Clone)]
+struct VfsState {
+    generation: u64,
+    head_txid: u64,
+    db_size_pages: u32,
+    page_size: usize,
+    max_delta_bytes: u64,
+    page_cache: Cache<u32, Vec<u8>>,
+    write_buffer: WriteBuffer,
+    predictor: PrefetchPredictor,
+    dead: bool,
+}
 
-// MARK: IO Callbacks
+#[derive(Debug, Clone, Default)]
+struct WriteBuffer {
+    in_atomic_write: bool,
+    saved_db_size: u32,
+    dirty: BTreeMap<u32, Vec<u8>>,
+}
 
-unsafe extern "C" fn kv_io_close(p_file: *mut sqlite3_file) -> c_int {
-    vfs_catch_unwind!(SQLITE_IOERR, {
-        let file = get_file(p_file);
-        let ctx = &*file.ctx;
+#[derive(Debug, Clone, Default)]
+struct PrefetchPredictor {
+    last_pgno: Option<u32>,
+    last_delta: Option<i64>,
+    stride_run_len: usize,
+    // Inspired by mvSQLite's Markov + stride predictor design (Apache-2.0).
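+    // Maps the previous page-number delta to observed successor deltas with hit
+    // counts; `multi_predict` extends a detected stride first (run length >= 2),
+    // then falls back to the most frequent recorded transition. E.g. a
+    // sequential scan reading pages 5, 6, 7 yields last_delta = +1 with
+    // stride_run_len = 2, so pages 8, 9, ... are proposed for prefetch.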
+ transitions: HashMap>, +} - let result = if file.flags & SQLITE_OPEN_DELETEONCLOSE != 0 { - ctx.delete_file(file.file_tag) - } else if file.meta_dirty { - ctx.kv_put( - vec![file.meta_key.to_vec()], - vec![encode_file_meta(file.size)], - ) - } else { - Ok(()) - }; +#[derive(Debug)] +enum GetPagesError { + FenceMismatch(String), + Other(String), +} - free_file_state(file); +#[repr(C)] +struct VfsFile { + base: sqlite3_file, + ctx: *const VfsContext, + aux: *mut AuxFileHandle, +} - match result { - Ok(()) => SQLITE_OK, - Err(err) => { - tracing::error!(%err, file_tag = file.file_tag, "failed to close file"); - SQLITE_IOERR - } - } - }) +#[derive(Default)] +struct AuxFileState { + bytes: Mutex>, } -unsafe extern "C" fn kv_io_read( - p_file: *mut sqlite3_file, - p_buf: *mut c_void, - i_amt: c_int, - i_offset: sqlite3_int64, -) -> c_int { - vfs_catch_unwind!(SQLITE_IOERR_READ, { - if i_amt <= 0 { - return SQLITE_OK; - } +struct AuxFileHandle { + path: String, + state: Arc, + delete_on_close: bool, +} - let file = get_file(p_file); - let state = get_file_state(file.state); - let ctx = &*file.ctx; - let read_start = std::time::Instant::now(); - ctx.vfs_metrics.xread_count.fetch_add(1, Ordering::Relaxed); - let requested_length = i_amt as usize; - let buf = slice::from_raw_parts_mut(p_buf as *mut u8, requested_length); +unsafe impl Send for VfsContext {} +unsafe impl Sync for VfsContext {} - if i_offset < 0 { - return SQLITE_IOERR_READ; - } +pub struct SqliteVfs { + vfs_ptr: *mut sqlite3_vfs, + _name: CString, + ctx_ptr: *mut VfsContext, +} - let offset = i_offset as usize; - let file_size = file.size as usize; - if offset >= file_size { - buf.fill(0); - return SQLITE_IOERR_SHORT_READ; - } +unsafe impl Send for SqliteVfs {} +unsafe impl Sync for SqliteVfs {} + +pub struct NativeDatabase { + db: *mut sqlite3, + _vfs: SqliteVfs, +} - let start_chunk = offset / kv::CHUNK_SIZE; - let end_chunk = (offset + requested_length - 1) / kv::CHUNK_SIZE; +unsafe impl Send for NativeDatabase {} - let mut chunk_keys_to_fetch = Vec::new(); - let mut buffered_chunks: HashMap = HashMap::new(); - // Skip fetching chunks already present in the dirty buffer (batch mode) or read cache. 
- for chunk_idx in start_chunk..=end_chunk { - if state.batch_mode { - if state.dirty_buffer.contains_key(&(chunk_idx as u32)) { - continue; +impl PrefetchPredictor { + fn record(&mut self, pgno: u32) { + if let Some(last_pgno) = self.last_pgno { + let delta = pgno as i64 - last_pgno as i64; + if let Some(last_delta) = self.last_delta { + self.transitions + .entry(last_delta) + .or_default() + .entry(delta) + .and_modify(|count| *count += 1) + .or_insert(1); + if delta == last_delta { + self.stride_run_len += 1; + } else { + self.stride_run_len = 1; } + } else { + self.stride_run_len = 1; } - let key = kv::get_chunk_key(file.file_tag, chunk_idx as u32); - if let Some(read_cache) = state.read_cache.as_ref() { - if let Some(cached) = read_cache.get(key.as_slice()) { - buffered_chunks.insert(chunk_idx, cached.as_slice()); - continue; - } - } - chunk_keys_to_fetch.push(key.to_vec()); + self.last_delta = Some(delta); } + self.last_pgno = Some(pgno); + } - let resp = if chunk_keys_to_fetch.is_empty() { - KvGetResult { - keys: Vec::new(), - values: Vec::new(), - } - } else { - match ctx.kv_get(chunk_keys_to_fetch) { - Ok(resp) => resp, - Err(_) => return SQLITE_IOERR_READ, - } - }; - let value_map = build_value_map(&resp); - - for chunk_idx in start_chunk..=end_chunk { - let chunk_data = if state.batch_mode { - state - .dirty_buffer - .get(&(chunk_idx as u32)) - .map(|buffered| buffered.as_slice()) - } else { - None + fn multi_predict(&self, from_pgno: u32, depth: usize, db_size_pages: u32) -> Vec { + if depth == 0 || db_size_pages == 0 { + return Vec::new(); + } + + let mut seen = HashSet::new(); + let mut predicted = Vec::with_capacity(depth); + + if let Some(delta) = self.last_delta { + if self.stride_run_len >= 2 && delta > 0 { + let mut current = from_pgno as i64; + for _ in 0..depth { + current += delta; + if !(1..=db_size_pages as i64).contains(¤t) { + break; + } + let pgno = current as u32; + if seen.insert(pgno) { + predicted.push(pgno); + } + } + if predicted.len() >= depth { + return predicted; + } } - .or_else(|| buffered_chunks.get(&chunk_idx).copied()) - .or_else(|| { - let chunk_key = kv::get_chunk_key(file.file_tag, chunk_idx as u32); - value_map.get(chunk_key.as_slice()).copied() - }); - let chunk_offset = chunk_idx * kv::CHUNK_SIZE; - let read_start = offset.saturating_sub(chunk_offset); - let read_end = std::cmp::min(kv::CHUNK_SIZE, offset + requested_length - chunk_offset); - let dest_start = chunk_offset + read_start - offset; - - if let Some(chunk_data) = chunk_data { - let source_end = std::cmp::min(read_end, chunk_data.len()); - if source_end > read_start { - let dest_end = dest_start + (source_end - read_start); - buf[dest_start..dest_end].copy_from_slice(&chunk_data[read_start..source_end]); + + let mut current_delta = delta; + let mut current_pgno = from_pgno as i64; + for _ in predicted.len()..depth { + let Some(next_delta) = self + .transitions + .get(¤t_delta) + .and_then(|counts| counts.iter().max_by_key(|(_, count)| *count)) + .map(|(delta, _)| *delta) + else { + break; + }; + + current_pgno += next_delta; + if !(1..=db_size_pages as i64).contains(¤t_pgno) { + break; } - if source_end < read_end { - let zero_start = dest_start + (source_end - read_start); - let zero_end = dest_start + (read_end - read_start); - buf[zero_start..zero_end].fill(0); + let pgno = current_pgno as u32; + if seen.insert(pgno) { + predicted.push(pgno); } - } else { - let dest_end = dest_start + (read_end - read_start); - buf[dest_start..dest_end].fill(0); + current_delta = next_delta; } } - 
// `resp` is empty when every chunk was served from the dirty buffer or read cache. - // In that case this loop is a no-op. - if let Some(read_cache) = state.read_cache.as_mut() { - for (key, value) in resp.keys.iter().zip(resp.values.iter()) { - if !value.is_empty() { - read_cache.insert(key.clone(), value.clone()); - } + predicted + } +} + +impl VfsState { + fn new(config: &VfsConfig, startup: &protocol::SqliteStartupData) -> Self { + let page_cache = Cache::builder() + .max_capacity(config.cache_capacity_pages) + .build(); + for page in &startup.preloaded_pages { + if let Some(bytes) = &page.bytes { + page_cache.insert(page.pgno, bytes.clone()); } } - let actual_bytes = std::cmp::min(requested_length, file_size - offset); - if actual_bytes < requested_length { - buf[actual_bytes..].fill(0); - ctx.vfs_metrics - .xread_us - .fetch_add(read_start.elapsed().as_micros() as u64, Ordering::Relaxed); - return SQLITE_IOERR_SHORT_READ; + let mut state = Self { + generation: startup.generation, + head_txid: startup.meta.head_txid, + db_size_pages: startup.meta.db_size_pages, + page_size: startup.meta.page_size as usize, + max_delta_bytes: startup.meta.max_delta_bytes, + page_cache, + write_buffer: WriteBuffer::default(), + predictor: PrefetchPredictor::default(), + dead: false, + }; + if state.db_size_pages == 0 && !state.page_cache.contains_key(&1) { + state.page_cache.insert(1, empty_db_page()); + state.db_size_pages = 1; } + state + } - ctx.vfs_metrics - .xread_us - .fetch_add(read_start.elapsed().as_micros() as u64, Ordering::Relaxed); - SQLITE_OK - }) + fn update_meta(&mut self, meta: &protocol::SqliteMeta) { + self.generation = meta.generation; + self.head_txid = meta.head_txid; + self.db_size_pages = meta.db_size_pages; + self.page_size = meta.page_size as usize; + self.max_delta_bytes = meta.max_delta_bytes; + } + + fn update_read_meta(&mut self, meta: &protocol::SqliteMeta) { + self.max_delta_bytes = meta.max_delta_bytes; + } } -unsafe extern "C" fn kv_io_write( - p_file: *mut sqlite3_file, - p_buf: *const c_void, - i_amt: c_int, - i_offset: sqlite3_int64, -) -> c_int { - vfs_catch_unwind!(SQLITE_IOERR_WRITE, { - if i_amt <= 0 { - return SQLITE_OK; +impl VfsContext { + fn new( + actor_id: String, + runtime: Handle, + transport: SqliteTransport, + startup: protocol::SqliteStartupData, + config: VfsConfig, + io_methods: sqlite3_io_methods, + ) -> Self { + Self { + actor_id, + runtime, + transport, + config: config.clone(), + state: RwLock::new(VfsState::new(&config, &startup)), + aux_files: RwLock::new(BTreeMap::new()), + last_error: Mutex::new(None), + commit_atomic_count: AtomicU64::new(0), + io_methods: Box::new(io_methods), + resolve_pages_total: AtomicU64::new(0), + resolve_pages_cache_hits: AtomicU64::new(0), + resolve_pages_fetches: AtomicU64::new(0), + pages_fetched_total: AtomicU64::new(0), + prefetch_pages_total: AtomicU64::new(0), + commit_total: AtomicU64::new(0), + commit_request_build_ns: AtomicU64::new(0), + commit_serialize_ns: AtomicU64::new(0), + commit_transport_ns: AtomicU64::new(0), + commit_state_update_ns: AtomicU64::new(0), + commit_duration_ns_total: AtomicU64::new(0), } + } - let file = get_file(p_file); - let ctx = &*file.ctx; - let write_start = std::time::Instant::now(); - ctx.vfs_metrics.xwrite_count.fetch_add(1, Ordering::Relaxed); - let data = slice::from_raw_parts(p_buf as *const u8, i_amt as usize); - - if i_offset < 0 { - return SQLITE_IOERR_WRITE; - } + fn clear_last_error(&self) { + *self.last_error.lock() = None; + } - let offset = i_offset as usize; - 
let write_length = i_amt as usize; - let write_end_offset = match offset.checked_add(write_length) { - Some(end) => end, - None => return SQLITE_IOERR_WRITE, - }; - if write_end_offset as u64 > kv::MAX_FILE_SIZE { - return SQLITE_IOERR_WRITE; - } + fn set_last_error(&self, message: String) { + *self.last_error.lock() = Some(message); + } - let start_chunk = offset / kv::CHUNK_SIZE; - let end_chunk = (offset + write_length - 1) / kv::CHUNK_SIZE; + fn clone_last_error(&self) -> Option { + self.last_error.lock().clone() + } - { - let state = get_file_state(file.state); - if state.batch_mode { - for chunk_idx in start_chunk..=end_chunk { - let chunk_offset = chunk_idx * kv::CHUNK_SIZE; - let source_start = - std::cmp::max(0isize, chunk_offset as isize - offset as isize) as usize; - let source_end = - std::cmp::min(write_length, chunk_offset + kv::CHUNK_SIZE - offset); - state - .dirty_buffer - .insert(chunk_idx as u32, data[source_start..source_end].to_vec()); - } + fn take_last_error(&self) -> Option { + self.last_error.lock().take() + } - let new_size = std::cmp::max(file.size, write_end_offset as i64); - if new_size != file.size { - file.size = new_size; - file.meta_dirty = true; - } + fn add_commit_phase_metrics( + &self, + request_build_ns: u64, + transport_metrics: CommitTransportMetrics, + state_update_ns: u64, + total_ns: u64, + ) { + self.commit_request_build_ns + .fetch_add(request_build_ns, Ordering::Relaxed); + self.commit_serialize_ns + .fetch_add(transport_metrics.serialize_ns, Ordering::Relaxed); + self.commit_transport_ns + .fetch_add(transport_metrics.transport_ns, Ordering::Relaxed); + self.commit_state_update_ns + .fetch_add(state_update_ns, Ordering::Relaxed); + self.commit_duration_ns_total + .fetch_add(total_ns, Ordering::Relaxed); + } - ctx.vfs_metrics - .xwrite_buffered_count - .fetch_add(1, Ordering::Relaxed); - ctx.vfs_metrics - .xwrite_us - .fetch_add(write_start.elapsed().as_micros() as u64, Ordering::Relaxed); - return SQLITE_OK; - } + fn sqlite_vfs_metrics(&self) -> SqliteVfsMetricsSnapshot { + SqliteVfsMetricsSnapshot { + request_build_ns: self.commit_request_build_ns.load(Ordering::Relaxed), + serialize_ns: self.commit_serialize_ns.load(Ordering::Relaxed), + transport_ns: self.commit_transport_ns.load(Ordering::Relaxed), + state_update_ns: self.commit_state_update_ns.load(Ordering::Relaxed), + total_ns: self.commit_duration_ns_total.load(Ordering::Relaxed), + commit_count: self.commit_total.load(Ordering::Relaxed), } + } + + fn page_size(&self) -> usize { + self.state.read().page_size.max(DEFAULT_PAGE_SIZE) + } - struct WritePlan { - chunk_key: Vec, - chunk_offset: usize, - write_start: usize, - write_end: usize, - cached_chunk: Option>, - existing_chunk_index: Option, + fn open_aux_file(&self, path: &str) -> Arc { + if let Some(state) = self.aux_files.read().get(path) { + return state.clone(); } - let mut plans = Vec::new(); - let mut chunk_keys_to_fetch = Vec::new(); - for chunk_idx in start_chunk..=end_chunk { - let chunk_offset = chunk_idx * kv::CHUNK_SIZE; - let write_start = offset.saturating_sub(chunk_offset); - let write_end = std::cmp::min(kv::CHUNK_SIZE, offset + write_length - chunk_offset); - let existing_bytes_in_chunk = if file.size as usize > chunk_offset { - std::cmp::min(kv::CHUNK_SIZE, file.size as usize - chunk_offset) - } else { - 0 - }; - let needs_existing = write_start > 0 || existing_bytes_in_chunk > write_end; - let chunk_key = kv::get_chunk_key(file.file_tag, chunk_idx as u32).to_vec(); - let cached_chunk = if needs_existing && 
ctx.read_cache_enabled { - let state = get_file_state(file.state); - state - .read_cache - .as_ref() - .and_then(|read_cache| read_cache.get(chunk_key.as_slice()).cloned()) - } else { - None - }; - let existing_chunk_index = if needs_existing && cached_chunk.is_none() { - let idx = chunk_keys_to_fetch.len(); - chunk_keys_to_fetch.push(chunk_key.clone()); - Some(idx) - } else { - None - }; + let mut aux_files = self.aux_files.write(); + aux_files + .entry(path.to_string()) + .or_insert_with(|| Arc::new(AuxFileState::default())) + .clone() + } - plans.push(WritePlan { - chunk_key, - chunk_offset, - write_start, - write_end, - cached_chunk, - existing_chunk_index, - }); - } + fn aux_file_exists(&self, path: &str) -> bool { + self.aux_files.read().contains_key(path) + } - let existing_chunks = if chunk_keys_to_fetch.is_empty() { - Vec::new() - } else { - match ctx.kv_get(chunk_keys_to_fetch.clone()) { - Ok(resp) => { - let value_map = build_value_map(&resp); - chunk_keys_to_fetch - .iter() - .map(|key| value_map.get(key.as_slice()).map(|value| value.to_vec())) - .collect::>() - } - Err(_) => return SQLITE_IOERR_WRITE, - } - }; + fn delete_aux_file(&self, path: &str) { + self.aux_files.write().remove(path); + } - let mut entries_to_write = Vec::with_capacity(plans.len() + 1); - for plan in &plans { - let existing_chunk = plan.cached_chunk.as_deref().or_else(|| { - plan.existing_chunk_index - .and_then(|idx| existing_chunks.get(idx)) - .and_then(|value| value.as_deref()) - }); + fn is_dead(&self) -> bool { + self.state.read().dead + } - let mut new_chunk = if let Some(existing_chunk) = existing_chunk { - let mut chunk = vec![0u8; std::cmp::max(existing_chunk.len(), plan.write_end)]; - chunk[..existing_chunk.len()].copy_from_slice(existing_chunk); - chunk - } else { - vec![0u8; plan.write_end] - }; + fn mark_dead(&self, message: String) { + self.set_last_error(message); + self.state.write().dead = true; + } - let source_start = plan.chunk_offset + plan.write_start - offset; - let source_end = source_start + (plan.write_end - plan.write_start); - new_chunk[plan.write_start..plan.write_end] - .copy_from_slice(&data[source_start..source_end]); + fn resolve_pages( + &self, + target_pgnos: &[u32], + prefetch: bool, + ) -> std::result::Result>>, GetPagesError> { + use std::sync::atomic::Ordering::Relaxed; + self.resolve_pages_total.fetch_add(1, Relaxed); - entries_to_write.push((plan.chunk_key.clone(), new_chunk)); - } + let mut resolved = HashMap::new(); + let mut missing = Vec::new(); + let mut seen = HashSet::new(); - let previous_size = file.size; - let previous_meta_dirty = file.meta_dirty; - let new_size = std::cmp::max(file.size, write_end_offset as i64); - if new_size != previous_size { - file.size = new_size; - file.meta_dirty = true; - } - if file.meta_dirty { - entries_to_write.push((file.meta_key.to_vec(), encode_file_meta(file.size))); - } + { + let state = self.state.read(); + if state.dead { + return Err(GetPagesError::Other( + "sqlite actor lost its fence".to_string(), + )); + } - if let Some(read_cache) = get_file_state(file.state).read_cache.as_mut() { - for (key, value) in &entries_to_write { - // Only cache chunk keys here. Metadata keys are read on open/access - // and should not be mixed into the per-page cache. 
- if key.len() == 8 { - read_cache.insert(key.clone(), value.clone()); + for pgno in target_pgnos.iter().copied() { + if !seen.insert(pgno) { + continue; + } + if let Some(bytes) = state.write_buffer.dirty.get(&pgno) { + resolved.insert(pgno, Some(bytes.clone())); + continue; + } + if let Some(bytes) = state.page_cache.get(&pgno) { + resolved.insert(pgno, Some(bytes)); + continue; } + missing.push(pgno); } } - let (keys, values) = split_entries(entries_to_write); - if ctx.kv_put(keys, values).is_err() { - file.size = previous_size; - file.meta_dirty = previous_meta_dirty; - return SQLITE_IOERR_WRITE; + if missing.is_empty() { + self.resolve_pages_cache_hits + .fetch_add(target_pgnos.len() as u64, Relaxed); + return Ok(resolved); } - file.meta_dirty = false; - - ctx.vfs_metrics - .xwrite_us - .fetch_add(write_start.elapsed().as_micros() as u64, Ordering::Relaxed); - SQLITE_OK - }) -} - -unsafe extern "C" fn kv_io_truncate(p_file: *mut sqlite3_file, size: sqlite3_int64) -> c_int { - vfs_catch_unwind!(SQLITE_IOERR_TRUNCATE, { - let file = get_file(p_file); - let ctx = &*file.ctx; + self.resolve_pages_cache_hits + .fetch_add((seen.len() - missing.len()) as u64, Relaxed); - if size < 0 || size as u64 > kv::MAX_FILE_SIZE { - return SQLITE_IOERR_TRUNCATE; - } + let (generation, to_fetch) = { + let mut state = self.state.write(); + for pgno in target_pgnos.iter().copied() { + state.predictor.record(pgno); + } - if size >= file.size { - if size > file.size { - let previous_size = file.size; - let previous_meta_dirty = file.meta_dirty; - file.size = size; - file.meta_dirty = true; - if ctx - .kv_put( - vec![file.meta_key.to_vec()], - vec![encode_file_meta(file.size)], - ) - .is_err() - { - file.size = previous_size; - file.meta_dirty = previous_meta_dirty; - return SQLITE_IOERR_TRUNCATE; + let mut to_fetch = missing.clone(); + if prefetch { + let page_budget = (self.config.max_prefetch_bytes / state.page_size.max(1)).max(1); + let prediction_budget = page_budget.saturating_sub(to_fetch.len()); + let seed_pgno = target_pgnos.last().copied().unwrap_or_default(); + for predicted in state.predictor.multi_predict( + seed_pgno, + prediction_budget.min(self.config.prefetch_depth), + state.db_size_pages.max(seed_pgno), + ) { + if resolved.contains_key(&predicted) || to_fetch.contains(&predicted) { + continue; + } + to_fetch.push(predicted); } - file.meta_dirty = false; } - return SQLITE_OK; - } - - let last_chunk_to_keep = if size == 0 { - -1 - } else { - (size - 1) / kv::CHUNK_SIZE as i64 - }; - let last_existing_chunk = if file.size == 0 { - -1 - } else { - (file.size - 1) / kv::CHUNK_SIZE as i64 + (state.generation, to_fetch) }; - if let Some(read_cache) = get_file_state(file.state).read_cache.as_mut() { - // The read cache stores only chunk keys. Keep entries strictly before - // the truncation boundary so reads cannot serve bytes from removed chunks. 
- read_cache.retain(|key, _| { - // Chunk keys are 8 bytes: [prefix, version, CHUNK_PREFIX, file_tag, idx_be32] - if key.len() == 8 && key[3] == file.file_tag { - let chunk_idx = u32::from_be_bytes([key[4], key[5], key[6], key[7]]); - (chunk_idx as i64) <= last_chunk_to_keep - } else { - true - } - }); - } - - let previous_size = file.size; - let previous_meta_dirty = file.meta_dirty; - file.size = size; - file.meta_dirty = true; - if ctx - .kv_put( - vec![file.meta_key.to_vec()], - vec![encode_file_meta(file.size)], - ) - .is_err() { - file.size = previous_size; - file.meta_dirty = previous_meta_dirty; - return SQLITE_IOERR_TRUNCATE; + let prefetch_count = to_fetch.len() - missing.len(); + self.resolve_pages_fetches.fetch_add(1, Relaxed); + self.pages_fetched_total + .fetch_add(to_fetch.len() as u64, Relaxed); + self.prefetch_pages_total + .fetch_add(prefetch_count as u64, Relaxed); + tracing::debug!( + missing = missing.len(), + prefetch = prefetch_count, + total_fetch = to_fetch.len(), + "vfs get_pages fetch" + ); } - file.meta_dirty = false; - if size > 0 && size as usize % kv::CHUNK_SIZE != 0 { - let last_chunk_key = kv::get_chunk_key(file.file_tag, last_chunk_to_keep as u32); - let resp = match ctx.kv_get(vec![last_chunk_key.to_vec()]) { - Ok(resp) => resp, - Err(_) => return SQLITE_IOERR_TRUNCATE, - }; - let value_map = build_value_map(&resp); - if let Some(last_chunk_data) = value_map.get(last_chunk_key.as_slice()) { - let truncated_len = size as usize % kv::CHUNK_SIZE; - if last_chunk_data.len() > truncated_len { - let truncated_chunk = last_chunk_data[..truncated_len].to_vec(); - if ctx - .kv_put(vec![last_chunk_key.to_vec()], vec![truncated_chunk.clone()]) - .is_err() - { - return SQLITE_IOERR_TRUNCATE; - } - if let Some(read_cache) = get_file_state(file.state).read_cache.as_mut() { - read_cache.insert(last_chunk_key.to_vec(), truncated_chunk); + let response = self + .runtime + .block_on(self.transport.get_pages(protocol::SqliteGetPagesRequest { + actor_id: self.actor_id.clone(), + generation, + pgnos: to_fetch.clone(), + })) + .map_err(|err| GetPagesError::Other(err.to_string()))?; + + match response { + protocol::SqliteGetPagesResponse::SqliteFenceMismatch(mismatch) => { + Err(GetPagesError::FenceMismatch(mismatch.reason)) + } + protocol::SqliteGetPagesResponse::SqliteGetPagesOk(ok) => { + let mut state = self.state.write(); + state.update_read_meta(&ok.meta); + for fetched in ok.pages { + if let Some(bytes) = &fetched.bytes { + state.page_cache.insert(fetched.pgno, bytes.clone()); } + resolved.insert(fetched.pgno, fetched.bytes); + } + for pgno in missing { + resolved.entry(pgno).or_insert(None); } + Ok(resolved) + } + protocol::SqliteGetPagesResponse::SqliteErrorResponse(error) => { + Err(GetPagesError::Other(error.message)) } } + } - if last_chunk_to_keep < last_existing_chunk { - if ctx - .kv_delete_range( - kv::get_chunk_key(file.file_tag, (last_chunk_to_keep + 1) as u32).to_vec(), - kv::get_chunk_key_range_end(file.file_tag).to_vec(), - ) - .is_err() - { - return SQLITE_IOERR_TRUNCATE; + fn flush_dirty_pages( + &self, + ) -> std::result::Result, CommitBufferError> { + let total_start = Instant::now(); + let request_build_start = Instant::now(); + let request = { + let state = self.state.read(); + if state.dead { + return Err(CommitBufferError::Other( + "sqlite actor lost its fence".to_string(), + )); + } + if state.write_buffer.in_atomic_write || state.write_buffer.dirty.is_empty() { + return Ok(None); } - } - SQLITE_OK - }) -} + BufferedCommitRequest { + actor_id: 
self.actor_id.clone(), + generation: state.generation, + expected_head_txid: state.head_txid, + new_db_size_pages: state.db_size_pages, + max_delta_bytes: state.max_delta_bytes, + max_pages_per_stage: self.config.max_pages_per_stage, + dirty_pages: state + .write_buffer + .dirty + .iter() + .map(|(pgno, bytes)| protocol::SqliteDirtyPage { + pgno: *pgno, + bytes: bytes.clone(), + }) + .collect(), + } + }; + let request_build_ns = request_build_start.elapsed().as_nanos() as u64; -unsafe extern "C" fn kv_io_sync(p_file: *mut sqlite3_file, _flags: c_int) -> c_int { - vfs_catch_unwind!(SQLITE_IOERR_FSYNC, { - let file = get_file(p_file); - if !file.meta_dirty { - return SQLITE_OK; + let (outcome, transport_metrics) = match self + .runtime + .block_on(commit_buffered_pages(&self.transport, request.clone())) + { + Ok(outcome) => outcome, + Err(err) => { + mark_dead_for_non_fence_commit_error(self, &err); + return Err(err); + } + }; + self.commit_total + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + tracing::debug!( + dirty_pages = request.dirty_pages.len(), + path = ?outcome.path, + new_head_txid = outcome.new_head_txid, + request_build_ns, + serialize_ns = transport_metrics.serialize_ns, + transport_ns = transport_metrics.transport_ns, + "vfs commit complete (flush)" + ); + let state_update_start = Instant::now(); + let mut state = self.state.write(); + state.update_meta(&outcome.meta); + state.db_size_pages = request.new_db_size_pages; + for dirty_page in &request.dirty_pages { + state + .page_cache + .insert(dirty_page.pgno, dirty_page.bytes.clone()); } + state.write_buffer.dirty.clear(); + let state_update_ns = state_update_start.elapsed().as_nanos() as u64; + self.add_commit_phase_metrics( + request_build_ns, + transport_metrics, + state_update_ns, + total_start.elapsed().as_nanos() as u64, + ); + Ok(Some(outcome)) + } - let ctx = &*file.ctx; - if ctx - .kv_put( - vec![file.meta_key.to_vec()], - vec![encode_file_meta(file.size)], - ) - .is_err() + fn commit_atomic_write(&self) -> std::result::Result<(), CommitBufferError> { + let total_start = Instant::now(); + let request_build_start = Instant::now(); + let request = { + let mut state = self.state.write(); + if state.dead { + return Err(CommitBufferError::Other( + "sqlite actor lost its fence".to_string(), + )); + } + if !state.write_buffer.in_atomic_write { + return Ok(()); + } + if state.write_buffer.dirty.is_empty() { + state.write_buffer.in_atomic_write = false; + return Ok(()); + } + + BufferedCommitRequest { + actor_id: self.actor_id.clone(), + generation: state.generation, + expected_head_txid: state.head_txid, + new_db_size_pages: state.db_size_pages, + max_delta_bytes: state.max_delta_bytes, + max_pages_per_stage: self.config.max_pages_per_stage, + dirty_pages: state + .write_buffer + .dirty + .iter() + .map(|(pgno, bytes)| protocol::SqliteDirtyPage { + pgno: *pgno, + bytes: bytes.clone(), + }) + .collect(), + } + }; + let request_build_ns = request_build_start.elapsed().as_nanos() as u64; + + let (outcome, transport_metrics) = match self + .runtime + .block_on(commit_buffered_pages(&self.transport, request.clone())) { - return SQLITE_IOERR_FSYNC; + Ok(outcome) => outcome, + Err(err) => { + mark_dead_for_non_fence_commit_error(self, &err); + return Err(err); + } + }; + self.commit_total + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + tracing::debug!( + dirty_pages = request.dirty_pages.len(), + path = ?outcome.path, + new_head_txid = outcome.new_head_txid, + request_build_ns, + serialize_ns = 
transport_metrics.serialize_ns, + transport_ns = transport_metrics.transport_ns, + "vfs commit complete (atomic)" + ); + self.set_last_error(format!( + "post-commit atomic write succeeded: requested_db_size_pages={}, returned_db_size_pages={}, returned_head_txid={}", + request.new_db_size_pages, + outcome.meta.db_size_pages, + outcome.meta.head_txid, + )); + let state_update_start = Instant::now(); + let mut state = self.state.write(); + state.update_meta(&outcome.meta); + state.db_size_pages = request.new_db_size_pages; + for dirty_page in &request.dirty_pages { + state + .page_cache + .insert(dirty_page.pgno, dirty_page.bytes.clone()); } - file.meta_dirty = false; + state.write_buffer.dirty.clear(); + state.write_buffer.in_atomic_write = false; + let state_update_ns = state_update_start.elapsed().as_nanos() as u64; + self.add_commit_phase_metrics( + request_build_ns, + transport_metrics, + state_update_ns, + total_start.elapsed().as_nanos() as u64, + ); + Ok(()) + } - SQLITE_OK - }) + fn truncate_main_file(&self, size: sqlite3_int64) { + let page_size = self.page_size() as i64; + let truncated_pages = ((size + page_size - 1) / page_size) as u32; + let mut state = self.state.write(); + state.db_size_pages = truncated_pages; + state + .write_buffer + .dirty + .retain(|pgno, _| *pgno <= truncated_pages); + state.page_cache.invalidate_all(); + } } -unsafe extern "C" fn kv_io_file_size( - p_file: *mut sqlite3_file, - p_size: *mut sqlite3_int64, -) -> c_int { - vfs_catch_unwind!(SQLITE_IOERR_FSTAT, { - let file = get_file(p_file); - *p_size = file.size; - SQLITE_OK - }) +fn cleanup_batch_atomic_probe(db: *mut sqlite3) { + if let Err(err) = sqlite_exec(db, "DROP TABLE IF EXISTS __rivet_batch_probe;") { + tracing::warn!(%err, "failed to clean up sqlite batch atomic probe table"); + } } -unsafe extern "C" fn kv_io_lock(_p_file: *mut sqlite3_file, _level: c_int) -> c_int { - vfs_catch_unwind!(SQLITE_IOERR_LOCK, SQLITE_OK) +fn assert_batch_atomic_probe( + db: *mut sqlite3, + vfs: &SqliteVfs, +) -> std::result::Result<(), String> { + let commit_atomic_before = vfs.commit_atomic_count(); + let probe_sql = "\ + BEGIN IMMEDIATE;\ + CREATE TABLE IF NOT EXISTS __rivet_batch_probe(x INTEGER);\ + INSERT INTO __rivet_batch_probe VALUES(1);\ + DELETE FROM __rivet_batch_probe;\ + DROP TABLE IF EXISTS __rivet_batch_probe;\ + COMMIT;\ + "; + + if let Err(err) = sqlite_exec(db, probe_sql) { + cleanup_batch_atomic_probe(db); + return Err(format!("batch atomic probe failed: {err}")); + } + + let commit_atomic_after = vfs.commit_atomic_count(); + if commit_atomic_after == commit_atomic_before { + tracing::error!( + "batch atomic writes not active for sqlite, SQLITE_ENABLE_BATCH_ATOMIC_WRITE may be missing" + ); + cleanup_batch_atomic_probe(db); + return Err( + "batch atomic writes not active for sqlite, SQLITE_ENABLE_BATCH_ATOMIC_WRITE may be missing" + .to_string(), + ); + } + + Ok(()) } -unsafe extern "C" fn kv_io_unlock(_p_file: *mut sqlite3_file, _level: c_int) -> c_int { - vfs_catch_unwind!(SQLITE_IOERR_UNLOCK, SQLITE_OK) +fn mark_dead_for_non_fence_commit_error(ctx: &VfsContext, err: &CommitBufferError) { + match err { + CommitBufferError::FenceMismatch(_) => {} + CommitBufferError::StageNotFound(stage_id) => { + ctx.mark_dead(format!( + "sqlite stage {stage_id} missing during commit finalize" + )); + } + CommitBufferError::Other(message) => ctx.mark_dead(message.clone()), + } } -unsafe extern "C" fn kv_io_check_reserved_lock( - _p_file: *mut sqlite3_file, - p_res_out: *mut c_int, -) -> c_int { - 
vfs_catch_unwind!(SQLITE_IOERR, {
-        *p_res_out = 0;
-        SQLITE_OK
+fn mark_dead_from_fence_commit_error(ctx: &VfsContext, err: &CommitBufferError) {
+    if let CommitBufferError::FenceMismatch(reason) = err {
+        ctx.mark_dead(reason.clone());
+    }
+}
+
+fn dirty_pages_raw_bytes(dirty_pages: &[protocol::SqliteDirtyPage]) -> Result<u64> {
+    dirty_pages.iter().try_fold(0u64, |total, dirty_page| {
+        let page_len = u64::try_from(dirty_page.bytes.len())?;
+        Ok(total + page_len)
     })
 }
 
-unsafe extern "C" fn kv_io_file_control(
-    p_file: *mut sqlite3_file,
-    op: c_int,
-    _p_arg: *mut c_void,
-) -> c_int {
-    vfs_catch_unwind!(SQLITE_IOERR, {
-        let file = get_file(p_file);
-        if file.state.is_null() {
-            return SQLITE_NOTFOUND;
-        }
-        let state = get_file_state(file.state);
+fn split_bytes(bytes: &[u8], max_chunk_bytes: usize) -> Vec<Vec<u8>> {
+    if bytes.is_empty() || max_chunk_bytes == 0 {
+        return vec![bytes.to_vec()];
+    }
 
-        match op {
-            SQLITE_FCNTL_BEGIN_ATOMIC_WRITE => {
-                state.saved_file_size = file.size;
-                state.batch_mode = true;
-                file.meta_dirty = false;
-                state.dirty_buffer.clear();
-                SQLITE_OK
-            }
-            SQLITE_FCNTL_COMMIT_ATOMIC_WRITE => {
-                let ctx = &*file.ctx;
-                let commit_start = std::time::Instant::now();
-                let dirty_page_count = state.dirty_buffer.len() as u64;
-                let max_dirty_pages = if file.meta_dirty {
-                    KV_MAX_BATCH_KEYS - 1
-                } else {
-                    KV_MAX_BATCH_KEYS
-                };
+    bytes
+        .chunks(max_chunk_bytes)
+        .map(|chunk| chunk.to_vec())
+        .collect()
+}
 
-                if state.dirty_buffer.len() > max_dirty_pages {
-                    state.dirty_buffer.clear();
-                    file.size = state.saved_file_size;
-                    file.meta_dirty = false;
-                    state.batch_mode = false;
-                    return SQLITE_IOERR;
-                }
+#[cfg(test)]
+fn next_stage_id() -> u64 {
+    NEXT_STAGE_ID.fetch_add(1, Ordering::Relaxed)
+}
 
-                let mut entries = Vec::with_capacity(state.dirty_buffer.len() + 1);
-                for (chunk_index, data) in &state.dirty_buffer {
-                    entries.push((
-                        kv::get_chunk_key(file.file_tag, *chunk_index).to_vec(),
-                        data.clone(),
-                    ));
-                }
-                if file.meta_dirty {
-                    entries.push((file.meta_key.to_vec(), encode_file_meta(file.size)));
-                }
+fn next_temp_aux_path() -> String {
+    format!(
+        "{TEMP_AUX_PATH_PREFIX}-{}",
+        NEXT_TEMP_AUX_ID.fetch_add(1, Ordering::Relaxed)
+    )
+}
 
-                let (keys, values) = split_entries(entries);
-                if ctx.kv_put(keys, values).is_err() {
-                    state.dirty_buffer.clear();
-                    file.size = state.saved_file_size;
-                    file.meta_dirty = false;
-                    state.batch_mode = false;
-                    return SQLITE_IOERR;
-                }
+unsafe fn get_aux_state(file: &VfsFile) -> Option<&AuxFileHandle> {
+    (!file.aux.is_null()).then(|| &*file.aux)
+}
 
-                // Move dirty buffer entries into the read cache so subsequent
-                // reads can serve them without a KV round-trip.
-                let flushed: Vec<_> = std::mem::take(&mut state.dirty_buffer)
-                    .into_iter()
-                    .collect();
-                if let Some(read_cache) = state.read_cache.as_mut() {
-                    // Only chunk pages belong in the read cache. The metadata write above
-                    // still goes through KV, but should not be cached as a page. 
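split_bytes above is the chunking rule the staged (slow) commit path relies on: an encoded delta is cut into pieces of at most max_delta_bytes, and the staging loop further down flags the final piece via is_last (chunk_idx + 1 == chunk count). A self-contained demonstration of that rule; the sizes here are arbitrary example values, not values from the patch:

```rust
// Copy of split_bytes from this patch, exercised standalone.
fn split_bytes(bytes: &[u8], max_chunk_bytes: usize) -> Vec<Vec<u8>> {
    if bytes.is_empty() || max_chunk_bytes == 0 {
        return vec![bytes.to_vec()];
    }
    bytes
        .chunks(max_chunk_bytes)
        .map(|chunk| chunk.to_vec())
        .collect()
}

fn main() {
    // Hypothetical numbers: a 10 000-byte encoded delta, 4 096-byte budget.
    let encoded_delta = vec![0u8; 10_000];
    let chunks = split_bytes(&encoded_delta, 4_096);
    assert_eq!(chunks.len(), 3); // 4096 + 4096 + 1808 bytes
    for (chunk_idx, chunk) in chunks.iter().enumerate() {
        // The staging loop derives is_last for each chunk exactly this way.
        let is_last = chunk_idx + 1 == chunks.len();
        println!("chunk {chunk_idx}: {} bytes, is_last={is_last}", chunk.len());
    }
}
```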
- for (chunk_index, data) in flushed { - let key = kv::get_chunk_key(file.file_tag, chunk_index); - read_cache.insert(key.to_vec(), data); - } - } - file.meta_dirty = false; - state.batch_mode = false; - ctx.vfs_metrics - .commit_atomic_count - .fetch_add(1, Ordering::Relaxed); - ctx.vfs_metrics - .commit_atomic_pages - .fetch_add(dirty_page_count, Ordering::Relaxed); - ctx.vfs_metrics - .commit_atomic_us - .fetch_add(commit_start.elapsed().as_micros() as u64, Ordering::Relaxed); - SQLITE_OK +async fn commit_buffered_pages( + transport: &SqliteTransport, + request: BufferedCommitRequest, +) -> std::result::Result<(BufferedCommitOutcome, CommitTransportMetrics), CommitBufferError> { + let raw_dirty_bytes = dirty_pages_raw_bytes(&request.dirty_pages) + .map_err(|err| CommitBufferError::Other(err.to_string()))?; + let mut metrics = CommitTransportMetrics::default(); + + if raw_dirty_bytes <= request.max_delta_bytes { + let serialize_start = Instant::now(); + let fast_request = protocol::SqliteCommitRequest { + actor_id: request.actor_id.clone(), + generation: request.generation, + expected_head_txid: request.expected_head_txid, + dirty_pages: request.dirty_pages.clone(), + new_db_size_pages: request.new_db_size_pages, + }; + metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64; + let transport_start = Instant::now(); + match transport + .commit(fast_request) + .await + .map_err(|err| CommitBufferError::Other(err.to_string()))? + { + protocol::SqliteCommitResponse::SqliteCommitOk(ok) => { + metrics.transport_ns += transport_start.elapsed().as_nanos() as u64; + return Ok(( + BufferedCommitOutcome { + path: CommitPath::Fast, + new_head_txid: ok.new_head_txid, + meta: ok.meta, + }, + metrics, + )); } - SQLITE_FCNTL_ROLLBACK_ATOMIC_WRITE => { - if !state.batch_mode { - return SQLITE_OK; - } - state.dirty_buffer.clear(); - file.size = state.saved_file_size; - file.meta_dirty = false; - state.batch_mode = false; - SQLITE_OK + protocol::SqliteCommitResponse::SqliteFenceMismatch(mismatch) => { + return Err(CommitBufferError::FenceMismatch(mismatch.reason)); + } + protocol::SqliteCommitResponse::SqliteCommitTooLarge(_) => { + metrics.transport_ns += transport_start.elapsed().as_nanos() as u64; + } + protocol::SqliteCommitResponse::SqliteErrorResponse(error) => { + return Err(CommitBufferError::Other(error.message)); } - _ => SQLITE_NOTFOUND, } - }) + } + + let serialize_start = Instant::now(); + let stage_begin_request = protocol::SqliteCommitStageBeginRequest { + actor_id: request.actor_id.clone(), + generation: request.generation, + }; + metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64; + let transport_start = Instant::now(); + let txid = match transport + .commit_stage_begin(stage_begin_request) + .await + .map_err(|err| CommitBufferError::Other(err.to_string()))? 
+    {
+        protocol::SqliteCommitStageBeginResponse::SqliteCommitStageBeginOk(ok) => {
+            metrics.transport_ns += transport_start.elapsed().as_nanos() as u64;
+            ok.txid
+        }
+        protocol::SqliteCommitStageBeginResponse::SqliteFenceMismatch(mismatch) => {
+            return Err(CommitBufferError::FenceMismatch(mismatch.reason));
+        }
+        protocol::SqliteCommitStageBeginResponse::SqliteErrorResponse(error) => {
+            return Err(CommitBufferError::Other(error.message));
+        }
+    };
+
+    let serialize_start = Instant::now();
+    let encoded_delta = encode_ltx_v3(
+        LtxHeader::delta(
+            txid,
+            request.new_db_size_pages,
+            sqlite_now_ms().map_err(|err| CommitBufferError::Other(err.to_string()))?,
+        ),
+        &request
+            .dirty_pages
+            .iter()
+            .map(|dirty_page| sqlite_storage::types::DirtyPage {
+                pgno: dirty_page.pgno,
+                bytes: dirty_page.bytes.clone(),
+            })
+            .collect::<Vec<_>>(),
+    )
+    .map_err(|err| CommitBufferError::Other(err.to_string()))?;
+    let staged_chunks = split_bytes(
+        &encoded_delta,
+        request.max_delta_bytes.try_into().map_err(|_| {
+            CommitBufferError::Other("sqlite max_delta_bytes exceeded usize".to_string())
+        })?,
+    );
+    metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64;
+
+    for (chunk_idx, chunk_bytes) in staged_chunks.iter().enumerate() {
+        let serialize_start = Instant::now();
+        let stage_request = protocol::SqliteCommitStageRequest {
+            actor_id: request.actor_id.clone(),
+            generation: request.generation,
+            txid,
+            chunk_idx: chunk_idx as u32,
+            bytes: chunk_bytes.clone(),
+            is_last: chunk_idx + 1 == staged_chunks.len(),
+        };
+        metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64;
+        if transport
+            .queue_commit_stage(stage_request.clone())
+            .map_err(|err| CommitBufferError::Other(err.to_string()))?
+        {
+            continue;
+        }
+
+        let transport_start = Instant::now();
+        match transport
+            .commit_stage(stage_request)
+            .await
+            .map_err(|err| CommitBufferError::Other(err.to_string()))?
+        {
+            protocol::SqliteCommitStageResponse::SqliteCommitStageOk(_) => {
+                metrics.transport_ns += transport_start.elapsed().as_nanos() as u64;
+            }
+            protocol::SqliteCommitStageResponse::SqliteFenceMismatch(mismatch) => {
+                return Err(CommitBufferError::FenceMismatch(mismatch.reason));
+            }
+            protocol::SqliteCommitStageResponse::SqliteErrorResponse(error) => {
+                return Err(CommitBufferError::Other(error.message));
+            }
+        }
+    }
+
+    let serialize_start = Instant::now();
+    let finalize_request = protocol::SqliteCommitFinalizeRequest {
+        actor_id: request.actor_id,
+        generation: request.generation,
+        expected_head_txid: request.expected_head_txid,
+        txid,
+        new_db_size_pages: request.new_db_size_pages,
+    };
+    metrics.serialize_ns += serialize_start.elapsed().as_nanos() as u64;
+    let transport_start = Instant::now();
+    match transport
+        .commit_finalize(finalize_request)
+        .await
+        .map_err(|err| CommitBufferError::Other(err.to_string()))? 
+    {
+        protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk(ok) => {
+            metrics.transport_ns += transport_start.elapsed().as_nanos() as u64;
+            Ok((
+                BufferedCommitOutcome {
+                    path: CommitPath::Slow,
+                    new_head_txid: ok.new_head_txid,
+                    meta: ok.meta,
+                },
+                metrics,
+            ))
+        }
+        protocol::SqliteCommitFinalizeResponse::SqliteFenceMismatch(mismatch) => {
+            Err(CommitBufferError::FenceMismatch(mismatch.reason))
+        }
+        protocol::SqliteCommitFinalizeResponse::SqliteStageNotFound(not_found) => {
+            Err(CommitBufferError::StageNotFound(not_found.stage_id))
+        }
+        protocol::SqliteCommitFinalizeResponse::SqliteErrorResponse(error) => {
+            Err(CommitBufferError::Other(error.message))
+        }
+    }
 }
 
-unsafe extern "C" fn kv_io_sector_size(_p_file: *mut sqlite3_file) -> c_int {
-    vfs_catch_unwind!(kv::CHUNK_SIZE as c_int, kv::CHUNK_SIZE as c_int)
+unsafe fn get_file(p: *mut sqlite3_file) -> &'static mut VfsFile {
+    &mut *(p as *mut VfsFile)
 }
 
-unsafe extern "C" fn kv_io_device_characteristics(_p_file: *mut sqlite3_file) -> c_int {
-    vfs_catch_unwind!(0, SQLITE_IOCAP_BATCH_ATOMIC)
+unsafe fn get_vfs_ctx(p: *mut sqlite3_vfs) -> &'static VfsContext {
+    &*((*p).pAppData as *const VfsContext)
 }
 
-// MARK: VFS Callbacks
+fn sqlite_error_message(db: *mut sqlite3) -> String {
+    unsafe {
+        if db.is_null() {
+            "unknown sqlite error".to_string()
+        } else {
+            CStr::from_ptr(sqlite3_errmsg(db))
+                .to_string_lossy()
+                .into_owned()
+        }
+    }
+}
 
-unsafe extern "C" fn kv_vfs_open(
-    p_vfs: *mut sqlite3_vfs,
-    z_name: *const c_char,
-    p_file: *mut sqlite3_file,
-    flags: c_int,
-    p_out_flags: *mut c_int,
-) -> c_int {
-    vfs_catch_unwind!(SQLITE_CANTOPEN, {
-        if z_name.is_null() {
-            return SQLITE_CANTOPEN;
+fn sqlite_exec(db: *mut sqlite3, sql: &str) -> std::result::Result<(), String> {
+    let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
+    let rc = unsafe { sqlite3_exec(db, c_sql.as_ptr(), None, ptr::null_mut(), ptr::null_mut()) };
+    if rc != SQLITE_OK {
+        return Err(format!(
+            "`{sql}` failed with code {rc}: {}",
+            sqlite_error_message(db)
+        ));
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+fn sqlite_step_statement(db: *mut sqlite3, sql: &str) -> std::result::Result<(), String> {
+    let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
+    let mut stmt = ptr::null_mut();
+    let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) };
+    if rc != SQLITE_OK {
+        return Err(format!(
+            "`{sql}` prepare failed with code {rc}: {}",
+            sqlite_error_message(db)
+        ));
+    }
+    if stmt.is_null() {
+        return Ok(());
+    }
+
+    let result = loop {
+        let step_rc = unsafe { sqlite3_step(stmt) };
+        if step_rc == SQLITE_DONE {
+            break Ok(());
+        }
+        if step_rc != SQLITE_ROW {
+            break Err(format!(
+                "`{sql}` step failed with code {step_rc}: {}",
+                sqlite_error_message(db)
+            ));
+        }
+    };
 
-        let ctx = get_vfs_ctx(p_vfs);
-        let path = match CStr::from_ptr(z_name).to_str() {
-            Ok(path) => path,
-            Err(_) => return SQLITE_CANTOPEN,
-        };
-        let file_tag = match ctx.resolve_file_tag(path) {
-            Some(file_tag) => file_tag,
-            None => return SQLITE_CANTOPEN,
-        };
-        let meta_key = kv::get_meta_key(file_tag);
+    unsafe {
+        sqlite3_finalize(stmt);
+    }
 
-        let resp = match ctx.kv_get(vec![meta_key.to_vec()]) {
-            Ok(resp) => resp,
-            Err(_) => return SQLITE_CANTOPEN,
-        };
-        let value_map = build_value_map(&resp);
+    result
+}
+
+fn page_span(offset: i64, length: usize, page_size: usize) -> std::result::Result<Vec<u32>, ()> {
+    if offset < 0 {
+        return Err(());
+    }
+    if length == 0 {
+        return Ok(Vec::new());
+    }
 
-        let size = if let Some(size_data) = 
value_map.get(meta_key.as_slice()) {
-            let size = match decode_file_meta(size_data) {
-                Some(size) => size,
-                None => return SQLITE_IOERR,
+    let start = offset as usize / page_size + 1;
+    let end = (offset as usize + length - 1) / page_size + 1;
+    Ok((start as u32..=end as u32).collect())
+}
+
+unsafe extern "C" fn io_close(p_file: *mut sqlite3_file) -> c_int {
+    vfs_catch_unwind!(SQLITE_IOERR, {
+        if p_file.is_null() {
+            return SQLITE_OK;
+        }
+        let file = get_file(p_file);
+        let result = if !file.aux.is_null() {
+            let aux = Box::from_raw(file.aux);
+            if aux.delete_on_close {
+                let ctx = &*file.ctx;
+                ctx.delete_aux_file(&aux.path);
+            }
+            file.aux = ptr::null_mut();
+            Ok(())
+        } else {
+            let ctx = &*file.ctx;
+            let should_flush = {
+                let state = ctx.state.read();
+                state.write_buffer.in_atomic_write || !state.write_buffer.dirty.is_empty()
             };
-            if !is_valid_file_size(size) {
-                return SQLITE_IOERR;
-            }
-            size
-        } else if flags & SQLITE_OPEN_CREATE != 0 {
-            if file_tag == kv::FILE_TAG_MAIN {
-                let size = kv::CHUNK_SIZE as i64;
-                let entries = vec![
-                    (kv::get_chunk_key(file_tag, 0).to_vec(), empty_db_page()),
-                    (meta_key.to_vec(), encode_file_meta(size)),
-                ];
-                let (keys, values) = split_entries(entries);
-                if ctx.kv_put(keys, values).is_err() {
-                    return SQLITE_CANTOPEN;
+            if should_flush {
+                if ctx.state.read().write_buffer.in_atomic_write {
+                    ctx.commit_atomic_write().map(|_| ())
+                } else {
+                    ctx.flush_dirty_pages().map(|_| ())
                 }
-                size
             } else {
-                let size = 0i64;
-                if ctx
-                    .kv_put(vec![meta_key.to_vec()], vec![encode_file_meta(size)])
-                    .is_err()
-                {
-                    return SQLITE_CANTOPEN;
-                }
-                size
+                Ok(())
             }
-        } else {
-            return SQLITE_CANTOPEN;
-        };
-
-        let state = Box::into_raw(Box::new(KvFileState::new(ctx.read_cache_enabled)));
-        let base = sqlite3_file {
-            pMethods: ctx.io_methods.as_ref() as *const sqlite3_io_methods,
         };
-        ptr::write(
-            p_file as *mut KvFile,
-            KvFile {
-                base,
-                ctx: ctx as *const VfsContext,
-                state,
-                file_tag,
-                meta_key,
-                size,
-                meta_dirty: false,
-                flags,
-            },
-        );
-
-        if !p_out_flags.is_null() {
-            *p_out_flags = flags;
+        file.base.pMethods = ptr::null();
+        match result {
+            Ok(()) => SQLITE_OK,
+            Err(err) => {
+                let ctx = &*file.ctx;
+                mark_dead_from_fence_commit_error(ctx, &err);
+                SQLITE_IOERR
+            }
         }
-
-        SQLITE_OK
     })
 }
 
-unsafe extern "C" fn kv_vfs_delete(
-    p_vfs: *mut sqlite3_vfs,
-    z_name: *const c_char,
-    _sync_dir: c_int,
+unsafe extern "C" fn io_read(
+    p_file: *mut sqlite3_file,
+    p_buf: *mut c_void,
+    i_amt: c_int,
+    i_offset: sqlite3_int64,
 ) -> c_int {
-    vfs_catch_unwind!(SQLITE_IOERR_DELETE, {
-        if z_name.is_null() {
-            return SQLITE_IOERR_DELETE;
+    vfs_catch_unwind!(SQLITE_IOERR_READ, {
+        if i_amt <= 0 {
+            return SQLITE_OK;
         }
 
-        let ctx = get_vfs_ctx(p_vfs);
-        let path = match CStr::from_ptr(z_name).to_str() {
-            Ok(path) => path,
-            Err(_) => return SQLITE_IOERR_DELETE,
+        let file = get_file(p_file);
+        if let Some(aux) = get_aux_state(file) {
+            if i_offset < 0 {
+                return SQLITE_IOERR_READ;
+            }
+
+            let offset = i_offset as usize;
+            let requested = i_amt as usize;
+            let buf = slice::from_raw_parts_mut(p_buf.cast::<u8>(), requested);
+            buf.fill(0);
+
+            let bytes = aux.state.bytes.lock();
+            if offset >= bytes.len() {
+                return SQLITE_IOERR_SHORT_READ;
+            }
+
+            let copy_len = requested.min(bytes.len() - offset);
+            buf[..copy_len].copy_from_slice(&bytes[offset..offset + copy_len]);
+            return if copy_len < requested {
+                SQLITE_IOERR_SHORT_READ
+            } else {
+                SQLITE_OK
+            };
+        }
+
+        let ctx = &*file.ctx;
+        if ctx.is_dead() {
+            return SQLITE_IOERR_READ;
+        }
+
+        let buf = 
slice::from_raw_parts_mut(p_buf.cast::<u8>(), i_amt as usize);
+        let requested_pages = match page_span(i_offset, i_amt as usize, ctx.page_size()) {
+            Ok(pages) => pages,
+            Err(_) => return SQLITE_IOERR_READ,
         };
-        let file_tag = match ctx.resolve_file_tag(path) {
-            Some(file_tag) => file_tag,
-            None => return SQLITE_IOERR_DELETE,
+        let page_size = ctx.page_size();
+        let file_size = {
+            let state = ctx.state.read();
+            state.db_size_pages as usize * state.page_size
         };
-        match ctx.delete_file(file_tag) {
-            Ok(()) => SQLITE_OK,
-            Err(_) => SQLITE_IOERR_DELETE,
+        let resolved = match ctx.resolve_pages(&requested_pages, true) {
+            Ok(pages) => pages,
+            Err(GetPagesError::FenceMismatch(reason)) => {
+                ctx.mark_dead(reason);
+                return SQLITE_IOERR_READ;
+            }
+            Err(GetPagesError::Other(message)) => {
+                ctx.mark_dead(message);
+                return SQLITE_IOERR_READ;
+            }
+        };
+        ctx.clear_last_error();
+
+        buf.fill(0);
+        for pgno in requested_pages {
+            let Some(Some(bytes)) = resolved.get(&pgno) else {
+                continue;
+            };
+            let page_start = (pgno as usize - 1) * page_size;
+            let copy_start = page_start.max(i_offset as usize);
+            let copy_end = (page_start + page_size).min(i_offset as usize + i_amt as usize);
+            if copy_start >= copy_end {
+                continue;
+            }
+            let page_offset = copy_start - page_start;
+            let dest_offset = copy_start - i_offset as usize;
+            let copy_len = copy_end - copy_start;
+            buf[dest_offset..dest_offset + copy_len]
+                .copy_from_slice(&bytes[page_offset..page_offset + copy_len]);
+        }
+
+        if i_offset as usize + i_amt as usize > file_size {
+            return SQLITE_IOERR_SHORT_READ;
         }
+
+        SQLITE_OK
     })
 }
 
-unsafe extern "C" fn kv_vfs_access(
-    p_vfs: *mut sqlite3_vfs,
-    z_name: *const c_char,
-    _flags: c_int,
-    p_res_out: *mut c_int,
+unsafe extern "C" fn io_write(
+    p_file: *mut sqlite3_file,
+    p_buf: *const c_void,
+    i_amt: c_int,
+    i_offset: sqlite3_int64,
 ) -> c_int {
-    vfs_catch_unwind!(SQLITE_IOERR_ACCESS, {
-        if z_name.is_null() {
-            *p_res_out = 0;
+    vfs_catch_unwind!(SQLITE_IOERR_WRITE, {
+        if i_amt <= 0 {
             return SQLITE_OK;
         }
 
-        let ctx = get_vfs_ctx(p_vfs);
-        let path = match CStr::from_ptr(z_name).to_str() {
-            Ok(path) => path,
-            Err(_) => {
-                *p_res_out = 0;
-                return SQLITE_OK;
+        let file = get_file(p_file);
+        if let Some(aux) = get_aux_state(file) {
+            if i_offset < 0 {
+                return SQLITE_IOERR_WRITE;
             }
-        };
-        let file_tag = match ctx.resolve_file_tag(path) {
-            Some(file_tag) => file_tag,
-            None => {
-                *p_res_out = 0;
-                return SQLITE_OK;
+
+            let offset = i_offset as usize;
+            let source = slice::from_raw_parts(p_buf.cast::<u8>(), i_amt as usize);
+            let mut bytes = aux.state.bytes.lock();
+            let end = offset + source.len();
+            if bytes.len() < end {
+                bytes.resize(end, 0);
             }
+            bytes[offset..end].copy_from_slice(source);
+            return SQLITE_OK;
+        }
+
+        let ctx = &*file.ctx;
+        if ctx.is_dead() {
+            return SQLITE_IOERR_WRITE;
+        }
+
+        let page_size = ctx.page_size();
+        let source = slice::from_raw_parts(p_buf.cast::<u8>(), i_amt as usize);
+        let target_pages = match page_span(i_offset, i_amt as usize, page_size) {
+            Ok(pages) => pages,
+            Err(_) => return SQLITE_IOERR_WRITE,
         };
-        let meta_key = kv::get_meta_key(file_tag);
-        let resp = match ctx.kv_get(vec![meta_key.to_vec()]) {
-            Ok(resp) => resp,
-            Err(_) => return SQLITE_IOERR_ACCESS,
-        };
-        let value_map = build_value_map(&resp);
-        *p_res_out = if value_map.contains_key(meta_key.as_slice()) {
-            1
+
+        // Fast path: for full-page aligned writes we don't need the existing
+        // page data because we're overwriting every byte. 
Skip resolve_pages
+        // to eliminate a round trip to the engine per page. Also, for pages
+        // beyond db_size_pages (new allocations), there's nothing to fetch.
+        let offset = i_offset as usize;
+        let amt = i_amt as usize;
+        let is_aligned_full_page = offset % page_size == 0 && amt % page_size == 0;
+
+        let resolved = if is_aligned_full_page {
+            HashMap::new()
         } else {
-            0
+            let (db_size_pages, pages_to_resolve): (u32, Vec<u32>) = {
+                let state = ctx.state.read();
+                let known_max = state.db_size_pages;
+                (
+                    known_max,
+                    target_pages
+                        .iter()
+                        .copied()
+                        .filter(|pgno| *pgno <= known_max)
+                        .collect(),
+                )
+            };
+
+            let mut resolved = if pages_to_resolve.is_empty() {
+                HashMap::new()
+            } else {
+                match ctx.resolve_pages(&pages_to_resolve, false) {
+                    Ok(pages) => pages,
+                    Err(GetPagesError::FenceMismatch(reason)) => {
+                        ctx.mark_dead(reason);
+                        return SQLITE_IOERR_WRITE;
+                    }
+                    Err(GetPagesError::Other(message)) => {
+                        ctx.mark_dead(message);
+                        return SQLITE_IOERR_WRITE;
+                    }
+                }
+            };
+            for pgno in &target_pages {
+                if *pgno > db_size_pages {
+                    resolved.entry(*pgno).or_insert(None);
+                }
+            }
+            resolved
         };
+        let mut dirty_pages = BTreeMap::new();
+        for pgno in target_pages {
+            let page_start = (pgno as usize - 1) * page_size;
+            let patch_start = page_start.max(offset);
+            let patch_end = (page_start + page_size).min(offset + amt);
+            let Some(copy_len) = patch_end.checked_sub(patch_start) else {
+                continue;
+            };
+            if copy_len == 0 {
+                continue;
+            }
+
+            let mut page = if is_aligned_full_page {
+                vec![0; page_size]
+            } else {
+                resolved
+                    .get(&pgno)
+                    .and_then(|bytes| bytes.clone())
+                    .unwrap_or_else(|| vec![0; page_size])
+            };
+            if page.len() < page_size {
+                page.resize(page_size, 0);
+            }
+
+            let page_offset = patch_start - page_start;
+            let source_offset = patch_start - offset;
+            page[page_offset..page_offset + copy_len]
+                .copy_from_slice(&source[source_offset..source_offset + copy_len]);
+            dirty_pages.insert(pgno, page);
+        }
+
+        let mut state = ctx.state.write();
+        for (pgno, bytes) in dirty_pages {
+            state.write_buffer.dirty.insert(pgno, bytes);
+        }
+        let end_page = ((offset + amt) + page_size - 1) / page_size;
+        state.db_size_pages = state.db_size_pages.max(end_page as u32);
+        ctx.clear_last_error();
         SQLITE_OK
     })
 }
 
-unsafe extern "C" fn kv_vfs_full_pathname(
-    _p_vfs: *mut sqlite3_vfs,
-    z_name: *const c_char,
-    n_out: c_int,
-    z_out: *mut c_char,
-) -> c_int {
-    vfs_catch_unwind!(SQLITE_IOERR, {
-        if z_name.is_null() || z_out.is_null() || n_out <= 0 {
-            return SQLITE_IOERR;
+unsafe extern "C" fn io_truncate(p_file: *mut sqlite3_file, size: sqlite3_int64) -> c_int {
+    vfs_catch_unwind!(SQLITE_IOERR_TRUNCATE, {
+        if size < 0 {
+            return SQLITE_IOERR_TRUNCATE;
        }
-
-        let name = CStr::from_ptr(z_name);
-        let bytes = name.to_bytes_with_nul();
-        if bytes.len() >= n_out as usize {
-            return SQLITE_IOERR;
+        let file = get_file(p_file);
+        if let Some(aux) = get_aux_state(file) {
+            aux.state.bytes.lock().truncate(size as usize);
+            return SQLITE_OK;
        }
-
-        ptr::copy_nonoverlapping(bytes.as_ptr() as *const c_char, z_out, bytes.len());
+        let ctx = &*file.ctx;
+        ctx.truncate_main_file(size);
         SQLITE_OK
     })
 }
 
-unsafe extern "C" fn kv_vfs_randomness(
-    _p_vfs: *mut sqlite3_vfs,
-    n_byte: c_int,
-    z_out: *mut c_char,
-) -> c_int {
-    vfs_catch_unwind!(0, {
-        let buf = slice::from_raw_parts_mut(z_out as *mut u8, n_byte as usize);
-        match getrandom::getrandom(buf) {
-            Ok(()) => n_byte,
+unsafe extern "C" fn io_sync(p_file: *mut sqlite3_file, _flags: c_int) -> c_int {
+    vfs_catch_unwind!(SQLITE_IOERR_FSYNC, {
+        let 
file = get_file(p_file); + if get_aux_state(file).is_some() { + return SQLITE_OK; + } + let ctx = &*file.ctx; + match ctx.flush_dirty_pages() { + Ok(_) => SQLITE_OK, + Err(err) => { + mark_dead_from_fence_commit_error(ctx, &err); + SQLITE_IOERR_FSYNC + } + } + }) +} + +unsafe extern "C" fn io_file_size( + p_file: *mut sqlite3_file, + p_size: *mut sqlite3_int64, +) -> c_int { + vfs_catch_unwind!(SQLITE_IOERR_FSTAT, { + let file = get_file(p_file); + if let Some(aux) = get_aux_state(file) { + *p_size = aux.state.bytes.lock().len() as sqlite3_int64; + return SQLITE_OK; + } + let ctx = &*file.ctx; + let state = ctx.state.read(); + *p_size = (state.db_size_pages as usize * state.page_size) as sqlite3_int64; + SQLITE_OK + }) +} + +unsafe extern "C" fn io_lock(_p_file: *mut sqlite3_file, _level: c_int) -> c_int { + vfs_catch_unwind!(SQLITE_IOERR_LOCK, SQLITE_OK) +} + +unsafe extern "C" fn io_unlock(_p_file: *mut sqlite3_file, _level: c_int) -> c_int { + vfs_catch_unwind!(SQLITE_IOERR_UNLOCK, SQLITE_OK) +} + +unsafe extern "C" fn io_check_reserved_lock( + _p_file: *mut sqlite3_file, + p_res_out: *mut c_int, +) -> c_int { + vfs_catch_unwind!(SQLITE_IOERR, { + *p_res_out = 0; + SQLITE_OK + }) +} + +unsafe extern "C" fn io_file_control( + p_file: *mut sqlite3_file, + op: c_int, + _p_arg: *mut c_void, +) -> c_int { + vfs_catch_unwind!(SQLITE_IOERR, { + let file = get_file(p_file); + if get_aux_state(file).is_some() { + return SQLITE_NOTFOUND; + } + let ctx = &*file.ctx; + + match op { + SQLITE_FCNTL_BEGIN_ATOMIC_WRITE => { + let mut state = ctx.state.write(); + state.write_buffer.in_atomic_write = true; + state.write_buffer.saved_db_size = state.db_size_pages; + state.write_buffer.dirty.clear(); + SQLITE_OK + } + SQLITE_FCNTL_COMMIT_ATOMIC_WRITE => match ctx.commit_atomic_write() { + Ok(()) => { + ctx.commit_atomic_count.fetch_add(1, Ordering::Relaxed); + SQLITE_OK + } + Err(err) => { + mark_dead_from_fence_commit_error(ctx, &err); + SQLITE_IOERR + } + }, + SQLITE_FCNTL_ROLLBACK_ATOMIC_WRITE => { + let mut state = ctx.state.write(); + state.write_buffer.dirty.clear(); + state.write_buffer.in_atomic_write = false; + state.db_size_pages = state.write_buffer.saved_db_size; + SQLITE_OK + } + _ => SQLITE_NOTFOUND, + } + }) +} + +unsafe extern "C" fn io_sector_size(_p_file: *mut sqlite3_file) -> c_int { + vfs_catch_unwind!(DEFAULT_PAGE_SIZE as c_int, DEFAULT_PAGE_SIZE as c_int) +} + +unsafe extern "C" fn io_device_characteristics(p_file: *mut sqlite3_file) -> c_int { + vfs_catch_unwind!(0, { + let file = get_file(p_file); + if get_aux_state(file).is_some() { + 0 + } else { + SQLITE_IOCAP_BATCH_ATOMIC + } + }) +} + +unsafe extern "C" fn vfs_open( + p_vfs: *mut sqlite3_vfs, + z_name: *const c_char, + p_file: *mut sqlite3_file, + flags: c_int, + p_out_flags: *mut c_int, +) -> c_int { + vfs_catch_unwind!(SQLITE_CANTOPEN, { + let ctx = get_vfs_ctx(p_vfs); + let delete_on_close = (flags & SQLITE_OPEN_DELETEONCLOSE) != 0; + let path = if z_name.is_null() { + if delete_on_close { + next_temp_aux_path() + } else { + return SQLITE_CANTOPEN; + } + } else { + match CStr::from_ptr(z_name).to_str() { + Ok(path) => path.to_string(), + Err(_) => return SQLITE_CANTOPEN, + } + }; + let is_main = + path == ctx.actor_id && !delete_on_close && (flags & SQLITE_OPEN_MAIN_DB) != 0; + + let base = sqlite3_file { + pMethods: ctx.io_methods.as_ref(), + }; + let aux = if is_main { + ptr::null_mut() + } else { + Box::into_raw(Box::new(AuxFileHandle { + path: path.clone(), + state: ctx.open_aux_file(&path), + delete_on_close, + })) + }; 
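vfs_open above sends everything except the main database file through AuxFileHandle, whose backing store lives on VfsContext behind the open_aux_file/delete_aux_file/aux_file_exists calls; those internals are not shown in this hunk. One plausible shape for that store, sketched here with std::sync::Mutex (the patch's bare .lock() calls suggest parking_lot, which returns the guard directly):

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

// Hypothetical in-memory aux store; type and field names are illustrative only.
#[derive(Default)]
struct AuxFileState {
    // Raw file contents; journals and temp files never leave memory.
    bytes: Mutex<Vec<u8>>,
}

#[derive(Default)]
struct AuxStore {
    files: Mutex<HashMap<String, Arc<AuxFileState>>>,
}

impl AuxStore {
    // Shared Arc so every handle opened on the same path sees the same bytes.
    fn open(&self, path: &str) -> Arc<AuxFileState> {
        self.files
            .lock()
            .unwrap()
            .entry(path.to_string())
            .or_default()
            .clone()
    }

    fn delete(&self, path: &str) {
        self.files.lock().unwrap().remove(path);
    }

    fn exists(&self, path: &str) -> bool {
        self.files.lock().unwrap().contains_key(path)
    }
}
```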
+        ptr::write(
+            p_file.cast::<VfsFile>(),
+            VfsFile {
+                base,
+                ctx: ctx as *const VfsContext,
+                aux,
+            },
+        );
+
+        if !p_out_flags.is_null() {
+            *p_out_flags = flags;
+        }
+
+        SQLITE_OK
+    })
+}
+
+unsafe extern "C" fn vfs_delete(
+    p_vfs: *mut sqlite3_vfs,
+    z_name: *const c_char,
+    _sync_dir: c_int,
+) -> c_int {
+    vfs_catch_unwind!(SQLITE_IOERR_DELETE, {
+        if z_name.is_null() {
+            return SQLITE_OK;
+        }
+
+        let ctx = get_vfs_ctx(p_vfs);
+        let path = match CStr::from_ptr(z_name).to_str() {
+            Ok(path) => path,
+            Err(_) => return SQLITE_OK,
+        };
+        if path != ctx.actor_id {
+            ctx.delete_aux_file(path);
+        }
+        SQLITE_OK
+    })
+}
+
+unsafe extern "C" fn vfs_access(
+    p_vfs: *mut sqlite3_vfs,
+    z_name: *const c_char,
+    _flags: c_int,
+    p_res_out: *mut c_int,
+) -> c_int {
+    vfs_catch_unwind!(SQLITE_IOERR_ACCESS, {
+        if z_name.is_null() {
+            *p_res_out = 0;
+            return SQLITE_OK;
+        }
+
+        let ctx = get_vfs_ctx(p_vfs);
+        let path = match CStr::from_ptr(z_name).to_str() {
+            Ok(path) => path,
+            Err(_) => {
+                *p_res_out = 0;
+                return SQLITE_OK;
+            }
+        };
+
+        *p_res_out = if path == ctx.actor_id || ctx.aux_file_exists(path) {
+            1
+        } else {
+            0
+        };
+        SQLITE_OK
+    })
+}
+
+unsafe extern "C" fn vfs_full_pathname(
+    _p_vfs: *mut sqlite3_vfs,
+    z_name: *const c_char,
+    n_out: c_int,
+    z_out: *mut c_char,
+) -> c_int {
+    vfs_catch_unwind!(SQLITE_IOERR, {
+        if z_name.is_null() || z_out.is_null() || n_out <= 0 {
+            return SQLITE_IOERR;
+        }
+
+        let name = CStr::from_ptr(z_name);
+        let bytes = name.to_bytes_with_nul();
+        if bytes.len() >= n_out as usize {
+            return SQLITE_IOERR;
+        }
+
+        ptr::copy_nonoverlapping(bytes.as_ptr().cast::<c_char>(), z_out, bytes.len());
+        SQLITE_OK
+    })
+}
+
+unsafe extern "C" fn vfs_randomness(
+    _p_vfs: *mut sqlite3_vfs,
+    n_byte: c_int,
+    z_out: *mut c_char,
+) -> c_int {
+    vfs_catch_unwind!(0, {
+        let buf = slice::from_raw_parts_mut(z_out.cast::<u8>(), n_byte as usize);
+        match getrandom::getrandom(buf) {
+            Ok(()) => n_byte,
             Err(_) => 0,
         }
     })
 }
 
-unsafe extern "C" fn kv_vfs_sleep(_p_vfs: *mut sqlite3_vfs, microseconds: c_int) -> c_int {
+unsafe extern "C" fn vfs_sleep(_p_vfs: *mut sqlite3_vfs, microseconds: c_int) -> c_int {
     vfs_catch_unwind!(0, {
         std::thread::sleep(std::time::Duration::from_micros(microseconds as u64));
         microseconds
     })
 }
 
-unsafe extern "C" fn kv_vfs_current_time(_p_vfs: *mut sqlite3_vfs, p_time_out: *mut f64) -> c_int {
+unsafe extern "C" fn vfs_current_time(_p_vfs: *mut sqlite3_vfs, p_time_out: *mut f64) -> c_int {
     vfs_catch_unwind!(SQLITE_IOERR, {
         let now = std::time::SystemTime::now()
             .duration_since(std::time::UNIX_EPOCH)
@@ -1321,7 +2292,7 @@ unsafe extern "C" fn kv_vfs_current_time(_p_vfs: *mut sqlite3_vfs, p_time_out: *
     })
 }
 
-unsafe extern "C" fn kv_vfs_get_last_error(
+unsafe extern "C" fn vfs_get_last_error(
     p_vfs: *mut sqlite3_vfs,
     n_byte: c_int,
     z_err_msg: *mut c_char,
@@ -1332,8 +2303,7 @@ unsafe extern "C" fn kv_vfs_get_last_error(
     }
 
     let ctx = get_vfs_ctx(p_vfs);
-    let last_error = ctx.clone_last_error();
-    let Some(message) = last_error else {
+    let Some(message) = ctx.clone_last_error() else {
        *z_err_msg = 0;
        return 0;
    };
@@ -1343,91 +2313,79 @@ unsafe extern "C" fn kv_vfs_get_last_error(
    let copy_len = bytes.len().min(max_len);
    let dst = z_err_msg.cast::<u8>();
    ptr::copy_nonoverlapping(bytes.as_ptr(), dst, copy_len);
-    *dst.add(copy_len) = 0u8;
+    *dst.add(copy_len) = 0;
 
     0
 })
}
 
-// MARK: KvVfs
-
-pub struct KvVfs {
-    vfs_ptr: *mut sqlite3_vfs,
-    _name: CString,
-    ctx_ptr: *mut VfsContext,
-}
-
-unsafe impl Send for KvVfs {}
-unsafe impl Sync for KvVfs {}
-
-impl KvVfs {
-    fn 
take_last_kv_error(&self) -> Option<String> {
-        unsafe { (*self.ctx_ptr).take_last_error() }
+impl SqliteVfs {
+    pub fn register(
+        name: &str,
+        handle: EnvoyHandle,
+        actor_id: String,
+        runtime: Handle,
+        startup: protocol::SqliteStartupData,
+        config: VfsConfig,
+    ) -> std::result::Result<Self, String> {
+        Self::register_with_transport(
+            name,
+            SqliteTransport::from_envoy(handle),
+            actor_id,
+            runtime,
+            startup,
+            config,
+        )
     }
 
-    fn commit_atomic_count(&self) -> u64 {
-        unsafe {
-            (&(*self.ctx_ptr).vfs_metrics)
-                .commit_atomic_count
-                .load(Ordering::Relaxed)
-        }
+    fn take_last_error(&self) -> Option<String> {
+        unsafe { (*self.ctx_ptr).take_last_error() }
     }
 
-    pub fn register(
+    fn register_with_transport(
         name: &str,
-        kv: Arc,
+        transport: SqliteTransport,
         actor_id: String,
-        rt_handle: Handle,
-        mut startup_preload: StartupPreloadEntries,
-    ) -> Result<Self, String> {
+        runtime: Handle,
+        startup: protocol::SqliteStartupData,
+        config: VfsConfig,
+    ) -> std::result::Result<Self, String> {
         let mut io_methods: sqlite3_io_methods = unsafe { std::mem::zeroed() };
         io_methods.iVersion = 1;
-        io_methods.xClose = Some(kv_io_close);
-        io_methods.xRead = Some(kv_io_read);
-        io_methods.xWrite = Some(kv_io_write);
-        io_methods.xTruncate = Some(kv_io_truncate);
-        io_methods.xSync = Some(kv_io_sync);
-        io_methods.xFileSize = Some(kv_io_file_size);
-        io_methods.xLock = Some(kv_io_lock);
-        io_methods.xUnlock = Some(kv_io_unlock);
-        io_methods.xCheckReservedLock = Some(kv_io_check_reserved_lock);
-        io_methods.xFileControl = Some(kv_io_file_control);
-        io_methods.xSectorSize = Some(kv_io_sector_size);
-        io_methods.xDeviceCharacteristics = Some(kv_io_device_characteristics);
-
-        let vfs_metrics = Arc::new(VfsMetrics::new());
-        sort_startup_preload(&mut startup_preload);
-        let ctx = Box::new(VfsContext {
-            kv,
-            actor_id: actor_id.clone(),
-            main_file_name: actor_id,
-            startup_preload: Mutex::new((!startup_preload.is_empty()).then_some(startup_preload)),
-            read_cache_enabled: read_cache_enabled(),
-            last_error: Mutex::new(None),
-            rt_handle,
-            io_methods: Box::new(io_methods),
-            vfs_metrics,
-        });
+        io_methods.xClose = Some(io_close);
+        io_methods.xRead = Some(io_read);
+        io_methods.xWrite = Some(io_write);
+        io_methods.xTruncate = Some(io_truncate);
+        io_methods.xSync = Some(io_sync);
+        io_methods.xFileSize = Some(io_file_size);
+        io_methods.xLock = Some(io_lock);
+        io_methods.xUnlock = Some(io_unlock);
+        io_methods.xCheckReservedLock = Some(io_check_reserved_lock);
+        io_methods.xFileControl = Some(io_file_control);
+        io_methods.xSectorSize = Some(io_sector_size);
+        io_methods.xDeviceCharacteristics = Some(io_device_characteristics);
+
+        let ctx = Box::new(VfsContext::new(
+            actor_id, runtime, transport, startup, config, io_methods,
+        ));
         let ctx_ptr = Box::into_raw(ctx);
-
         let name_cstring = CString::new(name).map_err(|err| err.to_string())?;
         let mut vfs: sqlite3_vfs = unsafe { std::mem::zeroed() };
         vfs.iVersion = 1;
-        vfs.szOsFile = std::mem::size_of::<KvFile>() as c_int;
+        vfs.szOsFile = std::mem::size_of::<VfsFile>() as c_int;
         vfs.mxPathname = MAX_PATHNAME;
         vfs.zName = name_cstring.as_ptr();
-        vfs.pAppData = ctx_ptr as *mut c_void;
-        vfs.xOpen = Some(kv_vfs_open);
-        vfs.xDelete = Some(kv_vfs_delete);
-        vfs.xAccess = Some(kv_vfs_access);
-        vfs.xFullPathname = Some(kv_vfs_full_pathname);
-        vfs.xRandomness = Some(kv_vfs_randomness);
-        vfs.xSleep = Some(kv_vfs_sleep);
-        vfs.xCurrentTime = Some(kv_vfs_current_time);
-        vfs.xGetLastError = Some(kv_vfs_get_last_error);
+        vfs.pAppData = ctx_ptr.cast::<c_void>();
+        vfs.xOpen = Some(vfs_open);
+        vfs.xDelete = Some(vfs_delete);
+        
vfs.xAccess = Some(vfs_access);
+        vfs.xFullPathname = Some(vfs_full_pathname);
+        vfs.xRandomness = Some(vfs_randomness);
+        vfs.xSleep = Some(vfs_sleep);
+        vfs.xCurrentTime = Some(vfs_current_time);
+        vfs.xGetLastError = Some(vfs_get_last_error);
 
         let vfs_ptr = Box::into_raw(Box::new(vfs));
-
         let rc = unsafe { sqlite3_vfs_register(vfs_ptr, 0) };
         if rc != SQLITE_OK {
             unsafe {
@@ -1447,9 +2405,13 @@ impl KvVfs {
     pub fn name_ptr(&self) -> *const c_char {
         self._name.as_ptr()
     }
+
+    fn commit_atomic_count(&self) -> u64 {
+        unsafe { (*self.ctx_ptr).commit_atomic_count.load(Ordering::Relaxed) }
+    }
 }
 
-impl Drop for KvVfs {
+impl Drop for SqliteVfs {
     fn drop(&mut self) {
         unsafe {
             sqlite3_vfs_unregister(self.vfs_ptr);
@@ -1459,22 +2421,17 @@ impl Drop for KvVfs {
     }
 }
 
-// MARK: NativeDatabase
-
-pub struct NativeDatabase {
-    db: *mut sqlite3,
-    _vfs: KvVfs,
-}
-
-unsafe impl Send for NativeDatabase {}
-
 impl NativeDatabase {
     pub fn as_ptr(&self) -> *mut sqlite3 {
         self.db
     }
 
     pub fn take_last_kv_error(&self) -> Option<String> {
-        self._vfs.take_last_kv_error()
+        self._vfs.take_last_error()
+    }
+
+    pub fn sqlite_vfs_metrics(&self) -> SqliteVfsMetricsSnapshot {
+        unsafe { (*self._vfs.ctx_ptr).sqlite_vfs_metrics() }
     }
 }
 
@@ -1494,88 +2451,29 @@ impl Drop for NativeDatabase {
     }
 }
 
-fn sqlite_error_message(db: *mut sqlite3) -> String {
-    unsafe {
-        if db.is_null() {
-            "unknown sqlite error".to_string()
-        } else {
-            CStr::from_ptr(sqlite3_errmsg(db))
-                .to_string_lossy()
-                .into_owned()
-        }
-    }
-}
+pub fn open_database(
+    vfs: SqliteVfs,
+    file_name: &str,
+) -> std::result::Result<NativeDatabase, String> {
+    let c_name = CString::new(file_name).map_err(|err| err.to_string())?;
+    let mut db: *mut sqlite3 = ptr::null_mut();
 
-fn sqlite_exec(db: *mut sqlite3, sql: &str) -> Result<(), String> {
-    let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
-    let rc = unsafe { sqlite3_exec(db, c_sql.as_ptr(), None, ptr::null_mut(), ptr::null_mut()) };
+    let rc = unsafe {
+        sqlite3_open_v2(
+            c_name.as_ptr(),
+            &mut db,
+            SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE,
+            vfs.name_ptr(),
+        )
+    };
     if rc != SQLITE_OK {
-        return Err(format!(
-            "`{sql}` failed with code {rc}: {}",
-            sqlite_error_message(db)
-        ));
-    }
-
-    Ok(())
-}
-
-fn cleanup_batch_atomic_probe(db: *mut sqlite3) {
-    if let Err(err) = sqlite_exec(db, "DROP TABLE IF EXISTS __rivet_batch_probe;") {
-        tracing::warn!(%err, "failed to clean up batch atomic probe table");
-    }
-}
-
-fn assert_batch_atomic_probe(db: *mut sqlite3, vfs: &KvVfs) -> Result<(), String> {
-    let commit_atomic_before = vfs.commit_atomic_count();
-    let probe_sql = "\
-        BEGIN IMMEDIATE;\
-        CREATE TABLE IF NOT EXISTS __rivet_batch_probe(x INTEGER);\
-        INSERT INTO __rivet_batch_probe VALUES(1);\
-        DELETE FROM __rivet_batch_probe;\
-        DROP TABLE IF EXISTS __rivet_batch_probe;\
-        COMMIT;\
-    ";
-
-    if let Err(err) = sqlite_exec(db, probe_sql) {
-        cleanup_batch_atomic_probe(db);
-        return Err(format!("batch atomic probe failed: {err}"));
-    }
-
-    let commit_atomic_after = vfs.commit_atomic_count();
-    if commit_atomic_after == commit_atomic_before {
-        tracing::error!(
-            "batch atomic writes not active, SQLITE_ENABLE_BATCH_ATOMIC_WRITE may be missing"
-        );
-        cleanup_batch_atomic_probe(db);
-        return Err(
-            "batch atomic writes not active, SQLITE_ENABLE_BATCH_ATOMIC_WRITE may be missing"
-                .to_string(),
-        );
-    }
-
-    Ok(())
-}
-
-pub fn open_database(vfs: KvVfs, file_name: &str) -> Result<NativeDatabase, String> {
-    let c_name = CString::new(file_name).map_err(|err| err.to_string())?;
-    let mut db: *mut sqlite3 = ptr::null_mut();
-
-    let rc = unsafe {
-        
sqlite3_open_v2(
-            c_name.as_ptr(),
-            &mut db,
-            SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE,
-            vfs.name_ptr(),
-        )
-    };
-    if rc != SQLITE_OK {
-        let message = sqlite_error_message(db);
-        if !db.is_null() {
-            unsafe {
-                sqlite3_close(db);
-            }
-        }
-        return Err(format!("sqlite3_open_v2 failed with code {rc}: {message}"));
+        let message = sqlite_error_message(db);
+        if !db.is_null() {
+            unsafe {
+                sqlite3_close(db);
+            }
+        }
+        return Err(format!("sqlite3_open_v2 failed with code {rc}: {message}"));
     }
 
     for pragma in &[
@@ -1604,530 +2502,2587 @@ pub fn open_database(vfs: KvVfs, file_name: &str) -> Result
-enum Operation {
-    Get { keys: Vec<Vec<u8>> },
-    Put { keys: Vec<Vec<u8>> },
-    Delete { keys: Vec<Vec<u8>> },
-    DeleteRange { start: Vec<u8>, end: Vec<u8> },
+    fn dirty_pages(page_count: u32, fill: u8) -> Vec<protocol::SqliteDirtyPage> {
+        (0..page_count)
+            .map(|offset| protocol::SqliteDirtyPage {
+                pgno: offset + 1,
+                bytes: vec![fill; 4096],
+            })
+            .collect()
+    }
+
+    fn next_test_name(prefix: &str) -> String {
+        let id = TEST_ID.fetch_add(1, Ordering::Relaxed);
+        format!("{prefix}-{id}")
+    }
+
+    fn random_hex() -> String {
+        let mut bytes = [0u8; 8];
+        getrandom::getrandom(&mut bytes).expect("random bytes should be available");
+        bytes.iter().map(|byte| format!("{byte:02x}")).collect()
+    }
+
+    struct DirectEngineHarness {
+        actor_id: String,
+        db_dir: TempDir,
+        subspace: Subspace,
+    }
+
+    impl DirectEngineHarness {
+        fn new() -> Self {
+            Self {
+                actor_id: next_test_name("sqlite-direct-actor"),
+                db_dir: tempfile::tempdir().expect("temp dir should build"),
+                subspace: Subspace::new(&("sqlite-direct", random_hex())),
+            }
+        }
+
+        async fn open_engine(&self) -> Arc<SqliteEngine> {
+            let mut attempts = 0;
+            let driver = loop {
+                match universaldb::driver::RocksDbDatabaseDriver::new(
+                    self.db_dir.path().to_path_buf(),
+                )
+                .await
+                {
+                    Ok(driver) => break driver,
+                    Err(_err) if attempts < 50 => {
+                        attempts += 1;
+                        std::thread::sleep(std::time::Duration::from_millis(10));
+                    }
+                    Err(err) => panic!("rocksdb driver should build: {err:#}"),
+                }
+            };
+            let db = Arc::new(universaldb::Database::new(Arc::new(driver)));
+            let (engine, _compaction_rx) = SqliteEngine::new(db, self.subspace.clone());
+
+            Arc::new(engine)
+        }
+
+        async fn startup_data_for(
+            &self,
+            actor_id: &str,
+            engine: &SqliteEngine,
+        ) -> protocol::SqliteStartupData {
+            let takeover = engine
+                .takeover(
+                    actor_id,
+                    sqlite_storage::takeover::TakeoverConfig::new(
+                        sqlite_now_ms().expect("startup time should resolve"),
+                    ),
+                )
+                .await
+                .expect("takeover should succeed");
+
+            protocol::SqliteStartupData {
+                generation: takeover.generation,
+                meta: protocol_sqlite_meta(takeover.meta),
+                preloaded_pages: takeover
+                    .preloaded_pages
+                    .into_iter()
+                    .map(protocol_fetched_page)
+                    .collect(),
+            }
+        }
+
+        async fn startup_data(&self, engine: &SqliteEngine) -> protocol::SqliteStartupData {
+            self.startup_data_for(&self.actor_id, engine).await
+        }
+
+        fn open_db_on_engine(
+            &self,
+            runtime: &tokio::runtime::Runtime,
+            engine: Arc<SqliteEngine>,
+            actor_id: &str,
+            config: VfsConfig,
+        ) -> NativeDatabase {
+            let startup = runtime.block_on(self.startup_data_for(actor_id, &engine));
+            let vfs = SqliteVfs::register_with_transport(
+                &next_test_name("sqlite-direct-vfs"),
+                SqliteTransport::from_direct(engine),
+                actor_id.to_string(),
+                runtime.handle().clone(),
+                startup,
+                config,
+            )
+            .expect("v2 vfs should register");
+
+            open_database(vfs, actor_id).expect("sqlite database should open")
+        }
+
+        fn open_db(&self, runtime: &tokio::runtime::Runtime) -> NativeDatabase {
+            let engine = runtime.block_on(self.open_engine());
+            self.open_db_on_engine(runtime, 
engine, &self.actor_id, VfsConfig::default())
+        }
+    }
+
+    fn direct_vfs_ctx(db: &NativeDatabase) -> &VfsContext {
+        unsafe { &*db._vfs.ctx_ptr }
+    }
+
+    fn sqlite_query_i64(db: *mut sqlite3, sql: &str) -> std::result::Result<i64, String> {
+        let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
+        let mut stmt = ptr::null_mut();
+        let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) };
+        if rc != SQLITE_OK {
+            return Err(format!(
+                "`{sql}` prepare failed with code {rc}: {}",
+                sqlite_error_message(db)
+            ));
+        }
+        if stmt.is_null() {
+            return Err(format!("`{sql}` returned no statement"));
+        }
+
+        let result = match unsafe { sqlite3_step(stmt) } {
+            SQLITE_ROW => Ok(unsafe { sqlite3_column_int64(stmt, 0) }),
+            step_rc => Err(format!(
+                "`{sql}` step failed with code {step_rc}: {}",
+                sqlite_error_message(db)
+            )),
+        };
+
+        unsafe {
+            sqlite3_finalize(stmt);
+        }
+
+        result
+    }
+
+    fn sqlite_query_text(db: *mut sqlite3, sql: &str) -> std::result::Result<String, String> {
+        let c_sql = CString::new(sql).map_err(|err| err.to_string())?;
+        let mut stmt = ptr::null_mut();
+        let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) };
+        if rc != SQLITE_OK {
+            return Err(format!(
+                "`{sql}` prepare failed with code {rc}: {}",
+                sqlite_error_message(db)
+            ));
+        }
+        if stmt.is_null() {
+            return Err(format!("`{sql}` returned no statement"));
+        }
+
+        let result = match unsafe { sqlite3_step(stmt) } {
+            SQLITE_ROW => {
+                let text_ptr = unsafe { sqlite3_column_text(stmt, 0) };
+                if text_ptr.is_null() {
+                    Ok(String::new())
+                } else {
+                    Ok(unsafe { CStr::from_ptr(text_ptr.cast()) }
+                        .to_string_lossy()
+                        .into_owned())
+                }
+            }
+            step_rc => Err(format!(
+                "`{sql}` step failed with code {step_rc}: {}",
+                sqlite_error_message(db)
+            )),
+        };
+
+        unsafe {
+            sqlite3_finalize(stmt);
+        }
+
+        result
+    }
+
+    fn sqlite_file_control(db: *mut sqlite3, op: c_int) -> std::result::Result<c_int, String> {
+        let main = CString::new("main").map_err(|err| err.to_string())?;
+        let rc = unsafe { sqlite3_file_control(db, main.as_ptr(), op, ptr::null_mut()) };
+        if rc != SQLITE_OK {
+            return Err(format!(
+                "sqlite3_file_control op {op} failed with code {rc}: {}",
+                sqlite_error_message(db)
+            ));
+        }
+
+        Ok(rc)
+    }
+
+    fn direct_runtime() -> tokio::runtime::Runtime {
+        Builder::new_multi_thread()
+            .worker_threads(2)
+            .enable_all()
+            .build()
+            .expect("runtime should build")
+    }
+
+    #[test]
+    fn predictor_prefers_stride_after_repeated_reads() {
+        let mut predictor = PrefetchPredictor::default();
+        for pgno in [5, 8, 11, 14] {
+            predictor.record(pgno);
+        }
+
+        assert_eq!(predictor.multi_predict(14, 3, 30), vec![17, 20, 23]);
+    }
+
+    #[test]
+    fn startup_data_populates_cache_without_protocol_calls() {
+        let runtime = Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("runtime should build");
+        let protocol = Arc::new(MockProtocol::new(
+            protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk {
+                new_head_txid: 13,
+                meta: sqlite_meta(8 * 1024 * 1024),
+            }),
+            protocol::SqliteCommitStageResponse::SqliteCommitStageOk(
+                protocol::SqliteCommitStageOk {
+                    chunk_idx_committed: 0,
+                },
+            ),
+            protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk(
+                protocol::SqliteCommitFinalizeOk {
+                    new_head_txid: 13,
+                    meta: sqlite_meta(8 * 1024 * 1024),
+                },
+            ),
+        ));
+        let startup = protocol::SqliteStartupData {
+            generation: 3,
+            meta: sqlite_meta(8 * 1024 * 1024),
+            preloaded_pages: vec![protocol::SqliteFetchedPage {
+                pgno: 1,
+                bytes: Some(vec![7; 4096]),
+            }],
+        };
+
+        
let ctx = VfsContext::new( + "actor".to_string(), + runtime.handle().clone(), + SqliteTransport::from_mock(protocol.clone()), + startup, + VfsConfig::default(), + unsafe { std::mem::zeroed() }, + ); + + assert_eq!(ctx.state.read().page_cache.get(&1), Some(vec![7; 4096])); + assert!(protocol.get_pages_requests().is_empty()); + } + + #[test] + fn direct_engine_supports_create_insert_select_and_user_version() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let db = harness.open_db(&runtime); + + assert_eq!( + sqlite_file_control(db.as_ptr(), SQLITE_FCNTL_BEGIN_ATOMIC_WRITE) + .expect("batch atomic begin should succeed"), + SQLITE_OK + ); + assert_eq!( + sqlite_file_control(db.as_ptr(), SQLITE_FCNTL_COMMIT_ATOMIC_WRITE) + .expect("batch atomic commit should succeed"), + SQLITE_OK + ); + + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("create table should succeed"); + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO items (id, value) VALUES (1, 'alpha');", + ) + .expect("insert should succeed"); + sqlite_exec(db.as_ptr(), "PRAGMA user_version = 42;") + .expect("user_version pragma should succeed"); + + assert_eq!( + sqlite_query_text(db.as_ptr(), "SELECT value FROM items WHERE id = 1;") + .expect("select should succeed"), + "alpha" + ); + assert_eq!( + sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM items;") + .expect("count should succeed"), + 1 + ); + assert_eq!( + sqlite_query_i64(db.as_ptr(), "PRAGMA user_version;") + .expect("user_version read should succeed"), + 42 + ); + } + + #[test] + fn direct_engine_handles_large_rows_and_multi_page_growth() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let db = harness.open_db(&runtime); + + sqlite_exec( + db.as_ptr(), + "CREATE TABLE blobs (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);", + ) + .expect("create table should succeed"); + + for _ in 0..48 { + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO blobs (payload) VALUES (randomblob(3500));", + ) + .expect("seed insert should succeed"); + } + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO blobs (payload) VALUES (randomblob(9000));", + ) + .expect("large row insert should succeed"); + + assert_eq!( + sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM blobs;") + .expect("count should succeed"), + 49 + ); + assert!( + sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;").expect("page_count should succeed") + > 20 + ); + assert!( + sqlite_query_i64(db.as_ptr(), "SELECT max(length(payload)) FROM blobs;") + .expect("max payload length should succeed") + >= 9000 + ); + } + + #[test] + fn direct_engine_persists_data_across_close_and_reopen() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + + { + let db = harness.open_db(&runtime); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE events (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("create table should succeed"); + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO events (id, value) VALUES (1, 'persisted');", + ) + .expect("insert should succeed"); + sqlite_exec(db.as_ptr(), "PRAGMA user_version = 7;") + .expect("user_version write should succeed"); + } + + let reopened = harness.open_db(&runtime); + assert_eq!( + sqlite_query_i64(reopened.as_ptr(), "SELECT COUNT(*) FROM events;") + .expect("count after reopen should succeed"), + 1 + ); + assert_eq!( + sqlite_query_text(reopened.as_ptr(), "SELECT value FROM events WHERE id = 1;") + .expect("value 
after reopen should succeed"), + "persisted" + ); + assert_eq!( + sqlite_query_i64(reopened.as_ptr(), "PRAGMA user_version;") + .expect("user_version after reopen should succeed"), + 7 + ); + } + + #[test] + fn direct_engine_handles_aux_files_and_truncate_then_regrow() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let db = harness.open_db(&runtime); + + sqlite_exec(db.as_ptr(), "PRAGMA temp_store = FILE;") + .expect("temp_store pragma should succeed"); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE blobs (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);", + ) + .expect("create table should succeed"); + + for _ in 0..32 { + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO blobs (payload) VALUES (randomblob(8192));", + ) + .expect("growth insert should succeed"); + } + let grown_pages = sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;") + .expect("grown page_count should succeed"); + assert!(grown_pages > 40); + + sqlite_exec( + db.as_ptr(), + "CREATE TEMP TABLE scratch AS SELECT id FROM blobs ORDER BY id DESC;", + ) + .expect("temp table should succeed"); + assert_eq!( + sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM scratch;") + .expect("temp table count should succeed"), + 32 + ); + + sqlite_exec(db.as_ptr(), "DELETE FROM blobs;").expect("delete should succeed"); + sqlite_exec(db.as_ptr(), "VACUUM;").expect("vacuum should succeed"); + let shrunk_pages = sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;") + .expect("shrunk page_count should succeed"); + assert!(shrunk_pages < grown_pages); + + for _ in 0..8 { + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO blobs (payload) VALUES (randomblob(8192));", + ) + .expect("regrow insert should succeed"); + } + let regrown_pages = sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;") + .expect("regrown page_count should succeed"); + assert!(regrown_pages > shrunk_pages); + } + + #[test] + fn direct_engine_batch_atomic_probe_runs_on_open() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let db = harness.open_db(&runtime); + + assert!( + db._vfs.commit_atomic_count() > 0, + "open_database should run the sqlite batch-atomic probe", + ); + } + + #[test] + fn direct_engine_keeps_head_txid_after_cache_miss_reads_between_commits() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let engine = runtime.block_on(harness.open_engine()); + let db = harness.open_db_on_engine( + &runtime, + engine, + &harness.actor_id, + VfsConfig { + cache_capacity_pages: 2, + prefetch_depth: 0, + max_prefetch_bytes: 0, + ..VfsConfig::default() + }, + ); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("create table should succeed"); + sqlite_exec(db.as_ptr(), "CREATE INDEX items_value_idx ON items(value);") + .expect("create index should succeed"); + for i in 0..120 { + sqlite_step_statement( + db.as_ptr(), + &format!( + "INSERT INTO items (id, value) VALUES ({}, 'item-{i:03}');", + i + 1 + ), + ) + .expect("seed insert should succeed"); + } + + let ctx = direct_vfs_ctx(&db); + let head_after_first_phase = ctx.state.read().head_txid; + + ctx.state.write().page_cache.invalidate_all(); + assert_eq!( + sqlite_query_text( + db.as_ptr(), + "SELECT value FROM items WHERE value = 'item-091';", + ) + .expect("cache-miss read should succeed"), + "item-091" + ); + let head_after_cache_miss = ctx.state.read().head_txid; + assert_eq!( + head_after_cache_miss, head_after_first_phase, + "cache-miss reads must not 
rewind head_txid",
+        );
+
+        sqlite_step_statement(
+            db.as_ptr(),
+            "INSERT INTO items (id, value) VALUES (1000, 'after-cache-miss');",
+        )
+        .expect("commit after cache-miss read should succeed");
+        assert!(
+            ctx.state.read().head_txid > head_after_cache_miss,
+            "head_txid should still advance after the follow-up commit",
+        );
+    }
+
+    #[test]
+    fn direct_engine_uses_slow_path_for_large_real_engine_commits() {
+        let runtime = direct_runtime();
+        let harness = DirectEngineHarness::new();
+        let engine = runtime.block_on(harness.open_engine());
+        let startup = runtime.block_on(harness.startup_data(&engine));
+        let dirty_pages = (1..=2300u32)
+            .map(|pgno| protocol::SqliteDirtyPage {
+                pgno,
+                bytes: vec![(pgno % 251) as u8; 4096],
+            })
+            .collect::<Vec<_>>();
+
+        let outcome = runtime
+            .block_on(commit_buffered_pages(
+                &SqliteTransport::from_direct(Arc::clone(&engine)),
+                BufferedCommitRequest {
+                    actor_id: harness.actor_id.clone(),
+                    generation: startup.generation,
+                    expected_head_txid: startup.meta.head_txid,
+                    new_db_size_pages: 2300,
+                    max_delta_bytes: startup.meta.max_delta_bytes,
+                    max_pages_per_stage: 256,
+                    dirty_pages,
+                },
+            ))
+            .expect("slow-path direct commit should succeed");
+        let (outcome, metrics) = outcome;
+
+        assert_eq!(outcome.path, CommitPath::Slow);
+        assert_eq!(outcome.new_head_txid, startup.meta.head_txid + 1);
+        assert!(metrics.serialize_ns > 0);
+        assert!(metrics.transport_ns > 0);
+
+        let pages = runtime
+            .block_on(engine.get_pages(&harness.actor_id, startup.generation, vec![1, 1024, 2300]))
+            .expect("pages should read back after slow-path commit");
+        let expected_page_1 = vec![1u8; 4096];
+        let expected_page_1024 = vec![(1024 % 251) as u8; 4096];
+        let expected_page_2300 = vec![(2300 % 251) as u8; 4096];
+        assert_eq!(pages.len(), 3);
+        assert_eq!(pages[0].bytes.as_deref(), Some(expected_page_1.as_slice()));
+        assert_eq!(
+            pages[1].bytes.as_deref(),
+            Some(expected_page_1024.as_slice())
+        );
+        assert_eq!(
+            pages[2].bytes.as_deref(),
+            Some(expected_page_2300.as_slice())
+        );
    }
-    #[derive(Default)]
-    struct MemoryKv {
-        stores: Mutex<HashMap<String, HashMap<Vec<u8>, Vec<u8>>>>,
-        op_log: Mutex<HashMap<String, Vec<KvOp>>>,
+    #[test]
+    fn direct_engine_marks_vfs_dead_after_transport_errors() {
+        let runtime = direct_runtime();
+        let harness = DirectEngineHarness::new();
+        let engine = runtime.block_on(harness.open_engine());
+        let startup = runtime.block_on(harness.startup_data(&engine));
+        let transport = SqliteTransport::from_direct(engine);
+        let hooks = transport
+            .direct_hooks()
+            .expect("direct transport should expose test hooks");
+        let vfs = SqliteVfs::register_with_transport(
+            &next_test_name("sqlite-direct-vfs"),
+            transport,
+            harness.actor_id.clone(),
+            runtime.handle().clone(),
+            startup,
+            VfsConfig::default(),
+        )
+        .expect("v2 vfs should register");
+        let db = open_database(vfs, &harness.actor_id).expect("sqlite database should open");
+
+        hooks.fail_next_commit("InjectedTransportError: commit transport dropped");
+        let err = sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE broken (id INTEGER PRIMARY KEY, value TEXT NOT NULL);",
+        )
+        .expect_err("failing transport commit should surface as an IO error");
+        assert!(
+            err.contains("I/O") || err.contains("disk I/O"),
+            "sqlite should surface transport failure as an IO error: {err}",
+        );
+        assert!(
+            direct_vfs_ctx(&db).is_dead(),
+            "transport error should kill the v2 VFS"
+        );
+        assert_eq!(
+            db.take_last_kv_error().as_deref(),
+            Some("InjectedTransportError: commit transport dropped"),
+        );
+        assert!(
+            sqlite_query_i64(db.as_ptr(), "PRAGMA page_count;").is_err(),
+ "subsequent reads should fail once the VFS is dead", + ); + } + + #[test] + fn flush_dirty_pages_marks_vfs_dead_after_transport_error() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let engine = runtime.block_on(harness.open_engine()); + let startup = runtime.block_on(harness.startup_data(&engine)); + let transport = SqliteTransport::from_direct(engine); + let hooks = transport + .direct_hooks() + .expect("direct transport should expose test hooks"); + let vfs = SqliteVfs::register_with_transport( + &next_test_name("sqlite-direct-vfs"), + transport, + harness.actor_id.clone(), + runtime.handle().clone(), + startup, + VfsConfig::default(), + ) + .expect("v2 vfs should register"); + let db = open_database(vfs, &harness.actor_id).expect("sqlite database should open"); + let ctx = direct_vfs_ctx(&db); + + { + let mut state = ctx.state.write(); + state.write_buffer.dirty.insert(1, vec![0x7a; 4096]); + state.db_size_pages = 1; + } + + hooks.fail_next_commit("InjectedTransportError: flush transport dropped"); + let err = ctx + .flush_dirty_pages() + .expect_err("transport failure should bubble out of flush_dirty_pages"); + + assert!( + matches!(err, CommitBufferError::Other(ref message) if message.contains("InjectedTransportError")), + "flush failure should surface as a transport error: {err:?}", + ); + assert!( + ctx.is_dead(), + "flush transport failure should poison the VFS" + ); + assert_eq!( + db.take_last_kv_error().as_deref(), + Some("InjectedTransportError: flush transport dropped"), + ); + } + + #[test] + fn commit_atomic_write_marks_vfs_dead_after_transport_error() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let engine = runtime.block_on(harness.open_engine()); + let startup = runtime.block_on(harness.startup_data(&engine)); + let transport = SqliteTransport::from_direct(engine); + let hooks = transport + .direct_hooks() + .expect("direct transport should expose test hooks"); + let vfs = SqliteVfs::register_with_transport( + &next_test_name("sqlite-direct-vfs"), + transport, + harness.actor_id.clone(), + runtime.handle().clone(), + startup, + VfsConfig::default(), + ) + .expect("v2 vfs should register"); + let db = open_database(vfs, &harness.actor_id).expect("sqlite database should open"); + let ctx = direct_vfs_ctx(&db); + + { + let mut state = ctx.state.write(); + state.write_buffer.in_atomic_write = true; + state.write_buffer.saved_db_size = state.db_size_pages; + state.write_buffer.dirty.insert(1, vec![0x5c; 4096]); + state.db_size_pages = 1; + } + + hooks.fail_next_commit("InjectedTransportError: atomic transport dropped"); + let err = ctx + .commit_atomic_write() + .expect_err("transport failure should bubble out of commit_atomic_write"); + + assert!( + matches!(err, CommitBufferError::Other(ref message) if message.contains("InjectedTransportError")), + "atomic-write failure should surface as a transport error: {err:?}", + ); + assert!( + ctx.is_dead(), + "commit_atomic_write transport failure should poison the VFS", + ); + assert_eq!( + db.take_last_kv_error().as_deref(), + Some("InjectedTransportError: atomic transport dropped"), + ); + } + + #[test] + fn direct_engine_handles_multithreaded_statement_churn() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let db = Arc::new(StdMutex::new(harness.open_db(&runtime))); + + { + let db = db.lock().expect("db mutex should lock"); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY AUTOINCREMENT, value 
TEXT NOT NULL);", + ) + .expect("create table should succeed"); + } + + let mut workers = Vec::new(); + for worker_id in 0..4 { + let db = Arc::clone(&db); + workers.push(thread::spawn(move || { + for idx in 0..40 { + let db = db.lock().expect("db mutex should lock"); + sqlite_step_statement( + db.as_ptr(), + &format!( + "INSERT INTO items (value) VALUES ('worker-{worker_id}-row-{idx}');" + ), + ) + .expect("threaded insert should succeed"); + } + })); + } + for worker in workers { + worker.join().expect("worker thread should finish"); + } + + let db = db.lock().expect("db mutex should lock"); + assert_eq!( + sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM items;") + .expect("threaded row count should succeed"), + 160 + ); + } + + #[test] + fn direct_engine_isolates_two_actors_on_one_shared_engine() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let engine = runtime.block_on(harness.open_engine()); + let actor_a = next_test_name("sqlite-actor-a"); + let actor_b = next_test_name("sqlite-actor-b"); + let db_a = harness.open_db_on_engine( + &runtime, + Arc::clone(&engine), + &actor_a, + VfsConfig::default(), + ); + let db_b = harness.open_db_on_engine(&runtime, engine, &actor_b, VfsConfig::default()); + + sqlite_exec( + db_a.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("actor A create table should succeed"); + sqlite_exec( + db_b.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("actor B create table should succeed"); + sqlite_step_statement( + db_a.as_ptr(), + "INSERT INTO items (id, value) VALUES (1, 'alpha');", + ) + .expect("actor A insert should succeed"); + sqlite_step_statement( + db_b.as_ptr(), + "INSERT INTO items (id, value) VALUES (1, 'beta');", + ) + .expect("actor B insert should succeed"); + + assert_eq!( + sqlite_query_text(db_a.as_ptr(), "SELECT value FROM items WHERE id = 1;") + .expect("actor A select should succeed"), + "alpha" + ); + assert_eq!( + sqlite_query_text(db_b.as_ptr(), "SELECT value FROM items WHERE id = 1;") + .expect("actor B select should succeed"), + "beta" + ); + } + + #[test] + fn direct_engine_hot_row_updates_survive_reopen() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + + { + let db = harness.open_db(&runtime); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE counters (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("create table should succeed"); + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO counters (id, value) VALUES (1, 'v-0');", + ) + .expect("seed row should succeed"); + for i in 1..=150 { + sqlite_step_statement( + db.as_ptr(), + &format!("UPDATE counters SET value = 'v-{i}' WHERE id = 1;"), + ) + .expect("hot-row update should succeed"); + } + } + + let reopened = harness.open_db(&runtime); + assert_eq!( + sqlite_query_text( + reopened.as_ptr(), + "SELECT value FROM counters WHERE id = 1;" + ) + .expect("final value should survive reopen"), + "v-150" + ); + } + + #[test] + fn direct_engine_preserves_mixed_workload_across_sleep_wake() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + + { + let db = harness.open_db(&runtime); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL, status TEXT NOT NULL);", + ) + .expect("create table should succeed"); + for id in 1..=50 { + sqlite_step_statement( + db.as_ptr(), + &format!( + "INSERT INTO items (id, value, status) VALUES ({id}, 'item-{id}', 
'new');" + ), + ) + .expect("seed insert should succeed"); + } + for id in 1..=20 { + sqlite_step_statement( + db.as_ptr(), + &format!( + "UPDATE items SET status = 'updated', value = 'item-{id}-updated' WHERE id = {id};" + ), + ) + .expect("update should succeed"); + } + for id in 41..=50 { + sqlite_step_statement(db.as_ptr(), &format!("DELETE FROM items WHERE id = {id};")) + .expect("delete should succeed"); + } + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO items (id, value, status) VALUES (1000, 'disconnect-write', 'new');", + ) + .expect("disconnect-style write before close should succeed"); + } + + let reopened = harness.open_db(&runtime); + assert_eq!( + sqlite_query_i64(reopened.as_ptr(), "SELECT COUNT(*) FROM items;") + .expect("row count after reopen should succeed"), + 41 + ); + assert_eq!( + sqlite_query_i64( + reopened.as_ptr(), + "SELECT COUNT(*) FROM items WHERE status = 'updated';", + ) + .expect("updated row count should succeed"), + 20 + ); + assert_eq!( + sqlite_query_text( + reopened.as_ptr(), + "SELECT value FROM items WHERE id = 1000;", + ) + .expect("disconnect write should survive reopen"), + "disconnect-write" + ); + } + + #[test] + fn direct_engine_reopens_cleanly_after_failed_migration() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + + { + let db = harness.open_db(&runtime); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("create table should succeed"); + sqlite_exec(db.as_ptr(), "ALTER TABLE items ADD COLUMN;") + .expect_err("broken migration should fail"); + } + + let reopened = harness.open_db(&runtime); + sqlite_step_statement( + reopened.as_ptr(), + "INSERT INTO items (id, value) VALUES (1, 'still-alive');", + ) + .expect("reopened database should still accept writes after migration failure"); + assert_eq!( + sqlite_query_text(reopened.as_ptr(), "SELECT value FROM items WHERE id = 1;") + .expect("select after reopen should succeed"), + "still-alive" + ); + } + + #[test] + fn direct_engine_reads_continue_while_compaction_runs() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let engine = runtime.block_on(harness.open_engine()); + let db = Arc::new(StdMutex::new(harness.open_db_on_engine( + &runtime, + Arc::clone(&engine), + &harness.actor_id, + VfsConfig::default(), + ))); + + { + let db = db.lock().expect("db mutex should lock"); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("create table should succeed"); + for id in 1..=48 { + sqlite_step_statement( + db.as_ptr(), + &format!("INSERT INTO items (id, value) VALUES ({id}, 'row-{id}');"), + ) + .expect("seed insert should succeed"); + } + } + + let keep_reading = Arc::new(AtomicBool::new(true)); + let read_error = Arc::new(StdMutex::new(None::)); + let db_for_reader = Arc::clone(&db); + let keep_reading_for_thread = Arc::clone(&keep_reading); + let read_error_for_thread = Arc::clone(&read_error); + let reader = thread::spawn(move || { + while keep_reading_for_thread.load(AtomicOrdering::Relaxed) { + let db = db_for_reader.lock().expect("db mutex should lock"); + direct_vfs_ctx(&db) + .state + .write() + .page_cache + .invalidate_all(); + if let Err(err) = + sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM items WHERE id >= 1;") + { + *read_error_for_thread + .lock() + .expect("read error mutex should lock") = Some(err); + break; + } + } + }); + + runtime + 
.block_on(engine.compact_worker(&harness.actor_id, 8)) + .expect("compaction should succeed"); + keep_reading.store(false, AtomicOrdering::Relaxed); + reader.join().expect("reader thread should finish"); + + assert!( + read_error + .lock() + .expect("read error mutex should lock") + .is_none(), + "reads should keep working while compaction folds deltas", + ); + let db = db.lock().expect("db mutex should lock"); + assert_eq!( + sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM items;") + .expect("final row count should succeed"), + 48 + ); + } + + #[test] + fn open_database_supports_empty_db_schema_setup() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 2, + ..sqlite_meta(8 * 1024 * 1024) + }, + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 2, + ..sqlite_meta(8 * 1024 * 1024) + }, + }, + ), + )); + protocol.set_mirror_commit_meta(true); + + let vfs = SqliteVfs::register_with_transport( + "test-v2-empty-db", + SqliteTransport::from_mock(protocol.clone()), + "actor".to_string(), + runtime.handle().clone(), + protocol::SqliteStartupData { + generation: 7, + meta: protocol::SqliteMeta { + db_size_pages: 0, + ..sqlite_meta(8 * 1024 * 1024) + }, + preloaded_pages: Vec::new(), + }, + VfsConfig::default(), + ) + .expect("vfs should register"); + let db = open_database(vfs, "actor").expect("db should open"); + + sqlite_exec( + db.as_ptr(), + "CREATE TABLE test (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("schema setup should succeed"); + } + + #[test] + fn open_database_supports_insert_after_pragma_migration() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 32, + ..sqlite_meta(8 * 1024 * 1024) + }, + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 32, + ..sqlite_meta(8 * 1024 * 1024) + }, + }, + ), + )); + + let vfs = SqliteVfs::register_with_transport( + "test-v2-pragma-migration", + SqliteTransport::from_mock(protocol.clone()), + "actor".to_string(), + runtime.handle().clone(), + protocol::SqliteStartupData { + generation: 7, + meta: protocol::SqliteMeta { + db_size_pages: 0, + ..sqlite_meta(8 * 1024 * 1024) + }, + preloaded_pages: Vec::new(), + }, + VfsConfig::default(), + ) + .expect("vfs should register"); + let db = open_database(vfs, "actor").expect("db should open"); + + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);", + ) + .expect("create table should succeed"); + sqlite_exec( + db.as_ptr(), + "ALTER TABLE items ADD COLUMN status TEXT NOT NULL DEFAULT 'active';", + ) + .expect("alter table should 
succeed"); + sqlite_exec(db.as_ptr(), "PRAGMA user_version = 2;").expect("pragma should succeed"); + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO items (name) VALUES ('test-item');", + ) + .expect("insert after pragma migration should succeed"); + } + + #[test] + fn open_database_supports_explicit_status_insert_after_pragma_migration() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 32, + ..sqlite_meta(8 * 1024 * 1024) + }, + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 32, + ..sqlite_meta(8 * 1024 * 1024) + }, + }, + ), + )); + protocol.set_mirror_commit_meta(true); + + let vfs = SqliteVfs::register_with_transport( + "test-v2-pragma-explicit", + SqliteTransport::from_mock(protocol), + "actor".to_string(), + runtime.handle().clone(), + protocol::SqliteStartupData { + generation: 7, + meta: protocol::SqliteMeta { + db_size_pages: 0, + ..sqlite_meta(8 * 1024 * 1024) + }, + preloaded_pages: Vec::new(), + }, + VfsConfig::default(), + ) + .expect("vfs should register"); + let db = open_database(vfs, "actor").expect("db should open"); + + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);", + ) + .expect("create table should succeed"); + sqlite_exec( + db.as_ptr(), + "ALTER TABLE items ADD COLUMN status TEXT NOT NULL DEFAULT 'active';", + ) + .expect("alter table should succeed"); + sqlite_exec(db.as_ptr(), "PRAGMA user_version = 2;").expect("pragma should succeed"); + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO items (name, status) VALUES ('done-item', 'completed');", + ) + .expect("explicit status insert should succeed"); + } + + #[test] + fn open_database_supports_hot_row_update_churn() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 128, + ..sqlite_meta(8 * 1024 * 1024) + }, + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 128, + ..sqlite_meta(8 * 1024 * 1024) + }, + }, + ), + )); + protocol.set_mirror_commit_meta(true); + + let vfs = SqliteVfs::register_with_transport( + "test-v2-hot-row-updates", + SqliteTransport::from_mock(protocol), + "actor".to_string(), + runtime.handle().clone(), + protocol::SqliteStartupData { + generation: 7, + meta: protocol::SqliteMeta { + db_size_pages: 0, + ..sqlite_meta(8 * 1024 * 1024) + }, + preloaded_pages: Vec::new(), + }, + VfsConfig::default(), + ) + .expect("vfs should register"); + let db = open_database(vfs, "actor").expect("db should open"); + + sqlite_exec( + db.as_ptr(), + "CREATE TABLE test_data (id INTEGER PRIMARY KEY AUTOINCREMENT, value TEXT NOT NULL, payload TEXT 
NOT NULL DEFAULT '', created_at INTEGER NOT NULL);", + ) + .expect("create table should succeed"); + for i in 0..10 { + sqlite_step_statement( + db.as_ptr(), + &format!( + "INSERT INTO test_data (value, payload, created_at) VALUES ('init-{i}', '', 1);" + ), + ) + .expect("seed insert should succeed"); + } + for i in 0..240 { + let row_id = i % 10 + 1; + sqlite_step_statement( + db.as_ptr(), + &format!("UPDATE test_data SET value = 'v-{i}' WHERE id = {row_id};"), + ) + .expect("hot-row update should succeed"); + } + } + + #[test] + fn open_database_supports_cross_thread_exec_sequence() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 32, + ..sqlite_meta(8 * 1024 * 1024) + }, + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: protocol::SqliteMeta { + db_size_pages: 32, + ..sqlite_meta(8 * 1024 * 1024) + }, + }, + ), + )); + protocol.set_mirror_commit_meta(true); + + let vfs = SqliteVfs::register_with_transport( + "test-v2-cross-thread", + SqliteTransport::from_mock(protocol), + "actor".to_string(), + runtime.handle().clone(), + protocol::SqliteStartupData { + generation: 7, + meta: protocol::SqliteMeta { + db_size_pages: 0, + ..sqlite_meta(8 * 1024 * 1024) + }, + preloaded_pages: Vec::new(), + }, + VfsConfig::default(), + ) + .expect("vfs should register"); + let db = Arc::new(StdMutex::new( + open_database(vfs, "actor").expect("db should open"), + )); + + { + let db = db.clone(); + thread::spawn(move || { + let db = db.lock().expect("db mutex should lock"); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE items (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL);", + ) + .expect("create table should succeed"); + sqlite_exec( + db.as_ptr(), + "ALTER TABLE items ADD COLUMN status TEXT NOT NULL DEFAULT 'active';", + ) + .expect("alter table should succeed"); + sqlite_exec(db.as_ptr(), "PRAGMA user_version = 2;") + .expect("pragma should succeed"); + }) + .join() + .expect("migration thread should finish"); + } + + thread::spawn(move || { + let db = db.lock().expect("db mutex should lock"); + sqlite_step_statement( + db.as_ptr(), + "INSERT INTO items (name) VALUES ('test-item');", + ) + .expect("cross-thread insert should succeed"); + }) + .join() + .expect("insert thread should finish"); + } + + #[test] + fn aux_files_are_shared_by_path_until_deleted() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }, + ), + )); + let ctx = VfsContext::new( + "actor".to_string(), + runtime.handle().clone(), + SqliteTransport::from_mock(protocol), + protocol::SqliteStartupData { + generation: 7, + meta: 
sqlite_meta(8 * 1024 * 1024), + preloaded_pages: Vec::new(), + }, + VfsConfig::default(), + unsafe { std::mem::zeroed() }, + ); + + let first = ctx.open_aux_file("actor-journal"); + first.bytes.lock().extend_from_slice(&[1, 2, 3, 4]); + let second = ctx.open_aux_file("actor-journal"); + assert_eq!(*second.bytes.lock(), vec![1, 2, 3, 4]); + assert!(ctx.aux_file_exists("actor-journal")); + + ctx.delete_aux_file("actor-journal"); + assert!(!ctx.aux_file_exists("actor-journal")); + assert!(ctx.open_aux_file("actor-journal").bytes.lock().is_empty()); + } + + #[test] + fn truncate_main_file_discards_pages_beyond_eof() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }, + ), + )); + let ctx = VfsContext::new( + "actor".to_string(), + runtime.handle().clone(), + SqliteTransport::from_mock(protocol), + protocol::SqliteStartupData { + generation: 7, + meta: protocol::SqliteMeta { + db_size_pages: 4, + ..sqlite_meta(8 * 1024 * 1024) + }, + preloaded_pages: vec![ + protocol::SqliteFetchedPage { + pgno: 1, + bytes: Some(vec![1; 4096]), + }, + protocol::SqliteFetchedPage { + pgno: 4, + bytes: Some(vec![4; 4096]), + }, + ], + }, + VfsConfig::default(), + unsafe { std::mem::zeroed() }, + ); + { + let mut state = ctx.state.write(); + state.write_buffer.dirty.insert(3, vec![3; 4096]); + state.write_buffer.dirty.insert(4, vec![4; 4096]); + } + + ctx.truncate_main_file(2 * 4096); + + let state = ctx.state.read(); + assert_eq!(state.db_size_pages, 2); + assert!(!state.write_buffer.dirty.contains_key(&3)); + assert!(!state.write_buffer.dirty.contains_key(&4)); + assert!(state.page_cache.get(&4).is_none()); + } + + #[test] + fn resolve_pages_does_not_rewind_meta_on_stale_response() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let mut protocol = MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }, + ), + ); + protocol.get_pages_response = + protocol::SqliteGetPagesResponse::SqliteGetPagesOk(protocol::SqliteGetPagesOk { + pages: vec![protocol::SqliteFetchedPage { + pgno: 2, + bytes: Some(vec![2; 4096]), + }], + meta: protocol::SqliteMeta { + head_txid: 1, + db_size_pages: 1, + max_delta_bytes: 32 * 1024 * 1024, + ..sqlite_meta(8 * 1024 * 1024) + }, + }); + let ctx = VfsContext::new( + "actor".to_string(), + runtime.handle().clone(), + SqliteTransport::from_mock(Arc::new(protocol)), + protocol::SqliteStartupData { + generation: 7, + meta: protocol::SqliteMeta { + head_txid: 3, + db_size_pages: 3, + ..sqlite_meta(8 * 1024 * 1024) + }, + preloaded_pages: vec![protocol::SqliteFetchedPage { + pgno: 1, + bytes: Some(vec![1; 
4096]), + }], + }, + VfsConfig::default(), + unsafe { std::mem::zeroed() }, + ); + + let resolved = ctx + .resolve_pages(&[2], false) + .expect("missing page should resolve"); + + assert_eq!(resolved.get(&2), Some(&Some(vec![2; 4096]))); + let state = ctx.state.read(); + assert_eq!(state.head_txid, 3); + assert_eq!(state.db_size_pages, 3); + assert_eq!(state.max_delta_bytes, 32 * 1024 * 1024); + } + + #[test] + fn resolve_pages_does_not_shrink_db_size_pages_on_same_head_response() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let mut protocol = MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }, + ), + ); + protocol.get_pages_response = + protocol::SqliteGetPagesResponse::SqliteGetPagesOk(protocol::SqliteGetPagesOk { + pages: vec![protocol::SqliteFetchedPage { + pgno: 4, + bytes: Some(vec![4; 4096]), + }], + meta: protocol::SqliteMeta { + head_txid: 3, + db_size_pages: 1, + max_delta_bytes: 16 * 1024 * 1024, + ..sqlite_meta(8 * 1024 * 1024) + }, + }); + let ctx = VfsContext::new( + "actor".to_string(), + runtime.handle().clone(), + SqliteTransport::from_mock(Arc::new(protocol)), + protocol::SqliteStartupData { + generation: 7, + meta: protocol::SqliteMeta { + head_txid: 3, + db_size_pages: 4, + ..sqlite_meta(8 * 1024 * 1024) + }, + preloaded_pages: vec![protocol::SqliteFetchedPage { + pgno: 1, + bytes: Some(vec![1; 4096]), + }], + }, + VfsConfig::default(), + unsafe { std::mem::zeroed() }, + ); + + let resolved = ctx + .resolve_pages(&[4], false) + .expect("missing page should resolve"); + + assert_eq!(resolved.get(&4), Some(&Some(vec![4; 4096]))); + let state = ctx.state.read(); + assert_eq!(state.head_txid, 3); + assert_eq!(state.db_size_pages, 4); + assert_eq!(state.max_delta_bytes, 16 * 1024 * 1024); + } + + #[test] + fn commit_buffered_pages_uses_fast_path() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitOk(protocol::SqliteCommitOk { + new_head_txid: 13, + meta: sqlite_meta(8 * 1024 * 1024), + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 14, + meta: sqlite_meta(8 * 1024 * 1024), + }, + ), + )); + + let outcome = runtime + .block_on(commit_buffered_pages( + &SqliteTransport::from_mock(protocol.clone()), + BufferedCommitRequest { + actor_id: "actor".to_string(), + generation: 7, + expected_head_txid: 12, + new_db_size_pages: 1, + max_delta_bytes: 8 * 1024 * 1024, + max_pages_per_stage: 4_000, + dirty_pages: dirty_pages(1, 9), + }, + )) + .expect("fast-path commit should succeed"); + let (outcome, metrics) = outcome; + + assert_eq!(outcome.path, CommitPath::Fast); + assert_eq!(outcome.new_head_txid, 13); + assert!(metrics.serialize_ns > 0); + assert!(metrics.transport_ns > 0); + assert_eq!(protocol.commit_requests().len(), 1); + 
assert!(protocol.stage_requests().is_empty()); + assert!(protocol.finalize_requests().is_empty()); + } + + #[test] + fn commit_buffered_pages_falls_back_to_slow_path() { + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build"); + let protocol = Arc::new(MockProtocol::new( + protocol::SqliteCommitResponse::SqliteCommitTooLarge(protocol::SqliteCommitTooLarge { + actual_size_bytes: 3 * 4096, + max_size_bytes: 4096, + }), + protocol::SqliteCommitStageResponse::SqliteCommitStageOk( + protocol::SqliteCommitStageOk { + chunk_idx_committed: 0, + }, + ), + protocol::SqliteCommitFinalizeResponse::SqliteCommitFinalizeOk( + protocol::SqliteCommitFinalizeOk { + new_head_txid: 14, + meta: sqlite_meta(4096), + }, + ), + )); + + let protocol_for_release = protocol.clone(); + let release = std::thread::spawn(move || { + runtime.block_on(async { + protocol_for_release.finalize_started.notified().await; + assert_eq!(protocol_for_release.awaited_stage_responses(), 0); + protocol_for_release.release_finalize.notify_one(); + }); + }); + + let outcome = Builder::new_current_thread() + .enable_all() + .build() + .expect("runtime should build") + .block_on(commit_buffered_pages( + &SqliteTransport::from_mock(protocol.clone()), + BufferedCommitRequest { + actor_id: "actor".to_string(), + generation: 7, + expected_head_txid: 12, + new_db_size_pages: 3, + max_delta_bytes: 4096, + max_pages_per_stage: 1, + dirty_pages: dirty_pages(3, 4), + }, + )) + .expect("slow-path commit should succeed"); + let (outcome, metrics) = outcome; + + release.join().expect("release thread should finish"); + + assert_eq!(outcome.path, CommitPath::Slow); + assert_eq!(outcome.new_head_txid, 14); + assert!(metrics.serialize_ns > 0); + assert!(metrics.transport_ns > 0); + assert!(protocol.commit_requests().is_empty()); + assert!(!protocol.stage_requests().is_empty()); + assert!(protocol + .stage_requests() + .iter() + .enumerate() + .all(|(chunk_idx, request)| request.chunk_idx as usize == chunk_idx)); + assert!(protocol + .stage_requests() + .last() + .is_some_and(|request| request.is_last)); + assert_eq!(protocol.awaited_stage_responses(), 0); + assert_eq!(protocol.finalize_requests().len(), 1); + } + + #[test] + fn vfs_records_commit_phase_durations() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let db = harness.open_db(&runtime); + let ctx = direct_vfs_ctx(&db); + + sqlite_exec( + db.as_ptr(), + "CREATE TABLE metrics_test (id INTEGER PRIMARY KEY, value TEXT NOT NULL);", + ) + .expect("create table should succeed"); + + let relaxed = std::sync::atomic::Ordering::Relaxed; + ctx.commit_request_build_ns.store(0, relaxed); + ctx.commit_serialize_ns.store(0, relaxed); + ctx.commit_transport_ns.store(0, relaxed); + ctx.commit_state_update_ns.store(0, relaxed); + ctx.commit_duration_ns_total.store(0, relaxed); + ctx.commit_total.store(0, relaxed); + + sqlite_exec( + db.as_ptr(), + "INSERT INTO metrics_test (id, value) VALUES (1, 'hello');", + ) + .expect("insert should succeed"); + + let metrics = db.sqlite_vfs_metrics(); + assert_eq!(metrics.commit_count, 1); + assert!(metrics.request_build_ns > 0); + assert!(metrics.serialize_ns > 0); + assert!(metrics.transport_ns > 0); + assert!(metrics.state_update_ns > 0); + assert!(metrics.total_ns >= metrics.request_build_ns); + assert!(metrics.request_build_ns + metrics.transport_ns + metrics.state_update_ns > 0); } - impl MemoryKv { - fn new() -> Self { - Self::default() - } + #[test] + fn 
profile_large_tx_insert_5mb() {
+        // 5MB = 1280 rows x 4KB blobs in one transaction
+        let runtime = direct_runtime();
+        let harness = DirectEngineHarness::new();
+        let db = harness.open_db(&runtime);
+        let ctx = direct_vfs_ctx(&db);
+
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE bench (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);",
+        )
+        .expect("create table should succeed");
-        fn record_op(&self, actor_id: &str, op: KvOp) {
-            let mut op_log = self.op_log.lock().unwrap();
-            op_log.entry(actor_id.to_string()).or_default().push(op);
-        }
+        let relaxed = std::sync::atomic::Ordering::Relaxed;
+        ctx.resolve_pages_total.store(0, relaxed);
+        ctx.resolve_pages_cache_hits.store(0, relaxed);
+        ctx.resolve_pages_fetches.store(0, relaxed);
+        ctx.pages_fetched_total.store(0, relaxed);
+        ctx.prefetch_pages_total.store(0, relaxed);
+        ctx.commit_total.store(0, relaxed);
-        fn snapshot_actor(&self, actor_id: &str) -> HashMap<Vec<u8>, Vec<u8>> {
-            self.stores
-                .lock()
-                .unwrap()
-                .get(actor_id)
-                .cloned()
-                .unwrap_or_default()
+        let start = std::time::Instant::now();
+        sqlite_exec(db.as_ptr(), "BEGIN;").expect("begin");
+        for i in 0..1280 {
+            sqlite_step_statement(
+                db.as_ptr(),
+                &format!(
+                    "INSERT INTO bench (id, payload) VALUES ({}, randomblob(4096));",
+                    i
+                ),
+            )
+            .expect("insert should succeed");
        }
+        sqlite_exec(db.as_ptr(), "COMMIT;").expect("commit");
+        let elapsed = start.elapsed();
-        fn op_log(&self, actor_id: &str) -> Vec<KvOp> {
-            self.op_log
-                .lock()
-                .unwrap()
-                .get(actor_id)
-                .cloned()
-                .unwrap_or_default()
-        }
+        let resolve_total = ctx.resolve_pages_total.load(relaxed);
+        let cache_hits = ctx.resolve_pages_cache_hits.load(relaxed);
+        let fetches = ctx.resolve_pages_fetches.load(relaxed);
+        let pages_fetched = ctx.pages_fetched_total.load(relaxed);
+        let prefetch = ctx.prefetch_pages_total.load(relaxed);
+        let commits = ctx.commit_total.load(relaxed);
+
+        eprintln!("=== 5MB INSERT PROFILE (1280 rows x 4KB) ===");
+        eprintln!(" wall clock: {:?}", elapsed);
+        eprintln!(" resolve_pages calls: {}", resolve_total);
+        eprintln!(" cache hits (pages): {}", cache_hits);
+        eprintln!(" engine fetches: {}", fetches);
+        eprintln!(" pages fetched total: {}", pages_fetched);
+        eprintln!(" prefetch pages: {}", prefetch);
+        eprintln!(" commits: {}", commits);
+        eprintln!("============================================");
+
+        // In a single transaction, all 1280 row writes are to new pages.
+        // Only the single commit at the end should hit the engine.
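+        // (Dirty pages accumulate in the VFS write buffer during the
+        // transaction and COMMIT flushes them through commit_buffered_pages
+        // in one engine round trip, which is what the assertions below pin.)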
+        assert_eq!(
+            fetches, 0,
+            "expected 0 engine fetches during 5MB insert transaction"
+        );
+        assert_eq!(
+            commits, 1,
+            "expected exactly 1 commit for transactional insert"
+        );
-        fn journal_was_used(&self, actor_id: &str) -> bool {
-            self.op_log(actor_id).iter().any(|op| match op {
-                KvOp::Get { keys } | KvOp::Put { keys } | KvOp::Delete { keys } => keys
-                    .iter()
-                    .any(|key| key_file_tag(key.as_slice()) == Some(kv::FILE_TAG_JOURNAL)),
-                KvOp::DeleteRange { start, end } => {
-                    key_file_tag(start.as_slice()) == Some(kv::FILE_TAG_JOURNAL)
-                        || key_file_tag(end.as_slice()) == Some(kv::FILE_TAG_JOURNAL)
-                }
-            })
-        }
+        let count = sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM bench;")
+            .expect("count should succeed");
+        assert_eq!(count, 1280);
    }
-    #[async_trait::async_trait]
-    impl SqliteKv for MemoryKv {
-        async fn batch_get(
-            &self,
-            actor_id: &str,
-            keys: Vec<Vec<u8>>,
-        ) -> Result<KvGetResult, SqliteKvError> {
-            self.record_op(actor_id, KvOp::Get { keys: keys.clone() });
-
-            let store_guard = self.stores.lock().unwrap();
-            let actor_store = store_guard.get(actor_id);
-            let mut found_keys = Vec::new();
-            let mut found_values = Vec::new();
-            for key in keys {
-                if let Some(value) = actor_store.and_then(|store| store.get(&key)) {
-                    found_keys.push(key);
-                    found_values.push(value.clone());
-                }
-            }
+    #[test]
+    fn profile_hot_row_updates() {
+        // 100 updates to the same row - this is the autocommit case
+        let runtime = direct_runtime();
+        let harness = DirectEngineHarness::new();
+        let db = harness.open_db(&runtime);
+        let ctx = direct_vfs_ctx(&db);
+
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE counter (id INTEGER PRIMARY KEY, value INTEGER NOT NULL);",
+        )
+        .expect("create");
+        sqlite_exec(db.as_ptr(), "INSERT INTO counter VALUES (1, 0);").expect("insert");
-            Ok(KvGetResult {
-                keys: found_keys,
-                values: found_values,
-            })
+        let relaxed = std::sync::atomic::Ordering::Relaxed;
+        ctx.resolve_pages_total.store(0, relaxed);
+        ctx.resolve_pages_cache_hits.store(0, relaxed);
+        ctx.resolve_pages_fetches.store(0, relaxed);
+        ctx.pages_fetched_total.store(0, relaxed);
+        ctx.prefetch_pages_total.store(0, relaxed);
+        ctx.commit_total.store(0, relaxed);
+
+        let start = std::time::Instant::now();
+        for _ in 0..100 {
+            sqlite_exec(
+                db.as_ptr(),
+                "UPDATE counter SET value = value + 1 WHERE id = 1;",
+            )
+            .expect("update");
        }
+        let elapsed = start.elapsed();
-        async fn batch_put(
-            &self,
-            actor_id: &str,
-            keys: Vec<Vec<u8>>,
-            values: Vec<Vec<u8>>,
-        ) -> Result<(), SqliteKvError> {
-            if keys.len() != values.len() {
-                return Err(SqliteKvError::new("keys and values length mismatch"));
-            }
+        let fetches = ctx.resolve_pages_fetches.load(relaxed);
+        let commits = ctx.commit_total.load(relaxed);
-            self.record_op(actor_id, KvOp::Put { keys: keys.clone() });
+        eprintln!("=== 100 HOT ROW UPDATES (autocommit) ===");
+        eprintln!(" wall clock: {:?}", elapsed);
+        eprintln!(
+            " resolve_pages calls: {}",
+            ctx.resolve_pages_total.load(relaxed)
+        );
+        eprintln!(
+            " cache hits (pages): {}",
+            ctx.resolve_pages_cache_hits.load(relaxed)
+        );
+        eprintln!(" engine fetches: {}", fetches);
+        eprintln!(
+            " pages fetched total: {}",
+            ctx.pages_fetched_total.load(relaxed)
+        );
+        eprintln!(
+            " prefetch pages: {}",
+            ctx.prefetch_pages_total.load(relaxed)
+        );
+        eprintln!(" commits: {}", commits);
+        eprintln!("=========================================");
-            let mut stores = self.stores.lock().unwrap();
-            let actor_store = stores.entry(actor_id.to_string()).or_default();
-            for (key, value) in keys.into_iter().zip(values.into_iter()) {
-                actor_store.insert(key, value);
-            }
+        // Hot row updates: each update modifies the same page. Pages already
+        // in write_buffer or cache should not need re-fetching. With the
+        // counter's page(s) already warm, subsequent updates should be
+        // 100% cache hits (0 fetches). Autocommit means 100 separate commits.
+        assert_eq!(
+            fetches, 0,
+            "expected 0 engine fetches for 100 hot row updates"
+        );
+        assert_eq!(
+            commits, 100,
+            "expected 100 commits (autocommit per statement)"
+        );
+    }
-            Ok(())
+    #[test]
+    fn profile_large_tx_insert_1mb_preloaded() {
+        // Same as the 1MB test but preload all pages first to see commit-only cost
+        let runtime = direct_runtime();
+        let harness = DirectEngineHarness::new();
+        let engine = runtime.block_on(harness.open_engine());
+        let actor_id = &harness.actor_id;
+
+        // First pass: create and populate the table to generate pages
+        let db1 =
+            harness.open_db_on_engine(&runtime, engine.clone(), actor_id, VfsConfig::default());
+        sqlite_exec(
+            db1.as_ptr(),
+            "CREATE TABLE bench (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);",
+        )
+        .expect("create table should succeed");
+        sqlite_exec(db1.as_ptr(), "BEGIN;").expect("begin");
+        for i in 0..256 {
+            sqlite_step_statement(
+                db1.as_ptr(),
+                &format!(
+                    "INSERT INTO bench (id, payload) VALUES ({}, randomblob(4096));",
+                    i
+                ),
+            )
+            .expect("insert should succeed");
        }
+        sqlite_exec(db1.as_ptr(), "COMMIT;").expect("commit");
+        drop(db1);
+
+        // Second pass: reopen with warm cache (takeover preloads page 1, rest from reads)
+        let db2 =
+            harness.open_db_on_engine(&runtime, engine.clone(), actor_id, VfsConfig::default());
+        let ctx = direct_vfs_ctx(&db2);
+
+        // Warm the cache by reading everything
+        sqlite_exec(db2.as_ptr(), "SELECT COUNT(*) FROM bench;").expect("count");
+
+        // Reset counters
+        let relaxed = std::sync::atomic::Ordering::Relaxed;
+        ctx.resolve_pages_total.store(0, relaxed);
+        ctx.resolve_pages_cache_hits.store(0, relaxed);
+        ctx.resolve_pages_fetches.store(0, relaxed);
+        ctx.pages_fetched_total.store(0, relaxed);
+        ctx.prefetch_pages_total.store(0, relaxed);
+        ctx.commit_total.store(0, relaxed);
-        async fn batch_delete(
-            &self,
-            actor_id: &str,
-            keys: Vec<Vec<u8>>,
-        ) -> Result<(), SqliteKvError> {
-            self.record_op(actor_id, KvOp::Delete { keys: keys.clone() });
+        let start = std::time::Instant::now();
+        sqlite_exec(db2.as_ptr(), "BEGIN;").expect("begin");
+        for i in 256..512 {
+            sqlite_step_statement(
+                db2.as_ptr(),
+                &format!(
+                    "INSERT INTO bench (id, payload) VALUES ({}, randomblob(4096));",
+                    i
+                ),
+            )
+            .expect("insert should succeed");
+        }
+        sqlite_exec(db2.as_ptr(), "COMMIT;").expect("commit");
+        let elapsed = start.elapsed();
-            let mut stores = self.stores.lock().unwrap();
-            let actor_store = stores.entry(actor_id.to_string()).or_default();
-            for key in keys {
-                actor_store.remove(&key);
-            }
+        let resolve_total = ctx.resolve_pages_total.load(relaxed);
+        let cache_hits = ctx.resolve_pages_cache_hits.load(relaxed);
+        let fetches = ctx.resolve_pages_fetches.load(relaxed);
+        let pages_fetched = ctx.pages_fetched_total.load(relaxed);
+        let prefetch = ctx.prefetch_pages_total.load(relaxed);
+        let commits = ctx.commit_total.load(relaxed);
+
+        eprintln!("=== 1MB INSERT PROFILE (WARM CACHE) ===");
+        eprintln!(" wall clock: {:?}", elapsed);
+        eprintln!(" resolve_pages calls: {}", resolve_total);
+        eprintln!(" cache hits (pages): {}", cache_hits);
+        eprintln!(" engine fetches: {}", fetches);
+        eprintln!(" pages fetched total: {}", pages_fetched);
+        eprintln!(" prefetch pages: {}", prefetch);
+        eprintln!(" commits: {}", commits);
+        eprintln!("========================================");
+
+        // Second 256-row transaction into the already-populated table.
+        // All new pages are beyond db_size_pages, so no engine fetches.
+        assert_eq!(
+            fetches, 0,
+            "expected 0 engine fetches during warm 1MB insert"
+        );
+        assert_eq!(
+            commits, 1,
+            "expected exactly 1 commit for transactional insert"
+        );
-            Ok(())
-        }
+        let count = sqlite_query_i64(db2.as_ptr(), "SELECT COUNT(*) FROM bench;")
+            .expect("count should succeed");
+        assert_eq!(count, 512);
+    }
-        async fn delete_range(
-            &self,
-            actor_id: &str,
-            start: Vec<u8>,
-            end: Vec<u8>,
-        ) -> Result<(), SqliteKvError> {
-            self.record_op(
-                actor_id,
-                KvOp::DeleteRange {
-                    start: start.clone(),
-                    end: end.clone(),
-                },
-            );
+    #[test]
+    fn profile_large_tx_insert_1mb() {
+        let runtime = direct_runtime();
+        let harness = DirectEngineHarness::new();
+        let db = harness.open_db(&runtime);
+        let ctx = direct_vfs_ctx(&db);
+
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE bench (id INTEGER PRIMARY KEY, payload BLOB NOT NULL);",
+        )
+        .expect("create table should succeed");
+
+        // Reset counters after schema setup
+        ctx.resolve_pages_total
+            .store(0, std::sync::atomic::Ordering::Relaxed);
+        ctx.resolve_pages_cache_hits
+            .store(0, std::sync::atomic::Ordering::Relaxed);
+        ctx.resolve_pages_fetches
+            .store(0, std::sync::atomic::Ordering::Relaxed);
+        ctx.pages_fetched_total
+            .store(0, std::sync::atomic::Ordering::Relaxed);
+        ctx.prefetch_pages_total
+            .store(0, std::sync::atomic::Ordering::Relaxed);
+        ctx.commit_total
+            .store(0, std::sync::atomic::Ordering::Relaxed);
-            let mut stores = self.stores.lock().unwrap();
-            let actor_store = stores.entry(actor_id.to_string()).or_default();
-            actor_store.retain(|key, _| {
-                !(key.as_slice() >= start.as_slice() && key.as_slice() < end.as_slice())
-            });
+        let start = std::time::Instant::now();
-            Ok(())
+        sqlite_exec(db.as_ptr(), "BEGIN;").expect("begin should succeed");
+        for i in 0..256 {
+            sqlite_step_statement(
+                db.as_ptr(),
+                &format!(
+                    "INSERT INTO bench (id, payload) VALUES ({}, randomblob(4096));",
+                    i
+                ),
+            )
+            .expect("insert should succeed");
        }
-    }
+        sqlite_exec(db.as_ptr(), "COMMIT;").expect("commit should succeed");
-    fn next_test_name(prefix: &str) -> String {
-        let id = TEST_ID.fetch_add(1, Ordering::Relaxed);
-        format!("{prefix}-{id}")
+        let elapsed = start.elapsed();
+        let relaxed = std::sync::atomic::Ordering::Relaxed;
+
+        let resolve_total = ctx.resolve_pages_total.load(relaxed);
+        let cache_hits = ctx.resolve_pages_cache_hits.load(relaxed);
+        let fetches = ctx.resolve_pages_fetches.load(relaxed);
+        let pages_fetched = ctx.pages_fetched_total.load(relaxed);
+        let prefetch = ctx.prefetch_pages_total.load(relaxed);
+        let commits = ctx.commit_total.load(relaxed);
+
+        eprintln!("=== 1MB INSERT PROFILE (256 rows x 4KB) ===");
+        eprintln!(" wall clock: {:?}", elapsed);
+        eprintln!(" resolve_pages calls: {}", resolve_total);
+        eprintln!(" cache hits (pages): {}", cache_hits);
+        eprintln!(" engine fetches: {}", fetches);
+        eprintln!(" pages fetched total: {}", pages_fetched);
+        eprintln!(" prefetch pages: {}", prefetch);
+        eprintln!(" commits: {}", commits);
+        eprintln!("============================================");
+
+        // Assert expected zero-fetch behavior: in a single transaction,
+        // all writes are to new pages, so no engine fetches should happen.
+        // Only the single commit at the end should hit the engine.
+        assert_eq!(
+            fetches, 0,
+            "expected 0 engine fetches during 1MB insert transaction"
+        );
+        assert_eq!(
+            commits, 1,
+            "expected exactly 1 commit for transactional insert"
+        );
+
+        let count = sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM bench;")
+            .expect("count should succeed");
+        assert_eq!(count, 256);
    }
-    fn with_test_db(test_fn: impl FnOnce(*mut sqlite3, Arc<MemoryKv>, &str)) {
-        let runtime = tokio::runtime::Builder::new_current_thread()
-            .build()
-            .unwrap();
-        let kv = Arc::new(MemoryKv::new());
-        let actor_id = next_test_name("sqlite-native-test");
-        let vfs_name = next_test_name("sqlite-native-vfs");
-        let vfs = KvVfs::register(
-            &vfs_name,
-            kv.clone(),
-            actor_id.clone(),
-            runtime.handle().clone(),
-            Vec::new(),
+    // Regression test for fence mismatch during rapid autocommit inserts.
+    // Each autocommit INSERT is its own transaction. This test drives many
+    // sequential commits through the VFS and verifies they all succeed.
+    #[test]
+    fn autocommit_inserts_maintain_head_txid_consistency() {
+        let runtime = direct_runtime();
+        let harness = DirectEngineHarness::new();
+        let db = harness.open_db(&runtime);
+        let ctx = direct_vfs_ctx(&db);
+
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);",
        )
-        .unwrap();
-        let db = open_database(vfs, &actor_id).unwrap();
+        .expect("create table should succeed");
+
+        let relaxed = std::sync::atomic::Ordering::Relaxed;
+        ctx.commit_total.store(0, relaxed);
+
+        // 100 sequential autocommit inserts. If fence mismatch is the bug,
+        // this will fail partway through with "commit head_txid X did not
+        // match current head_txid X-1".
+        for i in 0..100 {
+            sqlite_exec(
+                db.as_ptr(),
+                &format!("INSERT INTO t (id, v) VALUES ({i}, {});", i * 2),
+            )
+            .expect("autocommit insert should not fence-mismatch");
+        }
+
+        let commits = ctx.commit_total.load(relaxed);
+        // Each autocommit INSERT = 1 commit. CREATE TABLE was 1 more.
+        // We reset commit_total after CREATE, so expect 100.
+        assert_eq!(commits, 100, "expected exactly 100 commits");
-        test_fn(db.as_ptr(), kv, &actor_id);
+        let count =
+            sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM t;").expect("count should succeed");
+        assert_eq!(count, 100);
-        drop(db);
-        drop(runtime);
+        // Verify the sum to make sure data is correct and not corrupted
+        let sum =
+            sqlite_query_i64(db.as_ptr(), "SELECT SUM(v) FROM t;").expect("sum should succeed");
+        assert_eq!(sum, (0..100).map(|i| i * 2).sum::<i64>());
    }
-    fn exec_sql(db: *mut sqlite3, sql: &str) {
-        let c_sql = CString::new(sql).unwrap();
-        let mut err_msg = ptr::null_mut();
-        let rc = unsafe { sqlite3_exec(db, c_sql.as_ptr(), None, ptr::null_mut(), &mut err_msg) };
-        if rc != SQLITE_OK {
-            let message = if err_msg.is_null() {
-                format!("sqlite error {rc}")
-            } else {
-                let message = unsafe { CStr::from_ptr(err_msg) }
-                    .to_string_lossy()
-                    .into_owned();
-                unsafe { sqlite3_free(err_msg as *mut c_void) };
-                message
-            };
-            panic!("sqlite3_exec failed for `{sql}`: {message}");
+    // Regression test: 5 actors run 200 autocommits each on the same engine.
+    // Compaction is triggered via the mpsc channel after each commit, so this
+    // also exercises the commit-vs-compaction race that caused fence rewinds
+    // before the tx_get_value_serializable fix.
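+    // Each autocommit statement is an implicit BEGIN/COMMIT of its own, so
+    // head_txid has to advance by exactly one per INSERT for every statement
+    // in these stress tests to land.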
+ #[test] + fn stress_concurrent_multi_actor_autocommits() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let engine = runtime.block_on(harness.open_engine()); + + let mut dbs = Vec::new(); + for i in 0..5 { + let actor_id = format!("{}-stress-{}", harness.actor_id, i); + let db = harness.open_db_on_engine( + &runtime, + engine.clone(), + &actor_id, + VfsConfig::default(), + ); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);", + ) + .expect("create"); + dbs.push(db); } - } - fn query_i64(db: *mut sqlite3, sql: &str) -> i64 { - let c_sql = CString::new(sql).unwrap(); - let mut stmt = ptr::null_mut(); - let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) }; - assert_eq!(rc, SQLITE_OK, "failed to prepare `{sql}`"); - assert!( - !stmt.is_null(), - "sqlite returned a null statement for `{sql}`" - ); + // Interleave 200 autocommit inserts across all 5 actors + for i in 0..200 { + for db in &dbs { + sqlite_exec( + db.as_ptr(), + &format!("INSERT INTO t (id, v) VALUES ({i}, {i});"), + ) + .expect("insert"); + } + } - let step_rc = unsafe { sqlite3_step(stmt) }; - assert_eq!(step_rc, SQLITE_ROW, "expected a row from `{sql}`"); - let value = unsafe { sqlite3_column_int64(stmt, 0) }; - let done_rc = unsafe { sqlite3_step(stmt) }; - assert_eq!(done_rc, SQLITE_DONE, "expected SQLITE_DONE after `{sql}`"); + for db in &dbs { + let count = sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM t;").expect("count"); + assert_eq!(count, 200); + } + } - unsafe { - sqlite3_finalize(stmt); + // Regression test: two actors run autocommits concurrently on the same + // SqliteEngine. If anything in the engine (e.g., compaction) cross-contaminates + // actors or races on shared state, we'd see fence mismatches. + #[test] + fn concurrent_multi_actor_autocommits() { + let runtime = direct_runtime(); + let harness = DirectEngineHarness::new(); + let engine = runtime.block_on(harness.open_engine()); + + let actor_a = format!("{}-a", harness.actor_id); + let actor_b = format!("{}-b", harness.actor_id); + + let db_a = + harness.open_db_on_engine(&runtime, engine.clone(), &actor_a, VfsConfig::default()); + let db_b = + harness.open_db_on_engine(&runtime, engine.clone(), &actor_b, VfsConfig::default()); + + sqlite_exec( + db_a.as_ptr(), + "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);", + ) + .expect("create a"); + sqlite_exec( + db_b.as_ptr(), + "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);", + ) + .expect("create b"); + + // Run 100 autocommits on each actor, interleaved. 
+        for i in 0..100 {
+            sqlite_exec(
+                db_a.as_ptr(),
+                &format!("INSERT INTO t (id, v) VALUES ({i}, {i});"),
+            )
+            .expect("insert a");
+            sqlite_exec(
+                db_b.as_ptr(),
+                &format!("INSERT INTO t (id, v) VALUES ({i}, {i});"),
+            )
+            .expect("insert b");
        }
-    fn query_texts(db: *mut sqlite3, sql: &str) -> Vec<String> {
-        let c_sql = CString::new(sql).unwrap();
-        let mut stmt = ptr::null_mut();
-        let rc = unsafe { sqlite3_prepare_v2(db, c_sql.as_ptr(), -1, &mut stmt, ptr::null_mut()) };
-        assert_eq!(rc, SQLITE_OK, "failed to prepare `{sql}`");
-        assert!(
-            !stmt.is_null(),
-            "sqlite returned a null statement for `{sql}`"
-        );
+        let count_a = sqlite_query_i64(db_a.as_ptr(), "SELECT COUNT(*) FROM t;").expect("count a");
+        assert_eq!(count_a, 100);
+        let count_b = sqlite_query_i64(db_b.as_ptr(), "SELECT COUNT(*) FROM t;").expect("count b");
+        assert_eq!(count_b, 100);
+    }
-        let mut values = Vec::new();
-        loop {
-            let step_rc = unsafe { sqlite3_step(stmt) };
-            if step_rc == SQLITE_DONE {
-                break;
-            }
-            assert_eq!(
-                step_rc, SQLITE_ROW,
-                "expected SQLITE_ROW or SQLITE_DONE for `{sql}`"
-            );
-            let text_ptr = unsafe { sqlite3_column_text(stmt, 0) };
-            assert!(!text_ptr.is_null(), "expected text result for `{sql}`");
-            values.push(
-                unsafe { CStr::from_ptr(text_ptr as *const c_char) }
-                    .to_string_lossy()
-                    .into_owned(),
+    // Same as above but across a close/reopen cycle to exercise takeover.
+    #[test]
+    fn autocommit_survives_close_reopen() {
+        let runtime = direct_runtime();
+        let harness = DirectEngineHarness::new();
+        let engine = runtime.block_on(harness.open_engine());
+        let actor_id = &harness.actor_id;
+
+        {
+            let db = harness.open_db_on_engine(
+                &runtime,
+                engine.clone(),
+                actor_id,
+                VfsConfig::default(),
            );
+            sqlite_exec(
+                db.as_ptr(),
+                "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER NOT NULL);",
+            )
+            .expect("create table");
+            for i in 0..50 {
+                sqlite_exec(
+                    db.as_ptr(),
+                    &format!("INSERT INTO t (id, v) VALUES ({i}, {});", i),
+                )
+                .expect("insert");
+            }
+        }
-        unsafe {
-            sqlite3_finalize(stmt);
+        // Reopen (triggers takeover which bumps generation)
+        let db2 =
+            harness.open_db_on_engine(&runtime, engine.clone(), actor_id, VfsConfig::default());
+        for i in 50..100 {
+            sqlite_exec(
+                db2.as_ptr(),
+                &format!("INSERT INTO t (id, v) VALUES ({i}, {});", i),
+            )
+            .expect("insert after reopen");
        }
-        values
-    }
-    fn key_file_tag(key: &[u8]) -> Option<u8> {
-        (key.len() >= 4 && key[0] == kv::SQLITE_PREFIX && key[1] == kv::SQLITE_SCHEMA_VERSION)
-            .then_some(key[3])
+        let count = sqlite_query_i64(db2.as_ptr(), "SELECT COUNT(*) FROM t;")
+            .expect("count should succeed");
+        assert_eq!(count, 100);
    }
-    fn assert_journal_round_trip(kv: &MemoryKv, actor_id: &str) {
-        assert!(
-            kv.journal_was_used(actor_id),
-            "expected rollback journal KV operations for actor {actor_id}"
-        );
-        assert!(
-            kv.snapshot_actor(actor_id)
-                .keys()
-                .all(|key| key_file_tag(key.as_slice()) != Some(kv::FILE_TAG_JOURNAL)),
-            "expected rollback journal keys to be deleted after commit for actor {actor_id}"
-        );
-    }
+    // Bench-parity tests. Each mirrors a workload in
+    // examples/kitchen-sink/src/actors/testing/test-sqlite-bench.ts so
+    // storage-layer regressions surface here without needing the full stack.
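+    // (They run against the in-process DirectEngineHarness and assert
+    // correctness only; wall-clock numbers come from the profile_* tests
+    // above.)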
-    #[test]
-    fn encode_decode_round_trip() {
-        for size in [0i64, 1, 4096, 1_000_000, i64::MAX / 2] {
-            let encoded = encode_file_meta(size);
-            assert_eq!(encoded.len(), META_ENCODED_SIZE);
-            assert_eq!(&encoded[0..2], &META_VERSION.to_le_bytes());
-            let decoded = decode_file_meta(&encoded).unwrap();
-            assert_eq!(decoded, size);
-        }
+    fn open_bench_db(runtime: &tokio::runtime::Runtime) -> NativeDatabase {
+        let harness = DirectEngineHarness::new();
+        harness.open_db(runtime)
     }

     #[test]
-    fn startup_probe_asserts_batch_atomic_writes_are_active() {
-        let runtime = tokio::runtime::Builder::new_current_thread()
-            .build()
-            .unwrap();
-        let kv = Arc::new(MemoryKv::new());
-        let actor_id = next_test_name("sqlite-native-probe");
-        let vfs_name = next_test_name("sqlite-native-probe-vfs");
-        let vfs = KvVfs::register(
-            &vfs_name,
-            kv,
-            actor_id.clone(),
-            runtime.handle().clone(),
-            Vec::new(),
+    fn bench_insert_tx_x10000() {
+        let runtime = direct_runtime();
+        let db = open_bench_db(&runtime);
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE t (id INTEGER PRIMARY KEY, v INTEGER);",
         )
         .unwrap();
-        let db = open_database(vfs, &actor_id).unwrap();
-        assert!(
-            db._vfs.commit_atomic_count() > 0,
-            "expected startup probe to trigger COMMIT_ATOMIC_WRITE"
-        );
-        drop(db);
-        drop(runtime);
-    }

-    #[test]
-    fn encode_zero_size() {
-        let encoded = encode_file_meta(0);
-        assert_eq!(encoded, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
-    }
+        sqlite_exec(db.as_ptr(), "BEGIN").unwrap();
+        for i in 0..10_000 {
+            sqlite_exec(
+                db.as_ptr(),
+                &format!("INSERT INTO t (id, v) VALUES ({i}, {i});"),
+            )
+            .unwrap();
+        }
+        sqlite_exec(db.as_ptr(), "COMMIT").unwrap();

-    #[test]
-    fn encode_known_size() {
-        let encoded = encode_file_meta(4096);
         assert_eq!(
-            encoded,
-            [1, 0, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
+            sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM t;").unwrap(),
+            10_000
         );
     }

     #[test]
-    fn decode_invalid_version() {
-        let data = [2u8, 0, 0, 0, 0, 0, 0, 0, 0, 0];
-        assert!(decode_file_meta(&data).is_none());
-    }
-
-    #[test]
-    fn decode_too_short() {
-        assert!(decode_file_meta(&[]).is_none());
-        assert!(decode_file_meta(&[1]).is_none());
-        assert!(decode_file_meta(&[1, 0]).is_none());
-        assert!(decode_file_meta(&[1, 0, 0, 0, 0]).is_none());
+    fn bench_large_tx_insert_500kb() {
+        large_tx_insert(500 * 1024);
     }

     #[test]
-    fn kv_file_struct_is_larger_than_sqlite3_file() {
-        assert!(std::mem::size_of::<KvFile>() > std::mem::size_of::<sqlite3_file>());
+    fn bench_large_tx_insert_10mb() {
+        large_tx_insert(10 * 1024 * 1024);
     }

     #[test]
-    fn meta_encoded_size_constant() {
-        assert_eq!(META_ENCODED_SIZE, 10);
+    fn bench_large_tx_insert_50mb() {
+        // 50MB exercises the slow-path stage/finalize chunking that has
+        // historically hit decode errors under certain transports.
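+        // At the 4 KiB rows staged by large_tx_insert below, 50 MiB works out
+        // to exactly 12,800 rows committed in a single transaction.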
+ large_tx_insert(50 * 1024 * 1024); } - #[test] - fn meta_version_matches_wasm_vfs() { - assert_eq!(META_VERSION, 1); - } + fn large_tx_insert(target_bytes: usize) { + let runtime = direct_runtime(); + let db = open_bench_db(&runtime); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE large_tx (id INTEGER PRIMARY KEY AUTOINCREMENT, payload BLOB NOT NULL);", + ) + .unwrap(); - #[test] - fn encode_matches_vbare_format() { - let encoded = encode_file_meta(42); - assert_eq!(encoded[0], 0x01); - assert_eq!(encoded[1], 0x00); - assert_eq!(&encoded[2..], &42u64.to_le_bytes()); - } + let row_size = 4 * 1024; + let rows = (target_bytes + row_size - 1) / row_size; + sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); + for _ in 0..rows { + sqlite_exec( + db.as_ptr(), + &format!("INSERT INTO large_tx (payload) VALUES (randomblob({row_size}));"), + ) + .unwrap(); + } + if let Err(err) = sqlite_exec(db.as_ptr(), "COMMIT") { + let vfs_err = direct_vfs_ctx(&db).clone_last_error(); + panic!( + "COMMIT failed for {} MiB: sqlite={}, vfs_last_error={:?}", + target_bytes / (1024 * 1024), + err, + vfs_err, + ); + } - #[test] - fn empty_db_page_matches_generated_prefix() { - let page = empty_db_page(); - assert_eq!(page.len(), kv::CHUNK_SIZE); assert_eq!( - &page[..EMPTY_DB_PAGE_HEADER_PREFIX.len()], - &EMPTY_DB_PAGE_HEADER_PREFIX - ); - assert!( - page[EMPTY_DB_PAGE_HEADER_PREFIX.len()..] - .iter() - .all(|byte| *byte == 0) + sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM large_tx;").unwrap(), + rows as i64 ); } #[test] - fn startup_preload_helpers_use_exact_key_matches() { - let mut entries = vec![ - (vec![3], vec![30]), - (vec![1], vec![10]), - (vec![2], vec![20]), - ]; - sort_startup_preload(&mut entries); - - assert_eq!(startup_preload_get(&entries, &[1]), Some(&[10][..])); - assert_eq!(startup_preload_get(&entries, &[2]), Some(&[20][..])); - assert_eq!(startup_preload_get(&entries, &[4]), None); + fn bench_churn_insert_delete_10x1000() { + // Tests freelist reuse / space reclamation under heavy churn. 
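+        // Each of the 10 transactions inserts 1000 x 1 KiB blobs and deletes
+        // them all again before COMMIT, so the committed table stays empty
+        // while roughly a megabyte of churned pages per pass cycles through
+        // the freelist.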
+        let runtime = direct_runtime();
+        let db = open_bench_db(&runtime);
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE churn (id INTEGER PRIMARY KEY AUTOINCREMENT, payload BLOB NOT NULL);",
+        )
+        .unwrap();
+        for _ in 0..10 {
+            sqlite_exec(db.as_ptr(), "BEGIN").unwrap();
+            for _ in 0..1000 {
+                sqlite_exec(
+                    db.as_ptr(),
+                    "INSERT INTO churn (payload) VALUES (randomblob(1024));",
+                )
+                .unwrap();
+            }
+            sqlite_exec(db.as_ptr(), "DELETE FROM churn;").unwrap();
+            sqlite_exec(db.as_ptr(), "COMMIT").unwrap();
+        }
+        assert_eq!(
+            sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM churn;").unwrap(),
+            0
+        );
     }

     #[test]
-    fn startup_preload_helpers_update_without_growing() {
-        let mut entries = vec![(vec![1], vec![10]), (vec![2], vec![20])];
-        sort_startup_preload(&mut entries);
+    fn bench_mixed_oltp_large() {
+        let runtime = direct_runtime();
+        let db = open_bench_db(&runtime);
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE mixed (id INTEGER PRIMARY KEY, v INTEGER NOT NULL, data BLOB NOT NULL);",
+        )
+        .unwrap();

-        startup_preload_put(&mut entries, &[2], &[99]);
-        startup_preload_put(&mut entries, &[3], &[30]);
-        startup_preload_delete(&mut entries, &[1]);
-        startup_preload_delete(&mut entries, &[7]);
+        sqlite_exec(db.as_ptr(), "BEGIN").unwrap();
+        for i in 0..500 {
+            sqlite_exec(
+                db.as_ptr(),
+                &format!(
+                    "INSERT INTO mixed (id, v, data) VALUES ({i}, {}, randomblob(1024));",
+                    i * 2
+                ),
+            )
+            .unwrap();
+        }
+        sqlite_exec(db.as_ptr(), "COMMIT").unwrap();
+
+        sqlite_exec(db.as_ptr(), "BEGIN").unwrap();
+        for i in 0..500 {
+            sqlite_exec(
+                db.as_ptr(),
+                &format!(
+                    "INSERT INTO mixed (id, v, data) VALUES ({}, {}, randomblob(1024));",
+                    500 + i,
+                    i * 3
+                ),
+            )
+            .unwrap();
+            sqlite_exec(
+                db.as_ptr(),
+                &format!("UPDATE mixed SET v = v + 1 WHERE id = {i};"),
+            )
+            .unwrap();
+            if i % 5 == 0 && i >= 50 {
+                sqlite_exec(
+                    db.as_ptr(),
+                    &format!("DELETE FROM mixed WHERE id = {};", i - 50),
+                )
+                .unwrap();
+            }
+        }
+        sqlite_exec(db.as_ptr(), "COMMIT").unwrap();

-        assert_eq!(entries, vec![(vec![2], vec![99])]);
+        let count = sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM mixed;").unwrap();
+        assert!(count > 900 && count < 1000);
     }

     #[test]
-    fn startup_preload_helpers_delete_range_is_half_open() {
-        let mut entries = vec![
-            (vec![1], vec![10]),
-            (vec![2], vec![20]),
-            (vec![3], vec![30]),
-            (vec![4], vec![40]),
-        ];
-        sort_startup_preload(&mut entries);
+    fn bench_bulk_update_1000_rows() {
+        let runtime = direct_runtime();
+        let db = open_bench_db(&runtime);
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE bulk (id INTEGER PRIMARY KEY, v INTEGER);",
+        )
+        .unwrap();
+        sqlite_exec(db.as_ptr(), "BEGIN").unwrap();
+        for i in 0..1000 {
+            sqlite_exec(
+                db.as_ptr(),
+                &format!("INSERT INTO bulk (id, v) VALUES ({i}, {i});"),
+            )
+            .unwrap();
+        }
+        sqlite_exec(db.as_ptr(), "COMMIT").unwrap();

-        startup_preload_delete_range(&mut entries, &[2], &[4]);
+        sqlite_exec(db.as_ptr(), "BEGIN").unwrap();
+        for i in 0..1000 {
+            sqlite_exec(
+                db.as_ptr(),
+                &format!("UPDATE bulk SET v = v + 1 WHERE id = {i};"),
+            )
+            .unwrap();
+        }
+        sqlite_exec(db.as_ptr(), "COMMIT").unwrap();

-        assert_eq!(entries, vec![(vec![1], vec![10]), (vec![4], vec![40])]);
+        assert_eq!(
+            sqlite_query_i64(db.as_ptr(), "SELECT SUM(v) FROM bulk;").unwrap(),
+            (0..1000).map(|i| i + 1).sum::<i64>()
+        );
     }

     #[test]
-    fn v1_vfs_single_insert_and_select() {
-        with_test_db(|db, kv, actor_id| {
-            exec_sql(
-                db,
-                "CREATE TABLE users (id INTEGER PRIMARY KEY, value INTEGER NOT NULL);",
-            );
-            exec_sql(db, "INSERT INTO users (value) VALUES (42);");
-
assert_eq!(query_i64(db, "SELECT value FROM users WHERE id = 1;"), 42); - assert_journal_round_trip(kv.as_ref(), actor_id); - }); + fn bench_truncate_and_regrow() { + let runtime = direct_runtime(); + let db = open_bench_db(&runtime); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE regrow (id INTEGER PRIMARY KEY AUTOINCREMENT, payload BLOB NOT NULL);", + ) + .unwrap(); + for _ in 0..2 { + sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); + for _ in 0..500 { + sqlite_exec( + db.as_ptr(), + "INSERT INTO regrow (payload) VALUES (randomblob(1024));", + ) + .unwrap(); + } + sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); + sqlite_exec(db.as_ptr(), "DELETE FROM regrow;").unwrap(); + } + assert_eq!( + sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM regrow;").unwrap(), + 0 + ); } #[test] - fn v1_vfs_multi_row_insert() { - with_test_db(|db, kv, actor_id| { - exec_sql( - db, - "CREATE TABLE metrics (id INTEGER PRIMARY KEY, value INTEGER NOT NULL);", - ); - exec_sql( - db, - "INSERT INTO metrics (value) VALUES (5), (7), (11), (13), (17);", - ); + fn bench_many_small_tables() { + let runtime = direct_runtime(); + let db = open_bench_db(&runtime); + sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); + for i in 0..50 { + sqlite_exec( + db.as_ptr(), + &format!("CREATE TABLE t_{i} (id INTEGER PRIMARY KEY, v INTEGER);"), + ) + .unwrap(); + for j in 0..10 { + sqlite_exec( + db.as_ptr(), + &format!("INSERT INTO t_{i} (id, v) VALUES ({j}, {});", i * j), + ) + .unwrap(); + } + } + sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - assert_eq!(query_i64(db, "SELECT COUNT(*) FROM metrics;"), 5); - assert_eq!(query_i64(db, "SELECT SUM(value) FROM metrics;"), 53); - assert_journal_round_trip(kv.as_ref(), actor_id); - }); + let total: i64 = (0..50) + .map(|i| { + sqlite_query_i64(db.as_ptr(), &format!("SELECT COUNT(*) FROM t_{i};")).unwrap() + }) + .sum(); + assert_eq!(total, 500); } #[test] - fn v1_vfs_update_existing_row() { - with_test_db(|db, kv, actor_id| { - exec_sql( - db, - "CREATE TABLE docs (id INTEGER PRIMARY KEY, title TEXT NOT NULL);", - ); - exec_sql(db, "INSERT INTO docs (title) VALUES ('draft');"); - exec_sql(db, "UPDATE docs SET title = 'published' WHERE id = 1;"); - - assert_eq!( - query_texts(db, "SELECT title FROM docs WHERE id = 1;"), - vec!["published".to_string()] - ); - assert_journal_round_trip(kv.as_ref(), actor_id); - }); - } + fn bench_index_creation_on_10k_rows() { + let runtime = direct_runtime(); + let db = open_bench_db(&runtime); + sqlite_exec( + db.as_ptr(), + "CREATE TABLE idx_test (id INTEGER PRIMARY KEY AUTOINCREMENT, k TEXT NOT NULL, v INTEGER NOT NULL);", + ) + .unwrap(); + sqlite_exec(db.as_ptr(), "BEGIN").unwrap(); + for i in 0..10_000 { + sqlite_exec( + db.as_ptr(), + &format!( + "INSERT INTO idx_test (k, v) VALUES ('key-{}-{i}', {i});", + i % 1000 + ), + ) + .unwrap(); + } + sqlite_exec(db.as_ptr(), "COMMIT").unwrap(); - #[test] - fn v1_vfs_delete_row() { - with_test_db(|db, kv, actor_id| { - exec_sql( - db, - "CREATE TABLE events (id INTEGER PRIMARY KEY, name TEXT NOT NULL);", - ); - exec_sql( - db, - "INSERT INTO events (name) VALUES ('open'), ('close'), ('archive');", - ); - exec_sql(db, "DELETE FROM events WHERE name = 'close';"); + sqlite_exec(db.as_ptr(), "CREATE INDEX idx_test_k ON idx_test(k);").unwrap(); - assert_eq!(query_i64(db, "SELECT COUNT(*) FROM events;"), 2); - assert_eq!( - query_texts(db, "SELECT name FROM events ORDER BY id;"), - vec!["open".to_string(), "archive".to_string()] - ); - assert_journal_round_trip(kv.as_ref(), actor_id); - }); + assert_eq!( + 
            sqlite_query_i64(db.as_ptr(), "SELECT COUNT(*) FROM idx_test;").unwrap(),
+            10_000
+        );
     }

     #[test]
-    fn v1_vfs_multiple_tables_schema() {
-        with_test_db(|db, kv, actor_id| {
-            exec_sql(
-                db,
-                "
-                CREATE TABLE projects (id INTEGER PRIMARY KEY, name TEXT NOT NULL);
-                CREATE TABLE tasks (
-                    id INTEGER PRIMARY KEY,
-                    project_id INTEGER NOT NULL,
-                    title TEXT NOT NULL
-                );
-                INSERT INTO projects (name) VALUES ('sqlite-vfs');
-                INSERT INTO tasks (project_id, title) VALUES (1, 'baseline'), (1, 'verify');
-                ",
-            );
+    fn bench_growing_aggregation() {
+        let runtime = direct_runtime();
+        let db = open_bench_db(&runtime);
+        sqlite_exec(
+            db.as_ptr(),
+            "CREATE TABLE agg (id INTEGER PRIMARY KEY AUTOINCREMENT, v INTEGER NOT NULL);",
+        )
+        .unwrap();

-        assert_eq!(query_i64(db, "SELECT COUNT(*) FROM projects;"), 1);
-        assert_eq!(query_i64(db, "SELECT COUNT(*) FROM tasks;"), 2);
+        let batches = 20;
+        let per_batch = 100;
+        for batch in 0..batches {
+            sqlite_exec(db.as_ptr(), "BEGIN").unwrap();
+            for i in 0..per_batch {
+                sqlite_exec(
+                    db.as_ptr(),
+                    &format!("INSERT INTO agg (v) VALUES ({});", batch * per_batch + i),
+                )
+                .unwrap();
+            }
+            sqlite_exec(db.as_ptr(), "COMMIT").unwrap();
+            let expected_sum: i64 = (0..(batch + 1) * per_batch).map(|i| i as i64).sum();
             assert_eq!(
-                query_texts(
-                    db,
-                    "SELECT title FROM tasks WHERE project_id = 1 ORDER BY id;",
-                ),
-                vec!["baseline".to_string(), "verify".to_string()]
+                sqlite_query_i64(db.as_ptr(), "SELECT SUM(v) FROM agg;").unwrap(),
+                expected_sum
             );
-            assert_journal_round_trip(kv.as_ref(), actor_id);
-        });
+        }
     }
 }
diff --git a/rivetkit-typescript/packages/rivetkit-napi/index.d.ts b/rivetkit-typescript/packages/rivetkit-napi/index.d.ts
index 280e7d09b5..034ba60015 100644
--- a/rivetkit-typescript/packages/rivetkit-napi/index.d.ts
+++ b/rivetkit-typescript/packages/rivetkit-napi/index.d.ts
@@ -93,7 +93,7 @@ export interface JsSqliteVfsMetrics {
   commitCount: number
 }
 /** Open a native SQLite database backed by the envoy's KV channel. */
-export declare function openDatabaseFromEnvoy(jsHandle: JsEnvoyHandle, actorId: string, preloadedEntries?: Array<JsKvEntry> | undefined | null): Promise<JsNativeDatabase>
+export declare function openDatabaseFromEnvoy(jsHandle: JsEnvoyHandle, actorId: string): Promise<JsNativeDatabase>
 export interface JsQueueNextOptions {
   names?: Array<string>
   timeoutMs?: number
diff --git a/rivetkit-typescript/packages/rivetkit-napi/src/bridge_actor.rs b/rivetkit-typescript/packages/rivetkit-napi/src/bridge_actor.rs
index 8c41b3c2bb..7d4aa8032e 100644
--- a/rivetkit-typescript/packages/rivetkit-napi/src/bridge_actor.rs
+++ b/rivetkit-typescript/packages/rivetkit-napi/src/bridge_actor.rs
@@ -30,8 +30,6 @@ pub type CanHibernateResponseMap = Arc>;
-/// Map of sqlite schema versions keyed by actor ID.
-pub type SqliteSchemaVersionMap = Arc>;

 fn make_ws_key(gateway_id: &protocol::GatewayId, request_id: &protocol::RequestId) -> [u8; 8] {
     let mut key = [0u8; 8];
@@ -47,7 +45,6 @@ pub struct BridgeCallbacks {
     ws_sender_map: WsSenderMap,
     can_hibernate_response_map: CanHibernateResponseMap,
     sqlite_startup_map: SqliteStartupMap,
-    sqlite_schema_version_map: SqliteSchemaVersionMap,
 }

 impl BridgeCallbacks {
@@ -57,7 +54,6 @@ impl BridgeCallbacks {
         ws_sender_map: WsSenderMap,
         can_hibernate_response_map: CanHibernateResponseMap,
         sqlite_startup_map: SqliteStartupMap,
-        sqlite_schema_version_map: SqliteSchemaVersionMap,
     ) -> Self {
         Self {
             event_cb,
@@ -65,7 +61,6 @@ impl BridgeCallbacks {
             ws_sender_map,
             can_hibernate_response_map,
             sqlite_startup_map,
-            sqlite_schema_version_map,
         }
     }

@@ -89,17 +84,8 @@ impl EnvoyCallbacks for BridgeCallbacks {
         let response_map = self.response_map.clone();
         let event_cb = self.event_cb.clone();
         let sqlite_startup_map = self.sqlite_startup_map.clone();
-        let sqlite_schema_version_map = self.sqlite_schema_version_map.clone();

         Box::pin(async move {
-            match sqlite_schema_version_map.entry_async(actor_id.clone()).await {
-                scc::hash_map::Entry::Occupied(mut entry) => {
-                    *entry.get_mut() = sqlite_schema_version;
-                }
-                scc::hash_map::Entry::Vacant(entry) => {
-                    entry.insert_entry(sqlite_schema_version);
-                }
-            }
             if let Some(startup) = sqlite_startup_data.clone() {
                 match sqlite_startup_map.entry_async(actor_id.clone()).await {
                     scc::hash_map::Entry::Occupied(mut entry) => {
@@ -156,10 +142,8 @@ impl EnvoyCallbacks for BridgeCallbacks {
         let response_map = self.response_map.clone();
         let event_cb = self.event_cb.clone();
         let sqlite_startup_map = self.sqlite_startup_map.clone();
-        let sqlite_schema_version_map = self.sqlite_schema_version_map.clone();

         Box::pin(async move {
-            let _ = sqlite_schema_version_map.remove_async(&actor_id).await;
             let _ = sqlite_startup_map.remove_async(&actor_id).await;

             let response_id = uuid::Uuid::new_v4().to_string();
diff --git a/rivetkit-typescript/packages/rivetkit-napi/src/database.rs b/rivetkit-typescript/packages/rivetkit-napi/src/database.rs
index 7026ecc6cc..f20355846b 100644
--- a/rivetkit-typescript/packages/rivetkit-napi/src/database.rs
+++ b/rivetkit-typescript/packages/rivetkit-napi/src/database.rs
@@ -6,8 +6,6 @@ use rivetkit_core::sqlite::{
 };

 use crate::envoy_handle::JsEnvoyHandle;
-use crate::types::JsKvEntry;
-
 #[napi]
 #[derive(Clone)]
 pub struct JsNativeDatabase {
@@ -163,16 +161,14 @@ fn u64_to_i64(value: u64) -> i64 {

 pub(crate) async fn open_database_with_runtime_config(
     config: SqliteRuntimeConfig,
-    preloaded_entries: Vec<(Vec<u8>, Vec<u8>)>,
 ) -> napi::Result<JsNativeDatabase> {
     let SqliteRuntimeConfig {
         handle,
         actor_id,
-        schema_version,
         startup_data,
     } = config;
-    let db = CoreSqliteDb::new(handle, actor_id, schema_version, startup_data);
-    db.open(preloaded_entries)
+    let db = CoreSqliteDb::new(handle, actor_id, startup_data);
+    db.open()
         .await
         .map_err(crate::napi_anyhow_error)?;
     Ok(JsNativeDatabase::new(db))
 }
@@ -183,31 +179,15 @@ pub(crate) async fn open_database_with_runtime_config(
 pub async fn open_database_from_envoy(
     js_handle: &JsEnvoyHandle,
     actor_id: String,
-    preloaded_entries: Option<Vec<JsKvEntry>>,
 ) -> napi::Result<JsNativeDatabase> {
-    let schema_version = js_handle
-        .clone_sqlite_schema_version(&actor_id)
-        .await
-        .ok_or_else(|| {
-            napi::Error::from_reason(format!(
-                "missing sqlite schema version for actor {actor_id}"
-            ))
-        })?;
     let startup_data = js_handle.clone_sqlite_startup_data(&actor_id).await;
-    let preloaded_entries = preloaded_entries
.unwrap_or_default() - .into_iter() - .map(|entry| (entry.key.to_vec(), entry.value.to_vec())) - .collect(); open_database_with_runtime_config( SqliteRuntimeConfig { handle: js_handle.handle.clone(), actor_id, - schema_version, startup_data, }, - preloaded_entries, ) .await } diff --git a/rivetkit-typescript/packages/rivetkit-napi/src/envoy_handle.rs b/rivetkit-typescript/packages/rivetkit-napi/src/envoy_handle.rs index d7b572ec88..278724f0ee 100644 --- a/rivetkit-typescript/packages/rivetkit-napi/src/envoy_handle.rs +++ b/rivetkit-typescript/packages/rivetkit-napi/src/envoy_handle.rs @@ -9,7 +9,7 @@ use tokio::runtime::Runtime; use rivet_envoy_protocol as protocol; use crate::bridge_actor::{ - CanHibernateResponseMap, ResponseMap, SqliteSchemaVersionMap, SqliteStartupMap, WsSenderMap, + CanHibernateResponseMap, ResponseMap, SqliteStartupMap, WsSenderMap, }; use crate::types::{self, JsKvEntry, JsKvListOptions}; @@ -33,7 +33,6 @@ pub struct JsEnvoyHandle { pub(crate) ws_sender_map: WsSenderMap, pub(crate) can_hibernate_response_map: CanHibernateResponseMap, pub(crate) sqlite_startup_map: SqliteStartupMap, - pub(crate) sqlite_schema_version_map: SqliteSchemaVersionMap, } impl JsEnvoyHandle { @@ -44,7 +43,6 @@ impl JsEnvoyHandle { ws_sender_map: WsSenderMap, can_hibernate_response_map: CanHibernateResponseMap, sqlite_startup_map: SqliteStartupMap, - sqlite_schema_version_map: SqliteSchemaVersionMap, ) -> Self { Self { runtime, @@ -53,16 +51,9 @@ impl JsEnvoyHandle { ws_sender_map, can_hibernate_response_map, sqlite_startup_map, - sqlite_schema_version_map, } } - pub async fn clone_sqlite_schema_version(&self, actor_id: &str) -> Option { - self.sqlite_schema_version_map - .read_async(actor_id, |_, version| *version) - .await - } - pub async fn clone_sqlite_startup_data( &self, actor_id: &str, diff --git a/rivetkit-typescript/packages/rivetkit-napi/src/lib.rs b/rivetkit-typescript/packages/rivetkit-napi/src/lib.rs index 2e3d72c077..0f0750f04b 100644 --- a/rivetkit-typescript/packages/rivetkit-napi/src/lib.rs +++ b/rivetkit-typescript/packages/rivetkit-napi/src/lib.rs @@ -70,8 +70,7 @@ fn init_tracing(log_level: Option<&str>) { } use crate::bridge_actor::{ - BridgeCallbacks, CanHibernateResponseMap, ResponseMap, SqliteSchemaVersionMap, - SqliteStartupMap, WsSenderMap, + BridgeCallbacks, CanHibernateResponseMap, ResponseMap, SqliteStartupMap, WsSenderMap, }; use crate::envoy_handle::JsEnvoyHandle; use crate::types::JsEnvoyConfig; @@ -96,7 +95,6 @@ pub fn start_envoy_sync_js( let can_hibernate_response_map: CanHibernateResponseMap = Arc::new(tokio::sync::Mutex::new(HashMap::new())); let sqlite_startup_map: SqliteStartupMap = Arc::new(scc::HashMap::new()); - let sqlite_schema_version_map: SqliteSchemaVersionMap = Arc::new(scc::HashMap::new()); // Create threadsafe callback for bridging events to JS let tsfn: bridge_actor::EventCallback = event_callback.create_threadsafe_function( @@ -114,7 +112,6 @@ pub fn start_envoy_sync_js( ws_sender_map.clone(), can_hibernate_response_map.clone(), sqlite_startup_map.clone(), - sqlite_schema_version_map.clone(), )); let envoy_config = EnvoyConfig { @@ -140,7 +137,6 @@ pub fn start_envoy_sync_js( ws_sender_map, can_hibernate_response_map, sqlite_startup_map, - sqlite_schema_version_map, )) } diff --git a/rivetkit-typescript/packages/rivetkit-napi/src/sqlite_db.rs b/rivetkit-typescript/packages/rivetkit-napi/src/sqlite_db.rs index a76589e55d..7d3d88c564 100644 --- a/rivetkit-typescript/packages/rivetkit-napi/src/sqlite_db.rs +++ 
b/rivetkit-typescript/packages/rivetkit-napi/src/sqlite_db.rs
@@ -33,7 +33,6 @@ impl SqliteDb {
                 .sql()
                 .runtime_config()
                 .map_err(crate::napi_anyhow_error)?,
-            Vec::new(),
         )
         .await?,
     );
diff --git a/rivetkit-typescript/packages/rivetkit-napi/wrapper.d.ts b/rivetkit-typescript/packages/rivetkit-napi/wrapper.d.ts
index 1664d9eb71..818947190e 100644
--- a/rivetkit-typescript/packages/rivetkit-napi/wrapper.d.ts
+++ b/rivetkit-typescript/packages/rivetkit-napi/wrapper.d.ts
@@ -143,6 +143,5 @@ export declare function startEnvoy(config: EnvoyConfig): Promise<EnvoyHandle>;
 export declare function openDatabaseFromEnvoy(
 	handle: EnvoyHandle,
 	actorId: string,
-	preloadedEntries?: readonly [Uint8Array, Uint8Array][] | null,
 ): Promise;
 export declare const utils: {};
diff --git a/rivetkit-typescript/packages/rivetkit-napi/wrapper.js b/rivetkit-typescript/packages/rivetkit-napi/wrapper.js
index cd8e047274..55ae24c27c 100644
--- a/rivetkit-typescript/packages/rivetkit-napi/wrapper.js
+++ b/rivetkit-typescript/packages/rivetkit-napi/wrapper.js
@@ -159,19 +159,9 @@ async function startEnvoy(config) {
 /**
  * Open a native database backed by envoy KV.
  */
-async function openDatabaseFromEnvoy(handle, actorId, preloadedEntries) {
+async function openDatabaseFromEnvoy(handle, actorId) {
 	const rawHandle = handle._raw || handle;
-	const nativePreloadedEntries = preloadedEntries
-		? preloadedEntries.map(([key, value]) => ({
-				key: Buffer.from(key),
-				value: Buffer.from(value),
-			}))
-		: null;
-	return native.openDatabaseFromEnvoy(
-		rawHandle,
-		actorId,
-		nativePreloadedEntries,
-	);
+	return native.openDatabaseFromEnvoy(rawHandle, actorId);
 }

 function decodePreloadedKv(preloadedKv) {