Skip to content

Commit f16ec0c

Browse files
authored
fix(sqlite): WAL checkpoint + add view_index for mmap loading (#11)
* fix(sqlite-provider): checkpoint WAL after open_or_build

  Data written during build may only exist in the WAL file. Without an explicit checkpoint, the data can be lost if the process exits before SQLite performs a passive checkpoint. This caused empty query results when reloading the .db file after an engine restart.

* feat(registry): add view_index for memory-mapped index loading

  view_index uses mmap instead of loading the full index into RAM, keeping resident memory proportional to the working set. Prefer this for the reload-from-disk path where the index file is already local.

* test(rule): add tests for SELECT-only-distance query pattern

  Covers the case where the SELECT list contains only the distance UDF and no base table columns, with bare and qualified table references.
1 parent a016358 commit f16ec0c

3 files changed

Lines changed: 76 additions & 1 deletion

File tree

src/registry.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ impl USearchIndexConfig {
8787
.map_err(|e| DataFusionError::Execution(format!("USearch Index::new failed: {e}")))
8888
}
8989

90-
/// Load a previously saved index from `path`.
90+
/// Load a previously saved index from `path` into memory.
9191
///
9292
/// Uses the same `IndexOptions` as `build_index()`. The options must
9393
/// match those used when the index was originally built — passing wrong
@@ -101,6 +101,26 @@ impl USearchIndexConfig {
101101
Ok(index)
102102
}
103103

104+
/// Memory-map a previously saved index from `path`.
105+
///
106+
/// Unlike [`load_index`], this does not copy the index into RAM. The OS
107+
/// pages data in on demand, keeping resident memory proportional to the
108+
/// working set rather than the full index size. Prefer this for the
109+
/// reload-from-disk path where the index file is already local.
110+
///
111+
/// The returned [`Index`] is fully functional for search; the backing
112+
/// file must remain on disk for the lifetime of the index.
113+
///
114+
/// [`load_index`]: Self::load_index
115+
pub fn view_index(&self, path: &str) -> Result<Index> {
116+
let index = Index::new(&self.to_index_options())
117+
.map_err(|e| DataFusionError::Execution(format!("USearch Index::new failed: {e}")))?;
118+
index
119+
.view(path)
120+
.map_err(|e| DataFusionError::Execution(format!("USearch index view failed: {e}")))?;
121+
Ok(index)
122+
}
123+
104124
fn to_index_options(&self) -> IndexOptions {
105125
IndexOptions {
106126
dimensions: self.dimensions,

src/sqlite_provider.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,12 @@ impl SqliteLookupProvider {
252252
)?;
253253
}
254254

255+
// Checkpoint WAL so the data is flushed to the main database file.
256+
// Without this, data written during build may only exist in the WAL
257+
// and can be lost if the process exits before a passive checkpoint.
258+
conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")
259+
.map_err(|e| DataFusionError::Execution(format!("WAL checkpoint failed: {e}")))?;
260+
255261
let mut conns = vec![conn];
256262
for _ in 1..pool_size {
257263
conns.push(open_conn(db_path)?);

tests/optimizer_rule.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,3 +419,52 @@ async fn test_qualified_ref_where_clause_rewrites() {
419419
"qualified ref + WHERE → filter absorbed, rule must fire\nPlan: {plan:?}"
420420
);
421421
}
422+
423+
// ═══════════════════════════════════════════════════════════════════════════════
424+
// SELECT only distance — no base columns projected
425+
// ═══════════════════════════════════════════════════════════════════════════════
426+
//
427+
// When the SELECT list contains only the distance UDF (no base table columns),
428+
// the Projection node has a single computed expression. The optimizer must still
429+
// recognise the pattern and rewrite to USearchNode.
430+
431+
/// Bare table, SELECT only distance alias, ORDER BY alias — rule must fire.
432+
#[tokio::test]
433+
async fn test_select_only_distance_bare_rewrites() {
434+
let ctx = make_ctx(MetricKind::L2sq).await;
435+
let sql =
436+
format!("SELECT l2_distance(vector, {Q}) AS dist FROM items ORDER BY dist ASC LIMIT 5");
437+
let plan = optimized_plan(&ctx, &sql).await;
438+
assert!(
439+
contains_usearch_node(&plan),
440+
"SELECT only distance (bare) → rule must fire\nPlan: {plan:?}"
441+
);
442+
}
443+
444+
/// Qualified table, SELECT only distance alias, ORDER BY alias — rule must fire.
445+
#[tokio::test]
446+
async fn test_select_only_distance_qualified_rewrites() {
447+
let ctx = make_ctx_qualified(MetricKind::L2sq).await;
448+
let sql = format!(
449+
"SELECT l2_distance(vector, {Q}) AS dist FROM datafusion.public.items ORDER BY dist ASC LIMIT 5"
450+
);
451+
let plan = optimized_plan(&ctx, &sql).await;
452+
assert!(
453+
contains_usearch_node(&plan),
454+
"SELECT only distance (qualified) → rule must fire\nPlan: {plan:?}"
455+
);
456+
}
457+
458+
/// Bare table, SELECT only distance (no alias), ORDER BY the UDF directly.
459+
#[tokio::test]
460+
async fn test_select_only_distance_no_alias_rewrites() {
461+
let ctx = make_ctx(MetricKind::L2sq).await;
462+
let sql = format!(
463+
"SELECT l2_distance(vector, {Q}) FROM items ORDER BY l2_distance(vector, {Q}) ASC LIMIT 5"
464+
);
465+
let plan = optimized_plan(&ctx, &sql).await;
466+
assert!(
467+
contains_usearch_node(&plan),
468+
"SELECT only distance (no alias, ORDER BY UDF) → rule must fire\nPlan: {plan:?}"
469+
);
470+
}

0 commit comments

Comments
 (0)