-
Notifications
You must be signed in to change notification settings - Fork 0
feat: split VectorIndexResolver into peek/resolve/ensure_loaded #18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2f98713
b5823ac
649cc86
5d23a84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -214,6 +214,48 @@ pub struct RegisteredTable { | |
| pub config: USearchTableConfig, | ||
| } | ||
|
|
||
| // ── VectorIndexResolver ─────────────────────────────────────────────────────── | ||
|
|
||
| /// Lightweight metadata returned by the sync `peek` check. | ||
| /// Contains enough info for the optimizer to validate the ANN rewrite | ||
| /// and build the logical plan node, without needing the loaded index objects. | ||
| #[derive(Debug, Clone)] | ||
| pub struct VectorIndexMeta { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| pub metric: MetricKind, | ||
| pub scalar_kind: ScalarKind, | ||
| pub schema: SchemaRef, | ||
| pub key_col: String, | ||
| pub config: USearchTableConfig, | ||
| } | ||
|
|
||
| /// Trait for resolving vector index entries by name. | ||
| /// | ||
| /// Split into two operations to accommodate DataFusion's mixed sync/async pipeline: | ||
| /// - `peek()` — sync, cheap: used by the optimizer rule to decide if ANN rewrite applies | ||
| /// - `resolve()` — sync: returns fully loaded entry from cache (for executor hot path) | ||
| /// - `ensure_loaded()` — async: loads the index on cache miss (for planner/executor) | ||
| /// | ||
| /// The built-in [`USearchRegistry`] implements all three as direct hashmap lookups | ||
| /// (always cached). Production systems can implement catalog-backed loading in | ||
| /// `ensure_loaded`. | ||
| #[async_trait::async_trait] | ||
| pub trait VectorIndexResolver: Send + Sync + std::fmt::Debug { | ||
| /// Sync, cheap: check if a vector index exists and return its metadata. | ||
| /// Used by the optimizer rule to decide if the ANN rewrite should apply. | ||
| /// Must NOT do I/O — only check local cache or lightweight state. | ||
| fn peek(&self, name: &str) -> Option<VectorIndexMeta>; | ||
|
|
||
| /// Sync: return a fully loaded entry from cache. | ||
| /// Returns `None` on cache miss — the caller should call `ensure_loaded` first. | ||
| fn resolve(&self, name: &str) -> Option<Arc<RegisteredTable>>; | ||
|
|
||
| /// Async: ensure the index is loaded and available in cache. | ||
| /// On cache miss or stale entry, downloads files and loads the index. | ||
| /// On cache hit with current generation, returns immediately. | ||
| /// Called from the async physical planner before building the execution plan. | ||
| async fn ensure_loaded(&self, name: &str) -> datafusion::common::Result<()>; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The trait contract between |
||
| } | ||
|
|
||
| // ── USearchRegistry ─────────────────────────────────────────────────────────── | ||
|
|
||
| pub struct USearchRegistry { | ||
|
|
@@ -373,6 +415,33 @@ impl USearchRegistry { | |
| pub fn into_arc(self) -> Arc<Self> { | ||
| Arc::new(self) | ||
| } | ||
|
|
||
| /// Convert into a trait object for use with the optimizer rule and planner. | ||
| pub fn into_resolver(self) -> Arc<dyn VectorIndexResolver> { | ||
| Arc::new(self) | ||
| } | ||
| } | ||
|
|
||
| #[async_trait::async_trait] | ||
| impl VectorIndexResolver for USearchRegistry { | ||
| fn peek(&self, name: &str) -> Option<VectorIndexMeta> { | ||
| self.get(name).map(|r| VectorIndexMeta { | ||
| metric: r.metric, | ||
| scalar_kind: r.scalar_kind, | ||
| schema: r.schema.clone(), | ||
| key_col: r.key_col.clone(), | ||
| config: r.config.clone(), | ||
| }) | ||
| } | ||
|
|
||
| fn resolve(&self, name: &str) -> Option<Arc<RegisteredTable>> { | ||
| self.get(name) | ||
| } | ||
|
|
||
| async fn ensure_loaded(&self, _name: &str) -> datafusion::common::Result<()> { | ||
| // USearchRegistry is always fully cached — nothing to load. | ||
| Ok(()) | ||
| } | ||
| } | ||
|
|
||
| impl Default for USearchRegistry { | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -33,16 +33,21 @@ use datafusion::physical_plan::{ | |||||||||||
| }; | ||||||||||||
| use datafusion::scalar::ScalarValue; | ||||||||||||
|
|
||||||||||||
| use crate::registry::USearchRegistry; | ||||||||||||
| use crate::registry::VectorIndexResolver; | ||||||||||||
|
|
||||||||||||
| // ── UDTF ───────────────────────────────────────────────────────────────────── | ||||||||||||
|
|
||||||||||||
| /// Table function: vector_usearch(table_name, query_vec, k [, ef_search]) | ||||||||||||
| /// | ||||||||||||
| /// Returns `(key: UInt64, _distance: Float32)`. Join with your data table on | ||||||||||||
| /// the key column to retrieve full rows. | ||||||||||||
| /// | ||||||||||||
| /// This entry point is synchronous. For async-backed [`VectorIndexResolver`] | ||||||||||||
| /// implementations, it only works when the target index is already loaded in | ||||||||||||
| /// the local cache. `vector_usearch()` does not call `ensure_loaded()` and | ||||||||||||
| /// cannot trigger async index loads. | ||||||||||||
| pub struct USearchUDTF { | ||||||||||||
| registry: Arc<USearchRegistry>, | ||||||||||||
| registry: Arc<dyn VectorIndexResolver>, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| impl fmt::Debug for USearchProvider { | ||||||||||||
|
|
@@ -58,7 +63,7 @@ impl fmt::Debug for USearchUDTF { | |||||||||||
| } | ||||||||||||
|
|
||||||||||||
| impl USearchUDTF { | ||||||||||||
| pub fn new(registry: Arc<USearchRegistry>) -> Self { | ||||||||||||
| pub fn new(registry: Arc<dyn VectorIndexResolver>) -> Self { | ||||||||||||
| Self { registry } | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
@@ -85,9 +90,11 @@ impl TableFunctionImpl for USearchUDTF { | |||||||||||
| None | ||||||||||||
| }; | ||||||||||||
|
|
||||||||||||
| let registered = self.registry.get(&table_name).ok_or_else(|| { | ||||||||||||
| let registered = self.registry.resolve(&table_name).ok_or_else(|| { | ||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. P1 — UDTF bypasses
The PR introduces Two options: Option A — Document the limitation so implementors know to pre-load before calling
Suggested change
Option B — Add a sync |
||||||||||||
| DataFusionError::Execution(format!( | ||||||||||||
| "vector_usearch: table '{table_name}' not registered" | ||||||||||||
| "vector_usearch: table '{table_name}' is not loaded locally. \ | ||||||||||||
| This synchronous path only checks the local cache and cannot trigger async \ | ||||||||||||
| loads. Use the optimizer/planner vector query path or pre-load the index first." | ||||||||||||
| )) | ||||||||||||
| })?; | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
resolve()is called without a precedingensure_loaded(). For async-backed resolvers that support cache eviction, the index will be gone here and the query fails — eviction is not actually handled, the error message is just friendlier.Fix: call
ensure_loadedfirst:Also update the comment on line 364 — "handled correctly" implies the index is reloaded on eviction, which it is not without this call.