Skip to content

Commit 66cebda

Browse files
committed
feat(phase-32): HNSW index, mmap persistence, tree-sitter AST chunker, ONNX embedder
- Add HnswVectorStore with instant-distance for O(log n) ANN search
- Add memory-mapped loading (load_mmap) to RustVectorStore via memmap2
- Add AstChunker with tree-sitter parsing for 10 languages (Python, JS, TS, TSX, Rust, Go, Java, C, C++, Ruby)
- Add OnnxEmbedder (feature-gated behind 'onnx') for ONNX Runtime inference
- HNSW binary format (hnsw_vectors.bin) with embedded metadata for mmap loading
- Flat VectorStore now uses FlatVectorStorage enum (Owned | Mmap)
- Update rust_backend.py with HnswVectorStore, AstChunker, OnnxEmbedder exports
- All 2596 tests passing
1 parent a3ae6eb commit 66cebda

9 files changed

Lines changed: 2839 additions & 18 deletions

File tree

codexa-core/Cargo.lock

Lines changed: 1545 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

codexa-core/Cargo.toml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,25 @@ serde_json = "1"
2121
regex = "1"
2222
ignore = "0.4"
2323
memmap2 = "0.9"
24+
instant-distance = "0.6.1"
25+
tree-sitter = "0.24"
26+
tree-sitter-python = "0.25.0"
27+
tree-sitter-javascript = "0.25.0"
28+
tree-sitter-typescript = "0.23.2"
29+
tree-sitter-rust = "0.24.0"
30+
tree-sitter-go = "0.25.0"
31+
tree-sitter-java = "0.23.5"
32+
tree-sitter-c = "0.24.1"
33+
tree-sitter-cpp = "0.23.4"
34+
tree-sitter-ruby = "0.23.1"
35+
ort = { version = "2.0.0-rc.12", features = ["download-binaries"], optional = true }
36+
ndarray = { version = "0.17.2", optional = true }
2437

2538
[profile.release]
2639
opt-level = 3
2740
lto = "thin"
2841
codegen-units = 1
42+
43+
[features]
44+
default = []
45+
onnx = ["dep:ort", "dep:ndarray"]

codexa-core/src/ann.rs

Lines changed: 123 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
//! Drop-in replacement for the Python FAISS-backed VectorStore.
44
//! Uses flat inner-product search (matching IndexFlatIP behaviour).
55
//! Vectors are stored as a contiguous `Vec<f32>` for cache-friendly access.
6+
//! Supports memory-mapped loading via `load_mmap` for near-instant startup.
67
8+
use memmap2::Mmap;
79
use numpy::{PyArray1, PyReadonlyArray1, PyReadonlyArray2};
810
use pyo3::prelude::*;
911
use pyo3::types::PyType;
@@ -82,6 +84,44 @@ impl ChunkMeta {
8284
}
8385
}
8486

87+
// ---------------------------------------------------------------------------
88+
// FlatVectorStorage — owned or memory-mapped
89+
// ---------------------------------------------------------------------------
90+
91+
enum FlatVectorStorage {
92+
Owned(Vec<f32>),
93+
Mmap {
94+
_mmap: Mmap,
95+
ptr: *const f32,
96+
len: usize,
97+
},
98+
}
99+
100+
unsafe impl Send for FlatVectorStorage {}
101+
unsafe impl Sync for FlatVectorStorage {}
102+
103+
impl FlatVectorStorage {
104+
fn as_slice(&self) -> &[f32] {
105+
match self {
106+
FlatVectorStorage::Owned(v) => v.as_slice(),
107+
FlatVectorStorage::Mmap { ptr, len, .. } => unsafe {
108+
std::slice::from_raw_parts(*ptr, *len)
109+
},
110+
}
111+
}
112+
113+
fn to_owned_mut(&mut self) -> &mut Vec<f32> {
114+
if let FlatVectorStorage::Mmap { ptr, len, .. } = self {
115+
let slice = unsafe { std::slice::from_raw_parts(*ptr, *len) };
116+
*self = FlatVectorStorage::Owned(slice.to_vec());
117+
}
118+
match self {
119+
FlatVectorStorage::Owned(v) => v,
120+
_ => unreachable!(),
121+
}
122+
}
123+
}
124+
85125
// ---------------------------------------------------------------------------
86126
// VectorStore — flat inner-product search
87127
// ---------------------------------------------------------------------------
@@ -90,7 +130,7 @@ impl ChunkMeta {
90130
pub struct RustVectorStore {
91131
dimension: usize,
92132
/// Flat contiguous storage: vectors[i*dim .. (i+1)*dim]
93-
vectors: Vec<f32>,
133+
vectors: FlatVectorStorage,
94134
metadata: Vec<ChunkMeta>,
95135
/// file_path → set of vector indices
96136
file_index: HashMap<String, Vec<usize>>,
@@ -102,7 +142,7 @@ impl RustVectorStore {
102142
fn new(dimension: usize) -> Self {
103143
Self {
104144
dimension,
105-
vectors: Vec::new(),
145+
vectors: FlatVectorStorage::Owned(Vec::new()),
106146
metadata: Vec::new(),
107147
file_index: HashMap::new(),
108148
}
@@ -146,7 +186,8 @@ impl RustVectorStore {
146186
}
147187

148188
let base_idx = self.metadata.len();
149-
self.vectors.reserve(n * dim);
189+
let vec_store = self.vectors.to_owned_mut();
190+
vec_store.reserve(n * dim);
150191

151192
for i in 0..n {
152193
let idx = base_idx + i;
@@ -157,7 +198,7 @@ impl RustVectorStore {
157198

158199
// Append vector data (row-major from numpy)
159200
for j in 0..dim {
160-
self.vectors.push(arr[[i, j]]);
201+
vec_store.push(arr[[i, j]]);
161202
}
162203
}
163204
self.metadata.extend(metadata_list);
@@ -191,6 +232,7 @@ impl RustVectorStore {
191232
}
192233

193234
let dim = self.dimension;
235+
let data = self.vectors.as_slice();
194236

195237
// Parallel inner-product computation
196238
let mut scores: Vec<(usize, f32)> = (0..n)
@@ -200,7 +242,7 @@ impl RustVectorStore {
200242
let mut dot: f32 = 0.0;
201243
// Manual loop for autovectorisation
202244
for j in 0..dim {
203-
dot += unsafe { *self.vectors.get_unchecked(offset + j) } * unsafe { *q.get_unchecked(j) };
245+
dot += unsafe { *data.get_unchecked(offset + j) } * unsafe { *q.get_unchecked(j) };
204246
}
205247
(i, dot)
206248
})
@@ -233,11 +275,12 @@ impl RustVectorStore {
233275

234276
// --- vectors.bin: [dim:u64][count:u64][f32 × dim × count] ---
235277
let vec_path = dir.join("vectors.bin");
236-
let total_floats = self.vectors.len();
278+
let data = self.vectors.as_slice();
279+
let total_floats = data.len();
237280
let mut buf = Vec::with_capacity(16 + total_floats * 4);
238281
buf.extend_from_slice(&(self.dimension as u64).to_le_bytes());
239282
buf.extend_from_slice(&(self.metadata.len() as u64).to_le_bytes());
240-
for &v in &self.vectors {
283+
for &v in data {
241284
buf.extend_from_slice(&v.to_le_bytes());
242285
}
243286
fs::write(&vec_path, &buf)
@@ -253,7 +296,7 @@ impl RustVectorStore {
253296
Ok(())
254297
}
255298

256-
/// Load a vector store from directory.
299+
/// Load a vector store from directory (reads file into memory).
257300
#[classmethod]
258301
fn load(_cls: &Bound<'_, PyType>, directory: &str) -> PyResult<Self> {
259302
let dir = Path::new(directory);
@@ -306,7 +349,71 @@ impl RustVectorStore {
306349

307350
Ok(Self {
308351
dimension,
309-
vectors,
352+
vectors: FlatVectorStorage::Owned(vectors),
353+
metadata,
354+
file_index,
355+
})
356+
}
357+
358+
/// Load a vector store with memory-mapped I/O for near-instant startup.
359+
///
360+
/// The vector data stays on disk and is paged in by the OS on demand.
361+
/// Mutations (add / remove) will copy the data to heap first.
362+
#[classmethod]
363+
fn load_mmap(_cls: &Bound<'_, PyType>, directory: &str) -> PyResult<Self> {
364+
let dir = Path::new(directory);
365+
366+
let vec_path = dir.join("vectors.bin");
367+
let file = fs::File::open(&vec_path)
368+
.map_err(|e| pyo3::exceptions::PyFileNotFoundError::new_err(e.to_string()))?;
369+
let mmap = unsafe { Mmap::map(&file) }
370+
.map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;
371+
372+
if mmap.len() < 16 {
373+
return Err(pyo3::exceptions::PyValueError::new_err(
374+
"Invalid vectors.bin: file too small",
375+
));
376+
}
377+
378+
let dimension = u64::from_le_bytes(mmap[0..8].try_into().unwrap()) as usize;
379+
let count = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
380+
let expected = count * dimension * 4;
381+
if mmap.len() < 16 + expected {
382+
return Err(pyo3::exceptions::PyValueError::new_err(
383+
"Truncated vectors.bin",
384+
));
385+
}
386+
387+
let float_count = count * dimension;
388+
let ptr = mmap[16..].as_ptr() as *const f32;
389+
390+
// Load metadata.json
391+
let meta_path = dir.join("metadata.json");
392+
let meta_json = fs::read_to_string(&meta_path)
393+
.map_err(|e| pyo3::exceptions::PyFileNotFoundError::new_err(e.to_string()))?;
394+
let metadata: Vec<ChunkMeta> = serde_json::from_str(&meta_json)
395+
.map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;
396+
397+
if metadata.len() != count {
398+
return Err(pyo3::exceptions::PyValueError::new_err(format!(
399+
"Metadata count ({}) != vector count ({})",
400+
metadata.len(),
401+
count
402+
)));
403+
}
404+
405+
let mut file_index: HashMap<String, Vec<usize>> = HashMap::new();
406+
for (i, m) in metadata.iter().enumerate() {
407+
file_index.entry(m.file_path.clone()).or_default().push(i);
408+
}
409+
410+
Ok(Self {
411+
dimension,
412+
vectors: FlatVectorStorage::Mmap {
413+
_mmap: mmap,
414+
ptr,
415+
len: float_count,
416+
},
310417
metadata,
311418
file_index,
312419
})
@@ -323,18 +430,19 @@ impl RustVectorStore {
323430

324431
// Single-pass rebuild (no shifting)
325432
let dim = self.dimension;
326-
let mut new_vectors = Vec::with_capacity(self.vectors.len() - count * dim);
433+
let data = self.vectors.as_slice();
434+
let mut new_vectors = Vec::with_capacity(data.len() - count * dim);
327435
let mut new_metadata = Vec::with_capacity(self.metadata.len() - count);
328436

329437
for (i, meta) in self.metadata.iter().enumerate() {
330438
if !remove_set.contains(&i) {
331439
new_metadata.push(meta.clone());
332440
let start = i * dim;
333-
new_vectors.extend_from_slice(&self.vectors[start..start + dim]);
441+
new_vectors.extend_from_slice(&data[start..start + dim]);
334442
}
335443
}
336444

337-
self.vectors = new_vectors;
445+
self.vectors = FlatVectorStorage::Owned(new_vectors);
338446
self.metadata = new_metadata;
339447

340448
// Rebuild file index
@@ -357,12 +465,13 @@ impl RustVectorStore {
357465
None => return Vec::new(),
358466
};
359467
let dim = self.dimension;
468+
let data = self.vectors.as_slice();
360469
indices
361470
.iter()
362471
.map(|&idx| {
363472
let meta = self.metadata[idx].clone();
364473
let start = idx * dim;
365-
let slice = &self.vectors[start..start + dim];
474+
let slice = &data[start..start + dim];
366475
let arr = PyArray1::from_slice_bound(py, slice).unbind();
367476
(meta, arr)
368477
})
@@ -371,7 +480,7 @@ impl RustVectorStore {
371480

372481
/// Clear all stored data.
373482
fn clear(&mut self) {
374-
self.vectors.clear();
483+
self.vectors = FlatVectorStorage::Owned(Vec::new());
375484
self.metadata.clear();
376485
self.file_index.clear();
377486
}

0 commit comments

Comments
 (0)