Skip to content

Commit f664621

Browse files
authored
Merge pull request #15 from sochdb/release/0.5.0
Performance tweaks.
2 parents cebd99c + 889c84d commit f664621

File tree

6 files changed

+962
-68
lines changed

6 files changed

+962
-68
lines changed

README.md

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3670,6 +3670,85 @@ index.save("./index.bin")
36703670
index = VectorIndex.load("./index.bin")
36713671
```
36723672

3673+
### BatchAccumulator — Deferred High-Throughput Insertion
3674+
3675+
The `BatchAccumulator` provides **4–5× faster** bulk insertion by separating
3676+
data accumulation from HNSW graph construction:
3677+
3678+
| Phase | What happens | Cost (50K × 1536D) |
3679+
|-------|-------------|---------------------|
3680+
| **Accumulate** (`add()`) | Pure numpy memcpy, zero FFI | ~2–3 s |
3681+
| **Build** (`flush()`) | Single `insert_batch()` FFI call, full Rayon parallelism | ~13 s |
3682+
| **Total** | | **~15 s** |
3683+
| *Without BatchAccumulator* | *Incremental insert_batch calls* | *~20 s* |
3684+
3685+
**Why it's faster:**
3686+
3687+
1. **Zero FFI during accumulation**`add()` copies vectors into pre-allocated numpy arrays.
3688+
No ctypes calls, no Rust overhead, no HNSW graph updates.
3689+
2. **Single bulk graph build**`flush()` passes all N vectors in one FFI call.
3690+
Rust's Rayon-parallel HNSW builder uses wave-parallel construction (32-node waves,
3691+
adaptive ef capped at 48 in batch mode) for maximum throughput.
3692+
3. **Geometric buffer growth** — Pre-allocated arrays with 2× growth avoid repeated
3693+
memory allocations. Pass `estimated_size` to eliminate all growth allocations.
3694+
3695+
```python
3696+
from sochdb import VectorIndex, BatchAccumulator
3697+
import numpy as np
3698+
3699+
# Create index
3700+
index = VectorIndex(dimension=1536, max_connections=16, ef_construction=200)
3701+
3702+
# --- Option A: Explicit usage ---
3703+
acc = index.batch_accumulator(estimated_size=50_000)
3704+
3705+
# Accumulate from streaming data source (zero FFI, pure memcpy)
3706+
for batch_ids, batch_vecs in data_loader:
3707+
acc.add(batch_ids, batch_vecs) # O(N) numpy copy, no graph build
3708+
3709+
# Build HNSW in one shot (single FFI call, full Rayon parallelism)
3710+
inserted = acc.flush()
3711+
print(f"Indexed {inserted} vectors")
3712+
3713+
# --- Option B: Context manager (auto-flush on exit) ---
3714+
with index.batch_accumulator(50_000) as acc:
3715+
acc.add(ids, vecs)
3716+
# flush() called automatically
3717+
3718+
# --- Option C: Cross-process persistence (benchmark frameworks) ---
3719+
acc = index.batch_accumulator(50_000)
3720+
for chunk_ids, chunk_vecs in data_loader:
3721+
acc.add(chunk_ids, chunk_vecs)
3722+
acc.save("/tmp/index_data") # persist to disk as numpy files
3723+
3724+
# ... later, in a different process ...
3725+
acc2 = index.batch_accumulator()
3726+
acc2.load("/tmp/index_data") # load from disk
3727+
inserted = acc2.flush() # single bulk HNSW build
3728+
```
3729+
3730+
**API Reference:**
3731+
3732+
| Method | Description |
3733+
|--------|-------------|
3734+
| `index.batch_accumulator(estimated_size=0)` | Create accumulator bound to index |
3735+
| `acc.add(ids, vectors)` | Append chunk (zero FFI, numpy memcpy) |
3736+
| `acc.add_single(id, vector)` | Append one vector |
3737+
| `acc.flush()``int` | Build HNSW graph, return count inserted |
3738+
| `acc.save(directory)` | Persist to disk (numpy `.npy` files) |
3739+
| `acc.load(directory)` | Load from disk into accumulator |
3740+
| `len(acc)` / `acc.count` | Number of accumulated (unflushed) vectors |
3741+
3742+
**VectorDBBench benchmark results (OpenAI/COHERE 50K×1536D, M1 Pro):**
3743+
3744+
| Metric | SochDB | ChromaDB | LanceDB |
3745+
|--------|--------|----------|---------|
3746+
| Recall@100 | 0.9898 | 0.9967 | 0.9671 |
3747+
| Avg Latency | 3.2 ms | 15.2 ms | 9.6 ms |
3748+
| P99 Latency | 4.9 ms | 26.4 ms | 12.2 ms |
3749+
| Insert Duration | **5.1 s** | 64.7 s | 7.0 s |
3750+
| Total Load | **17.4 s** | 64.7 s | 30.2 s |
3751+
36733752
---
36743753

36753754
## 29. Vector Utilities

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "sochdb"
7-
version = "0.4.9"
7+
version = "0.5.0"
88
description = "SochDB is an AI-native database with token-optimized output, O(|path|) lookups, built-in vector search, and durable transactions."
99
readme = "README.md"
1010
license = {text = "Apache-2.0"}

src/sochdb/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
SochDB Python SDK v0.4.6
2+
SochDB Python SDK v0.5.0
33
44
Dual-mode architecture: Embedded (FFI) + Server (gRPC/IPC)
55
@@ -32,7 +32,7 @@
3232
client.put_kv("key", b"value")
3333
"""
3434

35-
__version__ = "0.4.7"
35+
__version__ = "0.5.0"
3636

3737
# Embedded mode (FFI)
3838
from .database import Database, Transaction
@@ -45,7 +45,7 @@
4545
SearchRequest,
4646
SearchResults,
4747
)
48-
from .vector import VectorIndex
48+
from .vector import VectorIndex, BatchAccumulator
4949

5050
# Queue API (v0.4.3)
5151
from .queue import (
@@ -115,6 +115,7 @@
115115
"SearchRequest",
116116
"SearchResults",
117117
"VectorIndex",
118+
"BatchAccumulator",
118119

119120
# Queue API (v0.4.3)
120121
"PriorityQueue",

src/sochdb/database.py

Lines changed: 201 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,9 +419,44 @@ def _setup_bindings(cls):
419419
except (AttributeError, OSError):
420420
pass
421421

422-
# Collection Search API (Native Rust vector search)
422+
# Collection API (Native Rust vector operations)
423423
# Optional: Only available in newer native library versions
424424
try:
425+
# sochdb_collection_create(ptr, namespace, collection, dimension, dist_type) -> c_int
426+
lib.sochdb_collection_create.argtypes = [
427+
ctypes.c_void_p, # ptr
428+
ctypes.c_char_p, # namespace
429+
ctypes.c_char_p, # collection
430+
ctypes.c_size_t, # dimension
431+
ctypes.c_uint8, # dist_type: 0=Cosine, 1=Euclidean, 2=Dot
432+
]
433+
lib.sochdb_collection_create.restype = ctypes.c_int
434+
435+
# sochdb_collection_insert(ptr, namespace, collection, id, vector_ptr, vector_len, metadata_json) -> c_int
436+
lib.sochdb_collection_insert.argtypes = [
437+
ctypes.c_void_p, # ptr
438+
ctypes.c_char_p, # namespace
439+
ctypes.c_char_p, # collection
440+
ctypes.c_char_p, # id
441+
ctypes.POINTER(ctypes.c_float), # vector_ptr
442+
ctypes.c_size_t, # vector_len
443+
ctypes.c_char_p, # metadata_json (nullable)
444+
]
445+
lib.sochdb_collection_insert.restype = ctypes.c_int
446+
447+
# sochdb_collection_insert_batch(ptr, ns, col, ids[], vectors_flat, dim, metas[], count) -> c_int
448+
lib.sochdb_collection_insert_batch.argtypes = [
449+
ctypes.c_void_p, # ptr
450+
ctypes.c_char_p, # namespace
451+
ctypes.c_char_p, # collection
452+
ctypes.POINTER(ctypes.c_char_p), # ids array
453+
ctypes.POINTER(ctypes.c_float), # flat vectors array
454+
ctypes.c_size_t, # dimension
455+
ctypes.POINTER(ctypes.c_char_p), # metadata_jsons array (nullable entries)
456+
ctypes.c_size_t, # count
457+
]
458+
lib.sochdb_collection_insert_batch.restype = ctypes.c_int
459+
425460
lib.sochdb_collection_search.argtypes = [
426461
ctypes.c_void_p, # ptr
427462
ctypes.c_char_p, # namespace
@@ -2386,6 +2421,171 @@ def ffi_collection_search(
23862421
# FFI not available, return empty (caller should fallback)
23872422
return None
23882423

2424+
def ffi_collection_create(
2425+
self,
2426+
namespace: str,
2427+
collection: str,
2428+
dimension: int,
2429+
metric: str = "cosine",
2430+
) -> bool:
2431+
"""
2432+
Create a collection via native Rust FFI.
2433+
2434+
Args:
2435+
namespace: Namespace name
2436+
collection: Collection name
2437+
dimension: Vector dimension
2438+
metric: Distance metric ("cosine", "euclidean", "dot_product")
2439+
2440+
Returns:
2441+
True on success
2442+
"""
2443+
self._check_open()
2444+
dist_map = {"cosine": 0, "euclidean": 1, "dot_product": 2, "dot": 2}
2445+
dist_type = dist_map.get(metric, 0)
2446+
try:
2447+
result = self._lib.sochdb_collection_create(
2448+
self._handle,
2449+
namespace.encode("utf-8"),
2450+
collection.encode("utf-8"),
2451+
dimension,
2452+
dist_type,
2453+
)
2454+
return result == 0
2455+
except (AttributeError, OSError):
2456+
return False
2457+
2458+
def ffi_collection_insert(
2459+
self,
2460+
namespace: str,
2461+
collection: str,
2462+
doc_id: str,
2463+
vector: "List[float]",
2464+
metadata: "Optional[Dict]" = None,
2465+
) -> bool:
2466+
"""
2467+
Insert a vector into a collection via native Rust FFI.
2468+
2469+
This persists the vector to disk AND inserts into the in-process HNSW index.
2470+
2471+
Args:
2472+
namespace: Namespace name
2473+
collection: Collection name
2474+
doc_id: Document ID string
2475+
vector: Vector embedding
2476+
metadata: Optional metadata dict
2477+
2478+
Returns:
2479+
True on success
2480+
"""
2481+
self._check_open()
2482+
import numpy as np
2483+
import json as _json
2484+
2485+
vec_array = np.array(vector, dtype=np.float32)
2486+
vec_ptr = vec_array.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
2487+
2488+
meta_json = None
2489+
if metadata:
2490+
meta_json = _json.dumps(metadata).encode("utf-8")
2491+
2492+
try:
2493+
result = self._lib.sochdb_collection_insert(
2494+
self._handle,
2495+
namespace.encode("utf-8"),
2496+
collection.encode("utf-8"),
2497+
doc_id.encode("utf-8"),
2498+
vec_ptr,
2499+
len(vector),
2500+
meta_json,
2501+
)
2502+
return result == 0
2503+
except (AttributeError, OSError):
2504+
return False
2505+
2506+
def ffi_collection_insert_batch(
2507+
self,
2508+
namespace: str,
2509+
collection: str,
2510+
ids: "List[str]",
2511+
vectors: "List[List[float]]",
2512+
metadatas: "Optional[List[Optional[Dict]]]" = None,
2513+
) -> int:
2514+
"""
2515+
Batch insert vectors into a collection via native Rust FFI.
2516+
Uses a single transaction for the entire batch for high throughput.
2517+
2518+
Args:
2519+
namespace: Namespace name
2520+
collection: Collection name
2521+
ids: List of document ID strings
2522+
vectors: List of vector embeddings
2523+
metadatas: Optional list of metadata dicts
2524+
2525+
Returns:
2526+
Number of successfully inserted vectors
2527+
"""
2528+
import numpy as np
2529+
2530+
if not ids or not vectors:
2531+
return 0
2532+
2533+
n = len(ids)
2534+
dimension = len(vectors[0])
2535+
ns_bytes = namespace.encode("utf-8")
2536+
col_bytes = collection.encode("utf-8")
2537+
2538+
# Build flat vector array (n * dimension floats)
2539+
flat_vectors = np.array(vectors, dtype=np.float32).flatten()
2540+
vec_ptr = flat_vectors.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
2541+
2542+
# Build C string array for IDs
2543+
id_bytes = [str(doc_id).encode("utf-8") for doc_id in ids]
2544+
IdArrayType = ctypes.c_char_p * n
2545+
id_array = IdArrayType(*id_bytes)
2546+
2547+
# Build C string array for metadata JSONs
2548+
meta_bytes = []
2549+
for i in range(n):
2550+
if metadatas and i < len(metadatas) and metadatas[i]:
2551+
meta_bytes.append(json.dumps(metadatas[i]).encode("utf-8"))
2552+
else:
2553+
meta_bytes.append(None)
2554+
MetaArrayType = ctypes.c_char_p * n
2555+
meta_array = MetaArrayType(*meta_bytes)
2556+
2557+
try:
2558+
result = self._lib.sochdb_collection_insert_batch(
2559+
self._handle,
2560+
ns_bytes,
2561+
col_bytes,
2562+
id_array,
2563+
vec_ptr,
2564+
ctypes.c_size_t(dimension),
2565+
meta_array,
2566+
ctypes.c_size_t(n),
2567+
)
2568+
return max(result, 0)
2569+
except (AttributeError, OSError):
2570+
# Fallback: per-vector insert if batch FFI not available
2571+
count = 0
2572+
for i, (doc_id, vector) in enumerate(zip(ids, vectors)):
2573+
vec_array = np.array(vector, dtype=np.float32)
2574+
vp = vec_array.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
2575+
meta_json = None
2576+
if metadatas and i < len(metadatas) and metadatas[i]:
2577+
meta_json = json.dumps(metadatas[i]).encode("utf-8")
2578+
try:
2579+
r = self._lib.sochdb_collection_insert(
2580+
self._handle, ns_bytes, col_bytes,
2581+
str(doc_id).encode("utf-8"), vp, len(vector), meta_json,
2582+
)
2583+
if r == 0:
2584+
count += 1
2585+
except (AttributeError, OSError):
2586+
pass
2587+
return count
2588+
23892589
def ffi_collection_keyword_search(
23902590
self,
23912591
namespace: str,

0 commit comments

Comments
 (0)