Merge pull request #15 from sochdb/release/0.5.0

sushanthpy · web-flow · commit f664621e41a6 · 2026-02-06T17:56:03.000-08:00
Performance tweaks.
diff --git a/README.md b/README.md
@@ -3670,6 +3670,85 @@ index.save("./index.bin")
 index = VectorIndex.load("./index.bin")
 ```
 
+### BatchAccumulator — Deferred High-Throughput Insertion
+
+The `BatchAccumulator` provides **4–5× faster** bulk insertion by separating
+data accumulation from HNSW graph construction:
+
+| Phase | What happens | Cost (50K × 1536D) |
+|-------|-------------|---------------------|
+| **Accumulate** (`add()`) | Pure numpy memcpy, zero FFI | ~2–3 s |
+| **Build** (`flush()`) | Single `insert_batch()` FFI call, full Rayon parallelism | ~13 s |
+| **Total** | | **~15 s** |
+| *Without BatchAccumulator* | *Incremental insert_batch calls* | *~20 s* |
+
+**Why it's faster:**
+
+1. **Zero FFI during accumulation** — `add()` copies vectors into pre-allocated numpy arrays.
+   No ctypes calls, no Rust overhead, no HNSW graph updates.
+2. **Single bulk graph build** — `flush()` passes all N vectors in one FFI call.
+   Rust's Rayon-parallel HNSW builder uses wave-parallel construction (32-node waves,
+   adaptive ef capped at 48 in batch mode) for maximum throughput.
+3. **Geometric buffer growth** — Pre-allocated arrays with 2× growth avoid repeated
+   memory allocations. Pass `estimated_size` to eliminate all growth allocations.
+
+```python
+from sochdb import VectorIndex, BatchAccumulator
+import numpy as np
+
+# Create index
+index = VectorIndex(dimension=1536, max_connections=16, ef_construction=200)
+
+# --- Option A: Explicit usage ---
+acc = index.batch_accumulator(estimated_size=50_000)
+
+# Accumulate from streaming data source (zero FFI, pure memcpy)
+for batch_ids, batch_vecs in data_loader:
+    acc.add(batch_ids, batch_vecs)  # O(N) numpy copy, no graph build
+
+# Build HNSW in one shot (single FFI call, full Rayon parallelism)
+inserted = acc.flush()
+print(f"Indexed {inserted} vectors")
+
+# --- Option B: Context manager (auto-flush on exit) ---
+with index.batch_accumulator(50_000) as acc:
+    acc.add(ids, vecs)
+# flush() called automatically
+
+# --- Option C: Cross-process persistence (benchmark frameworks) ---
+acc = index.batch_accumulator(50_000)
+for chunk_ids, chunk_vecs in data_loader:
+    acc.add(chunk_ids, chunk_vecs)
+acc.save("/tmp/index_data")            # persist to disk as numpy files
+
+# ... later, in a different process ...
+acc2 = index.batch_accumulator()
+acc2.load("/tmp/index_data")            # load from disk
+inserted = acc2.flush()                 # single bulk HNSW build
+```
+
+**API Reference:**
+
+| Method | Description |
+|--------|-------------|
+| `index.batch_accumulator(estimated_size=0)` | Create accumulator bound to index |
+| `acc.add(ids, vectors)` | Append chunk (zero FFI, numpy memcpy) |
+| `acc.add_single(id, vector)` | Append one vector |
+| `acc.flush()` → `int` | Build HNSW graph, return count inserted |
+| `acc.save(directory)` | Persist to disk (numpy `.npy` files) |
+| `acc.load(directory)` | Load from disk into accumulator |
+| `len(acc)` / `acc.count` | Number of accumulated (unflushed) vectors |
+
+**VectorDBBench benchmark results (OpenAI/COHERE 50K×1536D, M1 Pro):**
+
+| Metric | SochDB | ChromaDB | LanceDB |
+|--------|--------|----------|---------|
+| Recall@100 | 0.9898 | 0.9967 | 0.9671 |
+| Avg Latency | 3.2 ms | 15.2 ms | 9.6 ms |
+| P99 Latency | 4.9 ms | 26.4 ms | 12.2 ms |
+| Insert Duration | **5.1 s** | 64.7 s | 7.0 s |
+| Total Load | **17.4 s** | 64.7 s | 30.2 s |
+
 ---
 
 ## 29. Vector Utilities
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sochdb"
-version = "0.4.9"
+version = "0.5.0"
 description = "SochDB is an AI-native database with token-optimized output, O(|path|) lookups, built-in vector search, and durable transactions."
 readme = "README.md"
 license = {text = "Apache-2.0"}
diff --git a/src/sochdb/__init__.py b/src/sochdb/__init__.py
@@ -1,5 +1,5 @@
 """
-SochDB Python SDK v0.4.6
+SochDB Python SDK v0.5.0
 
 Dual-mode architecture: Embedded (FFI) + Server (gRPC/IPC)
 
@@ -32,7 +32,7 @@
     client.put_kv("key", b"value")
 """
 
-__version__ = "0.4.7"
+__version__ = "0.5.0"
 
 # Embedded mode (FFI)
 from .database import Database, Transaction
@@ -45,7 +45,7 @@
     SearchRequest,
     SearchResults,
 )
-from .vector import VectorIndex
+from .vector import VectorIndex, BatchAccumulator
 
 # Queue API (v0.4.3)
 from .queue import (
@@ -115,6 +115,7 @@
     "SearchRequest",
     "SearchResults",
     "VectorIndex",
+    "BatchAccumulator",
     
     # Queue API (v0.4.3)
     "PriorityQueue",
diff --git a/src/sochdb/database.py b/src/sochdb/database.py
@@ -419,9 +419,44 @@ def _setup_bindings(cls):
         except (AttributeError, OSError):
              pass
         
-        # Collection Search API (Native Rust vector search)
+        # Collection API (Native Rust vector operations)
         # Optional: Only available in newer native library versions
         try:
+            # sochdb_collection_create(ptr, namespace, collection, dimension, dist_type) -> c_int
+            lib.sochdb_collection_create.argtypes = [
+                ctypes.c_void_p,   # ptr
+                ctypes.c_char_p,   # namespace
+                ctypes.c_char_p,   # collection
+                ctypes.c_size_t,   # dimension
+                ctypes.c_uint8,    # dist_type: 0=Cosine, 1=Euclidean, 2=Dot
+            ]
+            lib.sochdb_collection_create.restype = ctypes.c_int
+
+            # sochdb_collection_insert(ptr, namespace, collection, id, vector_ptr, vector_len, metadata_json) -> c_int
+            lib.sochdb_collection_insert.argtypes = [
+                ctypes.c_void_p,   # ptr
+                ctypes.c_char_p,   # namespace
+                ctypes.c_char_p,   # collection
+                ctypes.c_char_p,   # id
+                ctypes.POINTER(ctypes.c_float),  # vector_ptr
+                ctypes.c_size_t,   # vector_len
+                ctypes.c_char_p,   # metadata_json (nullable)
+            ]
+            lib.sochdb_collection_insert.restype = ctypes.c_int
+
+            # sochdb_collection_insert_batch(ptr, ns, col, ids[], vectors_flat, dim, metas[], count) -> c_int
+            lib.sochdb_collection_insert_batch.argtypes = [
+                ctypes.c_void_p,                   # ptr
+                ctypes.c_char_p,                   # namespace
+                ctypes.c_char_p,                   # collection
+                ctypes.POINTER(ctypes.c_char_p),   # ids array
+                ctypes.POINTER(ctypes.c_float),    # flat vectors array
+                ctypes.c_size_t,                   # dimension
+                ctypes.POINTER(ctypes.c_char_p),   # metadata_jsons array (nullable entries)
+                ctypes.c_size_t,                   # count
+            ]
+            lib.sochdb_collection_insert_batch.restype = ctypes.c_int
+
             lib.sochdb_collection_search.argtypes = [
                 ctypes.c_void_p,   # ptr
                 ctypes.c_char_p,   # namespace
@@ -2386,6 +2421,171 @@ def ffi_collection_search(
             # FFI not available, return empty (caller should fallback)
             return None
 
+    def ffi_collection_create(
+        self,
+        namespace: str,
+        collection: str,
+        dimension: int,
+        metric: str = "cosine",
+    ) -> bool:
+        """
+        Create a collection via native Rust FFI.
+        
+        Args:
+            namespace: Namespace name
+            collection: Collection name
+            dimension: Vector dimension
+            metric: Distance metric ("cosine", "euclidean", "dot_product")
+            
+        Returns:
+            True on success
+        """
+        self._check_open()
+        dist_map = {"cosine": 0, "euclidean": 1, "dot_product": 2, "dot": 2}
+        dist_type = dist_map.get(metric, 0)
+        try:
+            result = self._lib.sochdb_collection_create(
+                self._handle,
+                namespace.encode("utf-8"),
+                collection.encode("utf-8"),
+                dimension,
+                dist_type,
+            )
+            return result == 0
+        except (AttributeError, OSError):
+            return False
+
+    def ffi_collection_insert(
+        self,
+        namespace: str,
+        collection: str,
+        doc_id: str,
+        vector: "List[float]",
+        metadata: "Optional[Dict]" = None,
+    ) -> bool:
+        """
+        Insert a vector into a collection via native Rust FFI.
+        
+        This persists the vector to disk AND inserts into the in-process HNSW index.
+        
+        Args:
+            namespace: Namespace name
+            collection: Collection name
+            doc_id: Document ID string
+            vector: Vector embedding
+            metadata: Optional metadata dict
+            
+        Returns:
+            True on success
+        """
+        self._check_open()
+        import numpy as np
+        import json as _json
+        
+        vec_array = np.array(vector, dtype=np.float32)
+        vec_ptr = vec_array.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+        
+        meta_json = None
+        if metadata:
+            meta_json = _json.dumps(metadata).encode("utf-8")
+        
+        try:
+            result = self._lib.sochdb_collection_insert(
+                self._handle,
+                namespace.encode("utf-8"),
+                collection.encode("utf-8"),
+                doc_id.encode("utf-8"),
+                vec_ptr,
+                len(vector),
+                meta_json,
+            )
+            return result == 0
+        except (AttributeError, OSError):
+            return False
+
+    def ffi_collection_insert_batch(
+        self,
+        namespace: str,
+        collection: str,
+        ids: "List[str]",
+        vectors: "List[List[float]]",
+        metadatas: "Optional[List[Optional[Dict]]]" = None,
+    ) -> int:
+        """
+        Batch insert vectors into a collection via native Rust FFI.
+        Uses a single transaction for the entire batch for high throughput.
+        
+        Args:
+            namespace: Namespace name
+            collection: Collection name
+            ids: List of document ID strings
+            vectors: List of vector embeddings
+            metadatas: Optional list of metadata dicts
+            
+        Returns:
+            Number of successfully inserted vectors
+        """
+        import numpy as np
+        
+        if not ids or not vectors:
+            return 0
+        
+        n = len(ids)
+        dimension = len(vectors[0])
+        ns_bytes = namespace.encode("utf-8")
+        col_bytes = collection.encode("utf-8")
+        
+        # Build flat vector array (n * dimension floats)
+        flat_vectors = np.array(vectors, dtype=np.float32).flatten()
+        vec_ptr = flat_vectors.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+        
+        # Build C string array for IDs
+        id_bytes = [str(doc_id).encode("utf-8") for doc_id in ids]
+        IdArrayType = ctypes.c_char_p * n
+        id_array = IdArrayType(*id_bytes)
+        
+        # Build C string array for metadata JSONs
+        meta_bytes = []
+        for i in range(n):
+            if metadatas and i < len(metadatas) and metadatas[i]:
+                meta_bytes.append(json.dumps(metadatas[i]).encode("utf-8"))
+            else:
+                meta_bytes.append(None)
+        MetaArrayType = ctypes.c_char_p * n
+        meta_array = MetaArrayType(*meta_bytes)
+        
+        try:
+            result = self._lib.sochdb_collection_insert_batch(
+                self._handle,
+                ns_bytes,
+                col_bytes,
+                id_array,
+                vec_ptr,
+                ctypes.c_size_t(dimension),
+                meta_array,
+                ctypes.c_size_t(n),
+            )
+            return max(result, 0)
+        except (AttributeError, OSError):
+            # Fallback: per-vector insert if batch FFI not available
+            count = 0
+            for i, (doc_id, vector) in enumerate(zip(ids, vectors)):
+                vec_array = np.array(vector, dtype=np.float32)
+                vp = vec_array.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+                meta_json = None
+                if metadatas and i < len(metadatas) and metadatas[i]:
+                    meta_json = json.dumps(metadatas[i]).encode("utf-8")
+                try:
+                    r = self._lib.sochdb_collection_insert(
+                        self._handle, ns_bytes, col_bytes,
+                        str(doc_id).encode("utf-8"), vp, len(vector), meta_json,
+                    )
+                    if r == 0:
+                        count += 1
+                except (AttributeError, OSError):
+                    pass
+            return count
+
     def ffi_collection_keyword_search(
         self,
         namespace: str,
diff --git a/src/sochdb/namespace.py b/src/sochdb/namespace.py
diff --git a/src/sochdb/vector.py b/src/sochdb/vector.py