feat: update vector index parameters and add RID filtering support in search

tae898 · tae898 · commit 1addedd4c556 · 2025-12-28T10:50:45.000+01:00
diff --git a/bindings/python/docs/api/vector.md b/bindings/python/docs/api/vector.md
@@ -112,8 +112,8 @@ db.create_vector_index(
     vector_property: str,
     dimensions: int,
     distance_function: str = "cosine",
-    max_connections: int = 16,
-    beam_width: int = 200
+    max_connections: int = 32,
+    beam_width: int = 256
 ) -> VectorIndex
 ```
 
@@ -126,11 +126,11 @@ db.create_vector_index(
   - `"cosine"`: Cosine distance (1 - cosine similarity)
   - `"euclidean"`: Euclidean distance (L2 norm)
   - `"inner_product"`: Negative inner product
-- `max_connections` (int): Max connections per node (default: 16)
+- `max_connections` (int): Max connections per node (default: 32)
   - Maps to `maxConnections` in JVector
   - Higher = better recall, more memory
-  - Typical range: 12-48
-- `beam_width` (int): Beam width for search/construction (default: 200)
+  - Typical range: 16-64
+- `beam_width` (int): Beam width for search/construction (default: 256)
   - Maps to `beamWidth` in JVector
   - Higher = better recall, slower search
   - Typical range: 100-400
@@ -162,16 +162,16 @@ index = db.create_vector_index(
     vector_property="embedding",
     dimensions=384,  # Match your embedding model
     distance_function="cosine",
-    m=16,
-    ef=128
+    max_connections=32,
+    beam_width=256
 )
 
 print(f"Created vector index: {index}")
 ```
 
 ---
 
-### `VectorIndex.find_nearest(query_vector, k=10, use_numpy=True)`
+### `VectorIndex.find_nearest(query_vector, k=10, overquery_factor=16, use_numpy=True, allowed_rids=None)`
 
 Find k-nearest neighbors to the query vector.
 
@@ -182,7 +182,11 @@ Find k-nearest neighbors to the query vector.
   - NumPy array: `np.array([0.1, 0.2, ...])`
   - Any array-like iterable
 - `k` (int): Number of neighbors to return (default: 10)
+- `overquery_factor` (int): Multiplier for search-time over-querying (implicit efSearch)
+  (default: 16)
 - `use_numpy` (bool): Return vectors as NumPy if available (default: `True`)
+- `allowed_rids` (List[str]): Optional list of RID strings (e.g. `["#1:0", "#2:5"]`) to
+  restrict search (default: `None`)
 
 **Returns:**
 
@@ -202,6 +206,10 @@ query_vector = generate_embedding(query_text)  # Your embedding function
 # Search for 5 most similar documents
 neighbors = index.find_nearest(query_vector, k=5)
 
+# Search with RID filtering
+allowed_rids = ["#10:5", "#10:8", "#10:12"]
+filtered_neighbors = index.find_nearest(query_vector, k=5, allowed_rids=allowed_rids)
+
 for vertex, distance in neighbors:
     doc_id = vertex.get("id")
     text = vertex.get("text")
diff --git a/bindings/python/docs/examples/03_vector_search.md b/bindings/python/docs/examples/03_vector_search.md
@@ -72,12 +72,9 @@ index = db.create_vector_index(
     vertex_type="Article",
     vector_property="embedding",
     dimensions=384,
-    id_property="id",
     distance_function="cosine",  # or "euclidean", "inner_product"
-    m=16,                         # connections per node
-    ef=128,                       # search quality
-    ef_construction=128,          # build quality
-    max_items=10000              # capacity
+    max_connections=32,          # connections per node (default: 32)
+    beam_width=256               # search quality (default: 256)
 )
 ```
 
@@ -88,14 +85,11 @@ index = db.create_vector_index(
   - `cosine`: Best for normalized vectors (text embeddings)
   - `euclidean`: Straight-line distance (image features)
   - `inner_product`: Dot product (when magnitude matters)
-- **M**: Connections per node (16 typical, 12-48 range)
+- **max_connections**: Connections per node (32 default, 16-64 range)
   - Higher = better accuracy, more memory
-  - 16 is good balance for most use cases
-- **ef**: Search beam width (100-200 typical)
+  - 32 is good balance for most use cases
+- **beam_width**: Search beam width (256 default, 100-400 range)
   - Higher = better recall, slower search
-- **ef_construction**: Build quality (100-200 typical)
-  - Higher = better index, slower build
-- **max_items**: Pre-allocated capacity
 
 ### Distance vs Similarity
 
@@ -120,15 +114,15 @@ index = db.create_vector_index(
 
 When you create and populate a vector index, ArcadeDB stores:
 
-**Files created** (for 10K documents, 384D, M=16):
+**Files created** (for 10K documents, 384D, M=32):
 ```
 Article_414002873519545.5.v0.hnswidx         4 KB   (metadata only)
 Article_0.1.65536.v0.bucket                 24 MB   (vertices + embeddings)
 Article_0_in_edges.3.65536.v0.bucket        22 MB   (incoming edges)
 Article_0_out_edges.2.65536.v0.bucket       22 MB   (outgoing edges)
-VectorProximity0_0.7.65536.v0.bucket        47 MB   (HNSW proximity edges)
+VectorProximity0_0.7.65536.v0.bucket        90 MB   (HNSW proximity edges)
 ─────────────────────────────────────────────────
-Total:                                     115 MB
+Total:                                     158 MB
 ```
 
 **Key insight**: The `.hnswidx` file is tiny (4KB) - it only stores metadata. The actual HNSW graph is stored as edges in the database!
@@ -319,18 +313,18 @@ RAM ≈ 4 bytes × dimensions × num_vectors × (1 + M/2)
 ```
 
 **Examples:**
-- 10K vectors, 384D, M=16: ~37 MB
-- 100K vectors, 384D, M=16: ~370 MB
-- 1M vectors, 384D, M=16: ~3.7 GB
-- 1M vectors, 1536D, M=16: ~14.7 GB
+- 10K vectors, 384D, M=32: ~50 MB
+- 100K vectors, 384D, M=32: ~500 MB
+- 1M vectors, 384D, M=32: ~5 GB
+- 1M vectors, 1536D, M=32: ~16 GB
 
 **Note:** This is working set, not total database size. ArcadeDB uses page caching, so hot data stays in RAM while cold data is read from disk on-demand.
 
 ### 4. Choosing Parameters
 
 **Start with defaults:**
 ```python
-M=16, ef=128, ef_construction=128
+max_connections=32, beam_width=256
 ```
 
 **Then tune based on needs:**
diff --git a/bindings/python/docs/examples/06_vector_search_recommendations.md b/bindings/python/docs/examples/06_vector_search_recommendations.md
@@ -287,13 +287,9 @@ index = db.create_vector_index(
     vertex_type="Movie",
     vector_property="embedding_v1",  # or "embedding_v2"
     dimensions=384,
-    max_items=10000,  # Adjusted to actual movie count
-    id_property="vector_id_v1",  # or "vector_id_v2"
-    edge_type="Movie_v1",  # Unique edge type per index
     distance_function="cosine",
-    m=16,  # Number of connections per layer
-    ef=128,  # Size of dynamic candidate list
-    ef_construction=128  # Size during index construction
+    max_connections=32,  # Number of connections per layer (default: 32)
+    beam_width=256       # Size of dynamic candidate list (default: 256)
 )
 ```
 
@@ -365,7 +361,7 @@ index = db.create_vector_index(
 
 **Vector search:**
 - Cache embeddings (stored in database properties)
-- Use appropriate HNSW parameters (m=16, ef=128)
+- Use appropriate JVector parameters (max_connections=32, beam_width=256)
 - Choose faster encoding model (paraphrase-MiniLM-L6-v2)
 
 **Memory management:**
diff --git a/bindings/python/docs/guide/vectors.md b/bindings/python/docs/guide/vectors.md
@@ -1,10 +1,13 @@
 # Vector Search Guide
 
-Vector search enables semantic similarity search using embeddings from machine learning models. This guide covers strategies, best practices, and patterns for implementing vector search with ArcadeDB.
+Vector search enables semantic similarity search using embeddings from machine learning
+models. This guide covers strategies, best practices, and patterns for implementing
+vector search with ArcadeDB.
 
 ## Overview
 
-Vector search transforms your data into high-dimensional vectors (embeddings) and finds similar items using distance metrics. Perfect for:
+Vector search transforms your data into high-dimensional vectors (embeddings) and finds
+similar items using distance metrics. Perfect for:
 
 - **Semantic Search**: Find documents by meaning, not just keywords
 - **Recommendation Systems**: Find similar products, users, or content
@@ -272,66 +275,91 @@ index = db.create_vector_index(
 
 ## Index Parameters
 
-### Max Connections (m)
+### Max Connections
 
-Controls connections per node in the graph. Maps to `maxConnections` in JVector.
+Controls connections per node in the graph. Maps to `maxConnections` in JVector and `M`
+in HNSW.
 
 ```python
 index = db.create_vector_index(
     vertex_type="Doc",
     vector_property="embedding",
     dimensions=384,
-    max_connections=16  # Number of connections
+    max_connections=32  # Number of connections (default: 32)
 )
 ```
 
 **Trade-offs:**
 
 | Max Connections | Recall | Memory | Build Speed | Search Speed |
 |-----------------|--------|--------|-------------|--------------|
-| 8-12            | Lower  | Low    | Fast        | Fast         |
-| 16-24           | Good   | Medium | Medium      | Medium       |
-| 32-48           | High   | High   | Slow        | Slow         |
+| 16              | Lower  | Low    | Fast        | Fast         |
+| 32 (Default)    | Good   | Medium | Medium      | Medium       |
+| 64              | High   | High   | Slow        | Slow         |
 
 **Recommendations:**
 - **Small datasets (<100K)**: max_connections=16
-- **Medium datasets (100K-1M)**: max_connections=24
-- **Large datasets (>1M)**: max_connections=32-48
+- **Medium datasets (100K-1M)**: max_connections=32 (default)
+- **Large datasets (>1M)**: max_connections=64
 
 ---
 
 ### Beam Width (ef)
 
-Controls search quality vs speed. Maps to `beamWidth` in JVector.
+Controls search quality vs speed. Maps to `beamWidth` in JVector and `ef_construction`
+in HNSW.
 
 ```python
 index = db.create_vector_index(
     vertex_type="Doc",
     vector_property="embedding",
     dimensions=384,
-    beam_width=128  # Search candidate list size
+    beam_width=256  # Search candidate list size (default: 256)
 )
 ```
 
 **Trade-offs:**
 
 | Beam Width | Recall | Search Speed |
 |------------|--------|--------------|
-| 50-100     | Lower  | Fast         |
-| 128-200    | Good   | Medium       |
-| 200-400    | High   | Slow         |
+| <256       | Lower  | Fast         |
+| 256 (Def)  | Good   | Medium       |
+| >256       | High   | Slow         |
 
 **Recommendations:**
-- **Fast search**: beam_width=50-100
-- **Balanced**: beam_width=128-200
-- **High accuracy**: beam_width=200-400
+- **Fast search**: beam_width=128
+- **Balanced**: beam_width=256 (default)
+- **High accuracy**: beam_width=512
+
+---
+
+### Overquery Factor
+
+Controls search-time accuracy by exploring more candidates than requested. This is
+similar to `efSearch` from HNSW.
+
+```python
+# Actual search will explore k * overquery_factor candidates
+results = index.find_nearest(
+    query_embedding,
+    k=10,
+    overquery_factor=16  # Default: 16
+)
+```
+
+**Trade-offs:**
+
+| Factor | Recall | Search Speed |
+|--------|--------|--------------|
+| <16    | Low    | Fast         |
+| 16     | Decent | Medium       |
+| >16    | High   | Slow         |
 
-**Recommendations:**
-- **Fast iteration**: ef_construction=100
-- **Production**: ef_construction=200
-- **Maximum quality**: ef_construction=400
 
-**Note:** Higher ef_construction improves recall but only affects index building, not search.
+**Recommendations:**
+- **Fast search**: overquery_factor=8
+- **Balanced**: overquery_factor=16 (default)
+- **High accuracy**: overquery_factor=32
 
 ## Schema Design
 
@@ -426,10 +454,33 @@ for vertex, distance in results:
 
 ### Hybrid Search (Vector + Filters)
 
-Combine vector similarity with metadata filters:
+Combine vector similarity with metadata filters.
+
+**Option 1: Pre-filtering (Recommended)**
+
+Filter candidates *before* vector search using `allowed_rids`. This is more efficient:
+only matching records are searched, so all results (up to `k`) match your criteria.
+
+```python
+# 1. Query for matching RIDs using SQL or index lookup
+rs = db.query("sql", "SELECT @rid FROM Article WHERE category = 'Programming'")
+allowed_rids = [doc.getIdentity().toString() for doc in rs]
+
+# 2. Perform vector search restricted to those RIDs
+query_embedding = model.encode("python tutorial")
+results = index.find_nearest(query_embedding, k=10, allowed_rids=allowed_rids)
+
+for vertex, distance in results:
+    print(f"{vertex.get('title')} (distance: {distance:.4f})")
+```
+
+**Option 2: Post-filtering**
+
+Filter candidates *after* vector search. This is simpler but may return fewer than `k`
+results if many top candidates are filtered out.
 
 ```python
-# Get candidates from vector search
+# Get candidates from vector search (oversample with larger k)
 query_embedding = model.encode("python tutorial")
 candidates = index.find_nearest(query_embedding, k=100)
 
diff --git a/bindings/python/examples/03_vector_search.py b/bindings/python/examples/03_vector_search.py
@@ -303,23 +303,29 @@ def create_mock_embedding(category, doc_id):
 print(f"   💡 {args.impl.upper()} Parameters:")
 print(f"      • dimensions: {EMBEDDING_DIM} (matches embedding size)")
 print("      • distance_function: cosine (best for normalized vectors)")
-print(
-    "      • max_connections: 16 (connections per node, higher = more accurate but slower)"
-)
-print("      • beam_width: 128 (search quality, higher = more accurate)")
+if args.impl == "default":
+    print(
+        "      • max_connections: 32 (connections per node, higher = more accurate but slower)"
+    )
+    print("      • beam_width: 256 (search quality, higher = more accurate)")
+else:
+    print(
+        "      • max_connections: 16 (connections per node, higher = more accurate but slower)"
+    )
+    print("      • beam_width: 128 (search quality, higher = more accurate)")
+
 if args.impl == "hnsw":
     print(f"      • max_items: {num_articles} (set to actual document count)")
 print()
 
 if args.impl == "default":
     # Create vector index (JVector implementation - recommended)
+    # Using new defaults: max_connections=32, beam_width=256
     index = db.create_vector_index(
         vertex_type="Article",
         vector_property="embedding",
         dimensions=EMBEDDING_DIM,
         distance_function="cosine",
-        max_connections=16,
-        beam_width=128,
     )
 else:  # legacy
     # Create legacy HNSW vector index
diff --git a/bindings/python/examples/06_vector_search_recommendations.py b/bindings/python/examples/06_vector_search_recommendations.py
@@ -306,18 +306,17 @@ def create_vector_index(db, impl="default", property_suffix=""):
         print(f"  edge_type={edge_type}, metric=cosine, m=16, ef=128")
         print(f"  max_items={num_movies:,} (based on movies with embeddings)")
     else:
-        print("  metric=cosine, max_connections=16, beam_width=128")
+        print("  metric=cosine, max_connections=32, beam_width=256")
 
     start_time = time.time()
 
     if impl == "default":
+        # Using new defaults: max_connections=32, beam_width=256
         index = db.create_vector_index(
             vertex_type="Movie",
             vector_property=embedding_prop,
             dimensions=384,
             distance_function="cosine",
-            max_connections=16,
-            beam_width=128,
         )
     else:  # legacy
         # Create index with correct max_items
diff --git a/bindings/python/src/arcadedb_embedded/core.py b/bindings/python/src/arcadedb_embedded/core.py
diff --git a/bindings/python/src/arcadedb_embedded/vector.py b/bindings/python/src/arcadedb_embedded/vector.py
diff --git a/bindings/python/tests/test_vector.py b/bindings/python/tests/test_vector.py