Skip to content

Commit 1addedd

Browse files
committed
feat: update vector index parameters and add RID filtering support in search
1 parent e121168 commit 1addedd

9 files changed

Lines changed: 219 additions & 75 deletions

File tree

bindings/python/docs/api/vector.md

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,8 @@ db.create_vector_index(
112112
vector_property: str,
113113
dimensions: int,
114114
distance_function: str = "cosine",
115-
max_connections: int = 16,
116-
beam_width: int = 200
115+
max_connections: int = 32,
116+
beam_width: int = 256
117117
) -> VectorIndex
118118
```
119119

@@ -126,11 +126,11 @@ db.create_vector_index(
126126
- `"cosine"`: Cosine distance (1 - cosine similarity)
127127
- `"euclidean"`: Euclidean distance (L2 norm)
128128
- `"inner_product"`: Negative inner product
129-
- `max_connections` (int): Max connections per node (default: 16)
129+
- `max_connections` (int): Max connections per node (default: 32)
130130
- Maps to `maxConnections` in JVector
131131
- Higher = better recall, more memory
132-
- Typical range: 12-48
133-
- `beam_width` (int): Beam width for search/construction (default: 200)
132+
- Typical range: 16-64
133+
- `beam_width` (int): Beam width for search/construction (default: 256)
134134
- Maps to `beamWidth` in JVector
135135
- Higher = better recall, slower search
136136
- Typical range: 100-400
@@ -162,16 +162,16 @@ index = db.create_vector_index(
162162
vector_property="embedding",
163163
dimensions=384, # Match your embedding model
164164
distance_function="cosine",
165-
m=16,
166-
ef=128
165+
max_connections=32,
166+
beam_width=256
167167
)
168168

169169
print(f"Created vector index: {index}")
170170
```
171171

172172
---
173173

174-
### `VectorIndex.find_nearest(query_vector, k=10, use_numpy=True)`
174+
### `VectorIndex.find_nearest(query_vector, k=10, overquery_factor=16, use_numpy=True, allowed_rids=None)`
175175

176176
Find k-nearest neighbors to the query vector.
177177

@@ -182,7 +182,11 @@ Find k-nearest neighbors to the query vector.
182182
- NumPy array: `np.array([0.1, 0.2, ...])`
183183
- Any array-like iterable
184184
- `k` (int): Number of neighbors to return (default: 10)
185+
- `overquery_factor` (int): Multiplier for search-time over-querying (implicit efSearch)
186+
(default: 16)
185187
- `use_numpy` (bool): Return vectors as NumPy if available (default: `True`)
188+
- `allowed_rids` (List[str]): Optional list of RID strings (e.g. `["#1:0", "#2:5"]`) to
189+
restrict search (default: `None`)
186190

187191
**Returns:**
188192

@@ -202,6 +206,10 @@ query_vector = generate_embedding(query_text) # Your embedding function
202206
# Search for 5 most similar documents
203207
neighbors = index.find_nearest(query_vector, k=5)
204208

209+
# Search with RID filtering
210+
allowed_rids = ["#10:5", "#10:8", "#10:12"]
211+
filtered_neighbors = index.find_nearest(query_vector, k=5, allowed_rids=allowed_rids)
212+
205213
for vertex, distance in neighbors:
206214
doc_id = vertex.get("id")
207215
text = vertex.get("text")

bindings/python/docs/examples/03_vector_search.md

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,9 @@ index = db.create_vector_index(
7272
vertex_type="Article",
7373
vector_property="embedding",
7474
dimensions=384,
75-
id_property="id",
7675
distance_function="cosine", # or "euclidean", "inner_product"
77-
m=16, # connections per node
78-
ef=128, # search quality
79-
ef_construction=128, # build quality
80-
max_items=10000 # capacity
76+
max_connections=32, # connections per node (default: 32)
77+
beam_width=256 # search quality (default: 256)
8178
)
8279
```
8380

@@ -88,14 +85,11 @@ index = db.create_vector_index(
8885
- `cosine`: Best for normalized vectors (text embeddings)
8986
- `euclidean`: Straight-line distance (image features)
9087
- `inner_product`: Dot product (when magnitude matters)
91-
- **M**: Connections per node (16 typical, 12-48 range)
88+
- **max_connections**: Connections per node (32 default, 16-64 range)
9289
- Higher = better accuracy, more memory
93-
- 16 is good balance for most use cases
94-
- **ef**: Search beam width (100-200 typical)
90+
- 32 is a good balance for most use cases
91+
- **beam_width**: Search beam width (256 default, 100-400 range)
9592
- Higher = better recall, slower search
96-
- **ef_construction**: Build quality (100-200 typical)
97-
- Higher = better index, slower build
98-
- **max_items**: Pre-allocated capacity
9993

10094
### Distance vs Similarity
10195

@@ -120,15 +114,15 @@ index = db.create_vector_index(
120114

121115
When you create and populate a vector index, ArcadeDB stores:
122116

123-
**Files created** (for 10K documents, 384D, M=16):
117+
**Files created** (for 10K documents, 384D, M=32):
124118
```
125119
Article_414002873519545.5.v0.hnswidx 4 KB (metadata only)
126120
Article_0.1.65536.v0.bucket 24 MB (vertices + embeddings)
127121
Article_0_in_edges.3.65536.v0.bucket 22 MB (incoming edges)
128122
Article_0_out_edges.2.65536.v0.bucket 22 MB (outgoing edges)
129-
VectorProximity0_0.7.65536.v0.bucket 47 MB (HNSW proximity edges)
123+
VectorProximity0_0.7.65536.v0.bucket 90 MB (HNSW proximity edges)
130124
─────────────────────────────────────────────────
131-
Total: 115 MB
125+
Total: 160 MB
132126
```
133127

134128
**Key insight**: The `.hnswidx` file is tiny (4KB) - it only stores metadata. The actual HNSW graph is stored as edges in the database!
@@ -319,18 +313,18 @@ RAM ≈ 4 bytes × dimensions × num_vectors × (1 + M/2)
319313
```
320314

321315
**Examples:**
322-
- 10K vectors, 384D, M=16: ~37 MB
323-
- 100K vectors, 384D, M=16: ~370 MB
324-
- 1M vectors, 384D, M=16: ~3.7 GB
325-
- 1M vectors, 1536D, M=16: ~14.7 GB
316+
- 10K vectors, 384D, M=32: ~50 MB
317+
- 100K vectors, 384D, M=32: ~500 MB
318+
- 1M vectors, 384D, M=32: ~5 GB
319+
- 1M vectors, 1536D, M=32: ~16 GB
326320

327321
**Note:** This is working set, not total database size. ArcadeDB uses page caching, so hot data stays in RAM while cold data is read from disk on-demand.
328322

329323
### 4. Choosing Parameters
330324

331325
**Start with defaults:**
332326
```python
333-
M=16, ef=128, ef_construction=128
327+
max_connections=32, beam_width=256
334328
```
335329

336330
**Then tune based on needs:**

bindings/python/docs/examples/06_vector_search_recommendations.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -287,13 +287,9 @@ index = db.create_vector_index(
287287
vertex_type="Movie",
288288
vector_property="embedding_v1", # or "embedding_v2"
289289
dimensions=384,
290-
max_items=10000, # Adjusted to actual movie count
291-
id_property="vector_id_v1", # or "vector_id_v2"
292-
edge_type="Movie_v1", # Unique edge type per index
293290
distance_function="cosine",
294-
m=16, # Number of connections per layer
295-
ef=128, # Size of dynamic candidate list
296-
ef_construction=128 # Size during index construction
291+
max_connections=32, # Number of connections per layer (default: 32)
292+
beam_width=256 # Size of dynamic candidate list (default: 256)
297293
)
298294
```
299295

@@ -365,7 +361,7 @@ index = db.create_vector_index(
365361

366362
**Vector search:**
367363
- Cache embeddings (stored in database properties)
368-
- Use appropriate HNSW parameters (m=16, ef=128)
364+
- Use appropriate JVector parameters (max_connections=32, beam_width=256)
369365
- Choose faster encoding model (paraphrase-MiniLM-L6-v2)
370366

371367
**Memory management:**

bindings/python/docs/guide/vectors.md

Lines changed: 76 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
# Vector Search Guide
22

3-
Vector search enables semantic similarity search using embeddings from machine learning models. This guide covers strategies, best practices, and patterns for implementing vector search with ArcadeDB.
3+
Vector search enables semantic similarity search using embeddings from machine learning
4+
models. This guide covers strategies, best practices, and patterns for implementing
5+
vector search with ArcadeDB.
46

57
## Overview
68

7-
Vector search transforms your data into high-dimensional vectors (embeddings) and finds similar items using distance metrics. Perfect for:
9+
Vector search transforms your data into high-dimensional vectors (embeddings) and finds
10+
similar items using distance metrics. Perfect for:
811

912
- **Semantic Search**: Find documents by meaning, not just keywords
1013
- **Recommendation Systems**: Find similar products, users, or content
@@ -272,66 +275,91 @@ index = db.create_vector_index(
272275

273276
## Index Parameters
274277

275-
### Max Connections (m)
278+
### Max Connections
276279

277-
Controls connections per node in the graph. Maps to `maxConnections` in JVector.
280+
Controls connections per node in the graph. Maps to `maxConnections` in JVector and `M`
281+
in HNSW.
278282

279283
```python
280284
index = db.create_vector_index(
281285
vertex_type="Doc",
282286
vector_property="embedding",
283287
dimensions=384,
284-
max_connections=16 # Number of connections
288+
max_connections=32 # Number of connections (default: 32)
285289
)
286290
```
287291

288292
**Trade-offs:**
289293

290294
| Max Connections | Recall | Memory | Build Speed | Search Speed |
291295
|-----------------|--------|--------|-------------|--------------|
292-
| 8-12 | Lower | Low | Fast | Fast |
293-
| 16-24 | Good | Medium | Medium | Medium |
294-
| 32-48 | High | High | Slow | Slow |
296+
| 16 | Lower | Low | Fast | Fast |
297+
| 32 (Default) | Good | Medium | Medium | Medium |
298+
| 64 | High | High | Slow | Slow |
295299

296300
**Recommendations:**
297301
- **Small datasets (<100K)**: max_connections=16
298-
- **Medium datasets (100K-1M)**: max_connections=24
299-
- **Large datasets (>1M)**: max_connections=32-48
302+
- **Medium datasets (100K-1M)**: max_connections=32 (default)
303+
- **Large datasets (>1M)**: max_connections=64
300304

301305
---
302306

303307
### Beam Width (ef)
304308

305-
Controls search quality vs speed. Maps to `beamWidth` in JVector.
309+
Controls search quality vs speed. Maps to `beamWidth` in JVector and `ef_construction`
310+
in HNSW.
306311

307312
```python
308313
index = db.create_vector_index(
309314
vertex_type="Doc",
310315
vector_property="embedding",
311316
dimensions=384,
312-
beam_width=128 # Search candidate list size
317+
beam_width=256 # Search candidate list size (default: 256)
313318
)
314319
```
315320

316321
**Trade-offs:**
317322

318323
| Beam Width | Recall | Search Speed |
319324
|------------|--------|--------------|
320-
| 50-100 | Lower | Fast |
321-
| 128-200 | Good | Medium |
322-
| 200-400 | High | Slow |
325+
| <256 | Lower | Fast |
326+
| 256 (Def) | Good | Medium |
327+
| >256 | High | Slow |
323328

324329
**Recommendations:**
325-
- **Fast search**: beam_width=50-100
326-
- **Balanced**: beam_width=128-200
327-
- **High accuracy**: beam_width=200-400
330+
- **Fast search**: beam_width=128
331+
- **Balanced**: beam_width=256 (default)
332+
- **High accuracy**: beam_width=512
333+
334+
---
335+
336+
### Overquery Factor
337+
338+
Controls search-time accuracy by exploring more candidates than requested. This is
339+
similar to `efSearch` from HNSW.
340+
341+
```python
342+
# Actual search will explore k * overquery_factor candidates
343+
results = index.find_nearest(
344+
query_embedding,
345+
k=10,
346+
overquery_factor=16 # Default: 16
347+
)
348+
```
349+
350+
**Trade-offs:**
351+
352+
| Factor | Recall | Search Speed |
353+
|--------|--------|--------------|
354+
| <16 | Low | Fast |
355+
| 16 | Decent | Medium |
356+
| >16 | High | Slow |
328357

329-
**Recommendations:**
330-
- **Fast iteration**: ef_construction=100
331-
- **Production**: ef_construction=200
332-
- **Maximum quality**: ef_construction=400
333358

334-
**Note:** Higher ef_construction improves recall but only affects index building, not search.
359+
**Recommendations:**
360+
- **Fast search**: overquery_factor=8
361+
- **Balanced**: overquery_factor=16 (default)
362+
- **High accuracy**: overquery_factor=32
335363

336364
## Schema Design
337365

@@ -426,10 +454,33 @@ for vertex, distance in results:
426454

427455
### Hybrid Search (Vector + Filters)
428456

429-
Combine vector similarity with metadata filters:
457+
Combine vector similarity with metadata filters.
458+
459+
**Option 1: Pre-filtering (Recommended)**
460+
461+
Filter candidates *before* vector search using `allowed_rids`. This is usually preferable
462+
because every returned result already matches your criteria (up to `k` of them).
463+
464+
```python
465+
# 1. Query for matching RIDs using SQL or index lookup
466+
rs = db.query("sql", "SELECT @rid FROM Article WHERE category = 'Programming'")
467+
allowed_rids = [doc.getIdentity().toString() for doc in rs]
468+
469+
# 2. Perform vector search restricted to those RIDs
470+
query_embedding = model.encode("python tutorial")
471+
results = index.find_nearest(query_embedding, k=10, allowed_rids=allowed_rids)
472+
473+
for vertex, distance in results:
474+
print(f"{vertex.get('title')} (distance: {distance:.4f})")
475+
```
476+
477+
**Option 2: Post-filtering**
478+
479+
Filter candidates *after* vector search. This is simpler but may return fewer than `k`
480+
results if many top candidates are filtered out.
430481

431482
```python
432-
# Get candidates from vector search
483+
# Get candidates from vector search (oversample with larger k)
433484
query_embedding = model.encode("python tutorial")
434485
candidates = index.find_nearest(query_embedding, k=100)
435486

bindings/python/examples/03_vector_search.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -303,23 +303,29 @@ def create_mock_embedding(category, doc_id):
303303
print(f" 💡 {args.impl.upper()} Parameters:")
304304
print(f" • dimensions: {EMBEDDING_DIM} (matches embedding size)")
305305
print(" • distance_function: cosine (best for normalized vectors)")
306-
print(
307-
" • max_connections: 16 (connections per node, higher = more accurate but slower)"
308-
)
309-
print(" • beam_width: 128 (search quality, higher = more accurate)")
306+
if args.impl == "default":
307+
print(
308+
" • max_connections: 32 (connections per node, higher = more accurate but slower)"
309+
)
310+
print(" • beam_width: 256 (search quality, higher = more accurate)")
311+
else:
312+
print(
313+
" • max_connections: 16 (connections per node, higher = more accurate but slower)"
314+
)
315+
print(" • beam_width: 128 (search quality, higher = more accurate)")
316+
310317
if args.impl == "hnsw":
311318
print(f" • max_items: {num_articles} (set to actual document count)")
312319
print()
313320

314321
if args.impl == "default":
315322
# Create vector index (JVector implementation - recommended)
323+
# Using new defaults: max_connections=32, beam_width=256
316324
index = db.create_vector_index(
317325
vertex_type="Article",
318326
vector_property="embedding",
319327
dimensions=EMBEDDING_DIM,
320328
distance_function="cosine",
321-
max_connections=16,
322-
beam_width=128,
323329
)
324330
else: # legacy
325331
# Create legacy HNSW vector index

bindings/python/examples/06_vector_search_recommendations.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,18 +306,17 @@ def create_vector_index(db, impl="default", property_suffix=""):
306306
print(f" edge_type={edge_type}, metric=cosine, m=16, ef=128")
307307
print(f" max_items={num_movies:,} (based on movies with embeddings)")
308308
else:
309-
print(" metric=cosine, max_connections=16, beam_width=128")
309+
print(" metric=cosine, max_connections=32, beam_width=256")
310310

311311
start_time = time.time()
312312

313313
if impl == "default":
314+
# Using new defaults: max_connections=32, beam_width=256
314315
index = db.create_vector_index(
315316
vertex_type="Movie",
316317
vector_property=embedding_prop,
317318
dimensions=384,
318319
distance_function="cosine",
319-
max_connections=16,
320-
beam_width=128,
321320
)
322321
else: # legacy
323322
# Create index with correct max_items

0 commit comments

Comments
 (0)