Skip to content

Commit c2a96bf

Browse files
committed
feat: add support for INT8 encoding and sparse vector indexing in vector API
1 parent b51dd75 commit c2a96bf

11 files changed

Lines changed: 620 additions & 2 deletions

File tree

bindings/python/docs/api/database.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,10 +648,12 @@ db.create_vector_index(
648648
vertex_type: str,
649649
vector_property: str,
650650
dimensions: int,
651+
id_property: str | None = None,
651652
distance_function: str = "cosine",
652653
max_connections: int = 16,
653654
beam_width: int = 100,
654655
quantization: str = "INT8",
656+
encoding: str | None = None,
655657
location_cache_size: int | None = None,
656658
graph_build_cache_size: int | None = None,
657659
mutations_before_rebuild: int | None = None,
@@ -679,6 +681,7 @@ specifically need that surface.
679681
- `vertex_type` (str): Vertex type containing vectors
680682
- `vector_property` (str): Property storing vector arrays
681683
- `dimensions` (int): Vector dimensionality
684+
- `id_property` (str | None): Optional property used for key-based vector lookup.
682685
- `distance_function` (str): `"cosine"`, `"euclidean"`, or `"inner_product"`
683686
- `max_connections` (int): Max connections per node (default: 16). Maps to
684687
`maxConnections` in HNSW (JVector).
@@ -690,6 +693,10 @@ specifically need that surface.
690693
production workloads. In current ArcadeDB engine builds, `"PRODUCT"` also requires
691694
enough indexed vectors per bucket for PQ training. For tiny corpora, set `pq_clusters`
692695
explicitly to a small value or prefer another quantization mode.
696+
- `encoding` (str | None): Optional storage encoding for the vector property.
697+
Use `"INT8"` when the underlying property is `BINARY` and stores pre-quantized
698+
bytes. Do not combine `encoding="INT8"` with `quantization="INT8"`; use
699+
`quantization="NONE"` for native INT8 storage. Because `quantization` defaults to `"INT8"`, you must set `"NONE"` explicitly.
693700
- `location_cache_size` (int | None): Override location cache size (default: `None`, uses engine default).
694701
- `graph_build_cache_size` (int | None): Override graph build cache size (default: `None`, uses engine default).
695702
- `mutations_before_rebuild` (int | None): Override rebuild threshold (default: `None`, uses engine default).

bindings/python/docs/api/vector.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ db.create_vector_index(
155155
max_connections: int = 16,
156156
beam_width: int = 100,
157157
quantization: str = "INT8",
158+
encoding: str | None = None,
158159
location_cache_size: int | None = None,
159160
graph_build_cache_size: int | None = None,
160161
mutations_before_rebuild: int | None = None,
@@ -194,6 +195,10 @@ db.create_vector_index(
194195
to a small value or prefer `"INT8"`, `"BINARY"`, or `None`.
195196
- Prefer `"INT8"` for current production usage in these bindings.
196197
- `"PRODUCT"`/PQ is available but currently not recommended for production workloads.
198+
- `encoding` (str | None): Optional storage encoding for the vector property.
199+
- Use `"INT8"` with a `BINARY` property when your vectors are already stored as
200+
signed bytes.
201+
- Pair `encoding="INT8"` with an explicit `quantization="NONE"` (the default is `"INT8"`) to avoid double quantization.
197202
- `build_graph_now` (bool): If `True` (default), eagerly prepares the vector graph
198203
during index creation. Set to `False` to defer graph preparation until first query.
199204

@@ -239,6 +244,21 @@ Treat this as a helper/manual API. For normal application queries, prefer SQL
239244
`vectorNeighbors` so search composes naturally with filtering, projection, and record
240245
exclusion.
241246

247+
---
248+
249+
### `to_java_byte_array(vector)`
250+
251+
Convert a Python bytes-like object or an array-like of integers to a Java `byte[]`.
252+
253+
Use this when inserting native INT8 vectors into a `BINARY` property for indexes
254+
created with `encoding="INT8"`.
255+
256+
```python
257+
from arcadedb_embedded import to_java_byte_array
258+
259+
payload = to_java_byte_array([127, 0, -12, 5])
260+
```
261+
242262
**Note:** With default settings (`build_graph_now=True` in `create_vector_index`), graph
243263
preparation runs during index creation. In the preferred SQL path, this eager behavior is
244264
also the default. If you explicitly disable eager preparation, the first call to

bindings/python/docs/guide/vectors.md

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@ Preferred split:
6262
- Keep the secondary Python helper APIs in mind only for manual or maintenance cases;
6363
they are not the recommended application-facing workflow.
6464

65-
- Vector property type must be `ARRAY_OF_FLOATS`.
65+
- Vector property type is usually `ARRAY_OF_FLOATS`.
66+
- Use `BINARY` only when you are storing pre-quantized INT8 bytes with
67+
`encoding="INT8"`.
6668
- `CREATE INDEX ON Doc (embedding) LSM_VECTOR METADATA {...}` is the preferred creation
6769
path.
6870
- SQL builds the vector graph immediately by default.
@@ -241,6 +243,96 @@ rows = db.query(
241243
- Quantization via SQL: `METADATA {"quantization": "INT8"}` is the recommended path for
242244
embedded usage.
243245

246+
## Native INT8 Storage
247+
248+
If your application already has INT8 vectors, store them in a `BINARY` property and
249+
set `encoding="INT8"` on the vector index metadata.
250+
251+
```python
252+
import arcadedb_embedded as arcadedb
253+
254+
with arcadedb.create_database("./vector_demo_int8") as db:
255+
db.command("sql", "CREATE VERTEX TYPE ByteDoc")
256+
db.command("sql", "CREATE PROPERTY ByteDoc.id STRING")
257+
db.command("sql", "CREATE PROPERTY ByteDoc.embedding BINARY")
258+
259+
db.command(
260+
"sql",
261+
"""
262+
CREATE INDEX ON ByteDoc (embedding)
263+
LSM_VECTOR
264+
METADATA {
265+
"dimensions": 4,
266+
"similarity": "COSINE",
267+
"quantization": "NONE",
268+
"encoding": "INT8"
269+
}
270+
""",
271+
)
272+
273+
with db.transaction():
274+
db.command(
275+
"sql",
276+
"INSERT INTO ByteDoc SET id = ?, embedding = ?",
277+
"doc_a",
278+
arcadedb.to_java_byte_array([127, 0, 0, 0]),
279+
)
280+
```
281+
282+
Use `encoding="INT8"` only with `quantization="NONE"`. Combining INT8 storage encoding
283+
with INT8 quantization would quantize the same vector twice.
284+
285+
## Sparse Vectors
286+
287+
ArcadeDB also supports sparse top-K retrieval through `LSM_SPARSE_VECTOR` and
288+
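`vector.sparseNeighbors(...)`. The index spans two parallel array properties: token IDs (`ARRAY_OF_INTEGERS`) and their weights (`ARRAY_OF_FLOATS`); a query passes the same pair of arrays.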
289+
290+
```python
291+
import arcadedb_embedded as arcadedb
292+
import jpype.types as jtypes
293+
294+
with arcadedb.create_database("./sparse_demo") as db:
295+
db.command("sql", "CREATE DOCUMENT TYPE SparseDoc")
296+
db.command("sql", "CREATE PROPERTY SparseDoc.tokens ARRAY_OF_INTEGERS")
297+
db.command("sql", "CREATE PROPERTY SparseDoc.weights ARRAY_OF_FLOATS")
298+
299+
db.command(
300+
"sql",
301+
"""
302+
CREATE INDEX ON SparseDoc (tokens, weights)
303+
LSM_SPARSE_VECTOR
304+
METADATA {"dimensions": 128}
305+
""",
306+
)
307+
308+
rows = db.query(
309+
"sql",
310+
"SELECT expand(`vector.sparseNeighbors`('SparseDoc[tokens,weights]', ?, ?, 5))",
311+
jtypes.JArray(jtypes.JInt)([5]),
312+
arcadedb.to_java_float_array([1.0]),
313+
).to_list()
314+
```
315+
316+
## Grouped Search
317+
318+
Recent engine builds support `groupBy` / `groupSize` options on `vector.neighbors`.
319+
This is useful when you want diversity across a field such as source file, tenant, or
320+
document family.
321+
322+
```python
323+
rows = db.query(
324+
"sql",
325+
(
326+
"SELECT source_file, distance FROM "
327+
"(SELECT expand(`vector.neighbors`(?, ?, ?, { groupBy: 'source_file', groupSize: 1 }))) "
328+
"ORDER BY distance"
329+
),
330+
"GroupedDoc[embedding]",
331+
arcadedb.to_java_float_array([1.0, 0.0, 0.0, 0.0]),
332+
3,
333+
).to_list()
334+
```
335+
244336
## Examples & References
245337

246338
- **[Example 03: Vector Search – Semantic Similarity](../examples/03_vector_search.md)**

bindings/python/examples/03_vector_search.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
- Creating HNSW (JVector) indexes for fast nearest-neighbor search
1111
- Finding semantically similar documents using cosine similarity
1212
- Understanding vector search parameters (dimensions, distance functions)
13+
- INT8-encoded dense-vector storage for smaller payloads and bucket footprint
14+
- Sparse-vector indexing for token/weight retrieval workloads
1315
- Index population strategies and performance characteristics
1416
1517
Implementation Status:
@@ -45,6 +47,7 @@
4547
import time
4648

4749
import arcadedb_embedded as arcadedb
50+
import jpype.types as jtypes
4851
import numpy as np
4952

5053
# Parse command line arguments
@@ -139,6 +142,11 @@ def create_mock_embedding(category_seed, doc_seed):
139142

140143
return embedding.astype(np.float32)
141144

145+
def quantize_to_int8_bytes(vector: "np.typing.ArrayLike") -> list[int]:
    """Quantize a normalized float vector to signed int8 values.

    Each component is scaled by 127, rounded to the nearest integer (ties go
    to the even neighbour, as ``np.rint`` does) and clipped to
    ``[-127, 127]``.

    Args:
        vector: 1-D array-like of floats (``np.ndarray``, list or tuple),
            expected to lie in ``[-1.0, 1.0]``. Components outside that
            range saturate at ``-127`` / ``127``.

    Returns:
        A list of Python ints in ``[-127, 127]``, one per input component,
        ready to be handed to ``arcadedb.to_java_byte_array``.

    Raises:
        ValueError: If ``vector`` contains NaN, which has no int8
            representation (the cast result would be undefined).
    """
    # asarray accepts plain lists/tuples and passes ndarrays through without
    # copying, so existing callers that pre-wrap in np.array are unaffected.
    values = np.asarray(vector)
    if np.isnan(values).any():
        raise ValueError("cannot quantize a vector containing NaN to int8")
    scaled = np.clip(np.rint(values * 127.0), -127, 127).astype(np.int8)
    return scaled.tolist()
149+
142150
# Generate documents
143151
documents = []
144152
for i in range(NUM_DOCUMENTS):
@@ -312,6 +320,129 @@ def create_mock_embedding(category_seed, doc_seed):
312320
print(f" ⏱️ All queries time: {time.time() - step_start:.3f}s")
313321
print()
314322

323+
# -----------------------------------------------------------------------------
324+
# Step 7: INT8-Encoded Dense Vectors
325+
# -----------------------------------------------------------------------------
326+
print("Step 7: Demonstrating INT8-encoded dense-vector storage...")
327+
step_start = time.time()
328+
329+
try:
330+
db.command("sql", "CREATE VERTEX TYPE Int8Article")
331+
db.command("sql", "CREATE PROPERTY Int8Article.id STRING")
332+
db.command("sql", "CREATE PROPERTY Int8Article.category STRING")
333+
db.command("sql", "CREATE PROPERTY Int8Article.embedding BINARY")
334+
335+
db.command(
336+
"sql",
337+
"""
338+
CREATE INDEX ON Int8Article (embedding)
339+
LSM_VECTOR
340+
METADATA {
341+
"dimensions": 4,
342+
"similarity": "COSINE",
343+
"quantization": "NONE",
344+
"encoding": "INT8"
345+
}
346+
""",
347+
)
348+
349+
int8_docs = [
350+
("int8_doc_1", "technology", [1.0, 0.0, 0.0, 0.0]),
351+
("int8_doc_2", "technology", [0.95, 0.05, 0.0, 0.0]),
352+
("int8_doc_3", "sports", [0.0, 1.0, 0.0, 0.0]),
353+
]
354+
355+
with db.transaction():
356+
for doc_id, category, vector in int8_docs:
357+
db.command(
358+
"sql",
359+
"INSERT INTO Int8Article SET id = ?, category = ?, embedding = ?",
360+
doc_id,
361+
category,
362+
arcadedb.to_java_byte_array(
363+
quantize_to_int8_bytes(np.array(vector))
364+
),
365+
)
366+
367+
int8_hits = db.query(
368+
"sql",
369+
(
370+
"SELECT id, category, distance FROM "
371+
"(SELECT expand(vectorNeighbors('Int8Article[embedding]', ?, 2))) "
372+
"ORDER BY distance"
373+
),
374+
arcadedb.to_java_float_array([1.0, 0.0, 0.0, 0.0]),
375+
).to_list()
376+
except arcadedb.ArcadeDBError as exc:
377+
print(" ⚠️ Skipping INT8-encoded dense-vector demo in this runtime")
378+
print(f" 💡 Reason: {exc}")
379+
else:
380+
print(" ✅ Created INT8-encoded dense index on a BINARY property")
381+
print(" 💡 Use this when your embeddings are already stored as int8 bytes")
382+
print(" Top matches for [1, 0, 0, 0]:")
383+
for hit in int8_hits:
384+
print(
385+
f" • {hit.get('id')} ({hit.get('category')}), "
386+
f"distance={hit.get('distance'):.4f}"
387+
)
388+
print(f" ⏱️ Time: {time.time() - step_start:.3f}s")
389+
print()
390+
391+
# -----------------------------------------------------------------------------
392+
# Step 8: Sparse Vectors
393+
# -----------------------------------------------------------------------------
394+
print("Step 8: Demonstrating sparse-vector retrieval...")
395+
step_start = time.time()
396+
397+
try:
398+
db.command("sql", "CREATE DOCUMENT TYPE SparseArticle")
399+
db.command("sql", "CREATE PROPERTY SparseArticle.id STRING")
400+
db.command("sql", "CREATE PROPERTY SparseArticle.tokens ARRAY_OF_INTEGERS")
401+
db.command("sql", "CREATE PROPERTY SparseArticle.weights ARRAY_OF_FLOATS")
402+
403+
db.command(
404+
"sql",
405+
"""
406+
CREATE INDEX ON SparseArticle (tokens, weights)
407+
LSM_SPARSE_VECTOR
408+
METADATA {
409+
"dimensions": 128
410+
}
411+
""",
412+
)
413+
414+
with db.transaction():
415+
db.command(
416+
"sql",
417+
"INSERT INTO SparseArticle SET id = 'sparse_doc_1', tokens = [1, 5, 10], weights = [0.5, 0.3, 0.2]",
418+
)
419+
db.command(
420+
"sql",
421+
"INSERT INTO SparseArticle SET id = 'sparse_doc_2', tokens = [2, 5, 11], weights = [0.4, 0.6, 0.1]",
422+
)
423+
424+
sparse_hits = db.query(
425+
"sql",
426+
(
427+
"SELECT id, score FROM "
428+
"(SELECT expand(`vector.sparseNeighbors`('SparseArticle[tokens,weights]', ?, ?, 5))) "
429+
"ORDER BY score DESC"
430+
),
431+
jtypes.JArray(jtypes.JInt)([5]),
432+
arcadedb.to_java_float_array([1.0]),
433+
).to_list()
434+
except arcadedb.ArcadeDBError as exc:
435+
print(" ⚠️ Skipping sparse-vector demo in this runtime")
436+
print(f" 💡 Reason: {exc}")
437+
else:
438+
print(" ✅ Created sparse-vector index on token/weight arrays")
439+
print(" 💡 Use this for BM25-style or learned sparse retrieval")
440+
print(" Top matches for sparse query {(5): 1.0}:")
441+
for hit in sparse_hits:
442+
print(f" • {hit.get('id')}, score={hit.get('score'):.4f}")
443+
print(f" ⏱️ Time: {time.time() - step_start:.3f}s")
444+
print()
445+
315446
# -----------------------------------------------------------------------------
316447
# Cleanup
317448
# -----------------------------------------------------------------------------

bindings/python/src/arcadedb_embedded/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,12 @@
5858
from .type_conversion import convert_java_to_python, convert_python_to_java
5959

6060
# Import vector utilities and index
61-
from .vector import VectorIndex, to_java_float_array, to_python_array
61+
from .vector import (
62+
VectorIndex,
63+
to_java_byte_array,
64+
to_java_float_array,
65+
to_python_array,
66+
)
6267

6368
__all__ = [
6469
"__version__",
@@ -91,6 +96,7 @@
9196
"convert_python_to_java",
9297
# Vector search
9398
"VectorIndex",
99+
"to_java_byte_array",
94100
"to_java_float_array",
95101
"to_python_array",
96102
# Data export

0 commit comments

Comments
 (0)