Skip to content

Commit 2922748

Browse files
committed
Refactor build script and enhance vector handling in Python bindings
- Updated the build script to clarify build methods and improve architecture normalization.
- Added support for encoding in the Database class to handle INT8 storage.
- Introduced a new utility function to convert Python byte-like objects to Java byte arrays.
- Enhanced vector-related functions and tests to support new encoding and quantization features.
- Added tests for full-text search with wildcards and vector neighbors with group options.
- Improved error handling and documentation across various test cases.
1 parent 89e0d16 commit 2922748

29 files changed

Lines changed: 911 additions & 108 deletions

bindings/python/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
Native Python bindings for ArcadeDB - the multi-model database that supports Graph, Document, Key/Value, Search Engine, Time Series, and Vector models.
44

5-
**Status**: ✅ Production Ready | **Tests**: 290 Passed | **Platforms**: 4 Supported
5+
**Status**: ✅ Production Ready | **Tests**: 331 Passed | **Platforms**: 4 Supported
66

77
---
88

@@ -92,7 +92,7 @@ Import: `import arcadedb_embedded as arcadedb`
9292

9393
## 🧪 Testing
9494

95-
**Status**: 290 passed
95+
**Status**: 331 passed
9696

9797
```bash
9898
# Run all tests

bindings/python/examples/03_vector_search.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
- Creating HNSW (JVector) indexes for fast nearest-neighbor search
1111
- Finding semantically similar documents using cosine similarity
1212
- Understanding vector search parameters (dimensions, distance functions)
13+
- INT8-encoded dense-vector storage for smaller payloads and bucket footprint
14+
- Sparse-vector indexing for token/weight retrieval workloads
1315
- Index population strategies and performance characteristics
1416
1517
Implementation Status:
@@ -45,6 +47,7 @@
4547
import time
4648

4749
import arcadedb_embedded as arcadedb
50+
import jpype.types as jtypes
4851
import numpy as np
4952

5053
# Parse command line arguments
@@ -139,6 +142,11 @@ def create_mock_embedding(category_seed, doc_seed):
139142

140143
return embedding.astype(np.float32)
141144

145+
def quantize_to_int8_bytes(vector: np.ndarray):
    """Quantize a normalized float vector to signed int8 values.

    Each component is multiplied by 127, rounded to the nearest integer and
    clamped to the symmetric range [-127, 127], so components outside
    [-1.0, 1.0] saturate instead of wrapping around.

    Args:
        vector: Float components to quantize. Any array-like is accepted
            (NumPy array, list, tuple).

    Returns:
        A list of Python ints in [-127, 127], one per input component.

    Raises:
        ValueError: If any component is NaN or infinite.
    """
    # Coerce without forcing a dtype so ndarray inputs are quantized exactly
    # as before, while plain Python sequences are accepted too.
    array = np.asarray(vector)
    # NaN survives rint/clip, and casting NaN to int8 gives an unspecified
    # value; fail loudly instead of storing a garbage byte.
    if not np.all(np.isfinite(array)):
        raise ValueError("Cannot INT8-encode vectors with NaN or infinite values")
    scaled = np.clip(np.rint(array * 127.0), -127, 127).astype(np.int8)
    return scaled.tolist()
149+
142150
# Generate documents
143151
documents = []
144152
for i in range(NUM_DOCUMENTS):
@@ -312,6 +320,129 @@ def create_mock_embedding(category_seed, doc_seed):
312320
print(f" ⏱️ All queries time: {time.time() - step_start:.3f}s")
313321
print()
314322

323+
# -----------------------------------------------------------------------------
324+
# Step 7: INT8-Encoded Dense Vectors
325+
# -----------------------------------------------------------------------------
326+
print("Step 7: Demonstrating INT8-encoded dense-vector storage...")
327+
step_start = time.time()
328+
329+
try:
330+
db.command("sql", "CREATE VERTEX TYPE Int8Article")
331+
db.command("sql", "CREATE PROPERTY Int8Article.id STRING")
332+
db.command("sql", "CREATE PROPERTY Int8Article.category STRING")
333+
db.command("sql", "CREATE PROPERTY Int8Article.embedding BINARY")
334+
335+
db.command(
336+
"sql",
337+
"""
338+
CREATE INDEX ON Int8Article (embedding)
339+
LSM_VECTOR
340+
METADATA {
341+
"dimensions": 4,
342+
"similarity": "COSINE",
343+
"quantization": "NONE",
344+
"encoding": "INT8"
345+
}
346+
""",
347+
)
348+
349+
int8_docs = [
350+
("int8_doc_1", "technology", [1.0, 0.0, 0.0, 0.0]),
351+
("int8_doc_2", "technology", [0.95, 0.05, 0.0, 0.0]),
352+
("int8_doc_3", "sports", [0.0, 1.0, 0.0, 0.0]),
353+
]
354+
355+
with db.transaction():
356+
for doc_id, category, vector in int8_docs:
357+
db.command(
358+
"sql",
359+
"INSERT INTO Int8Article SET id = ?, category = ?, embedding = ?",
360+
doc_id,
361+
category,
362+
arcadedb.to_java_byte_array(
363+
quantize_to_int8_bytes(np.array(vector))
364+
),
365+
)
366+
367+
int8_hits = db.query(
368+
"sql",
369+
(
370+
"SELECT id, category, distance FROM "
371+
"(SELECT expand(vectorNeighbors('Int8Article[embedding]', ?, 2))) "
372+
"ORDER BY distance"
373+
),
374+
arcadedb.to_java_float_array([1.0, 0.0, 0.0, 0.0]),
375+
).to_list()
376+
except arcadedb.ArcadeDBError as exc:
377+
print(" ⚠️ Skipping INT8-encoded dense-vector demo in this runtime")
378+
print(f" 💡 Reason: {exc}")
379+
else:
380+
print(" ✅ Created INT8-encoded dense index on a BINARY property")
381+
print(" 💡 Use this when your embeddings are already stored as int8 bytes")
382+
print(" Top matches for [1, 0, 0, 0]:")
383+
for hit in int8_hits:
384+
print(
385+
f" • {hit.get('id')} ({hit.get('category')}), "
386+
f"distance={hit.get('distance'):.4f}"
387+
)
388+
print(f" ⏱️ Time: {time.time() - step_start:.3f}s")
389+
print()
390+
391+
# -----------------------------------------------------------------------------
392+
# Step 8: Sparse Vectors
393+
# -----------------------------------------------------------------------------
394+
print("Step 8: Demonstrating sparse-vector retrieval...")
395+
step_start = time.time()
396+
397+
try:
398+
db.command("sql", "CREATE DOCUMENT TYPE SparseArticle")
399+
db.command("sql", "CREATE PROPERTY SparseArticle.id STRING")
400+
db.command("sql", "CREATE PROPERTY SparseArticle.tokens ARRAY_OF_INTEGERS")
401+
db.command("sql", "CREATE PROPERTY SparseArticle.weights ARRAY_OF_FLOATS")
402+
403+
db.command(
404+
"sql",
405+
"""
406+
CREATE INDEX ON SparseArticle (tokens, weights)
407+
LSM_SPARSE_VECTOR
408+
METADATA {
409+
"dimensions": 128
410+
}
411+
""",
412+
)
413+
414+
with db.transaction():
415+
db.command(
416+
"sql",
417+
"INSERT INTO SparseArticle SET id = 'sparse_doc_1', tokens = [1, 5, 10], weights = [0.5, 0.3, 0.2]",
418+
)
419+
db.command(
420+
"sql",
421+
"INSERT INTO SparseArticle SET id = 'sparse_doc_2', tokens = [2, 5, 11], weights = [0.4, 0.6, 0.1]",
422+
)
423+
424+
sparse_hits = db.query(
425+
"sql",
426+
(
427+
"SELECT id, score FROM "
428+
"(SELECT expand(`vector.sparseNeighbors`('SparseArticle[tokens,weights]', ?, ?, 5))) "
429+
"ORDER BY score DESC"
430+
),
431+
jtypes.JArray(jtypes.JInt)([5]),
432+
arcadedb.to_java_float_array([1.0]),
433+
).to_list()
434+
except arcadedb.ArcadeDBError as exc:
435+
print(" ⚠️ Skipping sparse-vector demo in this runtime")
436+
print(f" 💡 Reason: {exc}")
437+
else:
438+
print(" ✅ Created sparse-vector index on token/weight arrays")
439+
print(" 💡 Use this for BM25-style or learned sparse retrieval")
440+
print(" Top matches for sparse query {(5): 1.0}:")
441+
for hit in sparse_hits:
442+
print(f" • {hit.get('id')}, score={hit.get('score'):.4f}")
443+
print(f" ⏱️ Time: {time.time() - step_start:.3f}s")
444+
print()
445+
315446
# -----------------------------------------------------------------------------
316447
# Cleanup
317448
# -----------------------------------------------------------------------------

bindings/python/examples/11_vector_index_build.py

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def fetch_json(url: str) -> dict:
9898
if not url.startswith("https://"):
9999
raise ValueError(f"Refusing to open non-HTTPS URL: {url!r}")
100100
req = Request(url, headers={"User-Agent": "arcadedb-bench"})
101-
with urlopen(req, timeout=30) as response: # nosec B310 - https-only
101+
with urlopen(req, timeout=30) as response: # nosec B310
102102
payload = json.load(response)
103103
if not isinstance(payload, dict):
104104
raise RuntimeError(f"Expected JSON object from {url}")
@@ -495,24 +495,46 @@ def stream_shards(
495495
remaining -= take_total
496496

497497

498+
def quantize_to_int8_bytes(vector: np.ndarray) -> list[int]:
    """Convert a float vector into signed INT8 component values.

    The input is coerced to float32, each component is multiplied by 127,
    rounded to the nearest integer and clamped to [-127, 127].

    Args:
        vector: Float components to encode.

    Returns:
        One Python int in [-127, 127] per input component.

    Raises:
        ValueError: If any component is NaN or infinite.
    """
    values = np.asarray(vector, dtype=np.float32)
    # Reject non-finite components up front: they have no INT8 encoding.
    if not np.isfinite(values).all():
        raise ValueError("Cannot INT8-encode vectors with NaN or infinite values")
    rounded = np.rint(values * 127.0)
    return np.clip(rounded, -127, 127).astype(np.int8).tolist()
504+
505+
498506
def ingest_vectors_arcadedb(
499507
db,
500508
sources: List[dict],
501509
dim: int,
502510
count: int,
503511
batch_size: int,
504512
to_java_float_array,
513+
to_java_byte_array,
514+
encoding: str,
505515
) -> int:
516+
use_int8_encoding = encoding.upper() == "INT8"
517+
506518
for command in (
507519
"CREATE VERTEX TYPE VectorData",
508520
"CREATE PROPERTY VectorData.id INTEGER",
509-
"CREATE PROPERTY VectorData.vector ARRAY_OF_FLOATS",
521+
(
522+
"CREATE PROPERTY VectorData.vector BINARY"
523+
if use_int8_encoding
524+
else "CREATE PROPERTY VectorData.vector ARRAY_OF_FLOATS"
525+
),
510526
):
511527
try:
512528
db.command("sql", command)
513529
except Exception:
514530
pass
515531

532+
if use_int8_encoding and to_java_byte_array is None:
533+
raise RuntimeError(
534+
"arcadedb_embedded does not expose to_java_byte_array; update the "
535+
"local wheel before using --encoding INT8"
536+
)
537+
516538
ingested = 0
517539
for base_id, batch in stream_shards(
518540
sources,
@@ -522,7 +544,14 @@ def ingest_vectors_arcadedb(
522544
):
523545
with db.transaction():
524546
for idx, vec in enumerate(batch, start=base_id):
525-
jvec = to_java_float_array(vec) if to_java_float_array else vec.tolist()
547+
if use_int8_encoding:
548+
jvec = to_java_byte_array(quantize_to_int8_bytes(vec))
549+
else:
550+
jvec = (
551+
to_java_float_array(vec)
552+
if to_java_float_array
553+
else vec.tolist()
554+
)
526555
db.command(
527556
"sql",
528557
"INSERT INTO VectorData SET id = ?, vector = ?",
@@ -540,10 +569,19 @@ def create_index_arcadedb(
540569
max_connections: int,
541570
beam_width: int,
542571
quantization: str,
572+
encoding: str,
543573
store_vectors_in_graph: bool,
544574
add_hierarchy: bool,
545575
):
546576
quant = None if quantization.upper() == "NONE" else quantization.upper()
577+
enc = None if encoding.upper() == "NONE" else encoding.upper()
578+
579+
if enc == "INT8" and quant == "INT8":
580+
raise ValueError(
581+
"--encoding INT8 cannot be combined with --quantization INT8; "
582+
"use --quantization NONE for native INT8 storage"
583+
)
584+
547585
metadata_lines = [
548586
f'"dimensions": {int(dim)}',
549587
'"similarity": "COSINE"',
@@ -554,6 +592,8 @@ def create_index_arcadedb(
554592
]
555593
if quant is not None:
556594
metadata_lines.insert(4, f'"quantization": "{quant}"')
595+
if enc is not None:
596+
metadata_lines.insert(5, f'"encoding": "{enc}"')
557597

558598
metadata_body = ",\n ".join(metadata_lines)
559599

@@ -964,9 +1004,7 @@ def wait_for_qdrant_ready(host: str, port: int, timeout_sec: int = 120) -> None:
9641004
while True:
9651005
for url in urls:
9661006
try:
967-
with urlopen(
968-
url, timeout=3
969-
) as response: # nosec B310 - localhost health-check URL
1007+
with urlopen(url, timeout=3) as response: # nosec B310
9701008
if 200 <= int(response.status) < 500:
9711009
return
9721010
except Exception:
@@ -1016,9 +1054,7 @@ def ensure_milvus_compose_file(compose_file: Path, release_tag: str) -> None:
10161054
"https://github.com/milvus-io/milvus/releases/download/"
10171055
f"{release_tag}/milvus-standalone-docker-compose.yml"
10181056
)
1019-
urlretrieve(
1020-
url, str(compose_file)
1021-
) # nosec B310 - url is a hardcoded https://github.com URL
1057+
urlretrieve(url, str(compose_file)) # nosec B310
10221058
raw = compose_file.read_text(encoding="utf-8")
10231059

10241060
sanitized = re.sub(r"(?m)^\s*container_name:\s*.*\n", "", raw)
@@ -1863,6 +1899,12 @@ def main() -> None:
18631899
default="NONE",
18641900
help="ArcadeDB quantization mode (ignored by pgvector)",
18651901
)
1902+
parser.add_argument(
1903+
"--encoding",
1904+
choices=["NONE", "INT8"],
1905+
default="NONE",
1906+
help="ArcadeDB storage encoding for the vector property (default: NONE)",
1907+
)
18661908
parser.add_argument(
18671909
"--store-vectors-in-graph",
18681910
action="store_true",
@@ -1958,6 +2000,12 @@ def main() -> None:
19582000
args.milvus_compose_version = resolve_milvus_compose_version(
19592001
args.milvus_compose_version
19602002
)
2003+
if args.encoding != "NONE" and args.backend != "arcadedb_sql":
2004+
parser.error("--encoding is supported only for backend=arcadedb_sql")
2005+
if args.encoding == "INT8" and args.quantization != "NONE":
2006+
parser.error(
2007+
"--encoding INT8 requires --quantization NONE to avoid double quantization"
2008+
)
19612009
if args.backend == "pgvector" and args.docker_image == "python:3.12-slim":
19622010
args.docker_image = resolve_latest_pgvector_image()
19632011
configure_reproducibility(args.seed)
@@ -2001,6 +2049,7 @@ def main() -> None:
20012049
f"maxconn={args.max_connections}",
20022050
f"beam={args.beam_width}",
20032051
f"quant={args.quantization.lower()}",
2052+
f"enc={args.encoding.lower()}",
20042053
f"store={'on' if args.store_vectors_in_graph else 'off'}",
20052054
f"hier={'on' if args.add_hierarchy else 'off'}",
20062055
f"batch={args.batch_size}",
@@ -2028,6 +2077,8 @@ def main() -> None:
20282077
print(f"Dimensions: {dim}")
20292078
print(f"Max connections: {args.max_connections}")
20302079
print(f"Beam width: {args.beam_width}")
2080+
print(f"Quantization: {args.quantization}")
2081+
print(f"Encoding: {args.encoding}")
20312082
if args.backend == "pgvector":
20322083
print(f"Postgres shared_buffers: {pg_shared_buffers}")
20332084
print(f"DB path: {db_path}")
@@ -2067,6 +2118,7 @@ def record(name: str, result, dur: float, rss_start: float, rss_end: float):
20672118

20682119
try:
20692120
to_java_float_array = getattr(arcadedb, "to_java_float_array", None)
2121+
to_java_byte_array = getattr(arcadedb, "to_java_byte_array", None)
20702122
ingest_started_at = datetime.now(timezone.utc).isoformat()
20712123
print(f"Ingest start (arcadedb, UTC): {ingest_started_at}")
20722124
ingested, dur, r0, r1 = timed_section(
@@ -2078,6 +2130,8 @@ def record(name: str, result, dur: float, rss_start: float, rss_end: float):
20782130
count=count,
20792131
batch_size=args.batch_size,
20802132
to_java_float_array=to_java_float_array,
2133+
to_java_byte_array=to_java_byte_array,
2134+
encoding=args.encoding,
20812135
),
20822136
)
20832137
ingest_ended_at = datetime.now(timezone.utc).isoformat()
@@ -2095,6 +2149,7 @@ def record(name: str, result, dur: float, rss_start: float, rss_end: float):
20952149
max_connections=args.max_connections,
20962150
beam_width=args.beam_width,
20972151
quantization=args.quantization,
2152+
encoding=args.encoding,
20982153
store_vectors_in_graph=args.store_vectors_in_graph,
20992154
add_hierarchy=args.add_hierarchy,
21002155
),
@@ -2557,6 +2612,7 @@ def server_pid_provider() -> int | None:
25572612
"max_connections": args.max_connections,
25582613
"beam_width": args.beam_width,
25592614
"quantization": args.quantization,
2615+
"encoding": args.encoding,
25602616
"store_vectors_in_graph": args.store_vectors_in_graph,
25612617
"add_hierarchy": args.add_hierarchy,
25622618
"batch_size": args.batch_size,

0 commit comments

Comments
 (0)