|
10 | 10 | - Creating HNSW (JVector) indexes for fast nearest-neighbor search |
11 | 11 | - Finding semantically similar documents using cosine similarity |
12 | 12 | - Understanding vector search parameters (dimensions, distance functions) |
| 13 | +- INT8-encoded dense-vector storage for smaller payloads and bucket footprint |
| 14 | +- Sparse-vector indexing for token/weight retrieval workloads |
13 | 15 | - Index population strategies and performance characteristics |
14 | 16 |
|
15 | 17 | Implementation Status: |
|
45 | 47 | import time |
46 | 48 |
|
47 | 49 | import arcadedb_embedded as arcadedb |
| 50 | +import jpype.types as jtypes |
48 | 51 | import numpy as np |
49 | 52 |
|
50 | 53 | # Parse command line arguments |
@@ -139,6 +142,11 @@ def create_mock_embedding(category_seed, doc_seed): |
139 | 142 |
|
140 | 143 | return embedding.astype(np.float32) |
141 | 144 |
|
def quantize_to_int8_bytes(vector: np.ndarray) -> list[int]:
    """Quantize a normalized float vector to signed int8 values.

    Each component is multiplied by 127, rounded to the nearest integer
    and clamped to the symmetric range [-127, 127], so components outside
    [-1.0, 1.0] saturate instead of wrapping around when cast to int8.

    Args:
        vector: Float components to quantize. Any array-like (ndarray,
            list, tuple) is accepted; it is converted with ``np.asarray``.

    Returns:
        A list of plain Python ints, one per input component.
    """
    # np.asarray leaves an existing ndarray untouched (no copy, same dtype),
    # while letting callers pass a plain Python sequence as well.
    values = np.asarray(vector)
    scaled = np.clip(np.rint(values * 127.0), -127, 127).astype(np.int8)
    return scaled.tolist()
| 149 | + |
142 | 150 | # Generate documents |
143 | 151 | documents = [] |
144 | 152 | for i in range(NUM_DOCUMENTS): |
@@ -312,6 +320,129 @@ def create_mock_embedding(category_seed, doc_seed): |
312 | 320 | print(f" ⏱️ All queries time: {time.time() - step_start:.3f}s") |
313 | 321 | print() |
314 | 322 |
|
| 323 | + # ----------------------------------------------------------------------------- |
| 324 | + # Step 7: INT8-Encoded Dense Vectors |
| 325 | + # ----------------------------------------------------------------------------- |
# Self-contained demo on its own Int8Article type: create schema, create the
# vector index, insert three sample vectors, then run one neighbor query.
| 326 | + print("Step 7: Demonstrating INT8-encoded dense-vector storage...") |
| 327 | + step_start = time.time() |
| 328 | + |
# Schema creation, index creation, inserts and the query all sit under this one
# try, so an arcadedb.ArcadeDBError from any of them skips the whole demo.
| 329 | + try: |
| 330 | + db.command("sql", "CREATE VERTEX TYPE Int8Article") |
| 331 | + db.command("sql", "CREATE PROPERTY Int8Article.id STRING") |
| 332 | + db.command("sql", "CREATE PROPERTY Int8Article.category STRING") |
# The embedding is declared BINARY because it is stored as int8 bytes.
| 333 | + db.command("sql", "CREATE PROPERTY Int8Article.embedding BINARY") |
| 334 | + |
# LSM_VECTOR index on the embedding property. The metadata asks for 4
# dimensions, COSINE similarity, no quantization and INT8 encoding.
| 335 | + db.command( |
| 336 | + "sql", |
| 337 | + """ |
| 338 | + CREATE INDEX ON Int8Article (embedding) |
| 339 | + LSM_VECTOR |
| 340 | + METADATA { |
| 341 | + "dimensions": 4, |
| 342 | + "similarity": "COSINE", |
| 343 | + "quantization": "NONE", |
| 344 | + "encoding": "INT8" |
| 345 | + } |
| 346 | + """, |
| 347 | + ) |
| 348 | + |
# Sample rows as (id, category, 4-component float vector); the floats are
# quantized only at insert time below.
| 349 | + int8_docs = [ |
| 350 | + ("int8_doc_1", "technology", [1.0, 0.0, 0.0, 0.0]), |
| 351 | + ("int8_doc_2", "technology", [0.95, 0.05, 0.0, 0.0]), |
| 352 | + ("int8_doc_3", "sports", [0.0, 1.0, 0.0, 0.0]), |
| 353 | + ] |
| 354 | + |
# All three inserts run inside a single db.transaction() block. Each vector
# goes through quantize_to_int8_bytes and then arcadedb.to_java_byte_array
# before being bound to the embedding placeholder.
| 355 | + with db.transaction(): |
| 356 | + for doc_id, category, vector in int8_docs: |
| 357 | + db.command( |
| 358 | + "sql", |
| 359 | + "INSERT INTO Int8Article SET id = ?, category = ?, embedding = ?", |
| 360 | + doc_id, |
| 361 | + category, |
| 362 | + arcadedb.to_java_byte_array( |
| 363 | + quantize_to_int8_bytes(np.array(vector)) |
| 364 | + ), |
| 365 | + ) |
| 366 | + |
# The query vector is bound as a float array (not int8 bytes) and results are
# ordered by distance. The literal 2 passed to vectorNeighbors is presumably
# the number of neighbors requested - TODO confirm against the ArcadeDB docs.
| 367 | + int8_hits = db.query( |
| 368 | + "sql", |
| 369 | + ( |
| 370 | + "SELECT id, category, distance FROM " |
| 371 | + "(SELECT expand(vectorNeighbors('Int8Article[embedding]', ?, 2))) " |
| 372 | + "ORDER BY distance" |
| 373 | + ), |
| 374 | + arcadedb.to_java_float_array([1.0, 0.0, 0.0, 0.0]), |
| 375 | + ).to_list() |
| 376 | + except arcadedb.ArcadeDBError as exc: |
| 377 | + print(" ⚠️ Skipping INT8-encoded dense-vector demo in this runtime") |
| 378 | + print(f" 💡 Reason: {exc}") |
# Success path: only reached when nothing above raised ArcadeDBError.
| 379 | + else: |
| 380 | + print(" ✅ Created INT8-encoded dense index on a BINARY property") |
| 381 | + print(" 💡 Use this when your embeddings are already stored as int8 bytes") |
| 382 | + print(" Top matches for [1, 0, 0, 0]:") |
# NOTE(review): the :.4f format raises TypeError if hit.get('distance') returns
# None, and that would not be caught by the except above - confirm the query
# always yields a numeric distance.
| 383 | + for hit in int8_hits: |
| 384 | + print( |
| 385 | + f" • {hit.get('id')} ({hit.get('category')}), " |
| 386 | + f"distance={hit.get('distance'):.4f}" |
| 387 | + ) |
| 388 | + print(f" ⏱️ Time: {time.time() - step_start:.3f}s") |
| 389 | + print() |
| 390 | + |
| 391 | + # ----------------------------------------------------------------------------- |
| 392 | + # Step 8: Sparse Vectors |
| 393 | + # ----------------------------------------------------------------------------- |
# Self-contained demo on its own SparseArticle type: each document stores a
# sparse vector as two parallel arrays (tokens and weights).
| 394 | + print("Step 8: Demonstrating sparse-vector retrieval...") |
| 395 | + step_start = time.time() |
| 396 | + |
# Schema, index, inserts and query share one try, so an arcadedb.ArcadeDBError
# from any of them skips the whole demo.
| 397 | + try: |
| 398 | + db.command("sql", "CREATE DOCUMENT TYPE SparseArticle") |
| 399 | + db.command("sql", "CREATE PROPERTY SparseArticle.id STRING") |
| 400 | + db.command("sql", "CREATE PROPERTY SparseArticle.tokens ARRAY_OF_INTEGERS") |
| 401 | + db.command("sql", "CREATE PROPERTY SparseArticle.weights ARRAY_OF_FLOATS") |
| 402 | + |
# A single LSM_SPARSE_VECTOR index spans both properties; the metadata sets
# "dimensions" to 128.
| 403 | + db.command( |
| 404 | + "sql", |
| 405 | + """ |
| 406 | + CREATE INDEX ON SparseArticle (tokens, weights) |
| 407 | + LSM_SPARSE_VECTOR |
| 408 | + METADATA { |
| 409 | + "dimensions": 128 |
| 410 | + } |
| 411 | + """, |
| 412 | + ) |
| 413 | + |
# Two documents are inserted in one transaction using literal SQL arrays.
# Both contain token 5 (weights 0.3 and 0.6), the token queried below.
| 414 | + with db.transaction(): |
| 415 | + db.command( |
| 416 | + "sql", |
| 417 | + "INSERT INTO SparseArticle SET id = 'sparse_doc_1', tokens = [1, 5, 10], weights = [0.5, 0.3, 0.2]", |
| 418 | + ) |
| 419 | + db.command( |
| 420 | + "sql", |
| 421 | + "INSERT INTO SparseArticle SET id = 'sparse_doc_2', tokens = [2, 5, 11], weights = [0.4, 0.6, 0.1]", |
| 422 | + ) |
| 423 | + |
# The two placeholders are bound to a Java int array [5] (built with jpype's
# JArray(JInt)) and a float array [1.0]; results are ordered by score,
# highest first. The literal 5 in the SQL is presumably the maximum number of
# hits - TODO confirm against the ArcadeDB docs.
| 424 | + sparse_hits = db.query( |
| 425 | + "sql", |
| 426 | + ( |
| 427 | + "SELECT id, score FROM " |
| 428 | + "(SELECT expand(`vector.sparseNeighbors`('SparseArticle[tokens,weights]', ?, ?, 5))) " |
| 429 | + "ORDER BY score DESC" |
| 430 | + ), |
| 431 | + jtypes.JArray(jtypes.JInt)([5]), |
| 432 | + arcadedb.to_java_float_array([1.0]), |
| 433 | + ).to_list() |
| 434 | + except arcadedb.ArcadeDBError as exc: |
| 435 | + print(" ⚠️ Skipping sparse-vector demo in this runtime") |
| 436 | + print(f" 💡 Reason: {exc}") |
# Success path: only reached when nothing above raised ArcadeDBError.
| 437 | + else: |
| 438 | + print(" ✅ Created sparse-vector index on token/weight arrays") |
| 439 | + print(" 💡 Use this for BM25-style or learned sparse retrieval") |
| 440 | + print(" Top matches for sparse query {(5): 1.0}:") |
# NOTE(review): the :.4f format raises TypeError if hit.get('score') returns
# None - confirm the query always yields a numeric score.
| 441 | + for hit in sparse_hits: |
| 442 | + print(f" • {hit.get('id')}, score={hit.get('score'):.4f}") |
| 443 | + print(f" ⏱️ Time: {time.time() - step_start:.3f}s") |
| 444 | + print() |
| 445 | + |
315 | 446 | # ----------------------------------------------------------------------------- |
316 | 447 | # Cleanup |
317 | 448 | # ----------------------------------------------------------------------------- |
|
0 commit comments