humemai
diff --git a/‎.github/workflows/test-python-examples.yml‎
Lines changed: 23 additions & 8 deletions b/‎.github/workflows/test-python-examples.yml‎
Lines changed: 23 additions & 8 deletions
diff --git a/‎bindings/python/README.md‎
Lines changed: 2 additions & 2 deletions b/‎bindings/python/README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎bindings/python/docs/api/database.md‎
Lines changed: 5 additions & 4 deletions b/‎bindings/python/docs/api/database.md‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎bindings/python/docs/api/vector.md‎
Lines changed: 33 additions & 95 deletions b/‎bindings/python/docs/api/vector.md‎
Lines changed: 33 additions & 95 deletions
diff --git a/‎bindings/python/docs/development/architecture.md‎
Lines changed: 1 addition & 1 deletion b/‎bindings/python/docs/development/architecture.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bindings/python/docs/development/testing/overview.md‎
Lines changed: 1 addition & 1 deletion b/‎bindings/python/docs/development/testing/overview.md‎
Lines changed: 1 addition & 1 deletion
@@ -179,6 +179,16 @@ jobs:
             pip install numpy requests sentence-transformers
           fi
 
+      - name: Download datasets
+        shell: bash
+        run: |
+          pip install tqdm py7zr lxml
+          cd bindings/python/examples
+          echo "📥 Downloading MovieLens Small dataset..."
+          python3 download_data.py movielens-small
+          echo "📥 Downloading Stack Overflow Small dataset..."
+          python3 download_data.py stackoverflow-small
+
       - name: Install timeout command (macOS only)
         if: matrix.platform == 'darwin/amd64' || matrix.platform == 'darwin/arm64'
         shell: bash
@@ -215,8 +225,8 @@ jobs:
           results_file="example-results.txt"
           > $results_file
 
-          # Find all Python example files (exclude download_data.py as it's a utility)
-          examples=$(ls [0-9]*.py 2>/dev/null | sort)
+          # Find all Python example files (exclude download_data.py and 08_server_mode_rest_api.py)
+          examples=$(ls [0-9]*.py 2>/dev/null | grep -v "08_server_mode_rest_api.py" | sort)
 
           if [ -z "$examples" ]; then
             echo "❌ No example files found!"
@@ -248,20 +258,25 @@ jobs:
             # Set example-specific parameters and timeout
             case "$example" in
               "04_csv_import_documents.py")
-                example_args="--dataset movielens-small --parallel 4 --batch-size 5000 --export"
+                example_args="--dataset movielens-small --export"
                 example_name="$example (movielens-small dataset with export)"
                 timeout_duration=900  # 15 minutes
                 ;;
               "05_csv_import_graph.py")
-                example_args="--dataset movielens-small --parallel 1 --no-async --export --import-jsonl ./exports/movielens_small_db.jsonl.tgz"
-                example_name="$example (movielens-small dataset, sync mode, import from JSONL)"
+                example_args="--dataset movielens-small --method java --import-jsonl ./exports/movielens_small_db.jsonl.tgz --export"
+                example_name="$example (movielens-small dataset, embedded java method, import/export)"
                 timeout_duration=900  # 15 minutes
                 ;;
               "06_vector_search_recommendations.py")
-                example_args="--source-db my_test_databases/movielens_graph_small_db --db-path my_test_databases/movielens_graph_small_db_vectors --import-jsonl exports/movielens_graph_small_db.jsonl.tgz"
+                example_args="--import-jsonl ./exports/movielens_graph_small_db.jsonl.tgz"
                 example_name="$example (vector search, import from JSONL)"
                 timeout_duration=900  # 15 minutes
                 ;;
+              "07_stackoverflow_multimodel.py")
+                example_args="--dataset stackoverflow-small"
+                example_name="$example (stackoverflow-small dataset)"
+                timeout_duration=1800  # 30 minutes
+                ;;
               *)
                 example_args=""
                 example_name="$example"
@@ -368,12 +383,12 @@ jobs:
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "- **01_simple_document_store.py** - Document CRUD operations with comprehensive data types" >> $GITHUB_STEP_SUMMARY
           echo "- **02_social_network_graph.py** - Graph modeling with vertices, edges, and traversal" >> $GITHUB_STEP_SUMMARY
-          echo "- **03_vector_search.py** - Vector embeddings and semantic similarity search (experimental)" >> $GITHUB_STEP_SUMMARY
+          echo "- **03_vector_search.py** - Vector embeddings and semantic similarity search" >> $GITHUB_STEP_SUMMARY
           echo "- **04_csv_import_documents.py** - CSV import with automatic dataset download and type inference" >> $GITHUB_STEP_SUMMARY
           echo "  - Tested with \`--dataset movielens-small --export\`" >> $GITHUB_STEP_SUMMARY
           echo "- **05_csv_import_graph.py** - Graph creation from document store with benchmarking" >> $GITHUB_STEP_SUMMARY
           echo "  - Tested with \`--dataset movielens-small --method java --import-jsonl ./exports/movielens_small_db.jsonl.tgz --export\`" >> $GITHUB_STEP_SUMMARY
-          echo "- **06_vector_search_recommendations.py** - HNSW vector indexing for movie recommendations" >> $GITHUB_STEP_SUMMARY
+          echo "- **06_vector_search_recommendations.py** - JVector vector indexing for movie recommendations" >> $GITHUB_STEP_SUMMARY
           echo "  - Tested with \`--import-jsonl exports/movielens_graph_small_db.jsonl.tgz\`" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
 
 
@@ -89,7 +89,7 @@ with arcadedb.create_database("/tmp/mydb") as db:
 - 🔍 **Multiple query languages**: SQL, Cypher, Gremlin, MongoDB
 - ⚡ **High performance**: Direct JVM integration via JPype
 - 🔒 **ACID transactions**: Full transaction support
-- 🎯 **Vector storage**: Store and query vector embeddings with HNSW indexing
+- 🎯 **Vector storage**: Store and query vector embeddings with JVector indexing
 - 📥 **Data import**: Built-in CSV, JSON, Neo4j importers
 
 ---
@@ -221,7 +221,7 @@ arcadedb_embedded/
 ├── server.py            # ArcadeDBServer for HTTP mode
 ├── results.py           # ResultSet and Result wrappers
 ├── transactions.py      # TransactionContext manager
-├── vector.py            # Vector search and HNSW indexing
+├── vector.py            # Vector search and JVector indexing
 ├── importer.py          # Data import (CSV, JSON, Neo4j)
 ├── exceptions.py        # ArcadeDBError exception
 └── jvm.py              # JVM lifecycle management
 
@@ -386,13 +386,15 @@ db.create_vector_index(
     vector_property: str,
     dimensions: int,
     distance_function: str = "cosine",
-    max_connections: int = 16,
-    beam_width: int = 200
+    max_connections: int = 32,
+    beam_width: int = 256
 ) -> VectorIndex
 ```
 
 Create a vector index for similarity search (default JVector implementation).
 
+**Note:** The index is built lazily. Construction happens upon the first query, not at creation time.
+
 **Parameters:**
 
 - `vertex_type` (str): Vertex type containing vectors
@@ -427,7 +429,6 @@ with db.transaction():
         vertex.set("id", f"doc_{i}")
         vertex.set("embedding", arcadedb.to_java_float_array(embedding))
         vertex.save()
-        index.add_vertex(vertex)
 
 # Search
 query_vector = np.random.rand(384)
@@ -698,6 +699,6 @@ else:
 ## See Also
 
 - [Graph Operations](../guide/graphs.md): Working with vertices and edges
-- [Vector Search](../guide/vectors.md): Similarity search with HNSW indexes
+- [Vector Search](../guide/vectors.md): Similarity search with JVector indexes
 - [Server Mode](../guide/server.md): HTTP API and Studio UI
 - [Quick Start](../getting-started/quickstart.md): Getting started guide
@@ -1,6 +1,8 @@
 # Vector API
 
-Vector search capabilities in ArcadeDB use HNSW (Hierarchical Navigable Small World) indexing for fast approximate nearest neighbor search. Perfect for semantic search, recommendation systems, and similarity-based queries.
+Vector search capabilities in ArcadeDB use JVector (a graph-based index combining HNSW
+and DiskANN concepts) for fast approximate nearest neighbor search. Perfect for semantic
+search, recommendation systems, and similarity-based queries.
 
 ## Overview
 
@@ -13,7 +15,7 @@ ArcadeDB's vector support enables:
 
 **Key Features:**
 
-- HNSW indexing for O(log N) search performance
+- Graph-based indexing for O(log N) search performance
 - Multiple distance metrics (cosine, euclidean, inner product)
 - Native NumPy integration (optional)
 - Configurable precision/performance trade-offs
@@ -24,7 +26,8 @@ Utility functions for converting between Python and Java vector representations:
 
 ### `to_java_float_array(vector)`
 
-Convert a Python array-like object to a Java float array compatible with ArcadeDB's vector indexing.
+Convert a Python array-like object to a Java float array compatible with ArcadeDB's
+vector indexing.
 
 **Parameters:**
 
@@ -98,7 +101,7 @@ print(type(py_list))  # <class 'list'>
 
 ## VectorIndex Class
 
-Wrapper for ArcadeDB's HNSW vector index, providing similarity search capabilities.
+Wrapper for ArcadeDB's vector index, providing similarity search capabilities.
 
 ### Creation via Database
 
@@ -171,10 +174,13 @@ print(f"Created vector index: {index}")
 
 ---
 
-### `VectorIndex.find_nearest(query_vector, k=10, overquery_factor=16, use_numpy=True, allowed_rids=None)`
+### `VectorIndex.find_nearest(query_vector, k=10, overquery_factor=16, allowed_rids=None)`
 
 Find k-nearest neighbors to the query vector.
 
+**Note:** The first call to `find_nearest` triggers the index construction if it hasn't
+been built yet. This "warm up" query may take longer than subsequent queries.
+
 **Parameters:**
 
 - `query_vector`: Query vector as:
@@ -184,14 +190,13 @@ Find k-nearest neighbors to the query vector.
 - `k` (int): Number of neighbors to return (default: 10)
 - `overquery_factor` (int): Multiplier for search-time over-querying (implicit efSearch)
   (default: 16)
-- `use_numpy` (bool): Return vectors as NumPy if available (default: `True`)
 - `allowed_rids` (List[str]): Optional list of RID strings (e.g. `["#1:0", "#2:5"]`) to
   restrict search (default: `None`)
 
 **Returns:**
 
-- `List[Tuple[vertex, float]]`: List of `(vertex, distance)` tuples
-  - `vertex`: Matched vertex object (MutableVertex)
+- `List[Tuple[record, float]]`: List of `(record, distance)` tuples
+  - `record`: Matched ArcadeDB record object (Vertex, Document, or Edge)
   - `distance`: Similarity score (float)
     - Lower = more similar for cosine and euclidean; for inner_product, higher = more similar
     - Range depends on distance function
@@ -210,9 +215,9 @@ neighbors = index.find_nearest(query_vector, k=5)
 allowed_rids = ["#10:5", "#10:8", "#10:12"]
 filtered_neighbors = index.find_nearest(query_vector, k=5, allowed_rids=allowed_rids)
 
-for vertex, distance in neighbors:
-    doc_id = vertex.get("id")
-    text = vertex.get("text")
+for record, distance in neighbors:
+    doc_id = record.get("id")
+    text = record.get("text")
     print(f"Distance: {distance:.4f} | ID: {doc_id}")
     print(f"  Text: {text[:100]}...")
 ```
@@ -225,76 +230,12 @@ for vertex, distance in neighbors:
 | euclidean | [0, ∞) | ✓ (0 = identical) |
 | inner_product | (-∞, ∞) | ✗ (higher = more similar) |
 
----
-
-### `VectorIndex.add_vertex(vertex)`
-
-Add a single vertex to the index.
-
-**Parameters:**
-
-- `vertex`: Vertex object with vector property set
-
-**Raises:**
-
-- `ArcadeDBError`: If vertex cannot be added
-
-**Example:**
-
-```python
-# Add during vertex creation
-with db.transaction():
-    doc = db.new_vertex("Document")
-    doc.set("id", "doc_001")
-    doc.set("text", "Introduction to vector search")
-    doc.set("embedding", to_java_float_array(embedding))
-    doc.save()
-
-    # Add to index
-    index.add_vertex(doc)
-```
-
-**Important:**
-
 **Requirements for records to be indexed:**
 
 - Each vertex must have the vector property populated
 - Vector dimensionality must match index dimensions
 - Create and save records within a transaction for consistency
 
 ---
 
-### `VectorIndex.remove_vertex(vertex_id)`
-
-Remove a vertex from the index.
-
-**Parameters:**
-
-- `vertex_id`: ID of the vertex to remove (typically string or int)
-
-**Raises:**
-
-- `ArcadeDBError`: If removal fails
-
-**Example:**
-
-```python
-# Remove by ID
-vertex_id = "doc_001"
-index.remove_vertex(vertex_id)
-```
-
-**Note:** This removes from the vector index only, not from the database. To fully delete:
-
-```python
-with db.transaction():
-    # Remove from index
-    index.remove_vertex(doc_id)
-
-    # Delete from database
-    db.command("sql", f"DELETE FROM Document WHERE id = '{doc_id}'")
-```
-
----
-
 ## Complete Examples
 
 ### Semantic Search with Sentence Transformers
@@ -355,9 +296,6 @@ with db.transaction():
         vertex.set("embedding", to_java_float_array(embedding))
         vertex.save()
 
-        # Add to vector index
-        index.add_vertex(vertex)
-
 print(f"Indexed {len(documents)} documents")
 
 # Search
@@ -424,7 +362,7 @@ with db.transaction():
         v.set("price", prod["price"])
         v.set("features", to_java_float_array(prod["features"]))
         v.save()
-        index.add_vertex(v)
+        # Note: LSM vector index automatically indexes new records
 
 # Hybrid search: vector similarity + filters
 query_features = np.random.rand(128)
@@ -502,7 +440,7 @@ with db.transaction():
         v.set("embedding", to_java_float_array(embedding))
         v.save()
 
-        index.add_vertex(v)
+        # Note: LSM vector index automatically indexes new records
 
 # Search for similar images
 query_image = "query.jpg"
@@ -521,25 +459,25 @@ db.close()
 
 ## Performance Tuning
 
-### HNSW Parameters
+### Vector Index Parameters
 
-**M (connections per node):**
+**max_connections (connections per node):**
 
-- **Lower (8-12)**: Faster build, less memory, lower recall
-- **Medium (16-24)**: Balanced (recommended)
-- **Higher (32-48)**: Better recall, more memory, slower build
+- **Lower (<32)**: Faster build, less memory, lower recall
+- **Medium (32)**: Balanced (recommended)
+- **Higher (>32)**: Better recall, more memory, slower build
 
-**ef (search size):**
+**overquery_factor (search size):**
 
-- **Lower (50-100)**: Faster search, lower recall
-- **Medium (128-200)**: Balanced (recommended)
-- **Higher (200-400)**: Better recall, slower search
+- **Lower (<16)**: Faster search, lower recall
+- **Medium (16)**: Balanced (recommended)
+- **Higher (>16)**: Better recall, slower search
 
-**ef_construction:**
+**beam_width:**
 
-- **Lower (100-150)**: Faster build, lower quality
-- **Medium (128-256)**: Balanced
-- **Higher (300-500)**: Better quality, slower build
+- **Lower (<256)**: Faster build, lower quality
+- **Medium (256)**: Balanced
+- **Higher (>256)**: Better quality, slower build
 
 ### Distance Functions
 
@@ -592,7 +530,7 @@ try:
     v = db.new_vertex("Doc")
     v.set("emb", to_java_float_array(np.random.rand(512)))  # Wrong size!
     v.save()
-    index.add_vertex(v)  # Will fail
+    # Indexing happens automatically and may fail asynchronously or on next access
 
 except ArcadeDBError as e:
     print(f"Error: {e}")
@@ -604,7 +542,7 @@ try:
     v.set("id", "doc1")
     # Forgot to set embedding!
     v.save()
-    index.add_vertex(v)  # Will fail
+    # Indexing happens automatically
 
 except ArcadeDBError as e:
     print(f"Error: {e}")
 
@@ -51,7 +51,7 @@ arcadedb_embedded/
 - Type inference
 
 **`vector.py`**
-- `VectorIndex`: HNSW vector indexing
+- `VectorIndex`: JVector-based vector indexing
 - NumPy ↔ Java array conversion
 - Nearest neighbor search
 - Distance metrics
 
@@ -19,7 +19,7 @@ The test suite covers:
 - ✅ **Concurrency patterns** - File locking, thread safety, multi-process
 - ✅ **Graph operations** - Vertices, edges, traversals
 - ✅ **Query languages** - SQL, Cypher, Gremlin
-- ✅ **Vector search** - HNSW indexes, similarity search
+- ✅ **Vector search** - JVector-based vector indexes, similarity search
 - ✅ **Data import** - CSV, JSON, Neo4j exports
 - ✅ **Unicode support** - International characters, emoji
 - ✅ **Schema introspection** - Querying database metadata