Enhance index handling in Python bindings: introduce HASH index type and update examples and documentation

tae898 · tae898 · commit 671c8f55049c · 2026-03-18T18:22:05.000+01:00
diff --git a/bindings/python/docs/api/schema.md b/bindings/python/docs/api/schema.md
@@ -233,7 +233,8 @@ Create an index on a type.
 - `type_name` (str): Name of the type
 - `property_names` (List[str]): List of property names to index
 - `unique` (bool): Whether the index should enforce uniqueness (default: `False`)
-- `index_type` (str or IndexType): Type of index (`"LSM_TREE"`, `"FULL_TEXT"`)
+- `index_type` (str or IndexType): Type of index (`"LSM_TREE"`, `"HASH"`, `"FULL_TEXT"`,
+  `"LSM_VECTOR"`, `"GEOSPATIAL"`)
 
 **Returns:**
 
@@ -250,13 +251,35 @@ Create an index on a type.
 # Unique index on username
 schema.create_index("User", ["username"], unique=True)
 
+# Exact-match lookup index
+schema.create_index("Order", ["customerId"], index_type="HASH")
+
 # Composite index
 schema.create_index("Event", ["userId", "timestamp"])
 
 # Full-text index
 schema.create_index("Article", ["content"], index_type="FULL_TEXT")
 ```
 
+**Index choice rules of thumb:**
+
+- Use `HASH` for exact-match lookups when you do not need ranges or ordered scans.
+- Use `LSM_TREE` when you need ranges, sorting, or a safe general-purpose default.
+- Use `FULL_TEXT`, `LSM_VECTOR`, and `GEOSPATIAL` only for their specialized query
+  types.
+- `HASH` can be unique or non-unique. The index structure and uniqueness constraint are
+  separate choices.
+
+**SQL DSL equivalents:**
+
+- `CREATE INDEX ON User (email) UNIQUE` -> unique `LSM_TREE`
+- `CREATE INDEX ON User (email) NOTUNIQUE` -> non-unique `LSM_TREE`
+- `CREATE INDEX ON User (email) UNIQUE_HASH` -> unique `HASH`
+- `CREATE INDEX ON Order (customerId) NOTUNIQUE_HASH` -> non-unique `HASH`
+- `CREATE INDEX ON Article (content) FULL_TEXT` -> `FULL_TEXT`
+- `CREATE INDEX ON Doc (embedding) LSM_VECTOR ...` -> `LSM_VECTOR`
+- `CREATE INDEX ON Place (location) GEOSPATIAL` -> `GEOSPATIAL`
+
 **Vector (JVector) Parameters:**
 
 - **max_connections**: Max connections per node (default: 16; typical 8-32). Maps to JVector `maxConnections`.
diff --git a/bindings/python/docs/examples/02_social_network_graph.md b/bindings/python/docs/examples/02_social_network_graph.md
@@ -278,10 +278,10 @@ Create indexes on frequently queried properties:
 
 ```python
 # Index on person names for fast lookups
-db.command("sql", "CREATE INDEX ON Person (name) NOTUNIQUE")
+db.command("sql", "CREATE INDEX ON Person (name) NOTUNIQUE_HASH")
 
 # For unique identifiers
-db.command("sql", "CREATE INDEX ON Person (person_id) UNIQUE")
+db.command("sql", "CREATE INDEX ON Person (person_id) UNIQUE_HASH")
 ```
 
 ### Batch Operations
diff --git a/bindings/python/docs/examples/04_csv_import_documents.md b/bindings/python/docs/examples/04_csv_import_documents.md
@@ -533,7 +533,7 @@ The database is preserved for inspection after the example completes.
 
 ⚠️ **Note**: Database files are larger than source CSVs due to:
 
-- Index structures (LSM-Tree buffers and sorted data)
+- Index structures (LSM-tree, hash, or other index data depending on what you create)
 - Transaction logs and metadata
 - Internal data structures for document storage
 - WAL (Write-Ahead Log) files for durability
@@ -545,7 +545,9 @@ The database is preserved for inspection after the example completes.
 3. ✅ **NULL value handling** works seamlessly across all data types (STRING, INTEGER, etc.)
 4. ✅ **Batch processing** (`commit_every`) dramatically improves import performance
 5. ✅ **Create indexes AFTER import** - 2-3x faster than indexing during import
-6. ✅ **LSM_TREE indexes** provide massive performance gains (up to 14,836x speedup!)
+6. ✅ **Indexes** provide massive performance gains (up to 14,836x speedup!). For
+   exact-match lookups, prefer `UNIQUE_HASH` / `NOTUNIQUE_HASH`; for ranges and ordered
+   scans, prefer `UNIQUE` / `NOTUNIQUE` (`LSM_TREE`).
 7. ✅ **Statistical validation** (10 runs) ensures reliable performance measurements
 8. ✅ **Result validation** compares actual data values, not just row counts
 9. ✅ **Multi-bucket architecture** creates 15 buckets per type, 1 index file per bucket per property
diff --git a/bindings/python/docs/examples/05_csv_import_graph.md b/bindings/python/docs/examples/05_csv_import_graph.md
@@ -306,12 +306,12 @@ LIMIT 10
 db.command("sql", "CREATE VERTEX TYPE User IF NOT EXISTS")
 db.command("sql", "CREATE PROPERTY User.userId LONG")
 db.command("sql", "CREATE PROPERTY User.name STRING")
-db.command("sql", "CREATE INDEX ON User (userId) UNIQUE")
+db.command("sql", "CREATE INDEX ON User (userId) UNIQUE_HASH")
 
 db.command("sql", "CREATE VERTEX TYPE Movie IF NOT EXISTS")
 db.command("sql", "CREATE PROPERTY Movie.movieId LONG")
 db.command("sql", "CREATE PROPERTY Movie.title STRING")
-db.command("sql", "CREATE INDEX ON Movie (movieId) UNIQUE")
+db.command("sql", "CREATE INDEX ON Movie (movieId) UNIQUE_HASH")
 
 # Create edge types
 db.command("sql", "CREATE EDGE TYPE RATED UNIDIRECTIONAL IF NOT EXISTS")
diff --git a/bindings/python/docs/guide/core/queries.md b/bindings/python/docs/guide/core/queries.md
@@ -331,6 +331,42 @@ for row in result:
     print(row.get("content"), row.get("$score"))
 ```
 
+### Choosing index types in SQL DSL
+
+When you create indexes through SQL, the keyword after the property list (for example
+`UNIQUE_HASH`) selects both the index structure and the uniqueness constraint.
+
+```python
+# General-purpose ordered index (LSM_TREE)
+db.command("sql", "CREATE INDEX ON User (email) UNIQUE")
+db.command("sql", "CREATE INDEX ON Event (createdAt) NOTUNIQUE")
+
+# Exact-match hash index
+db.command("sql", "CREATE INDEX ON User (email) UNIQUE_HASH")
+db.command("sql", "CREATE INDEX ON Order (customerId) NOTUNIQUE_HASH")
+
+# Specialized indexes
+db.command("sql", "CREATE INDEX ON Article (content) FULL_TEXT")
+db.command("sql", "CREATE INDEX ON Doc (embedding) LSM_VECTOR METADATA {\"dimensions\": 128}")
+db.command("sql", "CREATE INDEX ON Place (location) GEOSPATIAL")
+```
+
+Rules of thumb:
+
+- Use `UNIQUE_HASH` or `NOTUNIQUE_HASH` for exact-match lookups only.
+- Use `UNIQUE` or `NOTUNIQUE` for `LSM_TREE` indexes when you need ranges, ordering, or a safe general-purpose default.
+- Use `FULL_TEXT` for tokenized text search, not normal equality lookups.
+- Use `LSM_VECTOR` for embeddings and nearest-neighbor search.
+- Use `GEOSPATIAL` for spatial predicates.
+
+Examples:
+
+- `email = ?`, `userId = ?`, `movieId = ?`: usually `UNIQUE_HASH` or `NOTUNIQUE_HASH`
+- `createdAt BETWEEN ? AND ?`, `price > ?`, ordered scans: usually `UNIQUE` or `NOTUNIQUE`
+
+`HASH` does not imply uniqueness. A non-unique hash index still makes sense when many
+records share the same exact-match value, such as `customerId`, `status`, or `country`.
+
 ### ResultSet Methods
 
 ```python
diff --git a/bindings/python/docs/java-api-coverage.md b/bindings/python/docs/java-api-coverage.md
@@ -1,23 +1,23 @@
-## Java API Coverage Analysis
+# Java API Coverage Analysis
 
 This section provides a practical mapping between the ArcadeDB Java API and the
 Python bindings surface in this repository. It reflects the current code in
 `arcadedb_embedded` rather than a theoretical, full Java surface comparison.
 
-### Executive Summary
+## Executive Summary
 
 The Python bindings expose the **core database, schema, graph, vector, async,
 import/export, and server workflows** needed for typical application usage. Most
 omissions are **low-level JVM internals** (WAL details, bucket scanning, binary
 protocol, server plugins, clustering) that are not typically used from Python.
 
-#### Coverage by Area (Qualitative)
+### Coverage by Area (Qualitative)
 
 | Area | Status | Notes |
 | --- | --- | --- |
 | Core Database | ✅ Supported | `DatabaseFactory`, `Database`, transactions, lookups, async helpers |
 | Query Execution | ✅ Supported | SQL, OpenCypher, MongoDB, GraphQL passthrough |
-| Schema & Indexes | ✅ Supported | Types, properties, LSM/FULL_TEXT/Vector indexes |
+| Schema & Indexes | ✅ Supported | Types, properties, LSM_TREE/HASH/FULL_TEXT/LSM_VECTOR/GEOSPATIAL indexes |
 | Graph API | ✅ Supported | SQL/OpenCypher graph workflows plus `Document`/`Vertex`/`Edge` wrapper compatibility |
 | Vector Search | ✅ Supported | JVector indexes + NumPy conversion helpers |
 | Async Execution | ✅ Supported | `AsyncExecutor` plus record-level and SQL/Cypher async flows |
diff --git a/bindings/python/examples/02_social_network_graph.py b/bindings/python/examples/02_social_network_graph.py
@@ -139,8 +139,7 @@ def create_schema(db):
         db.command("sql", "CREATE PROPERTY FRIEND_OF.closeness STRING")
         print("  ✓ Created FRIEND_OF properties")
 
-        # Create indexes for better performance using Schema API
-        db.command("sql", "CREATE INDEX ON Person (name) NOTUNIQUE")
+        db.command("sql", "CREATE INDEX ON Person (name) NOTUNIQUE_HASH")
         print("  ✓ Created index on Person.name")
 
         print(f"  ⏱️  Time: {time.time() - step_start:.3f}s")
diff --git a/bindings/python/examples/03_vector_search.py b/bindings/python/examples/03_vector_search.py
@@ -99,7 +99,7 @@
     db.command("sql", "CREATE PROPERTY Article.id STRING")
 
     # Create standard index on ID for fast lookups
-    db.command("sql", "CREATE INDEX ON Article (id) UNIQUE")
+    db.command("sql", "CREATE INDEX ON Article (id) UNIQUE_HASH")
 
     print("   ✅ Schema created: Article vertex type")
     print("   💡 Vector property type: ARRAY_OF_FLOATS")
diff --git a/bindings/python/examples/04_csv_import_documents.py b/bindings/python/examples/04_csv_import_documents.py
@@ -618,8 +618,14 @@ def create_indexes(db, indexes, verbose=True):
                 # Convert uniqueness string to SQL index creation
                 if uniqueness == "UNIQUE":
                     db.command("sql", f"CREATE INDEX ON {table} ({column}) UNIQUE")
+                elif uniqueness == "UNIQUE_HASH":
+                    db.command("sql", f"CREATE INDEX ON {table} ({column}) UNIQUE_HASH")
                 elif uniqueness == "FULL_TEXT":
                     db.command("sql", f"CREATE INDEX ON {table} ({column}) FULL_TEXT")
+                elif uniqueness == "NOTUNIQUE_HASH":
+                    db.command(
+                        "sql", f"CREATE INDEX ON {table} ({column}) NOTUNIQUE_HASH"
+                    )
                 else:  # NOTUNIQUE
                     db.command("sql", f"CREATE INDEX ON {table} ({column}) NOTUNIQUE")
 
@@ -1469,15 +1475,16 @@ def _flush_batch(batch_rows):
 
 # Check all existing indexes
 #
-# Note: ArcadeDB has 3 index engine types: LSM_TREE, FULL_TEXT, VECTOR
+# Note: ArcadeDB exposes multiple index engines, including LSM_TREE, HASH,
+# FULL_TEXT, and LSM_VECTOR.
 # The schema metadata query only exposes a boolean 'unique' field, not the engine type.
 # Therefore:
-#   - UNIQUE indexes → unique=true, engine=LSM_TREE
-#   - NOTUNIQUE indexes → unique=false, engine=LSM_TREE
+#   - UNIQUE / UNIQUE_HASH indexes → unique=true
+#   - NOTUNIQUE / NOTUNIQUE_HASH indexes → unique=false
 #   - FULL_TEXT indexes → unique=false, engine=FULL_TEXT (appears as NOTUNIQUE!)
 #
-# This means FULL_TEXT indexes show as NOTUNIQUE in the metadata, so we need to
-# check for both when validating expected FULL_TEXT indexes.
+# This means HASH and FULL_TEXT indexes can only be matched by their uniqueness flag;
+# the engine type itself cannot be verified from this metadata.
 for idx in existing_indexes:
     idx_dict = json.loads(idx.to_json())
     index_type = "UNIQUE" if idx_dict.get("unique") else "NOTUNIQUE"
@@ -1505,7 +1512,7 @@ def _flush_batch(batch_rows):
                 candidate_columns.append(prop)
 
     if isinstance(name, str) and "[" in name and name.endswith("]"):
-        raw_props = name[name.find("[") + 1 : -1]
+        raw_props = name[name.find("[") + 1 : -1].strip()
         for col in raw_props.split(","):
             col = col.strip()
             if col:
@@ -1514,15 +1521,23 @@ def _flush_batch(batch_rows):
     candidate_columns = [c for c in candidate_columns if c]
 
     for column_name in candidate_columns:
-        # Try matching as the reported type (UNIQUE/NOTUNIQUE)
+        # Try matching against the reported uniqueness (UNIQUE/NOTUNIQUE).
         key = (type_name, column_name, index_type)
         if key in expected_indexes:
             expected_indexes[key] = True
 
+        if index_type == "UNIQUE":
+            unique_hash_key = (type_name, column_name, "UNIQUE_HASH")
+            if unique_hash_key in expected_indexes:
+                expected_indexes[unique_hash_key] = True
+
         # FULL_TEXT indexes appear as NOTUNIQUE in metadata, so also check for FULL_TEXT
         # This is expected behavior since FULL_TEXT is a different index engine type,
         # not a variant of LSM_TREE, but metadata only exposes the 'unique' boolean.
         if index_type == "NOTUNIQUE":
+            notunique_hash_key = (type_name, column_name, "NOTUNIQUE_HASH")
+            if notunique_hash_key in expected_indexes:
+                expected_indexes[notunique_hash_key] = True
             fulltext_key = (type_name, column_name, "FULL_TEXT")
             if fulltext_key in expected_indexes:
                 expected_indexes[fulltext_key] = True
@@ -1534,6 +1549,9 @@ def _flush_batch(batch_rows):
             unique_key = (type_name, column_name, "UNIQUE")
             if unique_key in expected_indexes:
                 expected_indexes[unique_key] = True
+            unique_hash_key = (type_name, column_name, "UNIQUE_HASH")
+            if unique_hash_key in expected_indexes:
+                expected_indexes[unique_hash_key] = True
 
 # Validate all expected indexes were created
 print("\n   ✅ Validating expected indexes:")
diff --git a/bindings/python/examples/05_csv_import_graph.py b/bindings/python/examples/05_csv_import_graph.py
@@ -1429,8 +1429,8 @@ def create_schema(db: Any, create_indexes: bool = True):
         print("Creating indexes...")
         # Idempotent index creation
         for command in (
-            "CREATE INDEX ON User (userId) UNIQUE",
-            "CREATE INDEX ON Movie (movieId) UNIQUE",
+            "CREATE INDEX ON User (userId) UNIQUE_HASH",
+            "CREATE INDEX ON Movie (movieId) UNIQUE_HASH",
         ):
             try:
                 db.command("sql", command)
diff --git a/bindings/python/examples/07_stackoverflow_tables_oltp.py b/bindings/python/examples/07_stackoverflow_tables_oltp.py
@@ -563,7 +563,7 @@ def create_schema_arcadedb(db):
 
 def create_arcadedb_id_indexes(db):
     for table in TABLE_DEFS:
-        db.command("sql", f"CREATE INDEX ON {table['name']} (Id) UNIQUE")
+        db.command("sql", f"CREATE INDEX ON {table['name']} (Id) UNIQUE_HASH")
 
 
 def create_schema_sqlite(conn: sqlite3.Connection):
diff --git a/bindings/python/examples/08_stackoverflow_tables_olap.py b/bindings/python/examples/08_stackoverflow_tables_olap.py
@@ -482,6 +482,8 @@ def count_table_rows_postgres(conn) -> Dict[str, int]:
     ("PostHistory", ["PostHistoryTypeId"], False),
 ]
 
+HASH_INDEX_FIELDS = {"Id"}
+
 
 QUERY_DEFS: List[Dict[str, str]] = [
     {
@@ -957,7 +959,11 @@ def create_indexes(db, retry_delay: int = 10, max_retries: int = 60) -> float:
         created = False
         for attempt in range(1, max_retries + 1):
             try:
-                unique_clause = "UNIQUE" if unique else "NOTUNIQUE"
+                use_hash = unique and len(props) == 1 and props[0] in HASH_INDEX_FIELDS
+                if use_hash:
+                    unique_clause = "UNIQUE_HASH" if unique else "NOTUNIQUE_HASH"
+                else:
+                    unique_clause = "UNIQUE" if unique else "NOTUNIQUE"
                 props_clause = ", ".join(props)
                 db.command(
                     "sql",
diff --git a/bindings/python/examples/09_stackoverflow_graph_oltp.py b/bindings/python/examples/09_stackoverflow_graph_oltp.py
@@ -456,12 +456,12 @@ def create_arcadedb_schema(db):
 
 
 def create_arcadedb_indexes(db):
-    db.command("sql", "CREATE INDEX ON User (Id) UNIQUE")
-    db.command("sql", "CREATE INDEX ON Question (Id) UNIQUE")
-    db.command("sql", "CREATE INDEX ON Answer (Id) UNIQUE")
-    db.command("sql", "CREATE INDEX ON Tag (Id) UNIQUE")
-    db.command("sql", "CREATE INDEX ON Badge (Id) UNIQUE")
-    db.command("sql", "CREATE INDEX ON Comment (Id) UNIQUE")
+    db.command("sql", "CREATE INDEX ON User (Id) UNIQUE_HASH")
+    db.command("sql", "CREATE INDEX ON Question (Id) UNIQUE_HASH")
+    db.command("sql", "CREATE INDEX ON Answer (Id) UNIQUE_HASH")
+    db.command("sql", "CREATE INDEX ON Tag (Id) UNIQUE_HASH")
+    db.command("sql", "CREATE INDEX ON Badge (Id) UNIQUE_HASH")
+    db.command("sql", "CREATE INDEX ON Comment (Id) UNIQUE_HASH")
 
     db.async_executor().wait_completion()
 
diff --git a/bindings/python/examples/10_stackoverflow_graph_olap.py b/bindings/python/examples/10_stackoverflow_graph_olap.py
@@ -547,7 +547,7 @@ def create_arcadedb_indexes(db, retry_delay: int = 10, max_retries: int = 60) ->
         created = False
         for attempt in range(1, max_retries + 1):
             try:
-                unique_clause = " UNIQUE" if unique else ""
+                unique_clause = " UNIQUE_HASH" if unique else ""
                 props_csv = ", ".join(props)
                 db.command(
                     "sql",
diff --git a/bindings/python/examples/13_stackoverflow_hybrid_queries.py b/bindings/python/examples/13_stackoverflow_hybrid_queries.py
@@ -395,6 +395,8 @@ def infer_table_defs_from_xml(
     ("Tag", ["TagName"], False),
 ]
 
+HASH_INDEX_FIELDS = {"Id"}
+
 GRAPH_VERTEX_TYPES = ["User", "Question", "Answer", "Tag", "Badge", "Comment"]
 GRAPH_EDGE_TYPES = [
     "ASKED",
@@ -429,7 +431,11 @@ def create_indexes_with_retry(
         created = False
         for attempt in range(1, max_retries + 1):
             try:
-                unique_clause = "UNIQUE" if unique else "NOTUNIQUE"
+                use_hash = unique and len(props) == 1 and props[0] in HASH_INDEX_FIELDS
+                if use_hash:
+                    unique_clause = "UNIQUE_HASH" if unique else "NOTUNIQUE_HASH"
+                else:
+                    unique_clause = "UNIQUE" if unique else "NOTUNIQUE"
                 props_clause = ", ".join(props)
                 db.command(
                     "sql",
@@ -600,7 +606,7 @@ def create_graph_schema(db) -> None:
     db.command("sql", "CREATE PROPERTY LINKED_TO.CreationDate INTEGER")
 
     for vertex_type in GRAPH_VERTEX_TYPES:
-        db.command("sql", f"CREATE INDEX ON {vertex_type} (Id) UNIQUE")
+        db.command("sql", f"CREATE INDEX ON {vertex_type} (Id) UNIQUE_HASH")
 
     db.async_executor().wait_completion()
 
diff --git a/bindings/python/examples/14_lifecycle_timing.py b/bindings/python/examples/14_lifecycle_timing.py
@@ -47,15 +47,15 @@ def create_schema(db, vector_dimensions: int) -> None:
     db.command("sql", "CREATE VERTEX TYPE Node")
     db.command("sql", "CREATE PROPERTY Node.node_id INTEGER")
     db.command("sql", "CREATE PROPERTY Node.group_name STRING")
-    db.command("sql", "CREATE INDEX ON Node (node_id) UNIQUE")
+    db.command("sql", "CREATE INDEX ON Node (node_id) UNIQUE_HASH")
 
     db.command("sql", "CREATE EDGE TYPE CONNECTED_TO UNIDIRECTIONAL")
     db.command("sql", "CREATE PROPERTY CONNECTED_TO.weight INTEGER")
 
     db.command("sql", "CREATE VERTEX TYPE VectorDoc")
     db.command("sql", "CREATE PROPERTY VectorDoc.doc_id STRING")
     db.command("sql", "CREATE PROPERTY VectorDoc.embedding ARRAY_OF_FLOATS")
-    db.command("sql", "CREATE INDEX ON VectorDoc (doc_id) UNIQUE")
+    db.command("sql", "CREATE INDEX ON VectorDoc (doc_id) UNIQUE_HASH")
 
     db.create_vector_index(
         vertex_type="VectorDoc",
diff --git a/bindings/python/examples/16_import_database_vs_transactional_graph_ingest.py b/bindings/python/examples/16_import_database_vs_transactional_graph_ingest.py
diff --git a/bindings/python/tests/test_core.py b/bindings/python/tests/test_core.py
diff --git a/bindings/python/tests/test_cypher.py b/bindings/python/tests/test_cypher.py
diff --git a/bindings/python/tests/test_docs_examples.py b/bindings/python/tests/test_docs_examples.py
diff --git a/bindings/python/tests/test_exporter.py b/bindings/python/tests/test_exporter.py
diff --git a/bindings/python/tests/test_graph_api.py b/bindings/python/tests/test_graph_api.py
diff --git a/things-to-do.md b/things-to-do.md