update v1.2.0-alpha.5

euphoria0-0 · euphoria0-0 · commit f5857b50f409 · 2025-12-05T08:47:12.000Z
diff --git a/README.md b/README.md
@@ -1,6 +1,8 @@
-# enVector with ANN (GAS) in VectorDBBench
+# enVector in VectorDBBench
 
-The guide on how to use enVector with ANN index in VectorDBBench is available in [README_ENVECTOR.md](README_ENVECTOR.md).
+**Quick start:** The guide on how to use **enVector** in VectorDBBench is available here:
+
+👉 [README_ENVECTOR.md](README_ENVECTOR.md).
 
 The followings are the original contents of README in VectorDBBench:
 
diff --git a/README_ENVECTOR.md b/README_ENVECTOR.md
@@ -1,6 +1,6 @@
-# enVector with ANN (GAS) in VectorDBBench
+# enVector in VectorDBBench
 
-This guide demonstrates how to use enVector with an ANN index in VectorDBBench.
+This guide demonstrates how to use enVector in VectorDBBench.
 
 Basic usage of enVector with VectorDBBench follows the standard procedure for [VectorDBBench](https://github.com/zilliztech/VectorDBBench).
 
@@ -18,7 +18,7 @@ Basic usage of enVector with VectorDBBench follows the standard procedure for [V
 │       ├── test.npy
 │       └── train.pkl
 ├── README_ENVECTOR.md
-├── scripts
+└── scripts
     ├── run_benchmark.sh              # benchmark script
     ├── envector_pubmed_config.yml    # benchmark config file
     └── prepare_dataset.py            # download and prepare ground truth neighbors for dataset
@@ -35,8 +35,8 @@ source .venv/bin/activate
 # 2. Install VectorDBBench
 pip install -e .
 
-# 3. Install es2
-pip install es2==1.2.0a4
+# 3. Install pyenvector
+pip install pyenvector==1.2.0a5
 ```
 
 ### Prepare dataset
@@ -48,8 +48,8 @@ Prepare the following artifacts for the ANN benchmark with `scripts/prepare_data
 - download centroids and tree metadata for the GAS index for corresponding to the embedding model
 
 For the ANN benchmark, we provide two datasets via HuggingFace:
-- PUBMED768D400K: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m)
-- BLOOMBERG768D368K: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m)
+- `PUBMED768D400K`: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m)
+- `BLOOMBERG768D368K`: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m)
 
 Also, we provide centroids and tree metadata for the corresponding embedding model used in the ANN benchmark:
 - GAS Centroids: [cryptolab-playground/gas-centroids](https://huggingface.co/datasets/cryptolab-playground/gas-centroids)
@@ -63,7 +63,7 @@ python ./scripts/prepare_dataset.py \
     -e embeddinggemma-300m
 ```
 
-Then, you can find the following generated files:
+Then, you can find the generated files as follows:
 
 ```bash
 .
@@ -91,25 +91,38 @@ cd envector-deployment/docker-compose
 ```
 
 We provide four enVector Docker Images:
-- `cryptolabinc/es2e:v1.2.0-alpha.4`
-- `cryptolabinc/es2b:v1.2.0-alpha.4`
-- `cryptolabinc/es2o:v1.2.0-alpha.4`
-- `cryptolabinc/es2c:v1.2.0-alpha.4`
+- `cryptolabinc/es2e:v1.2.0-alpha.5`
+- `cryptolabinc/es2b:v1.2.0-alpha.5`
+- `cryptolabinc/es2o:v1.2.0-alpha.5`
+- `cryptolabinc/es2c:v1.2.0-alpha.5`
 
 ### Set Environment Variables
 
 ```bash
 # Set environment variables
-export DATASET_LOCAL_DIR="./dataset"
-export NUM_PER_BATCH=4096
+export DATASET_LOCAL_DIR="./dataset"  # dataset directory. default: /tmp/vectordb_bench/dataset
+export NUM_PER_BATCH=4096             # default batch size for enVector
 ```
 
-## Run Benchmark
+## Run Our ANN Benchmark
+
+We provide an enVector-customized ANN approach, called "GAS", designed to perform efficient IVF-FLAT-based ANN search with the encrypted index.
+We evaluated enVector on the two benchmark datasets that we provide:
+- `PUBMED768D400K`
+- `BLOOMBERG768D368K`
+
+Run the provided shell script (`./scripts/run_benchmark.sh`) as follows:
+
+```bash
+./scripts/run_benchmark.sh --type flat  # FLAT
+./scripts/run_benchmark.sh --type ivf   # IVF-FLAT with enVector-customized ANN (GAS)
+```
+
+For more details on benchmarks using enVector with ANN (GAS), please refer to `run_benchmark.sh` or `envector_{benchmark}_config.yml` in the `scripts` directory, or use the following command:
 
-Refer to `./scripts/run_benchmark.sh` or `./scripts/envector_benchmark_config.yml` for benchmarks with enVector with ANN (VCT), or use the following command:
 
 ```bash
-export NUM_PER_BATCH=500000 # set to the database size for efficiency with IVF_FLAT
+export NUM_PER_BATCH=500000 # set to the database size when using IVF_FLAT
 python -m vectordb_bench.cli.vectordbbench envectorivfflat \
     --uri "localhost:50050" \
     --eval-mode mm \
@@ -123,10 +136,69 @@ python -m vectordb_bench.cli.vectordbbench envectorivfflat \
     --custom-dataset-file-count 1 \
     --custom-dataset-with-gt \
     --skip-custom-dataset-use-shuffled \
+    --k 10 \
     --train-centroids True \
     --is-vct True \
     --centroids-path "./centroids/embeddinggemma-300m/centroids.npy" \
     --vct-path "./centroids/embeddinggemma-300m/tree_info.pkl" \
     --nlist 32768 \
     --nprobe 6
-```
+```
+
+Note that `NUM_PER_BATCH` should be set to the database size when using an IVF-based index for enVector.
+
+## Run VectorDBBench Benchmark
+
+Run the following commands to run enVector with VectorDBBench's built-in benchmark.
+
+```bash
+# flat
+python -m vectordb_bench.cli.vectordbbench envectorflat \
+    --uri "localhost:50050" \
+    --case-type "Performance1536D500K" \
+    --db-label "Performance1536D500K-FLAT"
+
+# ivf: IVF-FLAT with random centroids
+export NUM_PER_BATCH=500000  # set to the database size when using IVF-FLAT
+python -m vectordb_bench.cli.vectordbbench envectorivfflat \
+    --uri "localhost:50050" \
+    --case-type "Performance1536D500K" \
+    --db-label "Performance1536D500K-IVF-FLAT" \
+    --nlist 250 \
+    --nprobe 6
+
+# ivf-trained: IVF-FLAT with trained centroids via k-means (centroids built by sklearn, etc.)
+export NUM_PER_BATCH=500000 # set to the database size when using IVF-FLAT
+python -m vectordb_bench.cli.vectordbbench envectorivfflat \
+    --uri "localhost:50050" \
+    --case-type "Performance1536D500K" \
+    --db-label "Performance1536D500K-IVF-FLAT" \
+    --train-centroids True \
+    --centroids-path "./centroids/kmeans_centroids.npy" \
+    --nlist 250 \
+    --nprobe 6
+```
+
+Note that the benchmarks provided by VectorDBBench, including `Performance1536D500K`, use an **unknown** embedding model (described only as an OpenAI model), so we cannot use our GAS approach for ANN on them.
+
+### CLI Options
+
+enVector Types for VectorDBBench
+- `envectorflat`: FLAT as index type for enVector
+- `envectorivfflat`: IVF_FLAT as index type for enVector
+
+Common Options for enVector
+- `--uri`: enVector server URI
+- `--eval-mode`: FHE evaluation mode on server. Use `mm` for enhanced performance.
+
+ANN Options for enVector
+- `--nlist`: Number of coarse clusters for IVF_FLAT
+- `--nprobe`: Number of clusters to scan during search for IVF_FLAT
+- `--train-centroids`: whether to use trained centroids for IVF_FLAT
+- `--centroids-path`: path to the trained centroids
+- `--is-vct`: whether to use VCT approach for IVF_GAS
+- `--vct-path`: path to the trained VCT metadata for IVF_GAS
+
+Benchmark Options
+- These follow the conventions of VectorDBBench;
+  see details in [VectorDBBench Options](https://github.com/zilliztech/VectorDBBench?tab=readme-ov-file#custom-dataset-for-performance-case)
diff --git a/vectordb_bench/backend/clients/envector/envector.py b/vectordb_bench/backend/clients/envector/envector.py
@@ -7,8 +7,8 @@
 from pathlib import Path
 from typing import Any
 
-import es2
 import numpy as np
+import pyenvector as ev
 
 from vectordb_bench.backend.filter import Filter, FilterOp
 
@@ -51,32 +51,32 @@ def __init__(
         self._vector_index_name = "vector_idx"
         self._scalar_id_index_name = "id_sort_idx"
         self._scalar_labels_index_name = "labels_idx"
-        self.col: es2.Index | None = None
+        self.col: ev.Index | None = None
 
         self.is_vct: bool = False
         self.vct_params: dict[str, Any] = {}
 
-        es2.init(
+        ev.init(
             address=self.db_config.get("uri"),
             key_path=self.db_config.get("key_path"),
             key_id=self.db_config.get("key_id"),
             eval_mode=self.case_config.eval_mode,
         )
         if drop_old:
             log.info(f"{self.name} client drop_old index: {self.collection_name}")
-            if self.collection_name in es2.get_index_list():
-                es2.drop_index(self.collection_name)
+            if self.collection_name in ev.get_index_list():
+                ev.drop_index(self.collection_name)
 
         # Create the collection
         log.info(f"{self.name} create index: {self.collection_name}")
 
         index_kwargs = dict(kwargs)
         self._ensure_index(dim, index_kwargs)
 
-        es2.disconnect()
+        ev.disconnect()
 
     def _ensure_index(self, dim: int, index_kwargs: dict[str, Any]):
-        if self.collection_name in es2.get_index_list():
+        if self.collection_name in ev.get_index_list():
             log.info(f"{self.name} index {self.collection_name} already exists, skip creating")
             self.is_vct = self.case_config.index_param().get("is_vct", False)
             log.debug(f"IS_VCT: {self.is_vct}")
@@ -94,7 +94,7 @@ def _create_index(self, dim: int, index_kwargs: dict[str, Any]):
         if index_type == "IVF_FLAT":
             self._adjust_batch_size()
 
-        es2.create_index(
+        ev.create_index(
             index_name=self.collection_name,
             dim=dim,
             key_path=self.db_config.get("key_path"),
@@ -146,24 +146,24 @@ def init(self):
             >>>     self.insert_embeddings()
             >>>     self.search_embedding()
         """
-        es2.init(
+        ev.init(
             address=self.db_config.get("uri"),
             key_path=self.db_config.get("key_path"),
             key_id=self.db_config.get("key_id"),
             eval_mode=self.case_config.eval_mode,
         )
         try:
-            self.col = es2.Index(self.collection_name)
+            self.col = ev.Index(self.collection_name)
             if self.is_vct:
-                log.debug(f"VCT: {self.col.index_config.index_param.index_params['virtual_cluster']}")
+                log.debug(f"VCT: {self.col.index_config.index_param.index_params.get('virtual_cluster')}")
                 is_vct = self.case_config.index_param().get("is_vct", False)
                 assert self.is_vct == is_vct, "is_vct mismatch"
                 vct_path = self.case_config.index_param().get("vct_path", None)
                 self.col._load_virtual_cluster_from_pkl(vct_path)
             yield
         finally:
             self.col = None
-            es2.disconnect()
+            ev.disconnect()
 
     def create_index(self):
         pass
@@ -194,8 +194,6 @@ def insert_embeddings(
         assert self.col is not None
         assert len(embeddings) == len(metadata)
 
-        log.debug(f"IS_VCT: {self.is_vct}")
-
         insert_count = 0
         try:
             for batch_start_offset in range(0, len(embeddings), self.batch_size):