From d4333046f68739044a818d93a4043a24ee14b5a0 Mon Sep 17 00:00:00 2001 From: suyeong Date: Mon, 2 Feb 2026 23:40:48 +0000 Subject: [PATCH 1/4] update v1.3.0-alpha.1 --- README.md | 906 ++++-------------- README_ENVECTOR.md | 203 ---- scripts/envector_bloomberg_config.yml | 42 - scripts/envector_pubmed_config.yml | 42 - scripts/prepare_dataset.py | 54 +- scripts/requirements.txt | 4 + scripts/run_benchmark.sh | 131 +-- vectordb_bench/__init__.py | 2 +- vectordb_bench/backend/clients/api.py | 2 +- .../backend/clients/envector/cli.py | 47 +- .../backend/clients/envector/config.py | 32 +- .../backend/clients/envector/envector.py | 103 +- .../backend/runner/serial_runner.py | 29 +- vectordb_bench/cli/vectordbbench.py | 5 +- .../envector_bloomberg_config.yml | 46 + .../config-files/envector_fashion_config.yml | 46 + .../config-files/envector_openai_config.yml | 26 + .../config-files/envector_products_config.yml | 46 + .../config-files/envector_pubmed_config.yml | 46 + 19 files changed, 625 insertions(+), 1187 deletions(-) delete mode 100644 README_ENVECTOR.md delete mode 100644 scripts/envector_bloomberg_config.yml delete mode 100644 scripts/envector_pubmed_config.yml create mode 100644 scripts/requirements.txt create mode 100644 vectordb_bench/config-files/envector_bloomberg_config.yml create mode 100644 vectordb_bench/config-files/envector_fashion_config.yml create mode 100644 vectordb_bench/config-files/envector_openai_config.yml create mode 100644 vectordb_bench/config-files/envector_products_config.yml create mode 100644 vectordb_bench/config-files/envector_pubmed_config.yml diff --git a/README.md b/README.md index 3298b2536..0ec3d2a21 100644 --- a/README.md +++ b/README.md @@ -1,779 +1,261 @@ # enVector in VectorDBBench -**Quick start:** The guide on how to use **enVector** in VectorDBBench is available in : +This guide demonstrates how to use [enVector](https://docs.envector.io/) in VectorDBBench. 
+**enVector** is a vector search engine that lets you search directly on encrypted data. +VectorDBBench with enVector provides toolkits to measure and compare performance across different index types. -👉 [README_ENVECTOR.md](README_ENVECTOR.md). +Basic usage of enVector with VectorDBBench follows the standard procedure for [VectorDBBench](https://github.com/zilliztech/VectorDBBench). -The followings are the original contents of README in VectorDBBench: +## 🚀 Quick Start ---- +1. Run enVector server -# VectorDBBench(VDBBench): A Benchmark Tool for VectorDB +```bash +# Start enVector server +git clone https://github.com/CryptoLabInc/envector-deployment +cd envector-deployment/docker-compose +./start_envector.sh +``` -[![version](https://img.shields.io/pypi/v/vectordb-bench.svg?color=blue)](https://pypi.org/project/vectordb-bench/) -[![Downloads](https://pepy.tech/badge/vectordb-bench)](https://pepy.tech/project/vectordb-bench) +2. Install Python Dependencies -## What is VDBBench -VDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze. +```bash +# Install Python Dependencies +pip install -e . +pip install pyenvector==1.3.0a1 +``` -Understanding the importance of user experience, we provide an intuitive visual interface. This not only empowers users to initiate benchmarks at ease, but also to view comparative result reports, thereby reproducing benchmark results effortlessly. -To add more relevance and practicality, we provide cost-effectiveness reports particularly for cloud services. This allows for a more realistic and applicable benchmarking process. +3. 
Run Benchmark -Closely mimicking real-world production environments, we've set up diverse testing scenarios including insertion, searching, and filtered searching. To provide you with credible and reliable data, we've included public datasets from actual production scenarios, such as [SIFT](http://corpus-texmex.irisa.fr/), [GIST](http://corpus-texmex.irisa.fr/), [Cohere](https://huggingface.co/datasets/Cohere/wikipedia-22-12/tree/main/en), and a dataset generated by OpenAI from an opensource [raw dataset](https://huggingface.co/datasets/allenai/c4). It's fascinating to discover how a relatively unknown open-source database might excel in certain circumstances! +```bash +# Run Benchmark +./scripts/run_benchmark.sh --index-type FLAT --config-file envector_openai_config.yml +``` -Prepare to delve into the world of VDBBench, and let it guide you in uncovering your perfect vector database match. +## 📚 Core Concepts -VDBBench is sponsored by Zilliz,the leading opensource vectorDB company behind Milvus. Choose smarter with VDBBench - start your free test on [zilliz cloud](https://zilliz.com/) today! +### Index Types -**Leaderboard:** https://zilliz.com/benchmark -## Quick Start -### Prerequirement -``` shell -python >= 3.11 -``` -### Install -**Install vectordb-bench with only PyMilvus** -```shell -pip install vectordb-bench -``` +- FLAT: brute-force search. +- IVF-FLAT: Inverted file (IVF) index with flat vectors for faster approximate search. Searches only the nearest clusters with trained centroids instead of the entire database. +- IVF-GAS: enVector-customized ANN algorithm for the fastest approximate search. To test this, we provide benchmark datasets. 
-**Install all database clients** +| Index Type | Speed | Accuracy | Centroids Setup | Dataset Requirements | +|------------|-------|----------|------------------|---------------------| +| FLAT | Slow | 100% | None | Any dataset | +| IVF-FLAT | Fast | 95-99% | Requires training k-means centroids on the client's dataset | Any dataset | +| IVF-GAS | Fastest* | 95-99% | Requires the provided centroids (see [Prepare Dataset](#2-1-prepare-dataset)) | enVector custom datasets only | -``` shell -pip install 'vectordb-bench[all]' -``` -**Install the specific database client** +### Benchmark Cases -```shell -pip install 'vectordb-bench[pinecone]' -``` -All the database client supported - -| Optional database client | install command | -|--------------------------|---------------------------------------------| -| pymilvus, zilliz_cloud (*default*) | `pip install vectordb-bench` | -| all (*clients requirements might be conflict with each other*) | `pip install 'vectordb-bench[all]'` | -| qdrant | `pip install 'vectordb-bench[qdrant]'` | -| pinecone | `pip install 'vectordb-bench[pinecone]'` | -| weaviate | `pip install 'vectordb-bench[weaviate]'` | -| elastic, aliyun_elasticsearch| `pip install 'vectordb-bench[elastic]'` | -| pgvector, pgvectorscale, pgdiskann, alloydb | `pip install 'vectordb-bench[pgvector]'` | -| pgvecto.rs | `pip install 'vectordb-bench[pgvecto_rs]'` | -| redis | `pip install 'vectordb-bench[redis]'` | -| memorydb | `pip install 'vectordb-bench[memorydb]'` | -| chromadb | `pip install 'vectordb-bench[chromadb]'` | -| awsopensearch | `pip install 'vectordb-bench[opensearch]'` | -| aliyun_opensearch | `pip install 'vectordb-bench[aliyun_opensearch]'` | -| mongodb | `pip install 'vectordb-bench[mongodb]'` | -| tidb | `pip install 'vectordb-bench[tidb]'` | -| vespa | `pip install 'vectordb-bench[vespa]'` | -| oceanbase | `pip install 'vectordb-bench[oceanbase]'` | -| hologres | `pip install 'vectordb-bench[hologres]'` | - -### Run - -``` shell -init_bench -``` 
+enVector supports two types of benchmark cases: -OR: +| Benchmark Type | Description | Dataset Preparation | Available Index Types | +|----------------|-------------|---------------------|----------------------| +| **VectorDBBench Built-in** | Standard benchmarks from VectorDBBench | Not required (auto-downloaded) | FLAT, IVF-FLAT | +| **enVector Custom Cases** | Optimized for encrypted search with GAS | Required (see [Prepare Dataset](#2-1-prepare-dataset)) | FLAT, IVF-FLAT, IVF-GAS | -### Run from the command line. -``` shell -vectordbbench [OPTIONS] COMMAND [ARGS]... -``` -To list the clients that are runnable via the commandline option, execute: `vectordbbench --help` -``` text -$ vectordbbench --help -Usage: vectordbbench [OPTIONS] COMMAND [ARGS]... - -Options: - --help Show this message and exit. - -Commands: - pgvectorhnsw - pgvectorivfflat - test - weaviate -``` -To list the options for each command, execute `vectordbbench [command] --help` - -```text -$ vectordbbench pgvectorhnsw --help -Usage: vectordbbench pgvectorhnsw [OPTIONS] - -Options: - --config-file PATH Read configuration from yaml file - --drop-old / --skip-drop-old Drop old or skip [default: drop-old] - --load / --skip-load Load or skip [default: load] - --search-serial / --skip-search-serial - Search serial or skip [default: search- - serial] - --search-concurrent / --skip-search-concurrent - Search concurrent or skip [default: search- - concurrent] - --case-type [CapacityDim128|CapacityDim960|Performance768D100M|Performance768D10M|Performance768D1M|Performance768D10M1P|Performance768D1M1P|Performance768D10M99P|Performance768D1M99P|Performance1536D500K|Performance1536D5M|Performance1536D500K1P|Performance1536D5M1P|Performance1536D500K99P|Performance1536D5M99P|Performance1536D50K] - Case type - --db-label TEXT Db label, default: date in ISO format - [default: 2024-05-20T20:26:31.113290] - --dry-run Print just the configuration and exit - without running the tasks - --k INTEGER K value for number 
of nearest neighbors to - search [default: 100] - --concurrency-duration INTEGER Adjusts the duration in seconds of each - concurrency search [default: 30] - --num-concurrency TEXT Comma-separated list of concurrency values - to test during concurrent search [default: - 1,10,20] - --concurrency-timeout INTEGER Timeout (in seconds) to wait for a - concurrency slot before failing. Set to a - negative value to wait indefinitely. - [default: 3600] - --user-name TEXT Db username [required] - --password TEXT Db password [required] - --host TEXT Db host [required] - --db-name TEXT Db name [required] - --maintenance-work-mem TEXT Sets the maximum memory to be used for - maintenance operations (index creation). Can - be entered as string with unit like '64GB' - or as an integer number of KB.This will set - the parameters: - max_parallel_maintenance_workers, - max_parallel_workers & - table(parallel_workers) - --max-parallel-workers INTEGER Sets the maximum number of parallel - processes per maintenance operation (index - creation) - --m INTEGER hnsw m - --ef-construction INTEGER hnsw ef-construction - --ef-search INTEGER hnsw ef-search - --quantization-type [none|bit|halfvec] - quantization type for vectors (in index) - --table-quantization-type [none|bit|halfvec] - quantization type for vectors (in table). If - equal to bit, the parameter - quantization_type will be set to bit too. - --custom-case-name TEXT Custom case name i.e. PerformanceCase1536D50K - --custom-case-description TEXT Custom name description - --custom-case-load-timeout INTEGER - Custom case load timeout [default: 36000] - --custom-case-optimize-timeout INTEGER - Custom case optimize timeout [default: 36000] - --custom-dataset-name TEXT - Dataset name i.e OpenAI - --custom-dataset-dir TEXT Dataset directory i.e. openai_medium_500k - --custom-dataset-size INTEGER Dataset size i.e. 
500000 - --custom-dataset-dim INTEGER Dataset dimension - --custom-dataset-metric-type TEXT - Dataset distance metric [default: COSINE] - --custom-dataset-file-count INTEGER - Dataset file count - --custom-dataset-use-shuffled / --skip-custom-dataset-use-shuffled - Use shuffled custom dataset or skip [default: custom-dataset- - use-shuffled] - --custom-dataset-with-gt / --skip-custom-dataset-with-gt - Custom dataset with ground truth or skip [default: custom-dataset- - with-gt] - --help Show this message and exit. +## 📁 Project Structure + +```bash +. +├── README.md +├── scripts +│ ├── get_kmeans_centroids.py # create kmeans centroids +│ ├── requirements.txt # python requirements +│ ├── prepare_dataset.py # download and prepare ground truth neighbors for dataset +│ └── run_benchmark.sh # benchmark script +└── vectordb_bench/config-files # benchmark config file + └── envector_{benchmark_case}_config.yml ``` -### Run awsopensearch from command line +## 🔧 Prerequisites -```shell -vectordbbench awsopensearch --db-label awsopensearch \ ---m 16 --ef-construction 256 \ ---host search-vector-db-prod-h4f6m4of6x7yp2rz7gdmots7w4.us-west-2.es.amazonaws.com --port 443 \ ---user vector --password '' \ ---case-type Performance1536D5M --number-of-indexing-clients 10 \ ---skip-load --num-concurrency 75 -``` +### 1. Install Python Dependencies +```bash +# 1. 
Create your environment +python -m venv .venv +source .venv/bin/activate -To list the options for awsopensearch, execute `vectordbbench awsopensearch --help` - -```text -$ vectordbbench awsopensearch --help -Usage: vectordbbench awsopensearch [OPTIONS] - -Options: - # Sharding and Replication - --number-of-shards INTEGER Number of primary shards for the index - --number-of-replicas INTEGER Number of replica copies for each primary - shard - # Indexing Performance - --index-thread-qty INTEGER Thread count for native engine indexing - --index-thread-qty-during-force-merge INTEGER - Thread count during force merge operations - --number-of-indexing-clients INTEGER - Number of concurrent indexing clients - # Index Management - --number-of-segments INTEGER Target number of segments after merging - --refresh-interval TEXT How often to make new data available for - search - --force-merge-enabled BOOLEAN Whether to perform force merge operation - --flush-threshold-size TEXT Size threshold for flushing the transaction - log - --engine TEXT type of engine to use valid values [faiss, lucene, s3vector] - # Memory Management - --cb-threshold TEXT k-NN Memory circuit breaker threshold - - --ondisk Ondisk mode with binary quantization(32x compression) - --oversample-factor Controls the degree of oversampling applied to minority classes in imbalanced datasets to improve model performance by balancing class distributions.(default 1.0) - - - # Quantization Type - --quantization-type TEXT which type of quantization to use valid values [fp32, fp16, bq] - --help Show this message and exit. - ``` -### Run OceanBase from command line - -Execute tests for the index types: HNSW, HNSW_SQ, or HNSW_BQ. - -```shell -vectordbbench oceanbasehnsw --host xxx --port xxx --user root@mysql_tenant --database test \ ---m 16 --ef-construction 200 --case-type Performance1536D50K \ ---index-type HNSW --ef-search 100 -``` +# 2. Install VectorDBBench +pip install -e . 
-To list the options for oceanbase, execute `vectordbbench oceanbasehnsw --help`, The following are some OceanBase-specific command-line options. - -```text -$ vectordbbench oceanbasehnsw --help -Usage: vectordbbench oceanbasehnsw [OPTIONS] - -Options: - [...] - --host TEXT OceanBase host - --user TEXT OceanBase username [required] - --password TEXT OceanBase database password - --database TEXT DataBase name [required] - --port INTEGER OceanBase port [required] - --m INTEGER hnsw m [required] - --ef-construction INTEGER hnsw ef-construction [required] - --ef-search INTEGER hnsw ef-search [required] - --index-type [HNSW|HNSW_SQ|HNSW_BQ] - Type of index to use. Supported values: - HNSW, HNSW_SQ, HNSW_BQ [required] - --help Show this message and exit. - ``` - -Execute tests for the index types: IVF_FLAT, IVF_SQ8, or IVF_PQ. - -```shell -vectordbbench oceanbaseivf --host xxx --port xxx --user root@mysql_tenant --database test \ ---nlist 1000 --sample_per_nlist 256 --case-type Performance768D1M \ ---index-type IVF_FLAT --ivf_nprobes 100 +# 3. Install pyenvector +# pip uninstall pyenvector # if installed +pip install pyenvector==1.3.0a1 ``` -To list the options for oceanbase, execute `vectordbbench oceanbaseivf --help`, The following are some OceanBase-specific command-line options. - -```text -$ vectordbbench oceanbaseivf --help -Usage: vectordbbench oceanbaseivf [OPTIONS] - -Options: - [...] - --host TEXT OceanBase host - --user TEXT OceanBase username [required] - --password TEXT OceanBase database password - --database TEXT DataBase name [required] - --port INTEGER OceanBase port [required] - --index-type [IVF_FLAT|IVF_SQ8|IVF_PQ] - Type of index to use. 
Supported values: - IVF_FLAT, IVF_SQ8, IVF_PQ [required] - --nlist INTEGER Number of cluster centers [required] - --sample_per_nlist INTEGER The cluster centers are calculated by total - sampling sample_per_nlist * nlist vectors - [required] - --ivf_nprobes TEXT How many clustering centers to search during - the query [required] - --m INTEGER The number of sub-vectors that each data - vector is divided into during IVF-PQ - --help Show this message and exit. Show this message and exit. - ``` - -### Run Hologres from command line - -It is recommended to use the following code for installation. -```shell -pip install 'vectordb-bench[hologres]' 'psycopg[binary]' pgvector -``` +### 2. Prepare enVector Server -Execute tests for the index types: HGraph. +To run enVector server with ANN, please refer to the [enVector Deployment repository](https://github.com/CryptoLabInc/envector-deployment). +For example, you can start the server with the following command: -```shell -NUM_PER_BATCH=10000 vectordbbench hologreshgraph --host Hologres_Endpoint --port 80 \ ---user ACCESS_ID --password ACCESS_KEY --database DATABASE_NAME \ ---m 64 --ef-construction 400 --case-type Performance768D10M \ ---index-type HGraph --ef-search 400 --k 10 --num-concurrency 1,60,70,75,80,90,95,100,110,120 +```bash +# Start enVector server +git clone https://github.com/CryptoLabInc/envector-deployment +cd envector-deployment/docker-compose +./start_envector.sh ``` -To list the options for Hologres, execute `vectordbbench hologreshgraph --help`, The following are some Hologres-specific command-line options. - -```text -$ vectordbbench hologreshgraph --help -Usage: vectordbbench hologreshgraph [OPTIONS] - -Options: - [...] 
- --host TEXT Hologres host - --user TEXT Hologres username [required] - --password TEXT Hologres database password - --database TEXT Hologres database name [required] - --port INTEGER Hologres port [required] - --m INTEGER hnsw m [required] - --ef-construction INTEGER hnsw ef-construction [required] - --ef-search INTEGER hnsw ef-search [required] - --index-type [HGraph] Type of index to use. Supported values: - HGraph [required] - --help Show this message and exit. - ``` - -#### Using a configuration file. - -The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file. - -By default, configuration files are expected to be in vectordb_bench/config-files/, this can be overridden by setting -the environment variable CONFIG_LOCAL_DIR or by passing the full path to the file. - -The required format is: -```yaml -commandname: - parameter_name: parameter_value - parameter_name: parameter_value -``` -Example: -```yaml -pgvectorhnsw: - db_label: pgConfigTest - user_name: vectordbbench - password: vectordbbench - db_name: vectordbbench - host: localhost - m: 16 - ef_construction: 128 - ef_search: 128 -milvushnsw: - skip_search_serial: True - case_type: Performance1536D50K - uri: http://localhost:19530 - m: 16 - ef_construction: 128 - ef_search: 128 - drop_old: False - load: False -``` -> Notes: -> - Options passed on the command line will override the configuration file* -> - Parameter names use an _ not - +We provide 5 enVector Docker Images: +- `cryptolabinc/envector-endpoint:v1.3.0-alpha.1` +- `cryptolabinc/envector-backend:v1.3.0-alpha.1` +- `cryptolabinc/envector-shaper:v1.3.0-alpha.1` +- `cryptolabinc/envector-orchestrator:v1.3.0-alpha.1` +- `cryptolabinc/envector-compute:v1.3.0-alpha.1` -#### Using a batch configuration file. +## 📊 Run Benchmark -The vectordbbench command can read a batch configuration file to run all the test cases in the yaml formatted configuration file. +### 1. 
VectorDBBench Built-in Cases -By default, configuration files are expected to be in vectordb_bench/config-files/, this can be overridden by setting -the environment variable CONFIG_LOCAL_DIR or by passing the full path to the file. +Run the following commands to run enVector with VectorDBBench's built-in benchmark. -The required format is: -```yaml -commandname: - - parameter_name: parameter_value - another_parameter_name: parameter_value -``` -Example: -```yaml -pgvectorhnsw: - - db_label: pgConfigTest - user_name: vectordbbench - password: vectordbbench - db_name: vectordbbench - host: localhost - m: 16 - ef_construction: 128 - ef_search: 128 -milvushnsw: - - skip_search_serial: True - case_type: Performance1536D50K - uri: http://localhost:19530 - m: 16 - ef_construction: 128 - ef_search: 128 - drop_old: False - load: False +```bash +./scripts/run_benchmark.sh --index-type FLAT --config-file envector_{benchmark_case}_config.yml # FLAT +./scripts/run_benchmark.sh --index-type IVF_FLAT --config-file envector_{benchmark_case}_config.yml # IVF-FLAT ``` -> Notes: -> - Options can only be passed through configuration files -> - Parameter names use an _ not - -How to use? -```shell -vectordbbench batchcli --batch-config-file +For more details, please refer to `envector_{benchmark_case}_config.yml` in the `vectordb_bench/config-files` directory for benchmarks with enVector, or you can use the following command: + +```bash +python -m vectordb_bench.cli.vectordbbench envectorflat \ + --config-file envector_openai_config.yml + +# or + +python -m vectordb_bench.cli.vectordbbench envectorflat \ + --uri "localhost:50050" \ + --case-type "Performance1536D500K" ``` -## Leaderboard -### Introduction -To facilitate the presentation of test results and provide a comprehensive performance analysis report, we offer a [leaderboard page](https://zilliz.com/benchmark). 
It allows us to choose from QPS, QP$, and latency metrics, and provides a comprehensive assessment of a system's performance based on the test results of various cases and a set of scoring mechanisms (to be introduced later). On this leaderboard, we can select the systems and models to be compared, and filter out cases we do not want to consider. Comprehensive scores are always ranked from best to worst, and the specific test results of each query will be presented in the list below. +If you need the trained k-means centroids, run `./scripts/get_kmeans_centroids.py` with your dataset. -### Scoring Rules +Note that the benchmarks provided by VectorDBBench, including Performance1536D500K, use an **unknown** embedding model (only described as one of OpenAI's), so our GAS approach for ANN cannot be used with them. -1. For each case, select a base value and score each system based on relative values. - - For QPS and QP$, we use the highest value as the reference, denoted as `base_QPS` or `base_QP$`, and the score of each system is `(QPS/base_QPS) * 100` or `(QP$/base_QP$) * 100`. - - For Latency, we use the lowest value as the reference, that is, `base_Latency`, and the score of each system is `(base_Latency + 10ms)/(Latency + 10ms) * 100`. - We want to give equal weight to different cases, and not let a case with high absolute result values become the sole reason for the overall scoring. Therefore, when scoring different systems in each case, we need to use relative values. +### 2. enVector Custom Cases (with GAS Support) - Also, for Latency, we add 10ms to the numerator and denominator to ensure that if every system performs particularly well in a case, its advantage will not be infinitely magnified when latency tends to 0. +We provide an enVector-customized ANN algorithm, called "GAS", designed to perform efficient IVF-FLAT-based ANN search with the encrypted index. +We evaluated enVector on benchmark datasets that we provided. -2. 
For systems that fail or timeout in a particular case, we will give them a score based on a value worse than the worst result by a factor of two. For example, in QPS or QP$, it would be half the lowest value. For Latency, it would be twice the maximum value. +#### 2-1. Prepare dataset -3. For each system, we will take the geometric mean of its scores in all cases as its comprehensive score for a particular metric. +Prepare the following artifacts for the ANN benchmark with `scripts/prepare_dataset.py`: -## Build on your own -### Install requirements -``` shell -pip install -e '.[test]' +- download datasets from HuggingFace +- prepare ground-truth neighbors +- download centroids for the GAS index corresponding to the embedding model -pip install -e '.[pinecone]' -``` -### Run test server +For the ANN benchmark, we provide the following datasets via HuggingFace: +- `PUBMED768D400K`: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m) +- `BLOOMBERG768D368K`: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m) +- `PRODUCTS512D400K` +- `FASHION512D200K` +- `FOOD512D75K` + +Also, we provide centroids and tree metadata for the corresponding embedding model used in the ANN benchmark: +- GAS Centroids: [cryptolab-playground/gas-centroids](https://huggingface.co/datasets/cryptolab-playground/gas-centroids) + +To prepare a dataset, run the following command as an example: + +```bash +# Install dependencies for preparing dataset +pip install -r ./scripts/requirements.txt + +# Prepare GAS dataset +python ./scripts/prepare_dataset.py \ + -d cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m \ + -e embeddinggemma-300m ``` -python -m vectordb_bench + +Then, you can find the generated files as follows: + +```bash +. 
+├── centroids # centroids for IVF index types +│ └── embeddinggemma-300m # centroids for IVF-GAS +│ └── centroids.npy +└── dataset # custom benchmark dataset + └── pubmed768d400k + ├── neighbors.parquet + ├── test.parquet + └── train.parquet ``` -OR: +#### 2-2. Run enVector Custom Cases -If you are using [dev container](https://code.visualstudio.com/docs/devcontainers/containers), create -the following dataset directory first: +Run the provided shell script (`./scripts/run_benchmark.sh`) as follows: -```shell -init_bench +```bash +./scripts/run_benchmark.sh --index-type FLAT --config-file envector_pubmed_config.yml # FLAT +./scripts/run_benchmark.sh --index-type IVF_FLAT --config-file envector_pubmed_config.yml # IVF-FLAT with trained k-means centroids +./scripts/run_benchmark.sh --index-type IVF_GAS --config-file envector_pubmed_config.yml # GAS: enVector-customized ANN ``` -OR: +For more details, please refer to `scripts/run_benchmark.sh` or `envector_{benchmark_case}_config.yml` in the `vectordb_bench/config-files` directory for benchmarks with enVector with ANN (GAS), or you can use the following command: -If you are using [dev container](https://code.visualstudio.com/docs/devcontainers/containers), create -the following dataset directory first: +```bash +python -m vectordb_bench.cli.vectordbbench envectorivfflat \ + --config-file envector_pubmed_config.yml -```shell -# Mount local ~/vectordb_bench/dataset to contain's /tmp/vectordb_bench/dataset. -# If you are not comfortable with the path name, feel free to change it in devcontainer.json -mkdir -p ~/vectordb_bench/dataset -``` -After reopen the repository in container, run `python -m vectordb_bench` in the container's bash. +# or -### Check coding styles -```shell -make lint +python -m vectordb_bench.cli.vectordbbench envectorivfflat \ + --uri "localhost:50050" \ + --eval-mode mm \ + ... 
\ + --train-centroids True \ + --centroids-path "./centroids/embeddinggemma-300m/centroids.npy" \ + --nlist 32768 \ + --nprobe 6 ``` -To fix the coding styles automatically +Note that **`NUM_PER_BATCH` should currently be set to the database size** when using an IVF-based ANN index with enVector. +We will support adjustable `NUM_PER_BATCH` for ANN soon. -```shell -make format -``` +## 🎯 Advanced Usage + +### Prepare Other Datasets + +If you want to test on other benchmark datasets regardless of the ANN benchmark, please run the following scripts: -## How does it work? -### Result Page -![image](https://github.com/zilliztech/VectorDBBench/assets/105927039/8a981327-c1c6-4796-8a85-c86154cb5472) -This is the main page of VDBBench, which displays the standard benchmark results we provide. Additionally, results of all tests performed by users themselves will also be shown here. We also offer the ability to select and compare results from multiple tests simultaneously. - -The standard benchmark results displayed here include all 15 cases that we currently support for 6 of our clients (Milvus, Zilliz Cloud, Elastic Search, Qdrant Cloud, Weaviate Cloud and PgVector). However, as some systems may not be able to complete all the tests successfully due to issues like Out of Memory (OOM) or timeouts, not all clients are included in every case. - -All standard benchmark results are generated by a client running on an 8 core, 32 GB host, which is located in the same region as the server being tested. The client host is equipped with an `Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz` processor. Also all the servers for the open-source systems tested in our benchmarks run on hosts with the same type of processor. -### Run Test Page -1. Initially, you select the systems to be tested - multiple selections are allowed. Once selected, corresponding forms will pop up to gather necessary information for using the chosen databases. 
The db_label is used to differentiate different instances of the same system. We recommend filling in the host size or instance type here (as we do in our standard results). -2. The next step is to select the test cases you want to perform. You can select multiple cases at once, and a form to collect corresponding parameters will appear. -3. Finally, you'll need to provide a task label to distinguish different test results. Using the same label for different tests will result in the previous results being overwritten. -Now we can only run one task at the same time. -![image](vectordb_bench/fig/run_test_select_db.png) -![image](vectordb_bench/fig/run_test_select_case.png) -![image](vectordb_bench/fig/run_test_submit.png) - - -## Module -### Code Structure -![image](https://github.com/zilliztech/VectorDBBench/assets/105927039/8c06512e-5419-4381-b084-9c93aed59639) -### Client -Our client module is designed with flexibility and extensibility in mind, aiming to integrate APIs from different systems seamlessly. As of now, it supports Milvus, Zilliz Cloud, Elastic Search, Pinecone, Qdrant Cloud, Weaviate Cloud, PgVector, Redis, Chroma, etc. Stay tuned for more options, as we are consistently working on extending our reach to other systems. -### Benchmark Cases -We've developed lots of comprehensive benchmark cases to test vector databases' various capabilities, each designed to give you a different piece of the puzzle. These cases are categorized into four main types: -#### Capacity Case -- **Large Dim:** Tests the database's loading capacity by inserting large-dimension vectors (GIST 100K vectors, 960 dimensions) until fully loaded. The final number of inserted vectors is reported. -- **Small Dim:** Similar to the Large Dim case but uses small-dimension vectors (SIFT 500K vectors, 128 dimensions). -#### Search Performance Case -- **XLarge Dataset:** Measures search performance with a massive dataset (LAION 100M vectors, 768 dimensions) at varying parallel levels. 
The results include index building time, recall, latency, and maximum QPS. -- **Large Dataset:** Similar to the XLarge Dataset case, but uses a slightly smaller dataset (10M-1024dim, 10M-768dim, 5M-1536dim). -- **Medium Dataset:** A case using a medium dataset (1M-1024dim, 1M-768dim, 500K-1536dim). -- **Small Dataset:** For development (100K-768dim, 50K-1536dim). -#### Filtering Search Performance Case -- **Int-Filter Cases:** Evaluates search performance with int-based filter expression (e.g. "id >= 2,000"). -- **Label-Filter Cases:** Evaluates search performance with label-based filter expressions (e.g., "color == 'red'"). The test includes randomly generated labels to simulate real-world filtering scenarios. -#### Streaming Cases -- **Insertion-Under-Load Case:** Evaluates search performance while maintaining a constant insertion workload. VDBBench applies a steady stream of insert requests at a fixed rate to simulate real-world scenarios where search operations must perform reliably under continuous data ingestion. - -Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance. - -#### Custom Dataset for Performance case - -Through the `/custom` page, users can customize their own performance case using local datasets. After saving, the corresponding case can be selected from the `/run_test` page to perform the test. - -![image](vectordb_bench/fig/custom_dataset.png) -![image](vectordb_bench/fig/custom_case_run_test.png) - -We have strict requirements for the data set format, please follow them. -- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format. - - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`. 
- - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`. - - We recommend limiting the number of test query vectors, like 1,000. - When conducting concurrent query tests, Vdbbench creates a large number of processes. - To minimize additional communication overhead during testing, - we prepare a complete set of test queries for each process, allowing them to run independently. - However, this means that as the number of concurrent processes increases, - the number of copied query vectors also increases significantly, - which can place substantial pressure on memory resources. - - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`. - -- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files. - -- `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order. - - -## Goals -Our goals of this benchmark are: -### Reproducibility & Usability -One of the primary goals of VDBBench is to enable users to reproduce benchmark results swiftly and easily, or to test their customized scenarios. We believe that lowering the barriers to entry for conducting these tests will enhance the community's understanding and improvement of vector databases. 
We aim to create an environment where any user, regardless of their technical expertise, can quickly set up and run benchmarks, and view and analyze results in an intuitive manner. -### Representability & Realism -VDBBench aims to provide a more comprehensive, multi-faceted testing environment that accurately represents the complexity of vector databases. By moving beyond a simple speed test for algorithms, we hope to contribute to a better understanding of vector databases in real-world scenarios. By incorporating as many complex scenarios as possible, including a variety of test cases and datasets, we aim to reflect realistic conditions and offer tangible significance to our community. Our goal is to deliver benchmarking results that can drive tangible improvements in the development and usage of vector databases. - -## Contribution -### General Guidelines -1. Fork the repository and create a new branch for your changes. -2. Adhere to coding conventions and formatting guidelines. -3. Use clear commit messages to document the purpose of your changes. -### Adding New Clients -**Step 1: Creating New Client Files** - -1. Navigate to the vectordb_bench/backend/clients directory. -2. Create a new folder for your client, for example, "new_client". -3. Inside the "new_client" folder, create two files: new_client.py and config.py. - -**Step 2: Implement new_client.py and config.py** - -1. Open new_client.py and define the NewClient class, which should inherit from the clients/api.py file's VectorDB abstract class. The VectorDB class serves as the API for benchmarking, and all DB clients must implement this abstract class. -Example implementation in new_client.py: -new_client.py -```python -from ..api import VectorDB -class NewClient(VectorDB): - # Implement the abstract methods defined in the VectorDB class - # ... -``` -2. Open config.py and implement the DBConfig and optional DBCaseConfig classes. - 1. 
The DBConfig class should be an abstract class that provides information necessary to establish connections with the database. It is recommended to use the pydantic.SecretStr data type to handle sensitive data such as tokens, URIs, or passwords. - 2. The DBCaseConfig class is optional and allows for providing case-specific configurations for the database. If not provided, it defaults to EmptyDBCaseConfig. -Example implementation in config.py: ```python -from pydantic import SecretStr -from clients.api import DBConfig, DBCaseConfig - -class NewDBConfig(DBConfig): - # Implement the required configuration fields for the database connection - # ... - token: SecretStr - uri: str - -class NewDBCaseConfig(DBCaseConfig): - # Implement optional case-specific configuration fields - # ... +# (Optional) Prepare laion dataset +python ./scripts/prepare_laion_dataset.py \ + --dataset-dir ./dataset/laion512d500k \ + --dataset-size 500_000 + +# (Optional) Prepare random dataset +python ./scripts/prepare_random_dataset.py \ + --dataset-dir ./dataset/random512d1m \ + --dataset-size 1_000_000 ``` -**Step 3: Importing the DB Client and Updating Initialization** +### enVector VectorDBBench CLI Options -In this final step, you will import your DB client into clients/__init__.py and update the initialization process. -1. Open clients/__init__.py and import your NewClient from new_client.py. -2. Add your NewClient to the DB enum. -3. Update the db2client dictionary by adding an entry for your NewClient. -Example implementation in clients/__init__.py: +enVector Types for VectorDBBench +- `envectorflat`: FLAT index type for enVector +- `envectorivfflat`: IVF_FLAT index type for enVector +- `envectorivfgas`: GAS index type for enVector -```python -#clients/__init__.py - -# Add NewClient to the DB enum -class DB(Enum): - ... - DB.NewClient = "NewClient" - - @property - def init_cls(self) -> Type[VectorDB]: - ... 
- if self == DB.NewClient: - from .new_client.new_client import NewClient - return NewClient - ... - - @property - def config_cls(self) -> Type[DBConfig]: - ... - if self == DB.NewClient: - from .new_client.config import NewClientConfig - return NewClientConfig - ... - - def case_config_cls(self, ...) - if self == DB.NewClient: - from .new_client.config import NewClientCaseConfig - return NewClientCaseConfig +Common Options for enVector +- `--uri`: enVector server URI +- `--eval-mode`: FHE evaluation mode on server. Use `mm` for enhanced performance. -``` -**Step 4: Implement new_client/cli.py and vectordb_bench/cli/vectordbbench.py** +ANN Options for enVector +- `--nlist`: Number of coarse clusters for IVF index types. +- `--nprobe`: Number of clusters to scan during search for IVF index types. +- `--train-centroids`: Whether to use trained centroids for IVF index types. Default is False, which means to use randomly generated centroids. +- `--centroids-path`: Path to the trained centroids for IVF index types. -In this (optional, but encouraged) step you will enable the test to be run from the command line. -1. Navigate to the vectordb_bench/backend/clients/"client" directory. -2. Inside the "client" folder, create a cli.py file. 
-Using zilliz as an example cli.py: -```python -from typing import Annotated, Unpack - -import click -import os -from pydantic import SecretStr - -from vectordb_bench.cli.cli import ( - CommonTypedDict, - cli, - click_parameter_decorators_from_typed_dict, - run, -) -from vectordb_bench.backend.clients import DB - - -class ZillizTypedDict(CommonTypedDict): - uri: Annotated[ - str, click.option("--uri", type=str, help="uri connection string", required=True) - ] - user_name: Annotated[ - str, click.option("--user-name", type=str, help="Db username", required=True) - ] - password: Annotated[ - str, - click.option("--password", - type=str, - help="Zilliz password", - default=lambda: os.environ.get("ZILLIZ_PASSWORD", ""), - show_default="$ZILLIZ_PASSWORD", - ), - ] - level: Annotated[ - str, - click.option("--level", type=str, help="Zilliz index level", required=False), - ] - - -@cli.command() -@click_parameter_decorators_from_typed_dict(ZillizTypedDict) -def ZillizAutoIndex(**parameters: Unpack[ZillizTypedDict]): - from .config import ZillizCloudConfig, AutoIndexConfig - - run( - db=DB.ZillizCloud, - db_config=ZillizCloudConfig( - db_label=parameters["db_label"], - uri=SecretStr(parameters["uri"]), - user=parameters["user_name"], - password=SecretStr(parameters["password"]), - ), - db_case_config=AutoIndexConfig( - params={parameters["level"]}, - ), - **parameters, - ) -``` -3. Update cli by adding: - 1. Add database specific options as an Annotated TypedDict, see ZillizTypedDict above. - 2. Add index configuration specific options as an Annotated TypedDict. (example: vectordb_bench/backend/clients/pgvector/cli.py) - 1. May not be needed if there is only one index config. - 2. Repeat for each index configuration, nesting them if possible. - 2. Add a index config specific function for each index type, see Zilliz above. The function name, in lowercase, will be the command name passed to the vectordbbench command. - 3. 
Update db_config and db_case_config to match client requirements - 4. Continue to add new functions for each index config. - 5. Import the client cli module and command to vectordb_bench/cli/vectordbbench.py (for databases with multiple commands (index configs), this only needs to be done for one command) - 6. Import the `get_custom_case_config` function from `vectordb_bench/cli/cli.py` and use it to add a new key `custom_case` to the `parameters` variable within the command. - - -> cli modules with multiple index configs: -> - pgvector: vectordb_bench/backend/clients/pgvector/cli.py -> - milvus: vectordb_bench/backend/clients/milvus/cli.py - -That's it! You have successfully added a new DB client to the vectordb_bench project. - -## Rules -### Installation -The system under test can be installed in any form to achieve optimal performance. This includes but is not limited to binary deployment, Docker, and cloud services. -### Fine-Tuning -For the system under test, we use the default server-side configuration to maintain the authenticity and representativeness of our results. -For the Client, we welcome any parameter tuning to obtain better results. -### Incomplete Results -Many databases may not be able to complete all test cases due to issues such as Out of Memory (OOM), crashes, or timeouts. In these scenarios, we will clearly state these occurrences in the test results. -### Mistake Or Misrepresentation -We strive for accuracy in learning and supporting various vector databases, yet there might be oversights or misapplications. For any such occurrences, feel free to [raise an issue](https://github.com/zilliztech/VectorDBBench/issues/new) or make amendments on our GitHub page. -## Timeout -In our pursuit to ensure that our benchmark reflects the reality of a production environment while guaranteeing the practicality of the system, we have implemented a timeout plan based on our experiences for various tests. - -**1. 
Capacity Case:** -- For Capacity Case, we have assigned an overall timeout. - -**2. Other Cases:** - -For other cases, we have set two timeouts: - -- **Data Loading Timeout:** This timeout is designed to filter out systems that are too slow in inserting data, thus ensuring that we are only considering systems that is able to cope with the demands of a real-world production environment within a reasonable time frame. - -- **Optimization Preparation Timeout**: This timeout is established to avoid excessive optimization strategies that might work for benchmarks but fail to deliver in real production environments. By doing this, we ensure that the systems we consider are not only suitable for testing environments but also applicable and efficient in production scenarios. - -This multi-tiered timeout approach allows our benchmark to be more representative of actual production environments and assists us in identifying systems that can truly perform in real-world scenarios. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Case | Data Size | Timeout Type | Value |
|---|---|---|---|
| Capacity Case | N/A | Loading timeout | 24 hours |
| Other Cases | 1M vectors, 768 dimensions<br>500K vectors, 1536 dimensions | Loading timeout | 2.5 hours |
| | | Optimization timeout | 15 mins |
| Other Cases | 10M vectors, 768 dimensions<br>5M vectors, 1536 dimensions | Loading timeout | 25 hours |
| | | Optimization timeout | 2.5 hours |
| Other Cases | 100M vectors, 768 dimensions | Loading timeout | 250 hours |
| | | Optimization timeout | 25 hours |
- -**Note:** Some datapoints in the standard benchmark results that violate this timeout will be kept for now for reference. We will remove them in the future. +Benchmark Options: +These follow the conventions of VectorDBBench; +see details in [VectorDBBench Options](https://github.com/zilliztech/VectorDBBench?tab=readme-ov-file#custom-dataset-for-performance-case). +For example, if you keep datasets in a custom directory, set `DATASET_LOCAL_DIR`. + + +## ❓ Troubleshooting + +- `RuntimeError: Failed to connect to localhost:50050`: Start the enVector server before running the benchmark. +- `polars.exceptions.ColumnNotFoundError: "emb" not found`: Set the env var `DATASET_LOCAL_DIR` to the correct dataset path. \ No newline at end of file diff --git a/README_ENVECTOR.md b/README_ENVECTOR.md deleted file mode 100644 index 1ea5c2a7c..000000000 --- a/README_ENVECTOR.md +++ /dev/null @@ -1,203 +0,0 @@ -# enVector in VectorDBBench - -This guide demonstrates how to use enVector in VectorDBBench. - -Basic usage of enVector with VectorDBBench follows the standard procedure for [VectorDBBench](https://github.com/zilliztech/VectorDBBench). - -## Structure - -```bash -. -├── centroids -│ └── embeddinggemma-300m -│ ├── centroids.npy # centroids file for ANN -│ └── tree_info.pkl # tree metadata for ANN -├── dataset -│ └── pubmed768d400k # VectorDB ANN benchmark dataset -│ ├── neighbors.parquet -│ ├── test.npy -│ └── train.pkl -├── README_ENVECTOR.md -└── scripts - ├── run_benchmark.sh # benchmark script - ├── envector_pubmed_config.yml # benchmark config file - └── prepare_dataset.py # download and prepare ground truth neighbors for dataset -``` - -## Prerequisites - -### Install Python Dependencies -```bash -# 1. Create your environment -python -m venv .venv -source .venv/bin/activate - -# 2. Install VectorDBBench -pip install -e . - -# 3.
Install pyenvector -pip install pyenvector==1.2.0a5 -``` - -### Prepare dataset - -Prepare the following artifacts for the ANN benchmark with `scripts/prepare_dataset.py`: - -- download datasets from HuggingFace -- prepare ground-truth neighbors -- download centroids and tree metadata for the GAS index for corresponding to the embedding model - -For the ANN benchmark, we provide two datasets via HuggingFace: -- `PUBMED768D400K`: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m) -- `BLOOMBERG768D368K`: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m) - -Also, we provide centroids and tree metadata for the corresponding embedding model used in the ANN benchmark: -- GAS Centroids: [cryptolab-playground/gas-centroids](https://huggingface.co/datasets/cryptolab-playground/gas-centroids) - -To prepare dataset, run the following command as example: - -```bash -# Prepare dataset -python ./scripts/prepare_dataset.py \ - -d cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m \ - -e embeddinggemma-300m -``` - -Then, you can find the generated files as follows: - -```bash -. -├── centroids -│ └── embeddinggemma-300m -│ ├── centroids.npy -│ └── tree_info.pkl -└── dataset - └── pubmed768d400k - ├── neighbors.parquet - ├── test.npy - └── train.pkl -``` - -### Prepare enVector Server - -To run enVector server with ANN, please refer to the [enVector Deployment repository](https://github.com/CryptoLabInc/envector-deployment).
-For example, you can start the server with the following command: - -```bash -# Start enVector server -git clone https://github.com/CryptoLabInc/envector-deployment -cd envector-deployment/docker-compose -./start_envector.sh -``` - -We provide four enVector Docker Images: -- `cryptolabinc/es2e:v1.2.0-alpha.5` -- `cryptolabinc/es2b:v1.2.0-alpha.5` -- `cryptolabinc/es2o:v1.2.0-alpha.5` -- `cryptolabinc/es2c:v1.2.0-alpha.5` - -### Set Environment Variables - -```bash -# Set environment variables -export DATASET_LOCAL_DIR="./dataset" # dataset directory. default: /tmp/vectordb_bench/dataset -export NUM_PER_BATCH=4096 # default batch size for enVector -``` - -## Run Our ANN Benchmark - -We provide enVector-customized ANN, called "GAS", designed to perform efficient IVF-FLAT-based ANN search with the encrypted index. -We evaluated enVector on two benchmark datasets that we provided: -- `PUBMED768D400K` -- `BLOOMBERG768D368K` - -Run the provided shell scripts (`./scripts/run_benchmark.sh`) as the following: - -```bash -./scripts/run_benchmark.sh --type flat # FLAT -./scripts/run_benchmark.sh --type ivf # IVF-FLAT with enVector-customized ANN (GAS) -``` - -For more details, please refer to `run_benchmark.sh` or `envector_{benchmark}_config.yml` in scripts directory for benchmarks with enVector with ANN (GAS), or you can use the following command: - - -```bash -export NUM_PER_BATCH=500000 # set to the database size when IVF_FLAT -python -m vectordb_bench.cli.vectordbbench envectorivfflat \ - --uri "localhost:50050" \ - --eval-mode mm \ - --case-type PerformanceCustomDataset \ - --db-label "PUBMED768D400K-IVF" \ - --custom-case-name PUBMED768D400K \ - --custom-dataset-name PUBMED768D400K \ - --custom-dataset-dir "" \ - --custom-dataset-size 400335 \ - --custom-dataset-dim 768 \ - --custom-dataset-file-count 1 \ - --custom-dataset-with-gt \ - --skip-custom-dataset-use-shuffled \ - --k 10 \ - --train-centroids True \ - --is-vct True \ - --centroids-path 
"./centroids/embeddinggemma-300m/centroids.npy" \ - --vct-path "./centroids/embeddinggemma-300m/tree_info.pkl" \ - --nlist 32768 \ - --nprobe 6 -``` - -Note that, **`NUM_PER_BATCH` should be set to the database** size when using IVF-based ANN index for enVector currently. -We will support adjustable `NUM_PER_BATCH` for ANN soon. - -## Run VectorDBBench Benchmark - -Run the following commands to run enVector with VectorDBBench's built-in benchmark. - -```bash -# flat -python -m vectordb_bench.cli.vectordbbench envectorflat \ - --uri "localhost:50050" \ - --case-type "Performance1536D500K" \ - --db-label "Performance1536D500K-FLAT" - -# ivf: IVF-FLAT with random centroids -export NUM_PER_BATCH=500000 # set database size when IVF-FLAT -python -m vectordb_bench.cli.vectordbbench envectorivfflat \ - --uri "localhost:50050" \ - --case-type "Performance1536D500K" \ - --db-label "Performance1536D500K-IVF-FLAT" \ - --nlist 250 \ - --nprobe 6 - -# ivf-trained: IVF-FLAT with trained centroids via k-means -export NUM_PER_BATCH=500000 # set to the database size when IVF-FLAT -python -m vectordb_bench.cli.vectordbbench envectorivfflat \ - --uri "localhost:50050" \ - --case-type "Performance1536D500K" \ - --db-label "Performance1536D500K-IVF-FLAT" \ - --train-centroids True \ - --centroids-path "./centroids/kmeans_centroids.npy" \ # centroids built by sklearn, etc. - --nlist 250 \ - --nprobe 6 -``` - -### CLI Options - -enVector Types for VectorDBBench -- `envectorflat`: FLAT as index type for enVector -- `envectorivfflat`: IVF_FLAT as index type for enVector - -Common Options for enVector -- `--uri`: enVector server URI -- `--eval-mode`: FHE evaluation mode on server. Use `mm` for enhanced performance. 
- -ANN Options for enVector -- `--nlist`: Number of coarse clusters for IVF_FLAT -- `--nprobe`: Number of clusters to scan during search for IVF_FLAT -- `--train-centroids`: whether to use trained centroids for IVF_FLAT -- `--centroids-path`: path to the trained centroids -- `--is-vct`: whether to use VCT approach for IVF_GAS -- `--vct-path`: path to the trained VCT metadata for IVF_GAS - -Benchmark Options: - follows conventions of VectorDBBench, - see details in [VectorDBBench Options](https://github.com/zilliztech/VectorDBBench?tab=readme-ov-file#custom-dataset-for-performance-case) \ No newline at end of file diff --git a/scripts/envector_bloomberg_config.yml b/scripts/envector_bloomberg_config.yml deleted file mode 100644 index 7a0b6f5c4..000000000 --- a/scripts/envector_bloomberg_config.yml +++ /dev/null @@ -1,42 +0,0 @@ -envectorflat: - uri: localhost:50050 - eval_mode: mm - case_type: PerformanceCustomDataset - db_label: BLOOMBERG768D368K-FLAT - custom_case_name: BLOOMBERG768D368K - custom_case_description: BLOOMBERG768D368K benchmark (768D, 368K vectors) - custom_dataset_name: BLOOMBERG768D368K - custom_dataset_dir: - custom_dataset_size: 368816 - custom_dataset_dim: 768 - custom_dataset_file_count: 1 - custom_dataset_use_shuffled: false - custom_dataset_with_gt: true - k: 10 - drop_old: true - load: true - -envectorivfflat: - uri: localhost:50050 - eval_mode: mm - case_type: PerformanceCustomDataset - db_label: BLOOMBERG768D368K-IVF - custom_case_name: BLOOMBERG768D368K - custom_case_description: BLOOMBERG768D368K benchmark (768D, 368K vectors) - custom_dataset_name: BLOOMBERG768D368K - custom_dataset_dir: - custom_dataset_size: 368816 - custom_dataset_dim: 768 - custom_dataset_file_count: 1 - custom_dataset_use_shuffled: false - custom_dataset_with_gt: true - k: 10 - nlist: 32768 - nprobe: 6 - train_centroids: true - is_vct: true - centroids_path: centroids/embeddinggemma-300m/centroids.npy - vct_path: centroids/embeddinggemma-300m/tree_info.pkl - 
drop_old: true - load: true - \ No newline at end of file diff --git a/scripts/envector_pubmed_config.yml b/scripts/envector_pubmed_config.yml deleted file mode 100644 index f1839a567..000000000 --- a/scripts/envector_pubmed_config.yml +++ /dev/null @@ -1,42 +0,0 @@ -envectorflat: - uri: localhost:50050 - eval_mode: mm - case_type: PerformanceCustomDataset - db_label: PUBMED768D400K-FLAT - custom_case_name: PUBMED768D400K - custom_case_description: PUBMED768D400K benchmark (768D, 400K vectors) - custom_dataset_name: PUBMED768D400K - custom_dataset_dir: - custom_dataset_size: 400335 - custom_dataset_dim: 768 - custom_dataset_file_count: 1 - custom_dataset_use_shuffled: false - custom_dataset_with_gt: true - k: 10 - drop_old: true - load: true - -envectorivfflat: - uri: localhost:50050 - eval_mode: mm - case_type: PerformanceCustomDataset - db_label: PUBMED768D400K-IVF - custom_case_name: PUBMED768D400K - custom_case_description: PUBMED768D400K benchmark (768D, 400K vectors) - custom_dataset_name: PUBMED768D400K - custom_dataset_dir: - custom_dataset_size: 400335 - custom_dataset_dim: 768 - custom_dataset_file_count: 1 - custom_dataset_use_shuffled: false - custom_dataset_with_gt: true - k: 10 - nlist: 32768 - nprobe: 6 - train_centroids: true - is_vct: true - centroids_path: centroids/embeddinggemma-300m/centroids.npy - vct_path: centroids/embeddinggemma-300m/tree_info.pkl - drop_old: true - load: true - \ No newline at end of file diff --git a/scripts/prepare_dataset.py b/scripts/prepare_dataset.py index 8181e7d07..ffaa0bb72 100644 --- a/scripts/prepare_dataset.py +++ b/scripts/prepare_dataset.py @@ -1,21 +1,24 @@ -import os -import wget +""" +Prepare dataset and ground truth neighbors for benchmarking. 
+""" + import argparse +import os + +import faiss import numpy as np import pandas as pd import pyarrow as pa import pyarrow.parquet as pq - +import wget from datasets import load_dataset -import faiss def get_args(): - parser = argparse.ArgumentParser( - description="Prepare dataset and ground truth neighbors for benchmarking." - ) + parser = argparse.ArgumentParser(description="Prepare dataset and ground truth neighbors for benchmarking.") parser.add_argument( - "-d", "--dataset-name", + "-d", + "--dataset-name", type=str, default="cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m", help="Huggingface dataset name to download.", @@ -27,11 +30,12 @@ def get_args(): parser.add_argument( "--dataset-dir", type=str, - default="./dataset/pubmed768d400k", - help="Dataset directory to save the dataset and neighbors.", + default=os.path.join(os.environ.get("DATASET_LOCAL_DIR", "/tmp/vectordb_bench/dataset"), "pubmed768d400k"), + help="Dataset directory to save the dataset and neighbors. 
Default: 'pubmed768d400k' in DATASET_LOCAL_DIR.", ) parser.add_argument( - "-e", "--embedding-model", + "-e", + "--embedding-model", type=str, default="embeddinggemma-300m", help="Embedding model name to download centroids for.", @@ -44,16 +48,14 @@ def get_args(): ) return parser.parse_args() -def download_dataset( - dataset_name: str, - output_dir: str = "./dataset/pubmed768d400k" -) -> None: + +def download_dataset(dataset_name: str, output_dir: str = "./dataset/pubmed768d400k") -> None: """Download dataset from Huggingface and save as Parquet files.""" # load dataset ds = load_dataset(dataset_name) train = ds["train"].to_pandas() test = ds["test"].to_pandas() - + # write to parquet train_table = pa.Table.from_pandas(train) pq.write_table(train_table, f"{output_dir}/train.parquet") @@ -61,6 +63,7 @@ def download_dataset( test_table = pa.Table.from_pandas(test) pq.write_table(test_table, f"{output_dir}/test.parquet") + def prepare_neighbors( data_dir: str = "./dataset/pubmed768d400k", ) -> None: @@ -82,29 +85,28 @@ def prepare_neighbors( print(distances.shape, indices.shape) # save flat search result as neighbors - df = pd.DataFrame({ - "id": np.arange(len(indices)), - "neighbors_id": indices.tolist() - }) - + df = pd.DataFrame({"id": np.arange(len(indices)), "neighbors_id": indices.tolist()}) + table = pa.Table.from_pandas(df) pq.write_table(table, f"{data_dir}/neighbors.parquet") + def download_centroids(embedding_model: str, dataset_dir: str) -> None: """Download pre-computed centroids and tree info for GAS VCT index.""" - + if embedding_model != "embeddinggemma-300m": raise ValueError(f"Centroids for {embedding_model} currently not available.") - # https://huggingface.co/datasets/cryptolab-playground/gas-centroids + # BASE URL: https://huggingface.co/datasets/cryptolab-playground/gas-centroids dataset_link = f"https://huggingface.co/datasets/cryptolab-playground/gas-centroids/resolve/main/{embedding_model}" - + # download os.makedirs(os.path.join(dataset_dir, 
embedding_model), exist_ok=True) wget.download(f"{dataset_link}/centroids.npy", out=os.path.join(dataset_dir, embedding_model, "centroids.npy")) wget.download(f"{dataset_link}/tree_info.pkl", out=os.path.join(dataset_dir, embedding_model, "tree_info.pkl")) - - + print(f"\nDownloaded centroids and tree info to {os.path.join(dataset_dir, embedding_model)}") + + if __name__ == "__main__": args = get_args() os.makedirs(args.dataset_dir, exist_ok=True) diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 000000000..00dd6eb66 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,4 @@ +wget==3.2 +pandas==2.3.3 +pyarrow==22.0.0 +faiss-cpu==1.13.2 \ No newline at end of file diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 37037df90..f34e47566 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -2,74 +2,81 @@ set -euo pipefail -export DATASET_LOCAL_DIR="./dataset" -export NUM_PER_BATCH=4096 +DEFAULT_INDEX_TYPE="FLAT" +DEFAULT_CONFIG_FILE="envector_random_config.yml" -CENTROID_PATH=centroids/embeddinggemma-300m/centroids.npy -VCT_PATH=centroids/embeddinggemma-300m/tree_info.pkl -ENVECTOR_URI="localhost:50050" -REQUESTED_TYPE="" +# requested parameters +REQUESTED_INDEX_TYPE="${INDEX_TYPE:-$DEFAULT_INDEX_TYPE}" # default mode FLAT +REQUESTED_CONFIG_FILE="${CONFIG_FILE:-$DEFAULT_CONFIG_FILE}" # default config file -while [[ $# -gt 0 ]]; do - case "$1" in - --type) - REQUESTED_TYPE="${2:-}" - shift 2 - ;; - --type=*) - REQUESTED_TYPE="${1#--type=}" - shift - ;; - *) - echo "Unknown option: $1" >&2 - exit 1 - ;; - esac -done +usage() { + cat >&2 <<EOF +Usage: $0 [--config-file <path>] [--index-type <type>] -case "$REQUESTED_TYPE" in - ""|flat|ivf) ;; - *) - echo "Invalid --type: $REQUESTED_TYPE (expected: flat or ivf)" >&2 - exit 1 - ;; -esac -COMMON_ARGS=( - --uri "$ENVECTOR_URI" - --eval-mode mm - --case-type PerformanceCustomDataset - --custom-case-name PUBMED768D400K - --custom-dataset-name PUBMED768D400K - --custom-dataset-dir "" -
--custom-dataset-size 400335 - --custom-dataset-dim 768 - --custom-dataset-file-count 1 - --custom-dataset-with-gt - --skip-custom-dataset-use-shuffled - --k 10 -) +Benchmark enVector with specified index type and configuration file. + +Options: + --config-file Path to the benchmark configuration file. + --index-type Index type to benchmark: + FLAT : brute-force search (baseline). + IVF_FLAT : Inverted file (IVF) index with FLAT vectors for faster approximate search. + IVF_GAS : enVector-customized ANN algorithm for the fastest approximate search. -run_case() { - local engine=$1 - local label=$2 - shift 2 - python -m vectordb_bench.cli.vectordbbench "$engine" \ - "${COMMON_ARGS[@]}" \ - --db-label "$label" \ - "$@" +Example: + $0 --config-file envector_random_config.yml --index-type FLAT + $0 --config-file envector_openai_config.yml --index-type IVF_FLAT + $0 --config-file envector_pubmed_config.yml --index-type IVF_GAS +EOF } -if [[ -z "$REQUESTED_TYPE" || "$REQUESTED_TYPE" == "flat" ]]; then - run_case envectorflat "PUBMED768D400K-FLAT" +# parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --config-file) + REQUESTED_CONFIG_FILE="${2:-}" + shift 2 + ;; + --index-type) + REQUESTED_INDEX_TYPE="${2:-}" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +if [ -z "${REQUESTED_CONFIG_FILE}" ]; then + echo "--config-file requires a path" >&2 + exit 1 fi -if [[ -z "$REQUESTED_TYPE" || "$REQUESTED_TYPE" == "ivf" ]]; then - export NUM_PER_BATCH=500000 # set database size for efficiency - run_case envectorivfflat "PUBMED768D400K-IVF" \ - --is-vct True \ - --train-centroids True \ - --centroids-path "$CENTROID_PATH" \ - --vct-path "$VCT_PATH" \ - --nlist 32768 \ - --nprobe 6 +if [ -z "${REQUESTED_INDEX_TYPE}" ]; then + echo "--index-type requires a value (FLAT / IVF_FLAT / IVF_GAS)" >&2 + exit 1 fi + +# run benchmark based on requested type +case "${REQUESTED_INDEX_TYPE}" in + FLAT) + python -m 
vectordb_bench.cli.vectordbbench envectorflat \ + --config-file "${REQUESTED_CONFIG_FILE}" + ;; + IVF_FLAT) + python -m vectordb_bench.cli.vectordbbench envectorivfflat \ + --config-file "${REQUESTED_CONFIG_FILE}" + ;; + IVF_GAS) + python -m vectordb_bench.cli.vectordbbench envectorivfgas \ + --config-file "${REQUESTED_CONFIG_FILE}" + ;; + *) + echo "Unsupported INDEX_TYPE: ${REQUESTED_INDEX_TYPE} (use FLAT or IVF_FLAT or IVF_GAS)" >&2 + exit 1 + ;; +esac diff --git a/vectordb_bench/__init__.py b/vectordb_bench/__init__.py index cec641fbd..9b4844a92 100644 --- a/vectordb_bench/__init__.py +++ b/vectordb_bench/__init__.py @@ -18,7 +18,7 @@ class config: DEFAULT_DATASET_URL = env.str("DEFAULT_DATASET_URL", AWS_S3_URL) DATASET_SOURCE = env.str("DATASET_SOURCE", "S3") # Options "S3" or "AliyunOSS" DATASET_LOCAL_DIR = env.path("DATASET_LOCAL_DIR", "/tmp/vectordb_bench/dataset") - NUM_PER_BATCH = env.int("NUM_PER_BATCH", 100) + NUM_PER_BATCH = env.int("NUM_PER_BATCH", 4096) TIME_PER_BATCH = 1 # 1s. for streaming insertion. 
MAX_INSERT_RETRY = 5 MAX_SEARCH_RETRY = 5 diff --git a/vectordb_bench/backend/clients/api.py b/vectordb_bench/backend/clients/api.py index fdc445618..a2232df17 100644 --- a/vectordb_bench/backend/clients/api.py +++ b/vectordb_bench/backend/clients/api.py @@ -43,6 +43,7 @@ class IndexType(str, Enum): Hologres_HGraph = "HGraph" Hologres_Graph = "Graph" NONE = "NONE" + IVFGAS = "IVF_VCT" # enVector custom index type class SQType(str, Enum): @@ -135,7 +136,6 @@ class VectorDB(ABC): "The filtering types supported by the VectorDB Client, default only non-filter" supported_filter_types: list[FilterOp] = [FilterOp.NonFilter] - name: str = "" @classmethod def filter_supported(cls, filters: Filter) -> bool: diff --git a/vectordb_bench/backend/clients/envector/cli.py b/vectordb_bench/backend/clients/envector/cli.py index 3197b8e3b..3b88ae76c 100644 --- a/vectordb_bench/backend/clients/envector/cli.py +++ b/vectordb_bench/backend/clients/envector/cli.py @@ -15,6 +15,8 @@ class EnVectorTypedDict(TypedDict): + """enVector common parameters""" + uri: Annotated[ str, click.option("--uri", type=str, help="uri connection string", required=True), @@ -23,6 +25,10 @@ class EnVectorTypedDict(TypedDict): str, click.option("--eval-mode", help="Evaluation mode", type=click.Choice(["mm", "rmp"]), default="mm"), ] + index_name: Annotated[ + str, + click.option("--index-name", help="Index name", type=str, default="vdbbench"), + ] class EnVectorFlatIndexTypedDict(CommonTypedDict, EnVectorTypedDict): ... 
@@ -39,6 +45,7 @@ def EnVectorFlat(**parameters: Unpack[EnVectorFlatIndexTypedDict]): db_label=parameters["db_label"], uri=SecretStr(parameters["uri"]), eval_mode=parameters["eval_mode"], + collection_name=parameters["index_name"], index_params={}, ), db_case_config=FlatIndexConfig(), @@ -47,6 +54,8 @@ def EnVectorFlat(**parameters: Unpack[EnVectorFlatIndexTypedDict]): class EnVectorIVFFlatIndexTypedDict(CommonTypedDict, EnVectorTypedDict): + """IVF-FLAT index specific parameters""" + nlist: Annotated[ int, click.option("--nlist", type=int, help="nlist for IVF index", default=250), @@ -63,14 +72,6 @@ class EnVectorIVFFlatIndexTypedDict(CommonTypedDict, EnVectorTypedDict): str, click.option("--centroids-path", type=str, help="path to centroids for IVF index", default=None), ] - is_vct: Annotated[ - bool, - click.option("--is-vct", type=bool, help="whether use VCT index", default=False), - ] - vct_path: Annotated[ - str, - click.option("--vct-path", type=str, help="path to VCT index file", default=None), - ] @cli.command(name="envectorivfflat") @@ -84,6 +85,7 @@ def EnVectorIVFFlat(**parameters: Unpack[EnVectorIVFFlatIndexTypedDict]): db_label=parameters["db_label"], uri=SecretStr(parameters["uri"]), eval_mode=parameters["eval_mode"], + collection_name=parameters["index_name"], index_params={"nlist": parameters["nlist"], "nprobe": parameters["nprobe"]}, ), db_case_config=IVFFlatIndexConfig( @@ -91,8 +93,33 @@ def EnVectorIVFFlat(**parameters: Unpack[EnVectorIVFFlatIndexTypedDict]): nprobe=parameters["nprobe"], train_centroids=parameters["train_centroids"], centroids_path=parameters["centroids_path"], - is_vct=parameters["is_vct"], - vct_path=parameters["vct_path"], + ), + **parameters, + ) + + +class EnVectorIVFGASIndexTypedDict(CommonTypedDict, EnVectorIVFFlatIndexTypedDict): ... 
+ + +@cli.command(name="envectorivfgas") +@click_parameter_decorators_from_typed_dict(EnVectorIVFGASIndexTypedDict) +def EnVectorIVFGAS(**parameters: Unpack[EnVectorIVFGASIndexTypedDict]): + from .config import EnVectorConfig, IVFGASIndexConfig + + run( + db=DBTYPE, + db_config=EnVectorConfig( + db_label=parameters["db_label"], + uri=SecretStr(parameters["uri"]), + eval_mode=parameters["eval_mode"], + collection_name=parameters["index_name"], + index_params={"nlist": parameters["nlist"], "nprobe": parameters["nprobe"]}, + ), + db_case_config=IVFGASIndexConfig( + nlist=parameters["nlist"], + nprobe=parameters["nprobe"], + train_centroids=parameters["train_centroids"], + centroids_path=parameters["centroids_path"], ), **parameters, ) diff --git a/vectordb_bench/backend/clients/envector/config.py b/vectordb_bench/backend/clients/envector/config.py index 3bc58f862..62e2c4f9e 100644 --- a/vectordb_bench/backend/clients/envector/config.py +++ b/vectordb_bench/backend/clients/envector/config.py @@ -4,9 +4,12 @@ class EnVectorConfig(DBConfig): + """enVector common configuration""" + uri: SecretStr = SecretStr("http://localhost:50050") key_path: str = "keys" key_id: str = "default_key" + collection_name: str = "vdbbench" def to_dict(self) -> dict: return { @@ -17,11 +20,12 @@ def to_dict(self) -> dict: class EnVectorIndexConfig(BaseModel): - """Base config for envector""" + """Base index config for envector""" index: IndexType - metric_type: MetricType | None = None + metric_type: MetricType = MetricType.COSINE # envector supports cosine similarity only use_partition_key: bool = True # for label-filter + eval_mode: str = "mm" # default eval_mode @property def is_gpu_index(self) -> bool: @@ -42,9 +46,9 @@ def parse_metric(self) -> str: class FlatIndexConfig(EnVectorIndexConfig, DBCaseConfig): + """enVector FLAT index configuration""" + index: IndexType = IndexType.Flat - metric_type: MetricType = MetricType.COSINE # envector supports cosine similarity only - eval_mode: str = 
"mm" # default eval_mode def index_param(self) -> dict: return { @@ -62,15 +66,13 @@ def search_param(self) -> dict: class IVFFlatIndexConfig(EnVectorIndexConfig, DBCaseConfig): + """enVector IVF-FLAT index configuration""" + index: IndexType = IndexType.IVFFlat - metric_type: MetricType = MetricType.COSINE # envector supports cosine similarity only nlist: int = 0 nprobe: int = 0 - eval_mode: str = "mm" train_centroids: bool = False # whether to train centroids before inserting data centroids_path: str | None = None # path to centroids file - is_vct: bool = False # whether use VCT index - vct_path: str | None = None # path to VCT index file def index_param(self) -> dict: return { @@ -80,8 +82,6 @@ def index_param(self) -> dict: "params": {"index_type": "IVF_FLAT", "nlist": self.nlist, "default_nprobe": self.nprobe}, "train_centroids": self.train_centroids, "centroids_path": self.centroids_path, - "is_vct": self.is_vct, - "vct_path": self.vct_path, } def search_param(self) -> dict: @@ -91,7 +91,19 @@ def search_param(self) -> dict: } +class IVFGASIndexConfig(IVFFlatIndexConfig): + """enVector IVF-GAS index configuration""" + + index: IndexType = IndexType.IVFGAS + + def index_param(self) -> dict: + index_param = super().index_param() + index_param["params"].update({"index_type": "IVF_VCT"}) + return index_param + + _envector_case_config = { IndexType.Flat: FlatIndexConfig, IndexType.IVFFlat: IVFFlatIndexConfig, + IndexType.IVFGAS: IVFGASIndexConfig, } diff --git a/vectordb_bench/backend/clients/envector/envector.py b/vectordb_bench/backend/clients/envector/envector.py index e546f6390..fafe89347 100644 --- a/vectordb_bench/backend/clients/envector/envector.py +++ b/vectordb_bench/backend/clients/envector/envector.py @@ -1,7 +1,6 @@ """Wrapper around the EnVector vector database over VectorDB""" import logging -import os from collections.abc import Iterable from contextlib import contextmanager from pathlib import Path @@ -42,8 +41,6 @@ def __init__( self.case_config 
= db_case_config self.collection_name = collection_name - self.batch_size = 128 * 32 # default batch size for insertions, can be modified for IVF_FLAT - self._primary_field = "pk" self._scalar_id_field = "id" self._scalar_label_field = "label" @@ -53,47 +50,51 @@ def __init__( self._scalar_labels_index_name = "labels_idx" self.col: ev.Index | None = None - self.is_vct: bool = False - self.vct_params: dict[str, Any] = {} - + # Initialize the EnVector client ev.init( address=self.db_config.get("uri"), key_path=self.db_config.get("key_path"), key_id=self.db_config.get("key_id"), eval_mode=self.case_config.eval_mode, + preset="ip1" if self.case_config.eval_mode == "mm" else "ip", ) + + # Drop old index if specified if drop_old: log.info(f"{self.name} client drop_old index: {self.collection_name}") if self.collection_name in ev.get_index_list(): ev.drop_index(self.collection_name) - # Create the collection - log.info(f"{self.name} create index: {self.collection_name}") + # Check index type + index_param = self.case_config.index_param().get("params", {}) + index_type = index_param.get("index_type", "FLAT") + log.debug(f"Index Type: {index_type}") + # Ensure the index exists or create it index_kwargs = dict(kwargs) self._ensure_index(dim, index_kwargs) ev.disconnect() def _ensure_index(self, dim: int, index_kwargs: dict[str, Any]): + # Check if the collection already exists if self.collection_name in ev.get_index_list(): log.info(f"{self.name} index {self.collection_name} already exists, skip creating") - self.is_vct = self.case_config.index_param().get("is_vct", False) - log.debug(f"IS_VCT: {self.is_vct}") return + # Create the index if it does not exist self._create_index(dim, index_kwargs) def _create_index(self, dim: int, index_kwargs: dict[str, Any]): + # Create the collection + log.info(f"{self.name} create index: {self.collection_name}") + index_param = self.case_config.index_param().get("params", {}) index_type = index_param.get("index_type", "FLAT") 
train_centroids = self.case_config.index_param().get("train_centroids", False) - if index_type == "IVF_FLAT" and train_centroids: + if index_type in ["IVF_FLAT", "IVF_VCT"] and train_centroids: self._configure_centroids(index_param, index_kwargs) - if index_type == "IVF_FLAT": - self._adjust_batch_size() - ev.create_index( index_name=self.collection_name, dim=dim, @@ -105,39 +106,21 @@ def _create_index(self, dim: int, index_kwargs: dict[str, Any]): ) def _configure_centroids(self, index_param: dict[str, Any], index_kwargs: dict[str, Any]): + # Load centroids centroid_path = self.case_config.index_param().get("centroids_path", None) - self.is_vct = self.case_config.index_param().get("is_vct", False) - log.debug(f"IS_VCT: {self.is_vct}") - if centroid_path is None: - raise ValueError("Centroids path must be provided for IVF_FLAT index training.") + raise ValueError("Centroids path must be provided for IVF index training.") centroid_file = Path(centroid_path) if not centroid_file.exists(): - msg = f"Centroid file {centroid_path} not found for IVF_FLAT index training." + msg = f"Centroid file {centroid_path} not found for IVF index training." raise FileNotFoundError(msg) - log.debug(f"Centroids: {centroid_path}") centroids = np.load(centroid_file) - log.info(f"{self.name} loaded centroids from {centroid_path} for IVF_FLAT index training.") + log.info(f"{self.name} loaded centroids from {centroid_path} for IVF index training.") index_param["centroids"] = centroids.tolist() - if self.is_vct: - vct_path = self.case_config.index_param().get("vct_path", None) - log.debug(f"VCT: {vct_path}") - index_param["virtual_cluster"] = True - index_kwargs["tree_description"] = vct_path - self.is_vct = True - log.info(f"{self.name} VCT parameters set for IVF_FLAT index creation.") - - def _adjust_batch_size(self): - self.batch_size = int(os.environ.get("NUM_PER_BATCH", "500000")) - log.debug( - f"Set EnVector IVF_FLAT insert batch size to {self.batch_size}. 
" - f"This should be the size of dataset for better performance when IVF_FLAT." - ) - @contextmanager def init(self): """ @@ -151,15 +134,10 @@ def init(self): key_path=self.db_config.get("key_path"), key_id=self.db_config.get("key_id"), eval_mode=self.case_config.eval_mode, + preset="ip1" if self.case_config.eval_mode == "mm" else "ip", ) try: self.col = ev.Index(self.collection_name) - if self.is_vct: - log.debug(f"VCT: {self.col.index_config.index_param.index_params.get('virtual_cluster')}") - is_vct = self.case_config.index_param().get("is_vct", False) - assert self.is_vct == is_vct, "is_vct mismatch" - vct_path = self.case_config.index_param().get("vct_path", None) - self.col._load_virtual_cluster_from_pkl(vct_path) yield finally: self.col = None @@ -194,19 +172,17 @@ def insert_embeddings( assert self.col is not None assert len(embeddings) == len(metadata) + request_ids = kwargs.pop("request_ids", []) # extract request_ids from kwargs for tracking insert operations + insert_count = 0 try: - for batch_start_offset in range(0, len(embeddings), self.batch_size): - batch_end_offset = min(batch_start_offset + self.batch_size, len(embeddings)) - meta = [str(m) for m in metadata[batch_start_offset:batch_end_offset]] - vectors = embeddings[batch_start_offset:batch_end_offset] - if self.is_vct: - self.col.insert_vct(vectors, meta) - else: - self.col.insert(vectors, meta) - insert_count += len(vectors) + metadata = list(map(str, metadata)) + log.debug(f"Inserting {len(embeddings)} embeddings...") + self.col.insert(embeddings, metadata, request_ids=request_ids, await_completion=False) + insert_count += len(embeddings) + log.debug(f"Insert successful, count={insert_count}") except Exception as e: - log.info(f"Failed to insert data: {e}") + log.exception("Failed to insert data") return insert_count, e return insert_count, None @@ -223,22 +199,13 @@ def search_embedding( assert self.col is not None try: - if self.is_vct: - res = self.col.search_vct( - query=query, - 
top_k=k, - output_fields=["metadata"], - search_params=self.case_config.search_param().get("search_params", {}), - ) - - else: - # Perform the search. - res = self.col.search( - query=query, - top_k=k, - output_fields=["metadata"], - search_params=self.case_config.search_param().get("search_params", {}), - ) + # Perform the search. + res = self.col.search( + query=query, + top_k=k, + output_fields=["metadata"], + search_params=self.case_config.search_param().get("search_params", {}), + ) # Handle empty results if not res or len(res) == 0: diff --git a/vectordb_bench/backend/runner/serial_runner.py b/vectordb_bench/backend/runner/serial_runner.py index 300553a4e..235855c48 100644 --- a/vectordb_bench/backend/runner/serial_runner.py +++ b/vectordb_bench/backend/runner/serial_runner.py @@ -2,12 +2,14 @@ import logging import math import multiprocessing as mp +import os import time import traceback import numpy as np import psutil +from vectordb_bench.backend.clients.envector.envector import EnVector from vectordb_bench.backend.dataset import DatasetManager from vectordb_bench.backend.filter import Filter, FilterOp, non_filter @@ -19,6 +21,8 @@ NUM_PER_BATCH = config.NUM_PER_BATCH LOAD_MAX_TRY_COUNT = config.LOAD_MAX_TRY_COUNT +INSERT_TIMEOUT = max(int(os.environ.get("INSERT_TIMEOUT", "300")), config.LOAD_TIMEOUT_DEFAULT) +INSERT_POLL_INTERVAL = int(os.environ.get("INSERT_POLL_INTERVAL", "60")) log = logging.getLogger(__name__) @@ -55,16 +59,16 @@ def task(self) -> int: with self.db.init(): log.info(f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}") start = time.perf_counter() + request_ids = [] for data_df in self.dataset: all_metadata = data_df[self.dataset.data.train_id_field].tolist() emb_np = np.stack(data_df[self.dataset.data.train_vector_field]) if self.normalize: log.debug("normalize the 100k train data") - all_embeddings = (emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis]).tolist() + all_embeddings = emb_np / 
np.linalg.norm(emb_np, axis=1)[:, np.newaxis] else: - all_embeddings = emb_np.tolist() - del emb_np + all_embeddings = emb_np log.debug(f"batch dataset size: {len(all_embeddings)}, {len(all_metadata)}") labels_data = None @@ -73,11 +77,8 @@ def task(self) -> int: labels_data = self.dataset.scalar_labels[self.filters.label_field][all_metadata].to_list() else: labels_data = data_df[self.filters.label_field].tolist() - insert_count, error = self.db.insert_embeddings( - embeddings=all_embeddings, - metadata=all_metadata, - labels_data=labels_data, + embeddings=all_embeddings, metadata=all_metadata, labels_data=labels_data, request_ids=request_ids ) if error is not None: self.retry_insert( @@ -85,6 +86,7 @@ def task(self) -> int: embeddings=all_embeddings, metadata=all_metadata, labels_data=labels_data, + request_ids=request_ids, ) assert insert_count == len(all_metadata) @@ -92,10 +94,23 @@ def task(self) -> int: if count % 100_000 == 0: log.info(f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB") + if isinstance(self.db, EnVector): + log.info( + "Waiting for inserted rows to become searchable (Index Operation Status v0)... 
" + f"(requests={len(request_ids)}, timeout={INSERT_TIMEOUT}s)" + ) + self.db.col.indexer.wait_for_inserts_searchable( + index_name=self.db.collection_name, + request_ids=request_ids, + timeout_s=INSERT_TIMEOUT, + poll_interval_s=INSERT_POLL_INTERVAL, + ) + log.info( f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, " f"dur={time.perf_counter() - start}" ) + return count def endless_insert_data(self, all_embeddings: list, all_metadata: list, left_id: int = 0) -> int: diff --git a/vectordb_bench/cli/vectordbbench.py b/vectordb_bench/cli/vectordbbench.py index eab245f1b..1f4e4a573 100644 --- a/vectordb_bench/cli/vectordbbench.py +++ b/vectordb_bench/cli/vectordbbench.py @@ -1,7 +1,7 @@ from ..backend.clients.alloydb.cli import AlloyDBScaNN from ..backend.clients.aws_opensearch.cli import AWSOpenSearch from ..backend.clients.clickhouse.cli import Clickhouse -from ..backend.clients.envector.cli import EnVectorFlat, EnVectorIVFFlat +from ..backend.clients.envector.cli import EnVectorFlat, EnVectorIVFFlat, EnVectorIVFGAS from ..backend.clients.hologres.cli import HologresHGraph from ..backend.clients.lancedb.cli import LanceDB from ..backend.clients.mariadb.cli import MariaDBHNSW @@ -16,7 +16,6 @@ from ..backend.clients.qdrant_cloud.cli import QdrantCloud from ..backend.clients.qdrant_local.cli import QdrantLocal from ..backend.clients.redis.cli import Redis -from ..backend.clients.s3_vectors.cli import S3Vectors from ..backend.clients.test.cli import Test from ..backend.clients.tidb.cli import TiDB from ..backend.clients.vespa.cli import Vespa @@ -51,8 +50,8 @@ cli.add_command(QdrantLocal) cli.add_command(EnVectorFlat) cli.add_command(EnVectorIVFFlat) +cli.add_command(EnVectorIVFGAS) cli.add_command(BatchCli) -cli.add_command(S3Vectors) if __name__ == "__main__": diff --git a/vectordb_bench/config-files/envector_bloomberg_config.yml b/vectordb_bench/config-files/envector_bloomberg_config.yml new file mode 100644 index 000000000..0297e5331 --- 
/dev/null +++ b/vectordb_bench/config-files/envector_bloomberg_config.yml @@ -0,0 +1,46 @@ +# Custom Case +_base_dataset: &base_dataset + case_type: PerformanceCustomDataset + custom_case_name: BLOOMBERG768D368K + custom_case_description: BLOOMBERG768D368K benchmark (768D, 368K vectors) + custom_dataset_name: BLOOMBERG768D368K + custom_dataset_dir: "" + custom_dataset_size: 368816 + custom_dataset_dim: 768 + custom_dataset_file_count: 1 + custom_dataset_use_shuffled: false + custom_dataset_with_gt: true + k: 10 + +# envector server settings +_base_envector: &base_envector + uri: localhost:50050 + eval_mode: mm + drop_old: true + load: true + +# FLAT +envectorflat: + <<: [*base_dataset, *base_envector] + index_name: bloomberg_flat + db_label: BLOOMBERG768D368K-FLAT + +# IVF-FLAT with trained k-means centroids +envectorivfflat: + <<: [*base_dataset, *base_envector] + index_name: bloomberg_ivfflat + db_label: BLOOMBERG768D368K-IVFFLAT + nlist: 128 + nprobe: 6 + train_centroids: true + centroids_path: bloomberg/centroids/centroids_128.npy + +# GAS: enVector-customized ANN +envectorivfgas: + <<: [*base_dataset, *base_envector] + index_name: bloomberg_ivfgas + db_label: BLOOMBERG768D368K-IVFGAS + nlist: 32768 + nprobe: 6 + train_centroids: true + centroids_path: centroids/embeddinggemma-300m/centroids.npy \ No newline at end of file diff --git a/vectordb_bench/config-files/envector_fashion_config.yml b/vectordb_bench/config-files/envector_fashion_config.yml new file mode 100644 index 000000000..46b93ada4 --- /dev/null +++ b/vectordb_bench/config-files/envector_fashion_config.yml @@ -0,0 +1,46 @@ +# Custom Case +_base_dataset: &base_dataset + case_type: PerformanceCustomDataset + custom_case_name: FASHION512D200K + custom_case_description: FASHION512D200K benchmark (512D, 200K vectors) + custom_dataset_name: FASHION512D200K + custom_dataset_dir: "" + custom_dataset_size: 200000 + custom_dataset_dim: 512 + custom_dataset_file_count: 1 + custom_dataset_use_shuffled: false + 
custom_dataset_with_gt: true + k: 10 + +# envector server settings +_base_envector: &base_envector + uri: localhost:50050 + eval_mode: mm + drop_old: true + load: true + +# FLAT +envectorflat: + <<: [*base_dataset, *base_envector] + index_name: fashion_flat + db_label: FASHION512D200K-FLAT + +# IVF-FLAT with trained k-means centroids +envectorivfflat: + <<: [*base_dataset, *base_envector] + index_name: fashion_ivfflat + db_label: FASHION512D200K-IVFFLAT + nlist: 128 + nprobe: 6 + train_centroids: true + centroids_path: fashion/centroids/centroids_128.npy + +# GAS: enVector-customized ANN +envectorivfgas: + <<: [*base_dataset, *base_envector] + index_name: fashion_ivfgas + db_label: FASHION512D200K-IVFGAS + nlist: 32768 + nprobe: 6 + train_centroids: true + centroids_path: centroids/clip-vit-b-32/centroids.npy \ No newline at end of file diff --git a/vectordb_bench/config-files/envector_openai_config.yml b/vectordb_bench/config-files/envector_openai_config.yml new file mode 100644 index 000000000..f146b3871 --- /dev/null +++ b/vectordb_bench/config-files/envector_openai_config.yml @@ -0,0 +1,26 @@ +# FLAT +envectorflat: + index_name: perf1536d500k_flat + uri: localhost:50050 + eval_mode: mm + case_type: Performance1536D500K + db_label: Performance1536D500K-FLAT + k: 10 + drop_old: true + load: true + +# IVF-FLAT with trained k-means centroids +envectorivfflat: + index_name: perf1536d500k_ivfflat + uri: localhost:50050 + eval_mode: mm + case_type: Performance1536D500K + db_label: Performance1536D500K-IVF + k: 10 + nlist: 256 + nprobe: 6 + train_centroids: true + centroids_path: centroids/performance1536d500k/centroids_256.npy + drop_old: true + load: true + \ No newline at end of file diff --git a/vectordb_bench/config-files/envector_products_config.yml b/vectordb_bench/config-files/envector_products_config.yml new file mode 100644 index 000000000..550404208 --- /dev/null +++ b/vectordb_bench/config-files/envector_products_config.yml @@ -0,0 +1,46 @@ +# Custom Case 
+_base_dataset: &base_dataset + case_type: PerformanceCustomDataset + custom_case_name: PRODUCTS512D400K + custom_case_description: PRODUCTS512D400K benchmark (512D, 400K vectors) + custom_dataset_name: PRODUCTS512D400K + custom_dataset_dir: "" + custom_dataset_size: 400000 + custom_dataset_dim: 512 + custom_dataset_file_count: 1 + custom_dataset_use_shuffled: false + custom_dataset_with_gt: true + k: 10 + +# envector server settings +_base_envector: &base_envector + uri: localhost:50050 + eval_mode: mm + drop_old: true + load: true + +# FLAT +envectorflat: + <<: [*base_dataset, *base_envector] + index_name: products_flat + db_label: PRODUCTS512D400K-FLAT + +# IVF-FLAT with trained k-means centroids +envectorivfflat: + <<: [*base_dataset, *base_envector] + index_name: products_ivfflat + db_label: PRODUCTS512D400K-IVFFLAT + nlist: 128 + nprobe: 6 + train_centroids: true + centroids_path: products/centroids/centroids_128.npy + +# GAS: enVector-customized ANN +envectorivfgas: + <<: [*base_dataset, *base_envector] + index_name: products_ivfgas + db_label: PRODUCTS512D400K-IVFGAS + nlist: 32768 + nprobe: 6 + train_centroids: true + centroids_path: centroids/clip-vit-b-32/centroids.npy \ No newline at end of file diff --git a/vectordb_bench/config-files/envector_pubmed_config.yml b/vectordb_bench/config-files/envector_pubmed_config.yml new file mode 100644 index 000000000..3f301ea94 --- /dev/null +++ b/vectordb_bench/config-files/envector_pubmed_config.yml @@ -0,0 +1,46 @@ +# Custom Case +_base_dataset: &base_dataset + case_type: PerformanceCustomDataset + custom_case_name: PUBMED768D400K + custom_case_description: PUBMED768D400K benchmark (768D, 400K vectors) + custom_dataset_name: PUBMED768D400K + custom_dataset_dir: "" + custom_dataset_size: 400335 + custom_dataset_dim: 768 + custom_dataset_file_count: 1 + custom_dataset_use_shuffled: false + custom_dataset_with_gt: true + k: 10 + +# envector server settings +_base_envector: &base_envector + uri: localhost:50050 + 
eval_mode: mm + drop_old: true + load: true + +# FLAT +envectorflat: + <<: [*base_dataset, *base_envector] + index_name: pubmed_flat + db_label: PUBMED768D400K-FLAT + +# IVF-FLAT with trained k-means centroids +envectorivfflat: + <<: [*base_dataset, *base_envector] + index_name: pubmed_ivfflat + db_label: PUBMED768D400K-IVFFLAT + nlist: 128 + nprobe: 6 + train_centroids: true + centroids_path: pubmed/centroids/centroids_128.npy + +# GAS: enVector-customized ANN +envectorivfgas: + <<: [*base_dataset, *base_envector] + index_name: pubmed_ivfgas + db_label: PUBMED768D400K-IVFGAS + nlist: 32768 + nprobe: 6 + train_centroids: true + centroids_path: centroids/embeddinggemma-300m/centroids.npy \ No newline at end of file From 3978e182385cdcc7668a0b9bf48cc64229ad025a Mon Sep 17 00:00:00 2001 From: suyeong Date: Tue, 3 Feb 2026 00:00:55 +0000 Subject: [PATCH 2/4] fix lint --- install.py | 60 ++---- tests/conftest.py | 1 + tests/test_bench_runner.py | 16 +- tests/test_chroma.py | 47 ++--- tests/test_data_source.py | 6 +- tests/test_dataset.py | 6 +- tests/test_elasticsearch_cloud.py | 17 +- tests/test_models.py | 14 +- tests/test_rate_runner.py | 32 +++- tests/test_redis.py | 61 ++---- tests/test_utils.py | 60 +++--- tests/ut_cases.py | 1 + .../clients/aws_opensearch/aws_opensearch.py | 108 +++++------ .../backend/clients/aws_opensearch/cli.py | 24 +-- .../backend/clients/aws_opensearch/config.py | 28 ++- .../backend/clients/hologres/config.py | 3 +- .../backend/clients/hologres/hologres.py | 56 ++---- .../backend/clients/mariadb/mariadb.py | 12 +- .../backend/clients/milvus/config.py | 1 + .../backend/clients/oss_opensearch/cli.py | 25 +-- .../backend/clients/oss_opensearch/config.py | 3 - .../clients/oss_opensearch/oss_opensearch.py | 2 - .../backend/clients/pgdiskann/pgdiskann.py | 12 +- .../backend/clients/pgvector/config.py | 28 +-- .../backend/clients/pgvector/pgvector.py | 181 ++++++++---------- .../backend/clients/s3_vectors/cli.py | 67 ------- 
vectordb_bench/backend/clients/tidb/tidb.py | 24 +-- vectordb_bench/backend/runner/rate_runner.py | 3 +- vectordb_bench/cli/cli.py | 39 ---- .../components/check_results/filters.py | 3 +- .../frontend/components/check_results/nav.py | 1 - .../components/custom/displaypPrams.py | 6 +- .../frontend/components/qps_recall/charts.py | 118 ------------ .../frontend/components/qps_recall/data.py | 58 ------ .../components/run_test/submitTask.py | 12 +- .../components/welcome/explainPrams.py | 24 +-- .../frontend/config/dbCaseConfigs.py | 26 --- vectordb_bench/frontend/pages/qps_recall.py | 73 ------- vectordb_bench/frontend/utils.py | 1 - vectordb_bench/interface.py | 2 +- vectordb_bench/models.py | 6 +- 41 files changed, 340 insertions(+), 927 deletions(-) delete mode 100644 vectordb_bench/backend/clients/s3_vectors/cli.py delete mode 100644 vectordb_bench/frontend/components/qps_recall/charts.py delete mode 100644 vectordb_bench/frontend/components/qps_recall/data.py delete mode 100644 vectordb_bench/frontend/pages/qps_recall.py diff --git a/install.py b/install.py index 5807485fd..51202445a 100644 --- a/install.py +++ b/install.py @@ -4,70 +4,52 @@ def docker_tag_base(): - return 'vdbbench' + return "vdbbench" + def dockerfile_path_base(): - return os.path.join('vectordb_bench/', '../Dockerfile') + return os.path.join("vectordb_bench/", "../Dockerfile") + def docker_tag(track, algo): - return docker_tag_base() + '-' + track + '-' + algo + return docker_tag_base() + "-" + track + "-" + algo def build(tag, args, dockerfile): - print('Building %s...' % tag) + print("Building %s..." % tag) if args is not None and len(args) != 0: q = " ".join(["--build-arg " + x.replace(" ", "\\ ") for x in args]) else: q = "" try: - command = 'docker build %s --rm -t %s -f' \ - % (q, tag) - command += ' %s .' % dockerfile + command = "docker build %s --rm -t %s -f" % (q, tag) + command += " %s ." 
% dockerfile print(command) subprocess.check_call(command, shell=True) - return {tag: 'success'} + return {tag: "success"} except subprocess.CalledProcessError: - return {tag: 'fail'} + return {tag: "fail"} + def build_multiprocess(args): return build(*args) if __name__ == "__main__": - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument( - "--proc", - default=1, - type=int, - help="the number of process to build docker images") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--proc", default=1, type=int, help="the number of process to build docker images") + parser.add_argument("--track", choices=["none"], default="none") + parser.add_argument("--algorithm", metavar="NAME", help="build only the named algorithm image", default=None) parser.add_argument( - '--track', - choices=['none'], - default='none' + "--dockerfile", metavar="PATH", help="build only the image from a Dockerfile path", default=None ) - parser.add_argument( - '--algorithm', - metavar='NAME', - help='build only the named algorithm image', - default=None) - parser.add_argument( - '--dockerfile', - metavar='PATH', - help='build only the image from a Dockerfile path', - default=None) - parser.add_argument( - '--build-arg', - help='pass given args to all docker builds', - nargs="+") + parser.add_argument("--build-arg", help="pass given args to all docker builds", nargs="+") args = parser.parse_args() - print('Building base image...') - - subprocess.check_call( - 'docker build \ - --rm -t %s -f %s .' % (docker_tag_base(), dockerfile_path_base()), shell=True) + print("Building base image...") - print('Building end.') + subprocess.check_call("docker build \ + --rm -t %s -f %s ." 
% (docker_tag_base(), dockerfile_path_base()), shell=True) + print("Building end.") diff --git a/tests/conftest.py b/tests/conftest.py index 9e053200c..add768a45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import sys from os.path import dirname, abspath + sys.path.append(dirname(dirname(abspath(__file__)))) diff --git a/tests/test_bench_runner.py b/tests/test_bench_runner.py index 5fab91067..0ab6dc27b 100644 --- a/tests/test_bench_runner.py +++ b/tests/test_bench_runner.py @@ -2,11 +2,16 @@ import logging from vectordb_bench.interface import BenchMarkRunner from vectordb_bench.models import ( - DB, IndexType, CaseType, TaskConfig, CaseConfig, + DB, + IndexType, + CaseType, + TaskConfig, + CaseConfig, ) log = logging.getLogger(__name__) + class TestBenchRunner: def test_get_results(self): runner = BenchMarkRunner() @@ -17,7 +22,7 @@ def test_get_results(self): def test_performance_case_whole(self): runner = BenchMarkRunner() - task_config=TaskConfig( + task_config = TaskConfig( db=DB.Milvus, db_config=DB.Milvus.config(), db_case_config=DB.Milvus.case_config_cls(index=IndexType.Flat)(), @@ -32,7 +37,7 @@ def test_performance_case_whole(self): def test_performance_case_clean(self): runner = BenchMarkRunner() - task_config=TaskConfig( + task_config = TaskConfig( db=DB.Milvus, db_config=DB.Milvus.config(), db_case_config=DB.Milvus.case_config_cls(index=IndexType.Flat)(), @@ -44,7 +49,7 @@ def test_performance_case_clean(self): runner.stop_running() def test_performance_case_no_error(self): - task_config=TaskConfig( + task_config = TaskConfig( db=DB.ZillizCloud, db_config=DB.ZillizCloud.config(uri="xxx", user="abc", password="1234"), db_case_config=DB.ZillizCloud.case_config_cls()(), @@ -52,9 +57,10 @@ def test_performance_case_no_error(self): ) t = task_config.copy() - d = t.json(exclude={'db_config': {'password', 'api_key'}}) + d = t.json(exclude={"db_config": {"password", "api_key"}}) log.info(f"{d}") import ujson + loads = ujson.loads(d) 
log.info(f"{loads}") diff --git a/tests/test_chroma.py b/tests/test_chroma.py index 2b1b0596e..f337abfb9 100644 --- a/tests/test_chroma.py +++ b/tests/test_chroma.py @@ -6,7 +6,6 @@ import numpy as np import chromadb - log = logging.getLogger(__name__) """ Tests for Chroma, assumes Chroma is running on localhost:8000, @@ -19,13 +18,11 @@ 3. default port is 8000, default host is localhost""" - -dict = {} #Assumes chroma is acception connections on localhost:8000 -dict['name'] = "chroma" -dict['host'] = "localhost" -dict['port'] = 8000 -dict['password'] = "chroma" - +dict = {} # Assumes chroma is acception connections on localhost:8000 +dict["name"] = "chroma" +dict["host"] = "localhost" +dict["port"] = 8000 +dict["password"] = "chroma" class TestChroma: @@ -34,7 +31,6 @@ def test_insert_and_search(self): dbcls = DB.Chroma.init_cls dbConfig = DB.Chroma.config_cls - dim = 16 chrma = dbcls( @@ -49,36 +45,28 @@ def test_insert_and_search(self): filter_value = 0.9 embeddings = [[np.random.random() for _ in range(dim)] for _ in range(count)] - # insert with chrma.init(): - #chrma.client.delete_collection("example2") - assert (chrma.client.heartbeat() is not None), "chroma client is not connected" + # chrma.client.delete_collection("example2") + assert chrma.client.heartbeat() is not None, "chroma client is not connected" res = chrma.insert_embeddings(embeddings=embeddings, metadata=range(count)) # bulk_insert return - assert ( - res[0] == count - ), f"the return count of bulk insert ({res}) is not equal to count ({count})" + assert res[0] == count, f"the return count of bulk insert ({res}) is not equal to count ({count})" # count entries in chroma database countRes = chrma.collection.count() - - assert ( - countRes == count - ), f"the return count of redis client ({countRes}) is not equal to count ({count})" + + assert countRes == count, f"the return count of redis client ({countRes}) is not equal to count ({count})" # search with chrma.init(): test_id = 
np.random.randint(count) - #log.info(f"test_id: {test_id}") + # log.info(f"test_id: {test_id}") q = embeddings[test_id] res = chrma.search_embedding(query=q, k=100) print(res) - assert ( - res[0] == int(test_id) - ), f"the most nearest neighbor ({res[0]}) id is not test_id ({int(test_id)}" - + assert res[0] == int(test_id), f"the most nearest neighbor ({res[0]}) id is not test_id ({int(test_id)}" # search with filters, assumes filter format {id: int, metadata: >=int} with chrma.init(): @@ -86,13 +74,8 @@ def test_insert_and_search(self): test_id = np.random.randint(filter_value, count) q = embeddings[test_id] - - res = chrma.search_embedding( - query=q, k=100, filters={"id": filter_value} - ) - assert ( - res[0] == int(test_id) - ), f"the most nearest neighbor ({res[0]}) id is not test_id ({test_id})" + res = chrma.search_embedding(query=q, k=100, filters={"id": filter_value}) + assert res[0] == int(test_id), f"the most nearest neighbor ({res[0]}) id is not test_id ({test_id})" isFilter = True id_list = [] for id in res: @@ -101,5 +84,3 @@ def test_insert_and_search(self): isFilter = False break assert isFilter, f"Filter not working, id_list: {id_list}" - - \ No newline at end of file diff --git a/tests/test_data_source.py b/tests/test_data_source.py index 302ee3a1c..b52b6f4c8 100644 --- a/tests/test_data_source.py +++ b/tests/test_data_source.py @@ -5,14 +5,12 @@ log = logging.getLogger("vectordb_bench") + class TestReader: - @pytest.mark.parametrize("type_case", [ - (k, v) for k, v in type2case.items() - ]) + @pytest.mark.parametrize("type_case", [(k, v) for k, v in type2case.items()]) def test_type_cases(self, type_case): self.per_case_test(type_case) - def per_case_test(self, type_case): t, ca_cls = type_case ca = ca_cls() diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d4ccb283d..6ad15c34e 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -4,9 +4,9 @@ from pydantic import ValidationError from vectordb_bench.backend.data_source 
import DatasetSource - log = logging.getLogger("vectordb_bench") + class TestDataSet: def test_iter_dataset(self): for ds in Dataset: @@ -29,6 +29,7 @@ def test_iter_cohere(self): cohere_10m.prepare() import time + before = time.time() for i in cohere_10m: log.debug(i.head(1)) @@ -40,9 +41,11 @@ def test_iter_cohere(self): def test_iter_laion(self): laion_100m = Dataset.LAION.manager(100_000_000) from vectordb_bench.backend.data_source import DatasetSource + laion_100m.prepare(source=DatasetSource.AliyunOSS) import time + before = time.time() for i in laion_100m: log.debug(i.head(1)) @@ -74,4 +77,3 @@ def test_download_small(self): files=files, local_ds_root=openai_50k.data_dir, ) - diff --git a/tests/test_elasticsearch_cloud.py b/tests/test_elasticsearch_cloud.py index f161ab6c2..b92df0eaf 100644 --- a/tests/test_elasticsearch_cloud.py +++ b/tests/test_elasticsearch_cloud.py @@ -6,7 +6,6 @@ ) import numpy as np - log = logging.getLogger(__name__) cloud_id = "" @@ -41,9 +40,7 @@ def test_insert_and_search(self): with es.init(): res = es.insert_embeddings(embeddings=embeddings, metadata=range(count)) # bulk_insert return - assert ( - res == count - ), f"the return count of bulk insert ({res}) is not equal to count ({count})" + assert res == count, f"the return count of bulk insert ({res}) is not equal to count ({count})" # indice_count return es.client.indices.refresh() @@ -61,9 +58,7 @@ def test_insert_and_search(self): res = es.search_embedding(query=q, k=100) log.info(f"search_results_id: {res}") - assert ( - res[0] == test_id - ), f"the most nearest neighbor ({res[0]}) id is not test_id ({test_id})" + assert res[0] == test_id, f"the most nearest neighbor ({res[0]}) id is not test_id ({test_id})" # search with filters with es.init(): @@ -71,13 +66,9 @@ def test_insert_and_search(self): log.info(f"test_id: {test_id}") q = embeddings[test_id] - res = es.search_embedding( - query=q, k=100, filters={"id": count * filter_rate} - ) + res = es.search_embedding(query=q, 
k=100, filters={"id": count * filter_rate}) log.info(f"search_results_id: {res}") - assert ( - res[0] == test_id - ), f"the most nearest neighbor ({res[0]}) id is not test_id ({test_id})" + assert res[0] == test_id, f"the most nearest neighbor ({res[0]}) id is not test_id ({test_id})" isFilter = True for id in res: if id < count * filter_rate: diff --git a/tests/test_models.py b/tests/test_models.py index d68dd6afb..7c633ee90 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,18 +1,10 @@ import pytest import logging -from vectordb_bench.models import ( - TaskConfig, CaseConfig, - CaseResult, TestResult, - Metric, CaseType -) -from vectordb_bench.backend.clients import ( - DB, - IndexType -) +from vectordb_bench.models import TaskConfig, CaseConfig, CaseResult, TestResult, Metric, CaseType +from vectordb_bench.backend.clients import DB, IndexType from vectordb_bench import config - log = logging.getLogger("vectordb_bench") @@ -33,7 +25,7 @@ def test_test_result(self): test_result.flush() with pytest.raises(ValueError): - result = TestResult.read_file('nosuchfile.json') + result = TestResult.read_file("nosuchfile.json") def test_test_result_read_write(self): result_dir = config.RESULTS_LOCAL_DIR diff --git a/tests/test_rate_runner.py b/tests/test_rate_runner.py index df92b0dd7..15dd5af63 100644 --- a/tests/test_rate_runner.py +++ b/tests/test_rate_runner.py @@ -12,25 +12,30 @@ log = logging.getLogger("vectordb_bench") log.setLevel(logging.DEBUG) + def get_rate_runner(db): cohere = Dataset.COHERE.manager(100_000) prepared = cohere.prepare(DatasetSource.AliyunOSS) assert prepared runner = RatedMultiThreadingInsertRunner( - rate = 10, - db = db, - dataset = cohere, + rate=10, + db=db, + dataset=cohere, ) return runner + def test_rate_runner(db, insert_rate): runner = get_rate_runner(db) _, t = runner.run_with_rate() log.info(f"insert run done, time={t}") -def test_read_write_runner(db, insert_rate, conc: list, search_stage: Iterable[float], 
read_dur_after_write: int, local: bool=False): + +def test_read_write_runner( + db, insert_rate, conc: list, search_stage: Iterable[float], read_dur_after_write: int, local: bool = False +): cohere = Dataset.COHERE.manager(1_000_000) if local is True: source = DatasetSource.AliyunOSS @@ -45,26 +50,32 @@ def test_read_write_runner(db, insert_rate, conc: list, search_stage: Iterable[f insert_rate=insert_rate, search_stage=search_stage, read_dur_after_write=read_dur_after_write, - concurrencies=conc + concurrencies=conc, ) rw_runner.run_read_write() def get_db(db: str, config: dict) -> VectorDB: if db == DB.Milvus.name: - return DB.Milvus.init_cls(dim=768, db_config=config, db_case_config=FLATConfig(metric_type="COSINE"), drop_old=True) + return DB.Milvus.init_cls( + dim=768, db_config=config, db_case_config=FLATConfig(metric_type="COSINE"), drop_old=True + ) elif db == DB.ZillizCloud.name: - return DB.ZillizCloud.init_cls(dim=768, db_config=config, db_case_config=AutoIndexConfig(metric_type="COSINE"), drop_old=True) + return DB.ZillizCloud.init_cls( + dim=768, db_config=config, db_case_config=AutoIndexConfig(metric_type="COSINE"), drop_old=True + ) else: raise ValueError(f"unknown db: {db}") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-r", "--insert_rate", type=int, default="1000", help="insert entity row count per seconds, cps") + parser.add_argument( + "-r", "--insert_rate", type=int, default="1000", help="insert entity row count per seconds, cps" + ) parser.add_argument("-d", "--db", type=str, default=DB.Milvus.name, help="db name") parser.add_argument("-t", "--duration", type=int, default=300, help="stage search duration in seconds") - parser.add_argument("--use_s3", action='store_true', help="whether to use S3 dataset") + parser.add_argument("--use_s3", action="store_true", help="whether to use S3 dataset") flags = parser.parse_args() @@ -85,4 +96,5 @@ def get_db(db: str, config: dict) -> VectorDB: conc=conc, 
search_stage=search_stage, read_dur_after_write=flags.duration, - local=flags.use_s3) + local=flags.use_s3, + ) diff --git a/tests/test_redis.py b/tests/test_redis.py index 6783ee5a9..9538834b9 100644 --- a/tests/test_redis.py +++ b/tests/test_redis.py @@ -5,16 +5,14 @@ from vectordb_bench.backend.clients.redis.config import RedisConfig import numpy as np - log = logging.getLogger(__name__) # Tests for Redis, assumes Redis is running on localhost:6379, can be modified by changing the dict below dict = {} -dict['name'] = "redis" -dict['host'] = "localhost" -dict['port'] = 6379 -dict['password'] = "redis" - +dict["name"] = "redis" +dict["host"] = "localhost" +dict["port"] = 6379 +dict["password"] = "redis" class TestRedis: @@ -22,7 +20,6 @@ def test_insert_and_search(self): assert DB.Redis.value == "Redis" dbcls = DB.Redis.init_cls dbConfig = dbcls.config_cls() - dim = 16 rdb = dbcls( @@ -37,36 +34,29 @@ def test_insert_and_search(self): filter_value = 0.9 embeddings = [[np.random.random() for _ in range(dim)] for _ in range(count)] - # insert with rdb.init(): - assert (rdb.conn.ping() == True), "redis client is not connected" + assert rdb.conn.ping() == True, "redis client is not connected" res = rdb.insert_embeddings(embeddings=embeddings, metadata=range(count)) # bulk_insert return - assert ( - res[0] == count - ), f"the return count of bulk insert ({res}) is not equal to count ({count})" + assert res[0] == count, f"the return count of bulk insert ({res}) is not equal to count ({count})" # count entries in redis database countRes = rdb.conn.dbsize() - - assert ( - countRes == count - ), f"the return count of redis client ({countRes}) is not equal to count ({count})" + + assert countRes == count, f"the return count of redis client ({countRes}) is not equal to count ({count})" # search with rdb.init(): test_id = np.random.randint(count) - #log.info(f"test_id: {test_id}") + # log.info(f"test_id: {test_id}") q = embeddings[test_id] res = rdb.search_embedding(query=q, 
k=100) - #log.info(f"search_results_id: {res}") + # log.info(f"search_results_id: {res}") print(res) # res of format [2757, 2944, 8893, 6695, 5571, 608, 455, 3464, 1584, 1807, 8452, 4311...] - assert ( - res[0] == int(test_id) - ), f"the most nearest neighbor ({res[0]}) id is not test_id ({str(test_id)}" + assert res[0] == int(test_id), f"the most nearest neighbor ({res[0]}) id is not test_id ({str(test_id)}" # search with filters with rdb.init(): @@ -74,13 +64,8 @@ def test_insert_and_search(self): test_id = np.random.randint(filter_value, count) q = embeddings[test_id] - - res = rdb.search_embedding( - query=q, k=100, filters={"metadata": filter_value} - ) - assert ( - res[0] == int(test_id) - ), f"the most nearest neighbor ({res[0]}) id is not test_id ({test_id})" + res = rdb.search_embedding(query=q, k=100, filters={"metadata": filter_value}) + assert res[0] == int(test_id), f"the most nearest neighbor ({res[0]}) id is not test_id ({test_id})" isFilter = True id_list = [] for id in res: @@ -90,18 +75,10 @@ def test_insert_and_search(self): break assert isFilter, f"filters failed, got: ({id}), expected less than ({filter_value})" - #Test id filter for exact match - res = rdb.search_embedding( - query=q, k=100, filters={"id": 9999} - ) - assert ( - res[0] == 9999 - ) + # Test id filter for exact match + res = rdb.search_embedding(query=q, k=100, filters={"id": 9999}) + assert res[0] == 9999 - #Test two filters, id and metadata - res = rdb.search_embedding( - query=q, k=100, filters={"metadata": filter_value, "id": 9999} - ) - assert ( - res[0] == 9999 and len(res) == 1, f"filters failed, got: ({res[0]}), expected ({9999})" - ) \ No newline at end of file + # Test two filters, id and metadata + res = rdb.search_embedding(query=q, k=100, filters={"metadata": filter_value, "id": 9999}) + assert (res[0] == 9999 and len(res) == 1, f"filters failed, got: ({res[0]}), expected ({9999})") diff --git a/tests/test_utils.py b/tests/test_utils.py index df3fa6ffe..328f9b323 
100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,30 +6,37 @@ log = logging.getLogger(__name__) + class TestUtils: - @pytest.mark.parametrize("testcases", [ - (1, '1'), - (10, '10'), - (100, '100'), - (1000, '1K'), - (2000, '2K'), - (30_000, '30K'), - (400_000, '400K'), - (5_000_000, '5M'), - (60_000_000, '60M'), - (1_000_000_000, '1B'), - (1_000_000_000_000, '1000B'), - ]) + @pytest.mark.parametrize( + "testcases", + [ + (1, "1"), + (10, "10"), + (100, "100"), + (1000, "1K"), + (2000, "2K"), + (30_000, "30K"), + (400_000, "400K"), + (5_000_000, "5M"), + (60_000_000, "60M"), + (1_000_000_000, "1B"), + (1_000_000_000_000, "1000B"), + ], + ) def test_numerize(self, testcases): t_in, expected = testcases assert expected == utils.numerize(t_in) - @pytest.mark.parametrize("got_expected", [ - ([1, 3, 5, 7, 9, 10], 1.0), - ([11, 12, 13, 14, 15, 16], 0.0), - ([1, 3, 5, 11, 12, 13], 0.5), - ([1, 3, 5], 0.5), - ]) + @pytest.mark.parametrize( + "got_expected", + [ + ([1, 3, 5, 7, 9, 10], 1.0), + ([11, 12, 13, 14, 15, 16], 0.0), + ([1, 3, 5, 11, 12, 13], 0.5), + ([1, 3, 5], 0.5), + ], + ) def test_recall(self, got_expected): got, expected = got_expected ground_truth = [1, 3, 5, 7, 9, 10] @@ -39,12 +46,15 @@ def test_recall(self, got_expected): class TestGetFiles: - @pytest.mark.parametrize("train_count", [ - 1, - 10, - 50, - 100, - ]) + @pytest.mark.parametrize( + "train_count", + [ + 1, + 10, + 50, + 100, + ], + ) def test_train_count(self, train_count): files = utils.compose_train_files(train_count, True) log.info(files) diff --git a/tests/ut_cases.py b/tests/ut_cases.py index 76311363b..d089d23a4 100644 --- a/tests/ut_cases.py +++ b/tests/ut_cases.py @@ -14,6 +14,7 @@ class Performance100K99p(PerformanceCase): description: str = """This case tests the search performance of a vector database with a small dataset (Cohere 100K vectors, 768 dimensions) under a high filtering rate (99% vectors), at varying parallel levels. 
Results will show index building time, recall, and maximum QPS.""" + class Performance100K1p(PerformanceCase): case_id: CaseType = 100 filter_rate: float | int | None = 0.01 diff --git a/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py b/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py index 03443b255..8a059c995 100644 --- a/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +++ b/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py @@ -65,21 +65,14 @@ def __init__( self._load_graphs_to_memory(client) def _create_index(self, client: OpenSearch) -> None: - self._log_index_creation_info() - self._configure_cluster_settings(client) - settings = self._build_index_settings() - vector_field_config = self._build_vector_field_config() - mappings = self._build_mappings(vector_field_config) - self._create_opensearch_index(client, settings, mappings) - - def _log_index_creation_info(self) -> None: - log.info(f"Creating index with ef_search: {self.case_config.ef_search}") + ef_search_value = self.case_config.ef_search + log.info(f"Creating index with ef_search: {ef_search_value}") log.info(f"Creating index with number_of_replicas: {self.case_config.number_of_replicas}") + log.info(f"Creating index with engine: {self.case_config.engine}") log.info(f"Creating index with metric type: {self.case_config.metric_type_name}") log.info(f"All case_config parameters: {self.case_config.__dict__}") - def _configure_cluster_settings(self, client: OpenSearch) -> None: cluster_settings_body = { "persistent": { "knn.algo_param.index_thread_qty": self.case_config.index_thread_qty, @@ -87,82 +80,73 @@ def _configure_cluster_settings(self, client: OpenSearch) -> None: } } client.cluster.put_settings(body=cluster_settings_body) - - def _build_index_settings(self) -> dict: - return { + settings = { "index": { "knn": True, "number_of_shards": self.case_config.number_of_shards, "number_of_replicas": self.case_config.number_of_replicas, 
"translog.flush_threshold_size": self.case_config.flush_threshold_size, "knn.advanced.approximate_threshold": "-1", - "knn.algo_param.ef_search": self.case_config.ef_search, }, "refresh_interval": self.case_config.refresh_interval, } + settings["index"]["knn.algo_param.ef_search"] = ef_search_value - def _build_vector_field_config(self) -> dict: + # Get method configuration and log it for debugging method_config = self.case_config.index_param() log.info(f"Raw method config from index_param(): {method_config}") + # For s3vector engine, ensure method only contains engine field if self.case_config.engine == AWSOS_Engine.s3vector: method_config = {"engine": "s3vector"} + log.info(f"Cleaned method config for s3vector: {method_config}") + + # Prepare vector field configuration + vector_field_config = { + "type": "knn_vector", + "store": True, + "dimension": self.dim, + "method": method_config, + } - if self.case_config.on_disk: - space_type = self.case_config.parse_metric() - vector_field_config = { - "type": "knn_vector", - "dimension": self.dim, - "space_type": space_type, - "data_type": "float", - "mode": "on_disk", - "compression_level": "32x", - } - log.info("Using on-disk vector configuration with compression_level: 32x") - else: - vector_field_config = { - "type": "knn_vector", - "dimension": self.dim, - "method": method_config, - } - - if self.case_config.on_disk: - log.info(f"Final on-disk vector field config: {vector_field_config}") - elif self.case_config.engine == AWSOS_Engine.s3vector: + # For s3vector engine, space_type should be set at the vector field level + if self.case_config.engine == AWSOS_Engine.s3vector: space_type = self.case_config.parse_metric() vector_field_config["space_type"] = space_type + + # Ensure method config is absolutely clean for s3vector - remove any potential extra fields vector_field_config["method"] = {"engine": "s3vector"} - log.info(f"Final vector field config for s3vector: {vector_field_config}") - else: - log.info(f"Standard 
vector field config: {vector_field_config}") - return vector_field_config + log.info(f"Setting space_type '{space_type}' at vector field level for s3vector engine") + log.info(f"Final vector field config for s3vector: {vector_field_config}") - def _build_mappings(self, vector_field_config: dict) -> dict: + # Configure mappings based on engine type if self.case_config.engine == AWSOS_Engine.s3vector: + # For s3vector engine, use simplified mappings without _source configuration mappings = { "properties": { + # self.id_col_name: {"type": "integer", "store": True}, self.label_col_name: {"type": "keyword"}, self.vector_col_name: vector_field_config, }, } log.info("Using simplified mappings for s3vector engine (no _source configuration)") else: + # For other engines (faiss, lucene), use standard mappings with _source configuration mappings = { "_source": {"excludes": [self.vector_col_name], "recovery_source_excludes": [self.vector_col_name]}, "properties": { + # self.id_col_name: {"type": "integer", "store": True}, self.label_col_name: {"type": "keyword"}, self.vector_col_name: vector_field_config, }, } log.info("Using standard mappings with _source configuration for non-s3vector engines") - return mappings - - def _create_opensearch_index(self, client: OpenSearch, settings: dict, mappings: dict) -> None: try: log.info(f"Creating index with settings: {settings}") log.info(f"Creating index with mappings: {mappings}") + # Additional logging for s3vector to confirm method config before sending if self.case_config.engine == AWSOS_Engine.s3vector: method_in_mappings = mappings["properties"][self.vector_col_name]["method"] log.info(f"Final method config being sent to OpenSearch: {method_in_mappings}") @@ -172,21 +156,22 @@ def _create_opensearch_index(self, client: OpenSearch, settings: dict, mappings: body={"settings": settings, "mappings": mappings}, ) + # For s3vector, verify the actual index configuration after creation if self.case_config.engine == AWSOS_Engine.s3vector: 
- self._verify_s3vector_index_config(client) + try: + actual_mapping = client.indices.get_mapping(index=self.index_name) + actual_method = actual_mapping[self.index_name]["mappings"]["properties"][self.vector_col_name][ + "method" + ] + log.info(f"Actual method config in created index: {actual_method}") + + except Exception as e: + log.warning(f"Failed to verify index configuration: {e}") except Exception as e: log.warning(f"Failed to create index: {self.index_name} error: {e!s}") raise e from None - def _verify_s3vector_index_config(self, client: OpenSearch) -> None: - try: - actual_mapping = client.indices.get_mapping(index=self.index_name) - actual_method = actual_mapping[self.index_name]["mappings"]["properties"][self.vector_col_name]["method"] - log.info(f"Actual method config in created index: {actual_method}") - except Exception as e: - log.warning(f"Failed to verify index configuration: {e}") - @contextmanager def init(self) -> None: """connect to opensearch""" @@ -234,7 +219,7 @@ def _insert_with_single_client( insert_data.append(other_data) try: - self.client.bulk(body=insert_data) + self.client.bulk(insert_data) return len(embeddings), None except Exception as e: log.warning(f"Failed to insert data: {self.index_name} error: {e!s}") @@ -284,7 +269,7 @@ def insert_chunk(client_idx: int, chunk_idx: int): insert_data.append(other_data) try: - resp = client.bulk(body=insert_data) + resp = client.bulk(insert_data) log.info(f"Client {client_idx} added {len(resp['items'])} documents") return len(chunk_embeddings), None except Exception as e: @@ -317,7 +302,7 @@ def insert_chunk(client_idx: int, chunk_idx: int): time.sleep(10) return self._insert_with_single_client(embeddings, metadata) - resp = self.client.indices.stats(index=self.index_name) + resp = self.client.indices.stats(self.index_name) log.info( f"""Total document count in index after parallel insertion: {resp['_all']['primaries']['indexing']['index_total']}""", @@ -382,10 +367,11 @@ def 
search_embedding( "k": k, "method_parameters": self.case_config.search_param(), **({"filter": self.filter} if self.filter else {}), - "rescore": {"oversample_factor": self.case_config.oversample_factor} - # if self.case_config.use_quant - # else {} - , + **( + {"rescore": {"oversample_factor": self.case_config.oversample_factor}} + if self.case_config.use_quant + else {} + ), } log.debug("Using standard knn query with method_parameters for non-s3vector engines") @@ -503,7 +489,7 @@ def _do_force_merge(self): cluster_settings_body = { "persistent": {"knn.algo_param.index_thread_qty": self.case_config.index_thread_qty_during_force_merge} } - self.client.cluster.put_settings(body=cluster_settings_body) + self.client.cluster.put_settings(cluster_settings_body) log.info("Updating the graph threshold to ensure that during merge we can do graph creation.") output = self.client.indices.put_settings( diff --git a/vectordb_bench/backend/clients/aws_opensearch/cli.py b/vectordb_bench/backend/clients/aws_opensearch/cli.py index a3ddb8712..c50a455bc 100644 --- a/vectordb_bench/backend/clients/aws_opensearch/cli.py +++ b/vectordb_bench/backend/clients/aws_opensearch/cli.py @@ -131,33 +131,13 @@ class AWSOpenSearchTypedDict(TypedDict): str | None, click.option( "--quantization-type", - type=click.Choice(["fp32", "fp16", "bq"]), + type=click.Choice(["fp32", "fp16"]), help="quantization type for vectors (in index)", default="fp32", required=False, ), ] - oversample_factor: Annotated[ - float, - click.option( - "--oversample-factor", - type=float, - help="Oversample factor for vector search", - default=1.0, - ), - ] - - on_disk: Annotated[ - bool, - click.option( - "--on-disk", - is_flag=True, - help="Enable on-disk vector storage mode", - default=False, - ), - ] - class AWSOpenSearchHNSWTypedDict(CommonTypedDict, AWSOpenSearchTypedDict, HNSWFlavor1): ... 
@@ -207,8 +187,6 @@ def AWSOpenSearch(**parameters: Unpack[AWSOpenSearchHNSWTypedDict]): engine=engine, quantization_type=AWSOSQuantization(parameters["quantization_type"]), metric_type_name=parameters["metric_type"], - on_disk=parameters["on_disk"], - oversample_factor=parameters["oversample_factor"], ), **parameters, ) diff --git a/vectordb_bench/backend/clients/aws_opensearch/config.py b/vectordb_bench/backend/clients/aws_opensearch/config.py index 327ba83e5..b988c79f2 100644 --- a/vectordb_bench/backend/clients/aws_opensearch/config.py +++ b/vectordb_bench/backend/clients/aws_opensearch/config.py @@ -40,7 +40,6 @@ class AWSOS_Engine(Enum): class AWSOSQuantization(Enum): fp32 = "fp32" fp16 = "fp16" - bq = "bq" class AWSOpenSearchIndexConfig(BaseModel, DBCaseConfig): @@ -64,7 +63,6 @@ class AWSOpenSearchIndexConfig(BaseModel, DBCaseConfig): use_routing: bool = False # for label-filter cases oversample_factor: float = 1.0 quantization_type: AWSOSQuantization = AWSOSQuantization.fp32 - on_disk: bool = False def __eq__(self, obj: any): return ( @@ -76,7 +74,6 @@ def __eq__(self, obj: any): and self.number_of_segments == obj.number_of_segments and self.use_routing == obj.use_routing and self.quantization_type == obj.quantization_type - and self.on_disk == obj.on_disk ) def __hash__(self) -> int: @@ -90,7 +87,6 @@ def __hash__(self) -> int: self.number_of_segments, self.use_routing, self.quantization_type, - self.on_disk, ) ) @@ -120,7 +116,6 @@ def use_quant(self) -> bool: def index_param(self) -> dict: log.info(f"Using engine: {self.engine} for index creation") log.info(f"Using metric_type: {self.metric_type_name} for index creation") - log.info(f"Using on_disk mode: {self.on_disk} for index creation") space_type = self.parse_metric() log.info(f"Resulting space_type: {space_type} for index creation") @@ -129,25 +124,26 @@ def index_param(self) -> dict: if self.engine == AWSOS_Engine.s3vector: return {"engine": "s3vector"} - # For on-disk mode, return empty dict as 
no method config is needed - if self.on_disk: - return {} - parameters = {"ef_construction": self.efConstruction, "m": self.M} - # Add encoder configuration based on quantization type - if self.engine == AWSOS_Engine.faiss and self.use_quant: - if self.quantization_type == AWSOSQuantization.fp16: - parameters["encoder"] = {"name": "sq", "parameters": {"type": "fp16"}} - elif self.quantization_type == AWSOSQuantization.bq: - parameters["encoder"] = {"name": "binary", "parameters": {"bits": 1}} + if self.engine == AWSOS_Engine.faiss and self.quantization_type == AWSOSQuantization.fp16: + parameters["encoder"] = {"name": "sq", "parameters": {"type": "fp16"}} # For other engines (faiss, lucene), space_type is set at method level return { "name": "hnsw", "engine": self.engine.value, "space_type": space_type, - "parameters": parameters, + "parameters": { + "ef_construction": self.efConstruction, + "m": self.M, + "ef_search": self.ef_search, + **( + {"encoder": {"name": "sq", "parameters": {"type": self.quantization_type.fp16.value}}} + if self.use_quant + else {} + ), + }, } def search_param(self) -> dict: diff --git a/vectordb_bench/backend/clients/hologres/config.py b/vectordb_bench/backend/clients/hologres/config.py index ccd476556..25229e456 100644 --- a/vectordb_bench/backend/clients/hologres/config.py +++ b/vectordb_bench/backend/clients/hologres/config.py @@ -104,6 +104,8 @@ def builder_params(self) -> dict: self.base_quantization_type = "fp32" return { + "min_flush_proxima_row_count": self.min_flush_proxima_row_count, + "min_compaction_proxima_row_count": self.min_compaction_proxima_row_count, "max_total_size_to_merge_mb": self.max_total_size_to_merge_mb, "build_thread_count": self.build_thread_count, "base_quantization_type": self.base_quantization_type, @@ -111,7 +113,6 @@ def builder_params(self) -> dict: "ef_construction": self.ef_construction, "precise_quantization_type": self.precise_quantization_type, "use_reorder": self.use_reorder, - "precise_io_type": 
"reader_io", } def searcher_params(self) -> dict: diff --git a/vectordb_bench/backend/clients/hologres/hologres.py b/vectordb_bench/backend/clients/hologres/hologres.py index 296e1a4e5..0f300c252 100644 --- a/vectordb_bench/backend/clients/hologres/hologres.py +++ b/vectordb_bench/backend/clients/hologres/hologres.py @@ -128,18 +128,6 @@ def _drop_table(self): ) self.conn.commit() - try: - log.info(f"{self.name} client purge table recycle bin: {self.table_name}") - self.cursor.execute( - sql.SQL("purge TABLE {table_name};").format( - table_name=sql.Identifier(self.table_name), - ), - ) - except Exception as e: - log.info(f"{self.name} client purge table {self.table_name} recycle bin failed, error: {e}, ignore.") - finally: - self.conn.commit() - try: log.info(f"{self.name} client drop table group : {self._tg_name}") self.cursor.execute(sql.SQL(f"CALL HG_DROP_TABLE_GROUP('{self._tg_name}');")) @@ -148,14 +136,6 @@ def _drop_table(self): finally: self.conn.commit() - try: - log.info(f"{self.name} client free cache") - self.cursor.execute("select hg_admin_command('freecache');") - except Exception as e: - log.info(f"{self.name} client free cache failed, error: {e}, ignore.") - finally: - self.conn.commit() - def optimize(self, data_size: int | None = None): if self.case_config.create_index_after_load: self._create_index() @@ -170,11 +150,9 @@ def _vacuum(self): self.conn.autocommit = True with self.conn.cursor() as cursor: cursor.execute( - sql.SQL( - """ + sql.SQL(""" VACUUM {table_name}; - """ - ).format( + """).format( table_name=sql.Identifier(self.table_name), ) ) @@ -193,14 +171,12 @@ def _analyze(self): def _full_compact(self): log.info(f"{self.name} client full compact table : {self.table_name}") self.cursor.execute( - sql.SQL( - """ + sql.SQL(""" SELECT hologres.hg_full_compact_table( '{table_name}', 'max_file_size_mb={full_compact_max_file_size_mb}' ); - """ - ).format( + """).format( table_name=sql.SQL(self.table_name), 
full_compact_max_file_size_mb=sql.SQL(str(self.case_config.full_compact_max_file_size_mb)), ) @@ -211,8 +187,7 @@ def _create_index(self): assert self.conn is not None, "Connection is not initialized" assert self.cursor is not None, "Cursor is not initialized" - sql_index = sql.SQL( - """ + sql_index = sql.SQL(""" CALL set_table_property ('{table_name}', 'vectors', '{{ "embedding": {{ "algorithm": "{algorithm}", @@ -220,8 +195,7 @@ def _create_index(self): "builder_params": {builder_params} }} }}'); - """ - ).format( + """).format( table_name=sql.Identifier(self.table_name), algorithm=sql.SQL(self.case_config.algorithm()), distance_method=sql.SQL(self.case_config.distance_method()), @@ -256,15 +230,13 @@ def _set_replica_count(self, replica_count: int = 2): sql_get_warehouse_name = sql.SQL("select current_warehouse();") log.info(f"get warehouse name with sql: {sql_get_warehouse_name}") self.cursor.execute(sql_get_warehouse_name) - sql_tg_replica = sql.SQL( - """ + sql_tg_replica = sql.SQL(""" CALL hg_table_group_set_warehouse_replica_count ( '{dbname}.{tg_name}', {replica_count}, '{warehouse_name}' ); - """ - ).format( + """).format( tg_name=sql.SQL(self._tg_name), warehouse_name=sql.SQL(self.cursor.fetchone()[0]), dbname=sql.SQL(self.db_config["dbname"]), @@ -292,15 +264,13 @@ def _create_table(self, dim: int): self._set_replica_count(replica_count=2) - sql_table = sql.SQL( - """ + sql_table = sql.SQL(""" CREATE TABLE IF NOT EXISTS {table_name} ( id BIGINT PRIMARY KEY, embedding FLOAT4[] CHECK (array_ndims(embedding) = 1 AND array_length(embedding, 1) = {dim}) ) WITH (table_group = {tg_name}); - """ - ).format( + """).format( table_name=sql.Identifier(self.table_name), dim=dim, tg_name=sql.SQL(self._tg_name), @@ -351,16 +321,14 @@ def _compose_query_and_params(self, vec: list[float], topk: int, ge_id: int | No params.append(vec_float4) params.append(topk) - query = sql.SQL( - """ + query = sql.SQL(""" SELECT id FROM {table_name} {where_clause} ORDER BY 
{distance_function}(embedding, %b) {order_direction} LIMIT %s; - """ - ).format( + """).format( table_name=sql.Identifier(self.table_name), distance_function=sql.SQL(self.case_config.distance_function()), where_clause=where_clause, diff --git a/vectordb_bench/backend/clients/mariadb/mariadb.py b/vectordb_bench/backend/clients/mariadb/mariadb.py index 5ccddfe7a..db3863c85 100644 --- a/vectordb_bench/backend/clients/mariadb/mariadb.py +++ b/vectordb_bench/backend/clients/mariadb/mariadb.py @@ -73,14 +73,12 @@ def _create_db_table(self, dim: int): log.info(f"{self.name} client create table : {self.table_name}") self.cursor.execute(f"USE {self.db_name}") - self.cursor.execute( - f""" + self.cursor.execute(f""" CREATE TABLE {self.table_name} ( id INT PRIMARY KEY, v VECTOR({self.dim}) NOT NULL ) ENGINE={index_param["storage_engine"]} - """ - ) + """) self.cursor.execute("COMMIT") except Exception as e: @@ -142,12 +140,10 @@ def optimize(self) -> None: if index_param["index_type"] == "HNSW" and index_param["M"] is not None: index_options += f" M={index_param['M']}" - self.cursor.execute( - f""" + self.cursor.execute(f""" ALTER TABLE {self.db_name}.{self.table_name} ADD VECTOR KEY v(v) {index_options} - """ - ) + """) self.cursor.execute("COMMIT") except Exception as e: diff --git a/vectordb_bench/backend/clients/milvus/config.py b/vectordb_bench/backend/clients/milvus/config.py index 2367dda46..417b072c6 100644 --- a/vectordb_bench/backend/clients/milvus/config.py +++ b/vectordb_bench/backend/clients/milvus/config.py @@ -320,6 +320,7 @@ def search_param(self) -> dict: class GPUBruteForceConfig(MilvusIndexConfig, DBCaseConfig): limit: int = 10 # Default top-k for search + metric_type: str # Metric type (e.g., 'L2', 'IP', etc.) 
index: IndexType = IndexType.GPU_BRUTE_FORCE # Index type set to GPU_BRUTE_FORCE def index_param(self) -> dict: diff --git a/vectordb_bench/backend/clients/oss_opensearch/cli.py b/vectordb_bench/backend/clients/oss_opensearch/cli.py index 804a4bc82..3b83ff0df 100644 --- a/vectordb_bench/backend/clients/oss_opensearch/cli.py +++ b/vectordb_bench/backend/clients/oss_opensearch/cli.py @@ -42,13 +42,13 @@ class OSSOpenSearchTypedDict(TypedDict): ), ] - index_thread_qty_during_force_merge: Annotated[ - int, + engine: Annotated[ + str, click.option( - "--index_thread_qty_during_force_merge", - type=int, - help="Thread count for native engine indexing during force merge", - default=4, + "--engine", + type=click.Choice(["nmslib", "faiss", "lucene"], case_sensitive=False), + help="HNSW algorithm implementation to use", + default="faiss", ), ] @@ -107,17 +107,6 @@ class OSSOpenSearchTypedDict(TypedDict): ), ] - engine: Annotated[ - str | None, - click.option( - "--engine", - type=click.Choice(["faiss", "lucene"]), - help="quantization type for vectors (in index)", - default="faiss", - required=False, - ), - ] - class OSSOpenSearchHNSWTypedDict(CommonTypedDict, OSSOpenSearchTypedDict, HNSWFlavor1): ... 
@@ -146,7 +135,7 @@ def OSSOpenSearch(**parameters: Unpack[OSSOpenSearchHNSWTypedDict]): index_thread_qty_during_force_merge=parameters["index_thread_qty_during_force_merge"], cb_threshold=parameters["cb_threshold"], efConstruction=parameters["ef_construction"], - efSearch=parameters["ef_search"], + efSearch=parameters["ef_runtime"], M=parameters["m"], engine=OSSOS_Engine(parameters["engine"]), quantization_type=OSSOpenSearchQuantization(parameters["quantization_type"]), diff --git a/vectordb_bench/backend/clients/oss_opensearch/config.py b/vectordb_bench/backend/clients/oss_opensearch/config.py index cc8097121..70c72e6e5 100644 --- a/vectordb_bench/backend/clients/oss_opensearch/config.py +++ b/vectordb_bench/backend/clients/oss_opensearch/config.py @@ -76,7 +76,6 @@ class OSSOpenSearchIndexConfig(BaseModel, DBCaseConfig): use_routing: bool = False # for label-filter cases oversample_factor: float = 1.0 quantization_type: OSSOpenSearchQuantization = OSSOpenSearchQuantization.fp32 - replication_type: str | None = "DOCUMENT" @root_validator def validate_engine_name(cls, values: dict): @@ -102,7 +101,6 @@ def __eq__(self, obj: any): and self.number_of_segments == obj.number_of_segments and self.use_routing == obj.use_routing and self.quantization_type == obj.quantization_type - and self.replication_type == obj.replication_type ) def __hash__(self) -> int: @@ -116,7 +114,6 @@ def __hash__(self) -> int: self.number_of_segments, self.use_routing, self.quantization_type, - self.replication_type, ) ) diff --git a/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py b/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py index 091b8ff45..7be6100e4 100644 --- a/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py +++ b/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py @@ -228,7 +228,6 @@ def _create_index(self, client: OpenSearch) -> None: ef_search_value = self.case_config.efSearch log.info(f"Creating index with ef_search: 
{ef_search_value}") log.info(f"Creating index with number_of_replicas: {self.case_config.number_of_replicas}") - log.info(f"Creating index with replication_type: {self.case_config.replication_type}") log.info(f"Creating index with engine: {self.case_config.engine}") log.info(f"Creating index with metric type: {self.case_config.metric_type_name}") log.info(f"All case_config parameters: {self.case_config.__dict__}") @@ -248,7 +247,6 @@ def _create_index(self, client: OpenSearch) -> None: "number_of_replicas": self.case_config.number_of_replicas, "translog.flush_threshold_size": self.case_config.flush_threshold_size, "knn.advanced.approximate_threshold": "-1", - "replication.type": self.case_config.replication_type, }, "refresh_interval": self.case_config.refresh_interval, } diff --git a/vectordb_bench/backend/clients/pgdiskann/pgdiskann.py b/vectordb_bench/backend/clients/pgdiskann/pgdiskann.py index 8aa4bda52..5f069ace5 100644 --- a/vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +++ b/vectordb_bench/backend/clients/pgdiskann/pgdiskann.py @@ -105,8 +105,7 @@ def init(self) -> Generator[None, None, None]: if search_params.get("reranking"): # Reranking-enabled queries - self._filtered_search = sql.SQL( - """ + self._filtered_search = sql.SQL(""" SELECT i.id FROM ( SELECT id, embedding @@ -117,16 +116,14 @@ def init(self) -> Generator[None, None, None]: ) i ORDER BY i.embedding {reranking_metric_fun_op} %s::vector LIMIT %s::int - """ - ).format( + """).format( table_name=sql.Identifier(self.table_name), metric_fun_op=sql.SQL(search_params["metric_fun_op"]), reranking_metric_fun_op=sql.SQL(search_params["reranking_metric_fun_op"]), quantized_fetch_limit=sql.Literal(search_params["quantized_fetch_limit"]), ) - self._unfiltered_search = sql.SQL( - """ + self._unfiltered_search = sql.SQL(""" SELECT i.id FROM ( SELECT id, embedding @@ -136,8 +133,7 @@ def init(self) -> Generator[None, None, None]: ) i ORDER BY i.embedding {reranking_metric_fun_op} %s::vector LIMIT 
%s::int - """ - ).format( + """).format( table_name=sql.Identifier(self.table_name), metric_fun_op=sql.SQL(search_params["metric_fun_op"]), reranking_metric_fun_op=sql.SQL(search_params["reranking_metric_fun_op"]), diff --git a/vectordb_bench/backend/clients/pgvector/config.py b/vectordb_bench/backend/clients/pgvector/config.py index 98e82f1c2..abfddc0cf 100644 --- a/vectordb_bench/backend/clients/pgvector/config.py +++ b/vectordb_bench/backend/clients/pgvector/config.py @@ -21,25 +21,21 @@ class PgVectorConfigDict(TypedDict): class PgVectorConfig(DBConfig): - user_name: SecretStr = "postgres" + user_name: SecretStr = SecretStr("postgres") password: SecretStr host: str = "localhost" port: int = 5432 - db_name: str = "vectordb" - table_name: str = "vdbbench_table_test" + db_name: str def to_dict(self) -> PgVectorConfigDict: - user_str = self.user_name.get_secret_value() if isinstance(self.user_name, SecretStr) else self.user_name + user_str = self.user_name.get_secret_value() pwd_str = self.password.get_secret_value() return { - "connect_config": { - "host": self.host, - "port": self.port, - "dbname": self.db_name, - "user": user_str, - "password": pwd_str, - }, - "table_name": self.table_name, + "host": self.host, + "port": self.port, + "dbname": self.db_name, + "user": user_str, + "password": pwd_str, } @@ -63,10 +59,6 @@ class PgVectorIndexConfig(BaseModel, DBCaseConfig): metric_type: MetricType | None = None create_index_before_load: bool = False create_index_after_load: bool = True - # Scan more of the index to get enough results for filter-cases. 
- # Options: "strict_order" (order by distance), "relaxed_order" (slightly out of order but better recall) - # See: https://github.com/pgvector/pgvector?tab=readme-ov-file#iterative-index-scans - iterative_scan: str = "relaxed_order" def parse_metric(self) -> str: d = { @@ -213,7 +205,7 @@ def search_param(self) -> PgVectorSearchParam: } def session_param(self) -> PgVectorSessionCommands: - session_parameters = {"ivfflat.probes": self.probes, "ivfflat.iterative_scan": self.iterative_scan} + session_parameters = {"ivfflat.probes": self.probes} return {"session_options": self._optionally_build_set_options(session_parameters)} @@ -263,7 +255,7 @@ def search_param(self) -> PgVectorSearchParam: } def session_param(self) -> PgVectorSessionCommands: - session_parameters = {"hnsw.ef_search": self.ef_search, "hnsw.iterative_scan": self.iterative_scan} + session_parameters = {"hnsw.ef_search": self.ef_search} return {"session_options": self._optionally_build_set_options(session_parameters)} diff --git a/vectordb_bench/backend/clients/pgvector/pgvector.py b/vectordb_bench/backend/clients/pgvector/pgvector.py index 05b38c670..877efbac2 100644 --- a/vectordb_bench/backend/clients/pgvector/pgvector.py +++ b/vectordb_bench/backend/clients/pgvector/pgvector.py @@ -10,8 +10,6 @@ from pgvector.psycopg import register_vector from psycopg import Connection, Cursor, sql -from vectordb_bench.backend.filter import Filter, FilterOp - from ..api import VectorDB from .config import PgVectorConfigDict, PgVectorIndexConfig @@ -21,46 +19,39 @@ class PgVector(VectorDB): """Use psycopg instructions""" - supported_filter_types: list[FilterOp] = [ - FilterOp.NonFilter, - FilterOp.NumGE, - FilterOp.StrEqual, - ] - conn: psycopg.Connection[Any] | None = None cursor: psycopg.Cursor[Any] | None = None - _search: sql.Composed + _filtered_search: sql.Composed + _unfiltered_search: sql.Composed def __init__( self, dim: int, db_config: PgVectorConfigDict, db_case_config: PgVectorIndexConfig, + 
collection_name: str = "pg_vector_collection", drop_old: bool = False, - with_scalar_labels: bool = False, **kwargs, ): self.name = "PgVector" + self.db_config = db_config self.case_config = db_case_config - self.table_name = db_config["table_name"] - self.connect_config = db_config["connect_config"] + self.table_name = collection_name self.dim = dim - self.with_scalar_labels = with_scalar_labels self._index_name = "pgvector_index" self._primary_field = "id" self._vector_field = "embedding" - self._scalar_label_field = "label" # construct basic units - self.conn, self.cursor = self._create_connection(**self.connect_config) + self.conn, self.cursor = self._create_connection(**self.db_config) # create vector extension self.cursor.execute("CREATE EXTENSION IF NOT EXISTS vector") self.conn.commit() - log.info(f"{self.name} config values: {self.connect_config}\n{self.case_config}") + log.info(f"{self.name} config values: {self.db_config}\n{self.case_config}") if not any( ( self.case_config.create_index_before_load, @@ -69,7 +60,7 @@ def __init__( ): msg = ( f"{self.name} config must create an index using create_index_before_load or create_index_after_load" - f"{self.name} config values: {self.connect_config}\n{self.case_config}" + f"{self.name} config values: {self.db_config}\n{self.case_config}" ) log.error(msg) raise RuntimeError(msg) @@ -98,13 +89,13 @@ def _create_connection(**kwargs) -> tuple[Connection, Cursor]: return conn, cursor - def _generate_search_query(self) -> sql.Composed: + def _generate_search_query(self, filtered: bool = False) -> sql.Composed: index_param = self.case_config.index_param() reranking = self.case_config.search_param()["reranking"] column_name = ( - sql.SQL("binary_quantize({0})").format(sql.Identifier(self._vector_field)) + sql.SQL("binary_quantize({0})").format(sql.Identifier("embedding")) if index_param["quantization_type"] == "bit" and index_param["table_quantization_type"] != "bit" - else sql.SQL(self._vector_field) + else 
sql.SQL("embedding") ) search_vector = ( sql.SQL("binary_quantize({0})").format(sql.Placeholder()) @@ -123,14 +114,12 @@ def _generate_search_query(self) -> sql.Composed: """ SELECT i.id FROM ( - SELECT {primary_field}, {vector_field} {reranking_metric_fun_op} %s::{table_quantization_type} AS distance + SELECT id, embedding {reranking_metric_fun_op} %s::{table_quantization_type} AS distance FROM public.{table_name} {where_clause} ORDER BY {column_name}::{quantization_type}({dim}) - """, # noqa: E501 + """, ).format( table_name=sql.Identifier(self.table_name), - primary_field=sql.Identifier(self._primary_field), - vector_field=sql.Identifier(self._vector_field), column_name=column_name, reranking_metric_fun_op=sql.SQL( self.case_config.search_param()["reranking_metric_fun_op"], @@ -139,7 +128,7 @@ def _generate_search_query(self) -> sql.Composed: table_quantization_type=sql.SQL(index_param["table_quantization_type"]), quantization_type=sql.SQL(index_param["quantization_type"]), dim=sql.Literal(self.dim), - where_clause=sql.SQL(self.where_clause), + where_clause=sql.SQL("WHERE id >= %s") if filtered else sql.SQL(""), ), sql.SQL(self.case_config.search_param()["metric_fun_op"]), sql.SQL( @@ -165,16 +154,15 @@ def _generate_search_query(self) -> sql.Composed: [ sql.SQL( """ - SELECT {primary_field} FROM public.{table_name} + SELECT id FROM public.{table_name} {where_clause} ORDER BY {column_name}::{quantization_type}({dim}) """, ).format( table_name=sql.Identifier(self.table_name), - primary_field=sql.Identifier(self._primary_field), column_name=column_name, quantization_type=sql.SQL(index_param["quantization_type"]), dim=sql.Literal(self.dim), - where_clause=sql.SQL(self.where_clause), + where_clause=sql.SQL("WHERE id >= %s") if filtered else sql.SQL(""), ), sql.SQL(self.case_config.search_param()["metric_fun_op"]), sql.SQL(" {search_vector}::{quantization_type}({dim}) LIMIT %s::int").format( @@ -188,12 +176,10 @@ def _generate_search_query(self) -> sql.Composed: 
search_query = sql.Composed( [ sql.SQL( - "SELECT {primary_field} FROM public.{table_name} {where_clause} ORDER BY {vector_field}", + "SELECT id FROM public.{table_name} {where_clause} ORDER BY embedding ", ).format( table_name=sql.Identifier(self.table_name), - primary_field=sql.Identifier(self._primary_field), - vector_field=sql.Identifier(self._vector_field), - where_clause=sql.SQL(self.where_clause), + where_clause=sql.SQL("WHERE id >= %s") if filtered else sql.SQL(""), ), sql.SQL(self.case_config.search_param()["metric_fun_op"]), sql.SQL(" {search_vector}::{quantization_type}({dim}) LIMIT %s::int").format( @@ -215,7 +201,7 @@ def init(self) -> Generator[None, None, None]: >>> self.search_embedding() """ - self.conn, self.cursor = self._create_connection(**self.connect_config) + self.conn, self.cursor = self._create_connection(**self.db_config) # index configuration may have commands defined that we should set during each client session session_options: Sequence[dict[str, Any]] = self.case_config.session_param()["session_options"] @@ -230,6 +216,9 @@ def init(self) -> Generator[None, None, None]: self.cursor.execute(command) self.conn.commit() + self._filtered_search = self._generate_search_query(filtered=True) + self._unfiltered_search = self._generate_search_query() + try: yield finally: @@ -285,7 +274,7 @@ def _set_parallel_index_build_param(self): ) self.cursor.execute( sql.SQL("ALTER USER {} SET maintenance_work_mem TO {};").format( - sql.Identifier(self.connect_config["user"]), + sql.Identifier(self.db_config["user"]), index_param["maintenance_work_mem"], ), ) @@ -299,7 +288,7 @@ def _set_parallel_index_build_param(self): ) self.cursor.execute( sql.SQL("ALTER USER {} SET max_parallel_maintenance_workers TO '{}';").format( - sql.Identifier(self.connect_config["user"]), + sql.Identifier(self.db_config["user"]), index_param["max_parallel_workers"], ), ) @@ -310,7 +299,7 @@ def _set_parallel_index_build_param(self): ) self.cursor.execute( sql.SQL("ALTER USER 
{} SET max_parallel_workers TO '{}';").format( - sql.Identifier(self.connect_config["user"]), + sql.Identifier(self.db_config["user"]), index_param["max_parallel_workers"], ), ) @@ -393,36 +382,16 @@ def _create_table(self, dim: int): log.info(f"{self.name} client create table : {self.table_name}") # create table - if self.with_scalar_labels: - self.cursor.execute( - sql.SQL( - """ - CREATE TABLE IF NOT EXISTS public.{table_name} - ({primary_field} BIGINT PRIMARY KEY, embedding {table_quantization_type}({dim}), {label_field} VARCHAR(64)); - """, # noqa: E501 - ).format( - table_name=sql.Identifier(self.table_name), - table_quantization_type=sql.SQL(index_param["table_quantization_type"]), - dim=dim, - primary_field=sql.Identifier(self._primary_field), - label_field=sql.Identifier(self._scalar_label_field), - ) - ) - else: - self.cursor.execute( - sql.SQL( - """ - CREATE TABLE IF NOT EXISTS public.{table_name} - ({primary_field} BIGINT PRIMARY KEY, embedding {table_quantization_type}({dim})); - """ - ).format( - table_name=sql.Identifier(self.table_name), - table_quantization_type=sql.SQL(index_param["table_quantization_type"]), - dim=dim, - primary_field=sql.Identifier(self._primary_field), - ) + self.cursor.execute( + sql.SQL(""" + CREATE TABLE IF NOT EXISTS public.{table_name} + (id BIGINT PRIMARY KEY, embedding {table_quantization_type}({dim})); + """).format( + table_name=sql.Identifier(self.table_name), + table_quantization_type=sql.SQL(index_param["table_quantization_type"]), + dim=dim, ) - + ) self.cursor.execute( sql.SQL( "ALTER TABLE public.{table_name} ALTER COLUMN embedding SET STORAGE PLAIN;", @@ -433,17 +402,14 @@ def _create_table(self, dim: int): log.warning(f"Failed to create pgvector table: {self.table_name} error: {e}") raise e from None - def insert_embeddings( # noqa: PLR0912 + def insert_embeddings( self, embeddings: list[list[float]], metadata: list[int], - labels_data: list[str] | None = None, **kwargs: Any, ) -> tuple[int, Exception | None]: 
assert self.conn is not None, "Connection is not initialized" assert self.cursor is not None, "Cursor is not initialized" - if self.with_scalar_labels: - assert labels_data is not None, "labels_data should be provided if with_scalar_labels is set to True" index_param = self.case_config.index_param() @@ -465,10 +431,7 @@ def insert_embeddings( # noqa: PLR0912 embeddings_bit += "1" else: embeddings_bit += "0" - if self.with_scalar_labels: - copy.write_row((str(row), embeddings_bit, labels_data[i])) - else: - copy.write_row((str(row), embeddings_bit)) + copy.write_row((str(row), embeddings_bit)) else: with self.cursor.copy( sql.SQL("COPY public.{table_name} FROM STDIN (FORMAT BINARY)").format( @@ -476,47 +439,29 @@ def insert_embeddings( # noqa: PLR0912 ) ) as copy: if index_param["table_quantization_type"] == "halfvec": + copy.set_types(["bigint", "halfvec"]) for i, row in enumerate(metadata_arr): - if self.with_scalar_labels: - copy.set_types(["bigint", "halfvec", "varchar"]) - copy.write_row((row, np.float16(embeddings_arr[i]), labels_data[i])) - else: - copy.set_types(["bigint", "halfvec"]) - copy.write_row((row, np.float16(embeddings_arr[i]))) + copy.write_row((row, np.float16(embeddings_arr[i]))) else: + copy.set_types(["bigint", "vector"]) for i, row in enumerate(metadata_arr): - if self.with_scalar_labels: - copy.set_types(["bigint", "vector", "varchar"]) - copy.write_row((row, embeddings_arr[i], labels_data[i])) - else: - copy.set_types(["bigint", "vector"]) - copy.write_row((row, embeddings_arr[i])) + copy.write_row((row, embeddings_arr[i])) self.conn.commit() + if kwargs.get("last_batch"): + self._post_insert() + return len(metadata), None except Exception as e: log.warning(f"Failed to insert data into pgvector table ({self.table_name}), error: {e}") return 0, e - def prepare_filter(self, filters: Filter): - if filters.type == FilterOp.NonFilter: - self.where_clause = "" - elif filters.type == FilterOp.NumGE: - self.where_clause = f"WHERE 
{self._primary_field} >= {filters.int_value}" - elif filters.type == FilterOp.StrEqual: - self.where_clause = f"WHERE {self._scalar_label_field} = '{filters.label_value}'" - else: - msg = f"Not support Filter for PgVector - {filters}" - raise ValueError(msg) - - self._search = self._generate_search_query() - def search_embedding( self, query: list[float], k: int = 100, + filters: dict | None = None, timeout: int | None = None, - **kwargs: Any, ) -> list[int]: assert self.conn is not None, "Connection is not initialized" assert self.cursor is not None, "Cursor is not initialized" @@ -524,10 +469,36 @@ def search_embedding( index_param = self.case_config.index_param() search_param = self.case_config.search_param() q = np.asarray(query) - result = self.cursor.execute( - self._search, - (q, q, k) if index_param["quantization_type"] == "bit" and search_param["reranking"] else (q, k), - prepare=True, - binary=True, - ) + if filters: + gt = filters.get("id") + if index_param["quantization_type"] == "bit" and search_param["reranking"]: + result = self.cursor.execute( + self._filtered_search, + (q, gt, q, k), + prepare=True, + binary=True, + ) + else: + result = self.cursor.execute( + self._filtered_search, + (gt, q, k), + prepare=True, + binary=True, + ) + + elif index_param["quantization_type"] == "bit" and search_param["reranking"]: + result = self.cursor.execute( + self._unfiltered_search, + (q, q, k), + prepare=True, + binary=True, + ) + else: + result = self.cursor.execute( + self._unfiltered_search, + (q, k), + prepare=True, + binary=True, + ) + return [int(i[0]) for i in result.fetchall()] diff --git a/vectordb_bench/backend/clients/s3_vectors/cli.py b/vectordb_bench/backend/clients/s3_vectors/cli.py deleted file mode 100644 index ff362f7c1..000000000 --- a/vectordb_bench/backend/clients/s3_vectors/cli.py +++ /dev/null @@ -1,67 +0,0 @@ -from typing import Annotated, TypedDict, Unpack - -import click -from pydantic import SecretStr - -from ....cli.cli import ( - 
CommonTypedDict, - cli, - click_parameter_decorators_from_typed_dict, - get_custom_case_config, - run, -) -from .. import DB -from ..api import MetricType -from .config import S3VectorsIndexConfig - - -class S3VectorsTypedDict(TypedDict): - region_name: Annotated[ - str, click.option("--region", type=str, help="AWS region for S3 bucket (eg. us-east-1)", default="us-east-1") - ] - access_key_id: Annotated[str, click.option("--access_key_id", type=str, help="AWS access key ID", required=True)] - secret_access_key: Annotated[ - str, click.option("--secret_access_key", type=str, help="AWS secret access key", required=True) - ] - - bucket: Annotated[str, click.option("--bucket", type=str, help="S3 bucket name", required=True)] - index: Annotated[str, click.option("--index", type=str, help="Unique vector index name", default="vdbbench-index")] - - metric: Annotated[ - str, - click.option( - "--metric", - type=str, - help="Distance metric for vector similarity (e.g., 'cosine', 'euclidean').", - default=None, - ), - ] - - -class S3VectorsIndexTypedDict(CommonTypedDict, S3VectorsTypedDict): ... 
- - -@cli.command() -@click_parameter_decorators_from_typed_dict(S3VectorsIndexTypedDict) -def S3Vectors(**parameters: Unpack[S3VectorsIndexTypedDict]): - from .config import S3VectorsConfig - - parameters["custom_case"] = get_custom_case_config(parameters) - run( - db=DB.S3Vectors, - db_config=S3VectorsConfig( - region_name=parameters["region"], - access_key_id=SecretStr(parameters["access_key_id"]), - secret_access_key=SecretStr(parameters["secret_access_key"]), - bucket_name=parameters["bucket"], - index_name=parameters["index"] if parameters["index"] else "vdbbench-index", - ), - db_case_config=S3VectorsIndexConfig( - metric_type=( - MetricType.COSINE - if parameters["metric"] == "cosine" - else MetricType.L2 if parameters["metric"] == "l2" else None - ) - ), - **parameters, - ) diff --git a/vectordb_bench/backend/clients/tidb/tidb.py b/vectordb_bench/backend/clients/tidb/tidb.py index b75605eda..a5c99bbe4 100644 --- a/vectordb_bench/backend/clients/tidb/tidb.py +++ b/vectordb_bench/backend/clients/tidb/tidb.py @@ -68,15 +68,13 @@ def _create_table(self): try: index_param = self.case_config.index_param() with self._get_connection() as (conn, cursor): - cursor.execute( - f""" + cursor.execute(f""" CREATE TABLE {self.table_name} ( id BIGINT PRIMARY KEY, embedding VECTOR({self.dim}) NOT NULL, VECTOR INDEX (({index_param["metric_fn"]}(embedding))) ); - """ - ) + """) conn.commit() except Exception as e: log.warning("Failed to create table: %s error: %s", self.table_name, e) @@ -118,12 +116,10 @@ def _optimize_check_tiflash_replica_progress(self): try: database = self.db_config["database"] with self._get_connection() as (_, cursor): - cursor.execute( - f""" + cursor.execute(f""" SELECT PROGRESS FROM information_schema.tiflash_replica WHERE TABLE_SCHEMA = "{database}" AND TABLE_NAME = "{self.table_name}" - """ # noqa: S608 - ) + """) # noqa: S608 result = cursor.fetchone() return result[0] except Exception as e: @@ -155,13 +151,11 @@ def 
_optimize_get_tiflash_index_pending_rows(self): try: database = self.db_config["database"] with self._get_connection() as (_, cursor): - cursor.execute( - f""" + cursor.execute(f""" SELECT SUM(ROWS_STABLE_NOT_INDEXED) FROM information_schema.tiflash_indexes WHERE TIDB_DATABASE = "{database}" AND TIDB_TABLE = "{self.table_name}" - """ # noqa: S608 - ) + """) # noqa: S608 result = cursor.fetchone() return result[0] except Exception as e: @@ -223,11 +217,9 @@ def search_embedding( timeout: int | None = None, **kwargs: Any, ) -> list[int]: - self.cursor.execute( - f""" + self.cursor.execute(f""" SELECT id FROM {self.table_name} ORDER BY {self.search_fn}(embedding, "{query!s}") LIMIT {k}; - """ # noqa: S608 - ) + """) # noqa: S608 result = self.cursor.fetchall() return [int(i[0]) for i in result] diff --git a/vectordb_bench/backend/runner/rate_runner.py b/vectordb_bench/backend/runner/rate_runner.py index 163d50689..ca66c58a8 100644 --- a/vectordb_bench/backend/runner/rate_runner.py +++ b/vectordb_bench/backend/runner/rate_runner.py @@ -7,6 +7,7 @@ from vectordb_bench import config from vectordb_bench.backend.clients import api +from vectordb_bench.backend.clients.pgvector.pgvector import PgVector from vectordb_bench.backend.dataset import DataSetIterator from vectordb_bench.backend.utils import time_it @@ -47,7 +48,7 @@ def _insert_embeddings(db: api.VectorDB, emb: list[list[float]], metadata: list[ msg = f"Insert failed and retried more than {config.MAX_INSERT_RETRY} times" raise RuntimeError(msg) from None - if db.name == "PgVector": + if isinstance(db, PgVector): # pgvector is not thread-safe for concurrent insert, # so we need to copy the db object, make sure each thread has its own connection db_copy = deepcopy(db) diff --git a/vectordb_bench/cli/cli.py b/vectordb_bench/cli/cli.py index 2c63da464..eb0297a4b 100644 --- a/vectordb_bench/cli/cli.py +++ b/vectordb_bench/cli/cli.py @@ -183,16 +183,6 @@ def get_custom_case_config(parameters: dict) -> dict: "with_gt": 
parameters["custom_dataset_with_gt"], }, } - elif parameters["case_type"] == "NewIntFilterPerformanceCase": - custom_case_config = { - "dataset_with_size_type": parameters["dataset_with_size_type"], - "filter_rate": parameters["filter_rate"], - } - elif parameters["case_type"] == "LabelFilterPerformanceCase": - custom_case_config = { - "dataset_with_size_type": parameters["dataset_with_size_type"], - "label_percentage": parameters["label_percentage"], - } return custom_case_config @@ -426,35 +416,6 @@ class CommonTypedDict(TypedDict): ), ] task_label: Annotated[str, click.option("--task-label", help="Task label")] - dataset_with_size_type: Annotated[ - str, - click.option( - "--dataset-with-size-type", - help="Dataset with size type for NewIntFilterPerformanceCase/LabelFilterPerformanceCase, you can use " - "Medium Cohere (768dim, 1M)|Large Cohere (768dim, 10M)|Medium Bioasq (1024dim, 1M)|" - "Large Bioasq (1024dim, 10M)|Large OpenAI (1536dim, 5M)|Medium OpenAI (1536dim, 500K)", - default="Medium Cohere (768dim, 1M)", - show_default=True, - ), - ] - filter_rate: Annotated[ - float, - click.option( - "--filter-rate", - help="Filter rate for NewIntFilterPerformanceCase", - default=0.01, - show_default=True, - ), - ] - label_percentage: Annotated[ - float, - click.option( - "--label-percentage", - help="Filter rate for LabelFilterPerformanceCase", - default=0.01, - show_default=True, - ), - ] class HNSWBaseTypedDict(TypedDict): diff --git a/vectordb_bench/frontend/components/check_results/filters.py b/vectordb_bench/frontend/components/check_results/filters.py index 6016c0040..087f1fc6f 100644 --- a/vectordb_bench/frontend/components/check_results/filters.py +++ b/vectordb_bench/frontend/components/check_results/filters.py @@ -34,7 +34,6 @@ def getshownResults( st, results: list[TestResult], case_results_filter: Callable[[CaseResult], bool] = lambda x: True, - default_selected_task_labels: list[str] = [], **kwargs, ) -> list[CaseResult]: resultSelectOptions = [ @@ 
-48,7 +47,7 @@ def getshownResults( "Select the task results you need to analyze.", resultSelectOptions, # label_visibility="hidden", - default=default_selected_task_labels or resultSelectOptions, + default=resultSelectOptions, ) selectedResult: list[CaseResult] = [] for option in selectedResultSelectedOptions: diff --git a/vectordb_bench/frontend/components/check_results/nav.py b/vectordb_bench/frontend/components/check_results/nav.py index ba4fa99c7..2e72f2d63 100644 --- a/vectordb_bench/frontend/components/check_results/nav.py +++ b/vectordb_bench/frontend/components/check_results/nav.py @@ -26,7 +26,6 @@ def NavToPages(st): options = [ {"name": "Run Test", "link": "run_test"}, {"name": "Results", "link": "results"}, - {"name": "Qps & Recall", "link": "qps_recall"}, {"name": "Quries Per Dollar", "link": "quries_per_dollar"}, {"name": "Concurrent", "link": "concurrent"}, {"name": "Label Filter", "link": "label_filter"}, diff --git a/vectordb_bench/frontend/components/custom/displaypPrams.py b/vectordb_bench/frontend/components/custom/displaypPrams.py index cb55a3aa5..80a694308 100644 --- a/vectordb_bench/frontend/components/custom/displaypPrams.py +++ b/vectordb_bench/frontend/components/custom/displaypPrams.py @@ -1,6 +1,5 @@ def displayParams(st): - st.markdown( - """ + st.markdown(""" - `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format. - Vectors data files: The file should have two kinds of columns: `id` as an incrementing `int` and `emb` as an array of `float32`. The name of two columns could be defined on your own. - Query test vectors: The file could be named on your own and should have two kinds of columns: `id` as an incrementing `int` and `emb` as an array of `float32`. The `id` column must be named as `id`, and `emb` column could be defined on your own. 
@@ -14,8 +13,7 @@ def displayParams(st): - `Label percentages` - If you have filter file, please input label percentage you want to real run and `split with ','` when it's `more than one`. If you `don't have` filter file, than `keep the text vacant.` -""" - ) +""") st.caption( """We recommend limiting the number of test query vectors, like 1,000.""", help=""" diff --git a/vectordb_bench/frontend/components/qps_recall/charts.py b/vectordb_bench/frontend/components/qps_recall/charts.py deleted file mode 100644 index ab57dd0ce..000000000 --- a/vectordb_bench/frontend/components/qps_recall/charts.py +++ /dev/null @@ -1,118 +0,0 @@ -from vectordb_bench.frontend.components.check_results.expanderStyle import ( - initMainExpanderStyle, -) -from vectordb_bench.metric import metric_order, isLowerIsBetterMetric, metric_unit_map -from vectordb_bench.frontend.config.styles import * -import plotly.express as px -import pandas as pd -import plotly.graph_objects as go -import matplotlib.pyplot as plt - - -def drawCharts(st, allData, caseNames: list[str]): - initMainExpanderStyle(st) - for caseName in caseNames: - chartContainer = st.expander(caseName, True) - data = [data for data in allData if data["case_name"] == caseName] - drawChart(data, chartContainer, key_prefix=caseName) - - -def drawChart(data, st, key_prefix: str): - metricsSet = set() - for d in data: - metricsSet = metricsSet.union(d["metricsSet"]) - showlineMetrics = [metric for metric in metric_order[:2] if metric in metricsSet] - - if showlineMetrics: - metric = showlineMetrics[0] - key = f"{key_prefix}-{metric}" - drawlinechart(st, data, metric, key=key) - - -def drawBestperformance(data, y, group): - all_filter_points = [] - data = pd.DataFrame(data) - grouped = data.groupby(group) - for name, group_df in grouped: - filter_points = [] - current_start = 0 - for _ in range(len(group_df)): - if current_start >= len(group_df): - break - max_index = group_df[y].iloc[current_start:].idxmax() - 
filter_points.append(group_df.loc[max_index]) - - current_start = group_df.index.get_loc(max_index) + 1 - all_filter_points.extend(filter_points) - - all_filter_df = pd.DataFrame(all_filter_points) - remaining_df = data[~data.isin(all_filter_df).any(axis=1)] - new_data = all_filter_df.to_dict(orient="records") - remain_data = remaining_df.to_dict(orient="records") - return new_data, remain_data - - -def drawlinechart(st, data: list[object], metric, key: str): - minV = min([d.get(metric, 0) for d in data]) - maxV = max([d.get(metric, 0) for d in data]) - padding = maxV - minV - rangeV = [ - minV - padding * 0.1, - maxV + padding * 0.1, - ] - x = "recall" - xrange = [0.8, 1.01] - y = "qps" - yrange = rangeV - data.sort(key=lambda a: a[x]) - group = "db_name" - new_data, new_remain_data = drawBestperformance(data, y, group) - unique_db_names = list(set(item["db_name"] for item in new_data + new_remain_data)) - - colors = plt.cm.get_cmap("tab10", len(unique_db_names)) - - color_map = { - db: f"rgb({int(colors(i)[0] * 255)}, {int(colors(i)[1] * 255)}, {int(colors(i)[2] * 255)})" - for i, db in enumerate(unique_db_names) - } - - fig = go.Figure() - - new_data_df = pd.DataFrame(new_data) - - for db in unique_db_names: - db_data = new_data_df[new_data_df["db_name"] == db] - fig.add_trace( - go.Scatter( - x=db_data["recall"], - y=db_data["qps"], - mode="lines+markers+text", - name=db, - line=dict(color=color_map[db]), - marker=dict(color=color_map[db]), - showlegend=True, - hovertemplate="QPS=%{y:.4g}, Recall=%{x:.2f}", - text=[f"{qps:.4g}@{recall:.2f}" for recall, qps in zip(db_data["recall"], db_data["qps"])], - textposition="top right", - ) - ) - - for item in new_remain_data: - fig.add_trace( - go.Scatter( - x=[item["recall"]], - y=[item["qps"]], - mode="markers", - name=item["db_name"], - marker=dict(color=color_map[item["db_name"]]), - showlegend=False, - ) - ) - - fig.update_xaxes(range=xrange, title_text="Recall") - fig.update_yaxes(range=yrange, title_text="QPS") - 
fig.update_layout( - margin=dict(l=0, r=0, t=40, b=0, pad=8), - legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1, title=""), - ) - st.plotly_chart(fig, use_container_width=True, key=key) diff --git a/vectordb_bench/frontend/components/qps_recall/data.py b/vectordb_bench/frontend/components/qps_recall/data.py deleted file mode 100644 index b4cbcb1b5..000000000 --- a/vectordb_bench/frontend/components/qps_recall/data.py +++ /dev/null @@ -1,58 +0,0 @@ -from collections import defaultdict -from dataclasses import asdict -from vectordb_bench.backend.filter import FilterOp -from vectordb_bench.frontend.components.check_results.data import getFilterTasks -from vectordb_bench.frontend.components.check_results.filters import getShowDbsAndCases, getshownResults -from vectordb_bench.models import CaseResult, ResultLabel, TestResult - - -def getshownData(st, results: list[TestResult], filter_type: FilterOp = FilterOp.NonFilter, **kwargs): - # hide the nav - st.markdown( - "", - unsafe_allow_html=True, - ) - st.header("Filters") - shownResults = getshownResults(st, results, **kwargs) - showDBNames, showCaseNames = getShowDbsAndCases(st, shownResults, filter_type) - shownData, failedTasks = getChartData(shownResults, showDBNames, showCaseNames) - return shownData, failedTasks, showCaseNames - - -def getChartData( - tasks: list[CaseResult], - dbNames: list[str], - caseNames: list[str], -): - filterTasks = getFilterTasks(tasks, dbNames, caseNames) - failedTasks = defaultdict(lambda: defaultdict(str)) - nonemergedTasks = [] - for task in filterTasks: - db_name = task.task_config.db_name - db = task.task_config.db.value - db_label = task.task_config.db_config.db_label or "" - version = task.task_config.db_config.version or "" - case = task.task_config.case_config.case - case_name = case.name - dataset_name = case.dataset.data.full_name - filter_rate = case.filter_rate - metrics = asdict(task.metrics) - label = task.label - if label == ResultLabel.NORMAL: - 
nonemergedTasks.append( - { - "db_name": db_name, - "db": db, - "db_label": db_label, - "dataset_name": dataset_name, - "filter_rate": filter_rate, - "version": version, - "case_name": case_name, - "metricsSet": set(metrics.keys()), - **metrics, - } - ) - else: - failedTasks[case_name][db_name] = label - - return nonemergedTasks, failedTasks diff --git a/vectordb_bench/frontend/components/run_test/submitTask.py b/vectordb_bench/frontend/components/run_test/submitTask.py index 01d0c5876..5cb5ded43 100644 --- a/vectordb_bench/frontend/components/run_test/submitTask.py +++ b/vectordb_bench/frontend/components/run_test/submitTask.py @@ -55,17 +55,11 @@ def advancedSettings(st): "Concurrent Input", value=defaultconcurrentInput, label_visibility="collapsed" ) container[1].caption("num of concurrencies for search tests to get max-qps") - - container = st.columns([1, 2]) - concurrency_duration = container[0].number_input( - "Concurrency Duration", value=config.CONCURRENCY_DURATION, label_visibility="collapsed" - ) - container[1].caption("concurrency duration for each concurrency search test") - return index_already_exists, use_aliyun, k, concurrentInput, concurrency_duration + return index_already_exists, use_aliyun, k, concurrentInput def controlPanel(st, tasks: list[TaskConfig], taskLabel, isAllValid): - index_already_exists, use_aliyun, k, concurrentInput, concurrency_duration = advancedSettings(st) + index_already_exists, use_aliyun, k, concurrentInput = advancedSettings(st) def runHandler(): benchmark_runner.set_drop_old(not index_already_exists) @@ -79,7 +73,7 @@ def runHandler(): for task in tasks: task.case_config.k = k task.case_config.concurrency_search_config.num_concurrency = concurrentInput_list - task.case_config.concurrency_search_config.concurrency_duration = concurrency_duration + benchmark_runner.set_download_address(use_aliyun) benchmark_runner.run(tasks, taskLabel) diff --git a/vectordb_bench/frontend/components/welcome/explainPrams.py 
b/vectordb_bench/frontend/components/welcome/explainPrams.py index 3651c44d7..b7827d141 100644 --- a/vectordb_bench/frontend/components/welcome/explainPrams.py +++ b/vectordb_bench/frontend/components/welcome/explainPrams.py @@ -1,24 +1,20 @@ def explainPrams(st): st.markdown("## descriptions") st.markdown("### 1. Overview") - st.markdown( - """ + st.markdown(""" - **VectorDBBench(VDBBench)** is an open-source benchmarking tool designed specifically for vector databases. Its main features include: - (1) An easy-to-use **web UI** for configuration of tests and visual analysis of results. - (2) A comprehensive set of **standards for testing and metric collection**. - (3) Support for **various scenarios**, including additional support for **Filter** and **Streaming** based on standard tests. - VDBBench embraces open-source and welcome contributions of code and test result submissions. The testing process and extended scenarios of VDBBench, as well as the intention behind our design will be introduced as follows. -""" - ) +""") st.markdown("### 2. Dataset") - st.markdown( - """ + st.markdown(""" - We provide two embedding datasets: - (1)*[Cohere 768dim](https://huggingface.co/datasets/Cohere/wikipedia-22-12)*, generated using the **Cohere** model based on the Wikipedia corpus. - (2)*[Cohere 1024dim](https://huggingface.co/datasets/Cohere/beir-embed-english-v3)*, generated using the **Cohere** embed-english-v3.0 model based on the bioasq corpus. - (3)*OpenAI 1536dim*, generated using the **OpenAI** model based on the [C4 corpus](https://huggingface.co/datasets/legacy-datasets/c4). -""" - ) +""") st.markdown("### 3. Standard Test") st.markdown( """ @@ -43,15 +39,12 @@ def explainPrams(st): unsafe_allow_html=True, ) st.markdown("### 4. Filter Search Test") - st.markdown( - """ + st.markdown(""" - Compared to the Standard Test, the **Filter Search** introduces additional scalar constraints (e.g. **color == red**) during the Search Test. 
Different **filter_ratios** present varying levels of challenge to the VectorDB's search performance. - We provide an additional **string column** containing 10 labels with different distribution ratios (50%,20%,10%,5%,2%,1%,0.5%,0.2%,0.1%). For each label, we conduct both a **Serial Test** and a **Concurrency Test** to observe the VectorDB's performance in terms of **QPS, latency, and recall** under different filtering conditions. -""" - ) +""") st.markdown("### 5. Streaming Search Test") - st.markdown( - """ + st.markdown(""" Different from Standard's load and search separation, Streaming Search Test primarily focuses on **search performance during the insertion process**. Different **base dataset sizes** and varying **insertion rates** set distinct challenges to the VectorDB's search capabilities. VDBBench will send insert requests at a **fixed rate**, maintaining consistent insertion pressure. The search test consists of three steps as follows: @@ -62,5 +55,4 @@ def explainPrams(st): - Note: at this time, the insertion pressure drops to zero since data insertion is complete. - 3.**Optimized Search (Optional)** - Users can optionally perform an additional optimization step followed by a Serial Test and a Concurrent Test, recording qps, latency, and recall performance. This step **compares performance in Streaming section with the theoretically optimal performance**. -""" - ) +""") diff --git a/vectordb_bench/frontend/config/dbCaseConfigs.py b/vectordb_bench/frontend/config/dbCaseConfigs.py index 07515cde1..778e19e48 100644 --- a/vectordb_bench/frontend/config/dbCaseConfigs.py +++ b/vectordb_bench/frontend/config/dbCaseConfigs.py @@ -1705,17 +1705,6 @@ class CaseConfigInput(BaseModel): inputConfig={"value": "60s", "placeholder": "e.g. 
30s, 1m"}, ) -CaseConfigParamInput_REPLICATION_TYPE_AWSOpensearch = CaseConfigInput( - label=CaseConfigParamType.replication_type, - displayLabel="Replication Type", - inputHelp="Replication strategy: DOCUMENT (default) or SEGMENT", - inputType=InputType.Option, - inputConfig={ - "options": ["DOCUMENT", "SEGMENT"], - "default": "DOCUMENT", - }, -) - MilvusLoadConfig = [ CaseConfigParamInput_IndexType, CaseConfigParamInput_M, @@ -1796,13 +1785,11 @@ class CaseConfigInput(BaseModel): AWSOpensearchLoadingConfig = [ CaseConfigParamInput_EFConstruction_AWSOpensearch, CaseConfigParamInput_M_AWSOpensearch, - CaseConfigParamInput_REPLICATION_TYPE_AWSOpensearch, ] AWSOpenSearchPerformanceConfig = [ CaseConfigParamInput_EFConstruction_AWSOpensearch, CaseConfigParamInput_M_AWSOpensearch, CaseConfigParamInput_EF_SEARCH_AWSOpensearch, - CaseConfigParamInput_REPLICATION_TYPE_AWSOpensearch, ] AliyunOpensearchLoadingConfig = [] @@ -1926,24 +1913,13 @@ class CaseConfigInput(BaseModel): ] AliyunElasticsearchLoadingConfig = [ - CaseConfigParamInput_IndexType_ES, - CaseConfigParamInput_NumShards_ES, - CaseConfigParamInput_NumReplica_ES, - CaseConfigParamInput_RefreshInterval_ES, CaseConfigParamInput_EFConstruction_AliES, CaseConfigParamInput_M_AliES, ] AliyunElasticsearchPerformanceConfig = [ - CaseConfigParamInput_IndexType_ES, - CaseConfigParamInput_NumShards_ES, - CaseConfigParamInput_NumReplica_ES, - CaseConfigParamInput_RefreshInterval_ES, CaseConfigParamInput_EFConstruction_AliES, CaseConfigParamInput_M_AliES, CaseConfigParamInput_NumCandidates_AliES, - CaseConfigParamInput_UseRescore_ES, - CaseConfigParamInput_OversampleRatio_ES, - CaseConfigParamInput_UseRouting_ES, ] MongoDBLoadingConfig = [ @@ -2108,7 +2084,6 @@ class CaseConfigInput(BaseModel): CaseConfigParamInput_NUMBER_OF_REPLICAS_AWSOpensearch, CaseConfigParamInput_NUMBER_OF_INDEXING_CLIENTS_AWSOpensearch, CaseConfigParamInput_INDEX_THREAD_QTY_AWSOpensearch, - CaseConfigParamInput_REPLICATION_TYPE_AWSOpensearch, 
CaseConfigParamInput_INDEX_THREAD_QTY_DURING_FORCE_MERGE_AWSOpensearch, ] @@ -2123,7 +2098,6 @@ class CaseConfigInput(BaseModel): CaseConfigParamInput_NUMBER_OF_REPLICAS_AWSOpensearch, CaseConfigParamInput_NUMBER_OF_INDEXING_CLIENTS_AWSOpensearch, CaseConfigParamInput_INDEX_THREAD_QTY_AWSOpensearch, - CaseConfigParamInput_REPLICATION_TYPE_AWSOpensearch, CaseConfigParamInput_INDEX_THREAD_QTY_DURING_FORCE_MERGE_AWSOpensearch, ] diff --git a/vectordb_bench/frontend/pages/qps_recall.py b/vectordb_bench/frontend/pages/qps_recall.py deleted file mode 100644 index 27f9c4691..000000000 --- a/vectordb_bench/frontend/pages/qps_recall.py +++ /dev/null @@ -1,73 +0,0 @@ -import streamlit as st -from vectordb_bench.backend.cases import CaseLabel -from vectordb_bench.backend.filter import FilterOp -from vectordb_bench.frontend.components.check_results.footer import footer -from vectordb_bench.frontend.components.check_results.headerIcon import drawHeaderIcon -from vectordb_bench.frontend.components.check_results.nav import ( - NavToQuriesPerDollar, - NavToRunTest, - NavToPages, -) -from vectordb_bench.frontend.components.qps_recall.charts import drawCharts -from vectordb_bench.frontend.components.qps_recall.data import getshownData -from vectordb_bench.frontend.components.get_results.saveAsImage import getResults - -from vectordb_bench.frontend.config.styles import FAVICON -from vectordb_bench.interface import benchmark_runner -from vectordb_bench.models import CaseResult - - -def main(): - # set page config - st.set_page_config( - page_title="Label Filter", - page_icon=FAVICON, - layout="wide", - # initial_sidebar_state="collapsed", - ) - - # header - drawHeaderIcon(st) - - # navigate - NavToPages(st) - - allResults = benchmark_runner.get_results() - - st.title("Vector Database Benchmark (Qps & Recall)") - - # results selector and filter - resultSelectorContainer = st.sidebar.container() - - def case_results_filter(case_result: CaseResult) -> bool: - case = 
case_result.task_config.case_config.case - return case.label == CaseLabel.Performance and case.filters.type == FilterOp.NonFilter - - default_selected_task_labels = ["standard_2025"] - shownData, failedTasks, showCaseNames = getshownData( - resultSelectorContainer, - allResults, - case_results_filter=case_results_filter, - default_selected_task_labels=default_selected_task_labels, - ) - - resultSelectorContainer.divider() - - # nav - navContainer = st.sidebar.container() - NavToRunTest(navContainer) - NavToQuriesPerDollar(navContainer) - - # save or share - resultesContainer = st.sidebar.container() - getResults(resultesContainer, "vectordb_bench") - - # charts - drawCharts(st, shownData, showCaseNames) - - # footer - footer(st.container()) - - -if __name__ == "__main__": - main() diff --git a/vectordb_bench/frontend/utils.py b/vectordb_bench/frontend/utils.py index 407dd497d..dead61a6c 100644 --- a/vectordb_bench/frontend/utils.py +++ b/vectordb_bench/frontend/utils.py @@ -1,7 +1,6 @@ import random import string - passwordKeys = ["password", "api_key"] diff --git a/vectordb_bench/interface.py b/vectordb_bench/interface.py index 42dc876b0..b05e7c835 100644 --- a/vectordb_bench/interface.py +++ b/vectordb_bench/interface.py @@ -279,7 +279,7 @@ def kill_proc_tree( p.send_signal(sig) except psutil.NoSuchProcess: pass - _, alive = psutil.wait_procs(children, timeout=timeout, callback=on_terminate) + _gone, alive = psutil.wait_procs(children, timeout=timeout, callback=on_terminate) for p in alive: log.warning(f"force killing child process: {p}") diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py index 74f96338e..682056bf3 100644 --- a/vectordb_bench/models.py +++ b/vectordb_bench/models.py @@ -29,8 +29,8 @@ def __init__(self, duration: int): class PerformanceTimeoutError(TimeoutError): - def __init__(self): - super().__init__("Performance case optimize timeout") + def __init__(self, message: str = "Performance case optimize timeout"): + 
super().__init__(message) class ConcurrencySlotTimeoutError(TimeoutError): @@ -125,10 +125,8 @@ class CaseConfigParamType(Enum): use_rescore = "use_rescore" oversample_ratio = "oversample_ratio" use_routing = "use_routing" - replication_type = "replication_type" dataset_with_size_type = "dataset_with_size_type" - filter_rate = "filter_rate" insert_rate = "insert_rate" search_stages = "search_stages" concurrencies = "concurrencies" From aff0f1469ec7e513bcdf577a563d17c7c50dfb1f Mon Sep 17 00:00:00 2001 From: suyeong Date: Tue, 3 Feb 2026 06:37:53 +0000 Subject: [PATCH 3/4] update description --- README.md | 33 +++++++++++++++++++++++++++++++-- scripts/prepare_dataset.py | 7 +++---- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0ec3d2a21..cb6aa086a 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ pip install pyenvector==1.3.0a1 3. Run Benchmark ```bash -# Run Benchmark +# Run Benchmark (VectorDBBench built-in dataset) ./scripts/run_benchmark.sh --index-type FLAT --config-file envector_openai_config.yml ``` @@ -154,7 +154,7 @@ For the ANN benchmark, we provide two datasets via HuggingFace: - `FASHION512D200K` - `FOOD512D75K` -Also, we provide centroids and tree metadata for the corresponding embedding model used in the ANN benchmark: +Also, we provide centroids for the corresponding embedding model used in the ANN benchmark: - GAS Centroids: [cryptolab-playground/gas-centroids](https://huggingface.co/datasets/cryptolab-playground/gas-centroids) To prepare dataset, run the following command as example: @@ -254,6 +254,35 @@ follows conventions of VectorDBBench, see details in [VectorDBBench Options](https://github.com/zilliztech/VectorDBBench?tab=readme-ov-file#custom-dataset-for-performance-case). For example, if you have a custom directory for dataset, set `DATASET_LOCAL_DIR`. 
+### enVector VectorDBBench Config File Options + +You can find the customized config files in `vectordb_bench/config-files` to use CLI options in a more convenient way. + +```yaml +# FLAT +envectorflat: + index_name: test_index + uri: localhost:50050 + eval_mode: mm + case_type: Performance1536D500K + db_label: Performance1536D500K-FLAT + k: 10 + drop_old: true + load: true + +# IVF-FLAT with trained k-means centroids +envectorivfflat: + ... + nlist: 256 + nprobe: 6 + train_centroids: true + centroids_path: centroids/performance1536d500k/centroids_256.npy + +# GAS: enVector-customized ANN +envectorivfgas: + ... +``` + ## ❓ Troubleshooting diff --git a/scripts/prepare_dataset.py b/scripts/prepare_dataset.py index ffaa0bb72..df0e6d798 100644 --- a/scripts/prepare_dataset.py +++ b/scripts/prepare_dataset.py @@ -44,7 +44,7 @@ def get_args(): "--centroids-dir", type=str, default="./centroids", - help="Directory to save the centroids and tree info.", + help="Directory to save the centroids.", ) return parser.parse_args() @@ -92,7 +92,7 @@ def prepare_neighbors( def download_centroids(embedding_model: str, dataset_dir: str) -> None: - """Download pre-computed centroids and tree info for GAS VCT index.""" + """Download pre-computed centroids for IVF_GAS index.""" if embedding_model != "embeddinggemma-300m": raise ValueError(f"Centroids for {embedding_model} currently not available.") @@ -103,8 +103,7 @@ def download_centroids(embedding_model: str, dataset_dir: str) -> None: # download os.makedirs(os.path.join(dataset_dir, embedding_model), exist_ok=True) wget.download(f"{dataset_link}/centroids.npy", out=os.path.join(dataset_dir, embedding_model, "centroids.npy")) - wget.download(f"{dataset_link}/tree_info.pkl", out=os.path.join(dataset_dir, embedding_model, "tree_info.pkl")) - print(f"\nDownloaded centroids and tree info to {os.path.join(dataset_dir, embedding_model)}") + print(f"\nDownloaded centroids to {os.path.join(dataset_dir, embedding_model)}") if __name__ == 
"__main__": From dc4c02ddbd622a4b0079080374674bbcffcb0a60 Mon Sep 17 00:00:00 2001 From: suyeong Date: Tue, 3 Feb 2026 14:07:46 +0000 Subject: [PATCH 4/4] update config --- README.md | 5 -- .../config-files/envector_fashion_config.yml | 46 ------------------- .../config-files/envector_random_config.yml | 36 +++++++++++++++ 3 files changed, 36 insertions(+), 51 deletions(-) delete mode 100644 vectordb_bench/config-files/envector_fashion_config.yml create mode 100644 vectordb_bench/config-files/envector_random_config.yml diff --git a/README.md b/README.md index cb6aa086a..419095aa4 100644 --- a/README.md +++ b/README.md @@ -221,11 +221,6 @@ We will support adjustable `NUM_PER_BATCH` for ANN soon. If you want to test on other benchmark datasets regardless ANN benchmark, please run the following scripts: ```python -# (Optional) Prepare laion dataset -python ./scripts/prepare_laion_dataset.py \ - --dataset-dir ./dataset/laion512d500k \ - --dataset-size 500_000 - # (Optional) Prepare random dataset python ./scripts/prepare_random_dataset.py \ --dataset-dir ./dataset/random512d1m \ diff --git a/vectordb_bench/config-files/envector_fashion_config.yml b/vectordb_bench/config-files/envector_fashion_config.yml deleted file mode 100644 index 46b93ada4..000000000 --- a/vectordb_bench/config-files/envector_fashion_config.yml +++ /dev/null @@ -1,46 +0,0 @@ -# Custom Case -_base_dataset: &base_dataset - case_type: PerformanceCustomDataset - custom_case_name: FASHION512D200K - custom_case_description: FASHION512D200K benchmark (512D, 200K vectors) - custom_dataset_name: FASHION512D200K - custom_dataset_dir: "" - custom_dataset_size: 200000 - custom_dataset_dim: 512 - custom_dataset_file_count: 1 - custom_dataset_use_shuffled: false - custom_dataset_with_gt: true - k: 10 - -# envector server settings -_base_envector: &base_envector - uri: localhost:50050 - eval_mode: mm - drop_old: true - load: true - -# FLAT -envectorflat: - <<: [*base_dataset, *base_envector] - index_name: 
fashion_flat - db_label: FASHION512D200K-FLAT - -# IVF-FLAT with trained k-means centroids -envectorivfflat: - <<: [*base_dataset, *base_envector] - index_name: fashion_ivfflat - db_label: FASHION512D200K-IVFFLAT - nlist: 128 - nprobe: 6 - train_centroids: true - centroids_path: fashion/centroids/centroids_128.npy - -# GAS: enVector-customized ANN -envectorivfgas: - <<: [*base_dataset, *base_envector] - index_name: fashion_ivfgas - db_label: FASHION512D200K-IVFGAS - nlist: 32768 - nprobe: 6 - train_centroids: true - centroids_path: centroids/clip-vit-b-32/centroids.npy \ No newline at end of file diff --git a/vectordb_bench/config-files/envector_random_config.yml b/vectordb_bench/config-files/envector_random_config.yml new file mode 100644 index 000000000..d840092ae --- /dev/null +++ b/vectordb_bench/config-files/envector_random_config.yml @@ -0,0 +1,36 @@ +# Custom Case +_base_dataset: &base_dataset + case_type: PerformanceCustomDataset + custom_case_name: RANDOM512D1M + custom_case_description: RANDOM512D1M benchmark (512D, 1M vectors) + custom_dataset_name: RANDOM512D1M + custom_dataset_dir: "" + custom_dataset_size: 1000000 + custom_dataset_dim: 512 + custom_dataset_file_count: 1 + custom_dataset_use_shuffled: false + custom_dataset_with_gt: true + k: 10 + +# envector server settings +_base_envector: &base_envector + uri: localhost:50050 + eval_mode: mm + drop_old: true + load: true + +# FLAT +envectorflat: + <<: [*base_dataset, *base_envector] + index_name: random512d1m_flat + db_label: RANDOM512D1M-FLAT + +# IVF-FLAT with trained k-means centroids +envectorivfflat: + <<: [*base_dataset, *base_envector] + index_name: random512d1m_ivfflat + db_label: RANDOM512D1M-IVFFLAT + nlist: 256 + nprobe: 6 + train_centroids: true + centroids_path: centroids/random512d1m/centroids_256.npy \ No newline at end of file