Skip to content
Open
26 changes: 11 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ enVector supports two types of benchmark cases:
├── README.md
├── scripts
│ ├── get_kmeans_centroids.py # create kmeans centroids
│ ├── prepare_dataset.py # download and prepare ground truth neighbors for GAS dataset
│ ├── prepare_random_dataset.py # download and prepare ground truth neighbors for random dataset
│ ├── requirements.txt # python requirements
│ ├── prepare_dataset.py # download and prepare ground truth neighbors for dataset
│ └── run_benchmark.sh # benchmark script
└── vectordb_bench/config-files # benchmark config file
└── envector_{benchmark_case}_config.yml
Expand Down Expand Up @@ -116,7 +117,7 @@ Run the following commands to run enVector with VectorDBBench's built-in benchma
./scripts/run_benchmark.sh --index-type IVF_FLAT --config-file envector_{benchmark_case}_config.yml # IVF-FLAT
```

For more details, please refer to `envector_{benchmark_case}_config.yml` in scripts directory for benchmarks with enVector, or you can use the following command:
For more details, please refer to `envector_{benchmark_case}_config.yml` in `vectordb_bench/config-files` directory for benchmarks with enVector, or you can use the following command:

```bash
python -m vectordb_bench.cli.vectordbbench envectorflat \
Expand Down Expand Up @@ -147,12 +148,11 @@ Prepare the following artifacts for the ANN benchmark with `scripts/prepare_data
- prepare ground-truth neighbors
- download centroids for the GAS index corresponding to the embedding model

For the ANN benchmark, we provide two datasets via HuggingFace:
- `PUBMED768D400K`: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m)
- `BLOOMBERG768D368K`: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m)
- `PRODUCTS512D400K`
- `FASHION512D200K`
- `FOOD512D75K`
For the ANN benchmark, we provide four datasets via HuggingFace:
- `pubmed768d400k`: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m)
- `bloomberg768d368k`: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m)
- `products512d400k`: [cryptolab-playground/amazon-products-clip-vit-b-32](https://huggingface.co/datasets/cryptolab-playground/amazon-products-clip-vit-b-32)
- `food512d101k`: [cryptolab-playground/food101-clip-vit-b-32](https://huggingface.co/datasets/cryptolab-playground/food101-clip-vit-b-32)

Also, we provide centroids for the corresponding embedding model used in the ANN benchmark:
- GAS Centroids: [cryptolab-playground/gas-centroids](https://huggingface.co/datasets/cryptolab-playground/gas-centroids)
Expand All @@ -165,8 +165,7 @@ pip install -r ./scripts/requirements.txt

# Prepare GAS dataset
python ./scripts/prepare_dataset.py \
-d cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m \
-e embeddinggemma-300m
-d pubmed768d400k
```

Then, you can find the generated files as follows:
Expand Down Expand Up @@ -196,12 +195,12 @@ Run the provided shell scripts (`./scripts/run_benchmark.sh`) as the following:
For more details, please refer to `run_benchmark.sh` in the `scripts` directory or `envector_{benchmark_case}_config.yml` in the `vectordb_bench/config-files` directory for benchmarks with enVector with ANN (GAS), or you can use the following command:

```bash
python -m vectordb_bench.cli.vectordbbench envectorivfflat \
python -m vectordb_bench.cli.vectordbbench envectorivfgas \
--config-file envector_pubmed_config.yml

# or

python -m vectordb_bench.cli.vectordbbench envectorivfflat \
python -m vectordb_bench.cli.vectordbbench envectorivfgas \
--uri "localhost:50050" \
--eval-mode mm \
... \
Expand All @@ -211,9 +210,6 @@ python -m vectordb_bench.cli.vectordbbench envectorivfflat \
--nprobe 6
```

Note that, **`NUM_PER_BATCH` should be set to the database size** when using IVF-based ANN index for enVector currently.
We will support adjustable `NUM_PER_BATCH` for ANN soon.

## 🎯 Advanced Usage

### Prepare Other Datasets
Expand Down
95 changes: 95 additions & 0 deletions scripts/get_kmeans_centroids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
Get KMeans centroids for a given dataset.
"""

import argparse
import os

import faiss
import numpy as np
import pandas as pd
from numpy.linalg import norm


def get_args():
    """Parse the command-line options of the centroid calculation script.

    Returns:
        argparse.Namespace: Parsed options exposing ``nlist``, ``file_path``,
        ``out_path`` and ``dim``.
    """
    # One row per option: (flag, value type, default, help text).
    option_table = (
        ("--nlist", int, 256, "Number of clusters for KMeans"),
        (
            "--file-path",
            str,
            "/tmp/vectordb_bench/dataset/openai/openai_medium_500k",
            "Path to the dataset directory",
        ),
        (
            "--out-path",
            str,
            "/tmp/vectordb_bench/centroids/kmeans-centroids/openai_medium_500k",
            "Path to the output directory",
        ),
        ("--dim", int, 512, "Dimension of the embeddings."),
    )

    cli = argparse.ArgumentParser(description="KMeans Centroid Calculation")
    for flag, value_type, default_value, help_text in option_table:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)
    return cli.parse_args()


def load_dataset(file_path):
print("Loading dataset from:", file_path)

# load parquet files
train_vectors = pd.read_parquet(f"{file_path}/train.parquet")

# sort by id
train_vectors.sort_values(by="id", inplace=True)
train_ids = train_vectors["id"].to_numpy(dtype=np.int64)
train_vectors = np.vstack(train_vectors["emb"].values)
train_vectors /= norm(train_vectors, axis=1, keepdims=True)
print(f"train_vectors shape: {train_vectors.shape}")

return train_vectors.astype(np.float32)


def main():
    """Train KMeans on the dataset and save the normalized centroids.

    Loads the normalized training vectors, checks that their dimension
    matches ``--dim``, trains a faiss KMeans with ``--nlist`` clusters,
    normalizes the resulting centroids and writes them to
    ``<out_path>/centroids_<nlist>.npy``.

    Raises:
        ValueError: If the dimension of the loaded vectors differs from
            ``--dim``.
    """
    args = get_args()

    nlist = args.nlist
    seed = 42

    # prepare dataset
    train_vectors = load_dataset(args.file_path)
    dim = train_vectors.shape[1]
    # Explicit check rather than `assert`, which is skipped under `python -O`.
    if dim != args.dim:
        raise ValueError(f"Expected dimension {args.dim}, but got {dim}")
    print("✅ Load dataset complete.")

    # kmeans using faiss
    kmeans = faiss.Kmeans(dim, nlist, niter=25, seed=seed, verbose=True, gpu=True)
    kmeans.train(train_vectors)
    print("✅ KMeans training complete.")

    # allocate: query the trained index for the single nearest entry of every
    # training vector; the labels are only used to report their shape
    _, labels = kmeans.index.search(train_vectors, 1)
    labels = labels.flatten()
    centroids = kmeans.centroids
    print(f"Labels shape: {labels.shape}")
    print(f"Centroids shape: {centroids.shape}")

    # normalize each centroid row in place
    centroids /= norm(centroids, axis=1, keepdims=True)
    print(f"Norm: {norm(centroids, axis=1)}")

    # save centroids
    os.makedirs(args.out_path, exist_ok=True)
    file_name = os.path.join(args.out_path, f"centroids_{nlist}.npy")
    np.save(file_name, centroids)
    print(f"✅ Centroids saved to {file_name}")


# Run the centroid computation only when executed as a script.
if __name__ == "__main__":
    main()
52 changes: 34 additions & 18 deletions scripts/prepare_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,32 +13,39 @@
import wget
from datasets import load_dataset

# One row per benchmark case: (case name, HuggingFace dataset repository,
# embedding model whose centroids are downloaded for that case).
_CASE_ROWS = (
    (
        "pubmed768d400k",
        "cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m",
        "embeddinggemma-300m",
    ),
    (
        "bloomberg768d368k",
        "cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m",
        "embeddinggemma-300m",
    ),
    (
        "products512d400k",
        "cryptolab-playground/amazon-products-clip-vit-b-32",
        "clip-vit-b-32",
    ),
    (
        "food512d101k",
        "cryptolab-playground/food101-clip-vit-b-32",
        "clip-vit-b-32",
    ),
)

# Lookup table keyed by case name, in the order the rows are listed above.
SUPPORTED_CASES = {
    case_name: {"dataset_name": repository, "embedding_model": model}
    for case_name, repository, model in _CASE_ROWS
}

# Embedding models for which a centroid download is accepted.
SUPPORTED_EMBEDDING_MODELS = ["embeddinggemma-300m", "clip-vit-b-32"]


def get_args():
parser = argparse.ArgumentParser(description="Prepare dataset and ground truth neighbors for benchmarking.")
parser.add_argument(
"-d",
"--dataset-name",
type=str,
default="cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m",
default="pubmed768d400k",
help="Huggingface dataset name to download.",
choices=[
"cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m",
"cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m",
],
choices=list(SUPPORTED_CASES.keys()),
)
parser.add_argument(
"--dataset-dir",
type=str,
default=os.path.join(os.environ.get("DATASET_LOCAL_DIR", "/tmp/vectordb_bench/dataset"), "pubmed768d400k"),
help="Dataset directory to save the dataset and neighbors. Default: 'pubmed768d400k' in DATASET_LOCAL_DIR.",
)
parser.add_argument(
"-e",
"--embedding-model",
type=str,
default="embeddinggemma-300m",
help="Embedding model name to download centroids for.",
default=None,
help="Dataset directory to save the dataset and neighbors. Default: <dataset_name> in DATASET_LOCAL_DIR.",
)
parser.add_argument(
"--centroids-dir",
Expand All @@ -52,7 +59,7 @@ def get_args():
def download_dataset(dataset_name: str, output_dir: str = "./dataset/pubmed768d400k") -> None:
"""Download dataset from Huggingface and save as Parquet files."""
# load dataset
ds = load_dataset(dataset_name)
ds = load_dataset(SUPPORTED_CASES[dataset_name]["dataset_name"])
train = ds["train"].to_pandas()
test = ds["test"].to_pandas()

Expand All @@ -62,6 +69,7 @@ def download_dataset(dataset_name: str, output_dir: str = "./dataset/pubmed768d4

test_table = pa.Table.from_pandas(test)
pq.write_table(test_table, f"{output_dir}/test.parquet")
print(f"Saved train and test parquet data to {output_dir}.")


def prepare_neighbors(
Expand Down Expand Up @@ -89,12 +97,13 @@ def prepare_neighbors(

table = pa.Table.from_pandas(df)
pq.write_table(table, f"{data_dir}/neighbors.parquet")
print(f"Saved neighbors data to {data_dir}.")


def download_centroids(embedding_model: str, dataset_dir: str) -> None:
"""Download pre-computed centroids and for IVF_GAS index."""

if embedding_model != "embeddinggemma-300m":
if embedding_model not in SUPPORTED_EMBEDDING_MODELS:
raise ValueError(f"Centroids for {embedding_model} currently not available.")

# BASE URL: https://huggingface.co/datasets/cryptolab-playground/gas-centroids
Expand All @@ -103,13 +112,20 @@ def download_centroids(embedding_model: str, dataset_dir: str) -> None:
# download
os.makedirs(os.path.join(dataset_dir, embedding_model), exist_ok=True)
wget.download(f"{dataset_link}/centroids.npy", out=os.path.join(dataset_dir, embedding_model, "centroids.npy"))
print(f"\nDownloaded centroids to {os.path.join(dataset_dir, embedding_model)}")
print(f"\nSaved centroids data to {os.path.join(dataset_dir, embedding_model)}")


if __name__ == "__main__":
args = get_args()

base_dataset_dir = (
os.environ.get("DATASET_LOCAL_DIR", "/tmp/vectordb_bench/dataset")
if args.dataset_dir is None
else args.dataset_dir
)
args.dataset_dir = os.path.join(base_dataset_dir, args.dataset_name)
os.makedirs(args.dataset_dir, exist_ok=True)

download_dataset(args.dataset_name, args.dataset_dir)
prepare_neighbors(args.dataset_dir)
download_centroids(args.embedding_model, args.centroids_dir)
download_centroids(SUPPORTED_CASES[args.dataset_name]["embedding_model"], args.centroids_dir)
Loading