diff --git a/.gitignore b/.gitignore
index 8985eeb4d..cea1306b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,6 @@ build/
 venv/
 .venv/
 .idea/
-results/
 logs/
 
 # Worktrees
diff --git a/install/requirements_py3.11.txt b/install/requirements_py3.11.txt
index 4214267a3..130745816 100644
--- a/install/requirements_py3.11.txt
+++ b/install/requirements_py3.11.txt
@@ -20,7 +20,7 @@ psutil
 polars
 plotly
 environs
-pydantic<v2
+pydantic>=2.0,<3
 scikit-learn
 pymilvus
 clickhouse_connect
diff --git a/pyproject.toml b/pyproject.toml
index 2baeb16e3..e72be9697 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "polars",
     "plotly",
     "environs",
-    "pydantic<v2",
+    "pydantic>=2.0,<3",
     "scikit-learn",
     "pymilvus",  # with pandas, numpy
     "hdrhistogram>=0.10.1",
diff --git a/vectordb_bench/backend/clients/alisql/config.py b/vectordb_bench/backend/clients/alisql/config.py
index f942c30f1..16c56e90f 100644
--- a/vectordb_bench/backend/clients/alisql/config.py
+++ b/vectordb_bench/backend/clients/alisql/config.py
@@ -49,8 +49,8 @@ def parse_metric(self) -> str:
 
 
 class AliSQLHNSWConfig(AliSQLIndexConfig, DBCaseConfig):
-    M: int | None
-    ef_search: int | None
+    M: int | None = None
+    ef_search: int | None = None
     index: IndexType = IndexType.HNSW
 
     def index_param(self) -> dict:
diff --git a/vectordb_bench/backend/clients/alloydb/config.py b/vectordb_bench/backend/clients/alloydb/config.py
index d6e54e487..11e65084e 100644
--- a/vectordb_bench/backend/clients/alloydb/config.py
+++ b/vectordb_bench/backend/clients/alloydb/config.py
@@ -43,8 +43,8 @@ class AlloyDBIndexParam(TypedDict):
     metric: str
     index_type: str
     index_creation_with_options: Sequence[dict[str, Any]]
-    maintenance_work_mem: str | None
-    max_parallel_workers: int | None
+    maintenance_work_mem: str | None = None
+    max_parallel_workers: int | None = None
 
 
 class AlloyDBSearchParam(TypedDict):
@@ -120,15 +120,15 @@ def _optionally_build_set_options(
 
 class AlloyDBScaNNConfig(AlloyDBIndexConfig):
     index: IndexType = IndexType.SCANN
-    num_leaves: int | None
-    quantizer: str | None
-    enable_pca: str | None
-    max_num_levels: int | None
-    num_leaves_to_search: int | None
-    max_top_neighbors_buffer_size: int | None
-    pre_reordering_num_neighbors: int | None
-    num_search_threads: int | None
-    max_num_prefetch_datasets: int | None
+    num_leaves: int | None = None
+    quantizer: str | None = None
+    enable_pca: str | None = None
+    max_num_levels: int | None = None
+    num_leaves_to_search: int | None = None
+    max_top_neighbors_buffer_size: int | None = None
+    pre_reordering_num_neighbors: int | None = None
+    num_search_threads: int | None = None
+    max_num_prefetch_datasets: int | None = None
     maintenance_work_mem: str | None = None
     max_parallel_workers: int | None = None
 
diff --git a/vectordb_bench/backend/clients/api.py b/vectordb_bench/backend/clients/api.py
index 5511f18db..0f8103597 100644
--- a/vectordb_bench/backend/clients/api.py
+++ b/vectordb_bench/backend/clients/api.py
@@ -2,7 +2,7 @@
 from contextlib import contextmanager
 from enum import StrEnum
 
-from pydantic import BaseModel, SecretStr, validator
+from pydantic import BaseModel, model_validator
 
 from vectordb_bench.backend.filter import Filter, FilterOp
 
@@ -90,13 +90,18 @@ def common_long_configs() -> list[str]:
     def to_dict(self) -> dict:
         raise NotImplementedError
 
-    @validator("*")
-    def not_empty_field(cls, v: any, field: any):
-        if field.name in cls.common_short_configs() or field.name in cls.common_long_configs():
-            return v
-        if not v and isinstance(v, str | SecretStr):
-            raise ValueError("Empty string!")
-        return v
+    @model_validator(mode="before")
+    @classmethod
+    def not_empty_field(cls, data: any) -> any:
+        if not isinstance(data, dict):
+            return data
+        skip = set(cls.common_short_configs()) | set(cls.common_long_configs())
+        for field_name, v in data.items():
+            if field_name in skip:
+                continue
+            if isinstance(v, str) and not v:
+                raise ValueError("Empty string!")
+        return data
 
 
 class DBCaseConfig(ABC):
diff --git a/vectordb_bench/backend/clients/aws_opensearch/config.py b/vectordb_bench/backend/clients/aws_opensearch/config.py
index 5ab63010d..7742d421d 100644
--- a/vectordb_bench/backend/clients/aws_opensearch/config.py
+++ b/vectordb_bench/backend/clients/aws_opensearch/config.py
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 
-from pydantic import BaseModel, SecretStr, validator
+from pydantic import BaseModel, SecretStr, model_validator
 
 from ..api import DBCaseConfig, DBConfig, MetricType
 
@@ -32,17 +32,18 @@ def to_dict(self) -> dict:
             "timeout": 600,
         }
 
-    @validator("*")
-    def not_empty_field(cls, v: any, field: any):
-        if (
-            field.name in cls.common_short_configs()
-            or field.name in cls.common_long_configs()
-            or field.name in ["user", "password", "host"]
-        ):
-            return v
-        if isinstance(v, str | SecretStr) and len(v) == 0:
-            raise ValueError("Empty string!")
-        return v
+    @model_validator(mode="before")
+    @classmethod
+    def not_empty_field(cls, data: any) -> any:
+        if not isinstance(data, dict):
+            return data
+        skip = set(cls.common_short_configs()) | set(cls.common_long_configs()) | {"user", "password", "host"}
+        for field_name, v in data.items():
+            if field_name in skip:
+                continue
+            if isinstance(v, str) and not v:
+                raise ValueError("Empty string!")
+        return data
 
 
 class AWSOS_Engine(Enum):
diff --git a/vectordb_bench/backend/clients/chroma/config.py b/vectordb_bench/backend/clients/chroma/config.py
index cd3e01ecc..a1d9903d1 100644
--- a/vectordb_bench/backend/clients/chroma/config.py
+++ b/vectordb_bench/backend/clients/chroma/config.py
@@ -6,7 +6,7 @@
 
 class ChromaConfig(DBConfig):
     user: str | None = None
-    password: SecretStr | None
+    password: SecretStr | None = None
     host: SecretStr = "localhost"
     port: int = 8000
 
diff --git a/vectordb_bench/backend/clients/cockroachdb/config.py b/vectordb_bench/backend/clients/cockroachdb/config.py
index 0d608da8f..88ec0e5ea 100644
--- a/vectordb_bench/backend/clients/cockroachdb/config.py
+++ b/vectordb_bench/backend/clients/cockroachdb/config.py
@@ -75,16 +75,16 @@ class CockroachDBIndexParam(TypedDict):
 
     metric: str
     index_creation_with_options: Sequence[dict[str, Any]]
-    min_partition_size: int | None
-    max_partition_size: int | None
-    build_beam_size: int | None
+    min_partition_size: int | None = None
+    max_partition_size: int | None = None
+    build_beam_size: int | None = None
 
 
 class CockroachDBSearchParam(TypedDict):
     """Search parameters for CockroachDB vector queries."""
 
     metric_fun_op: LiteralString
-    vector_search_beam_size: int | None
+    vector_search_beam_size: int | None = None
 
 
 class CockroachDBSessionCommands(TypedDict):
diff --git a/vectordb_bench/backend/clients/doris/config.py b/vectordb_bench/backend/clients/doris/config.py
index a15309922..7c79ba728 100644
--- a/vectordb_bench/backend/clients/doris/config.py
+++ b/vectordb_bench/backend/clients/doris/config.py
@@ -1,6 +1,6 @@
 import logging
 
-from pydantic import BaseModel, SecretStr, validator
+from pydantic import BaseModel, SecretStr, model_validator
 
 from ..api import DBCaseConfig, DBConfig, MetricType
 
@@ -17,9 +17,10 @@ class DorisConfig(DBConfig):
     db_name: str = "test"
     ssl: bool = False
 
-    @validator("*")
-    def not_empty_field(cls, v: any, field: any):
-        return v
+    @model_validator(mode="before")
+    @classmethod
+    def not_empty_field(cls, data: any) -> any:
+        return data
 
     def to_dict(self) -> dict:
         pwd_str = self.password.get_secret_value()
diff --git a/vectordb_bench/backend/clients/lindorm/config.py b/vectordb_bench/backend/clients/lindorm/config.py
index 367f369e3..0e0d4aae6 100644
--- a/vectordb_bench/backend/clients/lindorm/config.py
+++ b/vectordb_bench/backend/clients/lindorm/config.py
@@ -43,9 +43,9 @@ def parse_metric(self) -> str:
 
 class HNSWConfig(LindormIndexConfig, DBCaseConfig):
     index: IndexType = IndexType.HNSW
-    M: int | None
-    efConstruction: int | None
-    efSearch: int | None
+    M: int | None = None
+    efConstruction: int | None = None
+    efSearch: int | None = None
     filter_type: str | None = "efficient_filter"
     k_expand_scope: int | None = 1000
 
@@ -72,12 +72,12 @@ def search_param(self, do_filter: bool = False) -> dict:
 # first layer searching for cluster centroids is hnsw
 class IVFPQConfig(LindormIndexConfig, DBCaseConfig):
     index: IndexType = IndexType.IVFPQ
-    nlist: int | None
-    nprobe: int | None
+    nlist: int | None = None
+    nprobe: int | None = None
     # search parameters
-    centroids_hnsw_M: int | None
-    centroids_hnsw_efConstruction: int | None
-    centroids_hnsw_efSearch: int | None
+    centroids_hnsw_M: int | None = None
+    centroids_hnsw_efConstruction: int | None = None
+    centroids_hnsw_efSearch: int | None = None
     filter_type: str | None = "efficient_filter"
     reorder_factor: int | None = 10
 
@@ -116,13 +116,13 @@ def search_param(self, do_filter: bool = False) -> dict:
 
 class IVFBQConfig(LindormIndexConfig, DBCaseConfig):
     index: IndexType = IndexType.IVFBQ
-    nlist: int | None
-    exbits: int | None
-    nprobe: int | None
+    nlist: int | None = None
+    exbits: int | None = None
+    nprobe: int | None = None
     # search parameters
-    centroids_hnsw_M: int | None
-    centroids_hnsw_efConstruction: int | None
-    centroids_hnsw_efSearch: int | None
+    centroids_hnsw_M: int | None = None
+    centroids_hnsw_efConstruction: int | None = None
+    centroids_hnsw_efSearch: int | None = None
     filter_type: str | None = "efficient_filter"
     reorder_factor: int | None = 10
 
diff --git a/vectordb_bench/backend/clients/mariadb/config.py b/vectordb_bench/backend/clients/mariadb/config.py
index d183adc76..21ea9ac2e 100644
--- a/vectordb_bench/backend/clients/mariadb/config.py
+++ b/vectordb_bench/backend/clients/mariadb/config.py
@@ -46,11 +46,11 @@ def parse_metric(self) -> str:
 
 
 class MariaDBHNSWConfig(MariaDBIndexConfig, DBCaseConfig):
-    M: int | None
-    ef_search: int | None
+    M: int | None = None
+    ef_search: int | None = None
     index: IndexType = IndexType.HNSW
     storage_engine: str = "InnoDB"
-    max_cache_size: int | None
+    max_cache_size: int | None = None
 
     def index_param(self) -> dict:
         return {
diff --git a/vectordb_bench/backend/clients/milvus/config.py b/vectordb_bench/backend/clients/milvus/config.py
index 9ffbdcece..98118c6df 100644
--- a/vectordb_bench/backend/clients/milvus/config.py
+++ b/vectordb_bench/backend/clients/milvus/config.py
@@ -1,4 +1,4 @@
-from pydantic import BaseModel, SecretStr, validator
+from pydantic import BaseModel, SecretStr, model_validator
 
 from ..api import DBCaseConfig, DBConfig, IndexType, MetricType, SQType
 
@@ -19,17 +19,18 @@ def to_dict(self) -> dict:
             "replica_number": self.replica_number,
         }
 
-    @validator("*")
-    def not_empty_field(cls, v: any, field: any):
-        if (
-            field.name in cls.common_short_configs()
-            or field.name in cls.common_long_configs()
-            or field.name in ["user", "password"]
-        ):
-            return v
-        if isinstance(v, str | SecretStr) and len(v) == 0:
-            raise ValueError("Empty string!")
-        return v
+    @model_validator(mode="before")
+    @classmethod
+    def not_empty_field(cls, data: any) -> any:
+        if not isinstance(data, dict):
+            return data
+        skip = set(cls.common_short_configs()) | set(cls.common_long_configs()) | {"user", "password"}
+        for field_name, v in data.items():
+            if field_name in skip:
+                continue
+            if isinstance(v, str) and not v:
+                raise ValueError("Empty string!")
+        return data
 
 
 class MilvusIndexConfig(BaseModel):
diff --git a/vectordb_bench/backend/clients/oss_opensearch/config.py b/vectordb_bench/backend/clients/oss_opensearch/config.py
index 83fed3d58..a5d69459a 100644
--- a/vectordb_bench/backend/clients/oss_opensearch/config.py
+++ b/vectordb_bench/backend/clients/oss_opensearch/config.py
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 
-from pydantic import BaseModel, SecretStr, root_validator, validator
+from pydantic import BaseModel, SecretStr, field_validator, model_validator
 
 from ..api import DBCaseConfig, DBConfig, MetricType
 
@@ -32,17 +32,18 @@ def to_dict(self) -> dict:
             "timeout": 600,
         }
 
-    @validator("*")
-    def not_empty_field(cls, v: any, field: any):
-        if (
-            field.name in cls.common_short_configs()
-            or field.name in cls.common_long_configs()
-            or field.name in ["user", "password", "host"]
-        ):
-            return v
-        if isinstance(v, str | SecretStr) and len(v) == 0:
-            raise ValueError("Empty string!")
-        return v
+    @model_validator(mode="before")
+    @classmethod
+    def not_empty_field(cls, data: any) -> any:
+        if not isinstance(data, dict):
+            return data
+        skip = set(cls.common_short_configs()) | set(cls.common_long_configs()) | {"user", "password", "host"}
+        for field_name, v in data.items():
+            if field_name in skip:
+                continue
+            if isinstance(v, str) and not v:
+                raise ValueError("Empty string!")
+        return data
 
 
 class OSSOS_Engine(Enum):
@@ -111,7 +112,8 @@ class OSSOpenSearchIndexConfig(BaseModel, DBCaseConfig):
     compression_level: str = CompressionLevel.LEVEL_32X
     oversample_factor: float = 1.0
 
-    @validator("quantization_type", pre=True, always=True)
+    @field_validator("quantization_type", mode="before")
+    @classmethod
     def validate_quantization_type(cls, value: any):
         """Convert string values to enum"""
         if not value:
@@ -128,19 +130,22 @@ def validate_quantization_type(cls, value: any):
 
         return mapping.get(value, OSSOpenSearchQuantization.NONE)
 
-    @root_validator
-    def validate_engine_name(cls, values: dict):
-        """Map engine_name string from UI to engine enum"""
-        if values.get("engine_name"):
-            engine_name = values["engine_name"].lower()
+    @model_validator(mode="before")
+    @classmethod
+    def validate_engine_name(cls, data: any) -> any:
+        if not isinstance(data, dict):
+            return data
+        # Map engine_name to engine enum
+        if data.get("engine_name"):
+            engine_name = data["engine_name"].lower()
             if engine_name == "faiss":
-                values["engine"] = OSSOS_Engine.faiss
+                data["engine"] = OSSOS_Engine.faiss
             elif engine_name == "lucene":
-                values["engine"] = OSSOS_Engine.lucene
+                data["engine"] = OSSOS_Engine.lucene
             else:
                 log.warning(f"Unknown engine_name: {engine_name}, defaulting to faiss")
-                values["engine"] = OSSOS_Engine.faiss
-        return values
+                data["engine"] = OSSOS_Engine.faiss
+        return data
 
     def __eq__(self, obj: any):
         return (
diff --git a/vectordb_bench/backend/clients/pgdiskann/config.py b/vectordb_bench/backend/clients/pgdiskann/config.py
index 7f83a05c8..8715b1e42 100644
--- a/vectordb_bench/backend/clients/pgdiskann/config.py
+++ b/vectordb_bench/backend/clients/pgdiskann/config.py
@@ -43,8 +43,8 @@ class PgDiskANNIndexConfig(BaseModel, DBCaseConfig):
     metric_type: MetricType | None = None
     create_index_before_load: bool = False
     create_index_after_load: bool = True
-    maintenance_work_mem: str | None
-    max_parallel_workers: int | None
+    maintenance_work_mem: str | None = None
+    max_parallel_workers: int | None = None
 
     def parse_metric(self) -> str:
         if self.metric_type == MetricType.L2:
@@ -120,10 +120,10 @@ def _optionally_build_set_options(
 
 class PgDiskANNImplConfig(PgDiskANNIndexConfig):
     index: IndexType = IndexType.DISKANN
-    max_neighbors: int | None
-    l_value_ib: int | None
-    pq_param_num_chunks: int | None
-    l_value_is: float | None
+    max_neighbors: int | None = None
+    l_value_ib: int | None = None
+    pq_param_num_chunks: int | None = None
+    l_value_is: float | None = None
     reranking: bool | None = None
     reranking_metric: str | None = None
     quantized_fetch_limit: int | None = None
diff --git a/vectordb_bench/backend/clients/pgvecto_rs/config.py b/vectordb_bench/backend/clients/pgvecto_rs/config.py
index fbb7c5d81..73c2573a0 100644
--- a/vectordb_bench/backend/clients/pgvecto_rs/config.py
+++ b/vectordb_bench/backend/clients/pgvecto_rs/config.py
@@ -78,7 +78,7 @@ def session_param(self) -> dict[str, str | int]: ...
 class PgVectoRSHNSWConfig(PgVectoRSIndexConfig):
     index: IndexType = IndexType.HNSW
     m: int | None = None
-    ef_search: int | None
+    ef_search: int | None = None
     ef_construction: int | None = None
 
     def index_param(self) -> dict[str, str]:
@@ -106,8 +106,8 @@ def session_param(self) -> dict[str, str | int]:
 
 class PgVectoRSIVFFlatConfig(PgVectoRSIndexConfig):
     index: IndexType = IndexType.IVFFlat
-    probes: int | None
-    lists: int | None
+    probes: int | None = None
+    lists: int | None = None
 
     def index_param(self) -> dict[str, str]:
         if self.quantization_type is None:
diff --git a/vectordb_bench/backend/clients/pgvector/config.py b/vectordb_bench/backend/clients/pgvector/config.py
index 98e82f1c2..7da238a4b 100644
--- a/vectordb_bench/backend/clients/pgvector/config.py
+++ b/vectordb_bench/backend/clients/pgvector/config.py
@@ -47,8 +47,8 @@ class PgVectorIndexParam(TypedDict):
     metric: str
     index_type: str
     index_creation_with_options: Sequence[dict[str, Any]]
-    maintenance_work_mem: str | None
-    max_parallel_workers: int | None
+    maintenance_work_mem: str | None = None
+    max_parallel_workers: int | None = None
 
 
 class PgVectorSearchParam(TypedDict):
@@ -175,13 +175,13 @@ class PgVectorIVFFlatConfig(PgVectorIndexConfig):
     a good place to start is sqrt(lists)
     """
 
-    lists: int | None
-    probes: int | None
+    lists: int | None = None
+    probes: int | None = None
     index: IndexType = IndexType.ES_IVFFlat
     maintenance_work_mem: str | None = None
     max_parallel_workers: int | None = None
     quantization_type: str | None = None
-    table_quantization_type: str | None
+    table_quantization_type: str | None = None
     reranking: bool | None = None
     quantized_fetch_limit: int | None = None
     reranking_metric: str | None = None
@@ -226,12 +226,12 @@ class PgVectorHNSWConfig(PgVectorIndexConfig):
 
     m: int | None  # DETAIL: Valid values are between "2" and "100".
     ef_construction: int | None  # ef_construction must be greater than or equal to 2 * m
-    ef_search: int | None
+    ef_search: int | None = None
     index: IndexType = IndexType.ES_HNSW
     maintenance_work_mem: str | None = None
     max_parallel_workers: int | None = None
     quantization_type: str | None = None
-    table_quantization_type: str | None
+    table_quantization_type: str | None = None
     reranking: bool | None = None
     quantized_fetch_limit: int | None = None
     reranking_metric: str | None = None
diff --git a/vectordb_bench/backend/clients/pgvectorscale/config.py b/vectordb_bench/backend/clients/pgvectorscale/config.py
index e22c45c8d..07750cffb 100644
--- a/vectordb_bench/backend/clients/pgvectorscale/config.py
+++ b/vectordb_bench/backend/clients/pgvectorscale/config.py
@@ -70,14 +70,14 @@ def session_param(self) -> dict: ...
 
 class PgVectorScaleStreamingDiskANNConfig(PgVectorScaleIndexConfig):
     index: IndexType = IndexType.STREAMING_DISKANN
-    storage_layout: str | None
-    num_neighbors: int | None
-    search_list_size: int | None
-    max_alpha: float | None
-    num_dimensions: int | None
-    num_bits_per_dimension: int | None
-    query_search_list_size: int | None
-    query_rescore: int | None
+    storage_layout: str | None = None
+    num_neighbors: int | None = None
+    search_list_size: int | None = None
+    max_alpha: float | None = None
+    num_dimensions: int | None = None
+    num_bits_per_dimension: int | None = None
+    query_search_list_size: int | None = None
+    query_rescore: int | None = None
 
     def index_param(self) -> dict:
         return {
diff --git a/vectordb_bench/backend/clients/polardb/config.py b/vectordb_bench/backend/clients/polardb/config.py
index c75448c49..5121c1e19 100644
--- a/vectordb_bench/backend/clients/polardb/config.py
+++ b/vectordb_bench/backend/clients/polardb/config.py
@@ -11,7 +11,7 @@ class PolarDBConfigDict(TypedDict):
     host: str
     port: int
     database: str
-    unix_socket: str | None
+    unix_socket: str | None = None
 
 
 class PolarDBConfig(DBConfig):
diff --git a/vectordb_bench/backend/clients/qdrant_cloud/config.py b/vectordb_bench/backend/clients/qdrant_cloud/config.py
index b2eeb2ce6..06543aaab 100644
--- a/vectordb_bench/backend/clients/qdrant_cloud/config.py
+++ b/vectordb_bench/backend/clients/qdrant_cloud/config.py
@@ -1,6 +1,6 @@
 from typing import TypeVar
 
-from pydantic import BaseModel, SecretStr, validator
+from pydantic import BaseModel, SecretStr, model_validator
 
 from ..api import DBCaseConfig, DBConfig, MetricType
 
@@ -25,11 +25,18 @@ def to_dict(self) -> dict:
             "url": self.url.get_secret_value(),
         }
 
-    @validator("*")
-    def not_empty_field(cls, v: any, field: any):
-        if field.name in ["api_key"]:
-            return v
-        return super().not_empty_field(v, field)
+    @model_validator(mode="before")
+    @classmethod
+    def not_empty_field(cls, data: any) -> any:
+        if not isinstance(data, dict):
+            return data
+        skip = set(cls.common_short_configs()) | set(cls.common_long_configs()) | {"api_key"}
+        for field_name, v in data.items():
+            if field_name in skip:
+                continue
+            if isinstance(v, str) and not v:
+                raise ValueError("Empty string!")
+        return data
 
 
 class QdrantIndexConfig(BaseModel, DBCaseConfig):
diff --git a/vectordb_bench/backend/clients/tidb/config.py b/vectordb_bench/backend/clients/tidb/config.py
index 71fdbad66..93098ede1 100644
--- a/vectordb_bench/backend/clients/tidb/config.py
+++ b/vectordb_bench/backend/clients/tidb/config.py
@@ -1,6 +1,6 @@
 from typing import TypedDict
 
-from pydantic import BaseModel, SecretStr, validator
+from pydantic import BaseModel, SecretStr, model_validator
 
 from ..api import DBCaseConfig, DBConfig, MetricType
 
@@ -35,13 +35,18 @@ def to_dict(self) -> TiDBConfigDict:
             "ssl_verify_identity": self.ssl,
         }
 
-    @validator("*")
-    def not_empty_field(cls, v: any, field: any):
-        if field.name in ["password", "db_label"]:
-            return v
-        if isinstance(v, str | SecretStr) and len(v) == 0:
-            raise ValueError("Empty string!")
-        return v
+    @model_validator(mode="before")
+    @classmethod
+    def not_empty_field(cls, data: any) -> any:
+        if not isinstance(data, dict):
+            return data
+        skip = set(cls.common_short_configs()) | set(cls.common_long_configs()) | {"password"}
+        for field_name, v in data.items():
+            if field_name in skip:
+                continue
+            if isinstance(v, str) and not v:
+                raise ValueError("Empty string!")
+        return data
 
 
 class TiDBIndexConfig(BaseModel, DBCaseConfig):
diff --git a/vectordb_bench/backend/dataset.py b/vectordb_bench/backend/dataset.py
index d1de9e328..94216532f 100644
--- a/vectordb_bench/backend/dataset.py
+++ b/vectordb_bench/backend/dataset.py
@@ -7,12 +7,12 @@
 import logging
 import pathlib
 from enum import Enum
-from typing import Any, NamedTuple
+from typing import Any, ClassVar, NamedTuple
 
 import pandas as pd
 import polars as pl
 from pyarrow.parquet import ParquetFile
-from pydantic import PrivateAttr, validator
+from pydantic import field_validator
 
 from vectordb_bench import config
 from vectordb_bench.base import BaseModel
@@ -38,7 +38,7 @@ class BaseDataset(BaseModel):
     metric_type: MetricType
     use_shuffled: bool
     with_gt: bool = False
-    _size_label: dict[int, SizeLabel] = PrivateAttr()
+    _size_label: ClassVar[dict[int, SizeLabel]]
     is_custom: bool = False
     with_remote_resource: bool = True
     # for label filter cases
@@ -57,7 +57,8 @@ class BaseDataset(BaseModel):
     gt_id_field: str = "id"
     gt_neighbors_field: str = "neighbors_id"
 
-    @validator("size")
+    @field_validator("size")
+    @classmethod
     def verify_size(cls, v: int):
         if v not in cls._size_label:
             msg = f"Size {v} not supported for the dataset, expected: {cls._size_label.keys()}"
@@ -102,7 +103,8 @@ class CustomDataset(BaseDataset):
     scalar_labels_file: str = "scalar_labels.parquet"
     label_percentages: list[float] = []
 
-    @validator("size")
+    @field_validator("size")
+    @classmethod
     def verify_size(cls, v: int):
         return v
 
@@ -136,7 +138,7 @@ class LAION(BaseDataset):
     metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
     with_gt: bool = True
-    _size_label: dict = {
+    _size_label: ClassVar[dict] = {
         100_000_000: SizeLabel(100_000_000, "LARGE", 100),
     }
 
@@ -146,7 +148,7 @@ class GIST(BaseDataset):
     dim: int = 960
     metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
-    _size_label: dict = {
+    _size_label: ClassVar[dict] = {
         100_000: SizeLabel(100_000, "SMALL", 1),
         1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
     }
@@ -158,7 +160,7 @@ class Cohere(BaseDataset):
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
     with_gt: bool = True
-    _size_label: dict = {
+    _size_label: ClassVar[dict] = {
         100_000: SizeLabel(100_000, "SMALL", 1),
         1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
         10_000_000: SizeLabel(10_000_000, "LARGE", 10),
@@ -196,7 +198,7 @@ class Bioasq(BaseDataset):
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
     with_gt: bool = True
-    _size_label: dict = {
+    _size_label: ClassVar[dict] = {
         1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
         10_000_000: SizeLabel(10_000_000, "LARGE", 10),
     }
@@ -232,7 +234,7 @@ class Glove(BaseDataset):
     dim: int = 200
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = False
-    _size_label: dict = {1_000_000: SizeLabel(1_000_000, "MEDIUM", 1)}
+    _size_label: ClassVar[dict] = {1_000_000: SizeLabel(1_000_000, "MEDIUM", 1)}
 
 
 class SIFT(BaseDataset):
@@ -240,7 +242,7 @@ class SIFT(BaseDataset):
     dim: int = 128
     metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
-    _size_label: dict = {
+    _size_label: ClassVar[dict] = {
         500_000: SizeLabel(
             500_000,
             "SMALL",
@@ -257,7 +259,7 @@ class OpenAI(BaseDataset):
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
     with_gt: bool = True
-    _size_label: dict = {
+    _size_label: ClassVar[dict] = {
         50_000: SizeLabel(50_000, "SMALL", 1),
         500_000: SizeLabel(500_000, "MEDIUM", 1),
         5_000_000: SizeLabel(5_000_000, "LARGE", 10),
diff --git a/vectordb_bench/base.py b/vectordb_bench/base.py
index 502d5fa49..401d2086d 100644
--- a/vectordb_bench/base.py
+++ b/vectordb_bench/base.py
@@ -1,5 +1,6 @@
 from pydantic import BaseModel as PydanticBaseModel
+from pydantic import ConfigDict
 
 
-class BaseModel(PydanticBaseModel, arbitrary_types_allowed=True):
-    pass
+class BaseModel(PydanticBaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
diff --git a/vectordb_bench/frontend/components/custom/getCustomConfig.py b/vectordb_bench/frontend/components/custom/getCustomConfig.py
index a1ddfb737..03831a83c 100644
--- a/vectordb_bench/frontend/components/custom/getCustomConfig.py
+++ b/vectordb_bench/frontend/components/custom/getCustomConfig.py
@@ -62,14 +62,16 @@ def get_custom_streaming_configs():
 
 def save_custom_configs(custom_configs: list[CustomDatasetConfig]):
     with open(config.CUSTOM_CONFIG_DIR, "w") as f:
-        json.dump([custom_config.dict() for custom_config in custom_configs], f, indent=4)
+        json.dump([custom_config.model_dump() for custom_config in custom_configs], f, indent=4)
 
 
 def save_all_custom_configs(
     performance_configs: list[CustomCaseConfig], streaming_configs: list[CustomStreamingCaseConfig]
 ):
     """Save both performance and streaming configs to the same JSON file"""
-    all_configs = [config.dict() for config in performance_configs] + [config.dict() for config in streaming_configs]
+    all_configs = [config.model_dump() for config in performance_configs] + [
+        config.model_dump() for config in streaming_configs
+    ]
     with open(config.CUSTOM_CONFIG_DIR, "w") as f:
         json.dump(all_configs, f, indent=4)
diff --git a/vectordb_bench/frontend/config/dbCaseConfigs.py b/vectordb_bench/frontend/config/dbCaseConfigs.py
index 387f7fb4a..d15c4e7ee 100644
--- a/vectordb_bench/frontend/config/dbCaseConfigs.py
+++ b/vectordb_bench/frontend/config/dbCaseConfigs.py
@@ -119,7 +119,7 @@ def get_custom_case_items() -> list[UICaseItem]:
                 CaseConfig(
                     case_id=CaseType.PerformanceCustomDataset,
                     custom_case={
-                        **custom_config.dict(),
+                        **custom_config.model_dump(),
                         "use_filter": False,
                     },
                 )
@@ -140,7 +140,7 @@ def get_custom_case_items() -> list[UICaseItem]:
                 CaseConfig(
                     case_id=CaseType.PerformanceCustomDataset,
                     custom_case={
-                        **custom_config.dict(),
+                        **custom_config.model_dump(),
                         "use_filter": True,
                         "label_percentage": label_percentage,
                     },
@@ -174,7 +174,7 @@ def get_custom_streaming_case_items() -> list[UICaseItem]:
                 case_id=CaseType.StreamingCustomDataset,
                 custom_case={
                     "description": custom_config.description,
-                    "dataset_config": custom_config.dataset_config.dict(),
+                    "dataset_config": custom_config.dataset_config.model_dump(),
                 },
             )
         ],
diff --git a/vectordb_bench/frontend/pages/qps_recall.py b/vectordb_bench/frontend/pages/qps_recall.py
index 27f9c4691..fb8f680c5 100644
--- a/vectordb_bench/frontend/pages/qps_recall.py
+++ b/vectordb_bench/frontend/pages/qps_recall.py
@@ -43,7 +43,10 @@ def case_results_filter(case_result: CaseResult) -> bool:
         case = case_result.task_config.case_config.case
         return case.label == CaseLabel.Performance and case.filters.type == FilterOp.NonFilter
 
-    default_selected_task_labels = ["standard_2025"]
+    default_selected_task_labels = ["standard_20260403", "standard_20250519"]
+    # Filter defaults to only include labels that exist in results
+    available_labels = {r.task_label for r in allResults}
+    default_selected_task_labels = [l for l in default_selected_task_labels if l in available_labels]
     shownData, failedTasks, showCaseNames = getshownData(
         resultSelectorContainer,
         allResults,
diff --git a/vectordb_bench/frontend/pages/results.py b/vectordb_bench/frontend/pages/results.py
index a146f2fdc..216029bb1 100644
--- a/vectordb_bench/frontend/pages/results.py
+++ b/vectordb_bench/frontend/pages/results.py
@@ -32,7 +32,7 @@ def main():
     st.caption(
         "Choose your desired test results to display from the sidebar. "
         "For your reference, we've included two standard benchmarks tested by our team. "
-        "Note that `standard_2025` was tested in 2025; the others in 2023. "
+        "Note that `standard_20260403` is the latest benchmark; the others were tested in 2023-2025. "
         "Unless explicitly labeled as distributed multi-node, test with single-node mode by default."
     )
     st.caption("We welcome community contributions for better results, parameter configurations, and optimizations.")
diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py
index cdc64b9d7..a7e7c09f1 100644
--- a/vectordb_bench/models.py
+++ b/vectordb_bench/models.py
@@ -205,7 +205,7 @@ def k(self, value):
     '''
 
     def __hash__(self) -> int:
-        return hash(self.json())
+        return hash(self.model_dump_json())
 
     @property
     def case(self) -> Case:
@@ -314,7 +314,7 @@ def write_db_file(self, result_dir: pathlib.Path, partial: Self, db: str):
 
         log.info(f"write results to disk {result_file}")
         with pathlib.Path(result_file).open("w") as f:
-            b = partial.json(exclude={"db_config": {"password", "api_key"}})
+            b = partial.model_dump_json(exclude={"db_config": {"password", "api_key"}})
             f.write(b)
 
     def get_case_config(case_config: CaseConfig) -> dict[CaseConfig]:
@@ -381,7 +381,7 @@ def read_file(cls, full_path: pathlib.Path, trans_unit: bool = False) -> Self:
             else:
                 # Default to 0 for older result files that don't have P95 data
                 case_result["metrics"]["serial_latency_p95"] = 0.0
-        return TestResult.validate(test_result)
+        return TestResult.model_validate(test_result)
 
     def display(self, dbs: list[DB] | None = None):
         filter_list = dbs if dbs and isinstance(dbs, list) else None
diff --git a/vectordb_bench/restful/format_res.py b/vectordb_bench/restful/format_res.py
index 2e289ec3b..326986319 100644
--- a/vectordb_bench/restful/format_res.py
+++ b/vectordb_bench/restful/format_res.py
@@ -63,7 +63,7 @@ def format_results(test_results: list[TestResult], task_label: str) -> list[dict
                 db_label=task_config.db_config.db_label,
                 version=task_config.db_config.version,
                 note=task_config.db_config.note,
-                params=task_config.db_case_config.dict(),
+                params=task_config.db_case_config.model_dump(),
                 case_name=case.name,
                 dataset=dataset.full_name,
                 dim=dataset.dim,
@@ -71,6 +71,6 @@ def format_results(test_results: list[TestResult], task_label: str) -> list[dict
                 filter_rate=filter_.filter_rate,
                 k=task_config.case_config.k,
                 **metrics,
-            ).dict()
+            ).model_dump()
         )
     return results
diff --git a/vectordb_bench/results/getLeaderboardData.py b/vectordb_bench/results/getLeaderboardData.py
index aef024bdc..1650a6f87 100644
--- a/vectordb_bench/results/getLeaderboardData.py
+++ b/vectordb_bench/results/getLeaderboardData.py
@@ -1,14 +1,16 @@
-from vectordb_bench import config
-import ujson
 import pathlib
+from datetime import datetime
+
+import ujson
+
+from vectordb_bench import config
 from vectordb_bench.backend.cases import CaseType
 from vectordb_bench.backend.clients import DB
 from vectordb_bench.frontend.config.dbPrices import DB_DBLABEL_TO_PRICE
 from vectordb_bench.interface import benchMarkRunner
 from vectordb_bench.models import ResultLabel, TestResult
-from datetime import datetime
 
-taskLabelToCode = {
+task_label_to_code = {
     ResultLabel.FAILED: -1,
     ResultLabel.OUTOFRANGE: -2,
     ResultLabel.NORMAL: 1,
@@ -18,15 +20,14 @@
 def format_time(ts: float) -> str:
     default_standard_test_time = datetime(2023, 8, 1)
     t = datetime.fromtimestamp(ts)
-    if t < default_standard_test_time:
-        t = default_standard_test_time
+    t = max(t, default_standard_test_time)
     return t.strftime("%Y-%m")
 
 
 def main():
-    allResults: list[TestResult] = benchMarkRunner.get_results()
+    all_results: list[TestResult] = benchMarkRunner.get_results()
 
-    if allResults is not None:
+    if all_results is not None:
         data = [
             {
                 "db": d.task_config.db.value,
@@ -36,18 +37,16 @@ def main():
                 "qps": d.metrics.qps,
                 "latency": d.metrics.serial_latency_p99,
                 "recall": d.metrics.recall,
-                "label": taskLabelToCode[d.label],
+                "label": task_label_to_code[d.label],
                 "note": d.task_config.db_config.note,
                 "version": d.task_config.db_config.version,
                 "test_time": format_time(test_result.timestamp),
             }
-            for test_result in allResults
+            for test_result in all_results
             if "standard" in test_result.task_label
             for d in test_result.results
-            if d.task_config.case_config.case_id != CaseType.CapacityDim128
-            and d.task_config.case_config.case_id != CaseType.CapacityDim960
-            if d.task_config.db != DB.ZillizCloud
-            or test_result.timestamp >= datetime(2024, 1, 1).timestamp()
+            if d.task_config.case_config.case_id not in {CaseType.CapacityDim128, CaseType.CapacityDim960}
+            if d.task_config.db != DB.ZillizCloud or test_result.timestamp >= datetime(2024, 1, 1).timestamp()
         ]
 
     # compute qp$
@@ -58,7 +57,7 @@ def main():
             price = DB_DBLABEL_TO_PRICE.get(db, {}).get(db_label, 0)
             d["qp$"] = (qps / price * 3600) if price > 0 else 0.0
 
-    with open(pathlib.Path(config.RESULTS_LOCAL_DIR, "leaderboard.json"), "w") as f:
+    with pathlib.Path(config.RESULTS_LOCAL_DIR, "leaderboard.json").open("w") as f:
         ujson.dump(data, f)
 
 
diff --git a/vectordb_bench/results/getLeaderboardDataV2.py b/vectordb_bench/results/getLeaderboardDataV2.py
index 62440886f..188d9876f 100644
--- a/vectordb_bench/results/getLeaderboardDataV2.py
+++ b/vectordb_bench/results/getLeaderboardDataV2.py
@@ -1,17 +1,14 @@
 import json
 import logging
+import pathlib
 
-
+from vectordb_bench import config
 from vectordb_bench.backend.cases import CaseType, StreamingPerformanceCase
-from vectordb_bench.backend.clients import DB
+from vectordb_bench.interface import BenchMarkRunner
 from vectordb_bench.models import CaseResult
-from vectordb_bench import config
-import numpy as np
 
 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
 
-from vectordb_bench.interface import BenchMarkRunner
-
 
 def get_standard_2025_results() -> list[CaseResult]:
     all_results = BenchMarkRunner.get_results()
@@ -23,7 +20,7 @@ def get_standard_2025_results() -> list[CaseResult]:
 
 
 def save_to_json(data: list[dict], file_name: str):
-    with open(file_name, "w") as f:
+    with pathlib.Path(file_name).open("w") as f:
         json.dump(data, f, indent=4)
 
 
@@ -56,11 +53,11 @@ def main():
                 }
             )
         else:
-            case: StreamingPerformanceCase = case
+            streaming_case: StreamingPerformanceCase = case
             # use 90p search stage results to represent streaming performance
             qps_90p = metrics.st_max_qps_list_list[metrics.st_search_stage_list.index(90)]
             latency_90p = metrics.st_serial_latency_p99_list[metrics.st_search_stage_list.index(90)]
-            insert_rate = case.insert_rate
+            insert_rate = streaming_case.insert_rate
             streaming_data.append(
                 {
                     "dataset": dataset,