Commit 7f7833c — add medium and large public dataset

Add the medium and large Bioasq datasets to the public dataset list.

Parent: 7dcb725

4 files changed — 21 additions, 0 deletions

File tree

vectordb_bench/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,7 @@ class config:
     LOAD_TIMEOUT_1536D_500K = 24 * 3600  # 24h
     LOAD_TIMEOUT_1536D_5M = 240 * 3600  # 10d
+    LOAD_TIMEOUT_1024D_1M = 24 * 3600  # 24h
     LOAD_TIMEOUT_1024D_10M = 240 * 3600  # 10d

     OPTIMIZE_TIMEOUT_DEFAULT = 24 * 3600  # 24h
@@ -64,6 +65,7 @@ class config:
     OPTIMIZE_TIMEOUT_1536D_500K = 24 * 3600  # 24h
     OPTIMIZE_TIMEOUT_1536D_5M = 240 * 3600  # 10d
+    OPTIMIZE_TIMEOUT_1024D_1M = 24 * 3600  # 24h
     OPTIMIZE_TIMEOUT_1024D_10M = 240 * 3600  # 10d

     def display(self) -> str:

vectordb_bench/backend/cases.py

Lines changed: 14 additions & 0 deletions
@@ -42,6 +42,7 @@ class CaseType(Enum):
     Performance1536D500K99P = 14
     Performance1536D5M99P = 15
+    Performance1024D1M = 17
     Performance1024D10M = 20

     Performance1536D50K = 50
@@ -311,6 +312,18 @@ class Performance1536D5M99P(IntFilterPerformanceCase):
     optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M


+class Performance1024D1M(PerformanceCase):
+    case_id: CaseType = CaseType.Performance1024D1M
+    filter_rate: float | int | None = None
+    dataset: DatasetManager = Dataset.BIOASQ.manager(1_000_000)
+    name: str = "Search Performance Test (1M Dataset, 1024 Dim)"
+    description: str = """This case tests the search performance of a vector database with a medium 1M dataset
+(<b>Bioasq 1M vectors</b>, 1024 dimensions), at varying parallel levels. Results will show index building time,
+recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1024D_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1024D_1M
+
+
 class Performance1024D10M(PerformanceCase):
     case_id: CaseType = CaseType.Performance1024D10M
     filter_rate: float | int | None = None
@@ -511,6 +524,7 @@ def filters(self) -> Filter:
     CaseType.Performance1536D5M1P: Performance1536D5M1P,
     CaseType.Performance1536D500K99P: Performance1536D500K99P,
     CaseType.Performance1536D5M99P: Performance1536D5M99P,
+    CaseType.Performance1024D1M: Performance1024D1M,
     CaseType.Performance1024D10M: Performance1024D10M,
     CaseType.Performance1536D50K: Performance1536D50K,
     CaseType.PerformanceCustomDataset: PerformanceCustomDataset,

vectordb_bench/backend/dataset.py

Lines changed: 3 additions & 0 deletions
@@ -173,6 +173,7 @@ class Bioasq(BaseDataset):
     use_shuffled: bool = config.USE_SHUFFLED_DATA
     with_gt: bool = True
     _size_label: dict = {
+        1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
         10_000_000: SizeLabel(10_000_000, "LARGE", 10),
     }
     with_scalar_labels: bool = True
@@ -390,6 +391,7 @@ class DatasetWithSizeType(Enum):
     CohereSmall = "Small Cohere (768dim, 100K)"
     CohereMedium = "Medium Cohere (768dim, 1M)"
     CohereLarge = "Large Cohere (768dim, 10M)"
+    BioasqMedium = "Medium Bioasq (1024dim, 1M)"
     BioasqLarge = "Large Bioasq (1024dim, 10M)"
     OpenAISmall = "Small OpenAI (1536dim, 50K)"
     OpenAIMedium = "Medium OpenAI (1536dim, 500K)"
@@ -425,6 +427,7 @@ def get_optimize_timeout(self) -> float:
     DatasetWithSizeType.CohereSmall: Dataset.COHERE.manager(100_000),
     DatasetWithSizeType.CohereMedium: Dataset.COHERE.manager(1_000_000),
     DatasetWithSizeType.CohereLarge: Dataset.COHERE.manager(10_000_000),
+    DatasetWithSizeType.BioasqMedium: Dataset.BIOASQ.manager(1_000_000),
     DatasetWithSizeType.BioasqLarge: Dataset.BIOASQ.manager(10_000_000),
     DatasetWithSizeType.OpenAISmall: Dataset.OPENAI.manager(50_000),
     DatasetWithSizeType.OpenAIMedium: Dataset.OPENAI.manager(500_000),

vectordb_bench/frontend/config/dbCaseConfigs.py

Lines changed: 2 additions & 0 deletions
@@ -231,6 +231,7 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
     UICaseItem(cases=generate_normal_cases(CaseType.Performance768D10M)),
     UICaseItem(cases=generate_normal_cases(CaseType.Performance768D1M)),
     UICaseItem(isLine=True),
+    UICaseItem(cases=generate_normal_cases(CaseType.Performance1024D1M)),
     UICaseItem(cases=generate_normal_cases(CaseType.Performance1024D10M)),
     UICaseItem(isLine=True),
     UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D5M)),
@@ -303,6 +304,7 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
     CaseType.Performance1536D5M,
     CaseType.Performance1536D500K,
     CaseType.Performance1536D50K,
+    CaseType.Performance1024D1M,
     CaseType.Performance1024D10M,
     CaseType.CapacityDim960,
     CaseType.CapacityDim128,

0 commit comments