Skip to content

Commit 3db3f5d

Browse files
Caroline-an777 authored and alwayslove2013 committed
add to public
add medium and large public dataset; add medium and large dataset to public; add medium and large to public
1 parent a826631 commit 3db3f5d

4 files changed

Lines changed: 59 additions & 0 deletions

File tree

vectordb_bench/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,9 @@ class config:
5353
LOAD_TIMEOUT_1536D_500K = 24 * 3600 # 24h
5454
LOAD_TIMEOUT_1536D_5M = 240 * 3600 # 10d
5555

56+
LOAD_TIMEOUT_1024D_1M = 24 * 3600 # 24h
57+
LOAD_TIMEOUT_1024D_10M = 240 * 3600 # 10d
58+
5659
OPTIMIZE_TIMEOUT_DEFAULT = 24 * 3600 # 24h
5760
OPTIMIZE_TIMEOUT_768D_100K = 24 * 3600 # 24h
5861
OPTIMIZE_TIMEOUT_768D_1M = 24 * 3600 # 24h
@@ -62,6 +65,9 @@ class config:
6265
OPTIMIZE_TIMEOUT_1536D_500K = 24 * 3600 # 24h
6366
OPTIMIZE_TIMEOUT_1536D_5M = 240 * 3600 # 10d
6467

68+
OPTIMIZE_TIMEOUT_1024D_1M = 24 * 3600 # 24h
69+
OPTIMIZE_TIMEOUT_1024D_10M = 240 * 3600 # 10d
70+
6571
def display(self) -> str:
6672
return [
6773
i

vectordb_bench/backend/cases.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ class CaseType(Enum):
4242
Performance1536D500K99P = 14
4343
Performance1536D5M99P = 15
4444

45+
Performance1024D1M = 17
46+
Performance1024D10M = 20
47+
4548
Performance1536D50K = 50
4649

4750
Custom = 100
@@ -309,6 +312,30 @@ class Performance1536D5M99P(IntFilterPerformanceCase):
309312
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M
310313

311314

315+
class Performance1024D1M(PerformanceCase):
316+
case_id: CaseType = CaseType.Performance1024D1M
317+
filter_rate: float | int | None = None
318+
dataset: DatasetManager = Dataset.BIOASQ.manager(1_000_000)
319+
name: str = "Search Performance Test (1M Dataset, 1024 Dim)"
320+
description: str = """This case tests the search performance of a vector database with a medium 1M dataset
321+
(<b>Bioasq 1M vectors</b>, 1024 dimensions), at varying parallel levels. Results will show index building time,
322+
recall, and maximum QPS."""
323+
load_timeout: float | int = config.LOAD_TIMEOUT_1024D_1M
324+
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1024D_1M
325+
326+
327+
class Performance1024D10M(PerformanceCase):
328+
case_id: CaseType = CaseType.Performance1024D10M
329+
filter_rate: float | int | None = None
330+
dataset: DatasetManager = Dataset.BIOASQ.manager(10_000_000)
331+
name: str = "Search Performance Test (10M Dataset, 1024 Dim)"
332+
description: str = """This case tests the search performance of a vector database with a large 10M dataset
333+
(<b>Bioasq 10M vectors</b>, 1024 dimensions), at varying parallel levels. Results will show index building time,
334+
recall, and maximum QPS."""
335+
load_timeout: float | int = config.LOAD_TIMEOUT_1024D_10M
336+
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1024D_10M
337+
338+
312339
class Performance1536D50K(PerformanceCase):
313340
case_id: CaseType = CaseType.Performance1536D50K
314341
filter_rate: float | int | None = None
@@ -497,6 +524,8 @@ def filters(self) -> Filter:
497524
CaseType.Performance1536D5M1P: Performance1536D5M1P,
498525
CaseType.Performance1536D500K99P: Performance1536D500K99P,
499526
CaseType.Performance1536D5M99P: Performance1536D5M99P,
527+
CaseType.Performance1024D1M: Performance1024D1M,
528+
CaseType.Performance1024D10M: Performance1024D10M,
500529
CaseType.Performance1536D50K: Performance1536D50K,
501530
CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
502531
CaseType.StreamingPerformanceCase: StreamingPerformanceCase,

vectordb_bench/backend/dataset.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,20 @@ class Cohere(BaseDataset):
166166
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
167167

168168

169+
class Bioasq(BaseDataset):
170+
name: str = "Bioasq"
171+
dim: int = 1024
172+
metric_type: MetricType = MetricType.COSINE
173+
use_shuffled: bool = config.USE_SHUFFLED_DATA
174+
with_gt: bool = True
175+
_size_label: dict = {
176+
1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
177+
10_000_000: SizeLabel(10_000_000, "LARGE", 10),
178+
}
179+
with_scalar_labels: bool = True
180+
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
181+
182+
169183
class Glove(BaseDataset):
170184
name: str = "Glove"
171185
dim: int = 200
@@ -361,6 +375,7 @@ class Dataset(Enum):
361375
LAION = LAION
362376
GIST = GIST
363377
COHERE = Cohere
378+
BIOASQ = Bioasq
364379
GLOVE = Glove
365380
SIFT = SIFT
366381
OPENAI = OpenAI
@@ -376,6 +391,8 @@ class DatasetWithSizeType(Enum):
376391
CohereSmall = "Small Cohere (768dim, 100K)"
377392
CohereMedium = "Medium Cohere (768dim, 1M)"
378393
CohereLarge = "Large Cohere (768dim, 10M)"
394+
BioasqMedium = "Medium Bioasq (1024dim, 1M)"
395+
BioasqLarge = "Large Bioasq (1024dim, 10M)"
379396
OpenAISmall = "Small OpenAI (1536dim, 50K)"
380397
OpenAIMedium = "Medium OpenAI (1536dim, 500K)"
381398
OpenAILarge = "Large OpenAI (1536dim, 5M)"
@@ -410,6 +427,8 @@ def get_optimize_timeout(self) -> float:
410427
DatasetWithSizeType.CohereSmall: Dataset.COHERE.manager(100_000),
411428
DatasetWithSizeType.CohereMedium: Dataset.COHERE.manager(1_000_000),
412429
DatasetWithSizeType.CohereLarge: Dataset.COHERE.manager(10_000_000),
430+
DatasetWithSizeType.BioasqMedium: Dataset.BIOASQ.manager(1_000_000),
431+
DatasetWithSizeType.BioasqLarge: Dataset.BIOASQ.manager(10_000_000),
413432
DatasetWithSizeType.OpenAISmall: Dataset.OPENAI.manager(50_000),
414433
DatasetWithSizeType.OpenAIMedium: Dataset.OPENAI.manager(500_000),
415434
DatasetWithSizeType.OpenAILarge: Dataset.OPENAI.manager(5_000_000),

vectordb_bench/frontend/config/dbCaseConfigs.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,9 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
231231
UICaseItem(cases=generate_normal_cases(CaseType.Performance768D10M)),
232232
UICaseItem(cases=generate_normal_cases(CaseType.Performance768D1M)),
233233
UICaseItem(isLine=True),
234+
UICaseItem(cases=generate_normal_cases(CaseType.Performance1024D1M)),
235+
UICaseItem(cases=generate_normal_cases(CaseType.Performance1024D10M)),
236+
UICaseItem(isLine=True),
234237
UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D5M)),
235238
UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D500K)),
236239
UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D50K)),
@@ -301,6 +304,8 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
301304
CaseType.Performance1536D5M,
302305
CaseType.Performance1536D500K,
303306
CaseType.Performance1536D50K,
307+
CaseType.Performance1024D1M,
308+
CaseType.Performance1024D10M,
304309
CaseType.CapacityDim960,
305310
CaseType.CapacityDim128,
306311
]

0 commit comments

Comments (0)