Skip to content

Commit a826631

Browse files
Caroline-an777alwayslove2013
authored and committed
add label control to custom
1 parent 1337461 commit a826631

7 files changed

Lines changed: 83 additions & 34 deletions

File tree

vectordb_bench/backend/cases.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,8 @@ class PerformanceCustomDataset(PerformanceCase):
339339
description: str = ""
340340
gt_file: str
341341
dataset: DatasetManager
342+
label_percentage: float | None = None
343+
use_filter: bool
342344

343345
def __init__(
344346
self,
@@ -347,6 +349,8 @@ def __init__(
347349
load_timeout: float,
348350
optimize_timeout: float,
349351
dataset_config: dict,
352+
label_percentage: float | None = None,
353+
use_filter: bool = False,
350354
**kwargs,
351355
):
352356
dataset_config = CustomDatasetConfig(**dataset_config)
@@ -365,6 +369,7 @@ def __init__(
365369
train_vector_field=dataset_config.train_col_name,
366370
test_vector_field=dataset_config.test_col_name,
367371
gt_neighbors_field=dataset_config.gt_col_name,
372+
scalar_labels_file=f"{dataset_config.scalar_labels_name}.parquet",
368373
)
369374
super().__init__(
370375
name=name,
@@ -373,10 +378,14 @@ def __init__(
373378
optimize_timeout=optimize_timeout,
374379
gt_file=f"{dataset_config.gt_name}.parquet",
375380
dataset=DatasetManager(data=dataset),
381+
use_filter=use_filter,
382+
label_percentage=label_percentage,
376383
)
377384

378385
@property
379386
def filters(self) -> Filter:
387+
if self.use_filter is True:
388+
return LabelFilter(label_percentage=self.label_percentage)
380389
return NonFilter(gt_file_name=self.gt_file)
381390

382391

vectordb_bench/backend/dataset.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ class CustomDataset(BaseDataset):
9696
gt_file: str = "neighbors.parquet"
9797
test_vector_field: str = "emb"
9898
gt_neighbors_field: str = "neighbors_id"
99+
with_scalar_labels: bool = True
100+
scalar_labels_file_separated: bool = True
101+
scalar_labels_file: str = "scalar_labels.parquet"
102+
label_percentages: list[float] = []
99103

100104
@validator("size")
101105
def verify_size(cls, v: int):
@@ -115,16 +119,13 @@ def file_count(self) -> int:
115119

116120
@property
117121
def train_files(self) -> list[str]:
118-
train_file, train_count = self.train_file, self.file_count
122+
train_file = self.train_file
119123
prefix = f"{train_file}"
120124
train_files = []
121-
if train_count > 1:
122-
prefix_s = [item.strip() for item in prefix.split(",") if item.strip()]
123-
for i in range(train_count):
124-
sub_file = f"{prefix_s[i]}.parquet"
125-
train_files.append(sub_file)
126-
else:
127-
train_files.append(f"{prefix}.parquet")
125+
prefix_s = [item.strip() for item in prefix.split(",") if item.strip()]
126+
for i in range(len(prefix_s)):
127+
sub_file = f"{prefix_s[i]}.parquet"
128+
train_files.append(sub_file)
128129
return train_files
129130

130131

vectordb_bench/backend/task_runner.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -348,12 +348,7 @@ def display(self) -> None:
348348
fmt.append(DATA_FORMAT % ("-" * 11, "-" * 12, "-" * 20, "-" * 7, "-" * 7))
349349

350350
for f in self.case_runners:
351-
if f.ca.filter_rate != 0.0:
352-
filters = f.ca.filter_rate
353-
elif f.ca.filter_size != 0:
354-
filters = f.ca.filter_size
355-
else:
356-
filters = "None"
351+
filters = f.ca.filters.filter_rate
357352

358353
ds_str = f"{f.ca.dataset.data.name}-{f.ca.dataset.data.label}-{utils.numerize(f.ca.dataset.data.size)}"
359354
fmt.append(

vectordb_bench/frontend/components/custom/displayCustomCase.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key):
1212
"Folder Path", key=f"{key}_dir", value=customCase.dataset_config.dir
1313
)
1414

15-
columns = st.columns(4)
15+
columns = st.columns(3)
1616
customCase.dataset_config.dim = columns[0].number_input(
1717
"dim", key=f"{key}_dim", value=customCase.dataset_config.dim
1818
)
@@ -22,19 +22,12 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key):
2222
customCase.dataset_config.metric_type = columns[2].selectbox(
2323
"metric type", key=f"{key}_metric_type", options=["L2", "Cosine", "IP"]
2424
)
25-
customCase.dataset_config.file_count = columns[3].number_input(
26-
"train file count",
27-
key=f"{key}_file_count",
28-
value=customCase.dataset_config.file_count,
29-
help="if train file count is more than one, please input all your train file name and split with ','",
30-
)
3125

3226
columns = st.columns(3)
3327
customCase.dataset_config.train_name = columns[0].text_input(
3428
"train file name",
3529
key=f"{key}_train_name",
3630
value=customCase.dataset_config.train_name,
37-
help="if your file and column in the file is not named as previous explanation, please input the real name (for example: if the file name is `tr.parquet` and column name is `embbb`, then input tr and embbb)",
3831
)
3932
customCase.dataset_config.test_name = columns[1].text_input(
4033
"test file name", key=f"{key}_test_name", value=customCase.dataset_config.test_name
@@ -57,12 +50,23 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key):
5750
"ground truth emb name", key=f"{key}_gt_col_name", value=customCase.dataset_config.gt_col_name
5851
)
5952

60-
columns = st.columns(4)
61-
customCase.dataset_config.use_shuffled = columns[0].checkbox(
62-
"use shuffled data", key=f"{key}_use_shuffled", value=customCase.dataset_config.use_shuffled
53+
columns = st.columns(2)
54+
customCase.dataset_config.scalar_labels_name = columns[0].text_input(
55+
"scalar labels file name",
56+
key=f"{key}_scalar_labels_file_name",
57+
value=customCase.dataset_config.scalar_labels_name,
6358
)
64-
customCase.dataset_config.with_gt = columns[1].checkbox(
65-
"with groundtruth", key=f"{key}_with_gt", value=customCase.dataset_config.with_gt
59+
default_label_percentages = ",".join(map(str, customCase.dataset_config.with_label_percentages))
60+
label_percentage_input = columns[1].text_input(
61+
"label percentages",
62+
key=f"{key}_label_percantages",
63+
value=default_label_percentages,
6664
)
65+
try:
66+
customCase.dataset_config.label_percentages = [
67+
float(item.strip()) for item in label_percentage_input.split(",") if item.strip()
68+
]
69+
except ValueError as e:
70+
st.write(f"<span style='color:red'>{e},please input correct number</span>", unsafe_allow_html=True)
6771

6872
customCase.description = st.text_area("description", key=f"{key}_description", value=customCase.description)

vectordb_bench/frontend/components/custom/displaypPrams.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,18 @@ def displayParams(st):
22
st.markdown(
33
"""
44
- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
5-
- Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
6-
- Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
7-
- Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
5+
- Vectors data files: The file should have two kinds of columns: `id` as an incrementing `int` and `emb` as an array of `float32`. The name of two columns could be defined on your own.
6+
- Query test vectors: The file could be named on your own and should have two kinds of columns: `id` as an incrementing `int` and `emb` as an array of `float32`. The `id` column must be named as `id`, and `emb` column could be defined on your own.
7+
- Ground truth file: The file could be named on your own and should have two kinds of columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`. The `id` column must be named as `id`, and `neighbors_id` column could be defined on your own.
88
9-
- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
9+
- `Train File Name` - If the number of train files is `more than one`, please input all your train file names and `split with ','`, without the `.parquet` file extension. For example, if there are two train files named `train1.parquet` and `train2.parquet`, then input `train1,train2`.
10+
11+
- `Ground Truth Emb Name` - No matter whether filter file is applied or not, the `neighbors_id` column in ground truth file must have the same name.
12+
13+
- `Scalar Labels File Name` - If there is a scalar labels file, please input the filename without the `.parquet` extension. The file should have two columns: `id` as an incrementing `int` and `labels` as an array of `string`. The `id` column must correspond one-to-one with the `id` column in the train file.
14+
15+
- `Label Percentages` - If you have a filter file, please input the label percentage(s) you want to run, `split with ','` when there is `more than one`. If you `don't have` a filter file, then `leave the text empty.`
1016
11-
- `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
1217
"""
1318
)
1419
st.caption(

vectordb_bench/frontend/components/custom/getCustomConfig.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ class CustomDatasetConfig(BaseModel):
2121
train_col_name: str = "emb"
2222
test_col_name: str = "emb"
2323
gt_col_name: str = "neighbors_id"
24+
scalar_labels_name: str = "scalar_labels"
25+
label_percentages: list[str] = []
26+
with_label_percentages: list[float] = [0.001, 0.02, 0.5]
2427

2528

2629
class CustomCaseConfig(BaseModel):

vectordb_bench/frontend/config/dbCaseConfigs.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,14 +114,46 @@ def get_custom_case_items() -> list[UICaseItem]:
114114
custom_configs = get_custom_configs()
115115
return [
116116
UICaseItem(
117+
label=f"{custom_config.dataset_config.name} - None Filter",
118+
description=(
119+
f"[Batch Cases] This case tests the search performance of a vector database with your own dataset, at varying parallel levels. "
120+
f"Results will show index building time, recall, and maximum QPS."
121+
),
117122
cases=[
118123
CaseConfig(
119124
case_id=CaseType.PerformanceCustomDataset,
120-
custom_case=custom_config.dict(),
125+
custom_case={
126+
**custom_config.dict(),
127+
"use_filter": False,
128+
},
121129
)
122-
]
130+
],
123131
)
124132
for custom_config in custom_configs
133+
] + [
134+
UICaseItem(
135+
label=f"{custom_config.dataset_config.name} - Filter",
136+
description=(
137+
f'[Batch Cases] This case evaluates search performance under filtering constraints like "color==red". '
138+
f"Vdbbench provides an additional column of randomly distributed labels with fixed proportions, "
139+
f"such as [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]. "
140+
f"Essentially, vdbbench will test each filter label in your own dataset to"
141+
" assess the vector database's search performance across different filtering conditions."
142+
),
143+
cases=[
144+
CaseConfig(
145+
case_id=CaseType.PerformanceCustomDataset,
146+
custom_case={
147+
**custom_config.dict(),
148+
"use_filter": True,
149+
"label_percentage": label_percentage,
150+
},
151+
)
152+
for label_percentage in custom_config.dataset_config.label_percentages
153+
],
154+
)
155+
for custom_config in custom_configs
156+
if custom_config.dataset_config.label_percentages
125157
]
126158

127159

0 commit comments

Comments
 (0)