Skip to content

Commit a826631

Browse files
Caroline-an777alwayslove2013
authored and committed
add label control to custom
1 parent 1337461 commit a826631

7 files changed

Lines changed: 83 additions & 34 deletions

File tree

vectordb_bench/backend/cases.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,8 @@ class PerformanceCustomDataset(PerformanceCase):
339339
description: str = ""
340340
gt_file: str
341341
dataset: DatasetManager
342+
label_percentage: float | None = None
343+
use_filter: bool
342344

343345
def __init__(
344346
self,
@@ -347,6 +349,8 @@ def __init__(
347349
load_timeout: float,
348350
optimize_timeout: float,
349351
dataset_config: dict,
352+
label_percentage: float | None = None,
353+
use_filter: bool = False,
350354
**kwargs,
351355
):
352356
dataset_config = CustomDatasetConfig(**dataset_config)
@@ -365,6 +369,7 @@ def __init__(
365369
train_vector_field=dataset_config.train_col_name,
366370
test_vector_field=dataset_config.test_col_name,
367371
gt_neighbors_field=dataset_config.gt_col_name,
372+
scalar_labels_file=f"{dataset_config.scalar_labels_name}.parquet",
368373
)
369374
super().__init__(
370375
name=name,
@@ -373,10 +378,14 @@ def __init__(
373378
optimize_timeout=optimize_timeout,
374379
gt_file=f"{dataset_config.gt_name}.parquet",
375380
dataset=DatasetManager(data=dataset),
381+
use_filter=use_filter,
382+
label_percentage=label_percentage,
376383
)
377384

378385
@property
379386
def filters(self) -> Filter:
387+
if self.use_filter is True:
388+
return LabelFilter(label_percentage=self.label_percentage)
380389
return NonFilter(gt_file_name=self.gt_file)
381390

382391

vectordb_bench/backend/dataset.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ class CustomDataset(BaseDataset):
9696
gt_file: str = "neighbors.parquet"
9797
test_vector_field: str = "emb"
9898
gt_neighbors_field: str = "neighbors_id"
99+
with_scalar_labels: bool = True
100+
scalar_labels_file_separated: bool = True
101+
scalar_labels_file: str = "scalar_labels.parquet"
102+
label_percentages: list[float] = []
99103

100104
@validator("size")
101105
def verify_size(cls, v: int):
@@ -115,16 +119,13 @@ def file_count(self) -> int:
115119

116120
@property
117121
def train_files(self) -> list[str]:
118-
train_file, train_count = self.train_file, self.file_count
122+
train_file = self.train_file
119123
prefix = f"{train_file}"
120124
train_files = []
121-
if train_count > 1:
122-
prefix_s = [item.strip() for item in prefix.split(",") if item.strip()]
123-
for i in range(train_count):
124-
sub_file = f"{prefix_s[i]}.parquet"
125-
train_files.append(sub_file)
126-
else:
127-
train_files.append(f"{prefix}.parquet")
125+
prefix_s = [item.strip() for item in prefix.split(",") if item.strip()]
126+
for i in range(len(prefix_s)):
127+
sub_file = f"{prefix_s[i]}.parquet"
128+
train_files.append(sub_file)
128129
return train_files
129130

130131

vectordb_bench/backend/task_runner.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -348,12 +348,7 @@ def display(self) -> None:
348348
fmt.append(DATA_FORMAT % ("-" * 11, "-" * 12, "-" * 20, "-" * 7, "-" * 7))
349349

350350
for f in self.case_runners:
351-
if f.ca.filter_rate != 0.0:
352-
filters = f.ca.filter_rate
353-
elif f.ca.filter_size != 0:
354-
filters = f.ca.filter_size
355-
else:
356-
filters = "None"
351+
filters = f.ca.filters.filter_rate
357352

358353
ds_str = f"{f.ca.dataset.data.name}-{f.ca.dataset.data.label}-{utils.numerize(f.ca.dataset.data.size)}"
359354
fmt.append(

vectordb_bench/frontend/components/custom/displayCustomCase.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key):
1212
"Folder Path", key=f"{key}_dir", value=customCase.dataset_config.dir
1313
)
1414

15-
columns = st.columns(4)
15+
columns = st.columns(3)
1616
customCase.dataset_config.dim = columns[0].number_input(
1717
"dim", key=f"{key}_dim", value=customCase.dataset_config.dim
1818
)
@@ -22,19 +22,12 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key):
2222
customCase.dataset_config.metric_type = columns[2].selectbox(
2323
"metric type", key=f"{key}_metric_type", options=["L2", "Cosine", "IP"]
2424
)
25-
customCase.dataset_config.file_count = columns[3].number_input(
26-
"train file count",
27-
key=f"{key}_file_count",
28-
value=customCase.dataset_config.file_count,
29-
help="if train file count is more than one, please input all your train file name and split with ','",
30-
)
3125

3226
columns = st.columns(3)
3327
customCase.dataset_config.train_name = columns[0].text_input(
3428
"train file name",
3529
key=f"{key}_train_name",
3630
value=customCase.dataset_config.train_name,
37-
help="if your file and column in the file is not named as previous explanation, please input the real name (for example: if the file name is `tr.parquet` and column name is `embbb`, then input tr and embbb)",
3831
)
3932
customCase.dataset_config.test_name = columns[1].text_input(
4033
"test file name", key=f"{key}_test_name", value=customCase.dataset_config.test_name
@@ -57,12 +50,23 @@ def displayCustomCase(customCase: CustomCaseConfig, st, key):
5750
"ground truth emb name", key=f"{key}_gt_col_name", value=customCase.dataset_config.gt_col_name
5851
)
5952

60-
columns = st.columns(4)
61-
customCase.dataset_config.use_shuffled = columns[0].checkbox(
62-
"use shuffled data", key=f"{key}_use_shuffled", value=customCase.dataset_config.use_shuffled
53+
columns = st.columns(2)
54+
customCase.dataset_config.scalar_labels_name = columns[0].text_input(
55+
"scalar labels file name",
56+
key=f"{key}_scalar_labels_file_name",
57+
value=customCase.dataset_config.scalar_labels_name,
6358
)
64-
customCase.dataset_config.with_gt = columns[1].checkbox(
65-
"with groundtruth", key=f"{key}_with_gt", value=customCase.dataset_config.with_gt
59+
default_label_percentages = ",".join(map(str, customCase.dataset_config.with_label_percentages))
60+
label_percentage_input = columns[1].text_input(
61+
"label percentages",
62+
key=f"{key}_label_percantages",
63+
value=default_label_percentages,
6664
)
65+
try:
66+
customCase.dataset_config.label_percentages = [
67+
float(item.strip()) for item in label_percentage_input.split(",") if item.strip()
68+
]
69+
except ValueError as e:
70+
st.write(f"<span style='color:red'>{e},please input correct number</span>", unsafe_allow_html=True)
6771

6872
customCase.description = st.text_area("description", key=f"{key}_description", value=customCase.description)

vectordb_bench/frontend/components/custom/displaypPrams.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,18 @@ def displayParams(st):
22
st.markdown(
33
"""
44
- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
5-
- Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
6-
- Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
7-
- Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
5+
- Vectors data files: The file should have two kinds of columns: `id` as an incrementing `int` and `emb` as an array of `float32`. The name of two columns could be defined on your own.
6+
- Query test vectors: The file could be named on your own and should have two kinds of columns: `id` as an incrementing `int` and `emb` as an array of `float32`. The `id` column must be named as `id`, and `emb` column could be defined on your own.
7+
- Ground truth file: The file could be named on your own and should have two kinds of columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`. The `id` column must be named as `id`, and `neighbors_id` column could be defined on your own.
88
9-
- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
9+
- `Train File Name` - If the number of train files is `more than one`, please input all your train file names and `split with ','`, without the `.parquet` file extension. For example, if there are two train files named `train1.parquet` and `train2.parquet`, then input `train1,train2`.
10+
11+
- `Ground Truth Emb Name` - No matter whether filter file is applied or not, the `neighbors_id` column in ground truth file must have the same name.
12+
13+
- `Scalar Labels File Name` - If there is a scalar labels file, please input the filename without the `.parquet` extension. The file should have two columns: `id` as an incrementing `int` and `labels` as an array of `string`. The `id` column must correspond one-to-one with the `id` column in the train file.
14+
15+
- `Label Percentages` - If you have a filter file, please input the label percentage(s) you want to run, `split with ','` when there is `more than one`. If you `don't have` a filter file, then `leave the text empty.`
1016
11-
- `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
1217
"""
1318
)
1419
st.caption(

vectordb_bench/frontend/components/custom/getCustomConfig.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ class CustomDatasetConfig(BaseModel):
2121
train_col_name: str = "emb"
2222
test_col_name: str = "emb"
2323
gt_col_name: str = "neighbors_id"
24+
scalar_labels_name: str = "scalar_labels"
25+
label_percentages: list[str] = []
26+
with_label_percentages: list[float] = [0.001, 0.02, 0.5]
2427

2528

2629
class CustomCaseConfig(BaseModel):

vectordb_bench/frontend/config/dbCaseConfigs.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,14 +114,46 @@ def get_custom_case_items() -> list[UICaseItem]:
114114
custom_configs = get_custom_configs()
115115
return [
116116
UICaseItem(
117+
label=f"{custom_config.dataset_config.name} - None Filter",
118+
description=(
119+
f"[Batch Cases] This case tests the search performance of a vector database with your own dataset, at varying parallel levels. "
120+
f"Results will show index building time, recall, and maximum QPS."
121+
),
117122
cases=[
118123
CaseConfig(
119124
case_id=CaseType.PerformanceCustomDataset,
120-
custom_case=custom_config.dict(),
125+
custom_case={
126+
**custom_config.dict(),
127+
"use_filter": False,
128+
},
121129
)
122-
]
130+
],
123131
)
124132
for custom_config in custom_configs
133+
] + [
134+
UICaseItem(
135+
label=f"{custom_config.dataset_config.name} - Filter",
136+
description=(
137+
f'[Batch Cases] This case evaluates search performance under filtering constraints like "color==red". '
138+
f"Vdbbench provides an additional column of randomly distributed labels with fixed proportions, "
139+
f"such as [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]. "
140+
f"Essentially, vdbbench will test each filter label in your own dataset to"
141+
" assess the vector database's search performance across different filtering conditions."
142+
),
143+
cases=[
144+
CaseConfig(
145+
case_id=CaseType.PerformanceCustomDataset,
146+
custom_case={
147+
**custom_config.dict(),
148+
"use_filter": True,
149+
"label_percentage": label_percentage,
150+
},
151+
)
152+
for label_percentage in custom_config.dataset_config.label_percentages
153+
],
154+
)
155+
for custom_config in custom_configs
156+
if custom_config.dataset_config.label_percentages
125157
]
126158

127159

0 commit comments

Comments
 (0)