Update normalization parameters and add estimator params validation (#210)

avolkov-intel · web-flow · commit 6d977d872160 · 2026-05-12T21:56:30.000+02:00
* Update preprocessing args

* Update scaling logic

* Fix scaling

* Add gisette normalization in SVM config

* Add estimator parameters filter

* Code format

* Minor fixes

* Update if condition

* Fix

* Update return statement
diff --git a/configs/BENCH-CONFIG-SPEC.md b/configs/BENCH-CONFIG-SPEC.md
@@ -89,7 +89,7 @@ Configs have the three highest parameter keys:
 | `data`:`id` | None |  | OpenML data id for `fetch_openml` source. |
 | `data`:`preprocessing_kwargs`:`replace_nan` | `median` | `median`, `mean` | Value to replace NaNs in preprocessed data. |
 | `data`:`preprocessing_kwargs`:`category_encoding` | `ordinal` | `ordinal`, `onehot`, `drop`, `ignore` | How to encode categorical features in preprocessed data. |
-| `data`:`preprocessing_kwargs`:`normalize` | False |  | Enables normalization of preprocessed data. |
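+| `data`:`preprocessing_kwargs`:`normalize` | None | None, `mean`, `minmax`, `standard` | Normalization applied to preprocessed data: `mean` centers each feature, `standard` centers and scales each feature to unit variance, `minmax` scales each feature to the [0, 1] range. None disables normalization. |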
 | `data`:`preprocessing_kwargs`:`force_for_sparse` | True |  | Forces preprocessing for sparse data formats. |
 | `data`:`split_kwargs` | Empty `dict` or default split from dataset description |  | Data split parameters for `train_test_split` function. |
 | `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
diff --git a/configs/common/knn.json b/configs/common/knn.json
@@ -8,7 +8,7 @@
                 }
             },
             "data": {
-                "preprocessing_kwargs": { "normalize": true }
+                "preprocessing_kwargs": { "normalize": "standard" }
             }
         },
         "sklearn knn parameters": {
diff --git a/configs/common/svm.json b/configs/common/svm.json
@@ -65,7 +65,7 @@
                     "max_iter": 10000
                 }
             },
-            "data": { "preprocessing_kwargs": { "normalize": true } }
+            "data": { "preprocessing_kwargs": { "normalize": "standard" } }
         },
         "svm clsf parameters": {
             "algorithm": { "estimator_params": { "random_state": 42 } }
diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json
@@ -3,14 +3,14 @@
     "PARAMETERS_SETS": {
         "dbscan datasets": {
             "data": [
-                { "dataset": "cifar", "split_kwargs": { "train_size": 15000 } },
+                { "dataset": "cifar", "split_kwargs": { "train_size": 15000 }, "preprocessing_kwargs": { "normalize": "mean" } },
                 { "dataset": "mnist", "split_kwargs": { "train_size": 40000 } },
                 { "dataset": "sensit", "split_kwargs": { "ignore": true } },
                 { "dataset": "susy", "split_kwargs": { "train_size": 100000 } },
                 {
                     "dataset": "skin_segmentation",
                     "split_kwargs": { "train_size": 100000 },
-                    "preprocessing_kwargs": { "normalize": true }
+                    "preprocessing_kwargs": { "normalize": "standard" }
                 },
                 {
                     "source": "make_blobs",
diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json
@@ -7,15 +7,21 @@
                     {
                         "dataset": "covtype",
                         "split_kwargs": { "ignore": true },
-                        "preprocessing_kwargs": { "normalize": true }
+                        "preprocessing_kwargs": { "normalize": "standard" }
                     },
                     {
-                        "dataset": ["mnist", "gisette"],
+                        "dataset": ["mnist"],
                         "split_kwargs": { "ignore": true }
                     },
+                    {
+                        "dataset": "gisette",
+                        "split_kwargs": { "ignore": true },
+                        "preprocessing_kwargs": { "normalize": "standard" }
+                    },
                     {
                         "dataset": "cifar",
-                        "split_kwargs": { "train_size": 10000, "test_size": null }
+                        "split_kwargs": { "train_size": 10000, "test_size": null },
+                        "preprocessing_kwargs": { "normalize": "mean" }
                     }
                 ]
             },
@@ -28,7 +34,7 @@
                         "shuffle": true,
                         "random_state": 42
                     },
-                    "preprocessing_kwargs": { "normalize": true }
+                    "preprocessing_kwargs": { "normalize": "standard" }
                 },
                 "algorithm": {
                     "estimator_params": { "n_clusters": [2, 50] }
diff --git a/configs/regular/knn.json b/configs/regular/knn.json
@@ -5,7 +5,7 @@
             "data": [
                 { "dataset": "susy", "split_kwargs": { "train_size": 80000, "test_size": 20000 } },
                 { "dataset": "connect" },
-                { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }
+                { "dataset": "gisette", "preprocessing_kwargs": { "normalize": null } }
             ]
         },
         "kd_tree knn classification datasets": {
diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json
@@ -25,7 +25,7 @@
             {
                 "data": {
                     "dataset": "year_prediction_msd",
-                    "preprocessing_kwargs": { "normalize": true },
+                    "preprocessing_kwargs": { "normalize": "standard" },
                     "split_kwargs": { "train_size": 0.5, "test_size": 0.5 }
                 }
             },
diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json
@@ -68,14 +68,16 @@
             {
                 "data": {
                     "dataset": "cifar",
-                    "split_kwargs": { "train_size": 0.1, "test_size": null }
+                    "split_kwargs": { "train_size": 0.1, "test_size": null },
+                    "preprocessing_kwargs": { "normalize": "mean" }
                 },
                 "algorithm": { "estimator_params": {"C": 1e-9} }
             },
             {
                 "data": {
                     "dataset": "gisette",
-                    "split_kwargs": { "train_size": 2000, "test_size": null }
+                    "split_kwargs": { "train_size": 2000, "test_size": null },
+                    "preprocessing_kwargs": { "normalize": "standard" }
                 },
                 "algorithm": { "estimator_params": {"C": 1e1} }
             }
diff --git a/configs/regular/svm.json b/configs/regular/svm.json
@@ -15,7 +15,7 @@
                 "algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
             },
             {
-                "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
+                "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
                 "algorithm": {
                     "estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] }
                 }
@@ -30,7 +30,7 @@
                 "data": {
                     "dataset": "mnist",
                     "split_kwargs": { "train_size": 20000, "test_size": null },
-                    "preprocessing_kwargs": { "normalize": false }
+                    "preprocessing_kwargs": { "normalize" : null }
                 },
                 "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
             }
@@ -45,7 +45,7 @@
                 "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }
             },
             {
-                "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
+                "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
                 "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
             },
             {
@@ -75,7 +75,7 @@
                 "algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } }
             },
             {
-                "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
+                "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
                 "algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } }
             }
         ],
@@ -89,7 +89,7 @@
                 "algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } }
             },
             {
-                "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
+                "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
                 "algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } }
             },
             {
diff --git a/configs/testing/azure-pipelines-ci.json b/configs/testing/azure-pipelines-ci.json
@@ -10,7 +10,7 @@
                     "random_state": 42
                 },
                 "preprocessing_kwargs": {
-                    "normalize": true
+                    "normalize": "standard"
                 }
             },
             "bench": { "n_runs": 5 },
diff --git a/configs/weekly/dbscan.json b/configs/weekly/dbscan.json
@@ -4,14 +4,19 @@
         "high-load dbscan datasets": {
             "data": [
                 {
-                    "dataset": ["cifar", "road_network", "covtype"],
+                    "dataset": "cifar",
                     "split_kwargs": { "ignore": true },
-                    "preprocessing_kwargs": { "normalize": true }
+                    "preprocessing_kwargs": { "normalize": "mean" }
+                },
+                {
+                    "dataset": ["road_network", "covtype"],
+                    "split_kwargs": { "ignore": true },
+                    "preprocessing_kwargs": { "normalize": "standard" }
                 },
                 {
                     "dataset": "susy",
                     "split_kwargs": { "train_size": 800000 },
-                    "preprocessing_kwargs": { "normalize": true }
+                    "preprocessing_kwargs": { "normalize": "standard" }
                 },
                 {
                     "source": "make_blobs",
diff --git a/configs/weekly/kmeans.json b/configs/weekly/kmeans.json
@@ -15,7 +15,7 @@
                 "data": {
                     "dataset": ["susy", "hepmass"],
                     "split_kwargs": { "ignore": true },
-                    "preprocessing_kwargs": { "normalize": true }
+                    "preprocessing_kwargs": { "normalize": "standard" }
                 }
             },
             {
@@ -37,7 +37,8 @@
             {
                 "data": {
                     "dataset": "cifar",
-                    "split_kwargs": { "ignore": true }
+                    "split_kwargs": { "ignore": true },
+                    "preprocessing_kwargs": { "normalize": "mean" }
                 }
             }
         ]
diff --git a/configs/weekly/linear_model.json b/configs/weekly/linear_model.json
@@ -31,7 +31,7 @@
                         "susy"
                     ],
                     "preprocessing_kwargs": {
-                        "normalize": true
+                        "normalize": "standard"
                     },
                     "split_kwargs": { "ignore": true }
                 }
diff --git a/configs/weekly/svm.json b/configs/weekly/svm.json
@@ -28,7 +28,7 @@
                 "data": {
                     "dataset": "mnist",
                     "split_kwargs": { "ignore": true },
-                    "preprocessing_kwargs": { "normalize": false }
+                    "preprocessing_kwargs": { "normalize": null }
                 },
                 "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
             }
diff --git a/configs/weekly/tsne.json b/configs/weekly/tsne.json
@@ -16,9 +16,16 @@
             },
             {
                 "data": {
-                    "dataset": ["sensit", "mnist", "cifar"],
+                    "dataset": ["sensit", "mnist"],
                     "split_kwargs": { "ignore": true }
                 }
+            },
+            {
+                "data": {
+                    "dataset": "cifar",
+                    "split_kwargs": { "ignore": true },
+                    "preprocessing_kwargs": { "normalize": "mean" }
+                }
             }
         ]
     },
diff --git a/requirements.txt b/requirements.txt
@@ -6,6 +6,7 @@ pandas
 tabulate
 fastparquet
 h5py
+openml
 openpyxl
 tqdm
 psutil
diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py
@@ -334,6 +334,48 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
     return acceleration_lines > 0 and fallback_lines == 0
 
 
+def validate_estimator_params(estimator_class, estimator_params: Dict) -> Dict:
+    """Drop estimator parameters that the estimator constructor cannot accept.
+
+    Args:
+        estimator_class: Estimator class whose ``__init__`` signature is inspected.
+        estimator_params: Candidate keyword arguments for the constructor.
+
+    Returns:
+        ``estimator_params`` itself when ``__init__`` takes ``**kwargs`` or when
+        validation raises; otherwise a new dict holding only the supported keys.
+    """
+    try:
+        ctor_args = inspect.signature(estimator_class.__init__).parameters
+
+        # A **kwargs catch-all can absorb any keyword, so nothing is filtered.
+        for spec in ctor_args.values():
+            if spec.kind == inspect.Parameter.VAR_KEYWORD:
+                return estimator_params
+
+        # Every named constructor argument except ``self`` is an accepted key.
+        accepted = {name for name in ctor_args if name != "self"}
+
+        supported = {}
+        for key, value in estimator_params.items():
+            if key not in accepted:
+                # Unsupported keys are reported and skipped rather than failing.
+                logger.warning(
+                    f"Parameter '{key}' is not supported by "
+                    f"{estimator_class.__name__} and will be ignored"
+                )
+                continue
+            supported[key] = value
+        return supported
+
+    except Exception as err:
+        # Best effort: on any failure log it and hand back the input untouched.
+        logger.debug(
+            f"Could not validate parameters for {estimator_class.__name__}: {err}"
+        )
+        return estimator_params
+
+
 def create_online_function(method_instance, data_args, batch_size):
     n_batches = data_args[0].shape[0] // batch_size
 
@@ -491,6 +533,9 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
         bench_case, "algorithm:estimator_params", dict()
     )
 
+    # validate and filter estimator parameters
+    estimator_params = validate_estimator_params(estimator_class, estimator_params)
+
     # get estimator methods for measurement
     estimator_methods = get_estimator_methods(bench_case)
 
diff --git a/sklbench/datasets/common.py b/sklbench/datasets/common.py
@@ -23,7 +23,12 @@
 import pandas as pd
 from scipy.sparse import csr_matrix
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
+from sklearn.preprocessing import (
+    MinMaxScaler,
+    OneHotEncoder,
+    OrdinalEncoder,
+    StandardScaler,
+)
 
 from ..utils.custom_types import Array
 from ..utils.logger import logger
@@ -167,7 +172,7 @@ def preprocess_x(
     x: Array,
     replace_nan="auto",
     category_encoding="ordinal",
-    normalize=False,
+    normalize=None,
     force_for_sparse=True,
     **kwargs,
 ) -> Array:
@@ -219,11 +224,24 @@ def preprocess_x(
             pass
         else:
             logger.warning(f'Unknown "{category_encoding}" category encoding type.')
-    # Mean-Standard normalization
+    # Normalization
     if normalize:
-        x = (x - x.mean()) / x.std()
+        # Pick the scaler for the requested type. `scaler` stays None for an
+        # unknown type, so the data is left unscaled after the warning.
+        scaler = None
+        if normalize == "standard":
+            scaler = StandardScaler(with_mean=True, with_std=True)
+        elif normalize == "mean":
+            # Centering only: subtract the mean, keep the original scale.
+            scaler = StandardScaler(with_mean=True, with_std=False)
+        elif normalize == "minmax":
+            scaler = MinMaxScaler(feature_range=(0, 1))
+        else:
+            logger.warning(f'Unknown "{normalize}" normalization type.')
+        if scaler is not None:
+            x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)
     if return_type == np.ndarray:
-        return x.values
+        return np.array(x)
     else:
         return x
 
diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`}`
`9`	`9`	`},`
`10`	`10`	`"data": {`
`11`		`- "preprocessing_kwargs": { "normalize": true }`
	`11`	`+ "preprocessing_kwargs": { "normalize": "standard" }`
`12`	`12`	`}`
`13`	`13`	`},`
`14`	`14`	`"sklearn knn parameters": {`
Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@`
`65`	`65`	`"max_iter": 10000`
`66`	`66`	`}`
`67`	`67`	`},`
`68`		`- "data": { "preprocessing_kwargs": { "normalize": true } }`
	`68`	`+ "data": { "preprocessing_kwargs": { "normalize": "standard" } }`
`69`	`69`	`},`
`70`	`70`	`"svm clsf parameters": {`
`71`	`71`	`"algorithm": { "estimator_params": { "random_state": 42 } }`
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`	`"data": [`
`6`	`6`	`{ "dataset": "susy", "split_kwargs": { "train_size": 80000, "test_size": 20000 } },`
`7`	`7`	`{ "dataset": "connect" },`
`8`		`- { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }`
	`8`	`+ { "dataset": "gisette", "preprocessing_kwargs": { "normalize": null } }`
`9`	`9`	`]`
`10`	`10`	`},`
`11`	`11`	`"kd_tree knn classification datasets": {`
Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@`
`25`	`25`	`{`
`26`	`26`	`"data": {`
`27`	`27`	`"dataset": "year_prediction_msd",`
`28`		`- "preprocessing_kwargs": { "normalize": true },`
	`28`	`+ "preprocessing_kwargs": { "normalize": "standard" },`
`29`	`29`	`"split_kwargs": { "train_size": 0.5, "test_size": 0.5 }`
`30`	`30`	`}`
`31`	`31`	`},`
Original file line number	Diff line number	Diff line change
`@@ -68,14 +68,16 @@`
`68`	`68`	`{`
`69`	`69`	`"data": {`
`70`	`70`	`"dataset": "cifar",`
`71`		`- "split_kwargs": { "train_size": 0.1, "test_size": null }`
	`71`	`+ "split_kwargs": { "train_size": 0.1, "test_size": null },`
	`72`	`+ "preprocessing_kwargs": { "normalize": "mean" }`
`72`	`73`	`},`
`73`	`74`	`"algorithm": { "estimator_params": {"C": 1e-9} }`
`74`	`75`	`},`
`75`	`76`	`{`
`76`	`77`	`"data": {`
`77`	`78`	`"dataset": "gisette",`
`78`		`- "split_kwargs": { "train_size": 2000, "test_size": null }`
	`79`	`+ "split_kwargs": { "train_size": 2000, "test_size": null },`
	`80`	`+ "preprocessing_kwargs": { "normalize": "standard" }`
`79`	`81`	`},`
`80`	`82`	`"algorithm": { "estimator_params": {"C": 1e1} }`
`81`	`83`	`}`
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@`
`15`	`15`	`"algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }`
`16`	`16`	`},`
`17`	`17`	`{`
`18`		`- "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },`
	`18`	`+ "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },`
`19`	`19`	`"algorithm": {`
`20`	`20`	`"estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] }`
`21`	`21`	`}`
`@@ -30,7 +30,7 @@`
`30`	`30`	`"data": {`
`31`	`31`	`"dataset": "mnist",`
`32`	`32`	`"split_kwargs": { "train_size": 20000, "test_size": null },`
`33`		`- "preprocessing_kwargs": { "normalize": false }`
	`33`	`+ "preprocessing_kwargs": { "normalize" : null }`
`34`	`34`	`},`
`35`	`35`	`"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }`
`36`	`36`	`}`
`@@ -45,7 +45,7 @@`
`45`	`45`	`"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }`
`46`	`46`	`},`
`47`	`47`	`{`
`48`		`- "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },`
	`48`	`+ "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },`
`49`	`49`	`"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }`
`50`	`50`	`},`
`51`	`51`	`{`
`@@ -75,7 +75,7 @@`
`75`	`75`	`"algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } }`
`76`	`76`	`},`
`77`	`77`	`{`
`78`		`- "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },`
	`78`	`+ "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },`
`79`	`79`	`"algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } }`
`80`	`80`	`}`
`81`	`81`	`],`
`@@ -89,7 +89,7 @@`
`89`	`89`	`"algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } }`
`90`	`90`	`},`
`91`	`91`	`{`
`92`		`- "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },`
	`92`	`+ "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },`
`93`	`93`	`"algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } }`
`94`	`94`	`},`
`95`	`95`	`{`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@`
`10`	`10`	`"random_state": 42`
`11`	`11`	`},`
`12`	`12`	`"preprocessing_kwargs": {`
`13`		`- "normalize": true`
	`13`	`+ "normalize": "standard"`
`14`	`14`	`}`
`15`	`15`	`},`
`16`	`16`	`"bench": { "n_runs": 5 },`