Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/BENCH-CONFIG-SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ Configs have the three highest parameter keys:
| `data`:`id` | None | | OpenML data id for `fetch_openml` source. |
| `data`:`preprocessing_kwargs`:`replace_nan` | `median` | `median`, `mean` | Value to replace NaNs in preprocessed data. |
| `data`:`preprocessing_kwargs`:`category_encoding` | `ordinal` | `ordinal`, `onehot`, `drop`, `ignore` | How to encode categorical features in preprocessed data. |
| `data`:`preprocessing_kwargs`:`normalize` | False | | Enables normalization of preprocessed data. |
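| `data`:`preprocessing_kwargs`:`normalize` | None | None, `mean`, `minmax`, `standard` | Normalization applied to preprocessed data: `mean` centers features, `standard` centers and scales them to unit variance, `minmax` rescales them to the [0, 1] range; None disables normalization. |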
| `data`:`preprocessing_kwargs`:`force_for_sparse` | True | | Forces preprocessing for sparse data formats. |
| `data`:`split_kwargs` | Empty `dict` or default split from dataset description | | Data split parameters for `train_test_split` function. |
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
Expand Down
2 changes: 1 addition & 1 deletion configs/common/knn.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
}
},
"data": {
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
Comment thread
david-cortes-intel marked this conversation as resolved.
}
},
"sklearn knn parameters": {
Expand Down
2 changes: 1 addition & 1 deletion configs/common/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"max_iter": 10000
}
},
"data": { "preprocessing_kwargs": { "normalize": true } }
"data": { "preprocessing_kwargs": { "normalize": "standard" } }
},
"svm clsf parameters": {
"algorithm": { "estimator_params": { "random_state": 42 } }
Expand Down
4 changes: 2 additions & 2 deletions configs/regular/dbscan.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
"PARAMETERS_SETS": {
"dbscan datasets": {
"data": [
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 } },
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 }, "preprocessing_kwargs": { "normalize": "mean" } },
{ "dataset": "mnist", "split_kwargs": { "train_size": 40000 } },
{ "dataset": "sensit", "split_kwargs": { "ignore": true } },
{ "dataset": "susy", "split_kwargs": { "train_size": 100000 } },
{
"dataset": "skin_segmentation",
"split_kwargs": { "train_size": 100000 },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"source": "make_blobs",
Expand Down
14 changes: 10 additions & 4 deletions configs/regular/kmeans.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@
{
"dataset": "covtype",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": ["mnist", "gisette"],
"dataset": ["mnist"],
"split_kwargs": { "ignore": true }
},
{
"dataset" : "gisette",
"split_kwargs" : {"ignore" : true},
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": "cifar",
"split_kwargs": { "train_size": 10000, "test_size": null }
"split_kwargs": { "train_size": 10000, "test_size": null },
"preprocessing_kwargs": { "normalize": "mean" }
}
]
},
Expand All @@ -28,7 +34,7 @@
"shuffle": true,
"random_state": 42
},
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": {
"estimator_params": { "n_clusters": [2, 50] }
Expand Down
2 changes: 1 addition & 1 deletion configs/regular/knn.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"data": [
{ "dataset": "susy", "split_kwargs": { "train_size": 80000, "test_size": 20000 } },
{ "dataset": "connect" },
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": null } }
]
},
"kd_tree knn classification datasets": {
Expand Down
2 changes: 1 addition & 1 deletion configs/regular/linear_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
{
"data": {
"dataset": "year_prediction_msd",
"preprocessing_kwargs": { "normalize": true },
"preprocessing_kwargs": { "normalize": "standard" },
"split_kwargs": { "train_size": 0.5, "test_size": 0.5 }
}
},
Expand Down
9 changes: 6 additions & 3 deletions configs/regular/logreg.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,21 +61,24 @@
{
"data": {
"dataset": "hepmass",
"split_kwargs": { "train_size": 0.1, "test_size": null }
"split_kwargs": { "train_size": 0.1, "test_size": null },
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like this is the only case where benchmark behavior changes - is it intended?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it was done for a reason but let me check the convergence for both options

Copy link
Copy Markdown
Collaborator Author

@avolkov-intel avolkov-intel May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Essentially, there's no effect on the result: accuracy and number of iterations stay the same

"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": { "estimator_params": {"C": 1e-5} }
},
{
"data": {
"dataset": "cifar",
"split_kwargs": { "train_size": 0.1, "test_size": null }
"split_kwargs": { "train_size": 0.1, "test_size": null },
"preprocessing_kwargs": { "normalize": "mean" }
},
"algorithm": { "estimator_params": {"C": 1e-9} }
},
{
"data": {
"dataset": "gisette",
"split_kwargs": { "train_size": 2000, "test_size": null }
"split_kwargs": { "train_size": 2000, "test_size": null },
"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": { "estimator_params": {"C": 1e1} }
}
Expand Down
10 changes: 5 additions & 5 deletions configs/regular/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": {
"estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] }
}
Expand All @@ -30,7 +30,7 @@
"data": {
"dataset": "mnist",
"split_kwargs": { "train_size": 20000, "test_size": null },
"preprocessing_kwargs": { "normalize": false }
"preprocessing_kwargs": {"normalize" : null}
Comment thread
avolkov-intel marked this conversation as resolved.
Outdated
},
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
}
Expand All @@ -45,7 +45,7 @@
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
},
{
Expand Down Expand Up @@ -75,7 +75,7 @@
"algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } }
}
],
Expand All @@ -89,7 +89,7 @@
"algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } }
},
{
Expand Down
2 changes: 1 addition & 1 deletion configs/testing/azure-pipelines-ci.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"random_state": 42
},
"preprocessing_kwargs": {
"normalize": true
"normalize": "standard"
}
},
"bench": { "n_runs": 5 },
Expand Down
11 changes: 8 additions & 3 deletions configs/weekly/dbscan.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,19 @@
"high-load dbscan datasets": {
"data": [
{
"dataset": ["cifar", "road_network", "covtype"],
"dataset" : "cifar",
"split_kwargs": { "ignore" : true },
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"dataset" : "cifar",
"split_kwargs": { "ignore" : true },
"dataset": "cifar",
"split_kwargs": { "ignore": true },

"preprocessing_kwargs": { "normalize": "mean" }
},
{
"dataset": ["road_network", "covtype"],
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": "susy",
"split_kwargs": { "train_size": 800000 },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"source": "make_blobs",
Expand Down
5 changes: 3 additions & 2 deletions configs/weekly/kmeans.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"data": {
"dataset": ["susy", "hepmass"],
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
}
},
{
Expand All @@ -37,7 +37,8 @@
{
"data": {
"dataset": "cifar",
"split_kwargs": { "ignore": true }
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": "mean" }
}
}
]
Expand Down
2 changes: 1 addition & 1 deletion configs/weekly/linear_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"susy"
],
"preprocessing_kwargs": {
"normalize": true
"normalize": "standard"
},
"split_kwargs": { "ignore": true }
}
Expand Down
2 changes: 1 addition & 1 deletion configs/weekly/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"data": {
"dataset": "mnist",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": false }
"preprocessing_kwargs": { "normalize": null }
},
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
}
Expand Down
9 changes: 8 additions & 1 deletion configs/weekly/tsne.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,16 @@
},
{
"data": {
"dataset": ["sensit", "mnist", "cifar"],
"dataset": ["sensit", "mnist"],
"split_kwargs": { "ignore": true }
}
},
{
"data": {
"dataset" : "cifar",
"split_kwargs": { "ignore" : true },
"preprocessing_kwargs": { "normalize": "mean" }
}
}
]
},
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ pandas
tabulate
fastparquet
h5py
openml
Comment thread
david-cortes-intel marked this conversation as resolved.
openpyxl
tqdm
psutil
Expand Down
45 changes: 45 additions & 0 deletions sklbench/benchmarks/sklearn_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,48 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0


def validate_estimator_params(estimator_class, estimator_params: Dict) -> Dict:
    """Keep only the parameters accepted by the estimator's constructor.

    The signature of ``estimator_class.__init__`` is inspected. Parameters
    whose names are not part of that signature are dropped, and a warning is
    logged for each of them.

    Args:
        estimator_class: Estimator class whose ``__init__`` signature is inspected.
        estimator_params: Candidate constructor parameters keyed by name.

    Returns:
        ``estimator_params`` itself when the constructor takes ``**kwargs`` or
        when validation fails for any reason; otherwise a new dictionary that
        holds only the supported parameters.
    """
    try:
        signature_params = inspect.signature(estimator_class.__init__).parameters
        accepted_names = set(signature_params) - {"self"}

        # A constructor declaring **kwargs can take any keyword: nothing to filter
        for parameter in signature_params.values():
            if parameter.kind == inspect.Parameter.VAR_KEYWORD:
                return estimator_params

        # Keep supported parameters, report every rejected one
        supported = {}
        for name, value in estimator_params.items():
            if name not in accepted_names:
                logger.warning(
                    f"Parameter '{name}' is not supported by "
                    f"{estimator_class.__name__} and will be ignored"
                )
                continue
            supported[name] = value

        return supported

    except Exception as error:
        # Best effort: validation problems must not block the benchmark run
        logger.debug(
            f"Could not validate parameters for {estimator_class.__name__}: {error}"
        )
        return estimator_params


def create_online_function(method_instance, data_args, batch_size):
n_batches = data_args[0].shape[0] // batch_size

Expand Down Expand Up @@ -491,6 +533,9 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
bench_case, "algorithm:estimator_params", dict()
)

# validate and filter estimator parameters
estimator_params = validate_estimator_params(estimator_class, estimator_params)

# get estimator methods for measurement
estimator_methods = get_estimator_methods(bench_case)

Expand Down
22 changes: 18 additions & 4 deletions sklbench/datasets/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import (
MinMaxScaler,
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
)

from ..utils.custom_types import Array
from ..utils.logger import logger
Expand Down Expand Up @@ -167,7 +172,7 @@ def preprocess_x(
x: Array,
replace_nan="auto",
category_encoding="ordinal",
normalize=False,
normalize=None,
force_for_sparse=True,
**kwargs,
) -> Array:
Expand Down Expand Up @@ -219,9 +224,18 @@ def preprocess_x(
pass
else:
logger.warning(f'Unknown "{category_encoding}" category encoding type.')
# Mean-Standard normalization
# Normalization
if normalize:
x = (x - x.mean()) / x.std()
if normalize == "standard":
scaler = StandardScaler(with_mean=True, with_std=True)
elif normalize == "mean":
scaler = StandardScaler(with_mean=True, with_std=False)
elif normalize == "minmax":
scaler = MinMaxScaler(feature_range=(0, 1))
else:
logger.warning(f'Unknown "{normalize}" normalization type.')
if scaler is not None:
return pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)
Comment thread
david-cortes-intel marked this conversation as resolved.
Outdated
if return_type == np.ndarray:
return x.values
else:
Expand Down
Loading
Loading