Skip to content

Commit 6d977d8

Browse files
Update normalization parameters and add estimator params validation (#210)
* Update preprocessing args
* Update scaling logic
* Fix scaling
* Add gisette normalization in SVM config
* Add estimator parameters filter
* Code format
* Minor fixes
* Update if condition
* Fix
* Update return statement
1 parent 649ff56 commit 6d977d8

19 files changed

Lines changed: 115 additions & 36 deletions

configs/BENCH-CONFIG-SPEC.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ Configs have the three highest parameter keys:
8989
| `data`:`id` | None | | OpenML data id for `fetch_openml` source. |
9090
| `data`:`preprocessing_kwargs`:`replace_nan` | `median` | `median`, `mean` | Value to replace NaNs in preprocessed data. |
9191
| `data`:`preprocessing_kwargs`:`category_encoding` | `ordinal` | `ordinal`, `onehot`, `drop`, `ignore` | How to encode categorical features in preprocessed data. |
92-
| `data`:`preprocessing_kwargs`:`normalize` | False | | Enables normalization of preprocessed data. |
92+
| `data`:`preprocessing_kwargs`:`normalize` | None | None, `mean`, `minmax`, `standard` | Normalization method applied to preprocessed data; None disables normalization. |
9393
| `data`:`preprocessing_kwargs`:`force_for_sparse` | True | | Forces preprocessing for sparse data formats. |
9494
| `data`:`split_kwargs` | Empty `dict` or default split from dataset description | | Data split parameters for `train_test_split` function. |
9595
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |

configs/common/knn.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
}
99
},
1010
"data": {
11-
"preprocessing_kwargs": { "normalize": true }
11+
"preprocessing_kwargs": { "normalize": "standard" }
1212
}
1313
},
1414
"sklearn knn parameters": {

configs/common/svm.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
"max_iter": 10000
6666
}
6767
},
68-
"data": { "preprocessing_kwargs": { "normalize": true } }
68+
"data": { "preprocessing_kwargs": { "normalize": "standard" } }
6969
},
7070
"svm clsf parameters": {
7171
"algorithm": { "estimator_params": { "random_state": 42 } }

configs/regular/dbscan.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
"PARAMETERS_SETS": {
44
"dbscan datasets": {
55
"data": [
6-
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 } },
6+
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 }, "preprocessing_kwargs": { "normalize": "mean" } },
77
{ "dataset": "mnist", "split_kwargs": { "train_size": 40000 } },
88
{ "dataset": "sensit", "split_kwargs": { "ignore": true } },
99
{ "dataset": "susy", "split_kwargs": { "train_size": 100000 } },
1010
{
1111
"dataset": "skin_segmentation",
1212
"split_kwargs": { "train_size": 100000 },
13-
"preprocessing_kwargs": { "normalize": true }
13+
"preprocessing_kwargs": { "normalize": "standard" }
1414
},
1515
{
1616
"source": "make_blobs",

configs/regular/kmeans.json

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,21 @@
77
{
88
"dataset": "covtype",
99
"split_kwargs": { "ignore": true },
10-
"preprocessing_kwargs": { "normalize": true }
10+
"preprocessing_kwargs": { "normalize": "standard" }
1111
},
1212
{
13-
"dataset": ["mnist", "gisette"],
13+
"dataset": ["mnist"],
1414
"split_kwargs": { "ignore": true }
1515
},
16+
{
17+
"dataset" : "gisette",
18+
"split_kwargs" : {"ignore" : true},
19+
"preprocessing_kwargs": { "normalize": "standard" }
20+
},
1621
{
1722
"dataset": "cifar",
18-
"split_kwargs": { "train_size": 10000, "test_size": null }
23+
"split_kwargs": { "train_size": 10000, "test_size": null },
24+
"preprocessing_kwargs": { "normalize": "mean" }
1925
}
2026
]
2127
},
@@ -28,7 +34,7 @@
2834
"shuffle": true,
2935
"random_state": 42
3036
},
31-
"preprocessing_kwargs": { "normalize": true }
37+
"preprocessing_kwargs": { "normalize": "standard" }
3238
},
3339
"algorithm": {
3440
"estimator_params": { "n_clusters": [2, 50] }

configs/regular/knn.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"data": [
66
{ "dataset": "susy", "split_kwargs": { "train_size": 80000, "test_size": 20000 } },
77
{ "dataset": "connect" },
8-
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }
8+
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": null } }
99
]
1010
},
1111
"kd_tree knn classification datasets": {

configs/regular/linear_model.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
{
2626
"data": {
2727
"dataset": "year_prediction_msd",
28-
"preprocessing_kwargs": { "normalize": true },
28+
"preprocessing_kwargs": { "normalize": "standard" },
2929
"split_kwargs": { "train_size": 0.5, "test_size": 0.5 }
3030
}
3131
},

configs/regular/logreg.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,16 @@
6868
{
6969
"data": {
7070
"dataset": "cifar",
71-
"split_kwargs": { "train_size": 0.1, "test_size": null }
71+
"split_kwargs": { "train_size": 0.1, "test_size": null },
72+
"preprocessing_kwargs": { "normalize": "mean" }
7273
},
7374
"algorithm": { "estimator_params": {"C": 1e-9} }
7475
},
7576
{
7677
"data": {
7778
"dataset": "gisette",
78-
"split_kwargs": { "train_size": 2000, "test_size": null }
79+
"split_kwargs": { "train_size": 2000, "test_size": null },
80+
"preprocessing_kwargs": { "normalize": "standard" }
7981
},
8082
"algorithm": { "estimator_params": {"C": 1e1} }
8183
}

configs/regular/svm.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
1616
},
1717
{
18-
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
18+
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
1919
"algorithm": {
2020
"estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] }
2121
}
@@ -30,7 +30,7 @@
3030
"data": {
3131
"dataset": "mnist",
3232
"split_kwargs": { "train_size": 20000, "test_size": null },
33-
"preprocessing_kwargs": { "normalize": false }
33+
"preprocessing_kwargs": { "normalize" : null }
3434
},
3535
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
3636
}
@@ -45,7 +45,7 @@
4545
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }
4646
},
4747
{
48-
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
48+
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
4949
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
5050
},
5151
{
@@ -75,7 +75,7 @@
7575
"algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } }
7676
},
7777
{
78-
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
78+
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
7979
"algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } }
8080
}
8181
],
@@ -89,7 +89,7 @@
8989
"algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } }
9090
},
9191
{
92-
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
92+
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
9393
"algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } }
9494
},
9595
{

configs/testing/azure-pipelines-ci.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"random_state": 42
1111
},
1212
"preprocessing_kwargs": {
13-
"normalize": true
13+
"normalize": "standard"
1414
}
1515
},
1616
"bench": { "n_runs": 5 },

0 commit comments

Comments
 (0)