diff --git a/configs/regular/svm.json b/configs/regular/svm.json index 4a1bb915..babfdb9a 100644 --- a/configs/regular/svm.json +++ b/configs/regular/svm.json @@ -36,10 +36,6 @@ } ], "svr datasets": [ - { - "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } }, - "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } } - }, { "data": { "dataset": "fried", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, "algorithm": { "estimator_params": { "C": 2.0, "kernel": "rbf" } } @@ -84,10 +80,6 @@ } ], "nusvr datasets": [ - { - "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } }, - "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } } - }, { "data": { "dataset": "twodplanes", "split_kwargs": { "train_size": 25000, "test_size": null } }, "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } } diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py index d75f5ea3..5fb21832 100644 --- a/sklbench/datasets/downloaders.py +++ b/sklbench/datasets/downloaders.py @@ -97,7 +97,7 @@ def fetch_and_correct_openml( # Get the data with target column specified x, y, _, _ = dataset.get_data( - dataset_format="dataframe" if as_frame is True else "array", + dataset_format="dataframe", target=dataset.default_target_attribute, ) @@ -109,6 +109,8 @@ def fetch_and_correct_openml( if isinstance(x, pd.DataFrame): if any(pd.api.types.is_sparse(x[col]) for col in x.columns): x = x.sparse.to_dense() + if as_frame is not True: + x = x.to_numpy() # Convert y to numpy array if needed if isinstance(y, pd.Series): diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py index b4ba6cef..94adfee9 100644 --- a/sklbench/datasets/loaders.py +++ b/sklbench/datasets/loaders.py @@ -450,7 +450,7 @@ def load_codrnanorm( data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict ) -> Tuple[Dict, Dict]: def transform_x_y(x, y): - x = pd.DataFrame(x.todense()) + x = pd.DataFrame(x.todense() if hasattr(x, "todense") else x) y = y.astype("int") y[y == -1] = 0 return x, y diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py index 82177337..a80da7fc 100644 --- a/sklbench/utils/measurement.py +++ b/sklbench/utils/measurement.py @@ -65,22 +65,31 @@ def enrich_metrics( """Transforms raw performance and other results into aggregated metrics""" # time metrics res = bench_result.copy() - mean, std = box_filter(res["time[ms]"]) - if include_performance_stability_metrics: + if isinstance(res["time[ms]"], list): + mean, std = box_filter(res["time[ms]"]) + if include_performance_stability_metrics: + res.update( + { + "1st run time[ms]": res["time[ms]"][0], + "1st-mean run ratio": res["time[ms]"][0] / mean, + } + ) res.update( { - "1st run time[ms]": res["time[ms]"][0], - "1st-mean run ratio": res["time[ms]"][0] / mean, + "time[ms]": mean, + "time CV": std / mean, # Coefficient of Variation } ) - res.update( - { - "time[ms]": mean, - "time CV": std / mean, # Coefficient of Variation - } - ) + else: + # already aggregated (e.g. from a baseline file) + mean = res["time[ms]"] + std = res.get("time std[ms]", 0.0) + if mean != 0: + res["time CV"] = std / mean + else: + res["time CV"] = 0.0 cost = res.get("cost[microdollar]", None) - if cost: + if cost and isinstance(cost, list): res["cost[microdollar]"] = box_filter(res["cost[microdollar]"])[0] batch_size = res.get("batch_size", None) if batch_size: