Revise SVR dataset scope and address uncaught errors (#208)

ethanglaser · web-flow · commit 842e5e78de02 · 2026-04-24T14:25:00.000-07:00
* accommodate non-list metrics in baselines

* remove year_prediction_msd from regular runs

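* address future warning (always request dataset_format="dataframe" from OpenML and convert to NumPy afterwards when as_frame is not True)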

* minor revision

* only call todense if sparse

* remove unnecessary diff
diff --git a/configs/regular/svm.json b/configs/regular/svm.json
@@ -36,10 +36,6 @@
             }
         ],
         "svr datasets": [
-            {
-                "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } },
-                "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } }
-            },
             {
                 "data": { "dataset": "fried", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } },
                 "algorithm": { "estimator_params": { "C": 2.0, "kernel": "rbf" } }
@@ -84,10 +80,6 @@
             }
         ],
         "nusvr datasets": [
-            {
-                "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } },
-                "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } }
-            },
             {
                 "data": { "dataset": "twodplanes", "split_kwargs": { "train_size": 25000, "test_size": null } },
                 "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }
diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
@@ -97,7 +97,7 @@ def fetch_and_correct_openml(
 
     # Get the data with target column specified
     x, y, _, _ = dataset.get_data(
-        dataset_format="dataframe" if as_frame is True else "array",
+        dataset_format="dataframe",
         target=dataset.default_target_attribute,
     )
 
@@ -109,6 +109,8 @@ def fetch_and_correct_openml(
     if isinstance(x, pd.DataFrame):
         if any(pd.api.types.is_sparse(x[col]) for col in x.columns):
             x = x.sparse.to_dense()
+        if as_frame is not True:
+            x = x.to_numpy()
 
     # Convert y to numpy array if needed
     if isinstance(y, pd.Series):
diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
@@ -450,7 +450,7 @@ def load_codrnanorm(
     data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
 ) -> Tuple[Dict, Dict]:
     def transform_x_y(x, y):
-        x = pd.DataFrame(x.todense())
+        x = pd.DataFrame(x)
         y = y.astype("int")
         y[y == -1] = 0
         return x, y
diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py
@@ -65,22 +65,31 @@ def enrich_metrics(
     """Transforms raw performance and other results into aggregated metrics"""
     # time metrics
     res = bench_result.copy()
-    mean, std = box_filter(res["time[ms]"])
-    if include_performance_stability_metrics:
+    if isinstance(res["time[ms]"], list):
+        mean, std = box_filter(res["time[ms]"])
+        if include_performance_stability_metrics:
+            res.update(
+                {
+                    "1st run time[ms]": res["time[ms]"][0],
+                    "1st-mean run ratio": res["time[ms]"][0] / mean,
+                }
+            )
         res.update(
             {
-                "1st run time[ms]": res["time[ms]"][0],
-                "1st-mean run ratio": res["time[ms]"][0] / mean,
+                "time[ms]": mean,
+                "time CV": std / mean,  # Coefficient of Variation
             }
         )
-    res.update(
-        {
-            "time[ms]": mean,
-            "time CV": std / mean,  # Coefficient of Variation
-        }
-    )
+    else:
+        # already aggregated (e.g. from a baseline file)
+        mean = res["time[ms]"]
+        std = res.get("time std[ms]", 0.0)
+        if mean != 0:
+            res["time CV"] = std / mean
+        else:
+            res["time CV"] = 0.0
     cost = res.get("cost[microdollar]", None)
-    if cost:
+    if cost and isinstance(cost, list):
         res["cost[microdollar]"] = box_filter(res["cost[microdollar]"])[0]
     batch_size = res.get("batch_size", None)
     if batch_size:

Original file line number	Diff line number	Diff line change
`@@ -36,10 +36,6 @@`
`36`	`36`	`}`
`37`	`37`	`],`
`38`	`38`	`"svr datasets": [`
`39`		`- {`
`40`		`- "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } },`
`41`		`- "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } }`
`42`		`- },`
`43`	`39`	`{`
`44`	`40`	`"data": { "dataset": "fried", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } },`
`45`	`41`	`"algorithm": { "estimator_params": { "C": 2.0, "kernel": "rbf" } }`
`@@ -84,10 +80,6 @@`
`84`	`80`	`}`
`85`	`81`	`],`
`86`	`82`	`"nusvr datasets": [`
`87`		`- {`
`88`		`- "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } },`
`89`		`- "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } }`
`90`		`- },`
`91`	`83`	`{`
`92`	`84`	`"data": { "dataset": "twodplanes", "split_kwargs": { "train_size": 25000, "test_size": null } },`
`93`	`85`	`"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }`