Skip to content

Commit 17ac9ae

Browse files
authored
Dataset download resolutions (#206)
* Initial attempt at dataset download issue resolutions
* Add openml to deps
* Reduce unnecessary diff
* SparseArray error workaround
* Add sparse workaround for X too
* Another fix
* Drop problematic epsilon dataset
* Add conda openml dep
1 parent 399d9eb commit 17ac9ae

File tree

6 files changed

+89
-55
lines changed

6 files changed

+89
-55
lines changed

configs/regular/svm.json

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@
1414
"data": { "dataset": "ijcnn", "split_kwargs": { "train_size": 20000, "test_size": null } },
1515
"algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
1616
},
17-
{
18-
"data": { "dataset": "epsilon", "split_kwargs": { "train_size": 10000, "test_size": 10000 } },
19-
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
20-
},
2117
{
2218
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
2319
"algorithm": {

configs/regular/train_test_split.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
"susy",
1111
"sift",
1212
"gist",
13-
"epsilon",
1413
"svhn"
1514
]
1615
}

configs/regular/xgboost_binary.json

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,6 @@
4242
}
4343
}
4444
},
45-
{
46-
"data": {
47-
"dataset": "epsilon",
48-
"split_kwargs": {
49-
"train_size": 10000,
50-
"test_size": 100000
51-
}
52-
},
53-
"algorithm": {
54-
"estimator_params": {
55-
"max_depth": 8,
56-
"colsample_bytree": 0.1,
57-
"colsample_bynode": 0.1,
58-
"n_estimators": 200
59-
}
60-
}
61-
},
6245
{
6346
"data": {
6447
"dataset": "gisette",

envs/conda-env-sklearn.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,4 @@ dependencies:
2020
- psutil
2121
- requests
2222
- py-cpuinfo
23+
- openml

envs/requirements-sklearn.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ tqdm
1818
psutil
1919
requests
2020
py-cpuinfo
21+
openml

sklbench/datasets/downloaders.py

Lines changed: 87 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -15,59 +15,113 @@
1515
# ===============================================================================
1616

1717
import os
18+
import time
1819
from typing import Callable, List, Union
1920

2021
import numpy as np
22+
import openml
2123
import pandas as pd
2224
import requests
2325
from scipy.sparse import csr_matrix
24-
from sklearn.datasets import fetch_openml
2526

2627

27-
def retrieve(url: str, filename: str) -> None:
28+
def retrieve(url: str, filename: str, max_retries: int = 3) -> None:
29+
"""Download a file from a URL with basic retry logic."""
2830
if os.path.isfile(filename):
2931
return
30-
elif url.startswith("http"):
31-
response = requests.get(url, stream=True)
32-
if response.status_code != 200:
32+
33+
if not url.startswith("http"):
34+
raise ValueError(f"URL must start with http:// or https://, got: {url}")
35+
36+
for attempt in range(max_retries):
37+
try:
38+
response = requests.get(url, stream=True, timeout=120)
39+
if response.status_code != 200:
40+
raise AssertionError(
41+
f"Failed to download from {url}. "
42+
f"Response returned status code {response.status_code}"
43+
)
44+
45+
total_size = int(response.headers.get("content-length", 0))
46+
block_size = 8192
47+
48+
with open(filename, "wb") as datafile:
49+
bytes_written = 0
50+
for data in response.iter_content(block_size):
51+
if data:
52+
datafile.write(data)
53+
bytes_written += len(data)
54+
55+
# Verify download completeness if size is known
56+
if total_size > 0 and bytes_written != total_size:
57+
os.remove(filename)
58+
if attempt < max_retries - 1:
59+
time.sleep(1)
60+
continue
61+
raise AssertionError(
62+
f"Incomplete download from {url}. "
63+
f"Expected {total_size} bytes, got {bytes_written}"
64+
)
65+
return
66+
67+
except (
68+
requests.exceptions.RequestException,
69+
IOError,
70+
) as e:
71+
if os.path.isfile(filename):
72+
os.remove(filename)
73+
if attempt < max_retries - 1:
74+
time.sleep(1)
75+
continue
3376
raise AssertionError(
34-
f"Failed to download from {url}.\n"
35-
f"Response returned status code {response.status_code}"
36-
)
37-
total_size = int(response.headers.get("content-length", 0))
38-
block_size = 8192
39-
n = 0
40-
with open(filename, "wb+") as datafile:
41-
for data in response.iter_content(block_size):
42-
n += len(data) / 1024
43-
datafile.write(data)
44-
if total_size != 0 and n != total_size / 1024:
45-
raise AssertionError("Some content was present but not downloaded/written")
77+
f"Failed to download {url} after {max_retries} attempts: {e}"
78+
) from e
4679

4780

4881
def fetch_and_correct_openml(
4982
data_id: int, raw_data_cache_dir: str, as_frame: str = "auto"
5083
):
51-
x, y = fetch_openml(
52-
data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
84+
"""Fetch OpenML dataset using the openml package."""
85+
# Configure openml cache directory
86+
openml_cache = os.path.join(raw_data_cache_dir, "openml")
87+
os.makedirs(openml_cache, exist_ok=True)
88+
openml.config.set_root_cache_directory(openml_cache)
89+
90+
# Fetch the dataset
91+
dataset = openml.datasets.get_dataset(
92+
data_id,
93+
download_data=True,
94+
download_qualities=False,
95+
download_features_meta_data=False,
5396
)
54-
if (
55-
isinstance(x, csr_matrix)
56-
or isinstance(x, pd.DataFrame)
57-
or isinstance(x, np.ndarray)
58-
):
59-
pass
60-
else:
61-
raise ValueError(f'Unknown "{type(x)}" x type was returned from fetch_openml')
97+
98+
# Get the data with target column specified
99+
x, y, _, _ = dataset.get_data(
100+
dataset_format="dataframe" if as_frame is True else "array",
101+
target=dataset.default_target_attribute,
102+
)
103+
104+
# Validate x type
105+
if not isinstance(x, (csr_matrix, pd.DataFrame, np.ndarray)):
106+
raise ValueError(f'Unknown x type "{type(x)}" returned from openml')
107+
108+
# Convert sparse DataFrame to dense format
109+
if isinstance(x, pd.DataFrame):
110+
if any(pd.api.types.is_sparse(x[col]) for col in x.columns):
111+
x = x.sparse.to_dense()
112+
113+
# Convert y to numpy array if needed
62114
if isinstance(y, pd.Series):
63-
# label transforms to cat.codes if it is passed as categorical series
64115
if isinstance(y.dtype, pd.CategoricalDtype):
65116
y = y.cat.codes
66-
y = y.values
67-
elif isinstance(y, np.ndarray):
68-
pass
69-
else:
70-
raise ValueError(f'Unknown "{type(y)}" y type was returned from fetch_openml')
117+
# Use to_numpy() for sparse arrays to densify them, otherwise use values
118+
if pd.api.types.is_sparse(y):
119+
y = y.to_numpy()
120+
else:
121+
y = y.values
122+
elif not isinstance(y, np.ndarray):
123+
raise ValueError(f'Unknown y type "{type(y)}" returned from openml')
124+
71125
return x, y
72126

73127

0 commit comments

Comments (0)