Skip to content

Commit 17ac9ae

Browse files
authored
Dataset download resolutions (#206)
* Initial attempt at dataset download issue resolutions
* Add openml to deps
* Reduce unnecessary diff
* SparseArray error workaround
* Add sparse workaround for X too
* Another fix
* Drop problematic epsilon dataset
* Add conda openml dep
1 parent 399d9eb commit 17ac9ae

File tree

6 files changed

+89
-55
lines changed

6 files changed

+89
-55
lines changed

configs/regular/svm.json

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@
1414
"data": { "dataset": "ijcnn", "split_kwargs": { "train_size": 20000, "test_size": null } },
1515
"algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
1616
},
17-
{
18-
"data": { "dataset": "epsilon", "split_kwargs": { "train_size": 10000, "test_size": 10000 } },
19-
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
20-
},
2117
{
2218
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
2319
"algorithm": {

configs/regular/train_test_split.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
"susy",
1111
"sift",
1212
"gist",
13-
"epsilon",
1413
"svhn"
1514
]
1615
}

configs/regular/xgboost_binary.json

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,6 @@
4242
}
4343
}
4444
},
45-
{
46-
"data": {
47-
"dataset": "epsilon",
48-
"split_kwargs": {
49-
"train_size": 10000,
50-
"test_size": 100000
51-
}
52-
},
53-
"algorithm": {
54-
"estimator_params": {
55-
"max_depth": 8,
56-
"colsample_bytree": 0.1,
57-
"colsample_bynode": 0.1,
58-
"n_estimators": 200
59-
}
60-
}
61-
},
6245
{
6346
"data": {
6447
"dataset": "gisette",

envs/conda-env-sklearn.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,4 @@ dependencies:
2020
- psutil
2121
- requests
2222
- py-cpuinfo
23+
- openml

envs/requirements-sklearn.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ tqdm
1818
psutil
1919
requests
2020
py-cpuinfo
21+
openml

sklbench/datasets/downloaders.py

Lines changed: 87 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -15,59 +15,113 @@
1515
# ===============================================================================
1616

1717
import os
18+
import time
1819
from typing import Callable, List, Union
1920

2021
import numpy as np
22+
import openml
2123
import pandas as pd
2224
import requests
2325
from scipy.sparse import csr_matrix
24-
from sklearn.datasets import fetch_openml
2526

2627

27-
def retrieve(url: str, filename: str) -> None:
28+
def retrieve(url: str, filename: str, max_retries: int = 3) -> None:
29+
"""Download a file from a URL with basic retry logic."""
2830
if os.path.isfile(filename):
2931
return
30-
elif url.startswith("http"):
31-
response = requests.get(url, stream=True)
32-
if response.status_code != 200:
32+
33+
if not url.startswith("http"):
34+
raise ValueError(f"URL must start with http:// or https://, got: {url}")
35+
36+
for attempt in range(max_retries):
37+
try:
38+
response = requests.get(url, stream=True, timeout=120)
39+
if response.status_code != 200:
40+
raise AssertionError(
41+
f"Failed to download from {url}. "
42+
f"Response returned status code {response.status_code}"
43+
)
44+
45+
total_size = int(response.headers.get("content-length", 0))
46+
block_size = 8192
47+
48+
with open(filename, "wb") as datafile:
49+
bytes_written = 0
50+
for data in response.iter_content(block_size):
51+
if data:
52+
datafile.write(data)
53+
bytes_written += len(data)
54+
55+
# Verify download completeness if size is known
56+
if total_size > 0 and bytes_written != total_size:
57+
os.remove(filename)
58+
if attempt < max_retries - 1:
59+
time.sleep(1)
60+
continue
61+
raise AssertionError(
62+
f"Incomplete download from {url}. "
63+
f"Expected {total_size} bytes, got {bytes_written}"
64+
)
65+
return
66+
67+
except (
68+
requests.exceptions.RequestException,
69+
IOError,
70+
) as e:
71+
if os.path.isfile(filename):
72+
os.remove(filename)
73+
if attempt < max_retries - 1:
74+
time.sleep(1)
75+
continue
3376
raise AssertionError(
34-
f"Failed to download from {url}.\n"
35-
f"Response returned status code {response.status_code}"
36-
)
37-
total_size = int(response.headers.get("content-length", 0))
38-
block_size = 8192
39-
n = 0
40-
with open(filename, "wb+") as datafile:
41-
for data in response.iter_content(block_size):
42-
n += len(data) / 1024
43-
datafile.write(data)
44-
if total_size != 0 and n != total_size / 1024:
45-
raise AssertionError("Some content was present but not downloaded/written")
77+
f"Failed to download {url} after {max_retries} attempts: {e}"
78+
) from e
4679

4780

4881
def fetch_and_correct_openml(
4982
data_id: int, raw_data_cache_dir: str, as_frame: str = "auto"
5083
):
51-
x, y = fetch_openml(
52-
data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
84+
"""Fetch OpenML dataset using the openml package."""
85+
# Configure openml cache directory
86+
openml_cache = os.path.join(raw_data_cache_dir, "openml")
87+
os.makedirs(openml_cache, exist_ok=True)
88+
openml.config.set_root_cache_directory(openml_cache)
89+
90+
# Fetch the dataset
91+
dataset = openml.datasets.get_dataset(
92+
data_id,
93+
download_data=True,
94+
download_qualities=False,
95+
download_features_meta_data=False,
5396
)
54-
if (
55-
isinstance(x, csr_matrix)
56-
or isinstance(x, pd.DataFrame)
57-
or isinstance(x, np.ndarray)
58-
):
59-
pass
60-
else:
61-
raise ValueError(f'Unknown "{type(x)}" x type was returned from fetch_openml')
97+
98+
# Get the data with target column specified
99+
x, y, _, _ = dataset.get_data(
100+
dataset_format="dataframe" if as_frame is True else "array",
101+
target=dataset.default_target_attribute,
102+
)
103+
104+
# Validate x type
105+
if not isinstance(x, (csr_matrix, pd.DataFrame, np.ndarray)):
106+
raise ValueError(f'Unknown x type "{type(x)}" returned from openml')
107+
108+
# Convert sparse DataFrame to dense format
109+
if isinstance(x, pd.DataFrame):
110+
if any(pd.api.types.is_sparse(x[col]) for col in x.columns):
111+
x = x.sparse.to_dense()
112+
113+
# Convert y to numpy array if needed
62114
if isinstance(y, pd.Series):
63-
# label transforms to cat.codes if it is passed as categorical series
64115
if isinstance(y.dtype, pd.CategoricalDtype):
65116
y = y.cat.codes
66-
y = y.values
67-
elif isinstance(y, np.ndarray):
68-
pass
69-
else:
70-
raise ValueError(f'Unknown "{type(y)}" y type was returned from fetch_openml')
117+
# Use to_numpy() for sparse arrays to densify them, otherwise use values
118+
if pd.api.types.is_sparse(y):
119+
y = y.to_numpy()
120+
else:
121+
y = y.values
122+
elif not isinstance(y, np.ndarray):
123+
raise ValueError(f'Unknown y type "{type(y)}" returned from openml')
124+
71125
return x, y
72126

73127

0 commit comments

Comments (0)