|
15 | 15 | # =============================================================================== |
16 | 16 |
|
17 | 17 | import os |
| 18 | +import time |
18 | 19 | from typing import Callable, List, Union |
19 | 20 |
|
20 | 21 | import numpy as np |
| 22 | +import openml |
21 | 23 | import pandas as pd |
22 | 24 | import requests |
23 | 25 | from scipy.sparse import csr_matrix |
24 | | -from sklearn.datasets import fetch_openml |
25 | 26 |
|
26 | 27 |
|
def retrieve(url: str, filename: str, max_retries: int = 3) -> None:
    """Download ``url`` to ``filename`` with basic retry logic.

    The download is skipped when ``filename`` already exists.  Transient
    failures (network errors, non-200 responses, truncated bodies) are
    retried up to ``max_retries`` times with a short pause in between, and
    any partial file is removed before the next attempt.

    Args:
        url: HTTP(S) URL to download from.
        filename: Destination path on disk.
        max_retries: Total number of download attempts.

    Raises:
        ValueError: If ``url`` is not an HTTP(S) URL.
        AssertionError: If the download still fails after ``max_retries``
            attempts.  (AssertionError is kept — rather than a more
            conventional RuntimeError — for backward compatibility with
            callers of the earlier version of this helper.)
    """
    if os.path.isfile(filename):
        return

    if not url.startswith("http"):
        raise ValueError(f"URL must start with http:// or https://, got: {url}")

    last_error = None
    for attempt in range(max_retries):
        try:
            response = requests.get(url, stream=True, timeout=120)
            if response.status_code != 200:
                # Raise as IOError so this flows through the retry handler
                # below.  The previous version raised AssertionError here,
                # which the except clause did NOT catch — so transient 5xx
                # responses aborted immediately and were never retried.
                raise IOError(
                    f"Failed to download from {url}. "
                    f"Response returned status code {response.status_code}"
                )

            total_size = int(response.headers.get("content-length", 0))
            block_size = 8192

            bytes_written = 0
            with open(filename, "wb") as datafile:
                for data in response.iter_content(block_size):
                    if data:
                        datafile.write(data)
                        bytes_written += len(data)

            # Verify completeness when the server reported a size; a short
            # body is retryable just like a connection error.
            if total_size > 0 and bytes_written != total_size:
                raise IOError(
                    f"Incomplete download from {url}. "
                    f"Expected {total_size} bytes, got {bytes_written}"
                )
            return

        except (requests.exceptions.RequestException, IOError) as e:
            last_error = e
            # Never leave a partial file behind: it would satisfy the
            # os.path.isfile() fast-path on the next call and be treated
            # as a finished download.
            if os.path.isfile(filename):
                os.remove(filename)
            if attempt < max_retries - 1:
                time.sleep(1)

    raise AssertionError(
        f"Failed to download {url} after {max_retries} attempts: {last_error}"
    ) from last_error
46 | 79 |
|
47 | 80 |
|
def fetch_and_correct_openml(
    data_id: int, raw_data_cache_dir: str, as_frame: str = "auto"
):
    """Fetch an OpenML dataset and normalize the returned ``(x, y)`` pair.

    Args:
        data_id: OpenML dataset id.
        raw_data_cache_dir: Directory under which openml's download cache
            is placed (in an ``openml/`` subdirectory).
        as_frame: Pass ``False`` to request the (legacy) array
            representation; any other value — including the default
            ``"auto"`` and ``True`` — yields a pandas DataFrame.

    Returns:
        Tuple ``(x, y)`` where ``x`` is a DataFrame / ndarray / csr_matrix
        with sparse columns densified, and ``y`` is a dense numpy array of
        labels (categorical labels converted to integer codes).

    Raises:
        ValueError: If openml returns an unexpected ``x`` or ``y`` type.
    """
    # Route openml's download cache under the caller-provided directory.
    openml_cache = os.path.join(raw_data_cache_dir, "openml")
    os.makedirs(openml_cache, exist_ok=True)
    openml.config.set_root_cache_directory(openml_cache)

    dataset = openml.datasets.get_dataset(
        data_id,
        download_data=True,
        download_qualities=False,
        download_features_meta_data=False,
    )

    # BUG FIX: the previous check `as_frame is True` was always False for the
    # parameter's own string default ("auto"), so the deprecated "array"
    # format was requested unconditionally.  Only an explicit as_frame=False
    # selects "array" now.
    dataset_format = "array" if as_frame is False else "dataframe"
    x, y, _, _ = dataset.get_data(
        dataset_format=dataset_format,
        target=dataset.default_target_attribute,
    )

    if not isinstance(x, (csr_matrix, pd.DataFrame, np.ndarray)):
        raise ValueError(f'Unknown x type "{type(x)}" returned from openml')

    if isinstance(x, pd.DataFrame):
        # Densify sparse columns one at a time: the DataFrame-level
        # `.sparse` accessor requires *every* column to be sparse and raises
        # otherwise.  `isinstance(dtype, pd.SparseDtype)` replaces the
        # deprecated `pd.api.types.is_sparse`.
        for col in x.columns:
            if isinstance(x[col].dtype, pd.SparseDtype):
                x[col] = x[col].sparse.to_dense()

    if isinstance(y, pd.Series):
        # Categorical labels become their integer codes.
        if isinstance(y.dtype, pd.CategoricalDtype):
            y = y.cat.codes
        if isinstance(y.dtype, pd.SparseDtype):
            # to_numpy() densifies a sparse Series; .values would keep the
            # sparse backing array.
            y = y.to_numpy()
        else:
            y = y.values
    elif not isinstance(y, np.ndarray):
        raise ValueError(f'Unknown y type "{type(y)}" returned from openml')

    return x, y
72 | 126 |
|
73 | 127 |
|
|
0 commit comments