Skip to content

Commit ecddc78

Browse files
Migrate to use Pooch for data ingestion and update example data source (#1955)
* Update varnames * Add pooch * Update data downloading to use pooch * Update example data host to parcels-data repo * Update dev docs for EXAMPLE_DATA_FILES * remove platformdirs from dependencies * Update test name * Add v4 dev note * Fix mypy * update function name
1 parent fe62959 commit ecddc78

7 files changed

Lines changed: 75 additions & 59 deletions

File tree

.binder/environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ channels:
33
- conda-forge
44
dependencies:
55
- parcels
6+
- pooch
67
- trajan

.github/ci/min-core-deps.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ dependencies:
1414
# (see https://github.com/Unidata/netcdf4-python/issues/1090)
1515
- netcdf4=1.6
1616
- numpy=1.23
17-
- platformdirs=2.5
1817
- psutil=5.9
1918
- pymbolic=2022.1
2019
- pytest=7.1

docs/v4/TODO.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
List of tasks that are important to do before the release of version 4 (but can't be done now via code changes in `v4-dev`).
44

55
- [ ] Make migration guide for v3 to v4
6-
- [ ] Just prior to release: Update conda feedstock recipe dependencies (remove cgen and compiler dependencies). Make sure that recipe is up-to-date.
6+
- [ ] Just prior to release: Update conda feedstock recipe dependencies (remove cgen and compiler dependencies, add pooch as dependency and remove platformdirs). Make sure that recipe is up-to-date.
77
- [ ] Revamp the oceanparcels.org landing page, and perhaps also consider new logo/branding?
88
- [ ] Rerun all the tutorials so that their output is in line with new v4 print statements etc
99
- Documentation

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ dependencies: #! Keep in sync with [tool.pixi.dependencies] in pyproject.toml
88
- matplotlib-base>=2.0.2
99
- netcdf4>=1.1.9
1010
- numpy>=1.9.1
11-
- platformdirs
1211
- psutil
1312
- pymbolic
1413
- scipy>=0.16.0
@@ -18,6 +17,7 @@ dependencies: #! Keep in sync with [tool.pixi.dependencies] in pyproject.toml
1817
- dask>=2.0
1918
- scikit-learn
2019
- zarr>=2.11.0,!=2.18.0,<3
20+
- pooch
2121

2222
# Notebooks
2323
- trajan

parcels/tools/exampledata_utils.py

Lines changed: 63 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,33 @@
11
import os
22
from datetime import datetime, timedelta
33
from pathlib import Path
4-
from urllib.request import urlretrieve
54

6-
import platformdirs
5+
import pooch
76
import xarray as xr
87

98
from parcels.tools._v3to4 import patch_dataset_v4_compat
109

11-
__all__ = ["download_example_dataset", "get_data_home", "list_example_datasets"]
12-
13-
example_data_files = {
10+
__all__ = ["download_example_dataset", "list_example_datasets"]
11+
12+
# When modifying existing datasets in a backwards incompatible way,
13+
# make a new release in the repo and update the DATA_REPO_TAG to the new tag
14+
DATA_REPO_TAG = "main"
15+
16+
DATA_URL = f"https://github.com/OceanParcels/parcels-data/raw/{DATA_REPO_TAG}/data"
17+
18+
# Keys are the dataset names. Values are the filenames in the dataset folder. Note that
19+
# you can specify subfolders in the dataset folder by putting slashes in the filenames in the list.
20+
# e.g.,
21+
# "my_dataset": ["file0.nc", "folder1/file1.nc", "folder2/file2.nc"]
22+
# my_dataset/
23+
# ├── file0.nc
24+
# ├── folder1/
25+
# │ └── file1.nc
26+
# └── folder2/
27+
# └── file2.nc
28+
#
29+
# See instructions at https://github.com/OceanParcels/parcels-data for adding new datasets
30+
EXAMPLE_DATA_FILES: dict[str, list[str]] = {
1431
"MovingEddies_data": [
1532
"moving_eddiesP.nc",
1633
"moving_eddiesU.nc",
@@ -79,24 +96,32 @@
7996
}
8097

8198

82-
example_data_url = "http://oceanparcels.org/examples-data"
99+
def _create_pooch_registry() -> dict[str, None]:
    """Flatten ``EXAMPLE_DATA_FILES`` into a registry mapping for pooch.

    Every file of every dataset becomes one entry keyed by
    ``"<dataset>/<filename>"``. All values are ``None``, i.e. no file hash
    is recorded for any entry.
    """
    return {
        f"{dataset}/{filename}": None
        for dataset, filenames in EXAMPLE_DATA_FILES.items()
        for filename in filenames
    }
84109

85-
def get_data_home(data_home=None):
86-
"""Return a path to the cache directory for example datasets.
87110

88-
This directory is used by :func:`load_dataset`.
111+
POOCH_REGISTRY = _create_pooch_registry()
89112

90-
If the ``data_home`` argument is not provided, it will use a directory
91-
specified by the ``PARCELS_EXAMPLE_DATA`` environment variable (if it exists)
92-
or otherwise default to an OS-appropriate user cache location.
93-
"""
113+
114+
def _get_pooch(data_home=None):
115+
if data_home is None:
116+
data_home = os.environ.get("PARCELS_EXAMPLE_DATA")
94117
if data_home is None:
95-
data_home = os.environ.get("PARCELS_EXAMPLE_DATA", platformdirs.user_cache_dir("parcels"))
96-
data_home = os.path.expanduser(data_home)
97-
if not os.path.exists(data_home):
98-
os.makedirs(data_home)
99-
return data_home
118+
data_home = pooch.os_cache("parcels")
119+
120+
return pooch.create(
121+
path=data_home,
122+
base_url=DATA_URL,
123+
registry=POOCH_REGISTRY,
124+
)
100125

101126

102127
def list_example_datasets() -> list[str]:
@@ -109,7 +134,7 @@ def list_example_datasets() -> list[str]:
109134
datasets : list of str
110135
The names of the available example datasets.
111136
"""
112-
return list(example_data_files.keys())
137+
return list(EXAMPLE_DATA_FILES.keys())
113138

114139

115140
def download_example_dataset(dataset: str, data_home=None):
@@ -133,26 +158,30 @@ def download_example_dataset(dataset: str, data_home=None):
133158
Path to the folder containing the downloaded dataset files.
134159
"""
135160
# Dev note: `dataset` is assumed to be a folder name with netcdf files
136-
if dataset not in example_data_files:
161+
if dataset not in EXAMPLE_DATA_FILES:
137162
raise ValueError(
138-
f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(example_data_files.keys())
163+
f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(EXAMPLE_DATA_FILES.keys())
139164
)
165+
odie = _get_pooch(data_home=data_home)
140166

141-
cache_folder = get_data_home(data_home)
142-
dataset_folder = Path(cache_folder) / dataset
167+
cache_folder = Path(odie.path)
168+
dataset_folder = cache_folder / dataset
143169

144-
if not dataset_folder.exists():
145-
dataset_folder.mkdir(parents=True)
170+
for file_name in odie.registry:
171+
if file_name.startswith(dataset):
172+
should_patch = dataset == "GlobCurrent_example_data"
173+
odie.fetch(file_name, processor=v4_compat_patch if should_patch else None)
146174

147-
for filename in example_data_files[dataset]:
148-
filepath = dataset_folder / filename
149-
if not filepath.exists():
150-
url = f"{example_data_url}/{dataset}/{filename}"
151-
urlretrieve(url, str(filepath))
175+
return dataset_folder
152176

153-
should_patch = dataset == "GlobCurrent_example_data"
154177

155-
if should_patch:
156-
xr.load_dataset(filepath).pipe(patch_dataset_v4_compat).to_netcdf(filepath)
178+
def v4_compat_patch(fname, action, pup):
    """
    Pooch processor that patches a downloaded file to be compatible with v4.

    Applied to the files of the GlobCurrent example dataset. When ``action``
    is ``"fetch"`` the file is returned untouched; for any other action the
    file is loaded, passed through ``patch_dataset_v4_compat`` and written
    back to the same path.

    See https://www.fatiando.org/pooch/latest/processors.html#creating-your-own-processors

    Parameters
    ----------
    fname
        Path of the file handed over by pooch.
    action
        Action reported by pooch for this file; only the value ``"fetch"``
        is treated specially here.
    pup
        The calling pooch instance (unused; part of the processor signature).

    Returns
    -------
    The same ``fname`` that was passed in.
    """
    # NOTE(review): per the pooch processor docs linked above, "fetch" should
    # mean the file was already in the local cache, i.e. it was patched when
    # it was first downloaded - confirm.
    needs_patching = action != "fetch"
    if needs_patching:
        patched = patch_dataset_v4_compat(xr.load_dataset(fname))
        patched.to_netcdf(fname)
    return fname

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ dependencies = [
3232
"pytest",
3333
"scipy",
3434
"xarray",
35+
"pooch",
3536
]
3637

3738
[project.urls]
@@ -63,7 +64,6 @@ jupyter = "*"
6364
matplotlib-base = ">=2.0.2"
6465
netcdf4 = ">=1.1.9"
6566
numpy = ">=1.9.1"
66-
platformdirs = "*"
6767
psutil = "*"
6868
pymbolic = "*"
6969
scipy = ">=0.16.0"
@@ -222,5 +222,6 @@ module = [
222222
"cftime",
223223
"pykdtree.kdtree",
224224
"netCDF4",
225+
"pooch",
225226
]
226227
ignore_missing_imports = true

tests/tools/test_exampledata_utils.py

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,21 @@
1-
from pathlib import Path
2-
31
import pytest
42
import requests
53

64
from parcels.tools.exampledata_utils import (
5+
_get_pooch,
76
download_example_dataset,
87
list_example_datasets,
98
)
109

1110

12-
@pytest.fixture
13-
def mock_download(monkeypatch):
14-
"""Avoid the download, only check the status code and create empty file."""
15-
16-
def mock_urlretrieve(url, filename):
17-
response = requests.head(url)
18-
19-
if 400 <= response.status_code < 600:
20-
raise Exception(f"Failed to access URL: {url}. Status code: {response.status_code}")
21-
22-
Path(filename).touch()
23-
24-
monkeypatch.setattr("parcels.tools.exampledata_utils.urlretrieve", mock_urlretrieve)
25-
11+
@pytest.mark.parametrize("url", [_get_pooch().get_url(filename) for filename in _get_pooch().registry.keys()])
def test_pooch_registry_url_reponse(url):
    """Check that every URL in the pooch registry resolves without an HTTP error.

    Only a HEAD request is issued, so no file content is downloaded.
    """
    # `requests.head` does not follow redirects by default, so a 3xx reply
    # would pass the check below without the final target ever being
    # validated. Follow redirects so the status code of the real file
    # location is what gets asserted.
    # The timeout keeps an unresponsive host from hanging the test run.
    response = requests.head(url, allow_redirects=True, timeout=30)
    assert not (400 <= response.status_code < 600), (
        f"Failed to access URL: {url}. Status code: {response.status_code}"
    )
2615

27-
@pytest.mark.usefixtures("mock_download")
28-
@pytest.mark.parametrize("dataset", list_example_datasets())
29-
def test_download_example_dataset(tmp_path, dataset):
30-
if dataset == "GlobCurrent_example_data":
31-
pytest.skip(f"{dataset} too time consuming.")
3216

17+
@pytest.mark.parametrize("dataset", list_example_datasets()[:1])
def test_download_example_dataset_folder_creation(tmp_path, dataset):
    """Downloading a dataset into ``tmp_path`` yields an existing dataset folder.

    Only the first listed example dataset is exercised.
    """
    downloaded_to = download_example_dataset(dataset, data_home=tmp_path)

    assert downloaded_to.exists()

0 commit comments

Comments
 (0)