Migrate to use Pooch for data ingestion and update example data source (#1955)

VeckoTheGecko · web-flow · commit ecddc7857ca3 · 2025-04-03T15:08:38.000Z
* Update varnames

* Add pooch

* Update data downloading to use pooch

* Update example data host to parcels-data repo

* Update dev docs for EXAMPLE_DATA_FILES

* remove platformdirs from dependencies

* Update test name

* Add v4 dev note

* Fix mypy

* update function name
diff --git a/.binder/environment.yml b/.binder/environment.yml
@@ -3,4 +3,5 @@ channels:
   - conda-forge
 dependencies:
   - parcels
+  - pooch
   - trajan
diff --git a/.github/ci/min-core-deps.yml b/.github/ci/min-core-deps.yml
@@ -14,7 +14,6 @@ dependencies:
   # (see https://github.com/Unidata/netcdf4-python/issues/1090)
   - netcdf4=1.6
   - numpy=1.23
-  - platformdirs=2.5
   - psutil=5.9
   - pymbolic=2022.1
   - pytest=7.1
diff --git a/docs/v4/TODO.md b/docs/v4/TODO.md
@@ -3,7 +3,7 @@
 List of tasks that are important to do before the release of version 4 (but can't be done now via code changes in `v4-dev`).
 
 - [ ] Make migration guide for v3 to v4
-- [ ] Just prior to release: Update conda feedstock recipe dependencies (remove cgen and compiler dependencies). Make sure that recipe is up-to-date.
+- [ ] Just prior to release: Update conda feedstock recipe dependencies (remove cgen and compiler dependencies, add pooch as a dependency, and remove platformdirs). Make sure that the recipe is up to date.
 - [ ] Revamp the oceanparcels.org landing page, and perhaps also consider new logo/branding?
 - [ ] Rerun all the tutorials so that their output is in line with new v4 print statements etc
 - Documentation
diff --git a/environment.yml b/environment.yml
@@ -8,7 +8,6 @@ dependencies: #! Keep in sync with [tool.pixi.dependencies] in pyproject.toml
   - matplotlib-base>=2.0.2
   - netcdf4>=1.1.9
   - numpy>=1.9.1
-  - platformdirs
   - psutil
   - pymbolic
   - scipy>=0.16.0
@@ -18,6 +17,7 @@ dependencies: #! Keep in sync with [tool.pixi.dependencies] in pyproject.toml
   - dask>=2.0
   - scikit-learn
   - zarr>=2.11.0,!=2.18.0,<3
+  - pooch
 
   # Notebooks
   - trajan
diff --git a/parcels/tools/exampledata_utils.py b/parcels/tools/exampledata_utils.py
@@ -1,16 +1,33 @@
 import os
 from datetime import datetime, timedelta
 from pathlib import Path
-from urllib.request import urlretrieve
 
-import platformdirs
+import pooch
 import xarray as xr
 
 from parcels.tools._v3to4 import patch_dataset_v4_compat
 
-__all__ = ["download_example_dataset", "get_data_home", "list_example_datasets"]
-
-example_data_files = {
+__all__ = ["download_example_dataset", "list_example_datasets"]
+
+# When modifying existing datasets in a backwards-incompatible way,
+# make a new release in the parcels-data repo and update DATA_REPO_TAG to the new tag.
+DATA_REPO_TAG = "main"
+
+DATA_URL = f"https://github.com/OceanParcels/parcels-data/raw/{DATA_REPO_TAG}/data"
+
+# Keys are the dataset names. Values are the filenames in the dataset folder. Note that
+# you can specify subfolders in the dataset folder by putting slashes in the filenames.
+# e.g.,
+# "my_dataset": ["file0.nc", "folder1/file1.nc", "folder2/file2.nc"]
+# my_dataset/
+# ├── file0.nc
+# ├── folder1/
+# │   └── file1.nc
+# └── folder2/
+#     └── file2.nc
+#
+# See instructions at https://github.com/OceanParcels/parcels-data for adding new datasets
+EXAMPLE_DATA_FILES: dict[str, list[str]] = {
     "MovingEddies_data": [
         "moving_eddiesP.nc",
         "moving_eddiesU.nc",
@@ -79,24 +96,32 @@
 }
 
 
-example_data_url = "http://oceanparcels.org/examples-data"
+def _create_pooch_registry() -> dict[str, None]:
+    """Collapses the mapping of dataset names to filenames into a pooch registry.
 
+    Hashes are set to None for all files.
+    """
+    registry: dict[str, None] = {}
+    for dataset, filenames in EXAMPLE_DATA_FILES.items():
+        for filename in filenames:
+            registry[f"{dataset}/{filename}"] = None
+    return registry
 
-def get_data_home(data_home=None):
-    """Return a path to the cache directory for example datasets.
 
-    This directory is used by :func:`load_dataset`.
+POOCH_REGISTRY = _create_pooch_registry()
 
-    If the ``data_home`` argument is not provided, it will use a directory
-    specified by the ``PARCELS_EXAMPLE_DATA`` environment variable (if it exists)
-    or otherwise default to an OS-appropriate user cache location.
-    """
+
+def _get_pooch(data_home=None):
+    if data_home is None:
+        data_home = os.environ.get("PARCELS_EXAMPLE_DATA")
     if data_home is None:
-        data_home = os.environ.get("PARCELS_EXAMPLE_DATA", platformdirs.user_cache_dir("parcels"))
-    data_home = os.path.expanduser(data_home)
-    if not os.path.exists(data_home):
-        os.makedirs(data_home)
-    return data_home
+        data_home = pooch.os_cache("parcels")
+
+    return pooch.create(
+        path=data_home,
+        base_url=DATA_URL,
+        registry=POOCH_REGISTRY,
+    )
 
 
 def list_example_datasets() -> list[str]:
@@ -109,7 +134,7 @@ def list_example_datasets() -> list[str]:
     datasets : list of str
         The names of the available example datasets.
     """
-    return list(example_data_files.keys())
+    return list(EXAMPLE_DATA_FILES.keys())
 
 
 def download_example_dataset(dataset: str, data_home=None):
@@ -133,26 +158,30 @@ def download_example_dataset(dataset: str, data_home=None):
         Path to the folder containing the downloaded dataset files.
     """
     # Dev note: `dataset` is assumed to be a folder name with netcdf files
-    if dataset not in example_data_files:
+    if dataset not in EXAMPLE_DATA_FILES:
         raise ValueError(
-            f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(example_data_files.keys())
+            f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(EXAMPLE_DATA_FILES.keys())
         )
+    odie = _get_pooch(data_home=data_home)
 
-    cache_folder = get_data_home(data_home)
-    dataset_folder = Path(cache_folder) / dataset
+    cache_folder = Path(odie.path)
+    dataset_folder = cache_folder / dataset
 
-    if not dataset_folder.exists():
-        dataset_folder.mkdir(parents=True)
+    for file_name in odie.registry:
+        if file_name.startswith(dataset):
+            should_patch = dataset == "GlobCurrent_example_data"
+            odie.fetch(file_name, processor=v4_compat_patch if should_patch else None)
 
-    for filename in example_data_files[dataset]:
-        filepath = dataset_folder / filename
-        if not filepath.exists():
-            url = f"{example_data_url}/{dataset}/{filename}"
-            urlretrieve(url, str(filepath))
+    return dataset_folder
 
-            should_patch = dataset == "GlobCurrent_example_data"
 
-            if should_patch:
-                xr.load_dataset(filepath).pipe(patch_dataset_v4_compat).to_netcdf(filepath)
+def v4_compat_patch(fname, action, pup):
+    """
+    Patch the GlobCurrent example dataset to be compatible with v4.
 
-    return dataset_folder
+    See https://www.fatiando.org/pooch/latest/processors.html#creating-your-own-processors
+    """
+    if action == "fetch":
+        return fname
+    xr.load_dataset(fname).pipe(patch_dataset_v4_compat).to_netcdf(fname)
+    return fname
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
   "pytest",
   "scipy",
   "xarray",
+  "pooch",
 ]
 
 [project.urls]
@@ -63,7 +64,6 @@ jupyter = "*"
 matplotlib-base = ">=2.0.2"
 netcdf4 = ">=1.1.9"
 numpy = ">=1.9.1"
-platformdirs = "*"
 psutil = "*"
 pymbolic = "*"
 scipy = ">=0.16.0"
@@ -222,5 +222,6 @@ module = [
     "cftime",
     "pykdtree.kdtree",
     "netCDF4",
+    "pooch",
 ]
 ignore_missing_imports = true
diff --git a/tests/tools/test_exampledata_utils.py b/tests/tools/test_exampledata_utils.py
@@ -1,35 +1,21 @@
-from pathlib import Path
-
 import pytest
 import requests
 
 from parcels.tools.exampledata_utils import (
+    _get_pooch,
     download_example_dataset,
     list_example_datasets,
 )
 
 
-@pytest.fixture
-def mock_download(monkeypatch):
-    """Avoid the download, only check the status code and create empty file."""
-
-    def mock_urlretrieve(url, filename):
-        response = requests.head(url)
-
-        if 400 <= response.status_code < 600:
-            raise Exception(f"Failed to access URL: {url}. Status code: {response.status_code}")
-
-        Path(filename).touch()
-
-    monkeypatch.setattr("parcels.tools.exampledata_utils.urlretrieve", mock_urlretrieve)
-
+@pytest.mark.parametrize("url", [_get_pooch().get_url(filename) for filename in _get_pooch().registry.keys()])
+def test_pooch_registry_url_reponse(url):
+    response = requests.head(url)
+    assert not (400 <= response.status_code < 600)
 
-@pytest.mark.usefixtures("mock_download")
-@pytest.mark.parametrize("dataset", list_example_datasets())
-def test_download_example_dataset(tmp_path, dataset):
-    if dataset == "GlobCurrent_example_data":
-        pytest.skip(f"{dataset} too time consuming.")
 
+@pytest.mark.parametrize("dataset", list_example_datasets()[:1])
+def test_download_example_dataset_folder_creation(tmp_path, dataset):
     dataset_folder_path = download_example_dataset(dataset, data_home=tmp_path)
 
     assert dataset_folder_path.exists()