Merged
Changes from 6 commits
1 change: 1 addition & 0 deletions .binder/environment.yml
@@ -3,4 +3,5 @@ channels:
- conda-forge
dependencies:
- parcels
- pooch
- trajan
1 change: 0 additions & 1 deletion .github/ci/min-core-deps.yml
@@ -14,7 +14,6 @@ dependencies:
# (see https://github.com/Unidata/netcdf4-python/issues/1090)
- netcdf4=1.6
- numpy=1.23
- platformdirs=2.5
- psutil=5.9
- pymbolic=2022.1
- pytest=7.1
2 changes: 1 addition & 1 deletion environment.yml
Expand Up @@ -8,7 +8,6 @@ dependencies: #! Keep in sync with [tool.pixi.dependencies] in pyproject.toml
- matplotlib-base>=2.0.2
- netcdf4>=1.1.9
- numpy>=1.9.1
- platformdirs
- psutil
- pymbolic
- scipy>=0.16.0
@@ -18,6 +17,7 @@ dependencies: #! Keep in sync with [tool.pixi.dependencies] in pyproject.toml
- dask>=2.0
- scikit-learn
- zarr>=2.11.0,!=2.18.0,<3
- pooch

# Notebooks
- trajan
97 changes: 63 additions & 34 deletions parcels/tools/exampledata_utils.py
@@ -1,16 +1,33 @@
import os
from datetime import datetime, timedelta
from pathlib import Path
from urllib.request import urlretrieve

import platformdirs
import pooch
import xarray as xr

from parcels.tools._v3to4 import patch_dataset_v4_compat

__all__ = ["download_example_dataset", "get_data_home", "list_example_datasets"]

example_data_files = {
__all__ = ["download_example_dataset", "list_example_datasets"]

# When modifying existing datasets in a backwards incompatible way,
# make a new release in the repo and update the DATA_REPO_TAG to the new tag
DATA_REPO_TAG = "main"

DATA_URL = f"https://github.com/OceanParcels/parcels-data/raw/{DATA_REPO_TAG}/data"

# Keys are the dataset names. Values are the filenames in the dataset folder. Note that
# you can specify subfolders within the dataset folder by putting slashes in the filename list.
# e.g.,
# "my_dataset": ["file0.nc", "folder1/file1.nc", "folder2/file2.nc"]
# my_dataset/
# ├── file0.nc
# ├── folder1/
# │ └── file1.nc
# └── folder2/
# └── file2.nc
#
# See instructions at https://github.com/OceanParcels/parcels-data for adding new datasets
Comment on lines +18 to +29 (Contributor Author):
@fluidnumerics-joe are these instructions clear? Do you think you can add your datasets now?

EXAMPLE_DATA_FILES: dict[str, list[str]] = {
"MovingEddies_data": [
"moving_eddiesP.nc",
"moving_eddiesU.nc",
@@ -79,24 +96,32 @@
}


example_data_url = "http://oceanparcels.org/examples-data"
def _create_pooch_registry() -> dict[str, None]:
"""Collapses the mapping of dataset names to filenames into a pooch registry.

Hashes are set to None for all files.
"""
registry = {}
for dataset, filenames in EXAMPLE_DATA_FILES.items():
for filename in filenames:
registry[f"{dataset}/{filename}"] = None
return registry
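
A minimal sketch (not part of the diff) of what _create_pooch_registry() produces, assuming the MovingEddies_data entry shown above; pooch treats a None hash as "download this file without checksum verification":

registry = _create_pooch_registry()
# Every "<dataset>/<filename>" pair in EXAMPLE_DATA_FILES becomes one flat registry key.
assert registry["MovingEddies_data/moving_eddiesU.nc"] is None
print(len(registry), "example files registered")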

def get_data_home(data_home=None):
"""Return a path to the cache directory for example datasets.

This directory is used by :func:`load_dataset`.
POOCH_REGISTRY = _create_pooch_registry()

If the ``data_home`` argument is not provided, it will use a directory
specified by the ``PARCELS_EXAMPLE_DATA`` environment variable (if it exists)
or otherwise default to an OS-appropriate user cache location.
"""

def _get_odie(data_home=None):
if data_home is None:
data_home = os.environ.get("PARCELS_EXAMPLE_DATA")
if data_home is None:
data_home = os.environ.get("PARCELS_EXAMPLE_DATA", platformdirs.user_cache_dir("parcels"))
data_home = os.path.expanduser(data_home)
if not os.path.exists(data_home):
os.makedirs(data_home)
return data_home
data_home = pooch.os_cache("parcels")

return pooch.create(
path=data_home,
base_url=DATA_URL,
registry=POOCH_REGISTRY,
)
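
A rough usage sketch of _get_odie(), assuming a hypothetical /tmp/parcels-example-data cache path; setting PARCELS_EXAMPLE_DATA overrides the default pooch.os_cache("parcels") location, and fetch() downloads a registry file on first use and reuses the cached copy afterwards:

import os

os.environ["PARCELS_EXAMPLE_DATA"] = "/tmp/parcels-example-data"  # hypothetical cache path
odie = _get_odie()
# First call downloads the file from DATA_URL; later calls return the cached copy.
local_file = odie.fetch("MovingEddies_data/moving_eddiesU.nc")
print(local_file)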


def list_example_datasets() -> list[str]:
@@ -109,7 +134,7 @@
datasets : list of str
The names of the available example datasets.
"""
return list(example_data_files.keys())
return list(EXAMPLE_DATA_FILES.keys())


def download_example_dataset(dataset: str, data_home=None):
@@ -133,26 +158,30 @@
Path to the folder containing the downloaded dataset files.
"""
# Dev note: `dataset` is assumed to be a folder name with netcdf files
if dataset not in example_data_files:
if dataset not in EXAMPLE_DATA_FILES:
raise ValueError(
f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(example_data_files.keys())
f"Dataset {dataset!r} not found. Available datasets are: " + ", ".join(EXAMPLE_DATA_FILES.keys())
)
odie = _get_odie(data_home=data_home)

cache_folder = get_data_home(data_home)
dataset_folder = Path(cache_folder) / dataset
cache_folder = Path(odie.path)
dataset_folder = cache_folder / dataset

if not dataset_folder.exists():
dataset_folder.mkdir(parents=True)
for file_name in odie.registry:
if file_name.startswith(dataset):
should_patch = dataset == "GlobCurrent_example_data"
odie.fetch(file_name, processor=v4_compat_patch if should_patch else None)

for filename in example_data_files[dataset]:
filepath = dataset_folder / filename
if not filepath.exists():
url = f"{example_data_url}/{dataset}/{filename}"
urlretrieve(url, str(filepath))
return dataset_folder

should_patch = dataset == "GlobCurrent_example_data"

if should_patch:
xr.load_dataset(filepath).pipe(patch_dataset_v4_compat).to_netcdf(filepath)
def v4_compat_patch(fname, action, pup):
"""
Patch the GlobCurrent example dataset to be compatible with v4.

return dataset_folder
See https://www.fatiando.org/pooch/latest/processors.html#creating-your-own-processors
"""
if action == "fetch":
return fname
xr.load_dataset(fname).pipe(patch_dataset_v4_compat).to_netcdf(fname)
return fname
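
A short end-to-end sketch of the new download path, assuming the default cache location; for GlobCurrent_example_data each fetched file is additionally passed through the v4_compat_patch processor above, which rewrites the netCDF file in place via patch_dataset_v4_compat:

from parcels.tools.exampledata_utils import download_example_dataset, list_example_datasets

print(list_example_datasets())  # e.g. ["MovingEddies_data", ...]
folder = download_example_dataset("GlobCurrent_example_data")
print(folder)  # <pooch cache>/GlobCurrent_example_data, containing the patched files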

Check warning (Codecov / codecov/patch) on parcels/tools/exampledata_utils.py#L184-L187: added lines #L184 - L187 were not covered by tests.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
"pytest",
"scipy",
"xarray",
"pooch",
]

[project.urls]
@@ -63,7 +64,6 @@ jupyter = "*"
matplotlib-base = ">=2.0.2"
netcdf4 = ">=1.1.9"
numpy = ">=1.9.1"
platformdirs = "*"
psutil = "*"
pymbolic = "*"
scipy = ">=0.16.0"
26 changes: 6 additions & 20 deletions tests/tools/test_exampledata_utils.py
@@ -1,35 +1,21 @@
from pathlib import Path

import pytest
import requests

from parcels.tools.exampledata_utils import (
_get_odie,
download_example_dataset,
list_example_datasets,
)


@pytest.fixture
def mock_download(monkeypatch):
"""Avoid the download, only check the status code and create empty file."""

def mock_urlretrieve(url, filename):
response = requests.head(url)

if 400 <= response.status_code < 600:
raise Exception(f"Failed to access URL: {url}. Status code: {response.status_code}")
@pytest.mark.parametrize("url", [_get_odie().get_url(filename) for filename in _get_odie().registry.keys()])
def test_pooch_registry_url_response(url):
response = requests.head(url)
assert not (400 <= response.status_code < 600)

Path(filename).touch()

monkeypatch.setattr("parcels.tools.exampledata_utils.urlretrieve", mock_urlretrieve)


@pytest.mark.usefixtures("mock_download")
@pytest.mark.parametrize("dataset", list_example_datasets())
@pytest.mark.parametrize("dataset", list_example_datasets()[:1])
def test_download_example_dataset(tmp_path, dataset):
if dataset == "GlobCurrent_example_data":
pytest.skip(f"{dataset} too time consuming.")

dataset_folder_path = download_example_dataset(dataset, data_home=tmp_path)

assert dataset_folder_path.exists()
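
For context, a small sketch of how the parametrized URLs in the test above are derived; get_url() resolves each registry key against the DATA_URL base defined in exampledata_utils.py:

odie = _get_odie()
for key in list(odie.registry)[:3]:
    # e.g. "MovingEddies_data/moving_eddiesU.nc" resolved against the
    # parcels-data GitHub raw URL built from DATA_REPO_TAG.
    print(odie.get_url(key))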