Commit 0645442

Sharing of representative datasets via compressed Zarr zips (#2570)
1 parent 95eed0a commit 0645442

File tree

4 files changed: +184, -32 lines

.github/ISSUE_TEMPLATE/02_bug.yaml

Lines changed: 1 addition & 1 deletion

````diff
@@ -17,7 +17,7 @@ body:
   - type: "textarea"
     attributes:
       label: "Code sample"
-      description: "If relevant, please provide a code example where this bug is shown as well as any error message. A [minimal, reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) is preffered as it makes it much easier for developers to identify the cause of the bug. This also allows them quickly determine whether the problem is with your code or with Parcels itself. If you want support on a specific dataset, please [follow our instructions on how to share dataset metadata](https://docs.parcels-code.org/en/main/development/posting-issues.html)"
+      description: "If relevant, please provide a code example where this bug is shown as well as any error message. A [minimal, reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) is preffered as it makes it much easier for developers to identify the cause of the bug. This also allows them quickly determine whether the problem is with your code or with Parcels itself. If you want support on a specific dataset, please [follow our instructions on how to share representative datasets](https://docs.parcels-code.org/en/main/development/posting-issues.html)"
      value: |
        ```python
        # Paste your code within this block
````

docs/development/posting-issues.md

Lines changed: 63 additions & 18 deletions

````diff
@@ -20,51 +20,96 @@ Following these templates provides structure and ensures that we have all the ne
 Parcels is designed to work with a large range of input datasets.
 
 When extending support for various input datasets, or trying to debug problems
-that only occur with specific datasets, having the dataset metadata is very valuable.
+that only occur with specific datasets, having access to your dataset (or a
+close representation of it) is very valuable.
 
-This metadata could include information such as:
+This could include information such as:
 
 - the nature of the array variables (e.g., via CF compliant metadata)
 - descriptions about the origin of the dataset, or additional comments
 - the shapes and data types of the arrays
+- the grid topology (coordinates and key variables)
 
 This also allows us to see if your metadata is broken/non-compliant with standards - where we can then suggest fixes for you (and maybe we can tell the data provider!).
 Since version 4 of Parcels we rely much more on metadata to discover information about your input data.
 
-Sharing this metadata often provides enough debugging information to solve your problem, instead of having to share a whole dataset.
+Sharing a compact representation of your dataset often provides enough information to solve your problem, without having to share the full dataset (which may be very large or contain sensitive data).
 
-Sharing dataset metadata is made easy in Parcels.
+Parcels makes this easy by replacing irrelevant array data with zeros and saving the result as a compressed Zarr zip store, which is typically small enough to attach directly to a GitHub issue.
 
 ### Step 1. Users
 
 As a user with access to your dataset, you would do:
 
 ```{code-cell}
-import json
+:tags: [hide-cell]
 
+# Generate an example dataset to zip. The user would use their own.
 import xarray as xr
+from parcels._datasets.structured.generic import datasets
+datasets['ds_2d_left'].to_netcdf("my_dataset.nc")
+```
+
+```{code-cell}
+import os
+
+import xarray as xr
+import zarr
+
+from parcels._datasets.utils import replace_arrays_with_zeros
+
+# load your dataset
+ds = xr.open_dataset("my_dataset.nc")  # or xr.open_zarr(...), etc.
+
+# Replace all data arrays with zeros, keeping coordinate metadata.
+# This keeps array shapes and metadata while removing actual data.
+#
+# You can customise `except_for` to also retain actual values for specific variables:
+#   except_for='coords' — keep coordinate arrays (useful for grid topology)
+#   except_for=['lon', 'lat'] — keep a specific list of variables
+#   except_for=None — remove all arrays (useful to know about dtypes, structure, and metadata). This is the default for the function.
+ds_trimmed = replace_arrays_with_zeros(ds, except_for=None)
 
-# defining an example dataset to illustrate
-# (you would use `xr.open_dataset(...)` instead)
-ds = xr.Dataset(attrs={"description": "my dataset"})
+# Save to a zipped Zarr store - replace `my_dataset` with a more informative name
+with zarr.storage.ZipStore("my_dataset.zip", mode='w') as store:
+    ds_trimmed.to_zarr(store)
 
-output_file = "my_dataset.json"
-with open(output_file, "w") as f:
-    json.dump(ds.to_dict(data=False), f)  # write your dataset to a JSON excluding array data
+size_mb_original = os.path.getsize("my_dataset.nc") / 1e6
+print(f"Original size: {size_mb_original:.1f} MB")
+
+# Check the file size (aim for < 25 MB so it can be attached to a GitHub issue)
+size_mb = os.path.getsize("my_dataset.zip") / 1e6
+print(f"Zip store size: {size_mb:.1f} MB")
 ```
 
-Then attach the JSON file written above alongside your issue
+Then attach the zip file written above alongside your issue.
+
+If the file is larger than 25 MB, try passing `except_for=None` (the default)
+to ensure all arrays are zeroed out. If it is still too large, consider
+subsetting your dataset to a smaller spatial or temporal region before saving.
 
 ### Step 2. Maintainers and developers
 
-As developers looking to inspect the metadata, we would do:
+As developers looking to inspect the dataset, we would do:
+
+```{code-cell}
+import xarray as xr
+import zarr
+
+ds = xr.open_zarr(zarr.storage.ZipStore("my_dataset.zip", mode="r"))
+ds
+```
 
 ```{code-cell}
-from parcels._datasets.utils import from_xarray_dataset_dict
+:tags: [hide-cell]
+
+# Cleanup files in doc build process
+del ds
+from pathlib import Path
+Path("my_dataset.zip").unlink()
+Path("my_dataset.nc").unlink()
 
-with open(output_file) as f:
-    d = json.load(f)
-ds = from_xarray_dataset_dict(d)
 ```
 
-From there we can take a look the metadata of your dataset!
+From there we can take a look at the structure, metadata, and grid topology of your dataset!
+This also makes it straightforward for us to add this dataset to our test suite.
````
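The "subset before saving" advice in the docs above can be sketched with xarray's `isel`. This is a hypothetical illustration: the toy dataset and its dimension/variable names ("time", "lat", "lon", "U") are invented for this example and are not from the Parcels docs.

```python
import numpy as np
import xarray as xr

# Toy dataset standing in for a user's large input file
# (names and sizes are made up for illustration).
ds = xr.Dataset(
    {"U": (("time", "lat", "lon"), np.ones((100, 50, 60)))},
    coords={
        "time": np.arange(100),
        "lat": np.linspace(-80.0, 80.0, 50),
        "lon": np.linspace(-180.0, 180.0, 60),
    },
)

# Subset to a smaller temporal and spatial region before zeroing/saving;
# this shrinks the resulting zip store beyond what zeroing alone achieves.
ds_small = ds.isel(time=slice(0, 10), lat=slice(0, 20), lon=slice(0, 20))
print(dict(ds_small.sizes))  # {'time': 10, 'lat': 20, 'lon': 20}
```

Using positional `isel` slices keeps the example backend-agnostic; `sel` with coordinate labels would work equally well when a geographic region of interest is known.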

src/parcels/_datasets/utils.py

Lines changed: 41 additions & 13 deletions

```diff
@@ -1,5 +1,5 @@
-import copy
-from typing import Any
+from collections.abc import Hashable
+from typing import Any, Literal
 
 import numpy as np
 import xarray as xr
@@ -186,21 +186,49 @@ def verbose_print(*args, **kwargs):
     verbose_print("=" * 30 + " End of Comparison " + "=" * 30)
 
 
-def from_xarray_dataset_dict(d) -> xr.Dataset:
-    """Reconstruct a dataset with zero data from the output of ``xarray.Dataset.to_dict(data=False)``.
+def replace_arrays_with_zeros(
+    ds: xr.Dataset, except_for: Literal["coords"] | list[Hashable] | None = None
+) -> xr.Dataset:
+    """Replace datavars in the xarray dataset with zeros, except for some.
 
-    Useful in issues helping users debug fieldsets - sharing dataset schemas with associated metadata
-    without sharing the data itself.
+    Parameters
+    ----------
+    ds : xr.Dataset
+        The dataset whose arrays will be replaced with zeros.
+    except_for : "coords" or list of Hashable or None, optional
+        Controls which arrays are preserved:
 
-    Example
+        - ``None``: Replace all arrays with zeros.
+        - ``"coords"``: Replace all arrays with zeros except the non-index coords.
+        - list: Provide a list of variable/coord names to exclude from zeroing.
+
+    Returns
     -------
-    >>> import xarray as xr
-    >>> from parcels._datasets.structured.generic import datasets
-    >>> ds = datasets['ds_2d_left']
-    >>> d = ds.to_dict(data=False)
-    >>> ds2 = from_xarray_dataset_dict(d)
+    xr.Dataset
+        A copy of ``ds`` with the selected arrays replaced by zeros.
     """
-    return xr.Dataset.from_dict(_fill_with_dummy_data(copy.deepcopy(d)))
+    import dask.array as da
+
+    if except_for is None:
+        except_for = []
+    if except_for == "coords":
+        except_for = list(ds.coords.keys())
+
+    ds = ds.copy()
+    ds_keys = set(ds.data_vars) | set(ds.coords)
+    for k in except_for:
+        if k not in ds_keys:
+            raise ValueError(f"Item {k!r} in `except_for` not a valid item in dataset. Got {except_for=!r}.")
+
+    for k in ds_keys - set(except_for):
+        data = da.zeros_like(ds[k].data)
+        try:
+            ds[k].data = data
+        except ValueError:
+            # Cannot assign to dimension coordinate, leave as is
+            pass
+
+    return ds
 
 
 def _fill_with_dummy_data(d: dict[str, dict]):
```
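To illustrate what the new `replace_arrays_with_zeros` does without the Parcels or dask dependencies, here is a simplified standalone sketch using plain numpy. Both `replace_arrays_with_zeros_sketch` and the toy dataset are invented for this example; the shipped function uses `dask.array.zeros_like` so the zeroed arrays stay lazy and compress well.

```python
import numpy as np
import xarray as xr


def replace_arrays_with_zeros_sketch(ds, except_for=None):
    """Simplified numpy-based sketch of the function added in this commit."""
    if except_for is None:
        except_for = []
    if except_for == "coords":
        except_for = list(ds.coords.keys())

    ds = ds.copy()  # shallow copy: originals keep their arrays
    ds_keys = set(ds.data_vars) | set(ds.coords)
    for k in except_for:
        if k not in ds_keys:
            raise ValueError(f"Item {k!r} in `except_for` not a valid item in dataset.")

    for k in ds_keys - set(except_for):
        try:
            ds[k].data = np.zeros_like(ds[k].data)
        except ValueError:
            pass  # dimension coordinates cannot be reassigned; leave as is
    return ds


# Tiny dataset with one data variable and one non-index coordinate.
ds = xr.Dataset(
    {"U": (("y", "x"), np.ones((2, 3)))},
    coords={"lon": (("y", "x"), np.arange(6.0).reshape(2, 3))},
)
out = replace_arrays_with_zeros_sketch(ds, except_for=["lon"])
print(out["U"].values.sum())    # 0.0  -> data variable zeroed
print(out["lon"].values.sum())  # 15.0 -> listed coord preserved
```

The try/except around the `.data` assignment mirrors the shipped code: xarray refuses in-place data assignment on dimension (index) coordinates, so those are silently left intact.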

tests/datasets/test_utils.py

Lines changed: 79 additions & 0 deletions

```diff
@@ -0,0 +1,79 @@
+import numpy as np
+import pytest
+import xarray as xr
+
+from parcels._datasets import utils
+from parcels._datasets.structured.generic import datasets
+
+
+@pytest.fixture
+def nonzero_ds():
+    """Small dataset with nonzero data_vars and non-index coords for replace_arrays_with_zeros tests.
+
+    Uses 2D lon/lat as coords so they are regular (non-index) variables that can be zeroed.
+    """
+    import dask.array as da
+
+    lon = np.array([[1.0, 2.0, 3.0, 4.0]] * 3)
+    lat = np.array([[10.0] * 4, [20.0] * 4, [30.0] * 4])
+    return xr.Dataset(
+        {
+            "U": (["y", "x"], da.from_array(np.ones((3, 4)), chunks=-1)),
+            "V": (["y", "x"], da.from_array(np.full((3, 4), 2.0), chunks=-1)),
+        },
+        coords={
+            "lon": (["y", "x"], da.from_array(lon, chunks=-1)),
+            "lat": (["y", "x"], da.from_array(lat, chunks=-1)),
+        },
+    )
+
+
+@pytest.mark.parametrize("ds", [pytest.param(v, id=k) for k, v in datasets.items()])
+@pytest.mark.parametrize("except_for", [None, "coords"])
+def test_replace_arrays_with_zeros(ds, except_for):
+    # make sure doesn't error with range of datasets
+    utils.replace_arrays_with_zeros(ds, except_for=except_for)
+
+
+def test_replace_arrays_with_zeros_none(nonzero_ds):
+    """except_for=None: all data_vars and coords replaced with zeros."""
+    result = utils.replace_arrays_with_zeros(nonzero_ds, except_for=None)
+
+    for k in set(result.data_vars) | set(result.coords):
+        assert np.all(result[k].values == 0), f"{k!r} should be zero"
+
+
+def test_replace_arrays_with_zeros_coords(nonzero_ds):
+    """except_for='coords': data_vars zeroed, coords preserved."""
+    result = utils.replace_arrays_with_zeros(nonzero_ds, except_for="coords")
+
+    for k in result.data_vars:
+        assert np.all(result[k].values == 0), f"data_var {k!r} should be zero"
+
+    np.testing.assert_array_equal(result["lon"].values, nonzero_ds["lon"].values)
+    np.testing.assert_array_equal(result["lat"].values, nonzero_ds["lat"].values)
+
+
+def test_replace_arrays_with_zeros_list(nonzero_ds):
+    """except_for=[...]: listed variables preserved, others zeroed."""
+    result = utils.replace_arrays_with_zeros(nonzero_ds, except_for=["U", "lon"])
+
+    np.testing.assert_array_equal(result["U"].values, nonzero_ds["U"].values)
+    np.testing.assert_array_equal(result["lon"].values, nonzero_ds["lon"].values)
+    assert np.all(result["V"].values == 0), "V should be zero"
+    assert np.all(result["lat"].values == 0), "lat should be zero"
+
+
+def test_replace_arrays_with_zeros_does_not_mutate(nonzero_ds):
+    """Original dataset is not modified."""
+    original_U = nonzero_ds["U"].values.copy()
+    original_lon = nonzero_ds["lon"].values.copy()
+    utils.replace_arrays_with_zeros(nonzero_ds, except_for=None)
+    np.testing.assert_array_equal(nonzero_ds["U"].values, original_U)
+    np.testing.assert_array_equal(nonzero_ds["lon"].values, original_lon)
+
+
+def test_replace_arrays_with_zeros_invalid_key(nonzero_ds):
+    """Invalid key in except_for raises ValueError."""
+    with pytest.raises(ValueError, match="not a valid item"):
+        utils.replace_arrays_with_zeros(nonzero_ds, except_for=["nonexistent"])
```
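The non-mutation guarantee exercised by `test_replace_arrays_with_zeros_does_not_mutate` rests on `Dataset.copy()` returning new variable objects, so reassigning `.data` on the copy leaves the original arrays untouched. A minimal standalone analogue in plain numpy/xarray, where `zero_all` is a hypothetical stand-in for the Parcels function:

```python
import numpy as np
import xarray as xr


def zero_all(ds):
    """Hypothetical stand-in: zero every data variable on a copy of ds."""
    out = ds.copy()  # shallow copy; variables are new objects sharing old arrays
    for k in out.data_vars:
        # Rebinds the copy's variable to a fresh zeros array;
        # the original dataset's variable still holds its data.
        out[k].data = np.zeros_like(out[k].data)
    return out


ds = xr.Dataset({"U": (("y", "x"), np.full((2, 2), 3.0))})
result = zero_all(ds)

print(result["U"].values.sum())  # 0.0  -> arrays zeroed on the copy
print(ds["U"].values.sum())      # 12.0 -> original left untouched
```

A deep copy is unnecessary here precisely because assignment replaces the array reference rather than writing into the shared buffer; writing elementwise (e.g. `out[k].values[:] = 0`) would mutate the original.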
