Skip to content

Commit 6258962

Browse files
committed
Add xr.Dataset support to rechunk_no_shuffle (#1069)
Accepts both DataArray and Dataset. For Datasets, each dask-backed variable is rechunked independently. Also adds the method to the Dataset .xrs accessor.
1 parent 0531d16 commit 6258962

File tree

2 files changed

+48
-30
lines changed

2 files changed

+48
-30
lines changed

xrspatial/accessor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -910,3 +910,9 @@ def open_geotiff(self, source, **kwargs):
910910
y_min, y_max, x_min, x_max)
911911
kwargs.pop('window', None)
912912
return open_geotiff(source, window=window, **kwargs)
913+
914+
# ---- Chunking ----

def rechunk_no_shuffle(self, **kwargs):
    """Rechunk the wrapped object without triggering a dask shuffle.

    Thin accessor wrapper: forwards the wrapped object plus all keyword
    arguments to :func:`xrspatial.utils.rechunk_no_shuffle`.
    """
    from .utils import rechunk_no_shuffle as _impl
    return _impl(self._obj, **kwargs)

xrspatial/utils.py

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,8 +1028,34 @@ def _sample_windows_min_max(
10281028
return float(np.nanmin(np.array(mins, dtype=float))), float(np.nanmax(np.array(maxs, dtype=float)))
10291029

10301030

1031+
def _rechunk_dataarray(agg, target_bytes):
    """Rechunk a single dask-backed DataArray toward ``target_bytes`` per chunk.

    Grows chunks by an integer multiple of the original chunk size so the
    rechunk is a pure merge of neighbouring chunks (no shuffle). Returns
    *agg* unchanged when it is not dask-backed, already at or above the
    target, or when no integer multiplier greater than 1 fits.

    Parameters
    ----------
    agg : xr.DataArray
        Array to rechunk. Non-dask data passes through unchanged.
    target_bytes : int or float
        Desired upper bound, in bytes, on the size of each new chunk.

    Returns
    -------
    xr.DataArray
        The input, possibly rechunked to larger, exact-multiple chunks.
    """
    # Only dask-backed arrays have chunks to merge.
    if not has_dask_array() or not isinstance(agg.data, da.Array):
        return agg

    chunks = agg.chunks  # tuple of per-dimension chunk-size tuples
    base = tuple(c[0] for c in chunks)  # leading chunk length per dim

    # Bytes in one (leading) chunk: itemsize * product of chunk lengths.
    current_bytes = agg.dtype.itemsize
    for b in base:
        current_bytes *= b

    # Fix: guard 0-d arrays and degenerate (zero-length) chunks. Without
    # this, ``ratio ** (1.0 / ndim)`` divides by zero for a scalar array
    # (ndim == 0) and ``target_bytes / current_bytes`` blows up when a
    # chunk length is 0. Nothing can be merged in either case.
    if len(base) == 0 or current_bytes <= 0:
        return agg

    if current_bytes >= target_bytes:
        return agg

    # Spread the growth factor evenly across dimensions, flooring so the
    # resulting chunk never exceeds the target.
    ndim = len(base)
    ratio = target_bytes / current_bytes
    multiplier = max(1, int(ratio ** (1.0 / ndim)))

    if multiplier <= 1:
        return agg

    # New chunks are exact multiples of the originals, so dask merges
    # neighbouring chunks in place instead of shuffling data.
    new_chunks = {dim: b * multiplier for dim, b in zip(agg.dims, base)}
    return agg.chunk(new_chunks)
1055+
1056+
10311057
def rechunk_no_shuffle(agg, target_mb=128):
    """Rechunk dask-backed data without triggering a shuffle.

    Each new chunk is an exact integer multiple of the original chunk
    size, which lets dask merge neighbouring chunks rather than move
    data between them.

    Parameters
    ----------
    agg : xr.DataArray or xr.Dataset
        Input raster or collection of rasters. Non-dask variables
        pass through unchanged.
    target_mb : int or float
        Target chunk size in megabytes. The actual chunk size will be
        the closest multiple of the source chunk that does not exceed
        this target. Default 128.

    Returns
    -------
    xr.DataArray or xr.Dataset
        Rechunked object. Coordinates and attributes are preserved.

    Raises
    ------
    TypeError
        If *agg* is not an ``xr.DataArray`` or ``xr.Dataset``.
    ValueError
        If *target_mb* is not positive.

    Examples
    --------
    >>> big = rechunk_no_shuffle(arr, target_mb=64)
    >>> big.chunks  # exact multiples of the source chunk size
    """
    # Validate inputs up front, raising the exceptions callers expect.
    if not isinstance(agg, (xr.DataArray, xr.Dataset)):
        raise TypeError(
            f"rechunk_no_shuffle(): expected xr.DataArray or xr.Dataset, "
            f"got {type(agg).__name__}"
        )
    if target_mb <= 0:
        raise ValueError(
            f"rechunk_no_shuffle(): target_mb must be > 0, got {target_mb}"
        )

    target_bytes = target_mb * 1024 * 1024

    # Dataset: rechunk each data variable independently; assign() keeps
    # coordinates and dataset-level attributes intact.
    if isinstance(agg, xr.Dataset):
        rechunked = {
            name: _rechunk_dataarray(var, target_bytes)
            for name, var in agg.data_vars.items()
        }
        return agg.assign(rechunked)

    # DataArray: delegate straight to the per-array helper.
    return _rechunk_dataarray(agg, target_bytes)

0 commit comments

Comments
 (0)