Skip to content

Commit a71302a

Browse files
committed
Remove zarr re-open logic from rechunk and preview
Drop _is_unmodified_zarr, _reopen_preview_chunks, and _preview_chunk_budget. Dataset rechunk now uses ds.chunk() instead of re-opening the zarr store under the hood.
1 parent 4c665a1 commit a71302a

File tree

3 files changed

+3
-190
lines changed

3 files changed

+3
-190
lines changed

xrspatial/preview.py

Lines changed: 0 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,6 @@
1313
_COARSEN_METHODS = ('mean', 'median', 'max', 'min')
1414
_METHODS = (*_COARSEN_METHODS, 'nearest', 'bilinear')
1515

16-
# Fallback chunk budget when no distributed client is available.
17-
_DEFAULT_PREVIEW_CHUNK_BYTES = 512 * 1024 * 1024
18-
19-
20-
def _preview_chunk_budget():
21-
"""Max bytes per preview chunk, based on the active dask cluster.
22-
23-
If a ``dask.distributed`` client is connected, returns
24-
``worker_memory * 0.7 / nthreads`` so that concurrent tasks on
25-
the same worker stay under the memory-pause threshold. Otherwise
26-
falls back to ``_DEFAULT_PREVIEW_CHUNK_BYTES`` (512 MB).
27-
"""
28-
try:
29-
from dask.distributed import get_client
30-
client = get_client()
31-
info = client.scheduler_info()
32-
workers = info.get('workers', {})
33-
if workers:
34-
w = next(iter(workers.values()))
35-
mem = w.get('memory_limit', 0)
36-
nthreads = w.get('nthreads', 1) or 1
37-
if mem > 0:
38-
return int(mem * 0.7 / nthreads)
39-
except Exception:
40-
pass
41-
return _DEFAULT_PREVIEW_CHUNK_BYTES
42-
4316

4417
def _nan_full(oh, ow, block):
4518
"""NaN-filled ``(oh, ow)`` array matching *block*'s type and dtype."""
@@ -339,58 +312,6 @@ def _refine_to_target(result, target_h, target_w, y_dim, x_dim):
339312
# Public API
340313
# ---------------------------------------------------------------------------
341314

342-
def _reopen_preview_chunks(agg):
343-
"""Re-open a zarr-backed DataArray with memory-safe chunks.
344-
345-
Computes the largest chunk size that is an exact multiple of the
346-
zarr storage chunks and fits under the per-task memory budget
347-
(derived from the active dask cluster configuration). This keeps
348-
the task graph small (far fewer chunks than storage granularity)
349-
while keeping peak memory per task well within worker limits even
350-
when ``threads_per_worker > 1``.
351-
352-
Returns a new DataArray or *None* if the source isn't available.
353-
When the input is a spatial subset (``.sel()``), the returned
354-
array covers the same coordinate range.
355-
"""
356-
source = agg.encoding.get('_xrs_zarr_source')
357-
pref = agg.encoding.get('preferred_chunks')
358-
if source is None or pref is None or agg.name is None:
359-
return None
360-
try:
361-
budget = _preview_chunk_budget()
362-
# Compute the largest multiple of storage chunks that fits
363-
# under the per-task budget.
364-
base = tuple(pref[d] for d in agg.dims if d in pref)
365-
if not base or len(base) != 2:
366-
return None
367-
base_bytes = agg.dtype.itemsize * base[0] * base[1]
368-
if base_bytes >= budget:
369-
# Storage chunks already exceed the budget; use them as-is.
370-
chunks = pref
371-
else:
372-
ratio = budget / base_bytes
373-
multiplier = max(1, int(ratio ** (1.0 / len(base))))
374-
chunks = {d: pref[d] * multiplier for d in agg.dims if d in pref}
375-
376-
ds = xr.open_zarr(source, chunks=chunks)
377-
if agg.name not in ds:
378-
return None
379-
da_full = ds[agg.name]
380-
# Select to match the current DataArray's coordinate extent.
381-
sel = {}
382-
for dim in agg.dims:
383-
if dim in agg.coords and dim in da_full.coords:
384-
c = agg.coords[dim].values
385-
if len(c) > 0:
386-
sel[dim] = slice(c[0], c[-1])
387-
if sel:
388-
da_full = da_full.sel(sel)
389-
return da_full
390-
except Exception:
391-
return None
392-
393-
394315
@supports_dataset
395316
def preview(agg, width=1000, height=None, method='mean', name='preview'):
396317
"""Downsample a raster to target pixel dimensions.
@@ -433,22 +354,6 @@ def preview(agg, width=1000, height=None, method='mean', name='preview'):
433354
f"method must be one of {_METHODS!r}, got {method!r}"
434355
)
435356

436-
# If chunks are too large for a single worker task, re-open from
437-
# the zarr source with memory-safe chunks. The budget accounts
438-
# for threads_per_worker so concurrent tasks don't collectively
439-
# exceed the worker's memory-pause threshold.
440-
try:
441-
import dask.array as _da
442-
if isinstance(agg.data, _da.Array):
443-
chunk_bytes = (agg.dtype.itemsize
444-
* agg.data.chunksize[0] * agg.data.chunksize[1])
445-
if chunk_bytes > _preview_chunk_budget():
446-
safe = _reopen_preview_chunks(agg)
447-
if safe is not None:
448-
agg = safe
449-
except ImportError:
450-
pass
451-
452357
h = agg.sizes[agg.dims[0]]
453358
w = agg.sizes[agg.dims[1]]
454359

xrspatial/tests/test_rechunk_no_shuffle.py

Lines changed: 1 addition & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,6 @@
88

99
da = pytest.importorskip("dask.array")
1010

11-
_has_zarr = True
12-
try:
13-
import zarr # noqa: F401
14-
except ImportError:
15-
_has_zarr = False
16-
17-
requires_zarr = pytest.mark.skipif(not _has_zarr, reason="zarr not installed")
18-
1911

2012
# ---------------------------------------------------------------------------
2113
# Helpers
@@ -110,52 +102,7 @@ def test_rejects_non_dataarray():
110102
rechunk_no_shuffle(np.zeros((10, 10)))
111103

112104

113-
# ---------------------------------------------------------------------------
114-
# Zarr re-open optimisation
115-
# ---------------------------------------------------------------------------
116-
117-
@requires_zarr
118-
def test_zarr_reopen_reduces_graph(tmp_path):
119-
"""For a fresh zarr Dataset, rechunk should re-open with fewer tasks."""
120-
path = str(tmp_path / "rns_zarr_reopen.zarr")
121-
ds = xr.Dataset({"elev": xr.DataArray(
122-
np.random.rand(100, 100).astype(np.float64), dims=["y", "x"],
123-
coords={"y": np.arange(100), "x": np.arange(100)},
124-
)})
125-
ds.chunk({"y": 10, "x": 10}).to_zarr(path)
126-
127-
ds_in = xr.open_zarr(path)
128-
tasks_before = len(ds_in["elev"].data.__dask_graph__())
129-
130-
ds_out = rechunk_no_shuffle(ds_in, target_mb=1)
131-
tasks_after = len(ds_out["elev"].data.__dask_graph__())
132-
133-
# Re-open should produce fewer tasks, not more
134-
assert tasks_after < tasks_before, (
135-
f"expected fewer tasks after rechunk, got {tasks_after} >= {tasks_before}"
136-
)
137-
# Values must match
138-
np.testing.assert_array_equal(ds_in["elev"].values, ds_out["elev"].values)
139-
140-
141-
@requires_zarr
142-
def test_zarr_reopen_skipped_after_sel(tmp_path):
143-
"""After .sel(), the graph has >2 layers so re-open is skipped."""
144-
path = str(tmp_path / "rns_zarr_sel.zarr")
145-
ds = xr.Dataset({"elev": xr.DataArray(
146-
np.random.rand(100, 100).astype(np.float64), dims=["y", "x"],
147-
coords={"y": np.arange(100), "x": np.arange(100)},
148-
)})
149-
ds.chunk({"y": 10, "x": 10}).to_zarr(path)
150-
151-
ds_sel = xr.open_zarr(path).sel(y=slice(10, 50))
152-
result = rechunk_no_shuffle(ds_sel, target_mb=1)
153-
154-
# Should still rechunk (values match), just not via re-open
155-
np.testing.assert_array_equal(ds_sel["elev"].values, result["elev"].values)
156-
157-
158-
def test_dataset_rechunk_fallback():
105+
def test_dataset_rechunk():
159106
    """Dataset rechunks every variable via ds.chunk() (no zarr re-open)."""
160107
ds = xr.Dataset({
161108
"elev": xr.DataArray(

xrspatial/utils.py

Lines changed: 2 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,19 +1055,6 @@ def _no_shuffle_chunks(chunks, dtype, dims, target_mb):
10551055
return {dim: b * multiplier for dim, b in zip(dims, base)}
10561056

10571057

1058-
def _is_unmodified_zarr(ds):
1059-
"""True when every dask variable is a direct zarr read (2 layers)."""
1060-
found_dask = False
1061-
for var in ds.data_vars.values():
1062-
data = var.data
1063-
if has_dask_array() and isinstance(data, da.Array):
1064-
found_dask = True
1065-
graph = data.__dask_graph__()
1066-
if hasattr(graph, 'layers') and len(graph.layers) != 2:
1067-
return False
1068-
return found_dask
1069-
1070-
10711058
def rechunk_no_shuffle(agg, target_mb=128):
10721059
"""Rechunk a dask-backed DataArray or Dataset without triggering a shuffle.
10731060
@@ -1076,12 +1063,6 @@ def rechunk_no_shuffle(agg, target_mb=128):
10761063
merge whole source chunks in-place instead of splitting and
10771064
recombining partial blocks (which is effectively a shuffle).
10781065
1079-
For file-backed data (e.g. Zarr stores), the function re-opens
1080-
the source with the target chunk sizes so that each dask task
1081-
reads multiple storage chunks in one call. This produces a
1082-
dramatically smaller task graph compared to ``.chunk()``, which
1083-
adds a rechunk merge layer on top of the existing read tasks.
1084-
10851066
Parameters
10861067
----------
10871068
agg : xr.DataArray or xr.Dataset
@@ -1137,7 +1118,7 @@ def rechunk_no_shuffle(agg, target_mb=128):
11371118

11381119

11391120
def _rechunk_dataset_no_shuffle(ds, target_mb):
1140-
"""Rechunk a Dataset, re-opening from zarr when possible."""
1121+
"""Rechunk every variable in a Dataset without triggering a shuffle."""
11411122
if target_mb <= 0:
11421123
raise ValueError(
11431124
f"rechunk_no_shuffle(): target_mb must be > 0, got {target_mb}"
@@ -1160,27 +1141,7 @@ def _rechunk_dataset_no_shuffle(ds, target_mb):
11601141
if new_chunks is None:
11611142
return ds
11621143

1163-
# For unmodified zarr reads, re-open with target chunks so
1164-
# each dask task reads multiple storage chunks in one call.
1165-
# This avoids the extra rechunk-merge graph layer that
1166-
# .chunk() would add on top of the existing read tasks.
1167-
source = ds.encoding.get('source')
1168-
if source is not None and _is_unmodified_zarr(ds):
1169-
try:
1170-
reopened = xr.open_zarr(source, chunks=new_chunks)
1171-
if set(ds.data_vars) <= set(reopened.data_vars):
1172-
result = reopened[list(ds.data_vars)]
1173-
# Propagate zarr source into each variable's encoding
1174-
# so downstream operations (e.g. preview) can re-open
1175-
# with different chunks when needed.
1176-
for name in result.data_vars:
1177-
result[name].encoding['_xrs_zarr_source'] = source
1178-
return result
1179-
except Exception:
1180-
pass
1181-
1182-
# Fallback: rechunk each variable individually.
1183-
return ds.map(rechunk_no_shuffle, target_mb=target_mb)
1144+
return ds.chunk(new_chunks)
11841145

11851146

11861147
def _normalize_depth(depth, ndim):

0 commit comments

Comments (0)