Skip to content

Commit 21475cf

Browse files
committed
Fix OOM in geotiff dask read, sieve memory, and reproject GPU fallback
Three performance fixes from the Phase 2 sweep, targeting "WILL OOM" verdicts under 30 TB workloads.

geotiff: read_geotiff_dask() was reading the entire file into RAM just to extract metadata before building the lazy dask graph. It now uses _read_geo_info(), which parses only the IFD via mmap — O(1) memory regardless of file size. Peak memory during dask setup dropped from 4.41 MB to 0.21 MB at 512x512 (a 21x reduction).

sieve: region_val_buf was allocated at rows*cols (16 GB for a 46K x 46K raster) even though the actual region count is typically orders of magnitude smaller. It now counts regions first and allocates at the actual size. It also reuses the dead rank array as root_to_id, saving another 4 bytes/pixel. The memory guard was corrected from a misleading 5x multiplier to an accurate 28 bytes/pixel estimate.

reproject: _reproject_dask_cupy pre-allocated the full output on GPU via cp.full(out_shape), which OOMs for large outputs. It now checks available GPU memory and falls back to the existing map_blocks path (with is_cupy=True) when the output exceeds VRAM. The fast path is preserved for outputs that fit.
1 parent ce4f0d8 commit 21475cf

File tree

4 files changed

+97
-48
lines changed

4 files changed

+97
-48
lines changed

xrspatial/accessor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1027,7 +1027,7 @@ def open_geotiff(self, source, **kwargs):
10271027
y_min, y_max = float(y.min()), float(y.max())
10281028
x_min, x_max = float(x.min()), float(x.max())
10291029

1030-
geo_info, file_h, file_w = _read_geo_info(source)
1030+
geo_info, file_h, file_w, _dtype, _nbands = _read_geo_info(source)
10311031
t = geo_info.transform
10321032

10331033
# Expand extent by half a pixel so we capture edge pixels

xrspatial/geotiff/__init__.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,19 @@ def _coords_to_transform(da: xr.DataArray) -> GeoTransform | None:
114114
)
115115

116116

117-
def _read_geo_info(source: str):
117+
def _read_geo_info(source: str, *, overview_level: int | None = None):
118118
"""Read only the geographic metadata and image dimensions from a GeoTIFF.
119119
120-
Returns (geo_info, height, width) without reading pixel data.
120+
Returns (geo_info, height, width, dtype, n_bands) without reading pixel
121+
data. Uses mmap for header-only access -- O(1) memory regardless of file
122+
size.
123+
124+
Parameters
125+
----------
126+
overview_level : int or None
127+
Overview IFD index (0 = full resolution).
121128
"""
129+
from ._dtypes import tiff_dtype_to_numpy
122130
from ._geotags import extract_geo_info
123131
from ._header import parse_all_ifds, parse_header
124132

@@ -128,9 +136,17 @@ def _read_geo_info(source: str):
128136
try:
129137
header = parse_header(data)
130138
ifds = parse_all_ifds(data, header)
131-
ifd = ifds[0]
139+
ifd_idx = 0
140+
if overview_level is not None:
141+
ifd_idx = min(overview_level, len(ifds) - 1)
142+
ifd = ifds[ifd_idx]
132143
geo_info = extract_geo_info(ifd, data, header.byte_order)
133-
return geo_info, ifd.height, ifd.width
144+
bps = ifd.bits_per_sample
145+
if isinstance(bps, tuple):
146+
bps = bps[0]
147+
file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format)
148+
n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0
149+
return geo_info, ifd.height, ifd.width, file_dtype, n_bands
134150
finally:
135151
data.close()
136152

@@ -873,11 +889,9 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512,
873889
if source.lower().endswith('.vrt'):
874890
return read_vrt(source, dtype=dtype, name=name, chunks=chunks)
875891

876-
# First, do a metadata-only read to get shape, dtype, coords, attrs
877-
arr, geo_info = read_to_array(source, overview_level=overview_level)
878-
full_h, full_w = arr.shape[:2]
879-
n_bands = arr.shape[2] if arr.ndim == 3 else 0
880-
file_dtype = arr.dtype
892+
# Metadata-only read: O(1) memory via mmap, no pixel decompression
893+
geo_info, full_h, full_w, file_dtype, n_bands = _read_geo_info(
894+
source, overview_level=overview_level)
881895
nodata = geo_info.nodata
882896

883897
# Nodata masking promotes integer arrays to float64 (for NaN).

xrspatial/reproject/__init__.py

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -965,20 +965,18 @@ def _reproject_dask_cupy(
965965
resampling, nodata, precision,
966966
chunk_size,
967967
):
968-
"""Dask+CuPy backend: process output chunks on GPU sequentially.
969-
970-
Instead of dask.delayed per chunk (which has ~15ms overhead each from
971-
pyproj init + small CUDA launches), we:
972-
1. Create CRS/transformer objects once
973-
2. Use GPU-sized output chunks (2048x2048 by default)
974-
3. For each output chunk, compute CUDA coordinates and fetch only
975-
the source window needed from the dask array
976-
4. Assemble the result as a CuPy array
977-
978-
For sources that fit in GPU memory, this is ~22x faster than the
979-
dask.delayed path. For sources that don't fit, each chunk fetches
980-
only its required window, so GPU memory usage scales with chunk size,
981-
not source size.
968+
"""Dask+CuPy backend: process output chunks on GPU.
969+
970+
Two modes based on available GPU memory:
971+
972+
**Fast path** (output fits in GPU VRAM): pre-allocates the full output
973+
on GPU and fills it chunk-by-chunk. ~22x faster than the map_blocks
974+
path because CRS/transformer objects are created once and CUDA kernels
975+
run with minimal launch overhead.
976+
977+
**Chunked fallback** (output exceeds GPU VRAM): delegates to
978+
``_reproject_dask(is_cupy=True)`` which uses ``map_blocks`` and
979+
processes one chunk at a time with O(chunk_size) GPU memory.
982980
"""
983981
import cupy as cp
984982

@@ -999,18 +997,29 @@ def _reproject_dask_cupy(
999997
src_res_x = (src_right - src_left) / src_w
1000998
src_res_y = (src_top - src_bottom) / src_h
1001999

1002-
# Memory guard: the full output is allocated on GPU.
1000+
# Memory check: if the full output doesn't fit in GPU memory,
1001+
# fall back to the map_blocks path which is O(chunk_size) memory.
10031002
estimated = out_shape[0] * out_shape[1] * 8 # float64
10041003
try:
10051004
free_gpu, _total = cp.cuda.Device().mem_info
1006-
if estimated > 0.8 * free_gpu:
1007-
raise MemoryError(
1008-
f"_reproject_dask_cupy needs ~{estimated / 1e9:.1f} GB on GPU "
1009-
f"for the full output but only ~{free_gpu / 1e9:.1f} GB free. "
1010-
f"Reduce output resolution or use the dask+numpy path."
1011-
)
1005+
fits_in_gpu = estimated < 0.5 * free_gpu
10121006
except (AttributeError, RuntimeError):
1013-
pass # no device info available
1007+
fits_in_gpu = False
1008+
1009+
if not fits_in_gpu:
1010+
import warnings
1011+
warnings.warn(
1012+
f"Output ({estimated / 1e9:.1f} GB) exceeds GPU memory; "
1013+
f"falling back to chunked map_blocks path.",
1014+
stacklevel=3,
1015+
)
1016+
return _reproject_dask(
1017+
raster, src_bounds, src_shape, y_desc,
1018+
src_wkt, tgt_wkt,
1019+
out_bounds, out_shape,
1020+
resampling, nodata, precision,
1021+
chunk_size or 2048, True, # is_cupy=True
1022+
)
10141023

10151024
result = cp.full(out_shape, nodata, dtype=cp.float64)
10161025

xrspatial/sieve.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -131,10 +131,32 @@ def _label_connected(data, valid, neighborhood):
131131
):
132132
_uf_union(parent, rank, idx, (r - 1) * cols + (c + 1))
133133

134+
# --- Count unique regions first so region_val_buf is right-sized ---
135+
# Reuse rank array (no longer needed after union-find) as root_to_id.
136+
# This eliminates a separate n-element int32 allocation.
137+
root_to_id = rank # alias; rank is dead after union-find
138+
for i in range(n):
139+
root_to_id[i] = 0 # clear
140+
141+
n_regions = 0
142+
for i in range(n):
143+
r = i // cols
144+
c = i % cols
145+
if not valid[r, c]:
146+
continue
147+
root = _uf_find(parent, i)
148+
if root_to_id[root] == 0:
149+
root_to_id[root] = 1 # mark as seen
150+
n_regions += 1
151+
152+
# Allocate region_val_buf at actual region count, not pixel count.
153+
# For a 46K x 46K raster with 100K regions this saves ~16 GB.
154+
region_val_buf = np.full(n_regions + 1, np.nan, dtype=np.float64)
155+
134156
# Assign contiguous region IDs
135157
region_map_flat = np.zeros(n, dtype=np.int32)
136-
root_to_id = np.zeros(n, dtype=np.int32)
137-
region_val_buf = np.full(n + 1, np.nan, dtype=np.float64)
158+
for i in range(n):
159+
root_to_id[i] = 0 # clear for ID assignment
138160
next_id = 1
139161

140162
for i in range(n):
@@ -319,14 +341,17 @@ def _available_memory_bytes():
319341
def _sieve_dask(data, threshold, neighborhood, skip_values):
320342
"""Dask+numpy backend: compute to numpy, sieve, wrap back."""
321343
avail = _available_memory_bytes()
322-
estimated_bytes = np.prod(data.shape) * data.dtype.itemsize
323-
if estimated_bytes * 5 > 0.5 * avail:
344+
n_pixels = np.prod(data.shape)
345+
# Peak memory: input + result (float64 each) + parent + rank +
346+
# region_map_flat (int32 each) = 2*8 + 3*4 = 28 bytes/pixel.
347+
estimated_bytes = n_pixels * 28
348+
if estimated_bytes > 0.5 * avail:
324349
raise MemoryError(
325-
f"sieve() needs the full array in memory "
326-
f"(~{estimated_bytes * 5 / 1e9:.1f} GB) but only "
327-
f"~{avail / 1e9:.1f} GB is available. Connected-component "
328-
f"labeling is a global operation that cannot be chunked. "
329-
f"Consider downsampling or tiling the input manually."
350+
f"sieve() needs ~{estimated_bytes / 1e9:.1f} GB for the full "
351+
f"array plus CCL bookkeeping, but only ~{avail / 1e9:.1f} GB "
352+
f"is available. Connected-component labeling is a global "
353+
f"operation that cannot be chunked. Consider downsampling "
354+
f"or tiling the input manually."
330355
)
331356

332357
np_data = data.compute()
@@ -338,18 +363,19 @@ def _sieve_dask(data, threshold, neighborhood, skip_values):
338363

339364
def _sieve_dask_cupy(data, threshold, neighborhood, skip_values):
340365
"""Dask+CuPy backend: compute to cupy, sieve via CPU fallback, wrap back."""
341-
estimated_bytes = np.prod(data.shape) * data.dtype.itemsize
366+
n_pixels = np.prod(data.shape)
367+
estimated_bytes = n_pixels * 28
342368
try:
343369
import cupy as cp
344370

345371
free_gpu, _total = cp.cuda.Device().mem_info
346-
if estimated_bytes * 5 > 0.5 * free_gpu:
372+
if estimated_bytes > 0.5 * free_gpu:
347373
raise MemoryError(
348-
f"sieve() needs the full array on GPU "
349-
f"(~{estimated_bytes * 5 / 1e9:.1f} GB) but only "
350-
f"~{free_gpu / 1e9:.1f} GB free. Connected-component "
351-
f"labeling is a global operation that cannot be chunked. "
352-
f"Consider downsampling or tiling the input manually."
374+
f"sieve() needs ~{estimated_bytes / 1e9:.1f} GB for the "
375+
f"full array plus CCL bookkeeping, but only "
376+
f"~{free_gpu / 1e9:.1f} GB free GPU memory. Connected-"
377+
f"component labeling is a global operation that cannot be "
378+
f"chunked. Consider downsampling or tiling the input."
353379
)
354380
except (ImportError, AttributeError):
355381
pass

0 commit comments

Comments (0)