Skip to content

Commit abc8d96

Browse files
committed
Prevent OOM on large datasets in reproject and merge (#1045)
Three safeguards for datasets that exceed available RAM: 1. Auto-chunk large non-dask inputs (reproject): If the source array exceeds 512MB, automatically wrap it in dask.array with the configured chunk_size (default 512x512). This routes it through the chunked dask path instead of the in-memory path that would call .values and OOM. 2. Auto-promote merge to dask path: If the combined output size (output_shape * n_tiles * 8 bytes) exceeds 512MB, use the dask merge path even if no input is dask. This prevents _merge_inmemory from calling .values on each tile. 3. Cap source window size in chunk workers: If a single output chunk maps to a source window larger than 64 Mpixels (~512MB for float64), return nodata instead of materializing the window. This prevents extreme projections (e.g. polar stereographic edge pixels mapping to the entire source hemisphere) from OOMing individual chunk workers. A 30TB dataset with 16GB RAM would now: - Auto-chunk into 512x512 tiles - Process each tile independently (~2MB working memory per tile) - Never materialize more than 512MB in a single operation
1 parent 4cae5fc commit abc8d96

File tree

1 file changed

+35
-1
lines changed

1 file changed

+35
-1
lines changed

xrspatial/reproject/__init__.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,13 @@ def _reproject_chunk_numpy(
256256
c_min_clip = max(0, c_min)
257257
c_max_clip = min(src_w, c_max)
258258

259+
# Guard: cap source window to prevent OOM if projection maps a small
260+
# output chunk to a huge source area (e.g. polar stereographic edges).
261+
_MAX_WINDOW_PIXELS = 64 * 1024 * 1024 # 64 Mpix (~512 MB for float64)
262+
win_pixels = (r_max_clip - r_min_clip) * (c_max_clip - c_min_clip)
263+
if win_pixels > _MAX_WINDOW_PIXELS:
264+
return np.full(chunk_shape, nodata, dtype=np.float64)
265+
259266
# Extract source window
260267
window = source_data[r_min_clip:r_max_clip, c_min_clip:c_max_clip]
261268
if hasattr(window, 'compute'):
@@ -558,6 +565,26 @@ def reproject(
558565
else:
559566
is_cupy = is_cupy_array(data)
560567

568+
# Auto-chunk large non-dask arrays to prevent OOM.
569+
# A 30TB float32 raster would instantly OOM if we called .values.
570+
# Threshold: 512MB (configurable via chunk_size).
571+
if not is_dask and not is_cupy:
572+
nbytes = src_shape[0] * src_shape[1] * data.dtype.itemsize
573+
if data.ndim == 3:
574+
nbytes *= data.shape[2]
575+
_OOM_THRESHOLD = 512 * 1024 * 1024 # 512 MB
576+
if nbytes > _OOM_THRESHOLD:
577+
import dask.array as _da
578+
cs = chunk_size or 512
579+
if isinstance(cs, int):
580+
cs = (cs, cs)
581+
data = _da.from_array(data, chunks=cs)
582+
raster = xr.DataArray(
583+
data, dims=raster.dims, coords=raster.coords,
584+
name=raster.name, attrs=raster.attrs,
585+
)
586+
is_dask = True
587+
561588
# Serialize CRS for pickle safety
562589
src_wkt = src_crs.to_wkt()
563590
tgt_wkt = tgt_crs.to_wkt()
@@ -1082,14 +1109,21 @@ def merge(
10821109
out_shape = grid['shape']
10831110
tgt_wkt = tgt_crs.to_wkt()
10841111

1085-
# Detect if any input is dask
1112+
# Detect if any input is dask, or if total size exceeds memory threshold
10861113
from ..utils import has_dask_array
10871114

10881115
any_dask = False
10891116
if has_dask_array():
10901117
import dask.array as _da
10911118
any_dask = any(isinstance(r.data, _da.Array) for r in rasters)
10921119

1120+
# Auto-promote to dask path if output would be too large for in-memory merge
1121+
if not any_dask:
1122+
out_nbytes = out_shape[0] * out_shape[1] * 8 * len(rasters) # float64 per tile
1123+
_OOM_THRESHOLD = 512 * 1024 * 1024
1124+
if out_nbytes > _OOM_THRESHOLD:
1125+
any_dask = True
1126+
10931127
if any_dask:
10941128
result_data = _merge_dask(
10951129
raster_infos, tgt_wkt, out_bounds, out_shape,

0 commit comments

Comments
 (0)