From 239040afaaae1341d07a21eb02766b0976ac032b Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 17 May 2026 21:41:04 -0700 Subject: [PATCH 1/2] rasterize: slice line_props/point_props per tile in dask backends (#2020) The dask backends embedded the full per-type props tables into every delayed tile task, despite the polygon path already filtering via poly_props[pmask]. For workloads with many localized geometries this amplified scheduler overhead by O(N_geoms * N_tiles). Adds _slice_props_for_tile that returns the subset of props referenced by a tile and a remapped local-index array. Wired into _run_dask_numpy and _run_dask_cupy alongside the existing segment / point tile filters. Measured: 5000 points x 8 cols / 100 tiles graph shrinks ~30 MB -> 0.3 MB; localized lines ~32 MB -> 1.1 MB. Dask + cupy parity verified. --- .claude/sweep-performance-state.csv | 2 +- xrspatial/rasterize.py | 59 ++++- .../test_rasterize_tile_props_slice_2020.py | 236 ++++++++++++++++++ 3 files changed, 290 insertions(+), 7 deletions(-) create mode 100644 xrspatial/tests/test_rasterize_tile_props_slice_2020.py diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index 7a97ede0e..848689345 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -32,7 +32,7 @@ perlin,2026-03-31T18:00:00Z,WILL OOM,memory-bound,0,, polygon_clip,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1207,Re-audit 2026-04-16: fix verified SAFE. Mask stays lazy via rasterize chunks kwarg; per-chunk peak bounded. polygonize,2026-04-16T12:00:00Z,RISKY,compute-bound,0,,Re-audit 2026-04-16 after PR 1190 NaN fix + 1176 simplification. No HIGH. MEDIUM: sequential per-chunk dask.compute loop at L1528 serializes work; _polygonize_cupy full GPU->CPU transfer at L665; per-value bin_mask alloc in _calculate_regions_cupy. proximity,2026-03-31T18:00:00Z,WILL OOM,memory-bound,3,1111,Memory guard added to line-sweep path. KDTree path (EUCLIDEAN/MANHATTAN + scipy) already had guards. GREAT_CIRCLE unbounded path already guarded. -rasterize,2026-04-15T12:00:00Z,SAFE,graph-bound,0,false-positive,Downgraded. Tile-by-tile graph construction with per-tile geometry filtering is the correct pattern. Pre-filtering ensures each delayed task gets only its relevant subset. O(n_tiles) bbox check at graph-build time is vectorized and fast. +rasterize,2026-05-17,SAFE,graph-bound,0,2020,"Pass 2 (2026-05-17): re-audit identified MEDIUM Cat-2/Cat-3 graph-bound waste in _run_dask_numpy/_run_dask_cupy -- full line_props/point_props embedded in every delayed tile task (polygon path already filtered via poly_props[pmask]). Filed #2020 and fixed: added _slice_props_for_tile helper to remap geom_idx and slice props per tile (mirrors polygon path). Measured 5000 points x 8 cols / 100 tiles graph shrank from ~30 MB to <0.3 MB (37x); localized lines from ~32 MB to ~1.1 MB. 9 new tests in test_rasterize_tile_props_slice_2020.py (helper unit tests + graph-payload bound + numpy/dask output parity for lines/points/sum-merge). All 184 existing rasterize tests pass; dask+cupy parity verified. Dask graph probe: 2560x2560 chunks=256 yields 400 tasks (4 tasks/chunk constant); 25600x25600 chunks=1024 yields 2500 tasks. cupy 512x512 returns cupy.ndarray with no host round-trip. CUDA _scanline_fill_gpu: 39 regs/thread, 24576 B local_mem/thread (matches static cuda.local.array allocations 2048*8 + 2048*4 bytes). SAFE/graph-bound verdict holds; previous 2026-04-15 false-positive on polygon filtering still valid. | Original (2026-04-15): Tile-by-tile graph construction with per-tile geometry filtering is the correct pattern. Pre-filtering ensures each delayed task gets only its relevant subset." reproject,2026-05-10,SAFE,compute-bound,1,1571,"Pass 5 (2026-05-10): 1 HIGH filed and fixed in tree -- issue #1571 + fix _merge_block_adapter same-CRS dask path. _place_same_crs in the dask adapter previously called src_data.compute() on the full source per output chunk (68x amplification measured on 256x256x2 source split into 32x32 output chunks, 8.9M pixels materialized vs 131K total source). Fix: added _place_same_crs_lazy at __init__.py:1716 that slices the source window first then computes only that slice. Verified post-fix: 1.00x ratio, 131K pixels materialized for 131K source. New regression test test_merge_dask_same_crs_bounded_materialization codifies the bound. Other audits clean: CUDA resample kernels use 16x16 blocks (cubic=46 regs, bilinear=36, nearest=22 -- well under the 64K-per-block limit, 0 local mem). _reproject_chunk_numpy/cupy already slice source first before .compute(). Dask graph at 25600x25600 src with 1024 chunks yields 4752 tasks (no per-chunk source dependency). _apply_vertical_shift uses in-place += that may not work on dask arrays -- correctness concern, not perf, defer to accuracy sweep." resample,2026-04-15T12:00:00Z,SAFE,compute-bound,0,false-positive,Downgraded. GPU-CPU-GPU round-trip only in aggregate path for non-integer scale factors. Interpolation (nearest/bilinear/cubic) stays on GPU. No GPU kernel exists for irregular per-pixel binning. sieve,2026-04-14T12:00:00Z,WILL OOM,memory-bound,0,false-positive,False positive. Memory guards already in place on both dask paths. CCL is inherently global — documented limitation. CuPy CPU fallback is deliberate and documented. diff --git a/xrspatial/rasterize.py b/xrspatial/rasterize.py index d668003b4..55b3edbd9 100644 --- a/xrspatial/rasterize.py +++ b/xrspatial/rasterize.py @@ -1578,6 +1578,33 @@ def _points_for_tile(rows, cols, geom_idx, r_start, r_end, c_start, c_end): return (rows[mask] - r_start, cols[mask] - c_start, geom_idx[mask]) +def _slice_props_for_tile(geom_idx, props): + """Slice a per-type props table to only entries referenced by *geom_idx*. + + Returns ``(local_idx, sliced_props)`` where ``local_idx`` is a remapped + int32 array of the same length as ``geom_idx`` but with values in + ``range(len(sliced_props))``, and ``sliced_props`` is the + contiguous subset of ``props`` that those local indices index into. + + This is the line / point counterpart to the polygon path's + ``poly_wkb_arr[pmask]`` / ``poly_props[pmask]`` slicing. Without it, + every dask tile task would embed the full ``line_props`` / + ``point_props`` table in its closure, even when the tile sees only + a handful of segments. At a few thousand geometries this is a + multi-MB scheduler overhead per task; at 1 M geometries x 100 tiles + the difference is ~800 MB of redundant graph state. + + If ``geom_idx`` is empty, returns an empty ``(0,)`` int32 array and + a zero-row slice of ``props`` (preserving column count). + """ + if len(geom_idx) == 0: + return geom_idx, props[:0] + # np.unique returns sorted unique values and an inverse map that + # remaps geom_idx in-place to local 0..len(unique)-1 indices. + unique_idx, local_idx = np.unique(geom_idx, return_inverse=True) + return local_idx.astype(np.int32), props[unique_idx] + + def _polys_to_wkb(geoms): """Pre-serialize polygon geometries to WKB for cheap pickling.""" return [g.wkb for g in geoms] @@ -1698,10 +1725,22 @@ def _run_dask_numpy(geometries, props_array, bounds, height, width, fill, tp = _points_for_tile(pt_rows, pt_cols, pt_geom_idx, r_start, r_end, c_start, c_end) + # Slice line_props / point_props to only the geometries this + # tile actually references, remapping the geom_idx arrays to + # local indices. Mirrors the polygon path's poly_props[pmask] + # slicing. Skipped for empty filters (the helper returns the + # already-empty inputs unchanged). + ts_local_idx, tile_line_props = _slice_props_for_tile( + ts[4], line_props) + ts = (*ts[:4], ts_local_idx) + tp_local_idx, tile_point_props = _slice_props_for_tile( + tp[2], point_props) + tp = (*tp[:2], tp_local_idx) + delayed_tile = dask.delayed(_rasterize_tile_numpy)( tile_wkb, tile_poly_props, tile_bounds, tile_h, tile_w, fill, dtype, all_touched, merge_fn, - *ts, line_props, *tp, point_props) + *ts, tile_line_props, *tp, tile_point_props) blocks[i][j] = da.from_delayed( delayed_tile, shape=(tile_h, tile_w), dtype=dtype) ri += 1 @@ -1818,10 +1857,11 @@ def _run_dask_cupy(geometries, props_array, bounds, height, width, fill, # Pre-compute segment bboxes once (avoids redundant min/max per tile) seg_bboxes = _segment_bboxes(seg_r0, seg_c0, seg_r1, seg_c1) - # Transfer shared props tables to GPU once (avoids repeated PCIe - # transfers in each tile worker). - d_line_props = cupy.asarray(line_props) - d_point_props = cupy.asarray(point_props) + # Per-tile props slicing keeps PCIe traffic and graph payload small: + # each tile only references a subset of geometries, so we slice + # line_props / point_props per tile (as np arrays in the graph) and + # transfer only that subset to the GPU inside the worker. Mirrors + # the polygon path's poly_props[pmask] pattern. tiles = _tile_grid(bounds, height, width, row_chunks, col_chunks) n_row_chunks = len(row_chunks) @@ -1850,10 +1890,17 @@ def _run_dask_cupy(geometries, props_array, bounds, height, width, fill, tp = _points_for_tile(pt_rows, pt_cols, pt_geom_idx, r_start, r_end, c_start, c_end) + ts_local_idx, tile_line_props = _slice_props_for_tile( + ts[4], line_props) + ts = (*ts[:4], ts_local_idx) + tp_local_idx, tile_point_props = _slice_props_for_tile( + tp[2], point_props) + tp = (*tp[:2], tp_local_idx) + delayed_tile = dask.delayed(_rasterize_tile_cupy)( tile_wkb, tile_poly_props, tile_bounds, tile_h, tile_w, fill, dtype, all_touched, merge_fn, - *ts, d_line_props, *tp, d_point_props) + *ts, tile_line_props, *tp, tile_point_props) blocks[i][j] = da.from_delayed( delayed_tile, shape=(tile_h, tile_w), dtype=dtype, meta=cupy.empty(())) diff --git a/xrspatial/tests/test_rasterize_tile_props_slice_2020.py b/xrspatial/tests/test_rasterize_tile_props_slice_2020.py new file mode 100644 index 000000000..42aefc8d8 --- /dev/null +++ b/xrspatial/tests/test_rasterize_tile_props_slice_2020.py @@ -0,0 +1,236 @@ +"""Per-tile props slicing in dask backends (issue #2020). + +Locks in that ``_run_dask_numpy`` and ``_run_dask_cupy`` no longer embed +the full ``line_props`` / ``point_props`` table into every delayed tile +task. Only the geometries referenced by each tile end up in that tile's +closure, mirroring the polygon path's ``poly_props[pmask]`` slicing. +""" +from __future__ import annotations + +import pickle + +import numpy as np +import pytest + +try: + from shapely.geometry import LineString, Point, box # noqa: F401 + has_shapely = True +except ImportError: + has_shapely = False + +try: + import geopandas as gpd + has_geopandas = True +except ImportError: + has_geopandas = False + +try: + import dask.array as da # noqa: F401 + has_dask = True +except ImportError: + has_dask = False + +if has_shapely: + from xrspatial.rasterize import ( + _slice_props_for_tile, + rasterize, + ) + +pytestmark = [ + pytest.mark.skipif(not has_shapely, reason="shapely not installed"), + pytest.mark.skipif(not has_dask, reason="dask not installed"), + pytest.mark.skipif(not has_geopandas, reason="geopandas not installed"), +] + + +# --------------------------------------------------------------------------- +# _slice_props_for_tile helper +# --------------------------------------------------------------------------- + +class TestSlicePropsForTile: + def test_empty_geom_idx_returns_empty_props(self): + props = np.arange(50, dtype=np.float64).reshape(10, 5) + empty_idx = np.empty(0, dtype=np.int32) + local_idx, sliced = _slice_props_for_tile(empty_idx, props) + assert len(local_idx) == 0 + assert sliced.shape == (0, 5) + assert sliced.dtype == props.dtype + + def test_remap_round_trips_props(self): + """``props[geom_idx]`` and ``sliced[local_idx]`` agree row-wise.""" + props = np.arange(20, dtype=np.float64).reshape(10, 2) + geom_idx = np.array([7, 2, 7, 9, 2, 3], dtype=np.int32) + local_idx, sliced = _slice_props_for_tile(geom_idx, props) + assert local_idx.dtype == np.int32 + # local_idx values must index into sliced + assert local_idx.max() < len(sliced) + np.testing.assert_array_equal(sliced[local_idx], props[geom_idx]) + + def test_single_geom_compacts_to_one_row(self): + props = np.arange(50, dtype=np.float64).reshape(10, 5) + geom_idx = np.array([4, 4, 4], dtype=np.int32) + local_idx, sliced = _slice_props_for_tile(geom_idx, props) + assert sliced.shape == (1, 5) + assert np.all(local_idx == 0) + np.testing.assert_array_equal(sliced[0], props[4]) + + def test_preserves_column_count_on_empty(self): + """Empty input must keep the column dimension so downstream + indexing into ``[:, j]`` does not raise.""" + props = np.empty((0, 3), dtype=np.float64) + empty_idx = np.empty(0, dtype=np.int32) + _, sliced = _slice_props_for_tile(empty_idx, props) + assert sliced.shape == (0, 3) + + +# --------------------------------------------------------------------------- +# Graph-size regression: localized points should not embed full point_props +# --------------------------------------------------------------------------- + +class TestDaskGraphPayloadBounded: + """Per-tile delayed args must not embed the full per-type props table.""" + + def test_point_graph_scales_with_per_tile_points_not_total(self): + rng = np.random.default_rng(0) + n_points = 5_000 + # Multi-column props amplify any per-task embedding waste + n_cols = 8 + xs = rng.uniform(-180, 180, n_points) + ys = rng.uniform(-90, 90, n_points) + data = {f"c{j}": rng.random(n_points) for j in range(n_cols)} + data["geometry"] = [Point(x, y) for x, y in zip(xs, ys)] + gdf = gpd.GeoDataFrame(data, crs="EPSG:4326") + + result = rasterize( + gdf, + width=2560, + height=2560, + bounds=(-180, -90, 180, 90), + chunks=(256, 256), + columns=[f"c{j}" for j in range(n_cols)], + ) + graph = result.data.__dask_graph__() + graph_bytes = len(pickle.dumps(dict(graph))) + + # Without the fix: 100 tiles * 5000 points * 8 cols * 8 bytes + # = 32 MB just for point_props duplicates, on top of segment data. + # With the fix: each tile sees ~50 points so each task embeds + # ~3 KB of point_props; total stays well under 4 MB end-to-end. + assert graph_bytes < 4_000_000, ( + f"graph pickled to {graph_bytes:,} bytes -- the per-tile " + f"point_props slice may have regressed and the full table is " + f"being embedded into every delayed task." + ) + + def test_localized_line_graph_scales_with_per_tile_lines(self): + rng = np.random.default_rng(1) + n_lines = 5_000 + n_cols = 8 + geoms = [] + for _ in range(n_lines): + # Each line spans a single ~1-degree cell, so most tiles + # see only a handful of segments. + cx = rng.uniform(-170, 170) + cy = rng.uniform(-80, 80) + geoms.append( + LineString([(cx, cy), + (cx + rng.uniform(-1, 1), + cy + rng.uniform(-1, 1))]) + ) + data = {f"c{j}": rng.random(n_lines) for j in range(n_cols)} + data["geometry"] = geoms + gdf = gpd.GeoDataFrame(data, crs="EPSG:4326") + + result = rasterize( + gdf, + width=2560, + height=2560, + bounds=(-180, -90, 180, 90), + chunks=(256, 256), + columns=[f"c{j}" for j in range(n_cols)], + ) + graph = result.data.__dask_graph__() + graph_bytes = len(pickle.dumps(dict(graph))) + + # Without the fix: ~32 MB. With the fix: ~1-2 MB for localized + # lines. Bound the regression check at 5 MB to leave headroom + # for pickle overhead. + assert graph_bytes < 5_000_000, ( + f"graph pickled to {graph_bytes:,} bytes -- localized lines " + f"should not embed the full line_props in every tile." + ) + + +# --------------------------------------------------------------------------- +# Correctness: filtered props slice produces identical pixels to baseline +# --------------------------------------------------------------------------- + +class TestDaskBackendOutputUnchanged: + """Slicing per tile must not change a single rasterized pixel.""" + + def _make_gdf(self, seed=42, n_lines=30, n_points=30): + rng = np.random.default_rng(seed) + geoms = [] + for _ in range(n_lines): + cx = rng.uniform(-150, 150) + cy = rng.uniform(-70, 70) + geoms.append( + LineString([(cx, cy), + (cx + rng.uniform(-5, 5), + cy + rng.uniform(-5, 5))]) + ) + for _ in range(n_points): + geoms.append( + Point(rng.uniform(-150, 150), rng.uniform(-70, 70)) + ) + vals = list(range(1, len(geoms) + 1)) + return gpd.GeoDataFrame( + {"value": vals, "geometry": geoms}, crs="EPSG:4326" + ) + + def test_numpy_vs_dask_lines_and_points(self): + gdf = self._make_gdf() + np_res = rasterize( + gdf, column="value", + width=512, height=512, bounds=(-180, -90, 180, 90), + fill=0.0, + ) + dk_res = rasterize( + gdf, column="value", + width=512, height=512, bounds=(-180, -90, 180, 90), + fill=0.0, chunks=(64, 64), + ) + np.testing.assert_array_equal(np_res.values, dk_res.values) + + def test_numpy_vs_dask_sum_merge(self): + """Slicing must preserve overlap-sensitive merges (sum).""" + gdf = self._make_gdf(seed=7, n_lines=80, n_points=80) + np_res = rasterize( + gdf, column="value", + width=256, height=256, bounds=(-180, -90, 180, 90), + fill=0.0, merge="sum", + ) + dk_res = rasterize( + gdf, column="value", + width=256, height=256, bounds=(-180, -90, 180, 90), + fill=0.0, merge="sum", chunks=(32, 32), + ) + np.testing.assert_array_equal(np_res.values, dk_res.values) + + def test_numpy_vs_dask_no_lines_no_points(self): + """Polygon-only input must still produce identical results.""" + gdf = gpd.GeoDataFrame( + {"value": [1.0, 2.0]}, + geometry=[box(0, 0, 4, 4), box(6, 6, 10, 10)], + crs="EPSG:4326", + ) + np_res = rasterize( + gdf, column="value", + width=64, height=64, bounds=(0, 0, 10, 10), fill=0.0, + ) + dk_res = rasterize( + gdf, column="value", + width=64, height=64, bounds=(0, 0, 10, 10), + fill=0.0, chunks=(16, 16), + ) + np.testing.assert_array_equal(np_res.values, dk_res.values) From 218417bc80ecacae2fce30ea925c9d4c8f4a45f9 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 06:57:09 -0700 Subject: [PATCH 2/2] rasterize: address review suggestions on per-tile slicing Add an explicit straddling-line test that asserts a line crossing a tile seam renders on both sides. Fix the np.unique comment (inverse map is a new array, not in-place), normalise the empty-branch local_idx dtype to int32, and note in the cupy comment that for sprawling-line workloads the cost shifts from graph payload to per-tile PCIe uploads. --- xrspatial/rasterize.py | 10 ++++-- .../test_rasterize_tile_props_slice_2020.py | 33 +++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/xrspatial/rasterize.py b/xrspatial/rasterize.py index 55b3edbd9..a3fae3f27 100644 --- a/xrspatial/rasterize.py +++ b/xrspatial/rasterize.py @@ -1598,9 +1598,9 @@ def _slice_props_for_tile(geom_idx, props): a zero-row slice of ``props`` (preserving column count). """ if len(geom_idx) == 0: - return geom_idx, props[:0] + return geom_idx.astype(np.int32, copy=False), props[:0] # np.unique returns sorted unique values and an inverse map that - # remaps geom_idx in-place to local 0..len(unique)-1 indices. + # points each entry in geom_idx to its position in unique_idx. unique_idx, local_idx = np.unique(geom_idx, return_inverse=True) return local_idx.astype(np.int32), props[unique_idx] @@ -1861,7 +1861,11 @@ def _run_dask_cupy(geometries, props_array, bounds, height, width, fill, # each tile only references a subset of geometries, so we slice # line_props / point_props per tile (as np arrays in the graph) and # transfer only that subset to the GPU inside the worker. Mirrors - # the polygon path's poly_props[pmask] pattern. + # the polygon path's poly_props[pmask] pattern. Tradeoff: for + # sprawling-line workloads where every tile references most rows, + # the GPU now sees N_tiles small uploads instead of one shared + # driver-side upload, shifting cost from graph payload to PCIe + # traffic. Localized geometries (the realistic case) still win. tiles = _tile_grid(bounds, height, width, row_chunks, col_chunks) n_row_chunks = len(row_chunks) diff --git a/xrspatial/tests/test_rasterize_tile_props_slice_2020.py b/xrspatial/tests/test_rasterize_tile_props_slice_2020.py index 42aefc8d8..f6d1fe600 100644 --- a/xrspatial/tests/test_rasterize_tile_props_slice_2020.py +++ b/xrspatial/tests/test_rasterize_tile_props_slice_2020.py @@ -234,3 +234,36 @@ def test_numpy_vs_dask_no_lines_no_points(self): fill=0.0, chunks=(16, 16), ) np.testing.assert_array_equal(np_res.values, dk_res.values) + + def test_line_straddling_tile_boundary_renders_contiguously(self): + """A line whose pixel bbox overlaps two tiles must be present in + both tile closures. If the props slice ever drops the geometry + from one of the two tiles, the dask output will diverge from + numpy along the seam (gap pixels) or carry the fill value where + the line should be.""" + # Single horizontal line crossing the vertical seam at x=0 on a + # bounds=(-10,-10,10,10) raster with width=80, chunks=(40,40). + # Tile seam in pixel space sits at column 40; the line spans + # x=-5..5 (pixel columns ~20..60), straddling the seam. + gdf = gpd.GeoDataFrame( + {"value": [7.0]}, + geometry=[LineString([(-5.0, 0.0), (5.0, 0.0)])], + crs="EPSG:4326", + ) + np_res = rasterize( + gdf, column="value", + width=80, height=80, bounds=(-10, -10, 10, 10), fill=0.0, + ) + dk_res = rasterize( + gdf, column="value", + width=80, height=80, bounds=(-10, -10, 10, 10), + fill=0.0, chunks=(40, 40), + ) + # Whole-array equality: any dropped pixel along the seam fails. + np.testing.assert_array_equal(np_res.values, dk_res.values) + # Structural assertion: the value must appear on both sides of + # the column-40 seam so a future bug that only renders one half + # is caught even if the numpy reference also regressed. + burned = (dk_res.values == 7.0) + assert burned[:, :40].any(), "line missing from left tile" + assert burned[:, 40:].any(), "line missing from right tile"