|
| 1 | +"""Per-tile props slicing in dask backends (issue #2020). |
| 2 | +
|
| 3 | +Locks in that ``_run_dask_numpy`` and ``_run_dask_cupy`` no longer embed |
| 4 | +the full ``line_props`` / ``point_props`` table into every delayed tile |
| 5 | +task. Only the geometries referenced by each tile end up in that tile's |
| 6 | +closure, mirroring the polygon path's ``poly_props[pmask]`` slicing. |
| 7 | +""" |
| 8 | +from __future__ import annotations |
| 9 | + |
| 10 | +import pickle |
| 11 | + |
| 12 | +import numpy as np |
| 13 | +import pytest |
| 14 | + |
| 15 | +try: |
| 16 | + from shapely.geometry import LineString, Point, box # noqa: F401 |
| 17 | + has_shapely = True |
| 18 | +except ImportError: |
| 19 | + has_shapely = False |
| 20 | + |
| 21 | +try: |
| 22 | + import geopandas as gpd |
| 23 | + has_geopandas = True |
| 24 | +except ImportError: |
| 25 | + has_geopandas = False |
| 26 | + |
| 27 | +try: |
| 28 | + import dask.array as da # noqa: F401 |
| 29 | + has_dask = True |
| 30 | +except ImportError: |
| 31 | + has_dask = False |
| 32 | + |
| 33 | +if has_shapely: |
| 34 | + from xrspatial.rasterize import ( |
| 35 | + _slice_props_for_tile, |
| 36 | + rasterize, |
| 37 | + ) |
| 38 | + |
# Module-level skip markers: the tests below exercise shapely geometries,
# geopandas frames and the chunked (dask) rasterize path, so the module is
# skipped whenever any of those optional packages failed to import above.
pytestmark = [
    pytest.mark.skipif(not installed, reason=f"{package} not installed")
    for package, installed in (
        ("shapely", has_shapely),
        ("dask", has_dask),
        ("geopandas", has_geopandas),
    )
]
| 44 | + |
| 45 | + |
| 46 | +# --------------------------------------------------------------------------- |
| 47 | +# _slice_props_for_tile helper |
| 48 | +# --------------------------------------------------------------------------- |
| 49 | + |
class TestSlicePropsForTile:
    """Unit checks for ``_slice_props_for_tile(geom_idx, props)``.

    Each test calls the helper with an index array plus a 2-D props table
    and inspects the ``(local_idx, sliced)`` pair it returns.
    """

    @staticmethod
    def _table(n_rows, n_cols):
        """Return an ``(n_rows, n_cols)`` float64 table filled with 0..N-1."""
        return np.arange(n_rows * n_cols, dtype=np.float64).reshape(
            n_rows, n_cols
        )

    @staticmethod
    def _no_refs():
        """Return an empty int32 index array (a tile touching no geometry)."""
        return np.empty(0, dtype=np.int32)

    def test_empty_geom_idx_returns_empty_props(self):
        """An empty index yields zero rows with columns and dtype intact."""
        table = self._table(10, 5)

        remapped, compact = _slice_props_for_tile(self._no_refs(), table)

        assert len(remapped) == 0
        assert compact.shape == (0, 5)
        assert compact.dtype == table.dtype

    def test_remap_round_trips_props(self):
        """``props[geom_idx]`` and ``sliced[local_idx]`` agree row-wise."""
        table = self._table(10, 2)
        # Repeated and unsorted references on purpose.
        refs = np.array([7, 2, 7, 9, 2, 3], dtype=np.int32)

        remapped, compact = _slice_props_for_tile(refs, table)

        assert remapped.dtype == np.int32
        # Every remapped index has to be a valid row of the compact table.
        assert remapped.max() < len(compact)
        np.testing.assert_array_equal(compact[remapped], table[refs])

    def test_single_geom_compacts_to_one_row(self):
        """Three references to the same geometry collapse to a single row."""
        table = self._table(10, 5)
        refs = np.array([4, 4, 4], dtype=np.int32)

        remapped, compact = _slice_props_for_tile(refs, table)

        assert compact.shape == (1, 5)
        assert (remapped == 0).all()
        np.testing.assert_array_equal(compact[0], table[4])

    def test_preserves_column_count_on_empty(self):
        """Empty input must keep the column dimension so downstream
        indexing into ``[:, j]`` does not raise."""
        zero_row_table = np.empty((0, 3), dtype=np.float64)

        _, compact = _slice_props_for_tile(self._no_refs(), zero_row_table)

        assert compact.shape == (0, 3)
| 84 | + |
| 85 | + |
| 86 | +# --------------------------------------------------------------------------- |
| 87 | +# Graph-size regression: localized points should not embed full point_props |
| 88 | +# --------------------------------------------------------------------------- |
| 89 | + |
class TestDaskGraphPayloadBounded:
    """Per-tile delayed args must not embed the full per-type props table."""

    @staticmethod
    def _pickled_graph_size(gdf, n_cols):
        """Rasterize *gdf* with 256x256 chunks and return the pickled size,
        in bytes, of the resulting dask graph.

        The raster is 2560x2560 over the whole globe, i.e. a 10x10 grid of
        tiles, and burns the ``c0`` .. ``c{n_cols - 1}`` columns.
        """
        raster = rasterize(
            gdf,
            width=2560,
            height=2560,
            bounds=(-180, -90, 180, 90),
            chunks=(256, 256),
            columns=[f"c{j}" for j in range(n_cols)],
        )
        task_graph = raster.data.__dask_graph__()
        return len(pickle.dumps(dict(task_graph)))

    def test_point_graph_scales_with_per_tile_points_not_total(self):
        """Uniformly scattered points keep the pickled graph under 4 MB."""
        n_points = 5_000
        # Multi-column props amplify any per-task embedding waste
        n_cols = 8
        rng = np.random.default_rng(0)
        lons = rng.uniform(-180, 180, n_points)
        lats = rng.uniform(-90, 90, n_points)
        frame = {}
        for j in range(n_cols):
            frame[f"c{j}"] = rng.random(n_points)
        frame["geometry"] = list(map(Point, lons, lats))
        gdf = gpd.GeoDataFrame(frame, crs="EPSG:4326")

        payload = self._pickled_graph_size(gdf, n_cols)

        # Without the fix: 100 tiles * 5000 points * 8 cols * 8 bytes
        # = 32 MB just for point_props duplicates, on top of segment data.
        # With the fix: each tile sees ~50 points so each task embeds
        # ~3 KB of point_props; total stays well under 4 MB end-to-end.
        assert payload < 4_000_000, (
            f"graph pickled to {payload:,} bytes -- the per-tile "
            "point_props slice may have regressed and the full table is "
            "being embedded into every delayed task."
        )

    def test_localized_line_graph_scales_with_per_tile_lines(self):
        """Short, localized lines keep the pickled graph under 5 MB."""
        n_lines = 5_000
        n_cols = 8
        rng = np.random.default_rng(1)
        segments = []
        for _ in range(n_lines):
            # Each line spans a single ~1-degree cell, so most tiles
            # see only a handful of segments. The four draws below are
            # taken in a fixed order so the seeded geometry is stable.
            start_x = rng.uniform(-170, 170)
            start_y = rng.uniform(-80, 80)
            end_x = start_x + rng.uniform(-1, 1)
            end_y = start_y + rng.uniform(-1, 1)
            segments.append(
                LineString([(start_x, start_y), (end_x, end_y)])
            )
        frame = {}
        for j in range(n_cols):
            frame[f"c{j}"] = rng.random(n_lines)
        frame["geometry"] = segments
        gdf = gpd.GeoDataFrame(frame, crs="EPSG:4326")

        payload = self._pickled_graph_size(gdf, n_cols)

        # Without the fix: ~32 MB. With the fix: ~1-2 MB for localized
        # lines. Bound the regression check at 5 MB to leave headroom
        # for pickle overhead.
        assert payload < 5_000_000, (
            f"graph pickled to {payload:,} bytes -- localized lines "
            "should not embed the full line_props in every tile."
        )
| 162 | + |
| 163 | + |
| 164 | +# --------------------------------------------------------------------------- |
| 165 | +# Correctness: filtered props slice produces identical pixels to baseline |
| 166 | +# --------------------------------------------------------------------------- |
| 167 | + |
class TestDaskBackendOutputUnchanged:
    """Slicing per tile must not change a single rasterized pixel."""

    def _make_gdf(self, seed=42, n_lines=30, n_points=30):
        """Build a seeded frame of short lines followed by points.

        Each geometry gets a distinct ``value`` equal to its 1-based
        position, lines first and points after.
        """
        rng = np.random.default_rng(seed)
        shapes = []
        for _ in range(n_lines):
            # Draw order (x, y, dx, dy) is fixed so the seed reproduces
            # the same geometry on every run.
            x0 = rng.uniform(-150, 150)
            y0 = rng.uniform(-70, 70)
            x1 = x0 + rng.uniform(-5, 5)
            y1 = y0 + rng.uniform(-5, 5)
            shapes.append(LineString([(x0, y0), (x1, y1)]))
        for _ in range(n_points):
            px = rng.uniform(-150, 150)
            py = rng.uniform(-70, 70)
            shapes.append(Point(px, py))
        burn_values = list(range(1, len(shapes) + 1))
        return gpd.GeoDataFrame(
            {"value": burn_values, "geometry": shapes}, crs="EPSG:4326"
        )

    @staticmethod
    def _rasterize_both(gdf, *, size, bounds, chunks, **options):
        """Rasterize *gdf* twice on a ``size`` x ``size`` grid: first without
        ``chunks`` (the reference result in these tests), then with
        ``chunks``. Returns ``(reference, chunked)``.
        """
        shared = dict(
            column="value",
            width=size,
            height=size,
            bounds=bounds,
            fill=0.0,
            **options,
        )
        reference = rasterize(gdf, **shared)
        chunked = rasterize(gdf, chunks=chunks, **shared)
        return reference, chunked

    def test_numpy_vs_dask_lines_and_points(self):
        """Mixed lines and points rasterize identically with 64x64 tiles."""
        reference, chunked = self._rasterize_both(
            self._make_gdf(),
            size=512,
            bounds=(-180, -90, 180, 90),
            chunks=(64, 64),
        )
        np.testing.assert_array_equal(reference.values, chunked.values)

    def test_numpy_vs_dask_sum_merge(self):
        """Slicing must preserve overlap-sensitive merges (sum)."""
        reference, chunked = self._rasterize_both(
            self._make_gdf(seed=7, n_lines=80, n_points=80),
            size=256,
            bounds=(-180, -90, 180, 90),
            chunks=(32, 32),
            merge="sum",
        )
        np.testing.assert_array_equal(reference.values, chunked.values)

    def test_numpy_vs_dask_no_lines_no_points(self):
        """Polygon-only input must still produce identical results."""
        squares = [box(0, 0, 4, 4), box(6, 6, 10, 10)]
        gdf = gpd.GeoDataFrame(
            {"value": [1.0, 2.0]},
            geometry=squares,
            crs="EPSG:4326",
        )
        reference, chunked = self._rasterize_both(
            gdf,
            size=64,
            bounds=(0, 0, 10, 10),
            chunks=(16, 16),
        )
        np.testing.assert_array_equal(reference.values, chunked.values)

    def test_line_straddling_tile_boundary_renders_contiguously(self):
        """A line whose pixel bbox overlaps two tiles must be present in
        both tile closures. If the props slice ever drops the geometry
        from one of the two tiles, the dask output will diverge from
        numpy along the seam (gap pixels) or carry the fill value where
        the line should be."""
        # Single horizontal line crossing the vertical seam at x=0 on a
        # bounds=(-10,-10,10,10) raster with width=80, chunks=(40,40).
        # Tile seam in pixel space sits at column 40; the line spans
        # x=-5..5 (pixel columns ~20..60), straddling the seam.
        seam_col = 40
        line_value = 7.0
        gdf = gpd.GeoDataFrame(
            {"value": [line_value]},
            geometry=[LineString([(-5.0, 0.0), (5.0, 0.0)])],
            crs="EPSG:4326",
        )
        reference, chunked = self._rasterize_both(
            gdf,
            size=80,
            bounds=(-10, -10, 10, 10),
            chunks=(seam_col, seam_col),
        )
        # Whole-array equality: any dropped pixel along the seam fails.
        np.testing.assert_array_equal(reference.values, chunked.values)
        # Structural assertion: the value must appear on both sides of
        # the column-40 seam so a future bug that only renders one half
        # is caught even if the numpy reference also regressed.
        burned = chunked.values == line_value
        assert burned[:, :seam_col].any(), "line missing from left tile"
        assert burned[:, seam_col:].any(), "line missing from right tile"
0 commit comments