|
28 | 28 |
|
29 | 29 | from __future__ import annotations |
30 | 30 |
|
| 31 | +import math as _math |
31 | 32 | from functools import partial |
32 | 33 | from math import sqrt |
33 | 34 |
|
|
39 | 40 | except ImportError: |
40 | 41 | da = None |
41 | 42 |
|
| 43 | +from numba import cuda |
| 44 | + |
| 45 | +try: |
| 46 | + import cupy |
| 47 | +except ImportError: |
| 48 | + class cupy: # type: ignore[no-redef] |
| 49 | + ndarray = False |
| 50 | + |
42 | 51 | from xrspatial.utils import ( |
43 | | - get_dataarray_resolution, ngjit, |
| 52 | + cuda_args, get_dataarray_resolution, ngjit, |
44 | 53 | has_cuda_and_cupy, is_cupy_array, is_dask_cupy, |
45 | 54 | ) |
46 | 55 | from xrspatial.dataset_support import supports_dataset |
@@ -236,6 +245,191 @@ def _cost_distance_numpy(source_data, friction_data, cellsize_x, cellsize_y, |
236 | 245 | ) |
237 | 246 |
|
238 | 247 |
|
| 248 | +# --------------------------------------------------------------------------- |
| 249 | +# CuPy GPU backend — iterative parallel relaxation (parallel Bellman-Ford) |
| 250 | +# --------------------------------------------------------------------------- |
| 251 | + |
@cuda.jit
def _cost_distance_relax_kernel(friction, dist, changed,
                                height, width,
                                dy, dx, dd, n_neighbors,
                                max_cost):
    """One relaxation pass: each pixel checks all neighbours for shorter paths.

    Iterate until *changed* stays 0. Convergence is guaranteed because
    dist values only decrease and the graph has finite edge weights.

    Parameters
    ----------
    friction : 2D device array
        Per-pixel traversal cost; non-finite or <= 0 marks impassable cells.
    dist : 2D device array
        Accumulated cost so far; updated in place.  Concurrent reads of a
        neighbour being written by another thread are benign: a stale (larger)
        value is at worst re-relaxed on a later pass.
    changed : 1-element int device array
        Set to 1 by any thread that improved its pixel (racy writes of the
        same value are harmless); the host loops while it is non-zero.
    height, width : int
        Raster dimensions.
    dy, dx, dd : device arrays of length n_neighbors
        Neighbour row/col offsets and the geometric step distance of each.
    max_cost : float
        Cost budget; improvements above it are discarded so over-budget
        paths never propagate.
    """
    iy, ix = cuda.grid(2)
    # Threads launched beyond the raster edge do nothing.
    if iy >= height or ix >= width:
        return

    f_u = friction[iy, ix]
    # Impassable pixel: it can never be reached, so skip entirely.
    if not (_math.isfinite(f_u) and f_u > 0.0):
        return

    current = dist[iy, ix]
    best = current

    for k in range(n_neighbors):
        vy = iy + dy[k]
        vx = ix + dx[k]
        if vy < 0 or vy >= height or vx < 0 or vx >= width:
            continue

        d_v = dist[vy, vx]
        # A neighbour at least as costly as our best cannot improve us.
        if d_v >= best:
            continue

        f_v = friction[vy, vx]
        if not (_math.isfinite(f_v) and f_v > 0.0):
            continue

        # Edge weight: step distance times the mean friction of both ends.
        edge_cost = dd[k] * (f_u + f_v) * 0.5
        new_cost = d_v + edge_cost

        if new_cost < best:
            best = new_cost

    # Commit only genuine, within-budget improvements and flag the pass.
    if best < current and best <= max_cost:
        dist[iy, ix] = best
        changed[0] = 1
| 296 | + |
| 297 | + |
def _cost_distance_cupy(source_data, friction_data, cellsize_x, cellsize_y,
                        max_cost, target_values, dy, dx, dd):
    """GPU cost-distance via iterative parallel relaxation.

    Each CUDA thread processes one pixel per pass, checking all neighbours
    for a cheaper path (parallel Bellman-Ford); passes repeat until no
    pixel improves.

    Parameters
    ----------
    source_data, friction_data : cupy.ndarray
        2D rasters of equal shape.  Friction cells that are non-finite or
        <= 0 are impassable.
    cellsize_x, cellsize_y : float
        Kept for signature symmetry with the other backends; the step
        distances in ``dd`` are used directly here.
    max_cost : float
        Cost budget; cells whose accumulated cost exceeds it become NaN.
    target_values : sequence
        Source values to seed from; empty means any finite non-zero pixel.
    dy, dx, dd : sequences
        Neighbour row/col offsets and their geometric step distances.

    Returns
    -------
    cupy.ndarray
        float32 raster of accumulated costs, NaN where unreachable or
        over budget.
    """
    import cupy as cp

    height, width = source_data.shape
    src = source_data.astype(cp.float64) if source_data.dtype != cp.float64 \
        else source_data
    fric = friction_data.astype(cp.float64) if friction_data.dtype != cp.float64 \
        else friction_data

    # Initialize all distances to inf
    dist = cp.full((height, width), cp.inf, dtype=cp.float64)

    # Find source pixels
    if len(target_values) == 0:
        source_mask = cp.isfinite(src) & (src != 0)
    else:
        source_mask = cp.isin(src, cp.asarray(target_values, dtype=cp.float64))
        source_mask &= cp.isfinite(src)

    # Only seed sources on passable terrain
    passable = cp.isfinite(fric) & (fric > 0)
    source_mask &= passable
    dist[source_mask] = 0.0

    # No seed pixels: everything is unreachable.
    if not cp.any(source_mask):
        return cp.full((height, width), cp.nan, dtype=cp.float32)

    # Transfer neighbor offsets to device
    dy_d = cp.asarray(dy, dtype=cp.int64)
    dx_d = cp.asarray(dx, dtype=cp.int64)
    dd_d = cp.asarray(dd, dtype=cp.float64)
    n_neighbors = len(dy)

    changed = cp.zeros(1, dtype=cp.int32)
    griddim, blockdim = cuda_args((height, width))

    # Weighted shortest paths can wind (e.g. spiral around a high-friction
    # wall), so the hop count of an optimal path is bounded only by the
    # number of cells, not by the grid diameter (height + width would
    # terminate too early on such inputs).  Use the safe upper bound; the
    # early exit below keeps the typical case at O(diameter) passes.
    max_iterations = height * width
    mc = np.float64(max_cost)  # hoisted: constant across passes
    for _ in range(max_iterations):
        changed[0] = 0
        _cost_distance_relax_kernel[griddim, blockdim](
            fric, dist, changed,
            height, width,
            dy_d, dx_d, dd_d, n_neighbors,
            mc,
        )
        # Device->host sync once per pass; stop as soon as a pass is quiet.
        if int(changed[0]) == 0:
            break

    # Convert: inf or over-budget -> NaN, else float32
    out = cp.where(
        cp.isinf(dist) | (dist > max_cost), cp.nan, dist,
    ).astype(cp.float32)
    return out
| 359 | + |
| 360 | + |
def _cost_distance_dask_cupy(source_da, friction_da,
                             cellsize_x, cellsize_y, max_cost,
                             target_values, dy, dx, dd):
    """Dask+CuPy cost distance.

    Bounded ``max_cost``: run the per-chunk GPU relaxation through
    ``da.map_overlap`` with a halo wide enough that no within-budget path
    can leave it.  Unbounded budget, or halo too wide for the chunking:
    spill to dask+numpy and use the CPU iterative path (Dijkstra is
    O(N log N) vs parallel relaxation's O(N * path length), so CPU is
    more efficient for global shortest paths), then move the result back
    onto the GPU.

    Parameters mirror :func:`_cost_distance_cupy`, with ``source_da`` and
    ``friction_da`` as 2D dask arrays backed by cupy chunks.
    """
    import cupy as cp

    # A finite budget bounds travel: every step costs at least
    # f_min * cellsize, so no path exceeding the budget can cross more than
    # max_cost / (f_min * cellsize) pixels beyond its chunk.
    use_map_overlap = False
    if np.isfinite(max_cost):
        positive_friction = da.where(friction_da > 0, friction_da, np.inf)
        f_min = float(da.nanmin(positive_friction).compute())
        if np.isfinite(f_min) and f_min > 0:
            pad_y = int(max_cost / (f_min * cellsize_y) + 1)
            pad_x = int(max_cost / (f_min * cellsize_x) + 1)
            chunks_y, chunks_x = source_da.chunks
            # map_overlap requires the overlap depth to be smaller than
            # every chunk along that axis, so the feasibility check must
            # compare against the smallest chunk, not the largest.
            if pad_y < min(chunks_y) and pad_x < min(chunks_x):
                use_map_overlap = True

    if use_map_overlap:
        def _chunk_func(source_block, friction_block):
            # Closure over the outer call's scalar parameters.
            return _cost_distance_cupy(
                source_block, friction_block,
                cellsize_x, cellsize_y, max_cost,
                target_values, dy, dx, dd,
            )

        return da.map_overlap(
            _chunk_func,
            source_da, friction_da,
            depth=(pad_y, pad_x),
            boundary=np.nan,  # halo beyond the raster edge is impassable
            dtype=np.float32,
            meta=cp.array((), dtype=cp.float32),
        )

    # Unbounded or padding too large: spill each chunk to the CPU, run the
    # dask+numpy path, then transfer the result back to dask+cupy.
    source_np = source_da.map_blocks(
        lambda b: b.get(), dtype=source_da.dtype,
        meta=np.array((), dtype=source_da.dtype),
    )
    friction_np = friction_da.map_blocks(
        lambda b: b.get(), dtype=friction_da.dtype,
        meta=np.array((), dtype=friction_da.dtype),
    )
    result = _cost_distance_dask(
        source_np, friction_np,
        cellsize_x, cellsize_y, max_cost,
        target_values, dy, dx, dd,
    )
    return result.map_blocks(
        cp.asarray, dtype=result.dtype,
        meta=cp.array((), dtype=result.dtype),
    )
| 431 | + |
| 432 | + |
239 | 433 | # --------------------------------------------------------------------------- |
240 | 434 | # Tile kernel for iterative boundary Dijkstra |
241 | 435 | # --------------------------------------------------------------------------- |
@@ -965,28 +1159,13 @@ def cost_distance( |
965 | 1159 | chunks=source_data.chunks) |
966 | 1160 |
|
967 | 1161 | if _is_cupy_backend: |
968 | | - import cupy |
969 | | - # Transfer to CPU, run numpy kernel, transfer back |
970 | | - source_np = source_data.get() |
971 | | - friction_np = np.asarray(friction.data.get(), dtype=np.float64) |
972 | | - result_data = cupy.asarray( |
973 | | - _cost_distance_numpy( |
974 | | - source_np, friction_np, |
975 | | - cellsize_x, cellsize_y, max_cost_f, |
976 | | - target_values, dy, dx, dd, |
977 | | - ) |
| 1162 | + result_data = _cost_distance_cupy( |
| 1163 | + source_data, friction_data, |
| 1164 | + cellsize_x, cellsize_y, max_cost_f, |
| 1165 | + target_values, dy, dx, dd, |
978 | 1166 | ) |
979 | 1167 | elif _is_dask_cupy: |
980 | | - # Spill each chunk to CPU via map_overlap, then wrap back as dask |
981 | | - source_data = source_data.map_blocks( |
982 | | - lambda b: b.get(), dtype=source_data.dtype, |
983 | | - meta=np.array((), dtype=source_data.dtype), |
984 | | - ) |
985 | | - friction_data = friction_data.map_blocks( |
986 | | - lambda b: b.get(), dtype=friction_data.dtype, |
987 | | - meta=np.array((), dtype=friction_data.dtype), |
988 | | - ) |
989 | | - result_data = _cost_distance_dask( |
| 1168 | + result_data = _cost_distance_dask_cupy( |
990 | 1169 | source_data, friction_data, |
991 | 1170 | cellsize_x, cellsize_y, max_cost_f, |
992 | 1171 | target_values, dy, dx, dd, |
|
0 commit comments