Skip to content

Commit 86da4e0

Browse files
authored
Fixes #905: add GPU (CuPy) backend for cost_distance (#910)
Replace CPU fallback with native CUDA iterative parallel relaxation (parallel Bellman-Ford). Each thread processes one pixel per iteration, checking all neighbours for shorter paths. The wavefront advances at least one pixel per iteration; convergence takes O(height + width) iterations with early termination when no changes occur. CuPy path: seeds source pixels at cost 0, runs relaxation kernel until convergence, converts inf/over-budget to NaN. Dask+CuPy: bounded max_cost uses da.map_overlap with per-chunk GPU relaxation; unbounded falls back to CPU iterative tile Dijkstra (O(N log N) Dijkstra beats O(N * diameter) relaxation for global paths). Tests extended with cupy and dask+cupy backends across 8 parametrized test functions. New tests for unbounded dask+cupy and cupy chunk type.
1 parent 593e559 commit 86da4e0

File tree

3 files changed

+256
-34
lines changed

3 files changed

+256
-34
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ In the GIS world, rasters are used for representing continuous phenomena (e.g. e
202202
| Name | Description | NumPy xr.DataArray | Dask xr.DataArray | CuPy GPU xr.DataArray | Dask GPU xr.DataArray |
203203
|:----------:|:------------|:----------------------:|:--------------------:|:-------------------:|:------:|
204204
| [Allocation](xrspatial/proximity.py) | Assigns each cell to the identity of the nearest source feature | ✅️ || ✅️ | ✅️ |
205-
| [Cost Distance](xrspatial/cost_distance.py) | Computes minimum accumulated cost to the nearest source through a friction surface | ✅️ || 🔄 | 🔄 |
205+
| [Cost Distance](xrspatial/cost_distance.py) | Computes minimum accumulated cost to the nearest source through a friction surface | ✅️ || ✅️ | ✅️ |
206206
| [Direction](xrspatial/proximity.py) | Computes the direction from each cell to the nearest source feature | ✅️ || ✅️ | ✅️ |
207207
| [Proximity](xrspatial/proximity.py) | Computes the distance from each cell to the nearest source feature | ✅️ || ✅️ | ✅️ |
208208

xrspatial/cost_distance.py

Lines changed: 200 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929
from __future__ import annotations
3030

31+
import math as _math
3132
from functools import partial
3233
from math import sqrt
3334

@@ -39,8 +40,16 @@
3940
except ImportError:
4041
da = None
4142

43+
from numba import cuda
44+
45+
try:
46+
import cupy
47+
except ImportError:
48+
class cupy: # type: ignore[no-redef]
49+
ndarray = False
50+
4251
from xrspatial.utils import (
43-
get_dataarray_resolution, ngjit,
52+
cuda_args, get_dataarray_resolution, ngjit,
4453
has_cuda_and_cupy, is_cupy_array, is_dask_cupy,
4554
)
4655
from xrspatial.dataset_support import supports_dataset
@@ -236,6 +245,191 @@ def _cost_distance_numpy(source_data, friction_data, cellsize_x, cellsize_y,
236245
)
237246

238247

248+
# ---------------------------------------------------------------------------
249+
# CuPy GPU backend — iterative parallel relaxation (parallel Bellman-Ford)
250+
# ---------------------------------------------------------------------------
251+
252+
@cuda.jit
def _cost_distance_relax_kernel(friction, dist, changed,
                                height, width,
                                dy, dx, dd, n_neighbors,
                                max_cost):
    """One relaxation pass: each pixel checks all neighbours for shorter paths.

    Iterate until *changed* stays 0.  Convergence is guaranteed because
    dist values only decrease and the graph has finite edge weights.

    Parameters
    ----------
    friction : 2D device array
        Per-pixel friction; a pixel is passable only when its friction is
        finite and strictly positive.
    dist : 2D device array
        Current best-known accumulated cost per pixel; updated in place.
    changed : 1-element int device array
        Set to 1 whenever any pixel improves this pass (convergence flag
        read by the host loop).
    height, width : int
        Grid dimensions.
    dy, dx : device arrays
        Row/column offsets of the neighbour stencil.
    dd : device array
        Step length for each neighbour offset — presumably already scaled
        by the cell size (computed by the caller; not visible here).
    n_neighbors : int
        Number of entries in dy/dx/dd.
    max_cost : float
        Budget; improvements above this are discarded.

    NOTE(review): dist is read and written by concurrent threads without
    atomics.  Because values only ever decrease, a stale read can at worst
    delay convergence by an extra pass, never produce a too-small value.
    """
    iy, ix = cuda.grid(2)
    # Threads outside the raster do nothing.
    if iy >= height or ix >= width:
        return

    # Impassable pixels (non-finite or non-positive friction) never relax.
    f_u = friction[iy, ix]
    if not (_math.isfinite(f_u) and f_u > 0.0):
        return

    current = dist[iy, ix]
    best = current

    for k in range(n_neighbors):
        vy = iy + dy[k]
        vx = ix + dx[k]
        # Skip neighbours outside the raster.
        if vy < 0 or vy >= height or vx < 0 or vx >= width:
            continue

        # A neighbour can only help if it is already cheaper than our best.
        d_v = dist[vy, vx]
        if d_v >= best:
            continue

        # Paths cannot pass through impassable neighbours.
        f_v = friction[vy, vx]
        if not (_math.isfinite(f_v) and f_v > 0.0):
            continue

        # Edge weight: step length times the mean friction of both endpoints.
        edge_cost = dd[k] * (f_u + f_v) * 0.5
        new_cost = d_v + edge_cost

        if new_cost < best:
            best = new_cost

    # Commit only genuine, in-budget improvements; flag that another pass
    # is needed so the host keeps iterating.
    if best < current and best <= max_cost:
        dist[iy, ix] = best
        changed[0] = 1
296+
297+
298+
def _cost_distance_cupy(source_data, friction_data, cellsize_x, cellsize_y,
                        max_cost, target_values, dy, dx, dd):
    """GPU cost-distance via iterative parallel relaxation.

    Each CUDA thread processes one pixel per iteration, checking all
    neighbours for shorter paths (parallel Bellman-Ford).  Every pass
    finalises at least one more hop along every shortest path, so the
    worst case needs as many passes as the longest shortest path has
    pixels — up to ``height * width`` when a path snakes through the
    whole raster (e.g. a serpentine maze of impassable cells).  Typical
    inputs converge far sooner, and the loop exits early on the first
    pass that changes nothing.

    Parameters
    ----------
    source_data, friction_data : 2D cupy arrays
        Source raster and friction surface (same shape).
    cellsize_x, cellsize_y : float
        Cell sizes; kept for signature parity with the other backends —
        the step lengths in ``dd`` are presumably already scaled by them.
    max_cost : float
        Budget; cells costlier than this become NaN.
    target_values : sequence
        Source pixel values to seed from; empty means "any finite
        non-zero pixel".
    dy, dx, dd : sequences
        Neighbour stencil offsets and step lengths.

    Returns
    -------
    2D cupy float32 array
        Accumulated cost, NaN where unreachable or over budget.
    """
    import cupy as cp

    height, width = source_data.shape
    src = source_data.astype(cp.float64) if source_data.dtype != cp.float64 \
        else source_data
    fric = friction_data.astype(cp.float64) if friction_data.dtype != cp.float64 \
        else friction_data

    # Initialize all distances to inf
    dist = cp.full((height, width), cp.inf, dtype=cp.float64)

    # Find source pixels
    if len(target_values) == 0:
        source_mask = cp.isfinite(src) & (src != 0)
    else:
        source_mask = cp.isin(src, cp.asarray(target_values, dtype=cp.float64))
        source_mask &= cp.isfinite(src)

    # Only seed sources on passable terrain
    passable = cp.isfinite(fric) & (fric > 0)
    source_mask &= passable
    dist[source_mask] = 0.0

    # No usable sources -> everything is unreachable.
    if not cp.any(source_mask):
        return cp.full((height, width), cp.nan, dtype=cp.float32)

    # Transfer neighbor offsets to device
    dy_d = cp.asarray(dy, dtype=cp.int64)
    dx_d = cp.asarray(dx, dtype=cp.int64)
    dd_d = cp.asarray(dd, dtype=cp.float64)
    n_neighbors = len(dy)

    changed = cp.zeros(1, dtype=cp.int32)
    griddim, blockdim = cuda_args((height, width))

    # Worst-case pass count is the pixel count, not height + width: a
    # shortest path can visit every pixel in a serpentine layout, and each
    # pass only extends correctness by one hop.  The early break below
    # keeps common inputs at roughly O(diameter) passes.
    max_iterations = height * width
    for _ in range(max_iterations):
        changed[0] = 0
        _cost_distance_relax_kernel[griddim, blockdim](
            fric, dist, changed,
            height, width,
            dy_d, dx_d, dd_d, n_neighbors,
            np.float64(max_cost),
        )
        # Reading changed[0] synchronizes with the kernel; a pass with no
        # improvement anywhere means the solution is a fixpoint.
        if int(changed[0]) == 0:
            break

    # Convert: inf or over-budget -> NaN, else float32
    out = cp.where(
        cp.isinf(dist) | (dist > max_cost), cp.nan, dist,
    ).astype(cp.float32)
    return out
359+
360+
361+
def _cost_distance_dask_cupy(source_da, friction_da,
                             cellsize_x, cellsize_y, max_cost,
                             target_values, dy, dx, dd):
    """Dask+CuPy cost distance.

    Bounded max_cost: ``da.map_overlap`` with per-chunk GPU relaxation.
    Unbounded / large radius: convert to dask+numpy, use CPU iterative path
    (Dijkstra is O(N log N) vs parallel relaxation's O(N * diameter), so CPU
    is more efficient for unbounded global shortest paths).

    Parameters
    ----------
    source_da, friction_da : 2D dask arrays of cupy chunks
        Source raster and friction surface.
    cellsize_x, cellsize_y : float
        Cell sizes; used to translate the cost budget into a pixel radius.
    max_cost : float
        Budget; np.inf means unbounded.
    target_values, dy, dx, dd
        Forwarded unchanged to the per-chunk / CPU solvers.

    Returns
    -------
    Dask array of cupy float32 chunks.
    """
    import cupy as cp

    use_map_overlap = False
    if np.isfinite(max_cost):
        # A bounded budget caps the travel radius: with minimum friction
        # f_min and cell size c, no in-budget path can span more than
        # max_cost / (f_min * c) cells.  If that halo fits inside every
        # chunk, chunks can be solved independently on the GPU.
        positive_friction = da.where(friction_da > 0, friction_da, np.inf)
        # Eager compute: the overlap depth must be a concrete number at
        # graph-construction time.
        f_min = float(da.nanmin(positive_friction).compute())
        if np.isfinite(f_min) and f_min > 0:
            min_cellsize = min(cellsize_x, cellsize_y)
            max_radius = max_cost / (f_min * min_cellsize)
            pad = int(max_radius + 1)
            chunks_y, chunks_x = source_da.chunks
            if pad < max(chunks_y) and pad < max(chunks_x):
                use_map_overlap = True

    if use_map_overlap:
        # Per-axis halo depth in pixels, derived from the cost budget.
        pad_y = int(max_cost / (f_min * cellsize_y) + 1)
        pad_x = int(max_cost / (f_min * cellsize_x) + 1)

        # Closure captures the scalar parameters
        tv = target_values
        mc = max_cost
        cx, cy = cellsize_x, cellsize_y
        _dy, _dx, _dd = dy, dx, dd

        def _chunk_func(source_block, friction_block):
            # Solve each padded chunk independently on the GPU.
            return _cost_distance_cupy(
                source_block, friction_block,
                cx, cy, mc, tv, _dy, _dx, _dd,
            )

        # NaN boundary: the kernel treats non-finite friction as
        # impassable, so halo cells outside the raster are inert.
        return da.map_overlap(
            _chunk_func,
            source_da, friction_da,
            depth=(pad_y, pad_x),
            boundary=np.nan,
            dtype=np.float32,
            meta=cp.array((), dtype=cp.float32),
        )

    # Unbounded or padding too large: convert to dask+numpy, use CPU path
    source_np = source_da.map_blocks(
        lambda b: b.get(), dtype=source_da.dtype,
        meta=np.array((), dtype=source_da.dtype),
    )
    friction_np = friction_da.map_blocks(
        lambda b: b.get(), dtype=friction_da.dtype,
        meta=np.array((), dtype=friction_da.dtype),
    )
    result = _cost_distance_dask(
        source_np, friction_np,
        cellsize_x, cellsize_y, max_cost,
        target_values, dy, dx, dd,
    )
    # Convert back to dask+cupy
    return result.map_blocks(
        cp.asarray, dtype=result.dtype,
        meta=cp.array((), dtype=result.dtype),
    )
431+
432+
239433
# ---------------------------------------------------------------------------
240434
# Tile kernel for iterative boundary Dijkstra
241435
# ---------------------------------------------------------------------------
@@ -965,28 +1159,13 @@ def cost_distance(
9651159
chunks=source_data.chunks)
9661160

9671161
if _is_cupy_backend:
968-
import cupy
969-
# Transfer to CPU, run numpy kernel, transfer back
970-
source_np = source_data.get()
971-
friction_np = np.asarray(friction.data.get(), dtype=np.float64)
972-
result_data = cupy.asarray(
973-
_cost_distance_numpy(
974-
source_np, friction_np,
975-
cellsize_x, cellsize_y, max_cost_f,
976-
target_values, dy, dx, dd,
977-
)
1162+
result_data = _cost_distance_cupy(
1163+
source_data, friction_data,
1164+
cellsize_x, cellsize_y, max_cost_f,
1165+
target_values, dy, dx, dd,
9781166
)
9791167
elif _is_dask_cupy:
980-
# Spill each chunk to CPU via map_overlap, then wrap back as dask
981-
source_data = source_data.map_blocks(
982-
lambda b: b.get(), dtype=source_data.dtype,
983-
meta=np.array((), dtype=source_data.dtype),
984-
)
985-
friction_data = friction_data.map_blocks(
986-
lambda b: b.get(), dtype=friction_data.dtype,
987-
meta=np.array((), dtype=friction_data.dtype),
988-
)
989-
result_data = _cost_distance_dask(
1168+
result_data = _cost_distance_dask_cupy(
9901169
source_data, friction_data,
9911170
cellsize_x, cellsize_y, max_cost_f,
9921171
target_values, dy, dx, dd,

0 commit comments

Comments
 (0)