Skip to content

Commit 1baadd7

Browse files
committed
Add native CUDA kernel for hydraulic erosion (#961)
Replaces the CPU-fallback path for CuPy and Dask+CuPy arrays with a real GPU kernel. Each particle maps to one CUDA thread; conflicts at shared heightmap cells are resolved with cuda.atomic.add. Refactors erode() to use ArrayTypeFunctionMapping dispatch instead of manual type checks. Also adds test_erosion.py with 15 tests covering numpy, dask+numpy, cupy, and dask+cupy backends.
1 parent fc9d832 commit 1baadd7

File tree

2 files changed

+413
-38
lines changed

2 files changed

+413
-38
lines changed

xrspatial/erosion.py

Lines changed: 206 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,12 @@ class cupy(object):
1717
except ImportError:
1818
da = None
1919

20-
from xrspatial.utils import has_cuda_and_cupy, is_cupy_array, is_dask_cupy
20+
from xrspatial.utils import (
21+
ArrayTypeFunctionMapping,
22+
has_cuda_and_cupy,
23+
is_cupy_array,
24+
is_dask_cupy,
25+
)
2126

2227
# Default erosion parameters
2328
_DEFAULT_PARAMS = dict(
@@ -157,6 +162,199 @@ def _erode_cpu(heightmap, random_pos, boy, box, bw,
157162
return heightmap
158163

159164

165+
# ---------------------------------------------------------------------------
166+
# GPU (CuPy / CUDA) implementation
167+
# ---------------------------------------------------------------------------
168+
169+
if cuda is not None:
170+
171+
@cuda.jit
172+
def _erode_gpu_kernel(
173+
heightmap, random_pos, boy, box, bw,
174+
inertia, capacity, deposition, erosion_rate,
175+
evaporation, gravity, min_slope, max_lifetime,
176+
):
177+
"""One CUDA thread per particle.
178+
179+
Particles within a single launch are independent. Conflicts at
180+
shared heightmap cells are resolved via ``cuda.atomic.add``.
181+
"""
182+
tid = cuda.grid(1)
183+
n_iterations = random_pos.shape[0]
184+
if tid >= n_iterations:
185+
return
186+
187+
height = heightmap.shape[0]
188+
width = heightmap.shape[1]
189+
n_brush = bw.shape[0]
190+
191+
pos_x = random_pos[tid, 0] * (width - 3) + 1
192+
pos_y = random_pos[tid, 1] * (height - 3) + 1
193+
dir_x = 0.0
194+
dir_y = 0.0
195+
speed = 1.0
196+
water = 1.0
197+
sediment = 0.0
198+
199+
for step in range(max_lifetime):
200+
node_x = int(pos_x)
201+
node_y = int(pos_y)
202+
203+
if node_x < 1 or node_x >= width - 2:
204+
return
205+
if node_y < 1 or node_y >= height - 2:
206+
return
207+
208+
fx = pos_x - node_x
209+
fy = pos_y - node_y
210+
211+
h00 = heightmap[node_y, node_x]
212+
h10 = heightmap[node_y, node_x + 1]
213+
h01 = heightmap[node_y + 1, node_x]
214+
h11 = heightmap[node_y + 1, node_x + 1]
215+
216+
grad_x = (h10 - h00) * (1 - fy) + (h11 - h01) * fy
217+
grad_y = (h01 - h00) * (1 - fx) + (h11 - h10) * fx
218+
219+
dir_x = dir_x * inertia - grad_x * (1 - inertia)
220+
dir_y = dir_y * inertia - grad_y * (1 - inertia)
221+
222+
dir_len = (dir_x * dir_x + dir_y * dir_y) ** 0.5
223+
if dir_len < 1e-10:
224+
return
225+
dir_x /= dir_len
226+
dir_y /= dir_len
227+
228+
new_x = pos_x + dir_x
229+
new_y = pos_y + dir_y
230+
231+
if new_x < 1 or new_x >= width - 2:
232+
return
233+
if new_y < 1 or new_y >= height - 2:
234+
return
235+
236+
h_old = (h00 * (1 - fx) * (1 - fy) + h10 * fx * (1 - fy) +
237+
h01 * (1 - fx) * fy + h11 * fx * fy)
238+
239+
new_node_x = int(new_x)
240+
new_node_y = int(new_y)
241+
new_fx = new_x - new_node_x
242+
new_fy = new_y - new_node_y
243+
h_new = (heightmap[new_node_y, new_node_x] * (1 - new_fx) * (1 - new_fy) +
244+
heightmap[new_node_y, new_node_x + 1] * new_fx * (1 - new_fy) +
245+
heightmap[new_node_y + 1, new_node_x] * (1 - new_fx) * new_fy +
246+
heightmap[new_node_y + 1, new_node_x + 1] * new_fx * new_fy)
247+
248+
h_diff = h_new - h_old
249+
250+
neg_h_diff = -h_diff
251+
if neg_h_diff < min_slope:
252+
neg_h_diff = min_slope
253+
sed_capacity = neg_h_diff * speed * water * capacity
254+
255+
if sediment > sed_capacity or h_diff > 0:
256+
if h_diff > 0:
257+
amount = h_diff if h_diff < sediment else sediment
258+
else:
259+
amount = (sediment - sed_capacity) * deposition
260+
261+
sediment -= amount
262+
263+
cuda.atomic.add(heightmap, (node_y, node_x),
264+
amount * (1 - fx) * (1 - fy))
265+
cuda.atomic.add(heightmap, (node_y, node_x + 1),
266+
amount * fx * (1 - fy))
267+
cuda.atomic.add(heightmap, (node_y + 1, node_x),
268+
amount * (1 - fx) * fy)
269+
cuda.atomic.add(heightmap, (node_y + 1, node_x + 1),
270+
amount * fx * fy)
271+
else:
272+
neg_h = -h_diff
273+
sc_minus_sed = (sed_capacity - sediment) * erosion_rate
274+
amount = sc_minus_sed if sc_minus_sed < neg_h else neg_h
275+
276+
for k in range(n_brush):
277+
ey = node_y + boy[k]
278+
ex = node_x + box[k]
279+
if 0 <= ey < height and 0 <= ex < width:
280+
cuda.atomic.add(heightmap, (ey, ex),
281+
-(amount * bw[k]))
282+
283+
sediment += amount
284+
285+
speed_sq = speed * speed + h_diff * gravity
286+
if speed_sq > 0:
287+
speed = speed_sq ** 0.5
288+
else:
289+
speed = 0.0
290+
water *= (1 - evaporation)
291+
292+
pos_x = new_x
293+
pos_y = new_y
294+
295+
296+
def _erode_cupy(data, random_pos_np, boy, box, bw, params):
297+
"""Run erosion on a CuPy array using the CUDA kernel."""
298+
hm = data.astype(cupy.float64)
299+
300+
# Transfer brush and random positions to device
301+
random_pos_d = cupy.asarray(random_pos_np)
302+
boy_d = cupy.asarray(boy)
303+
box_d = cupy.asarray(box)
304+
bw_d = cupy.asarray(bw)
305+
306+
n_particles = random_pos_np.shape[0]
307+
threads_per_block = 256
308+
blocks = (n_particles + threads_per_block - 1) // threads_per_block
309+
310+
_erode_gpu_kernel[blocks, threads_per_block](
311+
hm, random_pos_d, boy_d, box_d, bw_d,
312+
params['inertia'], params['capacity'], params['deposition'],
313+
params['erosion'], params['evaporation'], params['gravity'],
314+
params['min_slope'], int(params['max_lifetime']),
315+
)
316+
cuda.synchronize()
317+
318+
return hm.astype(cupy.float32)
319+
320+
321+
def _erode_numpy(data, random_pos, boy, box, bw, params):
322+
"""Run erosion on a NumPy array using the CPU kernel."""
323+
hm = data.astype(np.float64).copy()
324+
325+
hm = _erode_cpu(
326+
hm, random_pos, boy, box, bw,
327+
params['inertia'], params['capacity'], params['deposition'],
328+
params['erosion'], params['evaporation'], params['gravity'],
329+
params['min_slope'], int(params['max_lifetime']),
330+
)
331+
332+
return hm.astype(np.float32)
333+
334+
335+
def _erode_dask_numpy(data, random_pos, boy, box, bw, params):
336+
"""Run erosion on a dask+numpy array.
337+
338+
Erosion is a global operation (particles traverse the full grid),
339+
so we materialize to numpy, run the CPU kernel, then re-wrap.
340+
"""
341+
np_data = data.compute()
342+
result = _erode_numpy(np_data, random_pos, boy, box, bw, params)
343+
return da.from_array(result, chunks=data.chunksize)
344+
345+
346+
def _erode_dask_cupy(data, random_pos, boy, box, bw, params):
347+
"""Run erosion on a dask+cupy array.
348+
349+
Materializes to a single CuPy array, runs the GPU kernel, then
350+
re-wraps as dask.
351+
"""
352+
cp_data = data.compute() # CuPy ndarray
353+
result = _erode_cupy(cp_data, random_pos, boy, box, bw, params)
354+
return da.from_array(result, chunks=data.chunksize,
355+
meta=cupy.array((), dtype=cupy.float32))
356+
357+
160358
def erode(agg, iterations=50000, seed=42, params=None):
161359
"""Apply particle-based hydraulic erosion to a terrain DataArray.
162360
@@ -185,49 +383,19 @@ def erode(agg, iterations=50000, seed=42, params=None):
185383
if params is not None:
186384
p.update(params)
187385

188-
data = agg.data
189-
is_dask = da is not None and isinstance(data, da.Array)
190-
is_gpu = False
191-
192-
if is_dask:
193-
if is_dask_cupy(agg):
194-
is_gpu = True
195-
data = data.compute() # cupy ndarray
196-
else:
197-
data = data.compute() # numpy ndarray
198-
199-
if has_cuda_and_cupy() and is_cupy_array(data):
200-
is_gpu = True
201-
202-
# work on a copy
203-
if is_gpu:
204-
hm = cupy.asnumpy(data).astype(np.float64)
205-
else:
206-
hm = data.astype(np.float64).copy()
207-
208-
# precompute brush and random positions outside JIT
386+
# Precompute brush and random positions on the host
209387
boy, box, bw = _build_brush(int(p['radius']))
210388
rng = np.random.RandomState(seed)
211389
random_pos = rng.random((iterations, 2))
212390

213-
hm = _erode_cpu(
214-
hm, random_pos, boy, box, bw,
215-
p['inertia'], p['capacity'], p['deposition'], p['erosion'],
216-
p['evaporation'], p['gravity'], p['min_slope'],
217-
int(p['max_lifetime']),
391+
mapper = ArrayTypeFunctionMapping(
392+
numpy_func=lambda d: _erode_numpy(d, random_pos, boy, box, bw, p),
393+
cupy_func=lambda d: _erode_cupy(d, random_pos, boy, box, bw, p),
394+
dask_func=lambda d: _erode_dask_numpy(d, random_pos, boy, box, bw, p),
395+
dask_cupy_func=lambda d: _erode_dask_cupy(d, random_pos, boy, box, bw, p),
218396
)
219397

220-
result_data = hm.astype(np.float32)
221-
if is_gpu:
222-
result_data = cupy.asarray(result_data)
223-
224-
if is_dask:
225-
if is_gpu:
226-
result_data = da.from_array(result_data,
227-
chunks=agg.data.chunksize,
228-
meta=cupy.array((), dtype=cupy.float32))
229-
else:
230-
result_data = da.from_array(result_data, chunks=agg.data.chunksize)
398+
result_data = mapper(agg)(agg.data)
231399

232400
return xr.DataArray(result_data, dims=agg.dims, coords=agg.coords,
233401
attrs=agg.attrs, name=agg.name)

0 commit comments

Comments
 (0)