Skip to content

Commit fc7180f

Browse files
committed
Update on each GPU in device memory and then copy to unified memory
1 parent 54f6a07 commit fc7180f

1 file changed

Lines changed: 121 additions & 9 deletions

File tree

ext/PointNeighborsCUDAExt.jl

Lines changed: 121 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
module PointNeighborsCUDAExt
22

3-
using PointNeighbors: PointNeighbors, generic_kernel, CUDAMultiGPUBackend, KernelAbstractions
4-
using CUDA: CUDA, CuArray, CUDABackend
3+
using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors,
4+
GridNeighborhoodSearch, FullGridCellList, extract_svector,
5+
cell_coords, cell_index, check_cell_bounds, pushat_atomic!
6+
using CUDA: CUDA, CuArray, CUDABackend, cu
7+
using UnsafeAtomics: UnsafeAtomics
8+
using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index
9+
using PointNeighbors.Adapt: Adapt
510

611
const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory}
712

@@ -10,16 +15,28 @@ const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory}
1015
PointNeighbors.get_backend(x::UnifiedCuArray) = CUDAMultiGPUBackend()
1116

1217
# Convert input array to `CuArray` with unified memory
13-
function PointNeighbors.Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array)
18+
function Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array)
1419
return CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array)
1520
end
1621

1722
@inline function PointNeighbors.parallel_foreach(f, iterator, x::UnifiedCuArray)
1823
PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend())
1924
end
2025

21-
# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl
22-
@inline function PointNeighbors.parallel_foreach(f, iterator, x::CUDAMultiGPUBackend)
26+
@inline function PointNeighbors.parallel_foreach(f::T, iterator, x::CUDAMultiGPUBackend) where {T}
27+
parallel_foreach2((i, gpu) -> @inline f(i), iterator, x)
28+
end
29+
30+
@inline function parallel_foreach2(f, iterator, x::UnifiedCuArray)
31+
parallel_foreach2(f, iterator, CUDAMultiGPUBackend())
32+
end
33+
34+
KernelAbstractions.@kernel function generic_kernel2(f, gpu)
35+
i = KernelAbstractions.@index(Global)
36+
@inline f(i, gpu)
37+
end
38+
39+
@inline function parallel_foreach2(f, iterator, x::CUDAMultiGPUBackend)
2340
# On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
2441
# and index with `iterator[eachindex(iterator)[i]]`.
2542
# Note that this only works with vector-like iterators that support arbitrary indexing.
@@ -35,19 +52,25 @@ end
3552

3653
backend = CUDABackend()
3754

38-
# Spawn kernel on each device
55+
# Synchronize each device
56+
for i in 1:n_gpus
57+
CUDA.device!(i - 1)
58+
KernelAbstractions.synchronize(backend)
59+
end
60+
61+
# Launch kernel on each device
3962
for (i, indices_) in enumerate(indices_split)
4063
# Select the correct device for this partition
4164
CUDA.device!(i - 1)
4265

4366
# Call the generic kernel, which only calls a function with the global GPU index
44-
generic_kernel(backend)(ndrange = length(indices_)) do j
45-
@inbounds @inline f(iterator[indices_[j]])
67+
generic_kernel2(backend)(i, ndrange = length(indices_)) do j, gpu
68+
@inbounds @inline f(iterator[indices_[j]], gpu)
4669
end
4770
end
4871

4972
# Synchronize each device
50-
for i in 1:length(indices_split)
73+
for i in 1:n_gpus
5174
CUDA.device!(i - 1)
5275
KernelAbstractions.synchronize(backend)
5376
end
@@ -56,4 +79,93 @@ end
5679
CUDA.device!(0)
5780
end
5881

82+
struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
83+
vov::VOV
84+
vov_per_gpu::V
85+
end
86+
87+
function MultiGPUVectorOfVectors(vov, vov_per_gpu)
88+
MultiGPUVectorOfVectors{eltype(vov.backend), typeof(vov), typeof(vov_per_gpu)}(vov, vov_per_gpu)
89+
end
90+
91+
Adapt.@adapt_structure(MultiGPUVectorOfVectors)
92+
93+
function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
94+
max_inner_length, max_outer_length = size(vov.backend)
95+
96+
n_gpus = length(CUDA.devices())
97+
vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]
98+
99+
vov_ = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
100+
vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus)
101+
for vov__ in vov_per_gpu_
102+
vov__.length_[] = vov.length_[]
103+
end
104+
105+
return MultiGPUVectorOfVectors{T, typeof(vov_), typeof(vov_per_gpu_)}(vov_, vov_per_gpu_)
106+
end
107+
108+
const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
109+
110+
function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
111+
(; cell_list) = neighborhood_search
112+
(; cells) = cell_list
113+
(; vov, vov_per_gpu) = cells
114+
115+
for vov_ in vov_per_gpu
116+
vov_.lengths .= 0
117+
end
118+
119+
if neighborhood_search.search_radius < eps()
120+
# Cannot initialize with zero search radius.
121+
# This is used in TrixiParticles when a neighborhood search is not used.
122+
return neighborhood_search
123+
end
124+
125+
# Fill cell lists per GPU
126+
# TODO split range into chunks for each GPU
127+
parallel_foreach2(axes(y, 2), y) do point, gpu
128+
# Get cell index of the point's cell
129+
point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
130+
cell = cell_coords(point_coords, neighborhood_search)
131+
132+
# Add point to corresponding cell
133+
@boundscheck check_cell_bounds(cell_list, cell)
134+
135+
# The atomic version of `pushat!` uses atomics to avoid race conditions when
136+
# `pushat!` is used in a parallel loop.
137+
@inbounds pushat_atomic!(vov_per_gpu[gpu], cell_index(cell_list, cell), point)
138+
end
139+
140+
lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, length(vov_per_gpu))
141+
CUDA.synchronize()
142+
offsets_ = cu(cumsum(lengths), unified = true)
143+
CUDA.synchronize()
144+
vov.lengths .= offsets_[end]
145+
CUDA.synchronize()
146+
offsets = offsets_ .- lengths
147+
148+
# Merge cell lists
149+
parallel_foreach2(axes(vov, 2), y) do cell, gpu
150+
offset = offsets[gpu][cell]
151+
152+
points = vov_per_gpu[gpu][cell]
153+
for i in eachindex(points)
154+
vov.backend[offset + i, cell] = points[i]
155+
end
156+
end
157+
158+
return neighborhood_search
159+
end
160+
161+
function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
162+
x::AbstractMatrix, y::AbstractMatrix;
163+
points_moving = (true, true), parallelization_backend = x)
164+
# The coordinates of the first set of points are irrelevant for this NHS.
165+
# Only update when the second set is moving.
166+
points_moving[2] || return neighborhood_search
167+
168+
PointNeighbors.initialize_grid!(neighborhood_search, y)
169+
end
170+
59171
end # module

0 commit comments

Comments
 (0)