11module PointNeighborsCUDAExt
22
3- using PointNeighbors: PointNeighbors, generic_kernel, CUDAMultiGPUBackend, KernelAbstractions
4- using CUDA: CUDA, CuArray, CUDABackend
3+ using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors,
4+ GridNeighborhoodSearch, FullGridCellList, extract_svector,
5+ cell_coords, cell_index, check_cell_bounds, pushat_atomic!
6+ using CUDA: CUDA, CuArray, CUDABackend, cu
7+ using UnsafeAtomics: UnsafeAtomics
8+ using PointNeighbors. KernelAbstractions: KernelAbstractions, @kernel , @index
9+ using PointNeighbors. Adapt: Adapt
510
611const UnifiedCuArray = CuArray{<: Any , <: Any , CUDA. UnifiedMemory}
712
@@ -10,16 +15,28 @@ const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory}
1015PointNeighbors. get_backend (x:: UnifiedCuArray ) = CUDAMultiGPUBackend ()
1116
1217# Convert input array to `CuArray` with unified memory
13- function PointNeighbors . Adapt. adapt_structure (to:: CUDAMultiGPUBackend , array:: Array )
18+ function Adapt. adapt_structure (to:: CUDAMultiGPUBackend , array:: Array )
1419 return CuArray {eltype(array), ndims(array), CUDA.UnifiedMemory} (array)
1520end
1621
1722@inline function PointNeighbors. parallel_foreach (f, iterator, x:: UnifiedCuArray )
1823 PointNeighbors. parallel_foreach (f, iterator, CUDAMultiGPUBackend ())
1924end
2025
21- # On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl
22- @inline function PointNeighbors. parallel_foreach (f, iterator, x:: CUDAMultiGPUBackend )
26+ @inline function PointNeighbors. parallel_foreach (f:: T , iterator, x:: CUDAMultiGPUBackend ) where {T}
27+ parallel_foreach2 ((i, gpu) -> @inline f (i), iterator, x)
28+ end
29+
30+ @inline function parallel_foreach2 (f, iterator, x:: UnifiedCuArray )
31+ parallel_foreach2 (f, iterator, CUDAMultiGPUBackend ())
32+ end
33+
34+ KernelAbstractions. @kernel function generic_kernel2 (f, gpu)
35+ i = KernelAbstractions. @index (Global)
36+ @inline f (i, gpu)
37+ end
38+
39+ @inline function parallel_foreach2 (f, iterator, x:: CUDAMultiGPUBackend )
2340 # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
2441 # and index with `iterator[eachindex(iterator)[i]]`.
2542 # Note that this only works with vector-like iterators that support arbitrary indexing.
3552
3653 backend = CUDABackend ()
3754
38- # Spawn kernel on each device
55+ # Synchronize each device
56+ for i in 1 : n_gpus
57+ CUDA. device! (i - 1 )
58+ KernelAbstractions. synchronize (backend)
59+ end
60+
61+ # Launch kernel on each device
3962 for (i, indices_) in enumerate (indices_split)
4063 # Select the correct device for this partition
4164 CUDA. device! (i - 1 )
4265
4366 # Call the generic kernel, which only calls a function with the global GPU index
44- generic_kernel (backend)(ndrange = length (indices_)) do j
45- @inbounds @inline f (iterator[indices_[j]])
67+ generic_kernel2 (backend)(i, ndrange = length (indices_)) do j, gpu
68+ @inbounds @inline f (iterator[indices_[j]], gpu )
4669 end
4770 end
4871
4972 # Synchronize each device
50- for i in 1 : length (indices_split)
73+ for i in 1 : n_gpus
5174 CUDA. device! (i - 1 )
5275 KernelAbstractions. synchronize (backend)
5376 end
5679 CUDA. device! (0 )
5780end
5881
82+ struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
83+ vov:: VOV
84+ vov_per_gpu:: V
85+ end
86+
87+ function MultiGPUVectorOfVectors (vov, vov_per_gpu)
88+ MultiGPUVectorOfVectors {eltype(vov.backend), typeof(vov), typeof(vov_per_gpu)} (vov, vov_per_gpu)
89+ end
90+
91+ Adapt. @adapt_structure (MultiGPUVectorOfVectors)
92+
93+ function Adapt. adapt_structure (to:: CUDAMultiGPUBackend , vov:: DynamicVectorOfVectors{T} ) where {T}
94+ max_inner_length, max_outer_length = size (vov. backend)
95+
96+ n_gpus = length (CUDA. devices ())
97+ vov_per_gpu = [DynamicVectorOfVectors {T} (; max_outer_length, max_inner_length) for _ in 1 : n_gpus]
98+
99+ vov_ = DynamicVectorOfVectors (Adapt. adapt (to, vov. backend), Adapt. adapt (to, vov. length_), Adapt. adapt (to, vov. lengths))
100+ vov_per_gpu_ = ntuple (i -> Adapt. adapt (CuArray, vov_per_gpu[i]), n_gpus)
101+ for vov__ in vov_per_gpu_
102+ vov__. length_[] = vov. length_[]
103+ end
104+
105+ return MultiGPUVectorOfVectors {T, typeof(vov_), typeof(vov_per_gpu_)} (vov_, vov_per_gpu_)
106+ end
107+
108+ const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<: Any , <: Any , <: FullGridCellList{<:MultiGPUVectorOfVectors} }
109+
110+ function PointNeighbors. initialize_grid! (neighborhood_search:: MultiGPUNeighborhoodSearch , y:: AbstractMatrix )
111+ (; cell_list) = neighborhood_search
112+ (; cells) = cell_list
113+ (; vov, vov_per_gpu) = cells
114+
115+ for vov_ in vov_per_gpu
116+ vov_. lengths .= 0
117+ end
118+
119+ if neighborhood_search. search_radius < eps ()
120+ # Cannot initialize with zero search radius.
121+ # This is used in TrixiParticles when a neighborhood search is not used.
122+ return neighborhood_search
123+ end
124+
125+ # Fill cell lists per GPU
126+ # TODO split range into chunks for each GPU
127+ parallel_foreach2 (axes (y, 2 ), y) do point, gpu
128+ # Get cell index of the point's cell
129+ point_coords = extract_svector (y, Val (ndims (neighborhood_search)), point)
130+ cell = cell_coords (point_coords, neighborhood_search)
131+
132+ # Add point to corresponding cell
133+ @boundscheck check_cell_bounds (cell_list, cell)
134+
135+ # The atomic version of `pushat!` uses atomics to avoid race conditions when
136+ # `pushat!` is used in a parallel loop.
137+ @inbounds pushat_atomic! (vov_per_gpu[gpu], cell_index (cell_list, cell), point)
138+ end
139+
140+ lengths = ntuple (gpu -> vov_per_gpu[gpu]. lengths, length (vov_per_gpu))
141+ CUDA. synchronize ()
142+ offsets_ = cu (cumsum (lengths), unified = true )
143+ CUDA. synchronize ()
144+ vov. lengths .= offsets_[end ]
145+ CUDA. synchronize ()
146+ offsets = offsets_ .- lengths
147+
148+ # Merge cell lists
149+ parallel_foreach2 (axes (vov, 2 ), y) do cell, gpu
150+ offset = offsets[gpu][cell]
151+
152+ points = vov_per_gpu[gpu][cell]
153+ for i in eachindex (points)
154+ vov. backend[offset + i, cell] = points[i]
155+ end
156+ end
157+
158+ return neighborhood_search
159+ end
160+
161+ function PointNeighbors. update! (neighborhood_search:: MultiGPUNeighborhoodSearch ,
162+ x:: AbstractMatrix , y:: AbstractMatrix ;
163+ points_moving = (true , true ), parallelization_backend = x)
164+ # The coordinates of the first set of points are irrelevant for this NHS.
165+ # Only update when the second set is moving.
166+ points_moving[2 ] || return neighborhood_search
167+
168+ PointNeighbors. initialize_grid! (neighborhood_search, y)
169+ end
170+
59171end # module
0 commit comments