@@ -2,11 +2,12 @@ module PointNeighborsCUDAExt
22
33using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors,
44 GridNeighborhoodSearch, FullGridCellList, extract_svector,
5- cell_coords, cell_index, check_cell_bounds, pushat_atomic!
5+ cell_coords, cell_index, @threaded , generic_kernel
66using CUDA: CUDA, CuArray, CUDABackend, cu
77using UnsafeAtomics: UnsafeAtomics
88using PointNeighbors. KernelAbstractions: KernelAbstractions, @kernel , @index
99using PointNeighbors. Adapt: Adapt
10+ using Base: @propagate_inbounds
1011
1112const UnifiedCuArray = CuArray{<: Any , <: Any , CUDA. UnifiedMemory}
1213
2425end
2526
2627@inline function PointNeighbors. parallel_foreach (f:: T , iterator, x:: CUDAMultiGPUBackend ) where {T}
27- parallel_foreach2 ((i, gpu) -> @inline f (i), iterator, x)
28- end
29-
30- @inline function parallel_foreach2 (f, iterator, x:: UnifiedCuArray )
31- parallel_foreach2 (f, iterator, CUDAMultiGPUBackend ())
32- end
33-
34- KernelAbstractions. @kernel function generic_kernel2 (f, gpu)
35- i = KernelAbstractions. @index (Global)
36- @inline f (i, gpu)
37- end
38-
39- @inline function parallel_foreach2 (f, iterator, x:: CUDAMultiGPUBackend )
4028 # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
4129 # and index with `iterator[eachindex(iterator)[i]]`.
4230 # Note that this only works with vector-like iterators that support arbitrary indexing.
6452 CUDA. device! (i - 1 )
6553
6654 # Call the generic kernel, which only calls a function with the global GPU index
67- generic_kernel2 (backend)(i, ndrange = length (indices_)) do j, gpu
68- @inbounds @inline f (iterator[indices_[j]], gpu )
55+ generic_kernel (backend)(ndrange = length (indices_)) do j
56+ @inbounds @inline f (iterator[indices_[j]])
6957 end
7058 end
7159
7967 CUDA. device! (0 )
8068end
8169
82- struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
83- vov :: VOV
84- vov_per_gpu :: V
85- end
86-
87- function MultiGPUVectorOfVectors (vov, vov_per_gpu)
88- MultiGPUVectorOfVectors {eltype(vov.backend), typeof(vov), typeof(vov_per_gpu)} (vov, vov_per_gpu )
70+ function atomic_system_add (ptr :: CUDA.LLVMPtr{Int32, CUDA.AS.Global} , val :: Int32 )
71+ CUDA . LLVM . Interop . @asmcall (
72+ " atom.sys.global.add.u32 \$ 0, [ \$ 1], \$ 2; " ,
73+ " =r,l,r,~{memory} " ,
74+ true , Int32, Tuple{CUDA . LLVMPtr{Int32, CUDA . AS . Global}, Int32},
75+ ptr, val
76+ )
8977end
9078
91- Adapt. @adapt_structure (MultiGPUVectorOfVectors)
79+ @inline function pushat_atomic_system! (vov, i, value)
80+ (; backend, lengths) = vov
9281
93- function Adapt. adapt_structure (to:: CUDAMultiGPUBackend , vov:: DynamicVectorOfVectors{T} ) where {T}
94- max_inner_length, max_outer_length = size (vov. backend)
82+ @boundscheck checkbounds (vov, i)
9583
96- n_gpus = length (CUDA. devices ())
97- vov_per_gpu = [DynamicVectorOfVectors {T} (; max_outer_length, max_inner_length) for _ in 1 : n_gpus]
84+ # Increment the column length with an atomic add to avoid race conditions.
85+ # Store the new value since it might be changed immediately afterwards by another
86+ # thread.
87+ # new_length = Atomix.@atomic lengths[i] += 1
88+ new_length = atomic_system_add (pointer (lengths, i), Int32 (1 )) + 1
9889
99- vov_ = DynamicVectorOfVectors (Adapt. adapt (to, vov. backend), Adapt. adapt (to, vov. length_), Adapt. adapt (to, vov. lengths))
100- vov_per_gpu_ = ntuple (i -> Adapt. adapt (CuArray, vov_per_gpu[i]), n_gpus)
101- for vov__ in vov_per_gpu_
102- vov__. length_[] = vov. length_[]
103- end
90+ # We can write here without race conditions, since the atomic add guarantees
91+ # that `new_length` is different for each thread.
92+ backend[new_length, i] = value
10493
105- return MultiGPUVectorOfVectors {T, typeof(vov_), typeof(vov_per_gpu_)} (vov_, vov_per_gpu_)
94+ return vov
10695end
10796
108- const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<: Any , <: Any , <: FullGridCellList{<:MultiGPUVectorOfVectors } }
97+ const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<: Any , <: Any , <: FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}} } }
10998
11099function PointNeighbors. initialize_grid! (neighborhood_search:: MultiGPUNeighborhoodSearch , y:: AbstractMatrix )
111100 (; cell_list) = neighborhood_search
112- (; cells) = cell_list
113- (; vov, vov_per_gpu) = cells
114101
115- for vov_ in vov_per_gpu
116- vov_. lengths .= 0
117- end
102+ cell_list. cells. lengths .= 0
118103
119104 if neighborhood_search. search_radius < eps ()
120105 # Cannot initialize with zero search radius.
121106 # This is used in TrixiParticles when a neighborhood search is not used.
122107 return neighborhood_search
123108 end
124109
125- # Fill cell lists per GPU
126- # TODO split range into chunks for each GPU
127- parallel_foreach2 (axes (y, 2 ), y) do point, gpu
110+ # Faster on a single GPU
111+ @threaded CUDABackend () for point in axes (y, 2 )
128112 # Get cell index of the point's cell
129- point_coords = extract_svector (y, Val (ndims (neighborhood_search)), point)
130- cell = cell_coords (point_coords, neighborhood_search)
113+ point_coords = PointNeighbors . extract_svector (y, Val (ndims (neighborhood_search)), point)
114+ cell = PointNeighbors . cell_coords (point_coords, neighborhood_search)
131115
132116 # Add point to corresponding cell
133- @boundscheck check_cell_bounds (cell_list, cell)
134-
135- # The atomic version of `pushat!` uses atomics to avoid race conditions when
136- # `pushat!` is used in a parallel loop.
137- @inbounds pushat_atomic! (vov_per_gpu[gpu], cell_index (cell_list, cell), point)
138- end
139-
140- lengths = ntuple (gpu -> vov_per_gpu[gpu]. lengths, length (vov_per_gpu))
141- CUDA. synchronize ()
142- offsets_ = cu (cumsum (lengths), unified = true )
143- CUDA. synchronize ()
144- vov. lengths .= offsets_[end ]
145- CUDA. synchronize ()
146- offsets = offsets_ .- lengths
147-
148- # Merge cell lists
149- parallel_foreach2 (axes (vov, 2 ), y) do cell, gpu
150- offset = offsets[gpu][cell]
151-
152- points = vov_per_gpu[gpu][cell]
153- for i in eachindex (points)
154- vov. backend[offset + i, cell] = points[i]
155- end
117+ pushat_atomic_system! (cell_list. cells, PointNeighbors. cell_index (cell_list, cell), point)
156118 end
157119
158120 return neighborhood_search
159121end
160122
161- function PointNeighbors . update! (neighborhood_search :: MultiGPUNeighborhoodSearch ,
162- x :: AbstractMatrix , y :: AbstractMatrix ;
163- points_moving = ( true , true ), parallelization_backend = x)
164- # The coordinates of the first set of points are irrelevant for this NHS.
165- # Only update when the second set is moving.
166- points_moving[ 2 ] || return neighborhood_search
123+ # This might be slightly faster, but causes problems with synchronization in the interaction
124+ # kernel because we are carrying around device memory.
125+ # struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}}
126+ # vov_unified :: VU
127+ # vov_device :: VD
128+ # end
167129
168- PointNeighbors. initialize_grid! (neighborhood_search, y)
169- end
130+ # function MultiGPUVectorOfVectors(vov_unified, vov_device)
131+ # MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_device)}(vov_unified, vov_device)
132+ # end
133+
134+ # # Adapt.@adapt_structure(MultiGPUVectorOfVectors)
135+
136+ # function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors)
137+ # @info "" to
138+ # return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device))
139+ # end
140+
141+ # @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i)
142+ # return getindex(vov.vov_unified, i)
143+ # end
144+
145+ # function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
146+ # max_inner_length, max_outer_length = size(vov.backend)
147+
148+ # # Make sure the vector of vectors in device memory lives on the first GPU
149+ # CUDA.device!(0)
150+
151+ # vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
152+ # vov_device = Adapt.adapt(CuArray, vov)
153+
154+ # return MultiGPUVectorOfVectors(vov_unified, vov_device)
155+ # end
156+
157+ # const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
158+
159+ # function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
160+ # (; cell_list) = neighborhood_search
161+ # (; cells) = cell_list
162+ # (; vov_unified, vov_device) = cells
163+
164+ # if neighborhood_search.search_radius < eps()
165+ # # Cannot initialize with zero search radius.
166+ # # This is used in TrixiParticles when a neighborhood search is not used.
167+ # return neighborhood_search
168+ # end
169+
170+ # vov_device.lengths .= 0
171+
172+ # CUDA.device!(0)
173+
174+ # # Fill vector of vectors in device memory (on a single GPU)
175+ # @threaded CUDABackend() for point in axes(y, 2)
176+ # # Get cell index of the point's cell
177+ # point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
178+ # cell = cell_coords(point_coords, neighborhood_search)
179+
180+ # # Add point to corresponding cell
181+ # @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)
182+
183+ # # The atomic version of `pushat!` uses atomics to avoid race conditions when
184+ # # `pushat!` is used in a parallel loop.
185+ # @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
186+ # end
187+
188+ # # Copy vector of vectors to unified memory
189+ # vov_unified.backend .= vov_device.backend
190+ # vov_unified.lengths .= vov_device.lengths
191+ # vov_unified.length_[] = vov_device.length_[]
192+
193+ # return neighborhood_search
194+ # end
195+
196+ # function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
197+ # x::AbstractMatrix, y::AbstractMatrix;
198+ # points_moving = (true, true), parallelization_backend = x)
199+ # # The coordinates of the first set of points are irrelevant for this NHS.
200+ # # Only update when the second set is moving.
201+ # points_moving[2] || return neighborhood_search
202+
203+ # PointNeighbors.initialize_grid!(neighborhood_search, y)
204+ # end
170205
171206end # module
0 commit comments