@@ -8,6 +8,7 @@ using UnsafeAtomics: UnsafeAtomics
88using PointNeighbors. KernelAbstractions: KernelAbstractions, @kernel , @index
99using PointNeighbors. Adapt: Adapt
1010using Base: @propagate_inbounds
11+ using PointNeighbors. BenchmarkTools
1112
1213const UnifiedCuArray = CuArray{<: Any , <: Any , CUDA. UnifiedMemory}
1314
6768 CUDA. device! (0 )
6869end
6970
70- function atomic_system_add (ptr:: CUDA.LLVMPtr{Int32, CUDA.AS.Global} , val:: Int32 )
71- CUDA. LLVM. Interop. @asmcall (
72- " atom.sys.global.add.u32 \$ 0, [\$ 1], \$ 2;" ,
73- " =r,l,r,~{memory}" ,
74- true , Int32, Tuple{CUDA. LLVMPtr{Int32, CUDA. AS. Global}, Int32},
75- ptr, val
76- )
77- end
71+ # ###########################################################################################
72+ # First approach: Use actual system atomics on unified memory
73+ # ###########################################################################################
74+ # function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32)
75+ # CUDA.LLVM.Interop.@asmcall(
76+ # "atom.sys.global.add.u32 \$0, [\$1], \$2;",
77+ # "=r,l,r,~{memory}",
78+ # true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32},
79+ # ptr, val
80+ # )
81+ # end
7882
79- @inline function pushat_atomic_system! (vov, i, value)
80- (; backend, lengths) = vov
83+ # @inline function pushat_atomic_system!(vov, i, value)
84+ # (; backend, lengths) = vov
8185
82- @boundscheck checkbounds (vov, i)
86+ # @boundscheck checkbounds(vov, i)
8387
84- # Increment the column length with an atomic add to avoid race conditions.
85- # Store the new value since it might be changed immediately afterwards by another
86- # thread.
87- # new_length = Atomix.@atomic lengths[i] += 1
88- new_length = atomic_system_add (pointer (lengths, i), Int32 (1 )) + 1
88+ # # Increment the column length with an atomic add to avoid race conditions.
89+ # # Store the new value since it might be changed immediately afterwards by another
90+ # # thread.
91+ # # new_length = Atomix.@atomic lengths[i] += 1
92+ # new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1
8993
90- # We can write here without race conditions, since the atomic add guarantees
91- # that `new_length` is different for each thread.
92- backend[new_length, i] = value
94+ # # We can write here without race conditions, since the atomic add guarantees
95+ # # that `new_length` is different for each thread.
96+ # backend[new_length, i] = value
9397
94- return vov
95- end
98+ # return vov
99+ # end
96100
97- const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<: Any , <: Any , <: FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}} }
101+ # const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}}
98102
99- function PointNeighbors. initialize_grid! (neighborhood_search:: MultiGPUNeighborhoodSearch , y:: AbstractMatrix )
100- (; cell_list) = neighborhood_search
103+ # function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
104+ # (; cell_list) = neighborhood_search
101105
102- cell_list. cells. lengths .= 0
106+ # cell_list.cells.lengths .= 0
103107
104- if neighborhood_search. search_radius < eps ()
105- # Cannot initialize with zero search radius.
106- # This is used in TrixiParticles when a neighborhood search is not used.
107- return neighborhood_search
108- end
108+ # if neighborhood_search.search_radius < eps()
109+ # # Cannot initialize with zero search radius.
110+ # # This is used in TrixiParticles when a neighborhood search is not used.
111+ # return neighborhood_search
112+ # end
109113
110- # Faster on a single GPU
111- @threaded CUDABackend () for point in axes (y, 2 )
112- # Get cell index of the point's cell
113- point_coords = PointNeighbors. extract_svector (y, Val (ndims (neighborhood_search)), point)
114- cell = PointNeighbors. cell_coords (point_coords, neighborhood_search)
114+ # # Faster on a single GPU
115+ # # CUDA.NVTX.@range "threaded loop" begin
116+ # @threaded y for point in axes(y, 2)
117+ # # Get cell index of the point's cell
118+ # point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
119+ # cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)
115120
116- # Add point to corresponding cell
117- pushat_atomic_system! (cell_list. cells, PointNeighbors. cell_index (cell_list, cell), point)
118- end
121+ # # Add point to corresponding cell
122+ # pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point)
123+ # end
124+ # # end
119125
120- return neighborhood_search
121- end
126+ # return neighborhood_search
127+ # end
122128
123- # This might be slightly faster, but causes problems with synchronization in the interaction
124- # kernel because we are carrying around device memory.
129+ # function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch,
130+ # y::AbstractMatrix; parallelization_backend = y)
131+ # PointNeighbors.initialize_grid!(neighborhood_search, y)
132+ # end
133+
134+ # ###########################################################################################
135+ # Second approach: Use a device array to update and copy to unified memory
136+ # ###########################################################################################
125137# struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}}
126138# vov_unified :: VU
127139# vov_device :: VD
133145
134146# # Adapt.@adapt_structure(MultiGPUVectorOfVectors)
135147
136- # function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors)
137- # @info "" to
138- # return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device))
148+ # function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
149+ # return Adapt.adapt(to, vov.vov_unified)
139150# end
140151
141152# @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i)
182193
183194# # The atomic version of `pushat!` uses atomics to avoid race conditions when
184195# # `pushat!` is used in a parallel loop.
185- # @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
196+ # @inbounds PointNeighbors. pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
186197# end
187198
188199# # Copy vector of vectors to unified memory
@@ -203,4 +214,191 @@ end
203214# PointNeighbors.initialize_grid!(neighborhood_search, y)
204215# end
205216
217+ # ###########################################################################################
218+ # Third approach: Like second approach, but a device array for each GPU.
219+
220+ # This should have the advantage that each GPU updates the part of the grid that should
221+ # already be in its memory and we can avoid moving everything to the first GPU.
222+ # ###########################################################################################
223+ KernelAbstractions. @kernel function generic_kernel2 (f, gpu)
224+ i = KernelAbstractions. @index (Global)
225+ @inline f (i, gpu)
226+ end
227+
228+ @inline function parallel_foreach2 (f:: T , iterator, vov_per_gpu) where {T}
229+ # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
230+ # and index with `iterator[eachindex(iterator)[i]]`.
231+ # Note that this only works with vector-like iterators that support arbitrary indexing.
232+ indices = eachindex (iterator)
233+
234+ # Skip empty loops
235+ length (indices) == 0 && return
236+
237+ # Partition `ndrange` to the GPUs
238+ n_gpus = length (CUDA. devices ())
239+ indices_split = Iterators. partition (indices, ceil (Int, length (indices) / n_gpus))
240+ @assert length (indices_split) <= n_gpus
241+
242+ backend = CUDABackend ()
243+
244+ # Synchronize each device
245+ for i in 1 : n_gpus
246+ CUDA. device! (i - 1 )
247+ KernelAbstractions. synchronize (backend)
248+ end
249+
250+ # Launch kernel on each device
251+ for (i, indices_) in enumerate (indices_split)
252+ # Select the correct device for this partition
253+ CUDA. device! (i - 1 )
254+
255+ # Call the generic kernel, which only calls a function with the global GPU index
256+ generic_kernel2 (backend)(vov_per_gpu[i], ndrange = length (indices_)) do j, vov_per_gpu
257+ @inbounds @inline f (iterator[indices_[j]], vov_per_gpu)
258+ end
259+ end
260+
261+ # Synchronize each device
262+ for i in 1 : n_gpus
263+ CUDA. device! (i - 1 )
264+ KernelAbstractions. synchronize (backend)
265+ end
266+
267+ # Select first device again
268+ CUDA. device! (0 )
269+ end
270+
271+ struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
272+ vov_unified:: VOV
273+ vov_per_gpu:: V
274+ end
275+
276+ function MultiGPUVectorOfVectors (vov_unified, vov_per_gpu)
277+ MultiGPUVectorOfVectors {eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)} (vov_unified, vov_per_gpu)
278+ end
279+
280+ # Adapt.@adapt_structure(MultiGPUVectorOfVectors)
281+
282+ function Adapt. adapt_structure (to:: CUDA.KernelAdaptor , vov:: MultiGPUVectorOfVectors )
283+ return Adapt. adapt (to, vov. vov_unified)
284+ end
285+
286+ function Adapt. adapt_structure (to:: CUDAMultiGPUBackend , vov:: DynamicVectorOfVectors{T} ) where {T}
287+ max_inner_length, max_outer_length = size (vov. backend)
288+
289+ n_gpus = length (CUDA. devices ())
290+ vov_per_gpu = [DynamicVectorOfVectors {T} (; max_outer_length, max_inner_length) for _ in 1 : n_gpus]
291+
292+ vov_unified = DynamicVectorOfVectors (Adapt. adapt (to, vov. backend), Adapt. adapt (to, vov. length_), Adapt. adapt (to, vov. lengths))
293+ vov_per_gpu_ = ntuple (i -> Adapt. adapt (CuArray, vov_per_gpu[i]), n_gpus)
294+ for vov__ in vov_per_gpu_
295+ vov__. length_[] = vov. length_[]
296+ end
297+
298+ return MultiGPUVectorOfVectors {T, typeof(vov_unified), typeof(vov_per_gpu_)} (vov_unified, vov_per_gpu_)
299+ end
300+
301+ const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<: Any , <: Any , <: FullGridCellList{<:MultiGPUVectorOfVectors} }
302+
303+ KernelAbstractions. @kernel function kernel3 (vov, vov_unified, previous_lengths, :: Val{gpu} , :: Val{islast} ) where {gpu, islast}
304+ cell = KernelAbstractions. @index (Global)
305+
306+ length = @inbounds vov. lengths[cell]
307+ if length > 0 || islast
308+ offset = 0
309+ for length_ in previous_lengths
310+ offset += @inbounds length_[cell]
311+ end
312+
313+ for i in 1 : length
314+ @inbounds vov_unified. backend[offset + i, cell] = vov. backend[i, cell]
315+ end
316+
317+ if islast
318+ @inbounds vov_unified. lengths[cell] = offset + length
319+ end
320+ end
321+ end
322+
323+ function PointNeighbors. initialize_grid! (neighborhood_search:: MultiGPUNeighborhoodSearch , y:: AbstractMatrix )
324+ (; cell_list) = neighborhood_search
325+ (; cells) = cell_list
326+ (; vov_unified, vov_per_gpu) = cells
327+
328+ for vov_ in vov_per_gpu
329+ vov_. lengths .= 0
330+ end
331+
332+ if neighborhood_search. search_radius < eps ()
333+ # Cannot initialize with zero search radius.
334+ # This is used in TrixiParticles when a neighborhood search is not used.
335+ return neighborhood_search
336+ end
337+
338+ # Fill cell lists per GPU
339+ parallel_foreach2 (axes (y, 2 ), vov_per_gpu) do point, vov
340+ # Get cell index of the point's cell
341+ point_coords = extract_svector (y, Val (ndims (neighborhood_search)), point)
342+ cell = cell_coords (point_coords, neighborhood_search)
343+
344+ # Add point to corresponding cell
345+ @boundscheck PointNeighbors. check_cell_bounds (cell_list, cell)
346+
347+ # The atomic version of `pushat!` uses atomics to avoid race conditions when
348+ # `pushat!` is used in a parallel loop.
349+ @inbounds PointNeighbors. pushat_atomic! (vov, cell_index (cell_list, cell), point)
350+ end
351+
352+ n_gpus = length (CUDA. devices ())
353+ backend = CUDABackend ()
354+
355+ # Synchronize each device
356+ for i in 1 : n_gpus
357+ CUDA. device! (i - 1 )
358+ KernelAbstractions. synchronize (backend)
359+ end
360+
361+ # Launch kernel on each device
362+ for gpu in 1 : n_gpus
363+ # Select the correct device
364+ CUDA. device! (gpu - 1 )
365+
366+ previous_lengths = ntuple (gpu -> vov_per_gpu[gpu]. lengths, gpu - 1 )
367+
368+ # Call the generic kernel, which only calls a function with the global GPU index
369+ kernel3 (backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val (gpu), Val (gpu == n_gpus), ndrange = length (vov_per_gpu[gpu]))
370+ end
371+
372+ # Synchronize each device
373+ for i in 1 : n_gpus
374+ CUDA. device! (i - 1 )
375+ KernelAbstractions. synchronize (backend)
376+ end
377+
378+ # Select first device again
379+ CUDA. device! (0 )
380+
381+ # # Merge cell lists
382+ # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
383+ # offset = offsets[gpu][cell]
384+
385+ # points = vov_per_gpu[gpu][cell]
386+ # for i in eachindex(points)
387+ # vov_unified.backend[offset + i, cell] = points[i]
388+ # end
389+ # end
390+
391+ return neighborhood_search
392+ end
393+
394+ function PointNeighbors. update! (neighborhood_search:: MultiGPUNeighborhoodSearch ,
395+ x:: AbstractMatrix , y:: AbstractMatrix ;
396+ points_moving = (true , true ), parallelization_backend = x)
397+ # The coordinates of the first set of points are irrelevant for this NHS.
398+ # Only update when the second set is moving.
399+ points_moving[2 ] || return neighborhood_search
400+
401+ PointNeighbors. initialize_grid! (neighborhood_search, y)
402+ end
403+
206404end # module
0 commit comments