Skip to content

Commit 234dbb5

Browse files
efaulhabervchuravy
andcommitted
Add workaround for system atomics
Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
1 parent fc7180f commit 234dbb5

1 file changed

Lines changed: 111 additions & 76 deletions

File tree

ext/PointNeighborsCUDAExt.jl

Lines changed: 111 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@ module PointNeighborsCUDAExt
22

33
using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors,
44
GridNeighborhoodSearch, FullGridCellList, extract_svector,
5-
cell_coords, cell_index, check_cell_bounds, pushat_atomic!
5+
cell_coords, cell_index, @threaded, generic_kernel
66
using CUDA: CUDA, CuArray, CUDABackend, cu
77
using UnsafeAtomics: UnsafeAtomics
88
using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index
99
using PointNeighbors.Adapt: Adapt
10+
using Base: @propagate_inbounds
1011

1112
const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory}
1213

@@ -24,19 +25,6 @@ end
2425
end
2526

2627
@inline function PointNeighbors.parallel_foreach(f::T, iterator, x::CUDAMultiGPUBackend) where {T}
27-
parallel_foreach2((i, gpu) -> @inline f(i), iterator, x)
28-
end
29-
30-
@inline function parallel_foreach2(f, iterator, x::UnifiedCuArray)
31-
parallel_foreach2(f, iterator, CUDAMultiGPUBackend())
32-
end
33-
34-
KernelAbstractions.@kernel function generic_kernel2(f, gpu)
35-
i = KernelAbstractions.@index(Global)
36-
@inline f(i, gpu)
37-
end
38-
39-
@inline function parallel_foreach2(f, iterator, x::CUDAMultiGPUBackend)
4028
# On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
4129
# and index with `iterator[eachindex(iterator)[i]]`.
4230
# Note that this only works with vector-like iterators that support arbitrary indexing.
@@ -64,8 +52,8 @@ end
6452
CUDA.device!(i - 1)
6553

6654
# Call the generic kernel, which only calls a function with the global GPU index
67-
generic_kernel2(backend)(i, ndrange = length(indices_)) do j, gpu
68-
@inbounds @inline f(iterator[indices_[j]], gpu)
55+
generic_kernel(backend)(ndrange = length(indices_)) do j
56+
@inbounds @inline f(iterator[indices_[j]])
6957
end
7058
end
7159

@@ -79,93 +67,140 @@ end
7967
CUDA.device!(0)
8068
end
8169

82-
struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
83-
vov::VOV
84-
vov_per_gpu::V
85-
end
86-
87-
function MultiGPUVectorOfVectors(vov, vov_per_gpu)
88-
MultiGPUVectorOfVectors{eltype(vov.backend), typeof(vov), typeof(vov_per_gpu)}(vov, vov_per_gpu)
70+
function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32)
71+
CUDA.LLVM.Interop.@asmcall(
72+
"atom.sys.global.add.u32 \$0, [\$1], \$2;",
73+
"=r,l,r,~{memory}",
74+
true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32},
75+
ptr, val
76+
)
8977
end
9078

91-
Adapt.@adapt_structure(MultiGPUVectorOfVectors)
79+
@inline function pushat_atomic_system!(vov, i, value)
80+
(; backend, lengths) = vov
9281

93-
function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
94-
max_inner_length, max_outer_length = size(vov.backend)
82+
@boundscheck checkbounds(vov, i)
9583

96-
n_gpus = length(CUDA.devices())
97-
vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]
84+
# Increment the column length with an atomic add to avoid race conditions.
85+
# Store the new value since it might be changed immediately afterwards by another
86+
# thread.
87+
# new_length = Atomix.@atomic lengths[i] += 1
88+
new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1
9889

99-
vov_ = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
100-
vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus)
101-
for vov__ in vov_per_gpu_
102-
vov__.length_[] = vov.length_[]
103-
end
90+
# We can write here without race conditions, since the atomic add guarantees
91+
# that `new_length` is different for each thread.
92+
backend[new_length, i] = value
10493

105-
return MultiGPUVectorOfVectors{T, typeof(vov_), typeof(vov_per_gpu_)}(vov_, vov_per_gpu_)
94+
return vov
10695
end
10796

108-
const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
97+
const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}}
10998

11099
function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
111100
(; cell_list) = neighborhood_search
112-
(; cells) = cell_list
113-
(; vov, vov_per_gpu) = cells
114101

115-
for vov_ in vov_per_gpu
116-
vov_.lengths .= 0
117-
end
102+
cell_list.cells.lengths .= 0
118103

119104
if neighborhood_search.search_radius < eps()
120105
# Cannot initialize with zero search radius.
121106
# This is used in TrixiParticles when a neighborhood search is not used.
122107
return neighborhood_search
123108
end
124109

125-
# Fill cell lists per GPU
126-
# TODO split range into chunks for each GPU
127-
parallel_foreach2(axes(y, 2), y) do point, gpu
110+
# Faster on a single GPU
111+
@threaded CUDABackend() for point in axes(y, 2)
128112
# Get cell index of the point's cell
129-
point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
130-
cell = cell_coords(point_coords, neighborhood_search)
113+
point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
114+
cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)
131115

132116
# Add point to corresponding cell
133-
@boundscheck check_cell_bounds(cell_list, cell)
134-
135-
# The atomic version of `pushat!` uses atomics to avoid race conditions when
136-
# `pushat!` is used in a parallel loop.
137-
@inbounds pushat_atomic!(vov_per_gpu[gpu], cell_index(cell_list, cell), point)
138-
end
139-
140-
lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, length(vov_per_gpu))
141-
CUDA.synchronize()
142-
offsets_ = cu(cumsum(lengths), unified = true)
143-
CUDA.synchronize()
144-
vov.lengths .= offsets_[end]
145-
CUDA.synchronize()
146-
offsets = offsets_ .- lengths
147-
148-
# Merge cell lists
149-
parallel_foreach2(axes(vov, 2), y) do cell, gpu
150-
offset = offsets[gpu][cell]
151-
152-
points = vov_per_gpu[gpu][cell]
153-
for i in eachindex(points)
154-
vov.backend[offset + i, cell] = points[i]
155-
end
117+
pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point)
156118
end
157119

158120
return neighborhood_search
159121
end
160122

161-
function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
162-
x::AbstractMatrix, y::AbstractMatrix;
163-
points_moving = (true, true), parallelization_backend = x)
164-
# The coordinates of the first set of points are irrelevant for this NHS.
165-
# Only update when the second set is moving.
166-
points_moving[2] || return neighborhood_search
123+
# This might be slightly faster, but causes problems with synchronization in the interaction
124+
# kernel because we are carrying around device memory.
125+
# struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}}
126+
# vov_unified :: VU
127+
# vov_device :: VD
128+
# end
167129

168-
PointNeighbors.initialize_grid!(neighborhood_search, y)
169-
end
130+
# function MultiGPUVectorOfVectors(vov_unified, vov_device)
131+
# MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_device)}(vov_unified, vov_device)
132+
# end
133+
134+
# # Adapt.@adapt_structure(MultiGPUVectorOfVectors)
135+
136+
# function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors)
137+
# @info "" to
138+
# return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device))
139+
# end
140+
141+
# @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i)
142+
# return getindex(vov.vov_unified, i)
143+
# end
144+
145+
# function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
146+
# max_inner_length, max_outer_length = size(vov.backend)
147+
148+
# # Make sure the vector of vectors in device memory lives on the first GPU
149+
# CUDA.device!(0)
150+
151+
# vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
152+
# vov_device = Adapt.adapt(CuArray, vov)
153+
154+
# return MultiGPUVectorOfVectors(vov_unified, vov_device)
155+
# end
156+
157+
# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
158+
159+
# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
160+
# (; cell_list) = neighborhood_search
161+
# (; cells) = cell_list
162+
# (; vov_unified, vov_device) = cells
163+
164+
# if neighborhood_search.search_radius < eps()
165+
# # Cannot initialize with zero search radius.
166+
# # This is used in TrixiParticles when a neighborhood search is not used.
167+
# return neighborhood_search
168+
# end
169+
170+
# vov_device.lengths .= 0
171+
172+
# CUDA.device!(0)
173+
174+
# # Fill vector of vectors in device memory (on a single GPU)
175+
# @threaded CUDABackend() for point in axes(y, 2)
176+
# # Get cell index of the point's cell
177+
# point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
178+
# cell = cell_coords(point_coords, neighborhood_search)
179+
180+
# # Add point to corresponding cell
181+
# @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)
182+
183+
# # The atomic version of `pushat!` uses atomics to avoid race conditions when
184+
# # `pushat!` is used in a parallel loop.
185+
# @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
186+
# end
187+
188+
# # Copy vector of vectors to unified memory
189+
# vov_unified.backend .= vov_device.backend
190+
# vov_unified.lengths .= vov_device.lengths
191+
# vov_unified.length_[] = vov_device.length_[]
192+
193+
# return neighborhood_search
194+
# end
195+
196+
# function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
197+
# x::AbstractMatrix, y::AbstractMatrix;
198+
# points_moving = (true, true), parallelization_backend = x)
199+
# # The coordinates of the first set of points are irrelevant for this NHS.
200+
# # Only update when the second set is moving.
201+
# points_moving[2] || return neighborhood_search
202+
203+
# PointNeighbors.initialize_grid!(neighborhood_search, y)
204+
# end
170205

171206
end # module

0 commit comments

Comments
 (0)