Skip to content

Commit c32bf84

Browse files
committed
Make aligned loads nicer
1 parent 24dcc46 commit c32bf84

2 files changed

Lines changed: 92 additions & 31 deletions

File tree

src/general/gpu.jl

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,64 @@ end
3535
function allocate(backend, ELTYPE, size)
3636
return Array{ELTYPE, length(size)}(undef, size)
3737
end
38+
39+
# This function checks if we can use aligned `SVector` or `SMatrix` loads for the given
40+
# array `A` and the size of the vector we want to load `n`.
41+
#
42+
# Note that it is not checked if the size `n` is legal (a power of 2).
43+
# If the size `n` is not legal, the aligned loads will fall back to regular loads.
44+
# This is because a regular load is expected for a problem that doesn't support aligned
45+
# loads (e.g. 3D matrices of 3x3), whereas an illegal alignment is unexpected and should
46+
# throw an error instead of silently (and non-deterministically) falling back to slower
47+
# regular loads.
48+
function can_use_aligned_load(A, n)
49+
# Check if the beginning of `A` is aligned to the size of the vector we want to load.
50+
is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0
51+
# Check if the stride of `A` in the last dimension is equal to `n`,
52+
# which means that there is no padding between the vectors we want to load.
53+
has_stride = stride(A, ndims(A)) == n
54+
55+
return is_aligned && has_stride
56+
end
57+
58+
# For N = 2 and N = 4, use the aligned vector loads.
59+
@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix,
60+
val_n::Union{Val{2}, Val{4}}, i)
61+
return _extract_svector_aligned(A, val_n, i)
62+
end
63+
64+
# For other N, fall back to the regular `extract_svector`.
65+
@propagate_inbounds function extract_svector_aligned(A, val_n, i)
66+
return extract_svector(A, val_n, i)
67+
end
68+
69+
# This function only works for N = 2 and N = 4, because it requires a stride of 2^n.
70+
@inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N}
71+
@boundscheck checkbounds(A, 1:N, i)
72+
73+
# This function assumes alignment of the data, which means that the columns of `A`
74+
# have exactly the size `N` and no padding between them.
75+
@boundscheck @assert stride(A, 2) == N
76+
77+
vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1))
78+
79+
return SVector{N}(Tuple(vec))
80+
end
81+
82+
# For general N, fall back to the regular `extract_smatrix`.
83+
@propagate_inbounds function extract_matrix_aligned(A, val_n, i)
84+
return extract_smatrix(A, val_n, i)
85+
end
86+
87+
# This function only works for 2x2 matrices, because it requires a stride of 2^n.
88+
@inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T}
89+
@boundscheck checkbounds(A, 2, 2, i)
90+
91+
# This function assumes alignment of the data, which means that the first two of `A`
92+
# have exactly the size `2` and no padding between them.
93+
@boundscheck @assert stride(A, 3) == 4
94+
95+
vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1))
96+
97+
return SMatrix{2, 2}(Tuple(vec))
98+
end

src/schemes/fluid/weakly_compressible_sph/rhs.jl

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ function interact!(dv, v_particle_system, u_particle_system,
2525
compact_support_ = compact_support(particle_system, neighbor_system)
2626
almostzero = sqrt(eps(compact_support_^2))
2727

28-
use_simd_load_system = Val(use_simd_load(v_particle_system, particle_system))
29-
use_simd_load_neighbor = Val(use_simd_load(v_neighbor_system, neighbor_system))
28+
use_aligned_load_system = Val(use_aligned_vrho_load(v_particle_system, particle_system))
29+
use_aligned_load_neighbor = Val(use_aligned_vrho_load(v_neighbor_system,
30+
neighbor_system))
3031

3132
@threaded semi for particle in each_integrated_particle(particle_system)
3233
# We are looping over the particles of `particle_system`, so it is guaranteed
@@ -38,7 +39,7 @@ function interact!(dv, v_particle_system, u_particle_system,
3839
# which gives a significant speedup on GPUs.
3940
(v_a,
4041
rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
41-
use_simd_load_system, particle)
42+
use_aligned_load_system, particle)
4243

4344
# Accumulate the RHS contributions over all neighbors before writing to `dv`,
4445
# to reduce the number of memory writes.
@@ -64,7 +65,7 @@ function interact!(dv, v_particle_system, u_particle_system,
6465
m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
6566
(v_b,
6667
rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
67-
use_simd_load_neighbor, neighbor)
68+
use_aligned_load_neighbor, neighbor)
6869

6970
# The following call is equivalent to
7071
# `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
@@ -153,48 +154,47 @@ end
153154
# Optimized version for WCSPH with `ContinuityDensity` in 3D,
154155
# which combines the velocity and density load into one wide load.
155156
# This is significantly faster on GPUs than the 4 individual loads of `extract_svector`.
156-
# WARNING: this requires that the pointer of `v` is aligned to the size
157-
# of `SIMD.Vec{4, eltype(v)}`, which is checked by `use_simd_load`.
158-
@inline function velocity_and_density(v, system, ::Val{true}, particle)
159-
# Since `v` is stored as a 4 x N matrix, this aligned load extracts one column
160-
# of `v` corresponding to `particle`.
161-
# Note that this doesn't work for 2D because it requires a stride of 2^n.
162-
vrho_particle = SIMD.vloada(SIMD.Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1))
157+
# WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))`,
158+
# which is checked by `use_aligned_vrho_load`.
159+
# Only call this function after checking `use_aligned_vrho_load` to avoid
160+
# segmentation faults from illegal accesses.
161+
@propagate_inbounds function velocity_and_density(v, system, ::Val{true}, particle)
162+
vrho_particle = extract_svector_aligned(v, Val(4), particle)
163163

164164
# The columns of `v` are ordered as (v_x, v_y, v_z, rho)
165-
v1, v2, v3, rho = Tuple(vrho_particle)
166-
v_particle = SVector(v1, v2, v3)
165+
v..., rho = Tuple(vrho_particle)
166+
v_particle = SVector(v)
167167

168168
return v_particle, rho
169169
end
170170

171-
# By default, don't use SIMD loads
172-
use_simd_load(v, system) = false
171+
# By default, don't use aligned loads
172+
use_aligned_vrho_load(v, system) = false
173173

174-
function use_simd_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3})
175-
use_simd_load(v, system, system.density_calculator)
174+
function use_aligned_vrho_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3})
175+
use_aligned_vrho_load(v, system, system.density_calculator)
176176
end
177177

178-
use_simd_load(v, system, density_calculator) = false
178+
use_aligned_vrho_load(v, system, density_calculator) = false
179179

180-
# Only use SIMD loads when all of these conditions are satisfied:
180+
# Only use aligned loads when all of these conditions are satisfied:
181181
# - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4.
182-
# - We are on a GPU, where the SIMD load gives a significant speedup.
183-
# - The velocity array is aligned for SIMD loads, which requires that the pointer of `v`
184-
# is aligned to the size of `SIMD.Vec{4, eltype(v)}`.
185-
# Otherwise, we cannot use `vloada`, which is an *aligned* SIMD load.
182+
# - We are on a GPU, where the aligned load gives a significant speedup.
183+
# - The velocity array is aligned for aligned loads, which requires that the pointer of `v`
184+
# is aligned to `4 * sizeof(eltype(v))`
185+
# Otherwise, we cannot use `vloada`, which is an *aligned* load.
186186
# The unaligned version `vload` does not produce wide load instructions on GPUs.
187-
# In this last case, we don't fall back to the non-SIMD version and throw an error instead.
188-
function use_simd_load(v::AbstractGPUArray, system, ::ContinuityDensity)
189-
aligned = is_aligned(pointer(v), SIMD.Vec{4, eltype(v)})
190-
191-
if !aligned
187+
function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity)
188+
if !can_use_aligned_load(v, 4)
189+
# If aligned loads are possible for the problem, but not allowed due to alignment,
190+
# we don't fall back to the non-SIMD version and throw an error instead.
191+
# This is likely a configuration error (see the error message below), and notifying
192+
# the user is better than silently falling back to slower loads and thus
193+
# non-deterministic performance.
192194
error("on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " *
193195
"first systems in the semidiscretization to ensure that their integration " *
194196
"arrays are aligned for SIMD loads.")
195197
end
196198

197-
return aligned
199+
return true
198200
end
199-
200-
is_aligned(ptr, ::Type{SIMD.Vec{N, T}}) where {N, T} = UInt(ptr) % (N * sizeof(T)) == 0

0 commit comments

Comments
 (0)