Make aligned loads nicer

efaulhaber · efaulhaber · commit c32bf8465dd3 · 2026-04-16T17:59:15.000+02:00
diff --git a/src/general/gpu.jl b/src/general/gpu.jl
@@ -35,3 +35,64 @@ end
 function allocate(backend, ELTYPE, size)
     return Array{ELTYPE, length(size)}(undef, size)
 end
+
+# This function checks if we can use aligned `SVector` or `SMatrix` loads for the given
+# array `A` and the size of the vector we want to load `n`.
+#
+# Note that it is not checked if the size `n` is legal (a power of 2).
+# If the size `n` is not legal, the aligned loads will fall back to regular loads.
+# This is because a regular load is expected for a problem that doesn't support aligned
+# loads (e.g. 3D matrices of 3x3), whereas an illegal alignment is unexpected and should
+# throw an error instead of silently (and non-deterministically) falling back to slower
+# regular loads.
+function can_use_aligned_load(A, n)
+    # Check if the beginning of `A` is aligned to the size of the vector we want to load.
+    is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0
+    # Check if the stride of `A` in the last dimension is equal to `n`,
+    # which means that there is no padding between the vectors we want to load.
+    has_stride = stride(A, ndims(A)) == n
+
+    return is_aligned && has_stride
+end
+
+# For N = 2 and N = 4, use the aligned vector loads.
+@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix,
+                                                     val_n::Union{Val{2}, Val{4}}, i)
+    return _extract_svector_aligned(A, val_n, i)
+end
+
+# For other N, fall back to the regular `extract_svector`.
+@propagate_inbounds function extract_svector_aligned(A, val_n, i)
+    return extract_svector(A, val_n, i)
+end
+
+# This function only works for N = 2 and N = 4, because it requires a stride of 2^n.
+@inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N}
+    @boundscheck checkbounds(A, 1:N, i)
+
+    # This function assumes alignment of the data, which means that the columns of `A`
+    # have exactly the size `N` and no padding between them.
+    @boundscheck @assert stride(A, 2) == N
+
+    vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1))
+
+    return SVector{N}(Tuple(vec))
+end
+
+# For general N, fall back to the regular `extract_smatrix`.
+@propagate_inbounds function extract_matrix_aligned(A, val_n, i)
+    return extract_smatrix(A, val_n, i)
+end
+
+# This function only works for 2x2 matrices, because it requires a stride of 2^n.
+@inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T}
+    @boundscheck checkbounds(A, 2, 2, i)
+
+    # This function assumes alignment of the data, which means that the first two of `A`
+    # have exactly the size `2` and no padding between them.
+    @boundscheck @assert stride(A, 3) == 4
+
+    vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1))
+
+    return SMatrix{2, 2}(Tuple(vec))
+end
diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
@@ -25,8 +25,9 @@ function interact!(dv, v_particle_system, u_particle_system,
     compact_support_ = compact_support(particle_system, neighbor_system)
     almostzero = sqrt(eps(compact_support_^2))
 
-    use_simd_load_system = Val(use_simd_load(v_particle_system, particle_system))
-    use_simd_load_neighbor = Val(use_simd_load(v_neighbor_system, neighbor_system))
+    use_aligned_load_system = Val(use_aligned_vrho_load(v_particle_system, particle_system))
+    use_aligned_load_neighbor = Val(use_aligned_vrho_load(v_neighbor_system,
+                                                          neighbor_system))
 
     @threaded semi for particle in each_integrated_particle(particle_system)
         # We are looping over the particles of `particle_system`, so it is guaranteed
@@ -38,7 +39,7 @@ function interact!(dv, v_particle_system, u_particle_system,
         # which gives a significant speedup on GPUs.
         (v_a,
          rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
-                                                 use_simd_load_system, particle)
+                                                 use_aligned_load_system, particle)
 
         # Accumulate the RHS contributions over all neighbors before writing to `dv`,
         # to reduce the number of memory writes.
@@ -64,7 +65,7 @@ function interact!(dv, v_particle_system, u_particle_system,
             m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
             (v_b,
              rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
-                                                     use_simd_load_neighbor, neighbor)
+                                                     use_aligned_load_neighbor, neighbor)
 
             # The following call is equivalent to
             #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
@@ -153,48 +154,47 @@ end
 # Optimized version for WCSPH with `ContinuityDensity` in 3D,
 # which combines the velocity and density load into one wide load.
 # This is significantly faster on GPUs than the 4 individual loads of `extract_svector`.
-# WARNING: this requires that the pointer of `v` is aligned to the size
-#          of `SIMD.Vec{4, eltype(v)}`, which is checked by `use_simd_load`.
-@inline function velocity_and_density(v, system, ::Val{true}, particle)
-    # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column
-    # of `v` corresponding to `particle`.
-    # Note that this doesn't work for 2D because it requires a stride of 2^n.
-    vrho_particle = SIMD.vloada(SIMD.Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1))
+# WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))`,
+#          which is checked by `use_aligned_vrho_load`.
+#          Only call this function after checking `use_aligned_vrho_load` to avoid
+#          segmentation faults from illegal accesses.
+@propagate_inbounds function velocity_and_density(v, system, ::Val{true}, particle)
+    vrho_particle = extract_svector_aligned(v, Val(4), particle)
 
     # The columns of `v` are ordered as (v_x, v_y, v_z, rho)
-    v1, v2, v3, rho = Tuple(vrho_particle)
-    v_particle = SVector(v1, v2, v3)
+    v..., rho = Tuple(vrho_particle)
+    v_particle = SVector(v)
 
     return v_particle, rho
 end
 
-# By default, don't use SIMD loads
-use_simd_load(v, system) = false
+# By default, don't use aligned loads
+use_aligned_vrho_load(v, system) = false
 
-function use_simd_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3})
-    use_simd_load(v, system, system.density_calculator)
+function use_aligned_vrho_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3})
+    use_aligned_vrho_load(v, system, system.density_calculator)
 end
 
-use_simd_load(v, system, density_calculator) = false
+use_aligned_vrho_load(v, system, density_calculator) = false
 
-# Only use SIMD loads when all of these conditions are satisfied:
+# Only use aligned loads when all of these conditions are satisfied:
 # - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4.
-# - We are on a GPU, where the SIMD load gives a significant speedup.
-# - The velocity array is aligned for SIMD loads, which requires that the pointer of `v`
-#   is aligned to the size of `SIMD.Vec{4, eltype(v)}`.
-#   Otherwise, we cannot use `vloada`, which is an *aligned* SIMD load.
+# - We are on a GPU, where the aligned load gives a significant speedup.
+# - The velocity array is aligned for aligned loads, which requires that the pointer of `v`
+#   is aligned to `4 * sizeof(eltype(v))`
+#   Otherwise, we cannot use `vloada`, which is an *aligned* load.
 #   The unaligned version `vload` does not produce wide load instructions on GPUs.
-#   In this last case, we don't fall back to the non-SIMD version and throw an error instead.
-function use_simd_load(v::AbstractGPUArray, system, ::ContinuityDensity)
-    aligned = is_aligned(pointer(v), SIMD.Vec{4, eltype(v)})
-
-    if !aligned
+function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity)
+    if !can_use_aligned_load(v, 4)
+        # If aligned loads are possible for the problem, but not allowed due to alignment,
+        # we don't fall back to the non-SIMD version and throw an error instead.
+        # This is likely a configuration error (see the error message below), and notifying
+        # the user is better than silently falling back to slower loads and thus
+        # non-deterministic performance.
         error("on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " *
               "first systems in the semidiscretization to ensure that their integration " *
               "arrays are aligned for SIMD loads.")
     end
 
-    return aligned
+    return true
 end
-
-is_aligned(ptr, ::Type{SIMD.Vec{N, T}}) where {N, T} = UInt(ptr) % (N * sizeof(T)) == 0