Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand Down Expand Up @@ -63,6 +64,7 @@ Polyester = "0.7.10"
ReadVTK = "0.2"
RecipesBase = "1"
Reexport = "1"
SIMD = "3.7.2"
SciMLBase = "2"
StaticArrays = "1"
Statistics = "1"
Expand Down
1 change: 1 addition & 0 deletions src/TrixiParticles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ using Random: seed!
using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
add_tstop!
using SIMD: SIMD
@reexport using StaticArrays: SVector
using StaticArrays: @SMatrix, SMatrix, setindex
using Statistics: Statistics
Expand Down
42 changes: 33 additions & 9 deletions src/general/abstract_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,42 @@ end
end

# Return `A[:, :, i]` as an `SMatrix`.
# Return `A[:, :, i]` as an `SMatrix` of size `ndims(system)` x `ndims(system)`.
# Dispatches on `Val(ndims(system))` so the matrix size is known at compile time.
# NOTE(review): the rendered diff duplicated the pre-change signature here, leaving an
# unterminated `function`; only the new version is kept.
@propagate_inbounds function extract_smatrix(A, system, i)
    return extract_smatrix(A, Val(ndims(system)), i)
end

# Return `A[:, :, i]` as an `N` x `N` `SMatrix` using plain linear indexing.
# NOTE(review): the rendered diff left the pre-change signature and bounds check above
# the new method, breaking syntax; only the new version is kept.
@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N}
    @boundscheck checkbounds(A, N, N, i)
    # This function assumes that the first two dimensions of `A` have exactly the size `N`,
    # so that matrix `i` occupies the `N^2` consecutive linear indices starting
    # at `(i - 1) * N^2 + 1`.
    @boundscheck if stride(A, 3) != N^2
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N")
    end

    # Extract the matrix elements as a tuple in column-major order,
    # and construct an `SMatrix` from it.
    return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2)))
end

# Specialization for 2x2 matrices that loads all 4 entries with one SIMD `vload`.
# NOTE(review): the rendered diff interleaved the deleted `ntuple`/`NDIMS` body of the
# old method inside this one; it is removed here to restore valid syntax.
@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{2}, i) where {T}
    @boundscheck checkbounds(A, 2, 2, i)
    # This function assumes that the first two dimensions of `A` have exactly the size `2`,
    # so that matrix `i` occupies the 4 consecutive linear indices starting
    # at `4 * (i - 1) + 1`.
    @boundscheck if stride(A, 3) != 4
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N")
    end

    # This is the same as
    # `SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2)))`,
    # but it's slightly faster on CPUs in some cases. As opposed to the GPU-optimized
    # version in `extract_smatrix_aligned`, we use `vload` and not `vloada` here, which
    # does not require alignment and works if N^2 is not a power of 2.
    # This is faster in the TLSPH RHS in 2D, but slower in 3D for some reason.
    # See the benchmarks in https://github.com/trixi-framework/TrixiParticles.jl/pull/1147.
    vec = SIMD.vload(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1))
    return SMatrix{2, 2}(Tuple(vec))
end

# Specifically get the current coordinates of a particle for all system types.
Expand Down
70 changes: 70 additions & 0 deletions src/general/gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,73 @@ end
function allocate(backend, ELTYPE, size)
return Array{ELTYPE, length(size)}(undef, size)
end

# This function checks if we can use aligned `SVector` or `SMatrix` loads for the given
# array `A` and the size of the vector we want to load `n`.
#
# Returns `true` only when all of the following hold:
#   - `n` is a positive power of 2 (a requirement for aligned loads),
#   - the data pointer of `A` is aligned to `n * sizeof(eltype(A))` bytes,
#   - the stride of `A` in the last dimension equals `n` (no padding between vectors).
function can_use_aligned_load(A, n)
    # Check that `n` is a power of 2, which is a requirement for aligned loads.
    # Return early here: besides being a hard requirement, this also guards the
    # modulo below against a `DivideError` when `n == 0`.
    if n <= 0 || (n & (n - 1)) != 0
        return false
    end

    # Check if the beginning of `A` is aligned to the size of the vector we want to load.
    is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0

    # Check if the stride of `A` in the last dimension is equal to `n`,
    # which means that there is no padding between the vectors we want to load.
    has_stride = stride(A, ndims(A)) == n

    return is_aligned && has_stride
end

# Entry point taking a system: dispatch on the system's dimensionality,
# lifted to the type domain via `Val`.
@propagate_inbounds function extract_svector_aligned(A, system, particle)
    dims = Val(ndims(system))
    return extract_svector_aligned(A, dims, particle)
end

# N = 2 and N = 4 are the sizes that support aligned vector loads,
# so only those dispatch to the SIMD-based implementation.
@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix,
                                                     val_n::Union{Val{2}, Val{4}},
                                                     i)
    return _extract_svector_aligned(A, val_n, i)
end

# Any other N has no aligned fast path; delegate to the regular `extract_svector`.
@propagate_inbounds function extract_svector_aligned(A, val_n::Val, i)
    return extract_svector(A, val_n, i)
end

# This function only works for N = 2 and N = 4, because it requires a stride of 2^n.
# WARNING: `vloada` is an *aligned* load. Only call this after verifying alignment
# with `can_use_aligned_load`, otherwise illegal accesses are possible.
@inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N}
    @boundscheck checkbounds(A, 1:N, i)
    @boundscheck can_use_aligned_load(A, N)
    # This function assumes a stride of N in the last dimension.
    @boundscheck if stride(A, 2) != N
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_svector_aligned only works for 2D arrays where the stride in the last dimension is equal to N")
    end

    # Load the N entries of column `i` with a single aligned SIMD load.
    simd_vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1))

    return SVector{N}(Tuple(simd_vec))
end

# Entry point taking a system: dispatch on the system's dimensionality,
# lifted to the type domain via `Val`.
@propagate_inbounds function extract_smatrix_aligned(A, system, particle)
    dims = Val(ndims(system))
    return extract_smatrix_aligned(A, dims, particle)
end

# General N has no aligned fast path; delegate to the regular `extract_smatrix`.
@propagate_inbounds function extract_smatrix_aligned(A, val_n::Val, i)
    return extract_smatrix(A, val_n, i)
end

# This function only works for 2x2 matrices, because it requires a stride of 2^n.
# WARNING: `vloada` is an *aligned* load. Only call this after verifying alignment
# with `can_use_aligned_load`, otherwise illegal accesses are possible.
@inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T}
    @boundscheck checkbounds(A, 2, 2, i)
    # A bare `@boundscheck can_use_aligned_load(A, 4)` would discard the returned `Bool`
    # and never fail, so explicitly throw when the aligned load is not safe.
    @boundscheck if !can_use_aligned_load(A, 4)
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_smatrix_aligned requires an array that is aligned for vector loads of size 4")
    end
    # This function assumes that the first two dimensions of `A` have exactly the size `N`.
    @boundscheck if stride(A, 3) != 4
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_smatrix_aligned only works for 3D arrays where the first two dimensions each have size N")
    end

    # Load all 4 entries of the 2x2 matrix with a single aligned SIMD load.
    vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1))

    return SMatrix{2, 2}(Tuple(vec))
end
41 changes: 31 additions & 10 deletions src/general/semidiscretization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,27 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...;

sizes_u = [u_nvariables(system) * n_integrated_particles(system)
for system in systems]
ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i])
for i in eachindex(sizes_u))
sizes_v = [v_nvariables(system) * n_integrated_particles(system)
for system in systems]
ranges_v = Tuple((sum(sizes_v[1:(i - 1)]) + 1):sum(sizes_v[1:i])
for i in eachindex(sizes_v))

start_u = 1
start_v = 1
ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems))
ranges_v_vec = Vector{UnitRange{Int}}(undef, length(systems))
for i in eachindex(systems)
ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1)
ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1)

# Align ranges to 64 bytes by adding padding if necessary.
# This ensures that aligned loads can be used on the wrapped integration arrays,
# which can significantly improve performance on GPUs.
block_size = div(64, sizeof(eltype(systems[i])))
start_u += div(sizes_u[i], block_size, RoundUp) * block_size
start_v += div(sizes_v[i], block_size, RoundUp) * block_size
end

ranges_u = Tuple(ranges_u_vec)
ranges_v = Tuple(ranges_v_vec)

# Create a n x n matrix of n neighborhood searches for each of the n systems.
# We will need one neighborhood search for each pair of systems.
Expand Down Expand Up @@ -245,13 +260,13 @@ function semidiscretize(semi, tspan; reset_threads=true)
Polyester.reset_threads!()
end

sizes_u = (u_nvariables(system) * n_integrated_particles(system) for system in systems)
sizes_v = (v_nvariables(system) * n_integrated_particles(system) for system in systems)
size_u_ode = semi.ranges_u[end].stop
size_v_ode = semi.ranges_v[end].stop

# Use either the specified backend, e.g., `CUDABackend` or `MetalBackend` or
# use CPU vectors for all CPU backends.
u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, sum(sizes_u))
v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v))
u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, size_u_ode)
v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, size_v_ode)

if semi.parallelization_backend isa KernelAbstractions.GPU
u0_ode = u0_ode_
Expand All @@ -266,6 +281,11 @@ function semidiscretize(semi, tspan; reset_threads=true)
parallelization_backend=semi.parallelization_backend)
end

# Initialize the arrays with zeros to initialize the padding values for alignment.
# The values that are actually used will be overwritten in `write_u0!` and `write_v0!`.
u0_ode .= 0
v0_ode .= 0

# Set initial condition
foreach_system_wrapped(semi, v0_ode, u0_ode) do system, v0_system, u0_system
write_u0!(u0_system, system)
Expand Down Expand Up @@ -356,8 +376,9 @@ end
range = ranges_v[system_indices(system, semi)]

@boundscheck begin
if length(range) != v_nvariables(system) * n_integrated_particles(system)
throw(DimensionMismatch("`v_ode` range length $range_length does not match " *
expected = v_nvariables(system) * n_integrated_particles(system)
if length(range) != expected
throw(DimensionMismatch("`v_ode` range length $(length(range)) does not match " *
"expected number of entries $expected"))
end
end
Expand Down
72 changes: 68 additions & 4 deletions src/schemes/fluid/weakly_compressible_sph/rhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,23 @@ function interact!(dv, v_particle_system, u_particle_system,
compact_support_ = compact_support(particle_system, neighbor_system)
almostzero = sqrt(eps(compact_support_^2))

use_aligned_load_system = Val(use_aligned_vrho_load(v_particle_system, particle_system))
use_aligned_load_neighbor = Val(use_aligned_vrho_load(v_neighbor_system,
neighbor_system))

@threaded semi for particle in each_integrated_particle(particle_system)
# We are looping over the particles of `particle_system`, so it is guaranteed
# that `particle` is in bounds of `particle_system`.
m_a = @inbounds hydrodynamic_mass(particle_system, particle)
p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)

v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
# In 3D, this function can combine velocity and density load into one wide load,
# which gives a significant speedup on GPUs.
# Note that we can only safely use `@inbounds` after checking alignment
# with `use_aligned_vrho_load` before the `@threaded` loop.
(v_a,
rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
use_aligned_load_system, particle)

# Accumulate the RHS contributions over all neighbors before writing to `dv`,
# to reduce the number of memory writes.
Expand All @@ -56,8 +65,11 @@ function interact!(dv, v_particle_system, u_particle_system,

# `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
# Note that we can only safely use `@inbounds` after checking alignment
# with `use_aligned_vrho_load` before the `@threaded` loop.
(v_b,
rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
use_aligned_load_neighbor, neighbor)

# The following call is equivalent to
# `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
Expand Down Expand Up @@ -134,3 +146,55 @@ end
neighbor, p_a)
return p_a
end

# Default method, which simply calls `current_velocity` and `current_density` separately.
@propagate_inbounds function velocity_and_density(v, system, ::Val{false}, particle)
    return (current_velocity(v, system, particle),
            current_density(v, system, particle))
end

# Optimized version for WCSPH with `ContinuityDensity` in 3D,
# which combines the velocity and density load into one wide load.
# This is significantly faster on GPUs than the 4 individual loads of `extract_svector`.
# WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))`,
# which is checked by `use_aligned_vrho_load`.
# Only call this function after checking `use_aligned_vrho_load` to avoid
# segmentation faults from illegal accesses.
@propagate_inbounds function velocity_and_density(v, system, ::Val{true}, particle)
    # The columns of `v` are ordered as (v_x, v_y, v_z, rho),
    # so one wide load fetches all four values at once.
    vrho = Tuple(extract_svector_aligned(v, Val(4), particle))

    velocity = SVector(vrho[1], vrho[2], vrho[3])
    density = vrho[4]

    return velocity, density
end

# By default, don't use aligned loads
function use_aligned_vrho_load(v, system)
    return false
end

# On GPUs with a 3D WCSPH system, the decision depends on the density calculator.
function use_aligned_vrho_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3})
    return use_aligned_vrho_load(v, system, system.density_calculator)
end

use_aligned_vrho_load(v, system, density_calculator) = false

# Only use aligned loads when all of these conditions are satisfied:
# - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4.
# - We are on a GPU, where the aligned load gives a significant speedup.
# - The velocity array is aligned for aligned loads, which requires that the pointer of `v`
#   is aligned to `4 * sizeof(eltype(v))`
# Otherwise, we cannot use `vloada`, which is an *aligned* load.
# The unaligned version `vload` does not produce wide load instructions on GPUs.
function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity)
    if can_use_aligned_load(v, 4)
        return true
    end

    # Aligned loads should always be possible on GPUs because the slices of `v_ode`
    # are aligned to 64 bytes in `Semidiscretization` and arrays on GPUs are always
    # aligned to full pages.
    error("illegal alignment of `v` integration array. Please report this issue.")
end
Loading
Loading