From 0441ed6fb2fe5c506e5f311e3253d5bd848c2ca2 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:00:57 +0200 Subject: [PATCH 01/22] Add SIMD.jl dependency This reverts commit cdd58e2c3ac6fef072b39c5d2252119eb99bf4e3. --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 9f17f48c0a..fe5dd24971 100644 --- a/Project.toml +++ b/Project.toml @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -63,6 +64,7 @@ Polyester = "0.7.10" ReadVTK = "0.2" RecipesBase = "1" Reexport = "1" +SIMD = "3.7.2" SciMLBase = "2" StaticArrays = "1" Statistics = "1" From 13d6dfdec49d87b4e79b6efd019f0fd6210a74e0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:01:39 +0200 Subject: [PATCH 02/22] Remove vectorized loads for WCSPH velocity and density This reverts commit 1243fc6cc8b7e7bb7f9e211ce03fdbe88eebfa35. --- src/TrixiParticles.jl | 1 + .../fluid/weakly_compressible_sph/rhs.jl | 72 +++++++++++++++++-- test/examples/gpu.jl | 62 ++++++++++++++++ 3 files changed, 131 insertions(+), 4 deletions(-) diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index 215a2b3bf1..45b94faca3 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -25,6 +25,7 @@ using Random: seed! using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!, get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!, add_tstop! 
+using SIMD: SIMD @reexport using StaticArrays: SVector using StaticArrays: @SMatrix, SMatrix, setindex using Statistics: Statistics diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index 836063538b..312f6912f4 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -25,14 +25,20 @@ function interact!(dv, v_particle_system, u_particle_system, compact_support_ = compact_support(particle_system, neighbor_system) almostzero = sqrt(eps(compact_support_^2)) + use_simd_load_system = Val(use_simd_load(v_particle_system, particle_system)) + use_simd_load_neighbor = Val(use_simd_load(v_neighbor_system, neighbor_system)) + @threaded semi for particle in each_integrated_particle(particle_system) # We are looping over the particles of `particle_system`, so it is guaranteed # that `particle` is in bounds of `particle_system`. m_a = @inbounds hydrodynamic_mass(particle_system, particle) p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) - v_a = @inbounds current_velocity(v_particle_system, particle_system, particle) - rho_a = @inbounds current_density(v_particle_system, particle_system, particle) + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + use_simd_load_system, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv`, # to reduce the number of memory writes. 
@@ -56,8 +62,9 @@ function interact!(dv, v_particle_system, u_particle_system, # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) - v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) - rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + (v_b, + rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + use_simd_load_neighbor, neighbor) # The following call is equivalent to # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` @@ -134,3 +141,60 @@ end neighbor, p_a) return p_a end + +# Default method, which simply calls `current_velocity` and `current_density` separately. +@propagate_inbounds function velocity_and_density(v, system, ::Val{false}, particle) + v_particle = current_velocity(v, system, particle) + rho_particle = current_density(v, system, particle) + + return v_particle, rho_particle +end + +# Optimized version for WCSPH with `ContinuityDensity` in 3D, +# which combines the velocity and density load into one wide load. +# This is significantly faster on GPUs than the 4 individual loads of `extract_svector`. +# WARNING: this requires that the pointer of `v` is aligned to the size +# of `SIMD.Vec{4, eltype(v)}`, which is checked by `use_simd_load`. +@inline function velocity_and_density(v, system, ::Val{true}, particle) + # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column + # of `v` corresponding to `particle`. + # Note that this doesn't work for 2D because it requires a stride of 2^n. 
+ vrho_particle = SIMD.vloada(SIMD.Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1)) + + # The columns of `v` are ordered as (v_x, v_y, v_z, rho) + v1, v2, v3, rho = Tuple(vrho_particle) + v_particle = SVector(v1, v2, v3) + + return v_particle, rho +end + +# By default, don't use SIMD loads +use_simd_load(v, system) = false + +function use_simd_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3}) + use_simd_load(v, system, system.density_calculator) +end + +use_simd_load(v, system, density_calculator) = false + +# Only use SIMD loads when all of these conditions are satisfied: +# - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4. +# - We are on a GPU, where the SIMD load gives a significant speedup. +# - The velocity array is aligned for SIMD loads, which requires that the pointer of `v` +# is aligned to the size of `SIMD.Vec{4, eltype(v)}`. +# Otherwise, we cannot use `vloada`, which is an *aligned* SIMD load. +# The unaligned version `vload` does not produce wide load instructions on GPUs. +# In this last case, we don't fall back to the non-SIMD version and throw an error instead. 
+function use_simd_load(v::AbstractGPUArray, system, ::ContinuityDensity) + aligned = is_aligned(pointer(v), SIMD.Vec{4, eltype(v)}) + + if !aligned + error("on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " * + "first systems in the semidiscretization to ensure that their integration " * + "arrays are aligned for SIMD loads.") + end + + return aligned +end + +is_aligned(ptr, ::Type{SIMD.Vec{N, T}}) where {N, T} = UInt(ptr) % (N * sizeof(T)) == 0 diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 3117a67ca5..2cd7ade8a4 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -77,6 +77,68 @@ end end end +@testset verbose=true "velocity_and_density $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "use_simd_load $T" for T in types + # Aligned array + x = zeros(T, 16) + + # Test different "systems" and density calculators + @test !TrixiParticles.use_simd_load(x, nothing) + @test !TrixiParticles.use_simd_load(x, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_simd_load(x, nothing, SummationDensity()) + + # No SIMD load on the CPU + @test !TrixiParticles.use_simd_load(x, nothing, ContinuityDensity()) + + y = Adapt.adapt(parallelization_backend, x) + + # Test different "systems" and density calculators + @test !TrixiParticles.use_simd_load(y, nothing) + @test !TrixiParticles.use_simd_load(y, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_simd_load(y, nothing, SummationDensity()) + + # Use SIMD load on the GPU with 4-aligned arrays + @test TrixiParticles.use_simd_load(y, nothing, ContinuityDensity()) + @test TrixiParticles.use_simd_load(view(y, 5:16), nothing, ContinuityDensity()) + + # Unaligned array on the GPU should throw an error + @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 2:16), nothing, + ContinuityDensity()) + @test_throws "on GPUs in 3D" 
TrixiParticles.use_simd_load(view(y, 3:16), nothing, + ContinuityDensity()) + @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 4:16), nothing, + ContinuityDensity()) + end + + @testset "velocity_and_density $T" for T in types + # Aligned array + x = rand(T, 4, 4) + y = Adapt.adapt(parallelization_backend, x) + + # Dummy system that will only be used for `ndims`. + struct MockSystem end + Base.ndims(::MockSystem) = 3 + system = MockSystem() + @inline TrixiParticles.current_density(v, ::MockSystem, i) = v[4, i] + + # Test that the SIMD version is consistent with the non-SIMD version. + # We have 4 particles (with 4 values per particles). + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.velocity_and_density(y, system, Val(false), i) == + TrixiParticles.velocity_and_density(y, system, Val(true), i) + end + @test all(result) + end +end + @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin From 5b92abbb7d7224c23091946cb63547f2a31b20df Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:09:03 +0200 Subject: [PATCH 03/22] Add SIMD load `extract_smatrix_aligned` --- src/general/abstract_system.jl | 28 ++++++++++++++++++++++++++ test/examples/gpu.jl | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 79b52a81df..0a5f61170a 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -82,6 +82,34 @@ end Val(NDIMS^2))) end +# Optimized version for 2D, which uses SIMD.jl to combine the 4 loads of the 2x2 matrix +# into a single wide load. 
This is significantly faster on GPUs than 4 individual loads. +# WARNING: +# 1. This only works if the matrix elements are stored contiguously in memory. +# The 4 elements of the 2x2 matrix for a particle must immediately follow the +# 4 elements of the previous particle's matrix, with no padding in between. +# 2. The pointer of `A` must be aligned to the size of `Vec{4, eltype(A)}`. +# This is guaranteed if `A` is allocated by Julia and has the correct size, +# but may not be true if `A` is a view or subarray. +@propagate_inbounds function extract_smatrix_aligned(A, system, particle) + return extract_smatrix_aligned(A, Val(ndims(system)), particle) +end + +@inline function extract_smatrix_aligned(A, ::Val{2}, particle) + @boundscheck checkbounds(A, 2, 2, particle) + + # Note that this doesn't work in 3D because it requires a stride of 2^n. + x = SIMD.vloada(SIMD.Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1)) + + return SMatrix{2, 2}(Tuple(x)) +end + +# Fall back to the generic version when not in 2D. +@propagate_inbounds function extract_smatrix_aligned(A, ::Val{NDIMS}, + particle) where {NDIMS} + return extract_smatrix(A, Val(NDIMS), particle) +end + # Specifically get the current coordinates of a particle for all system types. @propagate_inbounds function current_coords(u, system, particle) return extract_svector(current_coordinates(u, system), system, particle) diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 2cd7ade8a4..91cf036079 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -139,6 +139,42 @@ end end end +@testset verbose=true "extract_smatrix_aligned $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "$T" for T in types + @testset verbose=true "$(N)D" for N in 2:3 + @testset "CPU" begin + A = rand(T, N, N, 4) + + # Test that the SIMD version is consistent with the non-SIMD version. 
+ for i in 1:4 + @test TrixiParticles.extract_smatrix_aligned(A, Val(N), i) == + TrixiParticles.extract_smatrix(A, Val(N), i) + end + end + + @testset "GPU" begin + A = Adapt.adapt(parallelization_backend, rand(T, N, N, 4)) + val = Val(N) + + # Test that the SIMD version is consistent with the non-SIMD version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_smatrix_aligned(A, val, i) == + TrixiParticles.extract_smatrix(A, val, i) + end + @test all(result) + end + end + end +end + @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin From 7df05abf670b886cf656fe0743ac5eb5a45504b3 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:59:15 +0200 Subject: [PATCH 04/22] Make aligned loads nicer --- src/general/gpu.jl | 61 ++++++++++++++++++ .../fluid/weakly_compressible_sph/rhs.jl | 62 +++++++++---------- 2 files changed, 92 insertions(+), 31 deletions(-) diff --git a/src/general/gpu.jl b/src/general/gpu.jl index f0d145bf1e..d6ddf62501 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -35,3 +35,64 @@ end function allocate(backend, ELTYPE, size) return Array{ELTYPE, length(size)}(undef, size) end + +# This function checks if we can use aligned `SVector` or `SMatrix` loads for the given +# array `A` and the size of the vector we want to load `n`. +# +# Note that it is not checked if the size `n` is legal (a power of 2). +# If the size `n` is not legal, the aligned loads will fall back to regular loads. +# This is because a regular load is expected for a problem that doesn't support aligned +# loads (e.g. 
3D matrices of 3x3), whereas an illegal alignment is unexpected and should
+# throw an error instead of silently (and non-deterministically) falling back to slower
+# regular loads.
+function can_use_aligned_load(A, n)
+    # Check if the beginning of `A` is aligned to the size of the vector we want to load.
+    is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0
+    # Check if the stride of `A` in the last dimension is equal to `n`,
+    # which means that there is no padding between the vectors we want to load.
+    has_stride = stride(A, ndims(A)) == n
+
+    return is_aligned && has_stride
+end
+
+# For N = 2 and N = 4, use the aligned vector loads.
+@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix,
+                                                     val_n::Union{Val{2}, Val{4}}, i)
+    return _extract_svector_aligned(A, val_n, i)
+end
+
+# For other N, fall back to the regular `extract_svector`.
+@propagate_inbounds function extract_svector_aligned(A, val_n, i)
+    return extract_svector(A, val_n, i)
+end
+
+# This function only works for N = 2 and N = 4, because it requires a stride of 2^n.
+@inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N}
+    @boundscheck checkbounds(A, 1:N, i)
+
+    # This function assumes alignment of the data, which means that the columns of `A`
+    # have exactly the size `N` and no padding between them.
+    @boundscheck @assert stride(A, 2) == N
+
+    vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1))
+
+    return SVector{N}(Tuple(vec))
+end
+
+# For general N, fall back to the regular `extract_smatrix`.
+@propagate_inbounds function extract_smatrix_aligned(A, val_n, i)
+    return extract_smatrix(A, val_n, i)
+end
+
+# This function only works for 2x2 matrices, because it requires a stride of 2^n.
+@inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} + @boundscheck checkbounds(A, 2, 2, i) + + # This function assumes alignment of the data, which means that the first two of `A` + # have exactly the size `2` and no padding between them. + @boundscheck @assert stride(A, 3) == 4 + + vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) + + return SMatrix{2, 2}(Tuple(vec)) +end diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index 312f6912f4..2c9c5487e5 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -25,8 +25,9 @@ function interact!(dv, v_particle_system, u_particle_system, compact_support_ = compact_support(particle_system, neighbor_system) almostzero = sqrt(eps(compact_support_^2)) - use_simd_load_system = Val(use_simd_load(v_particle_system, particle_system)) - use_simd_load_neighbor = Val(use_simd_load(v_neighbor_system, neighbor_system)) + use_aligned_load_system = Val(use_aligned_vrho_load(v_particle_system, particle_system)) + use_aligned_load_neighbor = Val(use_aligned_vrho_load(v_neighbor_system, + neighbor_system)) @threaded semi for particle in each_integrated_particle(particle_system) # We are looping over the particles of `particle_system`, so it is guaranteed @@ -38,7 +39,7 @@ function interact!(dv, v_particle_system, u_particle_system, # which gives a significant speedup on GPUs. (v_a, rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, - use_simd_load_system, particle) + use_aligned_load_system, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv`, # to reduce the number of memory writes. 
@@ -64,7 +65,7 @@ function interact!(dv, v_particle_system, u_particle_system, m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) (v_b, rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, - use_simd_load_neighbor, neighbor) + use_aligned_load_neighbor, neighbor) # The following call is equivalent to # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` @@ -153,48 +154,47 @@ end # Optimized version for WCSPH with `ContinuityDensity` in 3D, # which combines the velocity and density load into one wide load. # This is significantly faster on GPUs than the 4 individual loads of `extract_svector`. -# WARNING: this requires that the pointer of `v` is aligned to the size -# of `SIMD.Vec{4, eltype(v)}`, which is checked by `use_simd_load`. -@inline function velocity_and_density(v, system, ::Val{true}, particle) - # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column - # of `v` corresponding to `particle`. - # Note that this doesn't work for 2D because it requires a stride of 2^n. - vrho_particle = SIMD.vloada(SIMD.Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1)) +# WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))`, +# which is checked by `use_aligned_vrho_load`. +# Only call this function after checking `use_aligned_vrho_load` to avoid +# segmentation faults from illegal accesses. 
+@propagate_inbounds function velocity_and_density(v, system, ::Val{true}, particle) + vrho_particle = extract_svector_aligned(v, Val(4), particle) # The columns of `v` are ordered as (v_x, v_y, v_z, rho) - v1, v2, v3, rho = Tuple(vrho_particle) - v_particle = SVector(v1, v2, v3) + v..., rho = Tuple(vrho_particle) + v_particle = SVector(v) return v_particle, rho end -# By default, don't use SIMD loads -use_simd_load(v, system) = false +# By default, don't use aligned loads +use_aligned_vrho_load(v, system) = false -function use_simd_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3}) - use_simd_load(v, system, system.density_calculator) +function use_aligned_vrho_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3}) + use_aligned_vrho_load(v, system, system.density_calculator) end -use_simd_load(v, system, density_calculator) = false +use_aligned_vrho_load(v, system, density_calculator) = false -# Only use SIMD loads when all of these conditions are satisfied: +# Only use aligned loads when all of these conditions are satisfied: # - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4. -# - We are on a GPU, where the SIMD load gives a significant speedup. -# - The velocity array is aligned for SIMD loads, which requires that the pointer of `v` -# is aligned to the size of `SIMD.Vec{4, eltype(v)}`. -# Otherwise, we cannot use `vloada`, which is an *aligned* SIMD load. +# - We are on a GPU, where the aligned load gives a significant speedup. +# - The velocity array is aligned for aligned loads, which requires that the pointer of `v` +# is aligned to `4 * sizeof(eltype(v))` +# Otherwise, we cannot use `vloada`, which is an *aligned* load. # The unaligned version `vload` does not produce wide load instructions on GPUs. -# In this last case, we don't fall back to the non-SIMD version and throw an error instead. 
-function use_simd_load(v::AbstractGPUArray, system, ::ContinuityDensity) - aligned = is_aligned(pointer(v), SIMD.Vec{4, eltype(v)}) - - if !aligned +function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity) + if !can_use_aligned_load(v, 4) + # If aligned loads are possible for the problem, but not allowed due to alignment, + # we don't fall back to the non-SIMD version and throw an error instead. + # This is likely a configuration error (see the error message below), and notifying + # the user is better than silently falling back to slower loads and thus + # non-deterministic performance. error("on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " * "first systems in the semidiscretization to ensure that their integration " * "arrays are aligned for SIMD loads.") end - return aligned + return true end - -is_aligned(ptr, ::Type{SIMD.Vec{N, T}}) where {N, T} = UInt(ptr) % (N * sizeof(T)) == 0 From 48350ea59eb0607270e353d79560e91e15f8870b Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 17 Apr 2026 14:44:39 +0200 Subject: [PATCH 05/22] Automatically add padding to make aligned loads possible. 
--- src/general/semidiscretization.jl | 15 +++++++++++++-- src/schemes/fluid/weakly_compressible_sph/rhs.jl | 12 ++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 6f8f59197e..cb05b4e0b7 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -94,10 +94,21 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_u = [u_nvariables(system) * n_integrated_particles(system) for system in systems] - ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i]) - for i in eachindex(sizes_u)) sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] + + # Align sizes to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the integration arrays, which can + # significantly improve performance on GPUs. Performance benefits on CPUs remain + # to be investigated. + for i in eachindex(systems) + block_size = div(64, sizeof(eltype(systems[i]))) + sizes_u[i] = div(sizes_u[i], block_size, RoundUp) * block_size + sizes_v[i] = div(sizes_v[i], block_size, RoundUp) * block_size + end + + ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i]) + for i in eachindex(sizes_u)) ranges_v = Tuple((sum(sizes_v[1:(i - 1)]) + 1):sum(sizes_v[1:i]) for i in eachindex(sizes_v)) diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index 2c9c5487e5..a44adaef9b 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -186,14 +186,10 @@ use_aligned_vrho_load(v, system, density_calculator) = false # The unaligned version `vload` does not produce wide load instructions on GPUs. 
function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity) if !can_use_aligned_load(v, 4) - # If aligned loads are possible for the problem, but not allowed due to alignment, - # we don't fall back to the non-SIMD version and throw an error instead. - # This is likely a configuration error (see the error message below), and notifying - # the user is better than silently falling back to slower loads and thus - # non-deterministic performance. - error("on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " * - "first systems in the semidiscretization to ensure that their integration " * - "arrays are aligned for SIMD loads.") + # Aligned loads should always be possible on GPUs because the slices of `v_ode` + # are aligned to 64 bytes in `Semidiscretization` and arrays on GPUs are always + # aligned to full pages. + error("illegal alignment of `v` integration array. Please report this issue.") end return true From 65b309a76e617e413ca903d31a1258f349e95e42 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 12:36:52 +0200 Subject: [PATCH 06/22] Fix tests --- src/general/semidiscretization.jl | 41 ++++++++++++++++++------------ test/general/semidiscretization.jl | 14 ++++++---- 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index cb05b4e0b7..f9a35562f5 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -97,20 +97,25 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] - # Align sizes to 64 bytes by adding padding if necessary. - # This ensures that aligned loads can be used on the integration arrays, which can - # significantly improve performance on GPUs. Performance benefits on CPUs remain - # to be investigated. 
+ start_u = 1 + start_v = 1 + ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems)) + ranges_v_vec = Vector{UnitRange{Int}}(undef, length(systems)) for i in eachindex(systems) + ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1) + ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1) + + # Align sizes to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the integration arrays, which can + # significantly improve performance on GPUs. Performance benefits on CPUs remain + # to be investigated. block_size = div(64, sizeof(eltype(systems[i]))) - sizes_u[i] = div(sizes_u[i], block_size, RoundUp) * block_size - sizes_v[i] = div(sizes_v[i], block_size, RoundUp) * block_size + start_u += div(sizes_u[i], block_size, RoundUp) * block_size + start_v += div(sizes_v[i], block_size, RoundUp) * block_size end - ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i]) - for i in eachindex(sizes_u)) - ranges_v = Tuple((sum(sizes_v[1:(i - 1)]) + 1):sum(sizes_v[1:i]) - for i in eachindex(sizes_v)) + ranges_u = Tuple(ranges_u_vec) + ranges_v = Tuple(ranges_v_vec) # Create a n x n matrix of n neighborhood searches for each of the n systems. # We will need one neighborhood search for each pair of systems. @@ -256,13 +261,13 @@ function semidiscretize(semi, tspan; reset_threads=true) Polyester.reset_threads!() end - sizes_u = (u_nvariables(system) * n_integrated_particles(system) for system in systems) - sizes_v = (v_nvariables(system) * n_integrated_particles(system) for system in systems) + size_u_ode = semi.ranges_u[end].stop + size_v_ode = semi.ranges_v[end].stop # Use either the specified backend, e.g., `CUDABackend` or `MetalBackend` or # use CPU vectors for all CPU backends. 
- u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, sum(sizes_u)) - v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v)) + u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, size_u_ode) + v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, size_v_ode) if semi.parallelization_backend isa KernelAbstractions.GPU u0_ode = u0_ode_ @@ -277,6 +282,9 @@ function semidiscretize(semi, tspan; reset_threads=true) parallelization_backend=semi.parallelization_backend) end + u0_ode .= 0 + v0_ode .= 0 + # Set initial condition foreach_system_wrapped(semi, v0_ode, u0_ode) do system, v0_system, u0_system write_u0!(u0_system, system) @@ -367,8 +375,9 @@ end range = ranges_v[system_indices(system, semi)] @boundscheck begin - if length(range) != v_nvariables(system) * n_integrated_particles(system) - throw(DimensionMismatch("`v_ode` range length $range_length does not match " * + expected = v_nvariables(system) * n_integrated_particles(system) + if length(range) != expected + throw(DimensionMismatch("`v_ode` range length $(length(range)) does not match " * "expected number of entries $expected")) end end diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 3c2dcbeb58..b816dd26da 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -8,6 +8,7 @@ system2 = System2() Base.eltype(::System1) = Float64 + Base.eltype(::System2) = Float64 TrixiParticles.coordinates_eltype(::System1) = Float32 TrixiParticles.u_nvariables(::System1) = 3 TrixiParticles.u_nvariables(::System2) = 4 @@ -24,9 +25,12 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - # Verification - @test semi.ranges_u == (1:6, 7:18) - @test semi.ranges_v == (1:6, 7:12) + # Verification: These are the ranges that we would expect based on system sizes: + # semi.ranges_u == (1:6, 7:18) + # semi.ranges_v == (1:6, 7:12) + # Due to alignment to 64 bytes, the 
ranges are adjusted to be: + @test semi.ranges_u == (1:6, 9:20) + @test semi.ranges_v == (1:6, 9:14) nhs = [TrixiParticles.TrivialNeighborhoodSearch{3}(search_radius=0.2, eachpoint=1:2) @@ -152,8 +156,8 @@ semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - dv_ode = zeros(3 * 2 + 2 * 3) - du_ode = zeros(3 * 2 + 4 * 3) + dv_ode = zeros(semi.ranges_v[end].stop) + du_ode = zeros(semi.ranges_u[end].stop) u_ode = zero(du_ode) v1 = [1.0 2.0 From b5fcd699a6fa02423fdb69d93253443ff2f2bc97 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 12:46:02 +0200 Subject: [PATCH 07/22] Update comment --- src/general/semidiscretization.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index f9a35562f5..73ab75c256 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -107,8 +107,7 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; # Align sizes to 64 bytes by adding padding if necessary. # This ensures that aligned loads can be used on the integration arrays, which can - # significantly improve performance on GPUs. Performance benefits on CPUs remain - # to be investigated. + # significantly improve performance on GPUs. 
block_size = div(64, sizeof(eltype(systems[i]))) start_u += div(sizes_u[i], block_size, RoundUp) * block_size start_v += div(sizes_v[i], block_size, RoundUp) * block_size From a4d618caeed15259008400e0507f890da594b05e Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 13:07:57 +0200 Subject: [PATCH 08/22] Only align integration arrays on the GPU --- src/general/semidiscretization.jl | 15 ++++++++++---- test/examples/gpu.jl | 33 ++++++++++++++++++++++++++++++ test/general/semidiscretization.jl | 9 +++----- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 73ab75c256..5e10b093ed 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -97,6 +97,17 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] + ELTYPE = eltype(first(systems)) + if parallelization_backend isa KernelAbstractions.GPU + # Align ranges to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the wrapped integration arrays, + # which can significantly improve performance on GPUs. + block_size = div(64, sizeof(ELTYPE)) + else + # There is no performance benefit to aligning ranges for CPU backends. + block_size = 1 + end + start_u = 1 start_v = 1 ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems)) @@ -105,10 +116,6 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1) ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1) - # Align sizes to 64 bytes by adding padding if necessary. - # This ensures that aligned loads can be used on the integration arrays, which can - # significantly improve performance on GPUs. 
- block_size = div(64, sizeof(eltype(systems[i]))) start_u += div(sizes_u[i], block_size, RoundUp) * block_size start_v += div(sizes_v[i], block_size, RoundUp) * block_size end diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 91cf036079..f1b8b92ed9 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -175,6 +175,39 @@ end end end +@trixi_testset "Semidiscretization vector alignment" begin + # Mock systems + struct System1 <: TrixiParticles.AbstractSystem{3} end + struct System2 <: TrixiParticles.AbstractSystem{3} end + + system1 = System1() + system2 = System2() + + Base.eltype(::System1) = Float64 + Base.eltype(::System2) = Float64 + TrixiParticles.u_nvariables(::System1) = 3 + TrixiParticles.u_nvariables(::System2) = 4 + TrixiParticles.v_nvariables(::System1) = 3 + TrixiParticles.v_nvariables(::System2) = 2 + TrixiParticles.nparticles(::System1) = 2 + TrixiParticles.nparticles(::System2) = 3 + + TrixiParticles.compact_support(::System1, neighbor) = 0.2 + TrixiParticles.compact_support(::System2, neighbor) = 0.2 + + @testset verbose=true "Constructor" begin + semi = Semidiscretization(system1, system2, neighborhood_search=nothing, + parallelization_backend=Main.parallelization_backend) + + # These are the ranges that we would expect on the CPU: + # semi.ranges_u == (1:6, 7:18) + # semi.ranges_v == (1:6, 7:12) + # Due to alignment to 64 bytes, the ranges are adjusted to be: + @test semi.ranges_u == (1:6, 9:20) + @test semi.ranges_v == (1:6, 9:14) + end +end + @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index b816dd26da..52aeb3718c 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -25,12 +25,9 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) 
- # Verification: These are the ranges that we would expect based on system sizes: - # semi.ranges_u == (1:6, 7:18) - # semi.ranges_v == (1:6, 7:12) - # Due to alignment to 64 bytes, the ranges are adjusted to be: - @test semi.ranges_u == (1:6, 9:20) - @test semi.ranges_v == (1:6, 9:14) + # Verification + semi.ranges_u == (1:6, 7:18) + semi.ranges_v == (1:6, 7:12) nhs = [TrixiParticles.TrivialNeighborhoodSearch{3}(search_radius=0.2, eachpoint=1:2) From 81849b39ab5fd85a5493506aed654af7c774e4e0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 13:49:59 +0200 Subject: [PATCH 09/22] Use aligned loads for TLSPH 2D matrices --- src/general/abstract_system.jl | 41 +++------------ src/general/gpu.jl | 20 +++---- .../fluid/weakly_compressible_sph/rhs.jl | 4 ++ .../structure/total_lagrangian_sph/rhs.jl | 52 +++++++++++++++++-- 4 files changed, 69 insertions(+), 48 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 0a5f61170a..23135da628 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -72,42 +72,13 @@ end return extract_smatrix(A, Val(ndims(system)), particle) end -@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS} - @boundscheck checkbounds(A, NDIMS, NDIMS, particle) +@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N} + @boundscheck checkbounds(A, N, N, i) + # This function assumes that the first two dimensions of `A` have exactly the size `N`. + @boundscheck @assert stride(A, 3) == N^2 - # Extract the matrix elements for this particle as a tuple to pass to SMatrix - return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1, - div(i - 1, NDIMS) + 1, - particle]), - Val(NDIMS^2))) -end - -# Optimized version for 2D, which uses SIMD.jl to combine the 4 loads of the 2x2 matrix -# into a single wide load. 
This is significantly faster on GPUs than 4 individual loads. -# WARNING: -# 1. This only works if the matrix elements are stored contiguously in memory. -# The 4 elements of the 2x2 matrix for a particle must immediately follow the -# 4 elements of the previous particle's matrix, with no padding in between. -# 2. The pointer of `A` must be aligned to the size of `Vec{4, eltype(A)}`. -# This is guaranteed if `A` is allocated by Julia and has the correct size, -# but may not be true if `A` is a view or subarray. -@propagate_inbounds function extract_smatrix_aligned(A, system, particle) - return extract_smatrix_aligned(A, Val(ndims(system)), particle) -end - -@inline function extract_smatrix_aligned(A, ::Val{2}, particle) - @boundscheck checkbounds(A, 2, 2, particle) - - # Note that this doesn't work in 3D because it requires a stride of 2^n. - x = SIMD.vloada(SIMD.Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1)) - - return SMatrix{2, 2}(Tuple(x)) -end - -# Fall back to the generic version when not in 2D. -@propagate_inbounds function extract_smatrix_aligned(A, ::Val{NDIMS}, - particle) where {NDIMS} - return extract_smatrix(A, Val(NDIMS), particle) + # Extract the matrix elements for this `i` as a tuple to pass to SMatrix + return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[N^2 * (i - 1) + j]), Val(N^2))) end # Specifically get the current coordinates of a particle for all system types. diff --git a/src/general/gpu.jl b/src/general/gpu.jl index d6ddf62501..5cf7082588 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -55,6 +55,10 @@ function can_use_aligned_load(A, n) return is_aligned && has_stride end +@propagate_inbounds function extract_svector_aligned(A, system::AbstractSystem, particle) + return extract_svector_aligned(A, Val(ndims(system)), particle) +end + # For N = 2 and N = 4, use the aligned vector loads. 
@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix, val_n::Union{Val{2}, Val{4}}, i) @@ -69,28 +73,26 @@ end # This function only works for N = 2 and N = 4, because it requires a stride of 2^n. @inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N} @boundscheck checkbounds(A, 1:N, i) - - # This function assumes alignment of the data, which means that the columns of `A` - # have exactly the size `N` and no padding between them. - @boundscheck @assert stride(A, 2) == N + @boundscheck can_use_aligned_load(A, N) vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1)) return SVector{N}(Tuple(vec)) end +@propagate_inbounds function extract_smatrix_aligned(A, system::AbstractSystem, particle) + return extract_smatrix_aligned(A, Val(ndims(system)), particle) +end + # For general N, fall back to the regular `extract_smatrix`. -@propagate_inbounds function extract_matrix_aligned(A, val_n, i) +@propagate_inbounds function extract_smatrix_aligned(A, val_n, i) return extract_smatrix(A, val_n, i) end # This function only works for 2x2 matrices, because it requires a stride of 2^n. @inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} @boundscheck checkbounds(A, 2, 2, i) - - # This function assumes alignment of the data, which means that the first two of `A` - # have exactly the size `2` and no padding between them. 
- @boundscheck @assert stride(A, 3) == 4 + @boundscheck can_use_aligned_load(A, 4) vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index a44adaef9b..c4d969eaef 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -37,6 +37,8 @@ function interact!(dv, v_particle_system, u_particle_system, # In 3D, this function can combine velocity and density load into one wide load, # which gives a significant speedup on GPUs. + # Note that we can only safely use `@inbounds` after checking alignment + # with `use_aligned_vrho_load` before the `@threaded` loop. (v_a, rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, use_aligned_load_system, particle) @@ -63,6 +65,8 @@ function interact!(dv, v_particle_system, u_particle_system, # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) + # Note that we can only safely use `@inbounds` after checking alignment + # with `use_aligned_vrho_load` before the `@threaded` loop. (v_b, rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, use_aligned_load_neighbor, neighbor) diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index fa000ef38f..5f964bd60d 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -31,15 +31,19 @@ end h = initial_smoothing_length(system) almostzero = sqrt(eps(h^2)) + # Check alignment of deformation gradient and `pk1_rho2` arrays before the `@threaded` + # loop to be able to use aligned loads safely inside the loop. 
+ use_aligned_matrix_load_ = Val(use_aligned_matrix_load(system)) + @threaded semi for particle in each_integrated_particle(system) # We are looping over the particles of `system`, so it is guaranteed # that `particle` is in bounds of `system`. m_a = @inbounds system.mass[particle] rho_a = @inbounds system.material_density[particle] # PK1 / rho^2 - pk1_rho2_a = @inbounds pk1_rho2(system, particle) + pk1_rho2_a = @inbounds pk1_rho2(system, use_aligned_matrix_load_, particle) current_coords_a = @inbounds current_coords(system, particle) - F_a = @inbounds deformation_gradient(system, particle) + F_a = @inbounds deformation_gradient(system, use_aligned_matrix_load_, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv` # to reduce the number of memory writes. @@ -64,11 +68,12 @@ end rho_b = @inbounds system.material_density[neighbor] m_b = @inbounds system.mass[neighbor] # PK1 / rho^2 - pk1_rho2_b = @inbounds pk1_rho2(system, neighbor) + pk1_rho2_b = @inbounds pk1_rho2(system, use_aligned_matrix_load_, neighbor) current_coords_b = @inbounds current_coords(system, neighbor) # The compiler is smart enough to optimize this away if no penalty force is used - F_b = @inbounds deformation_gradient(system, neighbor) + F_b = @inbounds deformation_gradient(system, use_aligned_matrix_load_, + neighbor) current_pos_diff_ = current_coords_a - current_coords_b # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference @@ -120,3 +125,42 @@ function interact!(dv, v_particle_system, u_particle_system, # TODO continuity equation? return dv end + +function use_aligned_matrix_load(system) + return use_aligned_matrix_load(system.deformation_grad, system.pk1_rho2) +end + +function use_aligned_matrix_load(deformation_grad::AbstractGPUArray, + pk1_rho2::AbstractGPUArray) + # Aligned loads should always be possible on GPUs because GPU arrays are always aligned + # to full pages, and these arrays are not slices of larger arrays. 
+ if !can_use_aligned_load(deformation_grad, 4) + error("illegal alignment of deformation gradient array. Please report this issue.") + end + if !can_use_aligned_load(pk1_rho2, 4) + error("illegal alignment of `pk1_rho2` array. Please report this issue.") + end + + return true +end + +# Don't use aligned vector loads on the CPU +use_aligned_matrix_load(deformation_grad, pk1_rho2) = false + +# Aligned vector load versions for deformation gradient and `pk1_rho2`. +# These are only used on GPUs, which is checked by `use_aligned_matrix_load`. +@propagate_inbounds function pk1_rho2(system, ::Val{true}, particle) + return extract_smatrix_aligned(system.pk1_rho2, system, particle) +end + +@propagate_inbounds function pk1_rho2(system, ::Val{false}, particle) + return pk1_rho2(system, particle) +end + +@propagate_inbounds function deformation_gradient(system, ::Val{true}, particle) + return extract_smatrix_aligned(system.deformation_grad, system, particle) +end + +@propagate_inbounds function deformation_gradient(system, ::Val{false}, particle) + return deformation_gradient(system, particle) +end From 78f39fe51f96ac4e9effd80f66e8cf0273167f78 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 14:11:37 +0200 Subject: [PATCH 10/22] Add and fix tests --- src/general/abstract_system.jl | 5 +- test/examples/gpu.jl | 94 +++++++++++++++++++++------------- test/general/util.jl | 20 ++++++++ 3 files changed, 81 insertions(+), 38 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 23135da628..dadd754d23 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -75,7 +75,10 @@ end @inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N} @boundscheck checkbounds(A, N, N, i) # This function assumes that the first two dimensions of `A` have exactly the size `N`. 
- @boundscheck @assert stride(A, 3) == N^2 + @boundscheck if stride(A, 3) != N^2 + error("extract_smatrix only works for 3D arrays where the first two dimensions " * + "have size N") + end # Extract the matrix elements for this `i` as a tuple to pass to SMatrix return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[N^2 * (i - 1) + j]), Val(N^2))) diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index f1b8b92ed9..621e483463 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -84,35 +84,35 @@ end types = [Float32] end - @testset verbose=true "use_simd_load $T" for T in types + @testset verbose=true "use_aligned_vrho_load $T" for T in types # Aligned array x = zeros(T, 16) # Test different "systems" and density calculators - @test !TrixiParticles.use_simd_load(x, nothing) - @test !TrixiParticles.use_simd_load(x, (; density_calculator=ContinuityDensity())) - @test !TrixiParticles.use_simd_load(x, nothing, SummationDensity()) + @test !TrixiParticles.use_aligned_vrho_load(x, nothing) + @test !TrixiParticles.use_aligned_vrho_load(x, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(x, nothing, SummationDensity()) - # No SIMD load on the CPU - @test !TrixiParticles.use_simd_load(x, nothing, ContinuityDensity()) + # No aligned load on the CPU + @test !TrixiParticles.use_aligned_vrho_load(x, nothing, ContinuityDensity()) y = Adapt.adapt(parallelization_backend, x) # Test different "systems" and density calculators - @test !TrixiParticles.use_simd_load(y, nothing) - @test !TrixiParticles.use_simd_load(y, (; density_calculator=ContinuityDensity())) - @test !TrixiParticles.use_simd_load(y, nothing, SummationDensity()) + @test !TrixiParticles.use_aligned_vrho_load(y, nothing) + @test !TrixiParticles.use_aligned_vrho_load(y, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(y, nothing, SummationDensity()) - # Use SIMD load on the GPU with 4-aligned arrays - @test 
TrixiParticles.use_simd_load(y, nothing, ContinuityDensity()) - @test TrixiParticles.use_simd_load(view(y, 5:16), nothing, ContinuityDensity()) + # Use aligned load on the GPU with 4-aligned arrays + @test TrixiParticles.use_aligned_vrho_load(y, nothing, ContinuityDensity()) + @test TrixiParticles.use_aligned_vrho_load(view(y, 5:16), nothing, ContinuityDensity()) # Unaligned array on the GPU should throw an error - @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 2:16), nothing, + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 2:16), nothing, ContinuityDensity()) - @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 3:16), nothing, + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 3:16), nothing, ContinuityDensity()) - @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 4:16), nothing, + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 4:16), nothing, ContinuityDensity()) end @@ -127,7 +127,9 @@ end system = MockSystem() @inline TrixiParticles.current_density(v, ::MockSystem, i) = v[4, i] - # Test that the SIMD version is consistent with the non-SIMD version. + @test TrixiParticles.use_aligned_vrho_load(y, system, ContinuityDensity()) + + # Test that the aligned version is consistent with the non-aligned version. # We have 4 particles (with 4 values per particles). # In order to test this on the GPU, we need to use a kernel with `@threaded`. 
result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) @@ -139,7 +141,7 @@ end end end -@testset verbose=true "extract_smatrix_aligned $TRIXIPARTICLES_TEST_" begin +@testset verbose=true "extract_svector_aligned $TRIXIPARTICLES_TEST_" begin if supports_double_precision types = [Float64, Float32] else @@ -147,30 +149,48 @@ end end @testset verbose=true "$T" for T in types - @testset verbose=true "$(N)D" for N in 2:3 - @testset "CPU" begin - A = rand(T, N, N, 4) - - # Test that the SIMD version is consistent with the non-SIMD version. - for i in 1:4 - @test TrixiParticles.extract_smatrix_aligned(A, Val(N), i) == - TrixiParticles.extract_smatrix(A, Val(N), i) - end + @testset verbose=true "$(N)D" for N in 1:6 + A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) + val = Val(N) + + @test TrixiParticles.can_use_aligned_load(A, N) + @test !TrixiParticles.can_use_aligned_load(view(A, 2:length(A)), N) + + # Test that the aligned version is consistent with the non-aligned version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_svector_aligned(A, val, i) == + TrixiParticles.extract_svector(A, val, i) end + @test all(result) + end + end +end - @testset "GPU" begin - A = Adapt.adapt(parallelization_backend, rand(T, N, N, 4)) - val = Val(N) +@testset verbose=true "extract_smatrix_aligned $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end - # Test that the SIMD version is consistent with the non-SIMD version. - # In order to test this on the GPU, we need to use a kernel with `@threaded`. 
- result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) - TrixiParticles.@threaded parallelization_backend for i in 1:4 - result[i] = TrixiParticles.extract_smatrix_aligned(A, val, i) == - TrixiParticles.extract_smatrix(A, val, i) - end - @test all(result) + @testset verbose=true "$T" for T in types + @testset verbose=true "$(N)D" for N in 2:3 + A = Adapt.adapt(parallelization_backend, rand(T, N, N, 4)) + val = Val(N) + + @test TrixiParticles.can_use_aligned_load(A, N) + @test !TrixiParticles.can_use_aligned_load(view(A, 2:length(A)), N) + + # Test that the aligned version is consistent with the non-aligned version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_smatrix_aligned(A, val, i) == + TrixiParticles.extract_smatrix(A, val, i) end + @test all(result) end end end diff --git a/test/general/util.jl b/test/general/util.jl index 4aefddd4e3..7441fdb7f0 100644 --- a/test/general/util.jl +++ b/test/general/util.jl @@ -1,3 +1,23 @@ +@testset verbose=true "extract_smatrix" begin + A = Float64.(collect(reshape(1:12, 2, 2, 3))) + @test TrixiParticles.extract_smatrix(A, Val(2), 1) == A[:, :, 1] + @test TrixiParticles.extract_smatrix(A, Val(2), 2) == A[:, :, 2] + @test TrixiParticles.extract_smatrix(A, Val(2), 3) == A[:, :, 3] + @test_throws "extract_smatrix only works" TrixiParticles.extract_smatrix(A, Val(1), 1) + @test_throws "BoundsError" TrixiParticles.extract_smatrix(A, Val(3), 1) +end + +@testset verbose=true "extract_svector" begin + A = Float64.(collect(reshape(1:9, 3, 3))) + @test TrixiParticles.extract_svector(A, Val(3), 1) == A[:, 1] + @test TrixiParticles.extract_svector(A, Val(3), 2) == A[:, 2] + @test TrixiParticles.extract_svector(A, Val(3), 3) == A[:, 3] + @test TrixiParticles.extract_svector(A, Val(2), 1) == A[1:2, 1] + @test 
TrixiParticles.extract_svector(A, Val(2), 2) == A[1:2, 2] + @test TrixiParticles.extract_svector(A, Val(2), 3) == A[1:2, 3] + @test_throws "BoundsError" TrixiParticles.extract_svector(A, Val(4), 1) +end + @testset verbose=true "ThreadedBroadcastArray" begin A = TrixiParticles.ThreadedBroadcastArray(ones(3, 3)) B = ones(3, 3) From 180cc6052946c4151c98434f2ca619ca24ec5d69 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 14:12:13 +0200 Subject: [PATCH 11/22] Reformat --- test/examples/gpu.jl | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 621e483463..20b6025070 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -90,7 +90,9 @@ end # Test different "systems" and density calculators @test !TrixiParticles.use_aligned_vrho_load(x, nothing) - @test !TrixiParticles.use_aligned_vrho_load(x, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(x, + (; + density_calculator=ContinuityDensity())) @test !TrixiParticles.use_aligned_vrho_load(x, nothing, SummationDensity()) # No aligned load on the CPU @@ -100,20 +102,26 @@ end # Test different "systems" and density calculators @test !TrixiParticles.use_aligned_vrho_load(y, nothing) - @test !TrixiParticles.use_aligned_vrho_load(y, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(y, + (; + density_calculator=ContinuityDensity())) @test !TrixiParticles.use_aligned_vrho_load(y, nothing, SummationDensity()) # Use aligned load on the GPU with 4-aligned arrays @test TrixiParticles.use_aligned_vrho_load(y, nothing, ContinuityDensity()) - @test TrixiParticles.use_aligned_vrho_load(view(y, 5:16), nothing, ContinuityDensity()) + @test TrixiParticles.use_aligned_vrho_load(view(y, 5:16), nothing, + ContinuityDensity()) # Unaligned array on the GPU should throw an error - @test_throws "illegal 
alignment" TrixiParticles.use_aligned_vrho_load(view(y, 2:16), nothing, - ContinuityDensity()) - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 3:16), nothing, - ContinuityDensity()) - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 4:16), nothing, - ContinuityDensity()) + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 2:16), + nothing, + ContinuityDensity()) + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 3:16), + nothing, + ContinuityDensity()) + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 4:16), + nothing, + ContinuityDensity()) end @testset "velocity_and_density $T" for T in types From 49df3727da11a48cb264dd045deacaa6f3eccbe1 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 16:19:34 +0200 Subject: [PATCH 12/22] Fix tests --- src/general/abstract_system.jl | 9 +++--- src/general/gpu.jl | 10 ++++--- test/examples/gpu.jl | 50 +++++++++++++++++++++------------- 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index dadd754d23..f0c5809285 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -68,16 +68,17 @@ end end # Return `A[:, :, i]` as an `SMatrix`. -@propagate_inbounds function extract_smatrix(A, system, particle) - return extract_smatrix(A, Val(ndims(system)), particle) +@propagate_inbounds function extract_smatrix(A, system, i) + return extract_smatrix(A, Val(ndims(system)), i) end @inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N} @boundscheck checkbounds(A, N, N, i) # This function assumes that the first two dimensions of `A` have exactly the size `N`. 
@boundscheck if stride(A, 3) != N^2 - error("extract_smatrix only works for 3D arrays where the first two dimensions " * - "have size N") + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") end # Extract the matrix elements for this `i` as a tuple to pass to SMatrix diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 5cf7082588..803535b198 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -46,16 +46,18 @@ end # throw an error instead of silently (and non-deterministically) falling back to slower # regular loads. function can_use_aligned_load(A, n) + # Check that `n` is a power of 2, which is a requirement for aligned loads. + is_power_of_2 = n > 0 && (n & (n - 1)) == 0 # Check if the beginning of `A` is aligned to the size of the vector we want to load. is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0 # Check if the stride of `A` in the last dimension is equal to `n`, # which means that there is no padding between the vectors we want to load. has_stride = stride(A, ndims(A)) == n - return is_aligned && has_stride + return is_power_of_2 && is_aligned && has_stride end -@propagate_inbounds function extract_svector_aligned(A, system::AbstractSystem, particle) +@propagate_inbounds function extract_svector_aligned(A, system, particle) return extract_svector_aligned(A, Val(ndims(system)), particle) end @@ -80,12 +82,12 @@ end return SVector{N}(Tuple(vec)) end -@propagate_inbounds function extract_smatrix_aligned(A, system::AbstractSystem, particle) +@propagate_inbounds function extract_smatrix_aligned(A, system, particle) return extract_smatrix_aligned(A, Val(ndims(system)), particle) end # For general N, fall back to the regular `extract_smatrix`. 
-@propagate_inbounds function extract_smatrix_aligned(A, val_n, i) +@propagate_inbounds function extract_smatrix_aligned(A, val_n::Val, i) return extract_smatrix(A, val_n, i) end diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 20b6025070..9437731021 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -86,7 +86,7 @@ end @testset verbose=true "use_aligned_vrho_load $T" for T in types # Aligned array - x = zeros(T, 16) + x = zeros(T, 4, 4) # Test different "systems" and density calculators @test !TrixiParticles.use_aligned_vrho_load(x, nothing) @@ -109,19 +109,21 @@ end # Use aligned load on the GPU with 4-aligned arrays @test TrixiParticles.use_aligned_vrho_load(y, nothing, ContinuityDensity()) - @test TrixiParticles.use_aligned_vrho_load(view(y, 5:16), nothing, - ContinuityDensity()) + @test TrixiParticles.use_aligned_vrho_load(TrixiParticles.wrap_array(y, 5:16, + (4, 3)), + nothing, ContinuityDensity()) # Unaligned array on the GPU should throw an error - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 2:16), - nothing, - ContinuityDensity()) - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 3:16), - nothing, - ContinuityDensity()) - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 4:16), - nothing, - ContinuityDensity()) + str = "illegal alignment" + y2 = TrixiParticles.wrap_array(y, 2:13, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y2, nothing, + ContinuityDensity()) + y3 = TrixiParticles.wrap_array(y, 3:14, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y3, nothing, + ContinuityDensity()) + y4 = TrixiParticles.wrap_array(y, 4:15, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y4, nothing, + ContinuityDensity()) end @testset "velocity_and_density $T" for T in types @@ -157,12 +159,13 @@ end end @testset verbose=true "$T" for T in types - @testset verbose=true "$(N)D" for N in 1:6 + @testset 
verbose=true "$(N)D" for N in [2, 4, 8] A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) val = Val(N) @test TrixiParticles.can_use_aligned_load(A, N) - @test !TrixiParticles.can_use_aligned_load(view(A, 2:length(A)), N) + slice = TrixiParticles.wrap_array(A, 2:(3 * N + 1), (N, 3)) + @test !TrixiParticles.can_use_aligned_load(slice, N) # Test that the aligned version is consistent with the non-aligned version. # In order to test this on the GPU, we need to use a kernel with `@threaded`. @@ -173,6 +176,14 @@ end end @test all(result) end + + @testset verbose=true "$(N)D" for N in [3, 5, 6, 7] + A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) + val = Val(N) + + # Aligned loads are only support for powers of two. + @test !TrixiParticles.can_use_aligned_load(A, N) + end end end @@ -184,12 +195,13 @@ end end @testset verbose=true "$T" for T in types - @testset verbose=true "$(N)D" for N in 2:3 - A = Adapt.adapt(parallelization_backend, rand(T, N, N, 4)) - val = Val(N) + @testset verbose=true "2D" begin + A = Adapt.adapt(parallelization_backend, rand(T, 2, 2, 4)) + val = Val(2) - @test TrixiParticles.can_use_aligned_load(A, N) - @test !TrixiParticles.can_use_aligned_load(view(A, 2:length(A)), N) + @test TrixiParticles.can_use_aligned_load(A, 2^2) + slice = TrixiParticles.wrap_array(A, 2:(3 * 4 + 1), (2, 2, 3)) + @test !TrixiParticles.can_use_aligned_load(slice, 2^2) # Test that the aligned version is consistent with the non-aligned version. # In order to test this on the GPU, we need to use a kernel with `@threaded`. 
From 3df4dea8f903940692d7daef2f270b425b4f047a Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 16:24:59 +0200 Subject: [PATCH 13/22] Fix tests --- test/schemes/structure/total_lagrangian_sph/rhs.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl index ce498b4db2..fa4f98241d 100644 --- a/test/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl @@ -62,14 +62,15 @@ current_coordinates::Any material_density::Any pk1_rho2::Any + deformation_grad::Any mass::Any penalty_force::Any viscosity::Any buffer::Any end @inline Base.eltype(::MockSystem) = Float64 - system = MockSystem(current_coordinates, material_density, pk1_rho2, mass, - nothing, nothing, nothing) + system = MockSystem(current_coordinates, material_density, pk1_rho2, + nothing, mass, nothing, nothing, nothing) function TrixiParticles.initial_coordinates(::MockSystem) return initial_coordinates From 5e9f881aafbf28d9d2570837d31ba7ac7317b594 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 09:32:58 +0200 Subject: [PATCH 14/22] Add test for precompilation warnings. --- test/runtests.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 457e6b8212..916176a017 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,8 @@ +using Test +# Test that precompilation works without warnings (e.g. due to method overwites). +# This is important because method overwrite warnings disable precompilation of the package. 
+@test_nowarn using TrixiParticles + include("test_util.jl") const TRIXIPARTICLES_TEST = lowercase(get(ENV, "TRIXIPARTICLES_TEST", "all")) From 92140497c4d91ca3409152af740187067f92107b Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 09:34:00 +0200 Subject: [PATCH 15/22] Fix --- src/general/gpu.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 803535b198..d28a09931f 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -68,7 +68,7 @@ end end # For other N, fall back to the regular `extract_svector`. -@propagate_inbounds function extract_svector_aligned(A, val_n, i) +@propagate_inbounds function extract_svector_aligned(A, val_n::Val, i) return extract_svector(A, val_n, i) end From f99b95e1640acd50e0cd77c4b0ad0ae31df462e2 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 10:17:23 +0200 Subject: [PATCH 16/22] Try again --- test/runtests.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 916176a017..3df17c81b8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,7 @@ using Test -# Test that precompilation works without warnings (e.g. due to method overwites). +# Test that precompilation works without warnings (e.g. due to method overwrites). # This is important because method overwrite warnings disable precompilation of the package. 
-@test_nowarn using TrixiParticles +@test_nowarn import TrixiParticles include("test_util.jl") From f02bb364de029c6fb923c9e0bbd4c194c5a20487 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:16:34 +0200 Subject: [PATCH 17/22] Fix tests --- test/runtests.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 3df17c81b8..7d60bfc759 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,8 @@ using Test # Test that precompilation works without warnings (e.g. due to method overwrites). # This is important because method overwrite warnings disable precompilation of the package. -@test_nowarn import TrixiParticles +# We need `@eval Main` before Julia 1.12 to avoid `import" expression not at top level`. +@test_nowarn @eval Main import TrixiParticles include("test_util.jl") From d025c45f5aadd59f490345c02b395180578333a4 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:18:12 +0200 Subject: [PATCH 18/22] Always align integration arrays --- src/general/semidiscretization.jl | 15 ++++---------- test/examples/gpu.jl | 33 ------------------------------ test/general/semidiscretization.jl | 9 +++++--- 3 files changed, 10 insertions(+), 47 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 5e10b093ed..7782d42e2f 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -97,17 +97,6 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] - ELTYPE = eltype(first(systems)) - if parallelization_backend isa KernelAbstractions.GPU - # Align ranges to 64 bytes by adding padding if necessary. 
- # This ensures that aligned loads can be used on the wrapped integration arrays, - # which can significantly improve performance on GPUs. - block_size = div(64, sizeof(ELTYPE)) - else - # There is no performance benefit to aligning ranges for CPU backends. - block_size = 1 - end - start_u = 1 start_v = 1 ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems)) @@ -116,6 +105,10 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1) ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1) + # Align ranges to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the wrapped integration arrays, + # which can significantly improve performance on GPUs. + block_size = div(64, sizeof(eltype(systems[i]))) start_u += div(sizes_u[i], block_size, RoundUp) * block_size start_v += div(sizes_v[i], block_size, RoundUp) * block_size end diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 9437731021..09f8e03205 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -215,39 +215,6 @@ end end end -@trixi_testset "Semidiscretization vector alignment" begin - # Mock systems - struct System1 <: TrixiParticles.AbstractSystem{3} end - struct System2 <: TrixiParticles.AbstractSystem{3} end - - system1 = System1() - system2 = System2() - - Base.eltype(::System1) = Float64 - Base.eltype(::System2) = Float64 - TrixiParticles.u_nvariables(::System1) = 3 - TrixiParticles.u_nvariables(::System2) = 4 - TrixiParticles.v_nvariables(::System1) = 3 - TrixiParticles.v_nvariables(::System2) = 2 - TrixiParticles.nparticles(::System1) = 2 - TrixiParticles.nparticles(::System2) = 3 - - TrixiParticles.compact_support(::System1, neighbor) = 0.2 - TrixiParticles.compact_support(::System2, neighbor) = 0.2 - - @testset verbose=true "Constructor" begin - semi = Semidiscretization(system1, system2, neighborhood_search=nothing, - 
parallelization_backend=Main.parallelization_backend) - - # These are the ranges that we would expect on the CPU: - # semi.ranges_u == (1:6, 7:18) - # semi.ranges_v == (1:6, 7:12) - # Due to alignment to 64 bytes, the ranges are adjusted to be: - @test semi.ranges_u == (1:6, 9:20) - @test semi.ranges_v == (1:6, 9:14) - end -end - @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 52aeb3718c..6549e3e0bc 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -25,9 +25,12 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - # Verification - semi.ranges_u == (1:6, 7:18) - semi.ranges_v == (1:6, 7:12) + # These are the ranges that we would expect on the CPU: + # semi.ranges_u == (1:6, 7:18) + # semi.ranges_v == (1:6, 7:12) + # Due to alignment to 64 bytes, the ranges are adjusted to be: + @test semi.ranges_u == (1:6, 9:20) + @test semi.ranges_v == (1:6, 9:14) nhs = [TrixiParticles.TrivialNeighborhoodSearch{3}(search_radius=0.2, eachpoint=1:2) From fb04abf909f178830cad1f4023ed0dc78fb9b623 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:35:17 +0200 Subject: [PATCH 19/22] Use `vload` in 2D on the CPU as well --- src/general/abstract_system.jl | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index f0c5809285..7802318a8a 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -59,12 +59,15 @@ initialize!(system, semi) = system end # Return `A[:, i...]` as an `SVector`. -@inline function extract_svector(A, ::Val{NDIMS}, i...) 
where {NDIMS} +@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS} # Explicit bounds check, which can be removed by calling this function with `@inbounds` @boundscheck checkbounds(A, NDIMS, i...) # Assume inbounds access now return SVector(ntuple(@inline(dim->@inbounds A[dim, i...]), NDIMS)) + # vec = SIMD.vload(SIMD.Vec{NDIMS, eltype(A)}, pointer(A, NDIMS * (i - 1) + 1)) + + return SVector{NDIMS}(Tuple(vec)) end # Return `A[:, :, i]` as an `SMatrix`. @@ -81,8 +84,29 @@ end error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") end - # Extract the matrix elements for this `i` as a tuple to pass to SMatrix - return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[N^2 * (i - 1) + j]), Val(N^2))) + # Extract the matrix elements as a tuple in column-major order, + # and construct an `SMatrix` from it. + return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2))) +end + +@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} + @boundscheck checkbounds(A, 2, 2, i) + # This function assumes that the first two dimensions of `A` have exactly the size `2`. + @boundscheck if stride(A, 3) != 4 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") + end + + # This is the same as + # `SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2)))`, + # but it's slightly faster on CPUs in some cases. As opposed to the GPU-optimized + # version in `extract_smatrix_aligned`, we use `vload` and not `vloada` here, which + # does not require alignment and works if N^2 is not a power of 2. + # This is faster in the TLSPH RHS in 2D, but slower in 3D for some reason. + # See the benchmarks in https://github.com/trixi-framework/TrixiParticles.jl/pull/1147. 
+ vec = SIMD.vload(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) + return SMatrix{2, 2}(Tuple(vec)) end # Specifically get the current coordinates of a particle for all system types. From 4904645ff96ed6672f33a9826df5700d480ea2f0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:37:18 +0200 Subject: [PATCH 20/22] Add comment --- src/schemes/structure/total_lagrangian_sph/rhs.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index 5f964bd60d..e32315959c 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -144,7 +144,10 @@ function use_aligned_matrix_load(deformation_grad::AbstractGPUArray, return true end -# Don't use aligned vector loads on the CPU +# Don't use aligned vector loads on the CPU. For large arrays, alignment to 32 bytes +# (4 * Float64) is usually given, but it is not guaranteed, as Julia only guarantees +# alignment to 16 bytes. However, the non-aligned `vload` used in `extract_smatrix` in 2D +# has the same performance as the aligned `vloada` in `extract_smatrix_aligned` on the CPU. use_aligned_matrix_load(deformation_grad, pk1_rho2) = false # Aligned vector load versions for deformation gradient and `pk1_rho2`. From 3c4bb9fcead2baa12c566959bfa337c510cc38c0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:02:58 +0200 Subject: [PATCH 21/22] Fix --- src/general/abstract_system.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 7802318a8a..faaa1866cf 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -59,13 +59,12 @@ initialize!(system, semi) = system end # Return `A[:, i...]` as an `SVector`. 
-@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS} +@inline function extract_svector(A, ::Val{NDIMS}, i...) where {NDIMS} # Explicit bounds check, which can be removed by calling this function with `@inbounds` @boundscheck checkbounds(A, NDIMS, i...) # Assume inbounds access now return SVector(ntuple(@inline(dim->@inbounds A[dim, i...]), NDIMS)) - # vec = SIMD.vload(SIMD.Vec{NDIMS, eltype(A)}, pointer(A, NDIMS * (i - 1) + 1)) return SVector{NDIMS}(Tuple(vec)) end From 4dd0472dcb2bfbcdda1115f752ce941a3cad253e Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 21 Apr 2026 11:08:52 +0200 Subject: [PATCH 22/22] Add comments --- src/general/abstract_system.jl | 2 -- src/general/gpu.jl | 19 ++++++++++++------- src/general/semidiscretization.jl | 2 ++ test/examples/gpu.jl | 2 +- test/general/semidiscretization.jl | 2 +- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index faaa1866cf..89e49c7334 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -65,8 +65,6 @@ end # Assume inbounds access now return SVector(ntuple(@inline(dim->@inbounds A[dim, i...]), NDIMS)) - - return SVector{NDIMS}(Tuple(vec)) end # Return `A[:, :, i]` as an `SMatrix`. diff --git a/src/general/gpu.jl b/src/general/gpu.jl index d28a09931f..a3306ca410 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -38,13 +38,6 @@ end # This function checks if we can use aligned `SVector` or `SMatrix` loads for the given # array `A` and the size of the vector we want to load `n`. -# -# Note that it is not checked if the size `n` is legal (a power of 2). -# If the size `n` is not legal, the aligned loads will fall back to regular loads. -# This is because a regular load is expected for a problem that doesn't support aligned -# loads (e.g. 
3D matrices of 3x3), whereas an illegal alignment is unexpected and should -# throw an error instead of silently (and non-deterministically) falling back to slower -# regular loads. function can_use_aligned_load(A, n) # Check that `n` is a power of 2, which is a requirement for aligned loads. is_power_of_2 = n > 0 && (n & (n - 1)) == 0 @@ -76,6 +69,12 @@ end @inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N} @boundscheck checkbounds(A, 1:N, i) @boundscheck can_use_aligned_load(A, N) + # This function assumes a stride of N in the last dimension. + @boundscheck if stride(A, 2) != N + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_svector_aligned only works for 2D arrays where the stride in the last dimension is equal to N") + end vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1)) @@ -95,6 +94,12 @@ end @inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} @boundscheck checkbounds(A, 2, 2, i) @boundscheck can_use_aligned_load(A, 4) + # This function assumes that the first two dimensions of `A` have exactly the size `N`. + @boundscheck if stride(A, 3) != 4 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix_aligned only works for 3D arrays where the first two dimensions each have size N") + end vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 7782d42e2f..31a028110b 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -281,6 +281,8 @@ function semidiscretize(semi, tspan; reset_threads=true) parallelization_backend=semi.parallelization_backend) end + # Initialize the arrays with zeros to initialize the padding values for alignment. 
+ # The values that are actually used will be overwritten in `write_u0!` and `write_v0!`. u0_ode .= 0 v0_ode .= 0 diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 09f8e03205..1201a9eeb4 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -131,7 +131,7 @@ end x = rand(T, 4, 4) y = Adapt.adapt(parallelization_backend, x) - # Dummy system that will only be used for `ndims`. + # Dummy system that will only be used for `ndims` and `current_density`. struct MockSystem end Base.ndims(::MockSystem) = 3 system = MockSystem() diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 6549e3e0bc..12624b24ff 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -25,7 +25,7 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - # These are the ranges that we would expect on the CPU: + # These are the ranges that we would expect without alignment padding: # semi.ranges_u == (1:6, 7:18) # semi.ranges_v == (1:6, 7:12) # Due to alignment to 64 bytes, the ranges are adjusted to be: