diff --git a/Project.toml b/Project.toml index 9f17f48c0a..fe5dd24971 100644 --- a/Project.toml +++ b/Project.toml @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -63,6 +64,7 @@ Polyester = "0.7.10" ReadVTK = "0.2" RecipesBase = "1" Reexport = "1" +SIMD = "3.7.2" SciMLBase = "2" StaticArrays = "1" Statistics = "1" diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index 215a2b3bf1..45b94faca3 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -25,6 +25,7 @@ using Random: seed! using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!, get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!, add_tstop! +using SIMD: SIMD @reexport using StaticArrays: SVector using StaticArrays: @SMatrix, SMatrix, setindex using Statistics: Statistics diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 79b52a81df..89e49c7334 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -68,18 +68,42 @@ end end # Return `A[:, :, i]` as an `SMatrix`. 
-@propagate_inbounds function extract_smatrix(A, system, particle) - return extract_smatrix(A, Val(ndims(system)), particle) +@propagate_inbounds function extract_smatrix(A, system, i) + return extract_smatrix(A, Val(ndims(system)), i) end -@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS} - @boundscheck checkbounds(A, NDIMS, NDIMS, particle) +@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N} + @boundscheck checkbounds(A, N, N, i) + # This function assumes that the first two dimensions of `A` have exactly the size `N`. + @boundscheck if stride(A, 3) != N^2 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") + end + + # Extract the matrix elements as a tuple in column-major order, + # and construct an `SMatrix` from it. + return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2))) +end + +@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} + @boundscheck checkbounds(A, 2, 2, i) + # This function assumes that the first two dimensions of `A` have exactly the size `2`. + @boundscheck if stride(A, 3) != 4 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") + end - # Extract the matrix elements for this particle as a tuple to pass to SMatrix - return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1, - div(i - 1, NDIMS) + 1, - particle]), - Val(NDIMS^2))) + # This is the same as + # `SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2)))`, + # but it's slightly faster on CPUs in some cases. 
As opposed to the GPU-optimized + # version in `extract_smatrix_aligned`, we use `vload` and not `vloada` here, which + # does not require alignment and works if N^2 is not a power of 2. + # This is faster in the TLSPH RHS in 2D, but slower in 3D for some reason. + # See the benchmarks in https://github.com/trixi-framework/TrixiParticles.jl/pull/1147. + vec = SIMD.vload(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) + return SMatrix{2, 2}(Tuple(vec)) end # Specifically get the current coordinates of a particle for all system types. diff --git a/src/general/gpu.jl b/src/general/gpu.jl index f0d145bf1e..a3306ca410 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -35,3 +35,73 @@ end function allocate(backend, ELTYPE, size) return Array{ELTYPE, length(size)}(undef, size) end + +# This function checks if we can use aligned `SVector` or `SMatrix` loads for the given +# array `A` and the size of the vector we want to load `n`. +function can_use_aligned_load(A, n) + # Check that `n` is a power of 2, which is a requirement for aligned loads. + is_power_of_2 = n > 0 && (n & (n - 1)) == 0 + # Check if the beginning of `A` is aligned to the size of the vector we want to load. + is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0 + # Check if the stride of `A` in the last dimension is equal to `n`, + # which means that there is no padding between the vectors we want to load. + has_stride = stride(A, ndims(A)) == n + + return is_power_of_2 && is_aligned && has_stride +end + +@propagate_inbounds function extract_svector_aligned(A, system, particle) + return extract_svector_aligned(A, Val(ndims(system)), particle) +end + +# For N = 2 and N = 4, use the aligned vector loads. +@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix, + val_n::Union{Val{2}, Val{4}}, i) + return _extract_svector_aligned(A, val_n, i) +end + +# For other N, fall back to the regular `extract_svector`. 
+@propagate_inbounds function extract_svector_aligned(A, val_n::Val, i) + return extract_svector(A, val_n, i) +end + +# This function only works for N = 2 and N = 4, because it requires a stride of 2^n. +@inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N} + @boundscheck checkbounds(A, 1:N, i) + @boundscheck can_use_aligned_load(A, N) + # This function assumes a stride of N in the last dimension. + @boundscheck if stride(A, 2) != N + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_svector_aligned only works for 2D arrays where the stride in the last dimension is equal to N") + end + + vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1)) + + return SVector{N}(Tuple(vec)) +end + +@propagate_inbounds function extract_smatrix_aligned(A, system, particle) + return extract_smatrix_aligned(A, Val(ndims(system)), particle) +end + +# For general N, fall back to the regular `extract_smatrix`. +@propagate_inbounds function extract_smatrix_aligned(A, val_n::Val, i) + return extract_smatrix(A, val_n, i) +end + +# This function only works for 2x2 matrices, because it requires a stride of 2^n. +@inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} + @boundscheck checkbounds(A, 2, 2, i) + @boundscheck can_use_aligned_load(A, 4) + # This function assumes that the first two dimensions of `A` have exactly the size `N`. + @boundscheck if stride(A, 3) != 4 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. 
+ error("extract_smatrix_aligned only works for 3D arrays where the first two dimensions each have size N") + end + + vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) + + return SMatrix{2, 2}(Tuple(vec)) +end diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 6f8f59197e..31a028110b 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -94,12 +94,27 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_u = [u_nvariables(system) * n_integrated_particles(system) for system in systems] - ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i]) - for i in eachindex(sizes_u)) sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] - ranges_v = Tuple((sum(sizes_v[1:(i - 1)]) + 1):sum(sizes_v[1:i]) - for i in eachindex(sizes_v)) + + start_u = 1 + start_v = 1 + ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems)) + ranges_v_vec = Vector{UnitRange{Int}}(undef, length(systems)) + for i in eachindex(systems) + ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1) + ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1) + + # Align ranges to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the wrapped integration arrays, + # which can significantly improve performance on GPUs. + block_size = div(64, sizeof(eltype(systems[i]))) + start_u += div(sizes_u[i], block_size, RoundUp) * block_size + start_v += div(sizes_v[i], block_size, RoundUp) * block_size + end + + ranges_u = Tuple(ranges_u_vec) + ranges_v = Tuple(ranges_v_vec) # Create a n x n matrix of n neighborhood searches for each of the n systems. # We will need one neighborhood search for each pair of systems. 
@@ -245,13 +260,13 @@ function semidiscretize(semi, tspan; reset_threads=true) Polyester.reset_threads!() end - sizes_u = (u_nvariables(system) * n_integrated_particles(system) for system in systems) - sizes_v = (v_nvariables(system) * n_integrated_particles(system) for system in systems) + size_u_ode = semi.ranges_u[end].stop + size_v_ode = semi.ranges_v[end].stop # Use either the specified backend, e.g., `CUDABackend` or `MetalBackend` or # use CPU vectors for all CPU backends. - u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, sum(sizes_u)) - v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v)) + u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, size_u_ode) + v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, size_v_ode) if semi.parallelization_backend isa KernelAbstractions.GPU u0_ode = u0_ode_ @@ -266,6 +281,11 @@ function semidiscretize(semi, tspan; reset_threads=true) parallelization_backend=semi.parallelization_backend) end + # Initialize the arrays with zeros to initialize the padding values for alignment. + # The values that are actually used will be overwritten in `write_u0!` and `write_v0!`. 
+ u0_ode .= 0 + v0_ode .= 0 + # Set initial condition foreach_system_wrapped(semi, v0_ode, u0_ode) do system, v0_system, u0_system write_u0!(u0_system, system) @@ -356,8 +376,9 @@ end range = ranges_v[system_indices(system, semi)] @boundscheck begin - if length(range) != v_nvariables(system) * n_integrated_particles(system) - throw(DimensionMismatch("`v_ode` range length $range_length does not match " * + expected = v_nvariables(system) * n_integrated_particles(system) + if length(range) != expected + throw(DimensionMismatch("`v_ode` range length $(length(range)) does not match " * "expected number of entries $expected")) end end diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index 836063538b..c4d969eaef 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -25,14 +25,23 @@ function interact!(dv, v_particle_system, u_particle_system, compact_support_ = compact_support(particle_system, neighbor_system) almostzero = sqrt(eps(compact_support_^2)) + use_aligned_load_system = Val(use_aligned_vrho_load(v_particle_system, particle_system)) + use_aligned_load_neighbor = Val(use_aligned_vrho_load(v_neighbor_system, + neighbor_system)) + @threaded semi for particle in each_integrated_particle(particle_system) # We are looping over the particles of `particle_system`, so it is guaranteed # that `particle` is in bounds of `particle_system`. m_a = @inbounds hydrodynamic_mass(particle_system, particle) p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) - v_a = @inbounds current_velocity(v_particle_system, particle_system, particle) - rho_a = @inbounds current_density(v_particle_system, particle_system, particle) + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. 
+ # Note that we can only safely use `@inbounds` after checking alignment + # with `use_aligned_vrho_load` before the `@threaded` loop. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + use_aligned_load_system, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv`, # to reduce the number of memory writes. @@ -56,8 +65,11 @@ function interact!(dv, v_particle_system, u_particle_system, # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) - v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) - rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + # Note that we can only safely use `@inbounds` after checking alignment + # with `use_aligned_vrho_load` before the `@threaded` loop. + (v_b, + rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + use_aligned_load_neighbor, neighbor) # The following call is equivalent to # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` @@ -134,3 +146,55 @@ end neighbor, p_a) return p_a end + +# Default method, which simply calls `current_velocity` and `current_density` separately. +@propagate_inbounds function velocity_and_density(v, system, ::Val{false}, particle) + v_particle = current_velocity(v, system, particle) + rho_particle = current_density(v, system, particle) + + return v_particle, rho_particle +end + +# Optimized version for WCSPH with `ContinuityDensity` in 3D, +# which combines the velocity and density load into one wide load. +# This is significantly faster on GPUs than the 4 individual loads of `extract_svector`. +# WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))`, +# which is checked by `use_aligned_vrho_load`. +# Only call this function after checking `use_aligned_vrho_load` to avoid +# segmentation faults from illegal accesses. 
+@propagate_inbounds function velocity_and_density(v, system, ::Val{true}, particle) + vrho_particle = extract_svector_aligned(v, Val(4), particle) + + # The columns of `v` are ordered as (v_x, v_y, v_z, rho) + v..., rho = Tuple(vrho_particle) + v_particle = SVector(v) + + return v_particle, rho +end + +# By default, don't use aligned loads +use_aligned_vrho_load(v, system) = false + +function use_aligned_vrho_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3}) + use_aligned_vrho_load(v, system, system.density_calculator) +end + +use_aligned_vrho_load(v, system, density_calculator) = false + +# Only use aligned loads when all of these conditions are satisfied: +# - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4. +# - We are on a GPU, where the aligned load gives a significant speedup. +# - The velocity array is aligned for aligned loads, which requires that the pointer of `v` +# is aligned to `4 * sizeof(eltype(v))` +# Otherwise, we cannot use `vloada`, which is an *aligned* load. +# The unaligned version `vload` does not produce wide load instructions on GPUs. +function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity) + if !can_use_aligned_load(v, 4) + # Aligned loads should always be possible on GPUs because the slices of `v_ode` + # are aligned to 64 bytes in `Semidiscretization` and arrays on GPUs are always + # aligned to full pages. + error("illegal alignment of `v` integration array. 
Please report this issue.") + end + + return true +end diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index fa000ef38f..e32315959c 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -31,15 +31,19 @@ end h = initial_smoothing_length(system) almostzero = sqrt(eps(h^2)) + # Check alignment of deformation gradient and `pk1_rho2` arrays before the `@threaded` + # loop to be able to use aligned loads safely inside the loop. + use_aligned_matrix_load_ = Val(use_aligned_matrix_load(system)) + @threaded semi for particle in each_integrated_particle(system) # We are looping over the particles of `system`, so it is guaranteed # that `particle` is in bounds of `system`. m_a = @inbounds system.mass[particle] rho_a = @inbounds system.material_density[particle] # PK1 / rho^2 - pk1_rho2_a = @inbounds pk1_rho2(system, particle) + pk1_rho2_a = @inbounds pk1_rho2(system, use_aligned_matrix_load_, particle) current_coords_a = @inbounds current_coords(system, particle) - F_a = @inbounds deformation_gradient(system, particle) + F_a = @inbounds deformation_gradient(system, use_aligned_matrix_load_, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv` # to reduce the number of memory writes. 
@@ -64,11 +68,12 @@ end rho_b = @inbounds system.material_density[neighbor] m_b = @inbounds system.mass[neighbor] # PK1 / rho^2 - pk1_rho2_b = @inbounds pk1_rho2(system, neighbor) + pk1_rho2_b = @inbounds pk1_rho2(system, use_aligned_matrix_load_, neighbor) current_coords_b = @inbounds current_coords(system, neighbor) # The compiler is smart enough to optimize this away if no penalty force is used - F_b = @inbounds deformation_gradient(system, neighbor) + F_b = @inbounds deformation_gradient(system, use_aligned_matrix_load_, + neighbor) current_pos_diff_ = current_coords_a - current_coords_b # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference @@ -120,3 +125,45 @@ function interact!(dv, v_particle_system, u_particle_system, # TODO continuity equation? return dv end + +function use_aligned_matrix_load(system) + return use_aligned_matrix_load(system.deformation_grad, system.pk1_rho2) +end + +function use_aligned_matrix_load(deformation_grad::AbstractGPUArray, + pk1_rho2::AbstractGPUArray) + # Aligned loads should always be possible on GPUs because GPU arrays are always aligned + # to full pages, and these arrays are not slices of larger arrays. + if !can_use_aligned_load(deformation_grad, 4) + error("illegal alignment of deformation gradient array. Please report this issue.") + end + if !can_use_aligned_load(pk1_rho2, 4) + error("illegal alignment of `pk1_rho2` array. Please report this issue.") + end + + return true +end + +# Don't use aligned vector loads on the CPU. For large arrays, alignment to 32 bytes +# (4 * Float64) is usually given, but it is not guaranteed, as Julia only guarantees +# alignment to 16 bytes. However, the non-aligned `vload` used in `extract_smatrix` in 2D +# has the same performance as the aligned `vloada` in `extract_smatrix_aligned` on the CPU. +use_aligned_matrix_load(deformation_grad, pk1_rho2) = false + +# Aligned vector load versions for deformation gradient and `pk1_rho2`. 
+# These are only used on GPUs, which is checked by `use_aligned_matrix_load`. +@propagate_inbounds function pk1_rho2(system, ::Val{true}, particle) + return extract_smatrix_aligned(system.pk1_rho2, system, particle) +end + +@propagate_inbounds function pk1_rho2(system, ::Val{false}, particle) + return pk1_rho2(system, particle) +end + +@propagate_inbounds function deformation_gradient(system, ::Val{true}, particle) + return extract_smatrix_aligned(system.deformation_grad, system, particle) +end + +@propagate_inbounds function deformation_gradient(system, ::Val{false}, particle) + return deformation_gradient(system, particle) +end diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 3117a67ca5..1201a9eeb4 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -77,6 +77,144 @@ end end end +@testset verbose=true "velocity_and_density $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "use_aligned_vrho_load $T" for T in types + # Aligned array + x = zeros(T, 4, 4) + + # Test different "systems" and density calculators + @test !TrixiParticles.use_aligned_vrho_load(x, nothing) + @test !TrixiParticles.use_aligned_vrho_load(x, + (; + density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(x, nothing, SummationDensity()) + + # No aligned load on the CPU + @test !TrixiParticles.use_aligned_vrho_load(x, nothing, ContinuityDensity()) + + y = Adapt.adapt(parallelization_backend, x) + + # Test different "systems" and density calculators + @test !TrixiParticles.use_aligned_vrho_load(y, nothing) + @test !TrixiParticles.use_aligned_vrho_load(y, + (; + density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(y, nothing, SummationDensity()) + + # Use aligned load on the GPU with 4-aligned arrays + @test TrixiParticles.use_aligned_vrho_load(y, nothing, ContinuityDensity()) + @test 
TrixiParticles.use_aligned_vrho_load(TrixiParticles.wrap_array(y, 5:16, + (4, 3)), + nothing, ContinuityDensity()) + + # Unaligned array on the GPU should throw an error + str = "illegal alignment" + y2 = TrixiParticles.wrap_array(y, 2:13, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y2, nothing, + ContinuityDensity()) + y3 = TrixiParticles.wrap_array(y, 3:14, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y3, nothing, + ContinuityDensity()) + y4 = TrixiParticles.wrap_array(y, 4:15, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y4, nothing, + ContinuityDensity()) + end + + @testset "velocity_and_density $T" for T in types + # Aligned array + x = rand(T, 4, 4) + y = Adapt.adapt(parallelization_backend, x) + + # Dummy system that will only be used for `ndims` and `current_density`. + struct MockSystem end + Base.ndims(::MockSystem) = 3 + system = MockSystem() + @inline TrixiParticles.current_density(v, ::MockSystem, i) = v[4, i] + + @test TrixiParticles.use_aligned_vrho_load(y, system, ContinuityDensity()) + + # Test that the aligned version is consistent with the non-aligned version. + # We have 4 particles (with 4 values per particles). + # In order to test this on the GPU, we need to use a kernel with `@threaded`. 
+ result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.velocity_and_density(y, system, Val(false), i) == + TrixiParticles.velocity_and_density(y, system, Val(true), i) + end + @test all(result) + end +end + +@testset verbose=true "extract_svector_aligned $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "$T" for T in types + @testset verbose=true "$(N)D" for N in [2, 4, 8] + A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) + val = Val(N) + + @test TrixiParticles.can_use_aligned_load(A, N) + slice = TrixiParticles.wrap_array(A, 2:(3 * N + 1), (N, 3)) + @test !TrixiParticles.can_use_aligned_load(slice, N) + + # Test that the aligned version is consistent with the non-aligned version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_svector_aligned(A, val, i) == + TrixiParticles.extract_svector(A, val, i) + end + @test all(result) + end + + @testset verbose=true "$(N)D" for N in [3, 5, 6, 7] + A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) + val = Val(N) + + # Aligned loads are only supported for powers of two. 
+ @test !TrixiParticles.can_use_aligned_load(A, N) + end + end +end + +@testset verbose=true "extract_smatrix_aligned $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "$T" for T in types + @testset verbose=true "2D" begin + A = Adapt.adapt(parallelization_backend, rand(T, 2, 2, 4)) + val = Val(2) + + @test TrixiParticles.can_use_aligned_load(A, 2^2) + slice = TrixiParticles.wrap_array(A, 2:(3 * 4 + 1), (2, 2, 3)) + @test !TrixiParticles.can_use_aligned_load(slice, 2^2) + + # Test that the aligned version is consistent with the non-aligned version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_smatrix_aligned(A, val, i) == + TrixiParticles.extract_smatrix(A, val, i) + end + @test all(result) + end + end +end + @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 3c2dcbeb58..12624b24ff 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -8,6 +8,7 @@ system2 = System2() Base.eltype(::System1) = Float64 + Base.eltype(::System2) = Float64 TrixiParticles.coordinates_eltype(::System1) = Float32 TrixiParticles.u_nvariables(::System1) = 3 TrixiParticles.u_nvariables(::System2) = 4 @@ -24,9 +25,12 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - # Verification - @test semi.ranges_u == (1:6, 7:18) - @test semi.ranges_v == (1:6, 7:12) + # These are the ranges that we would expect without alignment padding: + # semi.ranges_u == (1:6, 7:18) + # semi.ranges_v == (1:6, 7:12) + # Due to 
alignment to 64 bytes, the ranges are adjusted to be: + @test semi.ranges_u == (1:6, 9:20) + @test semi.ranges_v == (1:6, 9:14) nhs = [TrixiParticles.TrivialNeighborhoodSearch{3}(search_radius=0.2, eachpoint=1:2) @@ -152,8 +156,8 @@ semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - dv_ode = zeros(3 * 2 + 2 * 3) - du_ode = zeros(3 * 2 + 4 * 3) + dv_ode = zeros(semi.ranges_v[end].stop) + du_ode = zeros(semi.ranges_u[end].stop) u_ode = zero(du_ode) v1 = [1.0 2.0 diff --git a/test/general/util.jl b/test/general/util.jl index 4aefddd4e3..7441fdb7f0 100644 --- a/test/general/util.jl +++ b/test/general/util.jl @@ -1,3 +1,23 @@ +@testset verbose=true "extract_smatrix" begin + A = Float64.(collect(reshape(1:12, 2, 2, 3))) + @test TrixiParticles.extract_smatrix(A, Val(2), 1) == A[:, :, 1] + @test TrixiParticles.extract_smatrix(A, Val(2), 2) == A[:, :, 2] + @test TrixiParticles.extract_smatrix(A, Val(2), 3) == A[:, :, 3] + @test_throws "extract_smatrix only works" TrixiParticles.extract_smatrix(A, Val(1), 1) + @test_throws "BoundsError" TrixiParticles.extract_smatrix(A, Val(3), 1) +end + +@testset verbose=true "extract_svector" begin + A = Float64.(collect(reshape(1:9, 3, 3))) + @test TrixiParticles.extract_svector(A, Val(3), 1) == A[:, 1] + @test TrixiParticles.extract_svector(A, Val(3), 2) == A[:, 2] + @test TrixiParticles.extract_svector(A, Val(3), 3) == A[:, 3] + @test TrixiParticles.extract_svector(A, Val(2), 1) == A[1:2, 1] + @test TrixiParticles.extract_svector(A, Val(2), 2) == A[1:2, 2] + @test TrixiParticles.extract_svector(A, Val(2), 3) == A[1:2, 3] + @test_throws "BoundsError" TrixiParticles.extract_svector(A, Val(4), 1) +end + @testset verbose=true "ThreadedBroadcastArray" begin A = TrixiParticles.ThreadedBroadcastArray(ones(3, 3)) B = ones(3, 3) diff --git a/test/runtests.jl b/test/runtests.jl index 457e6b8212..7d60bfc759 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,9 @@ +using Test +# Test that precompilation works 
without warnings (e.g. due to method overwrites). +# This is important because method overwrite warnings disable precompilation of the package. +# We need `@eval Main` before Julia 1.12 to avoid `"import" expression not at top level`. +@test_nowarn @eval Main import TrixiParticles + include("test_util.jl") const TRIXIPARTICLES_TEST = lowercase(get(ENV, "TRIXIPARTICLES_TEST", "all")) diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl index ce498b4db2..fa4f98241d 100644 --- a/test/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl @@ -62,14 +62,15 @@ current_coordinates::Any material_density::Any pk1_rho2::Any + deformation_grad::Any mass::Any penalty_force::Any viscosity::Any buffer::Any end @inline Base.eltype(::MockSystem) = Float64 - system = MockSystem(current_coordinates, material_density, pk1_rho2, mass, - nothing, nothing, nothing) + system = MockSystem(current_coordinates, material_density, pk1_rho2, + nothing, mass, nothing, nothing, nothing) function TrixiParticles.initial_coordinates(::MockSystem) return initial_coordinates