From 0441ed6fb2fe5c506e5f311e3253d5bd848c2ca2 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:00:57 +0200 Subject: [PATCH 01/22] Add SIMD.jl dependency This reverts commit cdd58e2c3ac6fef072b39c5d2252119eb99bf4e3. --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 9f17f48c0a..fe5dd24971 100644 --- a/Project.toml +++ b/Project.toml @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -63,6 +64,7 @@ Polyester = "0.7.10" ReadVTK = "0.2" RecipesBase = "1" Reexport = "1" +SIMD = "3.7.2" SciMLBase = "2" StaticArrays = "1" Statistics = "1" From 13d6dfdec49d87b4e79b6efd019f0fd6210a74e0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:01:39 +0200 Subject: [PATCH 02/22] Remove vectorized loads for WCSPH velocity and density This reverts commit 1243fc6cc8b7e7bb7f9e211ce03fdbe88eebfa35. --- src/TrixiParticles.jl | 1 + .../fluid/weakly_compressible_sph/rhs.jl | 72 +++++++++++++++++-- test/examples/gpu.jl | 62 ++++++++++++++++ 3 files changed, 131 insertions(+), 4 deletions(-) diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index 215a2b3bf1..45b94faca3 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -25,6 +25,7 @@ using Random: seed! using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!, get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!, add_tstop! 
+using SIMD: SIMD @reexport using StaticArrays: SVector using StaticArrays: @SMatrix, SMatrix, setindex using Statistics: Statistics diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index 836063538b..312f6912f4 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -25,14 +25,20 @@ function interact!(dv, v_particle_system, u_particle_system, compact_support_ = compact_support(particle_system, neighbor_system) almostzero = sqrt(eps(compact_support_^2)) + use_simd_load_system = Val(use_simd_load(v_particle_system, particle_system)) + use_simd_load_neighbor = Val(use_simd_load(v_neighbor_system, neighbor_system)) + @threaded semi for particle in each_integrated_particle(particle_system) # We are looping over the particles of `particle_system`, so it is guaranteed # that `particle` is in bounds of `particle_system`. m_a = @inbounds hydrodynamic_mass(particle_system, particle) p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) - v_a = @inbounds current_velocity(v_particle_system, particle_system, particle) - rho_a = @inbounds current_density(v_particle_system, particle_system, particle) + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + use_simd_load_system, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv`, # to reduce the number of memory writes. 
@@ -56,8 +62,9 @@ function interact!(dv, v_particle_system, u_particle_system, # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) - v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) - rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + (v_b, + rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + use_simd_load_neighbor, neighbor) # The following call is equivalent to # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` @@ -134,3 +141,60 @@ end neighbor, p_a) return p_a end + +# Default method, which simply calls `current_velocity` and `current_density` separately. +@propagate_inbounds function velocity_and_density(v, system, ::Val{false}, particle) + v_particle = current_velocity(v, system, particle) + rho_particle = current_density(v, system, particle) + + return v_particle, rho_particle +end + +# Optimized version for WCSPH with `ContinuityDensity` in 3D, +# which combines the velocity and density load into one wide load. +# This is significantly faster on GPUs than the 4 individual loads of `extract_svector`. +# WARNING: this requires that the pointer of `v` is aligned to the size +# of `SIMD.Vec{4, eltype(v)}`, which is checked by `use_simd_load`. +@inline function velocity_and_density(v, system, ::Val{true}, particle) + # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column + # of `v` corresponding to `particle`. + # Note that this doesn't work for 2D because it requires a stride of 2^n. 
+ vrho_particle = SIMD.vloada(SIMD.Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1)) + + # The columns of `v` are ordered as (v_x, v_y, v_z, rho) + v1, v2, v3, rho = Tuple(vrho_particle) + v_particle = SVector(v1, v2, v3) + + return v_particle, rho +end + +# By default, don't use SIMD loads +use_simd_load(v, system) = false + +function use_simd_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3}) + use_simd_load(v, system, system.density_calculator) +end + +use_simd_load(v, system, density_calculator) = false + +# Only use SIMD loads when all of these conditions are satisfied: +# - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4. +# - We are on a GPU, where the SIMD load gives a significant speedup. +# - The velocity array is aligned for SIMD loads, which requires that the pointer of `v` +# is aligned to the size of `SIMD.Vec{4, eltype(v)}`. +# Otherwise, we cannot use `vloada`, which is an *aligned* SIMD load. +# The unaligned version `vload` does not produce wide load instructions on GPUs. +# In this last case, we don't fall back to the non-SIMD version and throw an error instead. 
+function use_simd_load(v::AbstractGPUArray, system, ::ContinuityDensity) + aligned = is_aligned(pointer(v), SIMD.Vec{4, eltype(v)}) + + if !aligned + error("on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " * + "first systems in the semidiscretization to ensure that their integration " * + "arrays are aligned for SIMD loads.") + end + + return aligned +end + +is_aligned(ptr, ::Type{SIMD.Vec{N, T}}) where {N, T} = UInt(ptr) % (N * sizeof(T)) == 0 diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 3117a67ca5..2cd7ade8a4 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -77,6 +77,68 @@ end end end +@testset verbose=true "velocity_and_density $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "use_simd_load $T" for T in types + # Aligned array + x = zeros(T, 16) + + # Test different "systems" and density calculators + @test !TrixiParticles.use_simd_load(x, nothing) + @test !TrixiParticles.use_simd_load(x, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_simd_load(x, nothing, SummationDensity()) + + # No SIMD load on the CPU + @test !TrixiParticles.use_simd_load(x, nothing, ContinuityDensity()) + + y = Adapt.adapt(parallelization_backend, x) + + # Test different "systems" and density calculators + @test !TrixiParticles.use_simd_load(y, nothing) + @test !TrixiParticles.use_simd_load(y, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_simd_load(y, nothing, SummationDensity()) + + # Use SIMD load on the GPU with 4-aligned arrays + @test TrixiParticles.use_simd_load(y, nothing, ContinuityDensity()) + @test TrixiParticles.use_simd_load(view(y, 5:16), nothing, ContinuityDensity()) + + # Unaligned array on the GPU should throw an error + @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 2:16), nothing, + ContinuityDensity()) + @test_throws "on GPUs in 3D" 
TrixiParticles.use_simd_load(view(y, 3:16), nothing, + ContinuityDensity()) + @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 4:16), nothing, + ContinuityDensity()) + end + + @testset "velocity_and_density $T" for T in types + # Aligned array + x = rand(T, 4, 4) + y = Adapt.adapt(parallelization_backend, x) + + # Dummy system that will only be used for `ndims`. + struct MockSystem end + Base.ndims(::MockSystem) = 3 + system = MockSystem() + @inline TrixiParticles.current_density(v, ::MockSystem, i) = v[4, i] + + # Test that the SIMD version is consistent with the non-SIMD version. + # We have 4 particles (with 4 values per particles). + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.velocity_and_density(y, system, Val(false), i) == + TrixiParticles.velocity_and_density(y, system, Val(true), i) + end + @test all(result) + end +end + @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin From 5b92abbb7d7224c23091946cb63547f2a31b20df Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:09:03 +0200 Subject: [PATCH 03/22] Add SIMD load `extract_smatrix_aligned` --- src/general/abstract_system.jl | 28 ++++++++++++++++++++++++++ test/examples/gpu.jl | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 79b52a81df..0a5f61170a 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -82,6 +82,34 @@ end Val(NDIMS^2))) end +# Optimized version for 2D, which uses SIMD.jl to combine the 4 loads of the 2x2 matrix +# into a single wide load. 
This is significantly faster on GPUs than 4 individual loads. +# WARNING: +# 1. This only works if the matrix elements are stored contiguously in memory. +# The 4 elements of the 2x2 matrix for a particle must immediately follow the +# 4 elements of the previous particle's matrix, with no padding in between. +# 2. The pointer of `A` must be aligned to the size of `Vec{4, eltype(A)}`. +# This is guaranteed if `A` is allocated by Julia and has the correct size, +# but may not be true if `A` is a view or subarray. +@propagate_inbounds function extract_smatrix_aligned(A, system, particle) + return extract_smatrix_aligned(A, Val(ndims(system)), particle) +end + +@inline function extract_smatrix_aligned(A, ::Val{2}, particle) + @boundscheck checkbounds(A, 2, 2, particle) + + # Note that this doesn't work in 3D because it requires a stride of 2^n. + x = SIMD.vloada(SIMD.Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1)) + + return SMatrix{2, 2}(Tuple(x)) +end + +# Fall back to the generic version when not in 2D. +@propagate_inbounds function extract_smatrix_aligned(A, ::Val{NDIMS}, + particle) where {NDIMS} + return extract_smatrix(A, Val(NDIMS), particle) +end + # Specifically get the current coordinates of a particle for all system types. @propagate_inbounds function current_coords(u, system, particle) return extract_svector(current_coordinates(u, system), system, particle) diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 2cd7ade8a4..91cf036079 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -139,6 +139,42 @@ end end end +@testset verbose=true "extract_smatrix_aligned $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "$T" for T in types + @testset verbose=true "$(N)D" for N in 2:3 + @testset "CPU" begin + A = rand(T, N, N, 4) + + # Test that the SIMD version is consistent with the non-SIMD version. 
+ for i in 1:4 + @test TrixiParticles.extract_smatrix_aligned(A, Val(N), i) == + TrixiParticles.extract_smatrix(A, Val(N), i) + end + end + + @testset "GPU" begin + A = Adapt.adapt(parallelization_backend, rand(T, N, N, 4)) + val = Val(N) + + # Test that the SIMD version is consistent with the non-SIMD version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_smatrix_aligned(A, val, i) == + TrixiParticles.extract_smatrix(A, val, i) + end + @test all(result) + end + end + end +end + @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin From 7df05abf670b886cf656fe0743ac5eb5a45504b3 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:59:15 +0200 Subject: [PATCH 04/22] Make aligned loads nicer --- src/general/gpu.jl | 61 ++++++++++++++++++ .../fluid/weakly_compressible_sph/rhs.jl | 62 +++++++++---------- 2 files changed, 92 insertions(+), 31 deletions(-) diff --git a/src/general/gpu.jl b/src/general/gpu.jl index f0d145bf1e..d6ddf62501 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -35,3 +35,64 @@ end function allocate(backend, ELTYPE, size) return Array{ELTYPE, length(size)}(undef, size) end + +# This function checks if we can use aligned `SVector` or `SMatrix` loads for the given +# array `A` and the size of the vector we want to load `n`. +# +# Note that it is not checked if the size `n` is legal (a power of 2). +# If the size `n` is not legal, the aligned loads will fall back to regular loads. +# This is because a regular load is expected for a problem that doesn't support aligned +# loads (e.g. 
3D matrices of 3x3), whereas an illegal alignment is unexpected and should
+# throw an error instead of silently (and non-deterministically) falling back to slower
+# regular loads.
+function can_use_aligned_load(A, n)
+    # Check if the beginning of `A` is aligned to the size of the vector we want to load.
+    is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0
+    # Check if the stride of `A` in the last dimension is equal to `n`,
+    # which means that there is no padding between the vectors we want to load.
+    has_stride = stride(A, ndims(A)) == n
+
+    return is_aligned && has_stride
+end
+
+# For N = 2 and N = 4, use the aligned vector loads.
+@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix,
+                                                     val_n::Union{Val{2}, Val{4}}, i)
+    return _extract_svector_aligned(A, val_n, i)
+end
+
+# For other N, fall back to the regular `extract_svector`.
+@propagate_inbounds function extract_svector_aligned(A, val_n, i)
+    return extract_svector(A, val_n, i)
+end
+
+# This function only works for N = 2 and N = 4, because it requires a stride of 2^n.
+@inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N}
+    @boundscheck checkbounds(A, 1:N, i)
+
+    # This function assumes alignment of the data, which means that the columns of `A`
+    # have exactly the size `N` and no padding between them.
+    @boundscheck @assert stride(A, 2) == N
+
+    vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1))
+
+    return SVector{N}(Tuple(vec))
+end
+
+# For general N, fall back to the regular `extract_smatrix`.
+@propagate_inbounds function extract_smatrix_aligned(A, val_n, i)
+    return extract_smatrix(A, val_n, i)
+end
+
+# This function only works for 2x2 matrices, because it requires a stride of 2^n.
+@inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} + @boundscheck checkbounds(A, 2, 2, i) + + # This function assumes alignment of the data, which means that the first two of `A` + # have exactly the size `2` and no padding between them. + @boundscheck @assert stride(A, 3) == 4 + + vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) + + return SMatrix{2, 2}(Tuple(vec)) +end diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index 312f6912f4..2c9c5487e5 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -25,8 +25,9 @@ function interact!(dv, v_particle_system, u_particle_system, compact_support_ = compact_support(particle_system, neighbor_system) almostzero = sqrt(eps(compact_support_^2)) - use_simd_load_system = Val(use_simd_load(v_particle_system, particle_system)) - use_simd_load_neighbor = Val(use_simd_load(v_neighbor_system, neighbor_system)) + use_aligned_load_system = Val(use_aligned_vrho_load(v_particle_system, particle_system)) + use_aligned_load_neighbor = Val(use_aligned_vrho_load(v_neighbor_system, + neighbor_system)) @threaded semi for particle in each_integrated_particle(particle_system) # We are looping over the particles of `particle_system`, so it is guaranteed @@ -38,7 +39,7 @@ function interact!(dv, v_particle_system, u_particle_system, # which gives a significant speedup on GPUs. (v_a, rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, - use_simd_load_system, particle) + use_aligned_load_system, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv`, # to reduce the number of memory writes. 
@@ -64,7 +65,7 @@ function interact!(dv, v_particle_system, u_particle_system, m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) (v_b, rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, - use_simd_load_neighbor, neighbor) + use_aligned_load_neighbor, neighbor) # The following call is equivalent to # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` @@ -153,48 +154,47 @@ end # Optimized version for WCSPH with `ContinuityDensity` in 3D, # which combines the velocity and density load into one wide load. # This is significantly faster on GPUs than the 4 individual loads of `extract_svector`. -# WARNING: this requires that the pointer of `v` is aligned to the size -# of `SIMD.Vec{4, eltype(v)}`, which is checked by `use_simd_load`. -@inline function velocity_and_density(v, system, ::Val{true}, particle) - # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column - # of `v` corresponding to `particle`. - # Note that this doesn't work for 2D because it requires a stride of 2^n. - vrho_particle = SIMD.vloada(SIMD.Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1)) +# WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))`, +# which is checked by `use_aligned_vrho_load`. +# Only call this function after checking `use_aligned_vrho_load` to avoid +# segmentation faults from illegal accesses. 
+@propagate_inbounds function velocity_and_density(v, system, ::Val{true}, particle) + vrho_particle = extract_svector_aligned(v, Val(4), particle) # The columns of `v` are ordered as (v_x, v_y, v_z, rho) - v1, v2, v3, rho = Tuple(vrho_particle) - v_particle = SVector(v1, v2, v3) + v..., rho = Tuple(vrho_particle) + v_particle = SVector(v) return v_particle, rho end -# By default, don't use SIMD loads -use_simd_load(v, system) = false +# By default, don't use aligned loads +use_aligned_vrho_load(v, system) = false -function use_simd_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3}) - use_simd_load(v, system, system.density_calculator) +function use_aligned_vrho_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3}) + use_aligned_vrho_load(v, system, system.density_calculator) end -use_simd_load(v, system, density_calculator) = false +use_aligned_vrho_load(v, system, density_calculator) = false -# Only use SIMD loads when all of these conditions are satisfied: +# Only use aligned loads when all of these conditions are satisfied: # - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4. -# - We are on a GPU, where the SIMD load gives a significant speedup. -# - The velocity array is aligned for SIMD loads, which requires that the pointer of `v` -# is aligned to the size of `SIMD.Vec{4, eltype(v)}`. -# Otherwise, we cannot use `vloada`, which is an *aligned* SIMD load. +# - We are on a GPU, where the aligned load gives a significant speedup. +# - The velocity array is aligned for aligned loads, which requires that the pointer of `v` +# is aligned to `4 * sizeof(eltype(v))` +# Otherwise, we cannot use `vloada`, which is an *aligned* load. # The unaligned version `vload` does not produce wide load instructions on GPUs. -# In this last case, we don't fall back to the non-SIMD version and throw an error instead. 
-function use_simd_load(v::AbstractGPUArray, system, ::ContinuityDensity) - aligned = is_aligned(pointer(v), SIMD.Vec{4, eltype(v)}) - - if !aligned +function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity) + if !can_use_aligned_load(v, 4) + # If aligned loads are possible for the problem, but not allowed due to alignment, + # we don't fall back to the non-SIMD version and throw an error instead. + # This is likely a configuration error (see the error message below), and notifying + # the user is better than silently falling back to slower loads and thus + # non-deterministic performance. error("on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " * "first systems in the semidiscretization to ensure that their integration " * "arrays are aligned for SIMD loads.") end - return aligned + return true end - -is_aligned(ptr, ::Type{SIMD.Vec{N, T}}) where {N, T} = UInt(ptr) % (N * sizeof(T)) == 0 From 48350ea59eb0607270e353d79560e91e15f8870b Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 17 Apr 2026 14:44:39 +0200 Subject: [PATCH 05/22] Automatically add padding to make aligned loads possible. 
--- src/general/semidiscretization.jl | 15 +++++++++++++-- src/schemes/fluid/weakly_compressible_sph/rhs.jl | 12 ++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 6f8f59197e..cb05b4e0b7 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -94,10 +94,21 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_u = [u_nvariables(system) * n_integrated_particles(system) for system in systems] - ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i]) - for i in eachindex(sizes_u)) sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] + + # Align sizes to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the integration arrays, which can + # significantly improve performance on GPUs. Performance benefits on CPUs remain + # to be investigated. + for i in eachindex(systems) + block_size = div(64, sizeof(eltype(systems[i]))) + sizes_u[i] = div(sizes_u[i], block_size, RoundUp) * block_size + sizes_v[i] = div(sizes_v[i], block_size, RoundUp) * block_size + end + + ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i]) + for i in eachindex(sizes_u)) ranges_v = Tuple((sum(sizes_v[1:(i - 1)]) + 1):sum(sizes_v[1:i]) for i in eachindex(sizes_v)) diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index 2c9c5487e5..a44adaef9b 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -186,14 +186,10 @@ use_aligned_vrho_load(v, system, density_calculator) = false # The unaligned version `vload` does not produce wide load instructions on GPUs. 
function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity) if !can_use_aligned_load(v, 4) - # If aligned loads are possible for the problem, but not allowed due to alignment, - # we don't fall back to the non-SIMD version and throw an error instead. - # This is likely a configuration error (see the error message below), and notifying - # the user is better than silently falling back to slower loads and thus - # non-deterministic performance. - error("on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " * - "first systems in the semidiscretization to ensure that their integration " * - "arrays are aligned for SIMD loads.") + # Aligned loads should always be possible on GPUs because the slices of `v_ode` + # are aligned to 64 bytes in `Semidiscretization` and arrays on GPUs are always + # aligned to full pages. + error("illegal alignment of `v` integration array. Please report this issue.") end return true From 65b309a76e617e413ca903d31a1258f349e95e42 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 12:36:52 +0200 Subject: [PATCH 06/22] Fix tests --- src/general/semidiscretization.jl | 41 ++++++++++++++++++------------ test/general/semidiscretization.jl | 14 ++++++---- 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index cb05b4e0b7..f9a35562f5 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -97,20 +97,25 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] - # Align sizes to 64 bytes by adding padding if necessary. - # This ensures that aligned loads can be used on the integration arrays, which can - # significantly improve performance on GPUs. Performance benefits on CPUs remain - # to be investigated. 
+ start_u = 1 + start_v = 1 + ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems)) + ranges_v_vec = Vector{UnitRange{Int}}(undef, length(systems)) for i in eachindex(systems) + ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1) + ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1) + + # Align sizes to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the integration arrays, which can + # significantly improve performance on GPUs. Performance benefits on CPUs remain + # to be investigated. block_size = div(64, sizeof(eltype(systems[i]))) - sizes_u[i] = div(sizes_u[i], block_size, RoundUp) * block_size - sizes_v[i] = div(sizes_v[i], block_size, RoundUp) * block_size + start_u += div(sizes_u[i], block_size, RoundUp) * block_size + start_v += div(sizes_v[i], block_size, RoundUp) * block_size end - ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i]) - for i in eachindex(sizes_u)) - ranges_v = Tuple((sum(sizes_v[1:(i - 1)]) + 1):sum(sizes_v[1:i]) - for i in eachindex(sizes_v)) + ranges_u = Tuple(ranges_u_vec) + ranges_v = Tuple(ranges_v_vec) # Create a n x n matrix of n neighborhood searches for each of the n systems. # We will need one neighborhood search for each pair of systems. @@ -256,13 +261,13 @@ function semidiscretize(semi, tspan; reset_threads=true) Polyester.reset_threads!() end - sizes_u = (u_nvariables(system) * n_integrated_particles(system) for system in systems) - sizes_v = (v_nvariables(system) * n_integrated_particles(system) for system in systems) + size_u_ode = semi.ranges_u[end].stop + size_v_ode = semi.ranges_v[end].stop # Use either the specified backend, e.g., `CUDABackend` or `MetalBackend` or # use CPU vectors for all CPU backends. 
- u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, sum(sizes_u)) - v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v)) + u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, size_u_ode) + v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, size_v_ode) if semi.parallelization_backend isa KernelAbstractions.GPU u0_ode = u0_ode_ @@ -277,6 +282,9 @@ function semidiscretize(semi, tspan; reset_threads=true) parallelization_backend=semi.parallelization_backend) end + u0_ode .= 0 + v0_ode .= 0 + # Set initial condition foreach_system_wrapped(semi, v0_ode, u0_ode) do system, v0_system, u0_system write_u0!(u0_system, system) @@ -367,8 +375,9 @@ end range = ranges_v[system_indices(system, semi)] @boundscheck begin - if length(range) != v_nvariables(system) * n_integrated_particles(system) - throw(DimensionMismatch("`v_ode` range length $range_length does not match " * + expected = v_nvariables(system) * n_integrated_particles(system) + if length(range) != expected + throw(DimensionMismatch("`v_ode` range length $(length(range)) does not match " * "expected number of entries $expected")) end end diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 3c2dcbeb58..b816dd26da 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -8,6 +8,7 @@ system2 = System2() Base.eltype(::System1) = Float64 + Base.eltype(::System2) = Float64 TrixiParticles.coordinates_eltype(::System1) = Float32 TrixiParticles.u_nvariables(::System1) = 3 TrixiParticles.u_nvariables(::System2) = 4 @@ -24,9 +25,12 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - # Verification - @test semi.ranges_u == (1:6, 7:18) - @test semi.ranges_v == (1:6, 7:12) + # Verification: These are the ranges that we would expect based on system sizes: + # semi.ranges_u == (1:6, 7:18) + # semi.ranges_v == (1:6, 7:12) + # Due to alignment to 64 bytes, the 
ranges are adjusted to be: + @test semi.ranges_u == (1:6, 9:20) + @test semi.ranges_v == (1:6, 9:14) nhs = [TrixiParticles.TrivialNeighborhoodSearch{3}(search_radius=0.2, eachpoint=1:2) @@ -152,8 +156,8 @@ semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - dv_ode = zeros(3 * 2 + 2 * 3) - du_ode = zeros(3 * 2 + 4 * 3) + dv_ode = zeros(semi.ranges_v[end].stop) + du_ode = zeros(semi.ranges_u[end].stop) u_ode = zero(du_ode) v1 = [1.0 2.0 From b5fcd699a6fa02423fdb69d93253443ff2f2bc97 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 12:46:02 +0200 Subject: [PATCH 07/22] Update comment --- src/general/semidiscretization.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index f9a35562f5..73ab75c256 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -107,8 +107,7 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; # Align sizes to 64 bytes by adding padding if necessary. # This ensures that aligned loads can be used on the integration arrays, which can - # significantly improve performance on GPUs. Performance benefits on CPUs remain - # to be investigated. + # significantly improve performance on GPUs. 
block_size = div(64, sizeof(eltype(systems[i]))) start_u += div(sizes_u[i], block_size, RoundUp) * block_size start_v += div(sizes_v[i], block_size, RoundUp) * block_size From a4d618caeed15259008400e0507f890da594b05e Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 13:07:57 +0200 Subject: [PATCH 08/22] Only align integration arrays on the GPU --- src/general/semidiscretization.jl | 15 ++++++++++---- test/examples/gpu.jl | 33 ++++++++++++++++++++++++++++++ test/general/semidiscretization.jl | 9 +++----- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 73ab75c256..5e10b093ed 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -97,6 +97,17 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] + ELTYPE = eltype(first(systems)) + if parallelization_backend isa KernelAbstractions.GPU + # Align ranges to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the wrapped integration arrays, + # which can significantly improve performance on GPUs. + block_size = div(64, sizeof(ELTYPE)) + else + # There is no performance benefit to aligning ranges for CPU backends. + block_size = 1 + end + start_u = 1 start_v = 1 ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems)) @@ -105,10 +116,6 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1) ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1) - # Align sizes to 64 bytes by adding padding if necessary. - # This ensures that aligned loads can be used on the integration arrays, which can - # significantly improve performance on GPUs. 
- block_size = div(64, sizeof(eltype(systems[i]))) start_u += div(sizes_u[i], block_size, RoundUp) * block_size start_v += div(sizes_v[i], block_size, RoundUp) * block_size end diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 91cf036079..f1b8b92ed9 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -175,6 +175,39 @@ end end end +@trixi_testset "Semidiscretization vector alignment" begin + # Mock systems + struct System1 <: TrixiParticles.AbstractSystem{3} end + struct System2 <: TrixiParticles.AbstractSystem{3} end + + system1 = System1() + system2 = System2() + + Base.eltype(::System1) = Float64 + Base.eltype(::System2) = Float64 + TrixiParticles.u_nvariables(::System1) = 3 + TrixiParticles.u_nvariables(::System2) = 4 + TrixiParticles.v_nvariables(::System1) = 3 + TrixiParticles.v_nvariables(::System2) = 2 + TrixiParticles.nparticles(::System1) = 2 + TrixiParticles.nparticles(::System2) = 3 + + TrixiParticles.compact_support(::System1, neighbor) = 0.2 + TrixiParticles.compact_support(::System2, neighbor) = 0.2 + + @testset verbose=true "Constructor" begin + semi = Semidiscretization(system1, system2, neighborhood_search=nothing, + parallelization_backend=Main.parallelization_backend) + + # These are the ranges that we would expect on the CPU: + # semi.ranges_u == (1:6, 7:18) + # semi.ranges_v == (1:6, 7:12) + # Due to alignment to 64 bytes, the ranges are adjusted to be: + @test semi.ranges_u == (1:6, 9:20) + @test semi.ranges_v == (1:6, 9:14) + end +end + @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index b816dd26da..52aeb3718c 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -25,12 +25,9 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) 
- # Verification: These are the ranges that we would expect based on system sizes: - # semi.ranges_u == (1:6, 7:18) - # semi.ranges_v == (1:6, 7:12) - # Due to alignment to 64 bytes, the ranges are adjusted to be: - @test semi.ranges_u == (1:6, 9:20) - @test semi.ranges_v == (1:6, 9:14) + # Verification + semi.ranges_u == (1:6, 7:18) + semi.ranges_v == (1:6, 7:12) nhs = [TrixiParticles.TrivialNeighborhoodSearch{3}(search_radius=0.2, eachpoint=1:2) From 81849b39ab5fd85a5493506aed654af7c774e4e0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 13:49:59 +0200 Subject: [PATCH 09/22] Use aligned loads for TLSPH 2D matrices --- src/general/abstract_system.jl | 41 +++------------ src/general/gpu.jl | 20 +++---- .../fluid/weakly_compressible_sph/rhs.jl | 4 ++ .../structure/total_lagrangian_sph/rhs.jl | 52 +++++++++++++++++-- 4 files changed, 69 insertions(+), 48 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 0a5f61170a..23135da628 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -72,42 +72,13 @@ end return extract_smatrix(A, Val(ndims(system)), particle) end -@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS} - @boundscheck checkbounds(A, NDIMS, NDIMS, particle) +@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N} + @boundscheck checkbounds(A, N, N, i) + # This function assumes that the first two dimensions of `A` have exactly the size `N`. + @boundscheck @assert stride(A, 3) == N^2 - # Extract the matrix elements for this particle as a tuple to pass to SMatrix - return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1, - div(i - 1, NDIMS) + 1, - particle]), - Val(NDIMS^2))) -end - -# Optimized version for 2D, which uses SIMD.jl to combine the 4 loads of the 2x2 matrix -# into a single wide load. 
This is significantly faster on GPUs than 4 individual loads. -# WARNING: -# 1. This only works if the matrix elements are stored contiguously in memory. -# The 4 elements of the 2x2 matrix for a particle must immediately follow the -# 4 elements of the previous particle's matrix, with no padding in between. -# 2. The pointer of `A` must be aligned to the size of `Vec{4, eltype(A)}`. -# This is guaranteed if `A` is allocated by Julia and has the correct size, -# but may not be true if `A` is a view or subarray. -@propagate_inbounds function extract_smatrix_aligned(A, system, particle) - return extract_smatrix_aligned(A, Val(ndims(system)), particle) -end - -@inline function extract_smatrix_aligned(A, ::Val{2}, particle) - @boundscheck checkbounds(A, 2, 2, particle) - - # Note that this doesn't work in 3D because it requires a stride of 2^n. - x = SIMD.vloada(SIMD.Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1)) - - return SMatrix{2, 2}(Tuple(x)) -end - -# Fall back to the generic version when not in 2D. -@propagate_inbounds function extract_smatrix_aligned(A, ::Val{NDIMS}, - particle) where {NDIMS} - return extract_smatrix(A, Val(NDIMS), particle) + # Extract the matrix elements for this `i` as a tuple to pass to SMatrix + return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[N^2 * (i - 1) + j]), Val(N^2))) end # Specifically get the current coordinates of a particle for all system types. diff --git a/src/general/gpu.jl b/src/general/gpu.jl index d6ddf62501..5cf7082588 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -55,6 +55,10 @@ function can_use_aligned_load(A, n) return is_aligned && has_stride end +@propagate_inbounds function extract_svector_aligned(A, system::AbstractSystem, particle) + return extract_svector_aligned(A, Val(ndims(system)), particle) +end + # For N = 2 and N = 4, use the aligned vector loads. 
@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix, val_n::Union{Val{2}, Val{4}}, i) @@ -69,28 +73,26 @@ end # This function only works for N = 2 and N = 4, because it requires a stride of 2^n. @inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N} @boundscheck checkbounds(A, 1:N, i) - - # This function assumes alignment of the data, which means that the columns of `A` - # have exactly the size `N` and no padding between them. - @boundscheck @assert stride(A, 2) == N + @boundscheck can_use_aligned_load(A, N) vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1)) return SVector{N}(Tuple(vec)) end +@propagate_inbounds function extract_smatrix_aligned(A, system::AbstractSystem, particle) + return extract_smatrix_aligned(A, Val(ndims(system)), particle) +end + # For general N, fall back to the regular `extract_smatrix`. -@propagate_inbounds function extract_matrix_aligned(A, val_n, i) +@propagate_inbounds function extract_smatrix_aligned(A, val_n, i) return extract_smatrix(A, val_n, i) end # This function only works for 2x2 matrices, because it requires a stride of 2^n. @inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} @boundscheck checkbounds(A, 2, 2, i) - - # This function assumes alignment of the data, which means that the first two of `A` - # have exactly the size `2` and no padding between them. 
- @boundscheck @assert stride(A, 3) == 4 + @boundscheck can_use_aligned_load(A, 4) vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index a44adaef9b..c4d969eaef 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -37,6 +37,8 @@ function interact!(dv, v_particle_system, u_particle_system, # In 3D, this function can combine velocity and density load into one wide load, # which gives a significant speedup on GPUs. + # Note that we can only safely use `@inbounds` after checking alignment + # with `use_aligned_vrho_load` before the `@threaded` loop. (v_a, rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, use_aligned_load_system, particle) @@ -63,6 +65,8 @@ function interact!(dv, v_particle_system, u_particle_system, # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) + # Note that we can only safely use `@inbounds` after checking alignment + # with `use_aligned_vrho_load` before the `@threaded` loop. (v_b, rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, use_aligned_load_neighbor, neighbor) diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index fa000ef38f..5f964bd60d 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -31,15 +31,19 @@ end h = initial_smoothing_length(system) almostzero = sqrt(eps(h^2)) + # Check alignment of deformation gradient and `pk1_rho2` arrays before the `@threaded` + # loop to be able to use aligned loads safely inside the loop. 
+ use_aligned_matrix_load_ = Val(use_aligned_matrix_load(system)) + @threaded semi for particle in each_integrated_particle(system) # We are looping over the particles of `system`, so it is guaranteed # that `particle` is in bounds of `system`. m_a = @inbounds system.mass[particle] rho_a = @inbounds system.material_density[particle] # PK1 / rho^2 - pk1_rho2_a = @inbounds pk1_rho2(system, particle) + pk1_rho2_a = @inbounds pk1_rho2(system, use_aligned_matrix_load_, particle) current_coords_a = @inbounds current_coords(system, particle) - F_a = @inbounds deformation_gradient(system, particle) + F_a = @inbounds deformation_gradient(system, use_aligned_matrix_load_, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv` # to reduce the number of memory writes. @@ -64,11 +68,12 @@ end rho_b = @inbounds system.material_density[neighbor] m_b = @inbounds system.mass[neighbor] # PK1 / rho^2 - pk1_rho2_b = @inbounds pk1_rho2(system, neighbor) + pk1_rho2_b = @inbounds pk1_rho2(system, use_aligned_matrix_load_, neighbor) current_coords_b = @inbounds current_coords(system, neighbor) # The compiler is smart enough to optimize this away if no penalty force is used - F_b = @inbounds deformation_gradient(system, neighbor) + F_b = @inbounds deformation_gradient(system, use_aligned_matrix_load_, + neighbor) current_pos_diff_ = current_coords_a - current_coords_b # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference @@ -120,3 +125,42 @@ function interact!(dv, v_particle_system, u_particle_system, # TODO continuity equation? return dv end + +function use_aligned_matrix_load(system) + return use_aligned_matrix_load(system.deformation_grad, system.pk1_rho2) +end + +function use_aligned_matrix_load(deformation_grad::AbstractGPUArray, + pk1_rho2::AbstractGPUArray) + # Aligned loads should always be possible on GPUs because GPU arrays are always aligned + # to full pages, and these arrays are not slices of larger arrays. 
+ if !can_use_aligned_load(deformation_grad, 4) + error("illegal alignment of deformation gradient array. Please report this issue.") + end + if !can_use_aligned_load(pk1_rho2, 4) + error("illegal alignment of `pk1_rho2` array. Please report this issue.") + end + + return true +end + +# Don't use aligned vector loads on the CPU +use_aligned_matrix_load(deformation_grad, pk1_rho2) = false + +# Aligned vector load versions for deformation gradient and `pk1_rho2`. +# These are only used on GPUs, which is checked by `use_aligned_matrix_load`. +@propagate_inbounds function pk1_rho2(system, ::Val{true}, particle) + return extract_smatrix_aligned(system.pk1_rho2, system, particle) +end + +@propagate_inbounds function pk1_rho2(system, ::Val{false}, particle) + return pk1_rho2(system, particle) +end + +@propagate_inbounds function deformation_gradient(system, ::Val{true}, particle) + return extract_smatrix_aligned(system.deformation_grad, system, particle) +end + +@propagate_inbounds function deformation_gradient(system, ::Val{false}, particle) + return deformation_gradient(system, particle) +end From 78f39fe51f96ac4e9effd80f66e8cf0273167f78 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 14:11:37 +0200 Subject: [PATCH 10/22] Add and fix tests --- src/general/abstract_system.jl | 5 +- test/examples/gpu.jl | 94 +++++++++++++++++++++------------- test/general/util.jl | 20 ++++++++ 3 files changed, 81 insertions(+), 38 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 23135da628..dadd754d23 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -75,7 +75,10 @@ end @inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N} @boundscheck checkbounds(A, N, N, i) # This function assumes that the first two dimensions of `A` have exactly the size `N`. 
- @boundscheck @assert stride(A, 3) == N^2 + @boundscheck if stride(A, 3) != N^2 + error("extract_smatrix only works for 3D arrays where the first two dimensions " * + "have size N") + end # Extract the matrix elements for this `i` as a tuple to pass to SMatrix return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[N^2 * (i - 1) + j]), Val(N^2))) diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index f1b8b92ed9..621e483463 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -84,35 +84,35 @@ end types = [Float32] end - @testset verbose=true "use_simd_load $T" for T in types + @testset verbose=true "use_aligned_vrho_load $T" for T in types # Aligned array x = zeros(T, 16) # Test different "systems" and density calculators - @test !TrixiParticles.use_simd_load(x, nothing) - @test !TrixiParticles.use_simd_load(x, (; density_calculator=ContinuityDensity())) - @test !TrixiParticles.use_simd_load(x, nothing, SummationDensity()) + @test !TrixiParticles.use_aligned_vrho_load(x, nothing) + @test !TrixiParticles.use_aligned_vrho_load(x, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(x, nothing, SummationDensity()) - # No SIMD load on the CPU - @test !TrixiParticles.use_simd_load(x, nothing, ContinuityDensity()) + # No aligned load on the CPU + @test !TrixiParticles.use_aligned_vrho_load(x, nothing, ContinuityDensity()) y = Adapt.adapt(parallelization_backend, x) # Test different "systems" and density calculators - @test !TrixiParticles.use_simd_load(y, nothing) - @test !TrixiParticles.use_simd_load(y, (; density_calculator=ContinuityDensity())) - @test !TrixiParticles.use_simd_load(y, nothing, SummationDensity()) + @test !TrixiParticles.use_aligned_vrho_load(y, nothing) + @test !TrixiParticles.use_aligned_vrho_load(y, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(y, nothing, SummationDensity()) - # Use SIMD load on the GPU with 4-aligned arrays - @test 
TrixiParticles.use_simd_load(y, nothing, ContinuityDensity()) - @test TrixiParticles.use_simd_load(view(y, 5:16), nothing, ContinuityDensity()) + # Use aligned load on the GPU with 4-aligned arrays + @test TrixiParticles.use_aligned_vrho_load(y, nothing, ContinuityDensity()) + @test TrixiParticles.use_aligned_vrho_load(view(y, 5:16), nothing, ContinuityDensity()) # Unaligned array on the GPU should throw an error - @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 2:16), nothing, + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 2:16), nothing, ContinuityDensity()) - @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 3:16), nothing, + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 3:16), nothing, ContinuityDensity()) - @test_throws "on GPUs in 3D" TrixiParticles.use_simd_load(view(y, 4:16), nothing, + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 4:16), nothing, ContinuityDensity()) end @@ -127,7 +127,9 @@ end system = MockSystem() @inline TrixiParticles.current_density(v, ::MockSystem, i) = v[4, i] - # Test that the SIMD version is consistent with the non-SIMD version. + @test TrixiParticles.use_aligned_vrho_load(y, system, ContinuityDensity()) + + # Test that the aligned version is consistent with the non-aligned version. # We have 4 particles (with 4 values per particles). # In order to test this on the GPU, we need to use a kernel with `@threaded`. 
result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) @@ -139,7 +141,7 @@ end end end -@testset verbose=true "extract_smatrix_aligned $TRIXIPARTICLES_TEST_" begin +@testset verbose=true "extract_svector_aligned $TRIXIPARTICLES_TEST_" begin if supports_double_precision types = [Float64, Float32] else @@ -147,30 +149,48 @@ end end @testset verbose=true "$T" for T in types - @testset verbose=true "$(N)D" for N in 2:3 - @testset "CPU" begin - A = rand(T, N, N, 4) - - # Test that the SIMD version is consistent with the non-SIMD version. - for i in 1:4 - @test TrixiParticles.extract_smatrix_aligned(A, Val(N), i) == - TrixiParticles.extract_smatrix(A, Val(N), i) - end + @testset verbose=true "$(N)D" for N in 1:6 + A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) + val = Val(N) + + @test TrixiParticles.can_use_aligned_load(A, N) + @test !TrixiParticles.can_use_aligned_load(view(A, 2:length(A)), N) + + # Test that the aligned version is consistent with the non-aligned version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_svector_aligned(A, val, i) == + TrixiParticles.extract_svector(A, val, i) end + @test all(result) + end + end +end - @testset "GPU" begin - A = Adapt.adapt(parallelization_backend, rand(T, N, N, 4)) - val = Val(N) +@testset verbose=true "extract_smatrix_aligned $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end - # Test that the SIMD version is consistent with the non-SIMD version. - # In order to test this on the GPU, we need to use a kernel with `@threaded`. 
- result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) - TrixiParticles.@threaded parallelization_backend for i in 1:4 - result[i] = TrixiParticles.extract_smatrix_aligned(A, val, i) == - TrixiParticles.extract_smatrix(A, val, i) - end - @test all(result) + @testset verbose=true "$T" for T in types + @testset verbose=true "$(N)D" for N in 2:3 + A = Adapt.adapt(parallelization_backend, rand(T, N, N, 4)) + val = Val(N) + + @test TrixiParticles.can_use_aligned_load(A, N) + @test !TrixiParticles.can_use_aligned_load(view(A, 2:length(A)), N) + + # Test that the aligned version is consistent with the non-aligned version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_smatrix_aligned(A, val, i) == + TrixiParticles.extract_smatrix(A, val, i) end + @test all(result) end end end diff --git a/test/general/util.jl b/test/general/util.jl index 4aefddd4e3..7441fdb7f0 100644 --- a/test/general/util.jl +++ b/test/general/util.jl @@ -1,3 +1,23 @@ +@testset verbose=true "extract_smatrix" begin + A = Float64.(collect(reshape(1:12, 2, 2, 3))) + @test TrixiParticles.extract_smatrix(A, Val(2), 1) == A[:, :, 1] + @test TrixiParticles.extract_smatrix(A, Val(2), 2) == A[:, :, 2] + @test TrixiParticles.extract_smatrix(A, Val(2), 3) == A[:, :, 3] + @test_throws "extract_smatrix only works" TrixiParticles.extract_smatrix(A, Val(1), 1) + @test_throws "BoundsError" TrixiParticles.extract_smatrix(A, Val(3), 1) +end + +@testset verbose=true "extract_svector" begin + A = Float64.(collect(reshape(1:9, 3, 3))) + @test TrixiParticles.extract_svector(A, Val(3), 1) == A[:, 1] + @test TrixiParticles.extract_svector(A, Val(3), 2) == A[:, 2] + @test TrixiParticles.extract_svector(A, Val(3), 3) == A[:, 3] + @test TrixiParticles.extract_svector(A, Val(2), 1) == A[1:2, 1] + @test 
TrixiParticles.extract_svector(A, Val(2), 2) == A[1:2, 2] + @test TrixiParticles.extract_svector(A, Val(2), 3) == A[1:2, 3] + @test_throws "BoundsError" TrixiParticles.extract_svector(A, Val(4), 1) +end + @testset verbose=true "ThreadedBroadcastArray" begin A = TrixiParticles.ThreadedBroadcastArray(ones(3, 3)) B = ones(3, 3) From 180cc6052946c4151c98434f2ca619ca24ec5d69 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 14:12:13 +0200 Subject: [PATCH 11/22] Reformat --- test/examples/gpu.jl | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 621e483463..20b6025070 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -90,7 +90,9 @@ end # Test different "systems" and density calculators @test !TrixiParticles.use_aligned_vrho_load(x, nothing) - @test !TrixiParticles.use_aligned_vrho_load(x, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(x, + (; + density_calculator=ContinuityDensity())) @test !TrixiParticles.use_aligned_vrho_load(x, nothing, SummationDensity()) # No aligned load on the CPU @@ -100,20 +102,26 @@ end # Test different "systems" and density calculators @test !TrixiParticles.use_aligned_vrho_load(y, nothing) - @test !TrixiParticles.use_aligned_vrho_load(y, (; density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(y, + (; + density_calculator=ContinuityDensity())) @test !TrixiParticles.use_aligned_vrho_load(y, nothing, SummationDensity()) # Use aligned load on the GPU with 4-aligned arrays @test TrixiParticles.use_aligned_vrho_load(y, nothing, ContinuityDensity()) - @test TrixiParticles.use_aligned_vrho_load(view(y, 5:16), nothing, ContinuityDensity()) + @test TrixiParticles.use_aligned_vrho_load(view(y, 5:16), nothing, + ContinuityDensity()) # Unaligned array on the GPU should throw an error - @test_throws "illegal 
alignment" TrixiParticles.use_aligned_vrho_load(view(y, 2:16), nothing, - ContinuityDensity()) - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 3:16), nothing, - ContinuityDensity()) - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 4:16), nothing, - ContinuityDensity()) + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 2:16), + nothing, + ContinuityDensity()) + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 3:16), + nothing, + ContinuityDensity()) + @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 4:16), + nothing, + ContinuityDensity()) end @testset "velocity_and_density $T" for T in types From 49df3727da11a48cb264dd045deacaa6f3eccbe1 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 16:19:34 +0200 Subject: [PATCH 12/22] Fix tests --- src/general/abstract_system.jl | 9 +++--- src/general/gpu.jl | 10 ++++--- test/examples/gpu.jl | 50 +++++++++++++++++++++------------- 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index dadd754d23..f0c5809285 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -68,16 +68,17 @@ end end # Return `A[:, :, i]` as an `SMatrix`. -@propagate_inbounds function extract_smatrix(A, system, particle) - return extract_smatrix(A, Val(ndims(system)), particle) +@propagate_inbounds function extract_smatrix(A, system, i) + return extract_smatrix(A, Val(ndims(system)), i) end @inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N} @boundscheck checkbounds(A, N, N, i) # This function assumes that the first two dimensions of `A` have exactly the size `N`. 
@boundscheck if stride(A, 3) != N^2 - error("extract_smatrix only works for 3D arrays where the first two dimensions " * - "have size N") + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") end # Extract the matrix elements for this `i` as a tuple to pass to SMatrix diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 5cf7082588..803535b198 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -46,16 +46,18 @@ end # throw an error instead of silently (and non-deterministically) falling back to slower # regular loads. function can_use_aligned_load(A, n) + # Check that `n` is a power of 2, which is a requirement for aligned loads. + is_power_of_2 = n > 0 && (n & (n - 1)) == 0 # Check if the beginning of `A` is aligned to the size of the vector we want to load. is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0 # Check if the stride of `A` in the last dimension is equal to `n`, # which means that there is no padding between the vectors we want to load. has_stride = stride(A, ndims(A)) == n - return is_aligned && has_stride + return is_power_of_2 && is_aligned && has_stride end -@propagate_inbounds function extract_svector_aligned(A, system::AbstractSystem, particle) +@propagate_inbounds function extract_svector_aligned(A, system, particle) return extract_svector_aligned(A, Val(ndims(system)), particle) end @@ -80,12 +82,12 @@ end return SVector{N}(Tuple(vec)) end -@propagate_inbounds function extract_smatrix_aligned(A, system::AbstractSystem, particle) +@propagate_inbounds function extract_smatrix_aligned(A, system, particle) return extract_smatrix_aligned(A, Val(ndims(system)), particle) end # For general N, fall back to the regular `extract_smatrix`. 
-@propagate_inbounds function extract_smatrix_aligned(A, val_n, i) +@propagate_inbounds function extract_smatrix_aligned(A, val_n::Val, i) return extract_smatrix(A, val_n, i) end diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 20b6025070..9437731021 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -86,7 +86,7 @@ end @testset verbose=true "use_aligned_vrho_load $T" for T in types # Aligned array - x = zeros(T, 16) + x = zeros(T, 4, 4) # Test different "systems" and density calculators @test !TrixiParticles.use_aligned_vrho_load(x, nothing) @@ -109,19 +109,21 @@ end # Use aligned load on the GPU with 4-aligned arrays @test TrixiParticles.use_aligned_vrho_load(y, nothing, ContinuityDensity()) - @test TrixiParticles.use_aligned_vrho_load(view(y, 5:16), nothing, - ContinuityDensity()) + @test TrixiParticles.use_aligned_vrho_load(TrixiParticles.wrap_array(y, 5:16, + (4, 3)), + nothing, ContinuityDensity()) # Unaligned array on the GPU should throw an error - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 2:16), - nothing, - ContinuityDensity()) - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 3:16), - nothing, - ContinuityDensity()) - @test_throws "illegal alignment" TrixiParticles.use_aligned_vrho_load(view(y, 4:16), - nothing, - ContinuityDensity()) + str = "illegal alignment" + y2 = TrixiParticles.wrap_array(y, 2:13, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y2, nothing, + ContinuityDensity()) + y3 = TrixiParticles.wrap_array(y, 3:14, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y3, nothing, + ContinuityDensity()) + y4 = TrixiParticles.wrap_array(y, 4:15, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y4, nothing, + ContinuityDensity()) end @testset "velocity_and_density $T" for T in types @@ -157,12 +159,13 @@ end end @testset verbose=true "$T" for T in types - @testset verbose=true "$(N)D" for N in 1:6 + @testset 
verbose=true "$(N)D" for N in [2, 4, 8] A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) val = Val(N) @test TrixiParticles.can_use_aligned_load(A, N) - @test !TrixiParticles.can_use_aligned_load(view(A, 2:length(A)), N) + slice = TrixiParticles.wrap_array(A, 2:(3 * N + 1), (N, 3)) + @test !TrixiParticles.can_use_aligned_load(slice, N) # Test that the aligned version is consistent with the non-aligned version. # In order to test this on the GPU, we need to use a kernel with `@threaded`. @@ -173,6 +176,14 @@ end end @test all(result) end + + @testset verbose=true "$(N)D" for N in [3, 5, 6, 7] + A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) + val = Val(N) + + # Aligned loads are only support for powers of two. + @test !TrixiParticles.can_use_aligned_load(A, N) + end end end @@ -184,12 +195,13 @@ end end @testset verbose=true "$T" for T in types - @testset verbose=true "$(N)D" for N in 2:3 - A = Adapt.adapt(parallelization_backend, rand(T, N, N, 4)) - val = Val(N) + @testset verbose=true "2D" begin + A = Adapt.adapt(parallelization_backend, rand(T, 2, 2, 4)) + val = Val(2) - @test TrixiParticles.can_use_aligned_load(A, N) - @test !TrixiParticles.can_use_aligned_load(view(A, 2:length(A)), N) + @test TrixiParticles.can_use_aligned_load(A, 2^2) + slice = TrixiParticles.wrap_array(A, 2:(3 * 4 + 1), (2, 2, 3)) + @test !TrixiParticles.can_use_aligned_load(slice, 2^2) # Test that the aligned version is consistent with the non-aligned version. # In order to test this on the GPU, we need to use a kernel with `@threaded`. 
From 3df4dea8f903940692d7daef2f270b425b4f047a Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Sun, 19 Apr 2026 16:24:59 +0200 Subject: [PATCH 13/22] Fix tests --- test/schemes/structure/total_lagrangian_sph/rhs.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl index ce498b4db2..fa4f98241d 100644 --- a/test/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl @@ -62,14 +62,15 @@ current_coordinates::Any material_density::Any pk1_rho2::Any + deformation_grad::Any mass::Any penalty_force::Any viscosity::Any buffer::Any end @inline Base.eltype(::MockSystem) = Float64 - system = MockSystem(current_coordinates, material_density, pk1_rho2, mass, - nothing, nothing, nothing) + system = MockSystem(current_coordinates, material_density, pk1_rho2, + nothing, mass, nothing, nothing, nothing) function TrixiParticles.initial_coordinates(::MockSystem) return initial_coordinates From 5e9f881aafbf28d9d2570837d31ba7ac7317b594 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 09:32:58 +0200 Subject: [PATCH 14/22] Add test for precompilation warnings. --- test/runtests.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 457e6b8212..916176a017 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,8 @@ +using Test +# Test that precompilation works without warnings (e.g. due to method overwites). +# This is important because method overwrite warnings disable precompilation of the package. 
+@test_nowarn using TrixiParticles + include("test_util.jl") const TRIXIPARTICLES_TEST = lowercase(get(ENV, "TRIXIPARTICLES_TEST", "all")) From 92140497c4d91ca3409152af740187067f92107b Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 09:34:00 +0200 Subject: [PATCH 15/22] Fix --- src/general/gpu.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 803535b198..d28a09931f 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -68,7 +68,7 @@ end end # For other N, fall back to the regular `extract_svector`. -@propagate_inbounds function extract_svector_aligned(A, val_n, i) +@propagate_inbounds function extract_svector_aligned(A, val_n::Val, i) return extract_svector(A, val_n, i) end From f99b95e1640acd50e0cd77c4b0ad0ae31df462e2 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 10:17:23 +0200 Subject: [PATCH 16/22] Try again --- test/runtests.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 916176a017..3df17c81b8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,7 @@ using Test -# Test that precompilation works without warnings (e.g. due to method overwites). +# Test that precompilation works without warnings (e.g. due to method overwrites). # This is important because method overwrite warnings disable precompilation of the package. 
-@test_nowarn using TrixiParticles +@test_nowarn import TrixiParticles include("test_util.jl") From f02bb364de029c6fb923c9e0bbd4c194c5a20487 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:16:34 +0200 Subject: [PATCH 17/22] Fix tests --- test/runtests.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 3df17c81b8..7d60bfc759 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,8 @@ using Test # Test that precompilation works without warnings (e.g. due to method overwrites). # This is important because method overwrite warnings disable precompilation of the package. -@test_nowarn import TrixiParticles +# We need `@eval Main` before Julia 1.12 to avoid `import" expression not at top level`. +@test_nowarn @eval Main import TrixiParticles include("test_util.jl") From d025c45f5aadd59f490345c02b395180578333a4 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:18:12 +0200 Subject: [PATCH 18/22] Always align integration arrays --- src/general/semidiscretization.jl | 15 ++++---------- test/examples/gpu.jl | 33 ------------------------------ test/general/semidiscretization.jl | 9 +++++--- 3 files changed, 10 insertions(+), 47 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 5e10b093ed..7782d42e2f 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -97,17 +97,6 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] - ELTYPE = eltype(first(systems)) - if parallelization_backend isa KernelAbstractions.GPU - # Align ranges to 64 bytes by adding padding if necessary. 
- # This ensures that aligned loads can be used on the wrapped integration arrays, - # which can significantly improve performance on GPUs. - block_size = div(64, sizeof(ELTYPE)) - else - # There is no performance benefit to aligning ranges for CPU backends. - block_size = 1 - end - start_u = 1 start_v = 1 ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems)) @@ -116,6 +105,10 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1) ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1) + # Align ranges to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the wrapped integration arrays, + # which can significantly improve performance on GPUs. + block_size = div(64, sizeof(eltype(systems[i]))) start_u += div(sizes_u[i], block_size, RoundUp) * block_size start_v += div(sizes_v[i], block_size, RoundUp) * block_size end diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 9437731021..09f8e03205 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -215,39 +215,6 @@ end end end -@trixi_testset "Semidiscretization vector alignment" begin - # Mock systems - struct System1 <: TrixiParticles.AbstractSystem{3} end - struct System2 <: TrixiParticles.AbstractSystem{3} end - - system1 = System1() - system2 = System2() - - Base.eltype(::System1) = Float64 - Base.eltype(::System2) = Float64 - TrixiParticles.u_nvariables(::System1) = 3 - TrixiParticles.u_nvariables(::System2) = 4 - TrixiParticles.v_nvariables(::System1) = 3 - TrixiParticles.v_nvariables(::System2) = 2 - TrixiParticles.nparticles(::System1) = 2 - TrixiParticles.nparticles(::System2) = 3 - - TrixiParticles.compact_support(::System1, neighbor) = 0.2 - TrixiParticles.compact_support(::System2, neighbor) = 0.2 - - @testset verbose=true "Constructor" begin - semi = Semidiscretization(system1, system2, neighborhood_search=nothing, - 
parallelization_backend=Main.parallelization_backend) - - # These are the ranges that we would expect on the CPU: - # semi.ranges_u == (1:6, 7:18) - # semi.ranges_v == (1:6, 7:12) - # Due to alignment to 64 bytes, the ranges are adjusted to be: - @test semi.ranges_u == (1:6, 9:20) - @test semi.ranges_v == (1:6, 9:14) - end -end - @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 52aeb3718c..6549e3e0bc 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -25,9 +25,12 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - # Verification - semi.ranges_u == (1:6, 7:18) - semi.ranges_v == (1:6, 7:12) + # These are the ranges that we would expect on the CPU: + # semi.ranges_u == (1:6, 7:18) + # semi.ranges_v == (1:6, 7:12) + # Due to alignment to 64 bytes, the ranges are adjusted to be: + @test semi.ranges_u == (1:6, 9:20) + @test semi.ranges_v == (1:6, 9:14) nhs = [TrixiParticles.TrivialNeighborhoodSearch{3}(search_radius=0.2, eachpoint=1:2) From fb04abf909f178830cad1f4023ed0dc78fb9b623 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:35:17 +0200 Subject: [PATCH 19/22] Use `vload` in 2D on the CPU as well --- src/general/abstract_system.jl | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index f0c5809285..7802318a8a 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -59,12 +59,15 @@ initialize!(system, semi) = system end # Return `A[:, i...]` as an `SVector`. -@inline function extract_svector(A, ::Val{NDIMS}, i...) 
where {NDIMS} +@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS} # Explicit bounds check, which can be removed by calling this function with `@inbounds` @boundscheck checkbounds(A, NDIMS, i...) # Assume inbounds access now return SVector(ntuple(@inline(dim->@inbounds A[dim, i...]), NDIMS)) + # vec = SIMD.vload(SIMD.Vec{NDIMS, eltype(A)}, pointer(A, NDIMS * (i - 1) + 1)) + + return SVector{NDIMS}(Tuple(vec)) end # Return `A[:, :, i]` as an `SMatrix`. @@ -81,8 +84,29 @@ end error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") end - # Extract the matrix elements for this `i` as a tuple to pass to SMatrix - return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[N^2 * (i - 1) + j]), Val(N^2))) + # Extract the matrix elements as a tuple in column-major order, + # and construct an `SMatrix` from it. + return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2))) +end + +@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} + @boundscheck checkbounds(A, 2, 2, i) + # This function assumes that the first two dimensions of `A` have exactly the size `2`. + @boundscheck if stride(A, 3) != 4 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") + end + + # This is the same as + # `SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2)))`, + # but it's slightly faster on CPUs in some cases. As opposed to the GPU-optimized + # version in `extract_smatrix_aligned`, we use `vload` and not `vloada` here, which + # does not require alignment and works if N^2 is not a power of 2. + # This is faster in the TLSPH RHS in 2D, but slower in 3D for some reason. + # See the benchmarks in https://github.com/trixi-framework/TrixiParticles.jl/pull/1147. 
+ vec = SIMD.vload(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) + return SMatrix{2, 2}(Tuple(vec)) end # Specifically get the current coordinates of a particle for all system types. From 4904645ff96ed6672f33a9826df5700d480ea2f0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:37:18 +0200 Subject: [PATCH 20/22] Add comment --- src/schemes/structure/total_lagrangian_sph/rhs.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index 5f964bd60d..e32315959c 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -144,7 +144,10 @@ function use_aligned_matrix_load(deformation_grad::AbstractGPUArray, return true end -# Don't use aligned vector loads on the CPU +# Don't use aligned vector loads on the CPU. For large arrays, alignment to 32 bytes +# (4 * Float64) is usually given, but it is not guaranteed, as Julia only guarantees +# alignment to 16 bytes. However, the non-aligned `vload` used in `extract_smatrix` in 2D +# has the same performance as the aligned `vloada` in `extract_smatrix_aligned` on the CPU. use_aligned_matrix_load(deformation_grad, pk1_rho2) = false # Aligned vector load versions for deformation gradient and `pk1_rho2`. From 3c4bb9fcead2baa12c566959bfa337c510cc38c0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:02:58 +0200 Subject: [PATCH 21/22] Fix --- src/general/abstract_system.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 7802318a8a..faaa1866cf 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -59,13 +59,12 @@ initialize!(system, semi) = system end # Return `A[:, i...]` as an `SVector`. 
-@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS} +@inline function extract_svector(A, ::Val{NDIMS}, i...) where {NDIMS} # Explicit bounds check, which can be removed by calling this function with `@inbounds` @boundscheck checkbounds(A, NDIMS, i...) # Assume inbounds access now return SVector(ntuple(@inline(dim->@inbounds A[dim, i...]), NDIMS)) - # vec = SIMD.vload(SIMD.Vec{NDIMS, eltype(A)}, pointer(A, NDIMS * (i - 1) + 1)) return SVector{NDIMS}(Tuple(vec)) end From 4dd0472dcb2bfbcdda1115f752ce941a3cad253e Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 21 Apr 2026 11:08:52 +0200 Subject: [PATCH 22/22] Add comments --- src/general/abstract_system.jl | 2 -- src/general/gpu.jl | 19 ++++++++++++------- src/general/semidiscretization.jl | 2 ++ test/examples/gpu.jl | 2 +- test/general/semidiscretization.jl | 2 +- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index faaa1866cf..89e49c7334 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -65,8 +65,6 @@ end # Assume inbounds access now return SVector(ntuple(@inline(dim->@inbounds A[dim, i...]), NDIMS)) - - return SVector{NDIMS}(Tuple(vec)) end # Return `A[:, :, i]` as an `SMatrix`. diff --git a/src/general/gpu.jl b/src/general/gpu.jl index d28a09931f..a3306ca410 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -38,13 +38,6 @@ end # This function checks if we can use aligned `SVector` or `SMatrix` loads for the given # array `A` and the size of the vector we want to load `n`. -# -# Note that it is not checked if the size `n` is legal (a power of 2). -# If the size `n` is not legal, the aligned loads will fall back to regular loads. -# This is because a regular load is expected for a problem that doesn't support aligned -# loads (e.g. 
3D matrices of 3x3), whereas an illegal alignment is unexpected and should -# throw an error instead of silently (and non-deterministically) falling back to slower -# regular loads. function can_use_aligned_load(A, n) # Check that `n` is a power of 2, which is a requirement for aligned loads. is_power_of_2 = n > 0 && (n & (n - 1)) == 0 @@ -76,6 +69,12 @@ end @inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N} @boundscheck checkbounds(A, 1:N, i) @boundscheck can_use_aligned_load(A, N) + # This function assumes a stride of N in the last dimension. + @boundscheck if stride(A, 2) != N + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_svector_aligned only works for 2D arrays where the stride in the last dimension is equal to N") + end vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1)) @@ -95,6 +94,12 @@ end @inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} @boundscheck checkbounds(A, 2, 2, i) @boundscheck can_use_aligned_load(A, 4) + # This function assumes that the first two dimensions of `A` have exactly the size `N`. + @boundscheck if stride(A, 3) != 4 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix_aligned only works for 3D arrays where the first two dimensions each have size N") + end vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 7782d42e2f..31a028110b 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -281,6 +281,8 @@ function semidiscretize(semi, tspan; reset_threads=true) parallelization_backend=semi.parallelization_backend) end + # Initialize the arrays with zeros to initialize the padding values for alignment. 
+ # The values that are actually used will be overwritten in `write_u0!` and `write_v0!`. u0_ode .= 0 v0_ode .= 0 diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 09f8e03205..1201a9eeb4 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -131,7 +131,7 @@ end x = rand(T, 4, 4) y = Adapt.adapt(parallelization_backend, x) - # Dummy system that will only be used for `ndims`. + # Dummy system that will only be used for `ndims` and `current_density`. struct MockSystem end Base.ndims(::MockSystem) = 3 system = MockSystem() diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 6549e3e0bc..12624b24ff 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -25,7 +25,7 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - # These are the ranges that we would expect on the CPU: + # These are the ranges that we would expect without alignment padding: # semi.ranges_u == (1:6, 7:18) # semi.ranges_v == (1:6, 7:12) # Due to alignment to 64 bytes, the ranges are adjusted to be: