diff --git a/Project.toml b/Project.toml index 9f17f48c0a..fe5dd24971 100644 --- a/Project.toml +++ b/Project.toml @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -63,6 +64,7 @@ Polyester = "0.7.10" ReadVTK = "0.2" RecipesBase = "1" Reexport = "1" +SIMD = "3.7.2" SciMLBase = "2" StaticArrays = "1" Statistics = "1" diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index 215a2b3bf1..45b94faca3 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -25,6 +25,7 @@ using Random: seed! using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!, get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!, add_tstop! +using SIMD: SIMD @reexport using StaticArrays: SVector using StaticArrays: @SMatrix, SMatrix, setindex using Statistics: Statistics diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 79b52a81df..89e49c7334 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -68,18 +68,42 @@ end end # Return `A[:, :, i]` as an `SMatrix`. 
-@propagate_inbounds function extract_smatrix(A, system, particle) - return extract_smatrix(A, Val(ndims(system)), particle) +@propagate_inbounds function extract_smatrix(A, system, i) + return extract_smatrix(A, Val(ndims(system)), i) end -@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS} - @boundscheck checkbounds(A, NDIMS, NDIMS, particle) +@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N} + @boundscheck checkbounds(A, N, N, i) + # This function assumes that the first two dimensions of `A` have exactly the size `N`. + @boundscheck if stride(A, 3) != N^2 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") + end + + # Extract the matrix elements as a tuple in column-major order, + # and construct an `SMatrix` from it. + return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2))) +end + +@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} + @boundscheck checkbounds(A, 2, 2, i) + # This function assumes that the first two dimensions of `A` have exactly the size `2`. + @boundscheck if stride(A, 3) != 4 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N") + end - # Extract the matrix elements for this particle as a tuple to pass to SMatrix - return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1, - div(i - 1, NDIMS) + 1, - particle]), - Val(NDIMS^2))) + # This is the same as + # `SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2)))`, + # but it's slightly faster on CPUs in some cases. 
As opposed to the GPU-optimized + # version in `extract_smatrix_aligned`, we use `vload` and not `vloada` here, which + # does not require alignment and works if N^2 is not a power of 2. + # This is faster in the TLSPH RHS in 2D, but slower in 3D for some reason. + # See the benchmarks in https://github.com/trixi-framework/TrixiParticles.jl/pull/1147. + vec = SIMD.vload(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) + return SMatrix{2, 2}(Tuple(vec)) end # Specifically get the current coordinates of a particle for all system types. diff --git a/src/general/gpu.jl b/src/general/gpu.jl index f0d145bf1e..a3306ca410 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -35,3 +35,73 @@ end function allocate(backend, ELTYPE, size) return Array{ELTYPE, length(size)}(undef, size) end + +# This function checks if we can use aligned `SVector` or `SMatrix` loads for the given +# array `A` and the size of the vector we want to load `n`. +function can_use_aligned_load(A, n) + # Check that `n` is a power of 2, which is a requirement for aligned loads. + is_power_of_2 = n > 0 && (n & (n - 1)) == 0 + # Check if the beginning of `A` is aligned to the size of the vector we want to load. + is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0 + # Check if the stride of `A` in the last dimension is equal to `n`, + # which means that there is no padding between the vectors we want to load. + has_stride = stride(A, ndims(A)) == n + + return is_power_of_2 && is_aligned && has_stride +end + +@propagate_inbounds function extract_svector_aligned(A, system, particle) + return extract_svector_aligned(A, Val(ndims(system)), particle) +end + +# For N = 2 and N = 4, use the aligned vector loads. +@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix, + val_n::Union{Val{2}, Val{4}}, i) + return _extract_svector_aligned(A, val_n, i) +end + +# For other N, fall back to the regular `extract_svector`. 
+@propagate_inbounds function extract_svector_aligned(A, val_n::Val, i) + return extract_svector(A, val_n, i) +end + +# This function only works for N = 2 and N = 4, because it requires a stride of 2^n. +@inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N} + @boundscheck checkbounds(A, 1:N, i) + @boundscheck can_use_aligned_load(A, N) + # This function assumes a stride of N in the last dimension. + @boundscheck if stride(A, 2) != N + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. + error("extract_svector_aligned only works for 2D arrays where the stride in the last dimension is equal to N") + end + + vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1)) + + return SVector{N}(Tuple(vec)) +end + +@propagate_inbounds function extract_smatrix_aligned(A, system, particle) + return extract_smatrix_aligned(A, Val(ndims(system)), particle) +end + +# For general N, fall back to the regular `extract_smatrix`. +@propagate_inbounds function extract_smatrix_aligned(A, val_n::Val, i) + return extract_smatrix(A, val_n, i) +end + +# This function only works for 2x2 matrices, because it requires a stride of 2^n. +@inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T} + @boundscheck checkbounds(A, 2, 2, i) + @boundscheck can_use_aligned_load(A, 4) + # This function assumes that the first two dimensions of `A` have exactly the size `N`. + @boundscheck if stride(A, 3) != 4 + # WARNING: Don't split this string with `*`, or this function won't compile on GPUs, + # even when the error is never thrown. 
+ error("extract_smatrix_aligned only works for 3D arrays where the first two dimensions each have size N") + end + + vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1)) + + return SMatrix{2, 2}(Tuple(vec)) +end diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 6f8f59197e..31a028110b 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -94,12 +94,27 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; sizes_u = [u_nvariables(system) * n_integrated_particles(system) for system in systems] - ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i]) - for i in eachindex(sizes_u)) sizes_v = [v_nvariables(system) * n_integrated_particles(system) for system in systems] - ranges_v = Tuple((sum(sizes_v[1:(i - 1)]) + 1):sum(sizes_v[1:i]) - for i in eachindex(sizes_v)) + + start_u = 1 + start_v = 1 + ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems)) + ranges_v_vec = Vector{UnitRange{Int}}(undef, length(systems)) + for i in eachindex(systems) + ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1) + ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1) + + # Align ranges to 64 bytes by adding padding if necessary. + # This ensures that aligned loads can be used on the wrapped integration arrays, + # which can significantly improve performance on GPUs. + block_size = div(64, sizeof(eltype(systems[i]))) + start_u += div(sizes_u[i], block_size, RoundUp) * block_size + start_v += div(sizes_v[i], block_size, RoundUp) * block_size + end + + ranges_u = Tuple(ranges_u_vec) + ranges_v = Tuple(ranges_v_vec) # Create a n x n matrix of n neighborhood searches for each of the n systems. # We will need one neighborhood search for each pair of systems. 
@@ -245,13 +260,13 @@ function semidiscretize(semi, tspan; reset_threads=true) Polyester.reset_threads!() end - sizes_u = (u_nvariables(system) * n_integrated_particles(system) for system in systems) - sizes_v = (v_nvariables(system) * n_integrated_particles(system) for system in systems) + size_u_ode = semi.ranges_u[end].stop + size_v_ode = semi.ranges_v[end].stop # Use either the specified backend, e.g., `CUDABackend` or `MetalBackend` or # use CPU vectors for all CPU backends. - u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, sum(sizes_u)) - v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v)) + u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, size_u_ode) + v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, size_v_ode) if semi.parallelization_backend isa KernelAbstractions.GPU u0_ode = u0_ode_ @@ -266,6 +281,11 @@ function semidiscretize(semi, tspan; reset_threads=true) parallelization_backend=semi.parallelization_backend) end + # Initialize the arrays with zeros to initialize the padding values for alignment. + # The values that are actually used will be overwritten in `write_u0!` and `write_v0!`. 
+ u0_ode .= 0 + v0_ode .= 0 + # Set initial condition foreach_system_wrapped(semi, v0_ode, u0_ode) do system, v0_system, u0_system write_u0!(u0_system, system) @@ -356,8 +376,9 @@ end range = ranges_v[system_indices(system, semi)] @boundscheck begin - if length(range) != v_nvariables(system) * n_integrated_particles(system) - throw(DimensionMismatch("`v_ode` range length $range_length does not match " * + expected = v_nvariables(system) * n_integrated_particles(system) + if length(range) != expected + throw(DimensionMismatch("`v_ode` range length $(length(range)) does not match " * "expected number of entries $expected")) end end diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index 836063538b..c4d969eaef 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -25,14 +25,23 @@ function interact!(dv, v_particle_system, u_particle_system, compact_support_ = compact_support(particle_system, neighbor_system) almostzero = sqrt(eps(compact_support_^2)) + use_aligned_load_system = Val(use_aligned_vrho_load(v_particle_system, particle_system)) + use_aligned_load_neighbor = Val(use_aligned_vrho_load(v_neighbor_system, + neighbor_system)) + @threaded semi for particle in each_integrated_particle(particle_system) # We are looping over the particles of `particle_system`, so it is guaranteed # that `particle` is in bounds of `particle_system`. m_a = @inbounds hydrodynamic_mass(particle_system, particle) p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) - v_a = @inbounds current_velocity(v_particle_system, particle_system, particle) - rho_a = @inbounds current_density(v_particle_system, particle_system, particle) + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. 
+ # Note that we can only safely use `@inbounds` after checking alignment + # with `use_aligned_vrho_load` before the `@threaded` loop. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + use_aligned_load_system, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv`, # to reduce the number of memory writes. @@ -56,8 +65,11 @@ function interact!(dv, v_particle_system, u_particle_system, # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) - v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) - rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + # Note that we can only safely use `@inbounds` after checking alignment + # with `use_aligned_vrho_load` before the `@threaded` loop. + (v_b, + rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + use_aligned_load_neighbor, neighbor) # The following call is equivalent to # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` @@ -134,3 +146,55 @@ end neighbor, p_a) return p_a end + +# Default method, which simply calls `current_velocity` and `current_density` separately. +@propagate_inbounds function velocity_and_density(v, system, ::Val{false}, particle) + v_particle = current_velocity(v, system, particle) + rho_particle = current_density(v, system, particle) + + return v_particle, rho_particle +end + +# Optimized version for WCSPH with `ContinuityDensity` in 3D, +# which combines the velocity and density load into one wide load. +# This is significantly faster on GPUs than the 4 individual loads of `extract_svector`. +# WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))`, +# which is checked by `use_aligned_vrho_load`. +# Only call this function after checking `use_aligned_vrho_load` to avoid +# segmentation faults from illegal accesses. 
+@propagate_inbounds function velocity_and_density(v, system, ::Val{true}, particle) + vrho_particle = extract_svector_aligned(v, Val(4), particle) + + # The columns of `v` are ordered as (v_x, v_y, v_z, rho) + v..., rho = Tuple(vrho_particle) + v_particle = SVector(v) + + return v_particle, rho +end + +# By default, don't use aligned loads +use_aligned_vrho_load(v, system) = false + +function use_aligned_vrho_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3}) + use_aligned_vrho_load(v, system, system.density_calculator) +end + +use_aligned_vrho_load(v, system, density_calculator) = false + +# Only use aligned loads when all of these conditions are satisfied: +# - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4. +# - We are on a GPU, where the aligned load gives a significant speedup. +# - The velocity array is aligned for aligned loads, which requires that the pointer of `v` +# is aligned to `4 * sizeof(eltype(v))` +# Otherwise, we cannot use `vloada`, which is an *aligned* load. +# The unaligned version `vload` does not produce wide load instructions on GPUs. +function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity) + if !can_use_aligned_load(v, 4) + # Aligned loads should always be possible on GPUs because the slices of `v_ode` + # are aligned to 64 bytes in `Semidiscretization` and arrays on GPUs are always + # aligned to full pages. + error("illegal alignment of `v` integration array. 
Please report this issue.") + end + + return true +end diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index fa000ef38f..e32315959c 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -31,15 +31,19 @@ end h = initial_smoothing_length(system) almostzero = sqrt(eps(h^2)) + # Check alignment of deformation gradient and `pk1_rho2` arrays before the `@threaded` + # loop to be able to use aligned loads safely inside the loop. + use_aligned_matrix_load_ = Val(use_aligned_matrix_load(system)) + @threaded semi for particle in each_integrated_particle(system) # We are looping over the particles of `system`, so it is guaranteed # that `particle` is in bounds of `system`. m_a = @inbounds system.mass[particle] rho_a = @inbounds system.material_density[particle] # PK1 / rho^2 - pk1_rho2_a = @inbounds pk1_rho2(system, particle) + pk1_rho2_a = @inbounds pk1_rho2(system, use_aligned_matrix_load_, particle) current_coords_a = @inbounds current_coords(system, particle) - F_a = @inbounds deformation_gradient(system, particle) + F_a = @inbounds deformation_gradient(system, use_aligned_matrix_load_, particle) # Accumulate the RHS contributions over all neighbors before writing to `dv` # to reduce the number of memory writes. 
@@ -64,11 +68,12 @@ end rho_b = @inbounds system.material_density[neighbor] m_b = @inbounds system.mass[neighbor] # PK1 / rho^2 - pk1_rho2_b = @inbounds pk1_rho2(system, neighbor) + pk1_rho2_b = @inbounds pk1_rho2(system, use_aligned_matrix_load_, neighbor) current_coords_b = @inbounds current_coords(system, neighbor) # The compiler is smart enough to optimize this away if no penalty force is used - F_b = @inbounds deformation_gradient(system, neighbor) + F_b = @inbounds deformation_gradient(system, use_aligned_matrix_load_, + neighbor) current_pos_diff_ = current_coords_a - current_coords_b # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference @@ -120,3 +125,45 @@ function interact!(dv, v_particle_system, u_particle_system, # TODO continuity equation? return dv end + +function use_aligned_matrix_load(system) + return use_aligned_matrix_load(system.deformation_grad, system.pk1_rho2) +end + +function use_aligned_matrix_load(deformation_grad::AbstractGPUArray, + pk1_rho2::AbstractGPUArray) + # Aligned loads should always be possible on GPUs because GPU arrays are always aligned + # to full pages, and these arrays are not slices of larger arrays. + if !can_use_aligned_load(deformation_grad, 4) + error("illegal alignment of deformation gradient array. Please report this issue.") + end + if !can_use_aligned_load(pk1_rho2, 4) + error("illegal alignment of `pk1_rho2` array. Please report this issue.") + end + + return true +end + +# Don't use aligned vector loads on the CPU. For large arrays, alignment to 32 bytes +# (4 * Float64) is usually given, but it is not guaranteed, as Julia only guarantees +# alignment to 16 bytes. However, the non-aligned `vload` used in `extract_smatrix` in 2D +# has the same performance as the aligned `vloada` in `extract_smatrix_aligned` on the CPU. +use_aligned_matrix_load(deformation_grad, pk1_rho2) = false + +# Aligned vector load versions for deformation gradient and `pk1_rho2`. 
+# These are only used on GPUs, which is checked by `use_aligned_matrix_load`. +@propagate_inbounds function pk1_rho2(system, ::Val{true}, particle) + return extract_smatrix_aligned(system.pk1_rho2, system, particle) +end + +@propagate_inbounds function pk1_rho2(system, ::Val{false}, particle) + return pk1_rho2(system, particle) +end + +@propagate_inbounds function deformation_gradient(system, ::Val{true}, particle) + return extract_smatrix_aligned(system.deformation_grad, system, particle) +end + +@propagate_inbounds function deformation_gradient(system, ::Val{false}, particle) + return deformation_gradient(system, particle) +end diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 3117a67ca5..1201a9eeb4 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -77,6 +77,144 @@ end end end +@testset verbose=true "velocity_and_density $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "use_aligned_vrho_load $T" for T in types + # Aligned array + x = zeros(T, 4, 4) + + # Test different "systems" and density calculators + @test !TrixiParticles.use_aligned_vrho_load(x, nothing) + @test !TrixiParticles.use_aligned_vrho_load(x, + (; + density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(x, nothing, SummationDensity()) + + # No aligned load on the CPU + @test !TrixiParticles.use_aligned_vrho_load(x, nothing, ContinuityDensity()) + + y = Adapt.adapt(parallelization_backend, x) + + # Test different "systems" and density calculators + @test !TrixiParticles.use_aligned_vrho_load(y, nothing) + @test !TrixiParticles.use_aligned_vrho_load(y, + (; + density_calculator=ContinuityDensity())) + @test !TrixiParticles.use_aligned_vrho_load(y, nothing, SummationDensity()) + + # Use aligned load on the GPU with 4-aligned arrays + @test TrixiParticles.use_aligned_vrho_load(y, nothing, ContinuityDensity()) + @test 
TrixiParticles.use_aligned_vrho_load(TrixiParticles.wrap_array(y, 5:16, + (4, 3)), + nothing, ContinuityDensity()) + + # Unaligned array on the GPU should throw an error + str = "illegal alignment" + y2 = TrixiParticles.wrap_array(y, 2:13, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y2, nothing, + ContinuityDensity()) + y3 = TrixiParticles.wrap_array(y, 3:14, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y3, nothing, + ContinuityDensity()) + y4 = TrixiParticles.wrap_array(y, 4:15, (4, 3)) + @test_throws str TrixiParticles.use_aligned_vrho_load(y4, nothing, + ContinuityDensity()) + end + + @testset "velocity_and_density $T" for T in types + # Aligned array + x = rand(T, 4, 4) + y = Adapt.adapt(parallelization_backend, x) + + # Dummy system that will only be used for `ndims` and `current_density`. + struct MockSystem end + Base.ndims(::MockSystem) = 3 + system = MockSystem() + @inline TrixiParticles.current_density(v, ::MockSystem, i) = v[4, i] + + @test TrixiParticles.use_aligned_vrho_load(y, system, ContinuityDensity()) + + # Test that the aligned version is consistent with the non-aligned version. + # We have 4 particles (with 4 values per particles). + # In order to test this on the GPU, we need to use a kernel with `@threaded`. 
+ result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.velocity_and_density(y, system, Val(false), i) == + TrixiParticles.velocity_and_density(y, system, Val(true), i) + end + @test all(result) + end +end + +@testset verbose=true "extract_svector_aligned $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "$T" for T in types + @testset verbose=true "$(N)D" for N in [2, 4, 8] + A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) + val = Val(N) + + @test TrixiParticles.can_use_aligned_load(A, N) + slice = TrixiParticles.wrap_array(A, 2:(3 * N + 1), (N, 3)) + @test !TrixiParticles.can_use_aligned_load(slice, N) + + # Test that the aligned version is consistent with the non-aligned version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_svector_aligned(A, val, i) == + TrixiParticles.extract_svector(A, val, i) + end + @test all(result) + end + + @testset verbose=true "$(N)D" for N in [3, 5, 6, 7] + A = Adapt.adapt(parallelization_backend, rand(T, N, 4)) + val = Val(N) + + # Aligned loads are only supported for powers of two. 
+ @test !TrixiParticles.can_use_aligned_load(A, N) + end + end +end + +@testset verbose=true "extract_smatrix_aligned $TRIXIPARTICLES_TEST_" begin + if supports_double_precision + types = [Float64, Float32] + else + types = [Float32] + end + + @testset verbose=true "$T" for T in types + @testset verbose=true "2D" begin + A = Adapt.adapt(parallelization_backend, rand(T, 2, 2, 4)) + val = Val(2) + + @test TrixiParticles.can_use_aligned_load(A, 2^2) + slice = TrixiParticles.wrap_array(A, 2:(3 * 4 + 1), (2, 2, 3)) + @test !TrixiParticles.can_use_aligned_load(slice, 2^2) + + # Test that the aligned version is consistent with the non-aligned version. + # In order to test this on the GPU, we need to use a kernel with `@threaded`. + result = Adapt.adapt(parallelization_backend, zeros(Bool, 4)) + TrixiParticles.@threaded parallelization_backend for i in 1:4 + result[i] = TrixiParticles.extract_smatrix_aligned(A, val, i) == + TrixiParticles.extract_smatrix(A, val, i) + end + @test all(result) + end + end +end + @testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin @testset verbose=true "Fluid" begin @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 3c2dcbeb58..12624b24ff 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -8,6 +8,7 @@ system2 = System2() Base.eltype(::System1) = Float64 + Base.eltype(::System2) = Float64 TrixiParticles.coordinates_eltype(::System1) = Float32 TrixiParticles.u_nvariables(::System1) = 3 TrixiParticles.u_nvariables(::System2) = 4 @@ -24,9 +25,12 @@ @testset verbose=true "Constructor" begin semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - # Verification - @test semi.ranges_u == (1:6, 7:18) - @test semi.ranges_v == (1:6, 7:12) + # These are the ranges that we would expect without alignment padding: + # semi.ranges_u == (1:6, 7:18) + # semi.ranges_v == (1:6, 7:12) + # Due to 
alignment to 64 bytes, the ranges are adjusted to be: + @test semi.ranges_u == (1:6, 9:20) + @test semi.ranges_v == (1:6, 9:14) nhs = [TrixiParticles.TrivialNeighborhoodSearch{3}(search_radius=0.2, eachpoint=1:2) @@ -152,8 +156,8 @@ semi = Semidiscretization(system1, system2, neighborhood_search=nothing) - dv_ode = zeros(3 * 2 + 2 * 3) - du_ode = zeros(3 * 2 + 4 * 3) + dv_ode = zeros(semi.ranges_v[end].stop) + du_ode = zeros(semi.ranges_u[end].stop) u_ode = zero(du_ode) v1 = [1.0 2.0 diff --git a/test/general/util.jl b/test/general/util.jl index 4aefddd4e3..7441fdb7f0 100644 --- a/test/general/util.jl +++ b/test/general/util.jl @@ -1,3 +1,23 @@ +@testset verbose=true "extract_smatrix" begin + A = Float64.(collect(reshape(1:12, 2, 2, 3))) + @test TrixiParticles.extract_smatrix(A, Val(2), 1) == A[:, :, 1] + @test TrixiParticles.extract_smatrix(A, Val(2), 2) == A[:, :, 2] + @test TrixiParticles.extract_smatrix(A, Val(2), 3) == A[:, :, 3] + @test_throws "extract_smatrix only works" TrixiParticles.extract_smatrix(A, Val(1), 1) + @test_throws "BoundsError" TrixiParticles.extract_smatrix(A, Val(3), 1) +end + +@testset verbose=true "extract_svector" begin + A = Float64.(collect(reshape(1:9, 3, 3))) + @test TrixiParticles.extract_svector(A, Val(3), 1) == A[:, 1] + @test TrixiParticles.extract_svector(A, Val(3), 2) == A[:, 2] + @test TrixiParticles.extract_svector(A, Val(3), 3) == A[:, 3] + @test TrixiParticles.extract_svector(A, Val(2), 1) == A[1:2, 1] + @test TrixiParticles.extract_svector(A, Val(2), 2) == A[1:2, 2] + @test TrixiParticles.extract_svector(A, Val(2), 3) == A[1:2, 3] + @test_throws "BoundsError" TrixiParticles.extract_svector(A, Val(4), 1) +end + @testset verbose=true "ThreadedBroadcastArray" begin A = TrixiParticles.ThreadedBroadcastArray(ones(3, 3)) B = ones(3, 3) diff --git a/test/runtests.jl b/test/runtests.jl index 457e6b8212..7d60bfc759 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,9 @@ +using Test +# Test that precompilation works 
without warnings (e.g. due to method overwrites). +# This is important because method overwrite warnings disable precompilation of the package. +# We need `@eval Main` before Julia 1.12 to avoid `"import" expression not at top level`. +@test_nowarn @eval Main import TrixiParticles + include("test_util.jl") const TRIXIPARTICLES_TEST = lowercase(get(ENV, "TRIXIPARTICLES_TEST", "all")) diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl index ce498b4db2..fa4f98241d 100644 --- a/test/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl @@ -62,14 +62,15 @@ current_coordinates::Any material_density::Any pk1_rho2::Any + deformation_grad::Any mass::Any penalty_force::Any viscosity::Any buffer::Any end @inline Base.eltype(::MockSystem) = Float64 - system = MockSystem(current_coordinates, material_density, pk1_rho2, mass, - nothing, nothing, nothing) + system = MockSystem(current_coordinates, material_density, pk1_rho2, + nothing, mass, nothing, nothing, nothing) function TrixiParticles.initial_coordinates(::MockSystem) return initial_coordinates