Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand Down Expand Up @@ -63,6 +64,7 @@ Polyester = "0.7.10"
ReadVTK = "0.2"
RecipesBase = "1"
Reexport = "1"
SIMD = "3.7.2"
SciMLBase = "2"
StaticArrays = "1"
Statistics = "1"
Expand Down
1 change: 1 addition & 0 deletions src/TrixiParticles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ using Random: seed!
using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
add_tstop!
using SIMD: SIMD
@reexport using StaticArrays: SVector
using StaticArrays: @SMatrix, SMatrix, setindex
using Statistics: Statistics
Expand Down
42 changes: 33 additions & 9 deletions src/general/abstract_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,42 @@ end
end

# Return `A[:, :, i]` as an `SMatrix`.
# Return `A[:, :, i]` as an `SMatrix` of size `ndims(system)` x `ndims(system)`.
# Dispatches on `Val(ndims(system))` so the matrix size is known at compile time.
# NOTE(review): the rendered diff duplicated the pre-change signature here, leaving an
# unterminated `function`; only the new version is kept.
@propagate_inbounds function extract_smatrix(A, system, i)
    return extract_smatrix(A, Val(ndims(system)), i)
end

# Return `A[:, :, i]` as an `N` x `N` `SMatrix` using plain linear indexing.
# NOTE(review): the rendered diff left the pre-change signature and bounds check above
# the new method, breaking syntax; only the new version is kept.
@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{N}, i) where {T, N}
    @boundscheck checkbounds(A, N, N, i)
    # This function assumes that the first two dimensions of `A` have exactly the size `N`,
    # so that matrix `i` occupies the `N^2` consecutive linear indices starting
    # at `(i - 1) * N^2 + 1`.
    @boundscheck if stride(A, 3) != N^2
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N")
    end

    # Extract the matrix elements as a tuple in column-major order,
    # and construct an `SMatrix` from it.
    return SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2)))
end

# Specialization for 2x2 matrices that loads all 4 entries with one SIMD `vload`.
# NOTE(review): the rendered diff interleaved the deleted `ntuple`/`NDIMS` body of the
# old method inside this one; it is removed here to restore valid syntax.
@inline function extract_smatrix(A::AbstractArray{T, 3}, ::Val{2}, i) where {T}
    @boundscheck checkbounds(A, 2, 2, i)
    # This function assumes that the first two dimensions of `A` have exactly the size `2`,
    # so that matrix `i` occupies the 4 consecutive linear indices starting
    # at `4 * (i - 1) + 1`.
    @boundscheck if stride(A, 3) != 4
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_smatrix only works for 3D arrays where the first two dimensions each have size N")
    end

    # This is the same as
    # `SMatrix{N, N}(ntuple(@inline(j->@inbounds A[(i - 1) * N^2 + j]), Val(N^2)))`,
    # but it's slightly faster on CPUs in some cases. As opposed to the GPU-optimized
    # version in `extract_smatrix_aligned`, we use `vload` and not `vloada` here, which
    # does not require alignment and works if N^2 is not a power of 2.
    # This is faster in the TLSPH RHS in 2D, but slower in 3D for some reason.
    # See the benchmarks in https://github.com/trixi-framework/TrixiParticles.jl/pull/1147.
    vec = SIMD.vload(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1))
    return SMatrix{2, 2}(Tuple(vec))
end

# Specifically get the current coordinates of a particle for all system types.
Expand Down
70 changes: 70 additions & 0 deletions src/general/gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,73 @@ end
function allocate(backend, ELTYPE, size)
return Array{ELTYPE, length(size)}(undef, size)
end

# This function checks if we can use aligned `SVector` or `SMatrix` loads for the given
# array `A` and the size of the vector we want to load `n`.
#
# Returns `true` only when all of the following hold:
#   - `n` is a positive power of 2 (a requirement for aligned loads),
#   - the data pointer of `A` is aligned to `n * sizeof(eltype(A))` bytes,
#   - the stride of `A` in the last dimension equals `n` (no padding between vectors).
function can_use_aligned_load(A, n)
    # Check that `n` is a power of 2, which is a requirement for aligned loads.
    # Return early here: besides being a hard requirement, this also guards the
    # modulo below against a `DivideError` when `n == 0`.
    if n <= 0 || (n & (n - 1)) != 0
        return false
    end

    # Check if the beginning of `A` is aligned to the size of the vector we want to load.
    is_aligned = UInt(pointer(A)) % (n * sizeof(eltype(A))) == 0

    # Check if the stride of `A` in the last dimension is equal to `n`,
    # which means that there is no padding between the vectors we want to load.
    has_stride = stride(A, ndims(A)) == n

    return is_aligned && has_stride
end

# Entry point taking a system: dispatch on the system's dimensionality,
# lifted to the type domain via `Val`.
@propagate_inbounds function extract_svector_aligned(A, system, particle)
    dims = Val(ndims(system))
    return extract_svector_aligned(A, dims, particle)
end

# N = 2 and N = 4 are the sizes that support aligned vector loads,
# so only those dispatch to the SIMD-based implementation.
@propagate_inbounds function extract_svector_aligned(A::AbstractMatrix,
                                                     val_n::Union{Val{2}, Val{4}},
                                                     i)
    return _extract_svector_aligned(A, val_n, i)
end

# Any other N has no aligned fast path; delegate to the regular `extract_svector`.
@propagate_inbounds function extract_svector_aligned(A, val_n::Val, i)
    return extract_svector(A, val_n, i)
end

# This function only works for N = 2 and N = 4, because it requires a stride of 2^n.
# WARNING: `vloada` is an *aligned* load. Only call this after verifying alignment
# with `can_use_aligned_load`, otherwise illegal accesses are possible.
@inline function _extract_svector_aligned(A::AbstractMatrix{T}, ::Val{N}, i) where {T, N}
    @boundscheck checkbounds(A, 1:N, i)
    @boundscheck can_use_aligned_load(A, N)
    # This function assumes a stride of N in the last dimension.
    @boundscheck if stride(A, 2) != N
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_svector_aligned only works for 2D arrays where the stride in the last dimension is equal to N")
    end

    # Load the N entries of column `i` with a single aligned SIMD load.
    simd_vec = SIMD.vloada(SIMD.Vec{N, eltype(A)}, pointer(A, N * (i - 1) + 1))

    return SVector{N}(Tuple(simd_vec))
end

# Entry point taking a system: dispatch on the system's dimensionality,
# lifted to the type domain via `Val`.
@propagate_inbounds function extract_smatrix_aligned(A, system, particle)
    dims = Val(ndims(system))
    return extract_smatrix_aligned(A, dims, particle)
end

# General N has no aligned fast path; delegate to the regular `extract_smatrix`.
@propagate_inbounds function extract_smatrix_aligned(A, val_n::Val, i)
    return extract_smatrix(A, val_n, i)
end

# This function only works for 2x2 matrices, because it requires a stride of 2^n.
# WARNING: `vloada` is an *aligned* load. Only call this after verifying alignment
# with `can_use_aligned_load`, otherwise illegal accesses are possible.
@inline function extract_smatrix_aligned(A::AbstractArray{T, 3}, ::Val{2}, i) where {T}
    @boundscheck checkbounds(A, 2, 2, i)
    # A bare `@boundscheck can_use_aligned_load(A, 4)` would discard the returned `Bool`
    # and never fail, so explicitly throw when the aligned load is not safe.
    @boundscheck if !can_use_aligned_load(A, 4)
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_smatrix_aligned requires an array that is aligned for vector loads of size 4")
    end
    # This function assumes that the first two dimensions of `A` have exactly the size `N`.
    @boundscheck if stride(A, 3) != 4
        # WARNING: Don't split this string with `*`, or this function won't compile on GPUs,
        # even when the error is never thrown.
        error("extract_smatrix_aligned only works for 3D arrays where the first two dimensions each have size N")
    end

    # Load all 4 entries of the 2x2 matrix with a single aligned SIMD load.
    vec = SIMD.vloada(SIMD.Vec{4, T}, pointer(A, 4 * (i - 1) + 1))

    return SMatrix{2, 2}(Tuple(vec))
end
41 changes: 31 additions & 10 deletions src/general/semidiscretization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,27 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...;

sizes_u = [u_nvariables(system) * n_integrated_particles(system)
for system in systems]
ranges_u = Tuple((sum(sizes_u[1:(i - 1)]) + 1):sum(sizes_u[1:i])
for i in eachindex(sizes_u))
sizes_v = [v_nvariables(system) * n_integrated_particles(system)
for system in systems]
ranges_v = Tuple((sum(sizes_v[1:(i - 1)]) + 1):sum(sizes_v[1:i])
for i in eachindex(sizes_v))

start_u = 1
start_v = 1
ranges_u_vec = Vector{UnitRange{Int}}(undef, length(systems))
ranges_v_vec = Vector{UnitRange{Int}}(undef, length(systems))
for i in eachindex(systems)
ranges_u_vec[i] = start_u:(start_u + sizes_u[i] - 1)
ranges_v_vec[i] = start_v:(start_v + sizes_v[i] - 1)

# Align ranges to 64 bytes by adding padding if necessary.
# This ensures that aligned loads can be used on the wrapped integration arrays,
# which can significantly improve performance on GPUs.
block_size = div(64, sizeof(eltype(systems[i])))
start_u += div(sizes_u[i], block_size, RoundUp) * block_size
start_v += div(sizes_v[i], block_size, RoundUp) * block_size
end

ranges_u = Tuple(ranges_u_vec)
ranges_v = Tuple(ranges_v_vec)

# Create a n x n matrix of n neighborhood searches for each of the n systems.
# We will need one neighborhood search for each pair of systems.
Expand Down Expand Up @@ -245,13 +260,13 @@ function semidiscretize(semi, tspan; reset_threads=true)
Polyester.reset_threads!()
end

sizes_u = (u_nvariables(system) * n_integrated_particles(system) for system in systems)
sizes_v = (v_nvariables(system) * n_integrated_particles(system) for system in systems)
size_u_ode = semi.ranges_u[end].stop
size_v_ode = semi.ranges_v[end].stop

# Use either the specified backend, e.g., `CUDABackend` or `MetalBackend` or
# use CPU vectors for all CPU backends.
u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, sum(sizes_u))
v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v))
u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, size_u_ode)
v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, size_v_ode)

if semi.parallelization_backend isa KernelAbstractions.GPU
u0_ode = u0_ode_
Expand All @@ -266,6 +281,11 @@ function semidiscretize(semi, tspan; reset_threads=true)
parallelization_backend=semi.parallelization_backend)
end

# Initialize the arrays with zeros to initialize the padding values for alignment.
# The values that are actually used will be overwritten in `write_u0!` and `write_v0!`.
u0_ode .= 0
v0_ode .= 0

# Set initial condition
foreach_system_wrapped(semi, v0_ode, u0_ode) do system, v0_system, u0_system
write_u0!(u0_system, system)
Expand Down Expand Up @@ -356,8 +376,9 @@ end
range = ranges_v[system_indices(system, semi)]

@boundscheck begin
if length(range) != v_nvariables(system) * n_integrated_particles(system)
throw(DimensionMismatch("`v_ode` range length $range_length does not match " *
expected = v_nvariables(system) * n_integrated_particles(system)
if length(range) != expected
throw(DimensionMismatch("`v_ode` range length $(length(range)) does not match " *
"expected number of entries $expected"))
end
end
Expand Down
72 changes: 68 additions & 4 deletions src/schemes/fluid/weakly_compressible_sph/rhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,23 @@ function interact!(dv, v_particle_system, u_particle_system,
compact_support_ = compact_support(particle_system, neighbor_system)
almostzero = sqrt(eps(compact_support_^2))

use_aligned_load_system = Val(use_aligned_vrho_load(v_particle_system, particle_system))
use_aligned_load_neighbor = Val(use_aligned_vrho_load(v_neighbor_system,
neighbor_system))

@threaded semi for particle in each_integrated_particle(particle_system)
# We are looping over the particles of `particle_system`, so it is guaranteed
# that `particle` is in bounds of `particle_system`.
m_a = @inbounds hydrodynamic_mass(particle_system, particle)
p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)

v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
# In 3D, this function can combine velocity and density load into one wide load,
# which gives a significant speedup on GPUs.
# Note that we can only safely use `@inbounds` after checking alignment
# with `use_aligned_vrho_load` before the `@threaded` loop.
(v_a,
rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
use_aligned_load_system, particle)

# Accumulate the RHS contributions over all neighbors before writing to `dv`,
# to reduce the number of memory writes.
Expand All @@ -56,8 +65,11 @@ function interact!(dv, v_particle_system, u_particle_system,

# `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
# Note that we can only safely use `@inbounds` after checking alignment
# with `use_aligned_vrho_load` before the `@threaded` loop.
(v_b,
rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
use_aligned_load_neighbor, neighbor)

# The following call is equivalent to
# `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
Expand Down Expand Up @@ -134,3 +146,55 @@ end
neighbor, p_a)
return p_a
end

# Default method, which simply calls `current_velocity` and `current_density` separately.
@propagate_inbounds function velocity_and_density(v, system, ::Val{false}, particle)
    return (current_velocity(v, system, particle),
            current_density(v, system, particle))
end

# Optimized version for WCSPH with `ContinuityDensity` in 3D,
# which combines the velocity and density load into one wide load.
# This is significantly faster on GPUs than the 4 individual loads of `extract_svector`.
# WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))`,
# which is checked by `use_aligned_vrho_load`.
# Only call this function after checking `use_aligned_vrho_load` to avoid
# segmentation faults from illegal accesses.
@propagate_inbounds function velocity_and_density(v, system, ::Val{true}, particle)
    # The columns of `v` are ordered as (v_x, v_y, v_z, rho),
    # so one wide load fetches all four values at once.
    vrho = Tuple(extract_svector_aligned(v, Val(4), particle))

    velocity = SVector(vrho[1], vrho[2], vrho[3])
    density = vrho[4]

    return velocity, density
end

# By default, don't use aligned loads
function use_aligned_vrho_load(v, system)
    return false
end

# On GPUs with a 3D WCSPH system, the decision depends on the density calculator.
function use_aligned_vrho_load(v::AbstractGPUArray, system::WeaklyCompressibleSPHSystem{3})
    return use_aligned_vrho_load(v, system, system.density_calculator)
end

use_aligned_vrho_load(v, system, density_calculator) = false

# Only use aligned loads when all of these conditions are satisfied:
# - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4.
# - We are on a GPU, where the aligned load gives a significant speedup.
# - The velocity array is aligned for aligned loads, which requires that the pointer of `v`
#   is aligned to `4 * sizeof(eltype(v))`
# Otherwise, we cannot use `vloada`, which is an *aligned* load.
# The unaligned version `vload` does not produce wide load instructions on GPUs.
function use_aligned_vrho_load(v::AbstractGPUArray, system, ::ContinuityDensity)
    if can_use_aligned_load(v, 4)
        return true
    end

    # Aligned loads should always be possible on GPUs because the slices of `v_ode`
    # are aligned to 64 bytes in `Semidiscretization` and arrays on GPUs are always
    # aligned to full pages.
    error("illegal alignment of `v` integration array. Please report this issue.")
end
Loading
Loading