Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand All @@ -34,13 +35,13 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
TrixiParticlesCUDAExt = "CUDA"
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]

[compat]
Accessors = "0.1.43"
Expand All @@ -63,6 +64,7 @@ Polyester = "0.7.10"
ReadVTK = "0.2"
RecipesBase = "1"
Reexport = "1"
SIMD = "3.7.2"
SciMLBase = "2"
StaticArrays = "1"
Statistics = "1"
Expand Down
1 change: 1 addition & 0 deletions src/TrixiParticles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ using Random: seed!
using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
add_tstop!
using SIMD: SIMD
@reexport using StaticArrays: SVector
using StaticArrays: @SMatrix, SMatrix, setindex
using Statistics: Statistics
Expand Down
30 changes: 21 additions & 9 deletions src/general/abstract_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,29 @@ end
end

# Return `A[:, :, i]` as an `SMatrix`.
@inline function extract_smatrix(A, system, particle)
@boundscheck checkbounds(A, ndims(system), ndims(system), particle)
@propagate_inbounds function extract_smatrix(A, system, particle)
return extract_smatrix(A, Val(ndims(system)), particle)
end

@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS}
@boundscheck checkbounds(A, NDIMS, NDIMS, particle)

# Extract the matrix elements for this particle as a tuple to pass to SMatrix
return SMatrix{ndims(system),
ndims(system)}(ntuple(@inline(i->@inbounds A[mod(i - 1,
ndims(system)) + 1,
div(i - 1,
ndims(system)) + 1,
particle]),
Val(ndims(system)^2)))
return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1,
div(i - 1, NDIMS) + 1,
particle]),
Val(NDIMS^2)))
end

# Optimized version for 2D, which uses SIMD.jl to combine the 4 loads of the 2x2 matrix
# into a single wide load. This is significantly faster on GPUs than 4 individual loads.
@inline function extract_smatrix(A, ::Val{2}, particle)
@boundscheck checkbounds(A, 2, 2, particle)

# Note that this doesn't work in 3D because it requires a stride of 2^n.
x = SIMD.vloada(SIMD.Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1))
Comment thread
efaulhaber marked this conversation as resolved.
Outdated

return SMatrix{2, 2}(Tuple(x))
end

# Specifically get the current coordinates of a particle for all system types.
Expand Down
6 changes: 6 additions & 0 deletions src/general/neighborhood_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ function PointNeighbors.foreach_point_neighbor(f, system, neighbor_system,
points, parallelization_backend)
end

@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, backend, particle)
PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, particle)
Comment thread
efaulhaber marked this conversation as resolved.
end

# === Compact support selection ===
# -- Generic
@inline function compact_support(system, neighbor)
Expand Down
5 changes: 5 additions & 0 deletions src/schemes/boundary/wall_boundary/system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ end
return kernel(smoothing_kernel, distance, smoothing_length)
end

@inline function smoothing_kernel_unsafe(system::WallBoundarySystem, distance, particle)
(; smoothing_kernel, smoothing_length) = system.boundary_model
return kernel_unsafe(smoothing_kernel, distance, smoothing_length)
end

@inline function smoothing_length(system::WallBoundarySystem, particle)
return smoothing_length(system.boundary_model, particle)
end
Expand Down
63 changes: 38 additions & 25 deletions src/schemes/structure/total_lagrangian_sph/penalty_force.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,38 +14,51 @@ struct PenaltyForceGanzenmueller{ELTYPE}
end
end

@inline function dv_penalty_force(penalty_force::Nothing,
particle, neighbor, initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)
return zero(initial_pos_diff)
@inline function dv_penalty_force!(dv_particle, penalty_force::Nothing,
particle, neighbor, initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
return dv_particle
end

@propagate_inbounds function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
particle, neighbor, initial_pos_diff,
initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)
volume_a = m_a / rho_a
volume_b = m_b / rho_b
@propagate_inbounds function dv_penalty_force!(dv_particle,
penalty_force::PenaltyForceGanzenmueller,
particle, neighbor, initial_pos_diff,
initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
(; alpha) = penalty_force

kernel_weight = smoothing_kernel(system, initial_distance, particle)
# Since this is one of the most performance critical functions, using fast divisions
# here gives a significant speedup on GPUs.
# See the docs page "Development" for more details on `div_fast`.
volume_a = div_fast(m_a, rho_a)
volume_b = div_fast(m_b, rho_b)

F_a = deformation_gradient(system, particle)
F_b = deformation_gradient(system, neighbor)
# This function is called after a compact support check, so we can use the unsafe
# kernel function, which does not check the distance again.
kernel_weight = smoothing_kernel_unsafe(system, initial_distance, particle)

inv_current_distance = 1 / current_distance
E_a = young_modulus(system, particle)
E_b = young_modulus(system, neighbor)

# Use the symmetry of epsilon to simplify computations
eps_sum = (F_a + F_b) * initial_pos_diff - 2 * current_pos_diff
delta_sum = dot(eps_sum, current_pos_diff) * inv_current_distance
eps_a = F_a * initial_pos_diff - current_pos_diff
eps_b = -(F_b * initial_pos_diff - current_pos_diff)
Comment thread
svchb marked this conversation as resolved.
Outdated

E = young_modulus(system, particle)
# This is (E_a * delta_a + E_b * delta_b) * current_distance.
# Pulling the division by `current_distance` out allows us to do one division by
# `current_distance^2` instead.
delta_sum = E_a * dot(eps_a, current_pos_diff) + E_b * dot(eps_b, current_pos_diff)

f = (penalty_force.alpha / 2) * volume_a * volume_b *
kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff *
inv_current_distance
# The division contains all scalar factors, which are then multiplied by
# the vector `current_pos_diff` at the end.
# We already divide by `m_a` to obtain an acceleration.
# Since this is one of the most performance critical functions, using fast divisions
# here gives a significant speedup on GPUs.
# See the docs page "Development" for more details on `div_fast`.
dv_particle[] += div_fast((alpha / 2) * volume_a * volume_b * kernel_weight * delta_sum,
initial_distance^2 * current_distance^2 * m_a) *
current_pos_diff

# Divide force by mass to obtain acceleration
return f / m_a
return dv_particle
end
93 changes: 54 additions & 39 deletions src/schemes/structure/total_lagrangian_sph/rhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ end

# Everything here is done in the initial coordinates
system_coords = initial_coordinates(system)
neighborhood_search = get_neighborhood_search(system, system, semi)
backend = semi.parallelization_backend

# For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
# and the density diffusion divide by zero.
Expand All @@ -29,48 +31,61 @@ end
h = initial_smoothing_length(system)
almostzero = sqrt(eps(h^2))

# Loop over all pairs of particles and neighbors within the kernel cutoff.
# For structure-structure interaction, this has to happen in the initial coordinates.
foreach_point_neighbor(system, system, system_coords, system_coords, semi;
points=each_integrated_particle(system)) do particle, neighbor,
initial_pos_diff,
initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

rho_a = @inbounds system.material_density[particle]
rho_b = @inbounds system.material_density[neighbor]

@threaded semi for particle in each_integrated_particle(system)
# We are looping over the particles of `system`, so it is guaranteed
# that `particle` is in bounds of `system`.
m_a = @inbounds system.mass[particle]
m_b = @inbounds system.mass[neighbor]

rho_a = @inbounds system.material_density[particle]
# PK1 / rho^2
pk1_rho2_a = @inbounds pk1_rho2(system, particle)
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)

current_pos_diff_ = @inbounds current_coords(system, particle) -
current_coords(system, neighbor)
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
current_pos_diff = convert.(eltype(system), current_pos_diff_)
current_distance = norm(current_pos_diff)

dv_stress = m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel

dv_penalty_force_ = @inbounds dv_penalty_force(penalty_force, particle, neighbor,
initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)

dv_particle = Ref(dv_stress + dv_penalty_force_)
@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
current_pos_diff, current_distance,
m_a, m_b, rho_a, rho_b, grad_kernel)
current_coords_a = @inbounds current_coords(system, particle)
F_a = @inbounds deformation_gradient(system, particle)

# Accumulate the RHS contributions over all neighbors before writing to `dv`
# to reduce the number of memory writes.
# Note that we need a `Ref` in order to be able to update these variables
# inside the closure in the `foreach_neighbor` loop.
dv_particle = Ref(zero(current_coords_a))

# Loop over all neighbors within the kernel cutoff
@inbounds foreach_neighbor(system_coords, system_coords,
neighborhood_search, backend,
particle) do particle, neighbor,
initial_pos_diff, initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

rho_b = @inbounds system.material_density[neighbor]
m_b = @inbounds system.mass[neighbor]
# PK1 / rho^2
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
current_coords_b = @inbounds current_coords(system, neighbor)

# The compiler is smart enough to optimize this away if no penalty force is used
F_b = @inbounds deformation_gradient(system, neighbor)
Comment thread
efaulhaber marked this conversation as resolved.

current_pos_diff_ = current_coords_a - current_coords_b
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
current_pos_diff = convert.(eltype(system), current_pos_diff_)
current_distance = norm(current_pos_diff)

dv_particle[] += m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel

@inbounds dv_penalty_force!(dv_particle, penalty_force, particle, neighbor,
initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)

@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
current_pos_diff, current_distance,
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
end

for i in 1:ndims(system)
@inbounds dv[i, particle] += dv_particle[][i]
Expand Down
60 changes: 39 additions & 21 deletions src/schemes/structure/total_lagrangian_sph/system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -489,30 +489,48 @@ end

# Loop over all pairs of particles and neighbors within the kernel cutoff
initial_coords = initial_coordinates(system)
foreach_point_neighbor(system, system, initial_coords, initial_coords,
semi) do particle, neighbor, initial_pos_diff, initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return
neighborhood_search = get_neighborhood_search(system, system, semi)
backend = semi.parallelization_backend

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

volume = @inbounds mass[neighbor] / material_density[neighbor]
pos_diff_ = @inbounds current_coords(system, particle) -
current_coords(system, neighbor)
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
pos_diff = convert.(eltype(system), pos_diff_)

# Multiply by L_{0a}
L = @inbounds correction_matrix(system, particle)

result = volume * pos_diff * grad_kernel' * L'
@threaded semi for particle in each_integrated_particle(system)
# We are looping over the particles of `system`, so it is guaranteed
# that `particle` is in bounds of `system`.
current_coords_a = @inbounds current_coords(system, particle)
L_a = @inbounds correction_matrix(system, particle)

# Accumulate the contributions over all neighbors before writing
# to `deformation_grad` to reduce the number of memory writes.
# Note that we need a `Ref` in order to be able to update these variables
# inside the closure in the `foreach_neighbor` loop.
result = Ref(zero(L_a))

# Loop over all neighbors within the kernel cutoff
@inbounds foreach_neighbor(initial_coords, initial_coords,
neighborhood_search, backend,
particle) do particle, neighbor,
initial_pos_diff, initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

volume = @inbounds mass[neighbor] / material_density[neighbor]
current_coords_b = @inbounds current_coords(system, neighbor)
pos_diff_ = current_coords_a - current_coords_b
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
pos_diff = convert.(eltype(system), pos_diff_)
Comment thread
LasNikas marked this conversation as resolved.

# The tensor product pos_diff ⊗ (L_{0a} * ∇W) is equivalent to multiplication
# by the transpose: pos_diff * (L_{0a} * ∇W)ᵀ = pos_diff * ∇Wᵀ * L_{0a}ᵀ.
result[] -= volume * pos_diff * grad_kernel' * L_a'
end

for j in 1:ndims(system), i in 1:ndims(system)
@inbounds deformation_grad[i, j, particle] -= result[i, j]
@inbounds deformation_grad[i, j, particle] += result[][i, j]
end
end

Expand Down
Loading
Loading