Skip to content

Commit e4befa2

Browse files
committed
Improve performance of TLSPH RHS
1 parent 425a7b4 commit e4befa2

File tree

4 files changed

+115
-75
lines changed

4 files changed

+115
-75
lines changed

src/schemes/boundary/wall_boundary/system.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ end
179179
return kernel(smoothing_kernel, distance, smoothing_length)
180180
end
181181

182+
@inline function smoothing_kernel_unsafe(system::WallBoundarySystem, distance, particle)
183+
(; smoothing_kernel, smoothing_length) = system.boundary_model
184+
return kernel_unsafe(smoothing_kernel, distance, smoothing_length)
185+
end
186+
182187
@inline function smoothing_length(system::WallBoundarySystem, particle)
183188
return smoothing_length(system.boundary_model, particle)
184189
end

src/schemes/structure/total_lagrangian_sph/penalty_force.jl

Lines changed: 38 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -14,38 +14,51 @@ struct PenaltyForceGanzenmueller{ELTYPE}
1414
end
1515
end
1616

17-
@inline function dv_penalty_force(penalty_force::Nothing,
18-
particle, neighbor, initial_pos_diff, initial_distance,
19-
current_pos_diff, current_distance,
20-
system, m_a, m_b, rho_a, rho_b)
21-
return zero(initial_pos_diff)
17+
@inline function dv_penalty_force!(dv_particle, penalty_force::Nothing,
18+
particle, neighbor, initial_pos_diff, initial_distance,
19+
current_pos_diff, current_distance,
20+
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
21+
return dv_particle
2222
end
2323

24-
@propagate_inbounds function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
25-
particle, neighbor, initial_pos_diff,
26-
initial_distance,
27-
current_pos_diff, current_distance,
28-
system, m_a, m_b, rho_a, rho_b)
29-
volume_a = m_a / rho_a
30-
volume_b = m_b / rho_b
24+
@propagate_inbounds function dv_penalty_force!(dv_particle,
25+
penalty_force::PenaltyForceGanzenmueller,
26+
particle, neighbor, initial_pos_diff,
27+
initial_distance,
28+
current_pos_diff, current_distance,
29+
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
30+
(; alpha) = penalty_force
3131

32-
kernel_weight = smoothing_kernel(system, initial_distance, particle)
32+
# Since this is one of the most performance critical functions, using fast divisions
33+
# here gives a significant speedup on GPUs.
34+
# See the docs page "Development" for more details on `div_fast`.
35+
volume_a = div_fast(m_a, rho_a)
36+
volume_b = div_fast(m_b, rho_b)
3337

34-
F_a = deformation_gradient(system, particle)
35-
F_b = deformation_gradient(system, neighbor)
38+
# This function is called after a compact support check, so we can use the unsafe
39+
# kernel function, which does not check the distance again.
40+
kernel_weight = smoothing_kernel_unsafe(system, initial_distance, particle)
3641

37-
inv_current_distance = 1 / current_distance
42+
E_a = young_modulus(system, particle)
43+
E_b = young_modulus(system, neighbor)
3844

39-
# Use the symmetry of epsilon to simplify computations
40-
eps_sum = (F_a + F_b) * initial_pos_diff - 2 * current_pos_diff
41-
delta_sum = dot(eps_sum, current_pos_diff) * inv_current_distance
45+
eps_a = F_a * initial_pos_diff - current_pos_diff
46+
eps_b = -(F_b * initial_pos_diff - current_pos_diff)
4247

43-
E = young_modulus(system, particle)
48+
# This is (E_a * delta_a + E_b * delta_b) * current_distance.
49+
# Pulling the division by `current_distance` out allows us to do one division by
50+
# `current_distance^2` instead.
51+
delta_sum = E_a * dot(eps_a, current_pos_diff) + E_b * dot(eps_b, current_pos_diff)
4452

45-
f = (penalty_force.alpha / 2) * volume_a * volume_b *
46-
kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff *
47-
inv_current_distance
53+
# The division contains all scalar factors, which are then multiplied by
54+
# the vector `current_pos_diff` at the end.
55+
# We already divide by `m_a` to obtain an acceleration.
56+
# Since this is one of the most performance critical functions, using fast divisions
57+
# here gives a significant speedup on GPUs.
58+
# See the docs page "Development" for more details on `div_fast`.
59+
dv_particle[] += div_fast((alpha / 2) * volume_a * volume_b * kernel_weight * delta_sum,
60+
initial_distance^2 * current_distance^2 * m_a) *
61+
current_pos_diff
4862

49-
# Divide force by mass to obtain acceleration
50-
return f / m_a
63+
return dv_particle
5164
end

src/schemes/structure/total_lagrangian_sph/rhs.jl

Lines changed: 54 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ end
1919

2020
# Everything here is done in the initial coordinates
2121
system_coords = initial_coordinates(system)
22+
neighborhood_search = get_neighborhood_search(system, system, semi)
2223

2324
# For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
2425
# and the density diffusion divide by zero.
@@ -29,48 +30,62 @@ end
2930
h = initial_smoothing_length(system)
3031
almostzero = sqrt(eps(h^2))
3132

32-
# Loop over all pairs of particles and neighbors within the kernel cutoff.
33-
# For structure-structure interaction, this has to happen in the initial coordinates.
34-
foreach_point_neighbor(system, system, system_coords, system_coords, semi;
35-
points=each_integrated_particle(system)) do particle, neighbor,
36-
initial_pos_diff,
37-
initial_distance
38-
# Skip neighbors with the same position because the kernel gradient is zero.
39-
# Note that `return` only exits the closure, i.e., skips the current neighbor.
40-
skip_zero_distance(system) && initial_distance < almostzero && return
41-
42-
# Now that we know that `distance` is not zero, we can safely call the unsafe
43-
# version of the kernel gradient to avoid redundant zero checks.
44-
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
45-
initial_distance, particle)
46-
47-
rho_a = @inbounds system.material_density[particle]
48-
rho_b = @inbounds system.material_density[neighbor]
49-
33+
@threaded semi for particle in each_integrated_particle(system)
34+
# We are looping over the particles of `system`, so it is guaranteed
35+
# that `particle` is in bounds of `system`.
5036
m_a = @inbounds system.mass[particle]
51-
m_b = @inbounds system.mass[neighbor]
52-
37+
rho_a = @inbounds system.material_density[particle]
5338
# PK1 / rho^2
5439
pk1_rho2_a = @inbounds pk1_rho2(system, particle)
55-
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
56-
57-
current_pos_diff_ = @inbounds current_coords(system, particle) -
58-
current_coords(system, neighbor)
59-
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
60-
current_pos_diff = convert.(eltype(system), current_pos_diff_)
61-
current_distance = norm(current_pos_diff)
62-
63-
dv_stress = m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
64-
65-
dv_penalty_force_ = @inbounds dv_penalty_force(penalty_force, particle, neighbor,
66-
initial_pos_diff, initial_distance,
67-
current_pos_diff, current_distance,
68-
system, m_a, m_b, rho_a, rho_b)
69-
70-
dv_particle = Ref(dv_stress + dv_penalty_force_)
71-
@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
72-
current_pos_diff, current_distance,
73-
m_a, m_b, rho_a, rho_b, grad_kernel)
40+
current_coords_a = @inbounds current_coords(system, particle)
41+
F_a = @inbounds deformation_gradient(system, particle)
42+
43+
# Accumulate the RHS contributions over all neighbors before writing to `dv`,
44+
# to reduce the number of memory writes.
45+
# Note that we need a `Ref` in order to be able to update these variables
46+
# inside the closure in the `foreach_neighbor` loop.
47+
dv_particle = Ref(zero(current_coords_a))
48+
49+
# Loop over all neighbors within the kernel cutoff
50+
@inbounds PointNeighbors.foreach_neighbor(system_coords, system_coords,
51+
neighborhood_search,
52+
particle) do particle, neighbor,
53+
initial_pos_diff,
54+
initial_distance
55+
# Skip neighbors with the same position because the kernel gradient is zero.
56+
# Note that `return` only exits the closure, i.e., skips the current neighbor.
57+
skip_zero_distance(system) && initial_distance < almostzero && return
58+
59+
# Now that we know that `distance` is not zero, we can safely call the unsafe
60+
# version of the kernel gradient to avoid redundant zero checks.
61+
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
62+
initial_distance, particle)
63+
64+
rho_b = @inbounds system.material_density[neighbor]
65+
m_b = @inbounds system.mass[neighbor]
66+
# PK1 / rho^2
67+
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
68+
current_coords_b = @inbounds current_coords(system, neighbor)
69+
70+
# The compiler is smart enough to optimize this away if no penalty force is used
71+
F_b = @inbounds deformation_gradient(system, neighbor)
72+
73+
current_pos_diff_ = current_coords_a - current_coords_b
74+
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
75+
current_pos_diff = convert.(eltype(system), current_pos_diff_)
76+
current_distance = norm(current_pos_diff)
77+
78+
dv_particle[] += m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
79+
80+
@inbounds dv_penalty_force!(dv_particle, penalty_force, particle, neighbor,
81+
initial_pos_diff, initial_distance,
82+
current_pos_diff, current_distance,
83+
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
84+
85+
@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
86+
current_pos_diff, current_distance,
87+
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
88+
end
7489

7590
for i in 1:ndims(system)
7691
@inbounds dv[i, particle] += dv_particle[][i]

src/schemes/structure/total_lagrangian_sph/viscosity.jl

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,21 @@
44
@propagate_inbounds function dv_viscosity_tlsph!(dv_particle, system, v_system,
55
particle, neighbor,
66
current_pos_diff, current_distance,
7-
m_a, m_b, rho_a, rho_b, grad_kernel)
7+
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
88
viscosity = system.viscosity
99

1010
return dv_viscosity_tlsph!(dv_particle, viscosity, system, v_system,
1111
particle, neighbor, current_pos_diff, current_distance,
12-
m_a, m_b, rho_a, rho_b, grad_kernel)
12+
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
1313
end
1414

1515
@propagate_inbounds function dv_viscosity_tlsph!(dv_particle, viscosity, system,
1616
v_system, particle, neighbor,
1717
current_pos_diff, current_distance,
18-
m_a, m_b, rho_a, rho_b, grad_kernel)
18+
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
1919
return viscosity(dv_particle, system, v_system, particle, neighbor,
2020
current_pos_diff, current_distance,
21-
m_a, m_b, rho_a, rho_b, grad_kernel)
21+
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
2222
end
2323

2424
@inline function dv_viscosity_tlsph!(dv_particle, viscosity::Nothing, system,
@@ -38,7 +38,8 @@ end
3838
current_pos_diff,
3939
current_distance,
4040
m_a, m_b, rho_a,
41-
rho_b, grad_kernel)
41+
rho_b, F_a,
42+
grad_kernel)
4243
v_a = current_velocity(v_system, system, particle)
4344
v_b = current_velocity(v_system, system, neighbor)
4445
v_diff = v_a - v_b
@@ -54,29 +55,35 @@ end
5455
# Compute bulk modulus from Young's modulus and Poisson's ratio.
5556
# See the table at the end of https://en.wikipedia.org/wiki/Lam%C3%A9_parameters
5657
E = young_modulus(system, particle)
58+
# A fast division is slower here for some reason
5759
K = E / (ndims(system) * (1 - 2 * poisson_ratio(system, particle)))
5860

5961
# Newton–Laplace equation
60-
sound_speed = sqrt(K / rho_a)
62+
# Since this is one of the most performance critical functions, using fast divisions
63+
# here gives a significant speedup on GPUs.
64+
# See the docs page "Development" for more details on `div_fast`.
65+
sound_speed = sqrt(div_fast(K, rho_a))
6166

6267
h_a = smoothing_length(system, particle)
6368
h_b = smoothing_length(system, neighbor)
6469
h = (h_a + h_b) / 2
6570

6671
rho_mean = (rho_a + rho_b) / 2
6772

73+
# Since this is one of the most performance critical functions, using fast divisions
74+
# here gives a significant speedup on GPUs.
75+
# See the docs page "Development" for more details on `div_fast`.
6876
(; alpha, beta, epsilon) = viscosity
69-
mu = h * vr / (current_distance^2 + epsilon * h^2)
77+
mu = div_fast(h * vr, (current_distance^2 + epsilon * h^2))
7078
c = sound_speed
71-
pi_ab = (alpha * c * mu + beta * mu^2) / rho_mean * grad_kernel
79+
pi_ab = div_fast(alpha * c * mu + beta * mu^2, rho_mean) * grad_kernel
7280

73-
F = deformation_gradient(system, particle)
74-
det_F = det(F)
81+
det_F = det(F_a)
7582
if abs(det_F) < 1.0f-9
7683
return dv_particle
7784
end
7885
# See eq. 26 of Lin et al. (2015)
79-
dv_particle[] += m_b * det_F * inv(F)' * pi_ab
86+
dv_particle[] += m_b * det_F * inv(F_a)' * pi_ab
8087
end
8188

8289
return dv_particle

0 commit comments

Comments
 (0)