Improve performance of TLSPH RHS

efaulhaber · efaulhaber · commit e4befa2350f3 · 2026-04-13T17:17:12.000+02:00
diff --git a/src/schemes/boundary/wall_boundary/system.jl b/src/schemes/boundary/wall_boundary/system.jl
@@ -179,6 +179,11 @@ end
     return kernel(smoothing_kernel, distance, smoothing_length)
 end
 
+@inline function smoothing_kernel_unsafe(system::WallBoundarySystem, distance, particle)
+    (; smoothing_kernel, smoothing_length) = system.boundary_model
+    return kernel_unsafe(smoothing_kernel, distance, smoothing_length)
+end
+
 @inline function smoothing_length(system::WallBoundarySystem, particle)
     return smoothing_length(system.boundary_model, particle)
 end
diff --git a/src/schemes/structure/total_lagrangian_sph/penalty_force.jl b/src/schemes/structure/total_lagrangian_sph/penalty_force.jl
@@ -14,38 +14,51 @@ struct PenaltyForceGanzenmueller{ELTYPE}
     end
 end
 
-@inline function dv_penalty_force(penalty_force::Nothing,
-                                  particle, neighbor, initial_pos_diff, initial_distance,
-                                  current_pos_diff, current_distance,
-                                  system, m_a, m_b, rho_a, rho_b)
-    return zero(initial_pos_diff)
+@inline function dv_penalty_force!(dv_particle, penalty_force::Nothing,
+                                   particle, neighbor, initial_pos_diff, initial_distance,
+                                   current_pos_diff, current_distance,
+                                   system, m_a, m_b, rho_a, rho_b, F_a, F_b)
+    return dv_particle
 end
 
-@propagate_inbounds function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
-                                              particle, neighbor, initial_pos_diff,
-                                              initial_distance,
-                                              current_pos_diff, current_distance,
-                                              system, m_a, m_b, rho_a, rho_b)
-    volume_a = m_a / rho_a
-    volume_b = m_b / rho_b
+@propagate_inbounds function dv_penalty_force!(dv_particle,
+                                               penalty_force::PenaltyForceGanzenmueller,
+                                               particle, neighbor, initial_pos_diff,
+                                               initial_distance,
+                                               current_pos_diff, current_distance,
+                                               system, m_a, m_b, rho_a, rho_b, F_a, F_b)
+    (; alpha) = penalty_force
 
-    kernel_weight = smoothing_kernel(system, initial_distance, particle)
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    volume_a = div_fast(m_a, rho_a)
+    volume_b = div_fast(m_b, rho_b)
 
-    F_a = deformation_gradient(system, particle)
-    F_b = deformation_gradient(system, neighbor)
+    # This function is called after a compact support check, so we can use the unsafe
+    # kernel function, which does not check the distance again.
+    kernel_weight = smoothing_kernel_unsafe(system, initial_distance, particle)
 
-    inv_current_distance = 1 / current_distance
+    E_a = young_modulus(system, particle)
+    E_b = young_modulus(system, neighbor)
 
-    # Use the symmetry of epsilon to simplify computations
-    eps_sum = (F_a + F_b) * initial_pos_diff - 2 * current_pos_diff
-    delta_sum = dot(eps_sum, current_pos_diff) * inv_current_distance
+    eps_a = F_a * initial_pos_diff - current_pos_diff
+    eps_b = -(F_b * initial_pos_diff - current_pos_diff)
 
-    E = young_modulus(system, particle)
+    # `eps_b` belongs to the pair (b, a), so it is projected onto `-current_pos_diff`.
+    # This is (E_a * delta_a + E_b * delta_b) * current_distance; the division by
+    # `current_distance` is pulled out and merged into the single division below.
+    delta_sum = E_a * dot(eps_a, current_pos_diff) + E_b * dot(eps_b, -current_pos_diff)
 
-    f = (penalty_force.alpha / 2) * volume_a * volume_b *
-        kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff *
-        inv_current_distance
+    # The division contains all scalar factors, which are then multiplied by
+    # the vector `current_pos_diff` at the end.
+    # We already divide by `m_a` to obtain an acceleration.
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    dv_particle[] += div_fast((alpha / 2) * volume_a * volume_b * kernel_weight * delta_sum,
+                              initial_distance^2 * current_distance^2 * m_a) *
+                     current_pos_diff
 
-    # Divide force by mass to obtain acceleration
-    return f / m_a
+    return dv_particle
 end
diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl
@@ -19,6 +19,7 @@ end
 
     # Everything here is done in the initial coordinates
     system_coords = initial_coordinates(system)
+    neighborhood_search = get_neighborhood_search(system, system, semi)
 
     # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
     # and the density diffusion divide by zero.
@@ -29,48 +30,62 @@ end
     h = initial_smoothing_length(system)
     almostzero = sqrt(eps(h^2))
 
-    # Loop over all pairs of particles and neighbors within the kernel cutoff.
-    # For structure-structure interaction, this has to happen in the initial coordinates.
-    foreach_point_neighbor(system, system, system_coords, system_coords, semi;
-                           points=each_integrated_particle(system)) do particle, neighbor,
-                                                                       initial_pos_diff,
-                                                                       initial_distance
-        # Skip neighbors with the same position because the kernel gradient is zero.
-        # Note that `return` only exits the closure, i.e., skips the current neighbor.
-        skip_zero_distance(system) && initial_distance < almostzero && return
-
-        # Now that we know that `distance` is not zero, we can safely call the unsafe
-        # version of the kernel gradient to avoid redundant zero checks.
-        grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
-                                                   initial_distance, particle)
-
-        rho_a = @inbounds system.material_density[particle]
-        rho_b = @inbounds system.material_density[neighbor]
-
+    @threaded semi for particle in each_integrated_particle(system)
+        # We are looping over the particles of `system`, so it is guaranteed
+        # that `particle` is in bounds of `system`.
         m_a = @inbounds system.mass[particle]
-        m_b = @inbounds system.mass[neighbor]
-
+        rho_a = @inbounds system.material_density[particle]
         # PK1 / rho^2
         pk1_rho2_a = @inbounds pk1_rho2(system, particle)
-        pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
-
-        current_pos_diff_ = @inbounds current_coords(system, particle) -
-                                      current_coords(system, neighbor)
-        # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
-        current_pos_diff = convert.(eltype(system), current_pos_diff_)
-        current_distance = norm(current_pos_diff)
-
-        dv_stress = m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
-
-        dv_penalty_force_ = @inbounds dv_penalty_force(penalty_force, particle, neighbor,
-                                                       initial_pos_diff, initial_distance,
-                                                       current_pos_diff, current_distance,
-                                                       system, m_a, m_b, rho_a, rho_b)
-
-        dv_particle = Ref(dv_stress + dv_penalty_force_)
-        @inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
-                                      current_pos_diff, current_distance,
-                                      m_a, m_b, rho_a, rho_b, grad_kernel)
+        current_coords_a = @inbounds current_coords(system, particle)
+        F_a = @inbounds deformation_gradient(system, particle)
+
+        # Accumulate the RHS contributions over all neighbors before writing to `dv`,
+        # to reduce the number of memory writes.
+        # Note that we need a `Ref` in order to be able to update these variables
+        # inside the closure in the `foreach_neighbor` loop.
+        dv_particle = Ref(zero(current_coords_a))
+
+        # Loop over all neighbors within the kernel cutoff
+        @inbounds PointNeighbors.foreach_neighbor(system_coords, system_coords,
+                                                  neighborhood_search,
+                                                  particle) do particle, neighbor,
+                                                               initial_pos_diff,
+                                                               initial_distance
+            # Skip neighbors with the same position because the kernel gradient is zero.
+            # Note that `return` only exits the closure, i.e., skips the current neighbor.
+            skip_zero_distance(system) && initial_distance < almostzero && return
+
+            # Now that we know that `distance` is not zero, we can safely call the unsafe
+            # version of the kernel gradient to avoid redundant zero checks.
+            grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
+                                                       initial_distance, particle)
+
+            rho_b = @inbounds system.material_density[neighbor]
+            m_b = @inbounds system.mass[neighbor]
+            # PK1 / rho^2
+            pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
+            current_coords_b = @inbounds current_coords(system, neighbor)
+
+            # The compiler is smart enough to optimize this away if no penalty force is used
+            F_b = @inbounds deformation_gradient(system, neighbor)
+
+            current_pos_diff_ = current_coords_a - current_coords_b
+            # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
+            current_pos_diff = convert.(eltype(system), current_pos_diff_)
+            current_distance = norm(current_pos_diff)
+
+            dv_particle[] += m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
+
+            @inbounds dv_penalty_force!(dv_particle, penalty_force, particle, neighbor,
+                                        initial_pos_diff, initial_distance,
+                                        current_pos_diff, current_distance,
+                                        system, m_a, m_b, rho_a, rho_b, F_a, F_b)
+
+            @inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
+                                          current_pos_diff, current_distance,
+                                          m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
+        end
 
         for i in 1:ndims(system)
             @inbounds dv[i, particle] += dv_particle[][i]
diff --git a/src/schemes/structure/total_lagrangian_sph/viscosity.jl b/src/schemes/structure/total_lagrangian_sph/viscosity.jl
@@ -4,21 +4,21 @@
 @propagate_inbounds function dv_viscosity_tlsph!(dv_particle, system, v_system,
                                                  particle, neighbor,
                                                  current_pos_diff, current_distance,
-                                                 m_a, m_b, rho_a, rho_b, grad_kernel)
+                                                 m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
     viscosity = system.viscosity
 
     return dv_viscosity_tlsph!(dv_particle, viscosity, system, v_system,
                                particle, neighbor, current_pos_diff, current_distance,
-                               m_a, m_b, rho_a, rho_b, grad_kernel)
+                               m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
 end
 
 @propagate_inbounds function dv_viscosity_tlsph!(dv_particle, viscosity, system,
                                                  v_system, particle, neighbor,
                                                  current_pos_diff, current_distance,
-                                                 m_a, m_b, rho_a, rho_b, grad_kernel)
+                                                 m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
     return viscosity(dv_particle, system, v_system, particle, neighbor,
                      current_pos_diff, current_distance,
-                     m_a, m_b, rho_a, rho_b, grad_kernel)
+                     m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
 end
 
 @inline function dv_viscosity_tlsph!(dv_particle, viscosity::Nothing, system,
@@ -38,7 +38,8 @@ end
                                                                       current_pos_diff,
                                                                       current_distance,
                                                                       m_a, m_b, rho_a,
-                                                                      rho_b, grad_kernel)
+                                                                      rho_b, F_a,
+                                                                      grad_kernel)
     v_a = current_velocity(v_system, system, particle)
     v_b = current_velocity(v_system, system, neighbor)
     v_diff = v_a - v_b
@@ -54,29 +55,35 @@ end
         # Compute bulk modulus from Young's modulus and Poisson's ratio.
         # See the table at the end of https://en.wikipedia.org/wiki/Lam%C3%A9_parameters
         E = young_modulus(system, particle)
+        # A fast division is slower here for some reason
         K = E / (ndims(system) * (1 - 2 * poisson_ratio(system, particle)))
 
         # Newton–Laplace equation
-        sound_speed = sqrt(K / rho_a)
+        # Since this is one of the most performance critical functions, using fast divisions
+        # here gives a significant speedup on GPUs.
+        # See the docs page "Development" for more details on `div_fast`.
+        sound_speed = sqrt(div_fast(K, rho_a))
 
         h_a = smoothing_length(system, particle)
         h_b = smoothing_length(system, neighbor)
         h = (h_a + h_b) / 2
 
         rho_mean = (rho_a + rho_b) / 2
 
+        # Since this is one of the most performance critical functions, using fast divisions
+        # here gives a significant speedup on GPUs.
+        # See the docs page "Development" for more details on `div_fast`.
         (; alpha, beta, epsilon) = viscosity
-        mu = h * vr / (current_distance^2 + epsilon * h^2)
+        mu = div_fast(h * vr, (current_distance^2 + epsilon * h^2))
         c = sound_speed
-        pi_ab = (alpha * c * mu + beta * mu^2) / rho_mean * grad_kernel
+        pi_ab = div_fast(alpha * c * mu + beta * mu^2, rho_mean) * grad_kernel
 
-        F = deformation_gradient(system, particle)
-        det_F = det(F)
+        det_F = det(F_a)
         if abs(det_F) < 1.0f-9
             return dv_particle
         end
         # See eq. 26 of Lin et al. (2015)
-        dv_particle[] += m_b * det_F * inv(F)' * pi_ab
+        dv_particle[] += m_b * det_F * inv(F_a)' * pi_ab
     end
 
     return dv_particle