Optimize deformation gradient

efaulhaber · efaulhaber · commit 006d4f5275f4 · 2026-04-13T17:17:12.000+02:00
diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl
@@ -68,17 +68,27 @@ end
 end
 
 # Return `A[:, :, i]` as an `SMatrix`.
-@inline function extract_smatrix(A, system, particle)
-    @boundscheck checkbounds(A, ndims(system), ndims(system), particle)
+@propagate_inbounds function extract_smatrix(A, system, particle)
+    return extract_smatrix(A, Val(ndims(system)), particle)
+end
+
+@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS}
+    @boundscheck checkbounds(A, NDIMS, NDIMS, particle)
+
+    # Extract the matrix elements for this particle as a tuple to pass to SMatrix
+    return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1,
+                                                               div(i - 1, NDIMS) + 1,
+                                                               particle]),
+                                        Val(NDIMS^2)))
+end
+
+@inline function extract_smatrix(A, ::Val{2}, particle)
+    @boundscheck checkbounds(A, 2, 2, particle)
+
+    x = vloada(Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1))
 
     # Extract the matrix elements for this particle as a tuple to pass to SMatrix
-    return SMatrix{ndims(system),
-                   ndims(system)}(ntuple(@inline(i->@inbounds A[mod(i - 1,
-                                                                    ndims(system)) + 1,
-                                                                div(i - 1,
-                                                                    ndims(system)) + 1,
-                                                                particle]),
-                                         Val(ndims(system)^2)))
+    return SMatrix{2, 2}(Tuple(x))
 end
 
 # Specifically get the current coordinates of a particle for all system types.
diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl
@@ -40,7 +40,7 @@ end
         current_coords_a = @inbounds current_coords(system, particle)
         F_a = @inbounds deformation_gradient(system, particle)
 
-        # Accumulate the RHS contributions over all neighbors before writing to `dv`,
+        # Accumulate the RHS contributions over all neighbors before writing to `dv`
         # to reduce the number of memory writes.
         # Note that we need a `Ref` in order to be able to update these variables
         # inside the closure in the `foreach_neighbor` loop.
diff --git a/src/schemes/structure/total_lagrangian_sph/system.jl b/src/schemes/structure/total_lagrangian_sph/system.jl
@@ -489,30 +489,48 @@ end
 
     # Loop over all pairs of particles and neighbors within the kernel cutoff
     initial_coords = initial_coordinates(system)
-    foreach_point_neighbor(system, system, initial_coords, initial_coords,
-                           semi) do particle, neighbor, initial_pos_diff, initial_distance
-        # Skip neighbors with the same position because the kernel gradient is zero.
-        # Note that `return` only exits the closure, i.e., skips the current neighbor.
-        skip_zero_distance(system) && initial_distance < almostzero && return
+    neighborhood_search = get_neighborhood_search(system, system, semi)
 
-        # Now that we know that `distance` is not zero, we can safely call the unsafe
-        # version of the kernel gradient to avoid redundant zero checks.
-        grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
-                                                   initial_distance, particle)
-
-        volume = @inbounds mass[neighbor] / material_density[neighbor]
-        pos_diff_ = @inbounds current_coords(system, particle) -
-                              current_coords(system, neighbor)
-        # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
-        pos_diff = convert.(eltype(system), pos_diff_)
-
-        # Multiply by L_{0a}
-        L = @inbounds correction_matrix(system, particle)
-
-        result = volume * pos_diff * grad_kernel' * L'
+    @threaded semi for particle in each_integrated_particle(system)
+        # We are looping over the particles of `system`, so it is guaranteed
+        # that `particle` is in bounds of `system`.
+        current_coords_a = @inbounds current_coords(system, particle)
+        L_a = @inbounds correction_matrix(system, particle)
+
+        # Accumulate the contributions over all neighbors before writing
+        # to `deformation_grad` to reduce the number of memory writes.
+        # Note that we need a `Ref` in order to be able to update these variables
+        # inside the closure in the `foreach_neighbor` loop.
+        result = Ref(zero(L_a))
+
+        # Loop over all neighbors within the kernel cutoff
+        @inbounds PointNeighbors.foreach_neighbor(initial_coords, initial_coords,
+                                                  neighborhood_search,
+                                                  particle) do particle, neighbor,
+                                                               initial_pos_diff,
+                                                               initial_distance
+            # Skip neighbors with the same position because the kernel gradient is zero.
+            # Note that `return` only exits the closure, i.e., skips the current neighbor.
+            skip_zero_distance(system) && initial_distance < almostzero && return
+
+            # Now that we know that `distance` is not zero, we can safely call the unsafe
+            # version of the kernel gradient to avoid redundant zero checks.
+            grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
+                                                       initial_distance, particle)
+
+            volume = @inbounds mass[neighbor] / material_density[neighbor]
+            current_coords_b = @inbounds current_coords(system, neighbor)
+            pos_diff_ = current_coords_a - current_coords_b
+            # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
+            pos_diff = convert.(eltype(system), pos_diff_)
+
+            # The tensor product pos_diff ⊗ (L_{0a} * ∇W) is equivalent to multiplication
+            # by the transpose: pos_diff * (L_{0a} * ∇W)ᵀ = pos_diff * ∇Wᵀ * L_{0a}ᵀ.
+            result[] -= volume * pos_diff * grad_kernel' * L_a'
+        end
 
         for j in 1:ndims(system), i in 1:ndims(system)
-            @inbounds deformation_grad[i, j, particle] -= result[i, j]
+            @inbounds deformation_grad[i, j, particle] += result[][i, j]
         end
     end