@@ -489,30 +489,48 @@ end
489489
490490 # Loop over all pairs of particles and neighbors within the kernel cutoff
491491 initial_coords = initial_coordinates (system)
492- foreach_point_neighbor (system, system, initial_coords, initial_coords,
493- semi) do particle, neighbor, initial_pos_diff, initial_distance
494- # Skip neighbors with the same position because the kernel gradient is zero.
495- # Note that `return` only exits the closure, i.e., skips the current neighbor.
496- skip_zero_distance (system) && initial_distance < almostzero && return
492+ neighborhood_search = get_neighborhood_search (system, system, semi)
497493
498- # Now that we know that `distance` is not zero, we can safely call the unsafe
499- # version of the kernel gradient to avoid redundant zero checks.
500- grad_kernel = smoothing_kernel_grad_unsafe (system, initial_pos_diff,
501- initial_distance, particle)
502-
503- volume = @inbounds mass[neighbor] / material_density[neighbor]
504- pos_diff_ = @inbounds current_coords (system, particle) -
505- current_coords (system, neighbor)
506- # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
507- pos_diff = convert .(eltype (system), pos_diff_)
508-
509- # Multiply by L_{0a}
510- L = @inbounds correction_matrix (system, particle)
511-
512- result = volume * pos_diff * grad_kernel' * L'
494+ @threaded semi for particle in each_integrated_particle (system)
495+ # We are looping over the particles of `system`, so it is guaranteed
496+ # that `particle` is in bounds of `system`.
497+ current_coords_a = @inbounds current_coords (system, particle)
498+ L_a = @inbounds correction_matrix (system, particle)
499+
500+ # Accumulate the contributions over all neighbors before writing
501+ # to `deformation_grad` to reduce the number of memory writes.
502+ # Note that we need a `Ref` in order to be able to update these variables
503+ # inside the closure in the `foreach_neighbor` loop.
504+ result = Ref (zero (L_a))
505+
506+ # Loop over all neighbors within the kernel cutoff
507+ @inbounds PointNeighbors. foreach_neighbor (initial_coords, initial_coords,
508+ neighborhood_search,
509+ particle) do particle, neighbor,
510+ initial_pos_diff,
511+ initial_distance
512+ # Skip neighbors with the same position because the kernel gradient is zero.
513+ # Note that `return` only exits the closure, i.e., skips the current neighbor.
514+ skip_zero_distance (system) && initial_distance < almostzero && return
515+
516+ # Now that we know that `distance` is not zero, we can safely call the unsafe
517+ # version of the kernel gradient to avoid redundant zero checks.
518+ grad_kernel = smoothing_kernel_grad_unsafe (system, initial_pos_diff,
519+ initial_distance, particle)
520+
521+ volume = @inbounds mass[neighbor] / material_density[neighbor]
522+ current_coords_b = @inbounds current_coords (system, neighbor)
523+ pos_diff_ = current_coords_a - current_coords_b
524+ # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
525+ pos_diff = convert .(eltype (system), pos_diff_)
526+
527+ # The tensor product pos_diff ⊗ (L_{0a} * ∇W) is equivalent to multiplication
528+ # by the transpose: pos_diff * (L_{0a} * ∇W)ᵀ = pos_diff * ∇Wᵀ * L_{0a}ᵀ.
529+ result[] -= volume * pos_diff * grad_kernel' * L_a'
530+ end
513531
514532 for j in 1 : ndims (system), i in 1 : ndims (system)
515- @inbounds deformation_grad[i, j, particle] - = result[i, j]
533+ @inbounds deformation_grad[i, j, particle] + = result[] [i, j]
516534 end
517535 end
518536
0 commit comments