Skip to content

Commit 006d4f5

Browse files
committed
Optimize deformation gradient
1 parent e4befa2 commit 006d4f5

File tree

3 files changed

+59
-31
lines changed

3 files changed

+59
-31
lines changed

src/general/abstract_system.jl

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,17 +68,27 @@ end
6868
end
6969

7070
# Return `A[:, :, i]` as an `SMatrix`.
71-
@inline function extract_smatrix(A, system, particle)
72-
@boundscheck checkbounds(A, ndims(system), ndims(system), particle)
71+
@propagate_inbounds function extract_smatrix(A, system, particle)
72+
return extract_smatrix(A, Val(ndims(system)), particle)
73+
end
74+
75+
@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS}
76+
@boundscheck checkbounds(A, NDIMS, NDIMS, particle)
77+
78+
# Extract the matrix elements for this particle as a tuple to pass to SMatrix
79+
return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1,
80+
div(i - 1, NDIMS) + 1,
81+
particle]),
82+
Val(NDIMS^2)))
83+
end
84+
85+
@inline function extract_smatrix(A, ::Val{2}, particle)
86+
@boundscheck checkbounds(A, 2, 2, particle)
87+
88+
x = vloada(Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1))
7389

7490
# Extract the matrix elements for this particle as a tuple to pass to SMatrix
75-
return SMatrix{ndims(system),
76-
ndims(system)}(ntuple(@inline(i->@inbounds A[mod(i - 1,
77-
ndims(system)) + 1,
78-
div(i - 1,
79-
ndims(system)) + 1,
80-
particle]),
81-
Val(ndims(system)^2)))
91+
return SMatrix{2, 2}(Tuple(x))
8292
end
8393

8494
# Specifically get the current coordinates of a particle for all system types.

src/schemes/structure/total_lagrangian_sph/rhs.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ end
4040
current_coords_a = @inbounds current_coords(system, particle)
4141
F_a = @inbounds deformation_gradient(system, particle)
4242

43-
# Accumulate the RHS contributions over all neighbors before writing to `dv`,
43+
# Accumulate the RHS contributions over all neighbors before writing to `dv`
4444
# to reduce the number of memory writes.
4545
# Note that we need a `Ref` in order to be able to update these variables
4646
# inside the closure in the `foreach_neighbor` loop.

src/schemes/structure/total_lagrangian_sph/system.jl

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -489,30 +489,48 @@ end
489489

490490
# Loop over all pairs of particles and neighbors within the kernel cutoff
491491
initial_coords = initial_coordinates(system)
492-
foreach_point_neighbor(system, system, initial_coords, initial_coords,
493-
semi) do particle, neighbor, initial_pos_diff, initial_distance
494-
# Skip neighbors with the same position because the kernel gradient is zero.
495-
# Note that `return` only exits the closure, i.e., skips the current neighbor.
496-
skip_zero_distance(system) && initial_distance < almostzero && return
492+
neighborhood_search = get_neighborhood_search(system, system, semi)
497493

498-
# Now that we know that `distance` is not zero, we can safely call the unsafe
499-
# version of the kernel gradient to avoid redundant zero checks.
500-
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
501-
initial_distance, particle)
502-
503-
volume = @inbounds mass[neighbor] / material_density[neighbor]
504-
pos_diff_ = @inbounds current_coords(system, particle) -
505-
current_coords(system, neighbor)
506-
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
507-
pos_diff = convert.(eltype(system), pos_diff_)
508-
509-
# Multiply by L_{0a}
510-
L = @inbounds correction_matrix(system, particle)
511-
512-
result = volume * pos_diff * grad_kernel' * L'
494+
@threaded semi for particle in each_integrated_particle(system)
495+
# We are looping over the particles of `system`, so it is guaranteed
496+
# that `particle` is in bounds of `system`.
497+
current_coords_a = @inbounds current_coords(system, particle)
498+
L_a = @inbounds correction_matrix(system, particle)
499+
500+
# Accumulate the contributions over all neighbors before writing
501+
# to `deformation_grad` to reduce the number of memory writes.
502+
# Note that we need a `Ref` in order to be able to update these variables
503+
# inside the closure in the `foreach_neighbor` loop.
504+
result = Ref(zero(L_a))
505+
506+
# Loop over all neighbors within the kernel cutoff
507+
@inbounds PointNeighbors.foreach_neighbor(initial_coords, initial_coords,
508+
neighborhood_search,
509+
particle) do particle, neighbor,
510+
initial_pos_diff,
511+
initial_distance
512+
# Skip neighbors with the same position because the kernel gradient is zero.
513+
# Note that `return` only exits the closure, i.e., skips the current neighbor.
514+
skip_zero_distance(system) && initial_distance < almostzero && return
515+
516+
# Now that we know that `distance` is not zero, we can safely call the unsafe
517+
# version of the kernel gradient to avoid redundant zero checks.
518+
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
519+
initial_distance, particle)
520+
521+
volume = @inbounds mass[neighbor] / material_density[neighbor]
522+
current_coords_b = @inbounds current_coords(system, neighbor)
523+
pos_diff_ = current_coords_a - current_coords_b
524+
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
525+
pos_diff = convert.(eltype(system), pos_diff_)
526+
527+
# The tensor product pos_diff ⊗ (L_{0a} * ∇W) is equivalent to multiplication
528+
# by the transpose: pos_diff * (L_{0a} * ∇W)ᵀ = pos_diff * ∇Wᵀ * L_{0a}ᵀ.
529+
result[] -= volume * pos_diff * grad_kernel' * L_a'
530+
end
513531

514532
for j in 1:ndims(system), i in 1:ndims(system)
515-
@inbounds deformation_grad[i, j, particle] -= result[i, j]
533+
@inbounds deformation_grad[i, j, particle] += result[][i, j]
516534
end
517535
end
518536

0 commit comments

Comments
 (0)