Skip to content

Commit c06d032

Browse files
committed
Combine interact! kernels to reduce GPU latency
1 parent 5a699d6 commit c06d032

6 files changed

Lines changed: 117 additions & 49 deletions

File tree

src/general/abstract_system.jl

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,20 @@ vtkname(system::AbstractBoundarySystem) = "boundary"
2626
# Number of particles in the system
2727
@inline nparticles(system) = length(system.mass)
2828

29-
# Number of particles in the system whose positions are to be integrated (corresponds to the size of u and du)
29+
# Number of particles in the system whose positions are to be integrated
30+
# (corresponds to the size of u and du).
31+
# See comment below for `each_integrated_particle` for more details.
3032
@inline n_integrated_particles(system) = nparticles(system)
3133

34+
# Iterator over all particle indices in the system
3235
@inline eachparticle(system::AbstractSystem) = each_active_particle(system)
3336
@inline eachparticle(initial_condition) = Base.OneTo(nparticles(initial_condition))
3437

35-
# Wrapper for systems with `SystemBuffer`
38+
# Iterator over all particle indices in the system whose positions are to be integrated
39+
# (corresponds to the size of u and du).
40+
# For most systems, this is the same as `eachparticle`, but for the
41+
# `TotalLagrangianSPHSystem`, the clamped particles are included in `eachparticle`
42+
# but not in `each_integrated_particle`.
3643
@inline each_integrated_particle(system) = each_integrated_particle(system, buffer(system))
3744
@inline function each_integrated_particle(system, ::Nothing)
3845
return Base.OneTo(n_integrated_particles(system))

src/general/semidiscretization.jl

Lines changed: 81 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT}
5555
ranges_v :: RV
5656
neighborhood_searches :: NS
5757
parallelization_backend :: BACKEND
58+
interaction_timers :: TIMERS
5859
update_callback_used :: UCU
5960
integrate_tlsph :: IT # `false` if TLSPH integration is decoupled
6061

@@ -64,19 +65,20 @@ struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT}
6465
# and by Adapt.jl.
6566
function Semidiscretization(systems::Tuple, ranges_u, ranges_v, neighborhood_searches,
6667
parallelization_backend::PointNeighbors.ParallelizationBackend,
67-
update_callback_used, integrate_tlsph)
68-
new{typeof(parallelization_backend), typeof(systems), typeof(ranges_u),
68+
interaction_timers, update_callback_used, integrate_tlsph)
69+
new{typeof(interaction_timers), typeof(systems), typeof(ranges_u),
6970
typeof(ranges_v), typeof(neighborhood_searches),
70-
typeof(update_callback_used),
71-
typeof(integrate_tlsph)}(systems, ranges_u, ranges_v,
72-
neighborhood_searches, parallelization_backend,
71+
typeof(parallelization_backend), typeof(update_callback_used),
72+
typeof(integrate_tlsph)}(systems, ranges_u, ranges_v, neighborhood_searches,
73+
parallelization_backend, interaction_timers,
7374
update_callback_used, integrate_tlsph)
7475
end
7576
end
7677

7778
function Semidiscretization(systems::Union{AbstractSystem, Nothing}...;
7879
neighborhood_search=GridNeighborhoodSearch{ndims(first(systems))}(),
79-
parallelization_backend=PolyesterBackend())
80+
parallelization_backend=PolyesterBackend(),
81+
interaction_timers=IndividualTimers())
8082
systems = filter(system -> !isnothing(system), systems)
8183

8284
if isempty(systems)
@@ -120,10 +122,13 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...;
120122
integrate_tlsph = Ref(true)
121123

122124
return Semidiscretization(systems, ranges_u, ranges_v, searches,
123-
parallelization_backend, update_callback_used,
124-
integrate_tlsph)
125+
parallelization_backend, interaction_timers,
126+
update_callback_used, integrate_tlsph)
125127
end
126128

129+
struct IndividualTimers end
130+
struct CombinedTimers end
131+
127132
# Inline show function e.g. Semidiscretization(neighborhood_search=...)
128133
function Base.show(io::IO, semi::Semidiscretization)
129134
@nospecialize semi # reduce precompilation time
@@ -649,8 +654,8 @@ end
649654
return -damping_coefficient * velocity
650655
end
651656

652-
function system_interaction!(dv_ode, v_ode, u_ode, semi)
653-
# Call `interact!` for each pair of systems
657+
function system_interaction!(dv_ode, v_ode, u_ode,
658+
semi::Semidiscretization{IndividualTimers})
654659
foreach_system(semi) do system
655660
foreach_system(semi) do neighbor
656661
# Construct string for the interactions timer.
@@ -663,7 +668,34 @@ function system_interaction!(dv_ode, v_ode, u_ode, semi)
663668
timer_str = ""
664669
end
665670

666-
interact!(dv_ode, v_ode, u_ode, system, neighbor, semi, timer_str=timer_str)
671+
# Call individual `interact!` for this pair of systems.
672+
# On GPUs, this is fully synchronized, so we get a separate timer for each
673+
# pair of systems.
674+
@trixi_timeit timer() timer_str begin
675+
interact!(dv_ode, v_ode, u_ode, system, neighbor, semi)
676+
end
677+
end
678+
end
679+
680+
return dv_ode
681+
end
682+
683+
function system_interaction!(dv_ode, v_ode, u_ode,
684+
semi::Semidiscretization{CombinedTimers})
685+
foreach_system(semi) do system
686+
# Construct string for the interactions timer.
687+
# Avoid allocations from string construction when no timers are used.
688+
if timeit_debug_enabled()
689+
system_index = system_indices(system, semi)
690+
timer_str = "$(timer_name(system))$system_index-*"
691+
else
692+
timer_str = ""
693+
end
694+
695+
# Call a combined `interact!` for all interactions of this system with other systems.
696+
# On GPUs, this is fully synchronized, so we get a separate timer for each system.
697+
@trixi_timeit timer() timer_str begin
698+
interact_combined!(dv_ode, v_ode, u_ode, system, semi)
667699
end
668700
end
669701

@@ -674,16 +706,51 @@ end
674706
# One can benchmark, e.g. the fluid-fluid interaction, with:
675707
# dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x;
676708
# @btime TrixiParticles.interact!($dv_ode, $v_ode, $u_ode, $fluid_system, $fluid_system, $semi);
677-
@inline function interact!(dv_ode, v_ode, u_ode, system, neighbor, semi; timer_str="")
709+
function interact!(dv_ode, v_ode, u_ode, system, neighbor, semi)
678710
dv = wrap_v(dv_ode, system, semi)
679711
v_system = wrap_v(v_ode, system, semi)
680712
u_system = wrap_u(u_ode, system, semi)
681713

682714
v_neighbor = wrap_v(v_ode, neighbor, semi)
683715
u_neighbor = wrap_u(u_ode, neighbor, semi)
684716

685-
@trixi_timeit timer() timer_str begin
686-
interact!(dv, v_system, u_system, v_neighbor, u_neighbor, system, neighbor, semi)
717+
nhs = get_neighborhood_search(system, neighbor, semi)
718+
719+
# Loop over all particles that are integrated for this system, i.e., all particles
720+
# for which `dv` has entries.
721+
@threaded semi for particle in each_integrated_particle(system)
722+
interact!(dv, v_system, u_system, v_neighbor, u_neighbor,
723+
system, neighbor, semi, nhs, particle)
724+
end
725+
end
726+
727+
# Benchmark the combined interaction for a system with:
728+
# dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x;
729+
# @btime TrixiParticles.interact_combined!($dv_ode, $v_ode, $u_ode, $system, $semi);
730+
function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true)
731+
dv = wrap_v(dv_ode, system, semi)
732+
v_system = wrap_v(v_ode, system, semi)
733+
u_system = wrap_u(u_ode, system, semi)
734+
735+
# Create an iterator combining systems with their wrapped arrays and NHS.
736+
# Note that we cannot use `get_neighborhood_search` because this will return
737+
# a NHS of a different type for TLSPH-TLSPH interactions, making the iterator tuple
738+
# type-unstable. The different self-interaction NHS for TLSPH is handled
739+
# by the `interact!` function for TLSPH.
740+
system_index = system_indices(system, semi)
741+
f(neighbor) = (neighbor, wrap_v(v_ode, neighbor, semi), wrap_u(u_ode, neighbor, semi),
742+
semi.neighborhood_searches[system_index][system_indices(neighbor, semi)])
743+
iterator = map(f, semi.systems)
744+
745+
# Loop over all particles that are integrated for this system, i.e., all particles
746+
# for which `dv` has entries.
747+
@threaded semi for particle in each_integrated_particle(system)
748+
# Now loop over all neighbor systems to avoid separate loops/kernels
749+
# for each pair of systems.
750+
foreach_noalloc(iterator) do (neighbor, v_neighbor, u_neighbor, nhs)
751+
interact!(dv, v_system, u_system, v_neighbor, u_neighbor,
752+
system, neighbor, semi, nhs, particle)
753+
end
687754
end
688755
end
689756

src/schemes/boundary/open_boundary/mirroring.jl

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,8 @@ function extrapolate_values!(system,
165165
ndims(system) + 1,
166166
eltype(system)}))
167167

168-
# TODO: Not public API
169-
PointNeighbors.foreach_neighbor(fluid_coords, nhs, particle, ghost_node_position,
170-
nhs.search_radius) do particle, neighbor, pos_diff,
171-
distance
168+
foreach_neighbor(fluid_coords, nhs, particle, ghost_node_position,
169+
nhs.search_radius) do particle, neighbor, pos_diff, distance
172170
m_b = hydrodynamic_mass(fluid_system, neighbor)
173171
rho_b = current_density(v_fluid, fluid_system, neighbor)
174172
pressure_b = current_pressure(v_fluid, fluid_system, neighbor)
@@ -305,10 +303,8 @@ function extrapolate_values!(system, mirror_method::ZerothOrderMirroring,
305303
interpolated_pressure = Ref(zero(eltype(system)))
306304
interpolated_velocity = Ref(zero(particle_coords))
307305

308-
# TODO: Not public API
309-
PointNeighbors.foreach_neighbor(fluid_coords, nhs, particle, ghost_node_position,
310-
nhs.search_radius) do particle, neighbor, pos_diff,
311-
distance
306+
foreach_neighbor(fluid_coords, nhs, particle, ghost_node_position,
307+
nhs.search_radius) do particle, neighbor, pos_diff, distance
312308
m_b = hydrodynamic_mass(fluid_system, neighbor)
313309
rho_b = current_density(v_fluid, fluid_system, neighbor)
314310
volume_b = m_b / rho_b

src/schemes/boundary/wall_boundary/rhs.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
function interact!(dv, v_particle_system, u_particle_system,
33
v_neighbor_system, u_neighbor_system,
44
particle_system::Union{AbstractBoundarySystem, OpenBoundarySystem},
5-
neighbor_system, semi)
5+
neighbor_system, semi, nhs, particle)
66
# TODO Solids and moving boundaries should be considered in the continuity equation
77
return dv
88
end

src/schemes/fluid/weakly_compressible_sph/rhs.jl

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
# in `neighbor_system` and updates `dv` accordingly.
33
# It takes into account pressure forces, viscosity, and for `ContinuityDensity` updates
44
# the density using the continuity equation.
5-
function interact!(dv, v_particle_system, u_particle_system,
6-
v_neighbor_system, u_neighbor_system,
7-
particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi)
5+
@inline function interact!(dv, v_particle_system, u_particle_system,
6+
v_neighbor_system, u_neighbor_system,
7+
particle_system::WeaklyCompressibleSPHSystem, neighbor_system,
8+
semi, nhs, particle)
89
(; density_calculator, correction) = particle_system
910

1011
sound_speed = system_sound_speed(particle_system)
@@ -20,12 +21,8 @@ function interact!(dv, v_particle_system, u_particle_system,
2021
# debug_array = zeros(ndims(particle_system), nparticles(particle_system))
2122

2223
# Loop over all pairs of particles and neighbors within the kernel cutoff
23-
foreach_point_neighbor(particle_system, neighbor_system,
24-
system_coords, neighbor_system_coords, semi;
25-
points=each_integrated_particle(particle_system)) do particle,
26-
neighbor,
27-
pos_diff,
28-
distance
24+
foreach_neighbor(system_coords, neighbor_system_coords,
25+
nhs, particle) do particle, neighbor, pos_diff, distance
2926
# `foreach_point_neighbor` makes sure that `particle` and `neighbor` are
3027
# in bounds of the respective system. For performance reasons, we use `@inbounds`
3128
# in this hot loop to avoid bounds checking when extracting particle quantities.

src/schemes/structure/total_lagrangian_sph/rhs.jl

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,34 @@
22
function interact!(dv, v_particle_system, u_particle_system,
33
v_neighbor_system, u_neighbor_system,
44
particle_system::TotalLagrangianSPHSystem,
5-
neighbor_system::TotalLagrangianSPHSystem, semi;
5+
neighbor_system::TotalLagrangianSPHSystem,
6+
semi, nhs, particle;
67
integrate_tlsph=semi.integrate_tlsph[])
78
# Different structures do not interact with each other (yet)
89
particle_system === neighbor_system || return dv
910

1011
# Skip interaction if TLSPH systems are integrated separately
1112
integrate_tlsph || return dv
1213

13-
interact_structure_structure!(dv, v_particle_system, particle_system, semi)
14+
interact_structure_structure!(dv, v_particle_system, particle_system, semi, particle)
1415
end
1516

1617
# Function barrier without dispatch for unit testing
17-
@inline function interact_structure_structure!(dv, v_system, system, semi)
18+
@inline function interact_structure_structure!(dv, v_system, system, semi, particle)
1819
(; penalty_force) = system
1920

2021
# Everything here is done in the initial coordinates
2122
system_coords = initial_coordinates(system)
2223

24+
# Specialized neighborhood search for structure-structure interaction that is
25+
# optimized for finding neighbors in the initial configuration.
26+
nhs = system.self_interaction_nhs
27+
2328
# Loop over all pairs of particles and neighbors within the kernel cutoff.
2429
# For structure-structure interaction, this has to happen in the initial coordinates.
25-
foreach_point_neighbor(system, system, system_coords, system_coords, semi;
26-
points=each_integrated_particle(system)) do particle, neighbor,
27-
initial_pos_diff,
28-
initial_distance
30+
foreach_neighbor(system_coords, system_coords, nhs,
31+
particle) do particle, neighbor,
32+
initial_pos_diff, initial_distance
2933
# Only consider particles with a distance > 0.
3034
# See `src/general/smoothing_kernels.jl` for more details.
3135
initial_distance^2 < eps(initial_smoothing_length(system)^2) && return
@@ -76,7 +80,7 @@ end
7680
function interact!(dv, v_particle_system, u_particle_system,
7781
v_neighbor_system, u_neighbor_system,
7882
particle_system::TotalLagrangianSPHSystem,
79-
neighbor_system::AbstractFluidSystem, semi;
83+
neighbor_system::AbstractFluidSystem, semi, nhs, particle;
8084
integrate_tlsph=semi.integrate_tlsph[])
8185
# Skip interaction if TLSPH systems are integrated separately
8286
integrate_tlsph || return dv
@@ -87,12 +91,8 @@ function interact!(dv, v_particle_system, u_particle_system,
8791
neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
8892

8993
# Loop over all pairs of particles and neighbors within the kernel cutoff
90-
foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords,
91-
semi;
92-
points=each_integrated_particle(particle_system)) do particle,
93-
neighbor,
94-
pos_diff,
95-
distance
94+
foreach_neighbor(system_coords, neighbor_coords,
95+
nhs, particle) do particle, neighbor, pos_diff, distance
9696
# Only consider particles with a distance > 0.
9797
# See `src/general/smoothing_kernels.jl` for more details.
9898
distance^2 < eps(initial_smoothing_length(particle_system)^2) && return
@@ -183,7 +183,8 @@ end
183183
function interact!(dv, v_particle_system, u_particle_system,
184184
v_neighbor_system, u_neighbor_system,
185185
particle_system::TotalLagrangianSPHSystem,
186-
neighbor_system::Union{WallBoundarySystem, OpenBoundarySystem}, semi;
186+
neighbor_system::Union{WallBoundarySystem, OpenBoundarySystem},
187+
semi, nhs, particle;
187188
integrate_tlsph=semi.integrate_tlsph[])
188189
# TODO continuity equation?
189190
return dv

0 commit comments

Comments
 (0)