@@ -55,6 +55,7 @@ struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT}
5555 ranges_v :: RV
5656 neighborhood_searches :: NS
5757 parallelization_backend :: BACKEND
58+ interaction_timers :: TIMERS
5859 update_callback_used :: UCU
5960 integrate_tlsph :: IT # `false` if TLSPH integration is decoupled
6061
@@ -64,19 +65,20 @@ struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT}
6465 # and by Adapt.jl.
# Inner constructor taking every field explicitly.
# All type parameters are derived from the types of the passed arguments.
# `parallelization_backend` must be a `PointNeighbors.ParallelizationBackend`.
function Semidiscretization(systems::Tuple, ranges_u, ranges_v, neighborhood_searches,
                            parallelization_backend::PointNeighbors.ParallelizationBackend,
                            interaction_timers, update_callback_used, integrate_tlsph)
    # The timers type is deliberately the FIRST type parameter: the
    # `system_interaction!` methods dispatch on `Semidiscretization{IndividualTimers}`
    # and `Semidiscretization{CombinedTimers}`.
    timers_type = typeof(interaction_timers)
    systems_type = typeof(systems)
    ranges_u_type = typeof(ranges_u)
    ranges_v_type = typeof(ranges_v)
    searches_type = typeof(neighborhood_searches)
    backend_type = typeof(parallelization_backend)
    callback_flag_type = typeof(update_callback_used)
    tlsph_flag_type = typeof(integrate_tlsph)

    return new{timers_type, systems_type, ranges_u_type, ranges_v_type, searches_type,
               backend_type, callback_flag_type,
               tlsph_flag_type}(systems, ranges_u, ranges_v, neighborhood_searches,
                                parallelization_backend, interaction_timers,
                                update_callback_used, integrate_tlsph)
end
7576end
7677
7778function Semidiscretization (systems:: Union{AbstractSystem, Nothing} ...;
7879 neighborhood_search= GridNeighborhoodSearch {ndims(first(systems))} (),
79- parallelization_backend= PolyesterBackend ())
80+ parallelization_backend= PolyesterBackend (),
81+ interaction_timers= IndividualTimers ())
8082 systems = filter (system -> ! isnothing (system), systems)
8183
8284 if isempty (systems)
@@ -120,10 +122,13 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...;
120122 integrate_tlsph = Ref (true )
121123
122124 return Semidiscretization (systems, ranges_u, ranges_v, searches,
123- parallelization_backend, update_callback_used ,
124- integrate_tlsph)
125+ parallelization_backend, interaction_timers ,
126+ update_callback_used, integrate_tlsph)
125127end
126128
# Field-less marker type for the `interaction_timers` keyword of `Semidiscretization`
# (this is the keyword's default value).
# With this marker, `system_interaction!` wraps the `interact!` call of every
# pair of systems in its own timer.
struct IndividualTimers
end
# Field-less marker type for the `interaction_timers` keyword of `Semidiscretization`.
# With this marker, `system_interaction!` calls `interact_combined!` once per system
# and wraps that single call (covering all neighbor systems) in one timer.
struct CombinedTimers
end
131+
127132# Inline show function e.g. Semidiscretization(neighborhood_search=...)
128133function Base. show (io:: IO , semi:: Semidiscretization )
129134 @nospecialize semi # reduce precompilation time
649654 return - damping_coefficient * velocity
650655end
651656
652- function system_interaction! (dv_ode, v_ode, u_ode, semi)
653- # Call `interact!` for each pair of systems
657+ function system_interaction! (dv_ode, v_ode, u_ode,
658+ semi :: Semidiscretization{IndividualTimers} )
654659 foreach_system (semi) do system
655660 foreach_system (semi) do neighbor
656661 # Construct string for the interactions timer.
@@ -663,7 +668,34 @@ function system_interaction!(dv_ode, v_ode, u_ode, semi)
663668 timer_str = " "
664669 end
665670
666- interact! (dv_ode, v_ode, u_ode, system, neighbor, semi, timer_str= timer_str)
671+ # Call individual `interact!` for this pair of systems.
672+ # On GPUs, this is fully synchronized, so we get a separate timer for each
673+ # pair of systems.
674+ @trixi_timeit timer () timer_str begin
675+ interact! (dv_ode, v_ode, u_ode, system, neighbor, semi)
676+ end
677+ end
678+ end
679+
680+ return dv_ode
681+ end
682+
683+ function system_interaction! (dv_ode, v_ode, u_ode,
684+ semi:: Semidiscretization{CombinedTimers} )
685+ foreach_system (semi) do system
686+ # Construct string for the interactions timer.
687+ # Avoid allocations from string construction when no timers are used.
688+ if timeit_debug_enabled ()
689+ system_index = system_indices (system, semi)
690+ timer_str = " $(timer_name (system))$system_index -*"
691+ else
692+ timer_str = " "
693+ end
694+
695+ # Call a combined `interact!` for all interactions of this system with other systems.
696+ # On GPUs, this is fully synchronized, so we get a separate timer for each system.
697+ @trixi_timeit timer () timer_str begin
698+ interact_combined! (dv_ode, v_ode, u_ode, system, semi)
667699 end
668700 end
669701
@@ -674,16 +706,51 @@ end
674706# One can benchmark, e.g. the fluid-fluid interaction, with:
675707# dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x;
676708# @btime TrixiParticles.interact!($dv_ode, $v_ode, $u_ode, $fluid_system, $fluid_system, $semi);
# Evaluate the interaction of `system` with one `neighbor` system.
#
# `dv_ode`, `v_ode` and `u_ode` are the global ODE arrays; the parts belonging to
# `system` and `neighbor` are extracted with `wrap_v`/`wrap_u`. The per-particle
# `interact!` method is then called for every integrated particle of `system`.
# Timing is not done here; callers wrap this call in a timer if they want one.
function interact!(dv_ode, v_ode, u_ode, system, neighbor, semi)
    # Wrapped arrays of the neighbor system
    v_neighbor = wrap_v(v_ode, neighbor, semi)
    u_neighbor = wrap_u(u_ode, neighbor, semi)

    # Wrapped arrays of the system whose `dv` is updated
    dv_system = wrap_v(dv_ode, system, semi)
    v_system = wrap_v(v_ode, system, semi)
    u_system = wrap_u(u_ode, system, semi)

    # Neighborhood search for this (system, neighbor) pair
    nhs = get_neighborhood_search(system, neighbor, semi)

    # Only particles that are integrated for this system have entries in `dv`,
    # so these are the only particles we need to visit.
    @threaded semi for particle in each_integrated_particle(system)
        interact!(dv_system, v_system, u_system, v_neighbor, u_neighbor,
                  system, neighbor, semi, nhs, particle)
    end
end
726+
727+ # Benchmark the combined interaction for a system with:
728+ # dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x;
729+ # @btime TrixiParticles.interact_combined!($dv_ode, $v_ode, $u_ode, $system, $semi);
# Evaluate the interactions of `system` with ALL systems in `semi.systems`
# inside a single threaded particle loop, instead of one loop per pair of systems.
#
# `dv_ode`, `v_ode` and `u_ode` are the global ODE arrays; the parts belonging to
# each system are extracted with `wrap_v`/`wrap_u`.
#
# NOTE(review): the keyword `synchronize` is accepted but not referenced anywhere
# in this body -- confirm whether it is meant to be forwarded to the threaded loop.
function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true)
    dv_system = wrap_v(dv_ode, system, semi)
    v_system = wrap_v(v_ode, system, semi)
    u_system = wrap_u(u_ode, system, semi)

    # Bundle every neighbor system with its wrapped arrays and its NHS.
    # The NHS is read directly from `semi.neighborhood_searches` rather than via
    # `get_neighborhood_search`, because the latter returns a NHS of a different type
    # for TLSPH-TLSPH interactions, which would make this tuple type-unstable.
    # The different self-interaction NHS for TLSPH is handled by the `interact!`
    # function for TLSPH.
    system_index = system_indices(system, semi)
    neighbor_data = map(semi.systems) do neighbor
        neighbor_index = system_indices(neighbor, semi)
        return (neighbor, wrap_v(v_ode, neighbor, semi), wrap_u(u_ode, neighbor, semi),
                semi.neighborhood_searches[system_index][neighbor_index])
    end

    # Only particles that are integrated for this system have entries in `dv`,
    # so these are the only particles we need to visit.
    @threaded semi for particle in each_integrated_particle(system)
        # Visit all neighbor systems inside the particle loop to avoid
        # separate loops/kernels for each pair of systems.
        foreach_noalloc(neighbor_data) do (neighbor, v_neighbor, u_neighbor, nhs)
            interact!(dv_system, v_system, u_system, v_neighbor, u_neighbor,
                      system, neighbor, semi, nhs, particle)
        end
    end
end
689756
0 commit comments