@@ -30,8 +30,11 @@ function interact!(dv, v_particle_system, u_particle_system,
3030 m_a = @inbounds hydrodynamic_mass (particle_system, particle)
3131 p_a = @inbounds current_pressure (v_particle_system, particle_system, particle)
3232
33- v_a = @inbounds current_velocity (v_particle_system, particle_system, particle)
34- rho_a = @inbounds current_density (v_particle_system, particle_system, particle)
33+ # In 3D, this function can combine velocity and density load into one wide load,
34+ # which gives a significant speedup on GPUs.
35+ (v_a,
36+ rho_a) = @inbounds velocity_and_density (v_particle_system, particle_system,
37+ particle)
3538
3639 # Accumulate the RHS contributions over all neighbors before writing to `dv`,
3740 # to reduce the number of memory writes.
@@ -56,8 +59,9 @@ function interact!(dv, v_particle_system, u_particle_system,
5659
5760 # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
5861 m_b = @inbounds hydrodynamic_mass (neighbor_system, neighbor)
59- v_b = @inbounds current_velocity (v_neighbor_system, neighbor_system, neighbor)
60- rho_b = @inbounds current_density (v_neighbor_system, neighbor_system, neighbor)
62+ (v_b,
63+ rho_b) = @inbounds velocity_and_density (v_neighbor_system, neighbor_system,
64+ neighbor)
6165
6266 # The following call is equivalent to
6367 # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
135139 neighbor, p_a)
136140 return p_a
137141end
142+
143+ @propagate_inbounds function velocity_and_density (v, system:: WeaklyCompressibleSPHSystem ,
144+ particle)
145+ (; density_calculator) = system
146+
147+ return velocity_and_density (v, density_calculator, system, particle)
148+ end
149+
150+ @propagate_inbounds function velocity_and_density (v, _, system, particle)
151+ v_particle = current_velocity (v, system, particle)
152+ rho_particle = current_density (v, system, particle)
153+
154+ return v_particle, rho_particle
155+ end
156+
157+ @inline function velocity_and_density (v:: AbstractGPUArray , :: ContinuityDensity ,
158+ :: WeaklyCompressibleSPHSystem{3} , particle)
159+ # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column
160+ # of `v` corresponding to `particle`.
161+ # As opposed to `extract_svector`, this will translate to a single wide load instruction
162+ # on the GPU, which is faster than 4 separate loads.
163+ vrho_a = vloada (Vec{4 , eltype (v)}, pointer (v, 4 * (particle - 1 ) + 1 ))
164+
165+ # The column of `v` is ordered as (v_x, v_y, v_z, rho)
166+ a, b, c, d = Tuple (vrho_a)
167+ v_particle = SVector (a, b, c)
168+ rho_particle = d
169+
170+ return v_particle, rho_particle
171+ end
0 commit comments