@@ -147,19 +147,28 @@ end
147147 return velocity_and_density (v, density_calculator, system, particle)
148148end
149149
150+ @propagate_inbounds function velocity_and_density (v, system, particle)
151+ # Call the default method below
152+ return velocity_and_density (v, nothing , system, particle)
153+ end
154+
155+ # Default method, which simply calls `current_velocity` and `current_density` separately.
150156@propagate_inbounds function velocity_and_density (v, _, system, particle)
151157 v_particle = current_velocity (v, system, particle)
152158 rho_particle = current_density (v, system, particle)
153159
154160 return v_particle, rho_particle
155161end
156162
163+ # Optimized version for WCSPH with `ContinuityDensity` in 3D on GPUs,
164+ # which combines the velocity and density load into one wide load.
157165@inline function velocity_and_density (v:: AbstractGPUArray , :: ContinuityDensity ,
158166 :: WeaklyCompressibleSPHSystem{3} , particle)
159167 # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column
160168 # of `v` corresponding to `particle`.
161169 # As opposed to `extract_svector`, this will translate to a single wide load instruction
162170 # on the GPU, which is faster than 4 separate loads.
171+ # Note that this doesn't work for 2D because it requires a stride of 2^n.
163172 vrho_a = vloada (Vec{4 , eltype (v)}, pointer (v, 4 * (particle - 1 ) + 1 ))
164173
165174 # The column of `v` is ordered as (v_x, v_y, v_z, rho)
0 commit comments