@@ -15,6 +15,7 @@ function interact!(dv, v_particle_system, u_particle_system,
1515 system_coords = current_coordinates (u_particle_system, particle_system)
1616 neighbor_system_coords = current_coordinates (u_neighbor_system, neighbor_system)
1717 neighborhood_search = get_neighborhood_search (particle_system, neighbor_system, semi)
18+ backend = semi. parallelization_backend
1819
1920 # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
2021 # and the density diffusion divide by zero.
@@ -35,7 +36,7 @@ function interact!(dv, v_particle_system, u_particle_system,
3536 # which gives a significant speedup on GPUs.
3637 (v_a,
3738 rho_a) = @inbounds velocity_and_density (v_particle_system, particle_system,
38- particle)
39+ backend, particle)
3940
4041 # Accumulate the RHS contributions over all neighbors before writing to `dv`,
4142 # to reduce the number of memory writes.
@@ -62,7 +63,7 @@ function interact!(dv, v_particle_system, u_particle_system,
6263 m_b = @inbounds hydrodynamic_mass (neighbor_system, neighbor)
6364 (v_b,
6465 rho_b) = @inbounds velocity_and_density (v_neighbor_system, neighbor_system,
65- neighbor)
66+ backend, neighbor)
6667
6768 # The following call is equivalent to
6869 # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
@@ -88,6 +89,7 @@ function interact!(dv, v_particle_system, u_particle_system,
8889
8990 dv_particle[] += pressure_correction * dv_pressure
9091
92+ # Propagate `@inbounds` to the viscosity function, which accesses particle data
9193 @inbounds dv_viscosity! (dv_particle, particle_system, neighbor_system,
9294 v_particle_system, v_neighbor_system,
9395 particle, neighbor, pos_diff, distance,
@@ -142,19 +144,19 @@ end
142144end
143145
144146@propagate_inbounds function velocity_and_density (v, system:: WeaklyCompressibleSPHSystem ,
145- particle)
147+ backend, particle)
146148 (; density_calculator) = system
147149
148- return velocity_and_density (v, density_calculator, system, particle)
150+ return velocity_and_density (v, density_calculator, system, backend, particle)
149151end
150152
151- @propagate_inbounds function velocity_and_density (v, system, particle)
153+ @propagate_inbounds function velocity_and_density (v, system, backend, particle)
152154 # Call the default method below
153- return velocity_and_density (v, nothing , system, particle)
155+ return velocity_and_density (v, nothing , system, backend, particle)
154156end
155157
156158# Default method, which simply calls `current_velocity` and `current_density` separately.
157- @propagate_inbounds function velocity_and_density (v, _, system, particle)
159+ @propagate_inbounds function velocity_and_density (v, _, system, backend, particle)
158160 v_particle = current_velocity (v, system, particle)
159161 rho_particle = current_density (v, system, particle)
160162
@@ -163,19 +165,24 @@ end
163165
164166# Optimized version for WCSPH with `ContinuityDensity` in 3D on GPUs,
165167# which combines the velocity and density load into one wide load.
168+ # This is slightly slower on CPUs, so we only use it with GPU backends.
169+ # Note that we cannot dispatch by `AbstractGPUArray` because this is called from within
170+ # a kernel, where the arrays are device arrays (like `CuDeviceArray`),
171+ # which are not `AbstractGPUArray`s.
166172@inline function velocity_and_density (v, :: ContinuityDensity ,
167- :: WeaklyCompressibleSPHSystem{3} , particle)
173+ :: WeaklyCompressibleSPHSystem{3} ,
174+ :: KernelAbstractions.GPU ,
175+ particle)
168176 # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column
169177 # of `v` corresponding to `particle`.
170178 # As opposed to `extract_svector`, this will translate to a single wide load instruction
171179 # on the GPU, which is faster than 4 separate loads.
172180 # Note that this doesn't work for 2D because it requires a stride of 2^n.
173- vrho_a = vloada (Vec{4 , eltype (v)}, pointer (v, 4 * (particle - 1 ) + 1 ))
181+ vrho_particle = SIMD . vloada (SIMD . Vec{4 , eltype (v)}, pointer (v, 4 * (particle - 1 ) + 1 ))
174182
175- # The column of `v` is ordered as (v_x, v_y, v_z, rho)
176- a, b, c, d = Tuple (vrho_a)
177- v_particle = SVector (a, b, c)
178- rho_particle = d
183+ # The columns of `v` are ordered as (v_x, v_y, v_z, rho)
184+ v1, v2, v3, rho = Tuple (vrho_particle)
185+ v_particle = SVector (v1, v2, v3)
179186
180- return v_particle, rho_particle
187+ return v_particle, rho
181188end
0 commit comments