Skip to content

Commit e282a52

Browse files
committed
Fix velocity_and_density
1 parent 99ab1cd commit e282a52

2 files changed

Lines changed: 22 additions & 15 deletions

File tree

src/TrixiParticles.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ using Random: seed!
2626
using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
2727
get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
2828
add_tstop!
29-
using SIMD: vloada, Vec
29+
using SIMD: SIMD
3030
@reexport using StaticArrays: SVector
3131
using StaticArrays: @SMatrix, SMatrix, setindex
3232
using Statistics: Statistics

src/schemes/fluid/weakly_compressible_sph/rhs.jl

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ function interact!(dv, v_particle_system, u_particle_system,
1515
system_coords = current_coordinates(u_particle_system, particle_system)
1616
neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system)
1717
neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi)
18+
backend = semi.parallelization_backend
1819

1920
# For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
2021
# and the density diffusion divide by zero.
@@ -35,7 +36,7 @@ function interact!(dv, v_particle_system, u_particle_system,
3536
# which gives a significant speedup on GPUs.
3637
(v_a,
3738
rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
38-
particle)
39+
backend, particle)
3940

4041
# Accumulate the RHS contributions over all neighbors before writing to `dv`,
4142
# to reduce the number of memory writes.
@@ -62,7 +63,7 @@ function interact!(dv, v_particle_system, u_particle_system,
6263
m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
6364
(v_b,
6465
rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
65-
neighbor)
66+
backend, neighbor)
6667

6768
# The following call is equivalent to
6869
# `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
@@ -88,6 +89,7 @@ function interact!(dv, v_particle_system, u_particle_system,
8889

8990
dv_particle[] += pressure_correction * dv_pressure
9091

92+
# Propagate `@inbounds` to the viscosity function, which accesses particle data
9193
@inbounds dv_viscosity!(dv_particle, particle_system, neighbor_system,
9294
v_particle_system, v_neighbor_system,
9395
particle, neighbor, pos_diff, distance,
@@ -142,19 +144,19 @@ end
142144
end
143145

144146
@propagate_inbounds function velocity_and_density(v, system::WeaklyCompressibleSPHSystem,
145-
particle)
147+
backend, particle)
146148
(; density_calculator) = system
147149

148-
return velocity_and_density(v, density_calculator, system, particle)
150+
return velocity_and_density(v, density_calculator, system, backend, particle)
149151
end
150152

151-
@propagate_inbounds function velocity_and_density(v, system, particle)
153+
@propagate_inbounds function velocity_and_density(v, system, backend, particle)
152154
# Call the default method below
153-
return velocity_and_density(v, nothing, system, particle)
155+
return velocity_and_density(v, nothing, system, backend, particle)
154156
end
155157

156158
# Default method, which simply calls `current_velocity` and `current_density` separately.
157-
@propagate_inbounds function velocity_and_density(v, _, system, particle)
159+
@propagate_inbounds function velocity_and_density(v, _, system, backend, particle)
158160
v_particle = current_velocity(v, system, particle)
159161
rho_particle = current_density(v, system, particle)
160162

@@ -163,19 +165,24 @@ end
163165

164166
# Optimized version for WCSPH with `ContinuityDensity` in 3D on GPUs,
165167
# which combines the velocity and density load into one wide load.
168+
# This is slightly slower on CPUs, so we only use it with GPU backends.
169+
# Note that we cannot dispatch by `AbstractGPUArray` because this is called from within
170+
# a kernel, where the arrays are device arrays (like `CuDeviceArray`),
171+
# which are not `AbstractGPUArray`s.
166172
@inline function velocity_and_density(v, ::ContinuityDensity,
167-
::WeaklyCompressibleSPHSystem{3}, particle)
173+
::WeaklyCompressibleSPHSystem{3},
174+
::KernelAbstractions.GPU,
175+
particle)
168176
# Since `v` is stored as a 4 x N matrix, this aligned load extracts one column
169177
# of `v` corresponding to `particle`.
170178
# As opposed to `extract_svector`, this will translate to a single wide load instruction
171179
# on the GPU, which is faster than 4 separate loads.
172180
# Note that this doesn't work for 2D because it requires a stride of 2^n.
173-
vrho_a = vloada(Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1))
181+
vrho_particle = SIMD.vloada(SIMD.Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1))
174182

175-
# The column of `v` is ordered as (v_x, v_y, v_z, rho)
176-
a, b, c, d = Tuple(vrho_a)
177-
v_particle = SVector(a, b, c)
178-
rho_particle = d
183+
# The columns of `v` are ordered as (v_x, v_y, v_z, rho)
184+
v1, v2, v3, rho = Tuple(vrho_particle)
185+
v_particle = SVector(v1, v2, v3)
179186

180-
return v_particle, rho_particle
187+
return v_particle, rho
181188
end

0 commit comments

Comments
 (0)