Skip to content

Commit fde5c59

Browse files
committed
Use combined wide load for velocity and density in 3D
1 parent 7c0d6ea commit fde5c59

3 files changed

Lines changed: 41 additions & 4 deletions

File tree

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2525
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
2626
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
2727
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
28+
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
2829
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
2930
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
3031
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -63,6 +64,7 @@ Polyester = "0.7.10"
6364
ReadVTK = "0.2"
6465
RecipesBase = "1"
6566
Reexport = "1"
67+
SIMD = "3.7.2"
6668
SciMLBase = "2"
6769
StaticArrays = "1"
6870
Statistics = "1"

src/TrixiParticles.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ using Random: seed!
2525
using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
2626
get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
2727
add_tstop!
28+
using SIMD: vloada, Vec
2829
@reexport using StaticArrays: SVector
2930
using StaticArrays: @SMatrix, SMatrix, setindex
3031
using Statistics: Statistics

src/schemes/fluid/weakly_compressible_sph/rhs.jl

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,11 @@ function interact!(dv, v_particle_system, u_particle_system,
3030
m_a = @inbounds hydrodynamic_mass(particle_system, particle)
3131
p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
3232

33-
v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
34-
rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
33+
# In 3D, this function can combine velocity and density load into one wide load,
34+
# which gives a significant speedup on GPUs.
35+
(v_a,
36+
rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
37+
particle)
3538

3639
# Accumulate the RHS contributions over all neighbors before writing to `dv`,
3740
# to reduce the number of memory writes.
@@ -56,8 +59,9 @@ function interact!(dv, v_particle_system, u_particle_system,
5659

5760
# `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
5861
m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
59-
v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
60-
rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
62+
(v_b,
63+
rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
64+
neighbor)
6165

6266
# The following call is equivalent to
6367
# `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
@@ -135,3 +139,33 @@ end
135139
neighbor, p_a)
136140
return p_a
137141
end
142+
143+
@propagate_inbounds function velocity_and_density(v, system::WeaklyCompressibleSPHSystem,
144+
particle)
145+
(; density_calculator) = system
146+
147+
return velocity_and_density(v, density_calculator, system, particle)
148+
end
149+
150+
@propagate_inbounds function velocity_and_density(v, _, system, particle)
151+
v_particle = current_velocity(v, system, particle)
152+
rho_particle = current_density(v, system, particle)
153+
154+
return v_particle, rho_particle
155+
end
156+
157+
@inline function velocity_and_density(v::AbstractGPUArray, ::ContinuityDensity,
158+
::WeaklyCompressibleSPHSystem{3}, particle)
159+
# Since `v` is stored as a 4 x N matrix, this aligned load extracts one column
160+
# of `v` corresponding to `particle`.
161+
# As opposed to `extract_svector`, this will translate to a single wide load instruction
162+
# on the GPU, which is faster than 4 separate loads.
163+
vrho_a = vloada(Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1))
164+
165+
# The column of `v` is ordered as (v_x, v_y, v_z, rho)
166+
a, b, c, d = Tuple(vrho_a)
167+
v_particle = SVector(a, b, c)
168+
rho_particle = d
169+
170+
return v_particle, rho_particle
171+
end

0 commit comments

Comments
 (0)