Use combined wide load for velocity and density in 3D

efaulhaber · efaulhaber · commit fde5c5959bf8 · 2026-04-14T11:32:37.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -63,6 +64,7 @@ Polyester = "0.7.10"
 ReadVTK = "0.2"
 RecipesBase = "1"
 Reexport = "1"
+SIMD = "3.7.2"
 SciMLBase = "2"
 StaticArrays = "1"
 Statistics = "1"
diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl
@@ -25,6 +25,7 @@ using Random: seed!
 using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
                  get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
                  add_tstop!
+using SIMD: vloada, Vec
 @reexport using StaticArrays: SVector
 using StaticArrays: @SMatrix, SMatrix, setindex
 using Statistics: Statistics
diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
@@ -30,8 +30,11 @@ function interact!(dv, v_particle_system, u_particle_system,
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
         p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
 
-        v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
-        rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
+        # In 3D, this function can combine velocity and density load into one wide load,
+        # which gives a significant speedup on GPUs.
+        (v_a,
+         rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
+                                                 particle)
 
         # Accumulate the RHS contributions over all neighbors before writing to `dv`,
         # to reduce the number of memory writes.
@@ -56,8 +59,9 @@ function interact!(dv, v_particle_system, u_particle_system,
 
             # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
             m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
-            v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
-            rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
+            (v_b,
+             rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
+                                                     neighbor)
 
             # The following call is equivalent to
             #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
@@ -135,3 +139,33 @@ end
                                    neighbor, p_a)
     return p_a
 end
+
+# Forward to the `velocity_and_density` method selected by the density calculator
+# stored in `system`, so that specialized loads can be chosen via dispatch.
+@propagate_inbounds function velocity_and_density(v, system::WeaklyCompressibleSPHSystem,
+                                                  particle)
+    return velocity_and_density(v, system.density_calculator, system, particle)
+end
+
+# Generic fallback for any density calculator and system: query the velocity and the
+# density of `particle` separately and return them as the tuple `(velocity, density)`.
+@propagate_inbounds function velocity_and_density(v, _, system, particle)
+    velocity = current_velocity(v, system, particle)
+    return velocity, current_density(v, system, particle)
+end
+
+@inline function velocity_and_density(v::AbstractGPUArray, ::ContinuityDensity,
+                                      ::WeaklyCompressibleSPHSystem{3}, particle)
+    # The pointer load below is unchecked, so validate the column index here.
+    # The check is elided when this method is inlined into an `@inbounds` context.
+    @boundscheck checkbounds(v, 4, particle)
+
+    # Since `v` is stored as a 4 x N matrix, this aligned load extracts the column of `v`
+    # corresponding to `particle`. As opposed to `extract_svector`, this will translate
+    # to a single wide load instruction on the GPU, which is faster than 4 separate loads.
+    vrho = vloada(Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1))
+
+    # The column of `v` is ordered as (v_x, v_y, v_z, rho)
+    v_x, v_y, v_z, rho_particle = Tuple(vrho)
+    return SVector(v_x, v_y, v_z), rho_particle
+end