@@ -25,8 +25,9 @@ function interact!(dv, v_particle_system, u_particle_system,
2525 compact_support_ = compact_support (particle_system, neighbor_system)
2626 almostzero = sqrt (eps (compact_support_^ 2 ))
2727
28- use_simd_load_system = Val (use_simd_load (v_particle_system, particle_system))
29- use_simd_load_neighbor = Val (use_simd_load (v_neighbor_system, neighbor_system))
28+ use_aligned_load_system = Val (use_aligned_vrho_load (v_particle_system, particle_system))
29+ use_aligned_load_neighbor = Val (use_aligned_vrho_load (v_neighbor_system,
30+ neighbor_system))
3031
3132 @threaded semi for particle in each_integrated_particle (particle_system)
3233 # We are looping over the particles of `particle_system`, so it is guaranteed
@@ -38,7 +39,7 @@ function interact!(dv, v_particle_system, u_particle_system,
3839 # which gives a significant speedup on GPUs.
3940 (v_a,
4041 rho_a) = @inbounds velocity_and_density (v_particle_system, particle_system,
41- use_simd_load_system , particle)
42+ use_aligned_load_system , particle)
4243
4344 # Accumulate the RHS contributions over all neighbors before writing to `dv`,
4445 # to reduce the number of memory writes.
@@ -64,7 +65,7 @@ function interact!(dv, v_particle_system, u_particle_system,
6465 m_b = @inbounds hydrodynamic_mass (neighbor_system, neighbor)
6566 (v_b,
6667 rho_b) = @inbounds velocity_and_density (v_neighbor_system, neighbor_system,
67- use_simd_load_neighbor , neighbor)
68+ use_aligned_load_neighbor , neighbor)
6869
6970 # The following call is equivalent to
7071 # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
@@ -153,48 +154,47 @@ end
153154# Optimized version for WCSPH with `ContinuityDensity` in 3D,
154155# which combines the velocity and density load into one wide load.
155156# This is significantly faster on GPUs than the 4 individual loads of `extract_svector`.
156- # WARNING: this requires that the pointer of `v` is aligned to the size
157- # of `SIMD.Vec{4, eltype(v)}`, which is checked by `use_simd_load`.
158- @inline function velocity_and_density (v, system, :: Val{true} , particle)
159- # Since `v` is stored as a 4 x N matrix, this aligned load extracts one column
160- # of `v` corresponding to `particle`.
161- # Note that this doesn't work for 2D because it requires a stride of 2^n.
162- vrho_particle = SIMD. vloada (SIMD. Vec{4 , eltype (v)}, pointer (v, 4 * (particle - 1 ) + 1 ))
157+ # WARNING: this requires that the pointer of `v` is aligned to `4 * sizeof(eltype(v))` bytes,
158+ # which is checked by `use_aligned_vrho_load`.
159+ # Only select this method (via `Val(true)`) if `use_aligned_vrho_load` returned `true`,
160+ # to avoid segmentation faults from illegal accesses.
161+ @propagate_inbounds function velocity_and_density (v, system, :: Val{true} , particle)
162+ vrho_particle = extract_svector_aligned (v, Val (4 ), particle) # presumably the aligned wide load of the 4 entries of `particle`
163163
164164 # Each column of `v` holds the entries (v_x, v_y, v_z, rho) of one particle
165- v1, v2, v3 , rho = Tuple (vrho_particle)
166- v_particle = SVector (v1, v2, v3 )
165+ v ... , rho = Tuple (vrho_particle) # NOTE: rebinds the argument `v` to the tuple of velocity components
166+ v_particle = SVector (v )
167167
168168 return v_particle, rho
169169end
170170
171- # By default, don't use SIMD loads
172- use_simd_load (v, system) = false
171+ # Generic fallback: unless a more specific method applies, don't use aligned loads
172+ use_aligned_vrho_load (v, system) = false
173173
174- function use_simd_load (v:: AbstractGPUArray , system:: WeaklyCompressibleSPHSystem{3} )
175- use_simd_load (v, system, system. density_calculator)
174+ function use_aligned_vrho_load (v:: AbstractGPUArray , system:: WeaklyCompressibleSPHSystem{3} ) # GPU array + 3D WCSPH system
175+ use_aligned_vrho_load (v, system, system. density_calculator) # the decision depends on the density calculator
176176end
177177
178- use_simd_load (v, system, density_calculator) = false
178+ use_aligned_vrho_load (v, system, density_calculator) = false # fallback for density calculators without a more specific method
179179
180- # Only use SIMD loads when all of these conditions are satisfied:
180+ # Only use aligned loads when all of these conditions are satisfied:
181181# - WCSPH with `ContinuityDensity` in 3D. Only then, the columns of `v` are of length 4.
182- # - We are on a GPU, where the SIMD load gives a significant speedup.
183- # - The velocity array is aligned for SIMD loads, which requires that the pointer of `v`
184- # is aligned to the size of `SIMD.Vec{4, eltype(v)}`.
185- # Otherwise, we cannot use `vloada`, which is an *aligned* SIMD load.
182+ # - We are on a GPU, where the aligned load gives a significant speedup.
183+ # - The velocity array is suitably aligned, which requires that the pointer of `v`
184+ # is aligned to `4 * sizeof(eltype(v))` bytes.
185+ # Otherwise, we cannot use an *aligned* load such as `vloada`.
186186# The unaligned version `vload` does not produce wide load instructions on GPUs.
187- # In this last case, we don't fall back to the non-SIMD version and throw an error instead.
188- function use_simd_load (v:: AbstractGPUArray , system, :: ContinuityDensity )
189- aligned = is_aligned (pointer (v), SIMD. Vec{4 , eltype (v)})
190-
191- if ! aligned
187+ function use_aligned_vrho_load (v:: AbstractGPUArray , system, :: ContinuityDensity )
188+ if ! can_use_aligned_load (v, 4 )
189+ # If aligned loads are applicable to this problem, but the array is not suitably aligned,
190+ # we don't fall back to the version without aligned loads and throw an error instead.
191+ # This is likely a configuration error (see the error message below), and notifying
192+ # the user is better than silently falling back to slower loads and thus
193+ # unexpectedly degraded performance.
192194 error (" on GPUs in 3D, all WCSPH systems with `ContinuityDensity` must be the " *
193195 " first systems in the semidiscretization to ensure that their integration " *
194196 " arrays are aligned for SIMD loads." )
195197 end
196198
197- return aligned
199+ return true # only reached if the alignment check above passed
198200end
199-
200- is_aligned (ptr, :: Type{SIMD.Vec{N, T}} ) where {N, T} = UInt (ptr) % (N * sizeof (T)) == 0
0 commit comments