From 86ae08aafaeeab35c00b471a3cadebd440e1cca4 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Wed, 28 Jan 2026 12:56:58 -0800 Subject: [PATCH 1/2] SIMD: Use AMReX versions --- src/elements/mixin/beamoptic.H | 210 ++++++++------------------------- 1 file changed, 46 insertions(+), 164 deletions(-) diff --git a/src/elements/mixin/beamoptic.H b/src/elements/mixin/beamoptic.H index a2e4ba587..afcd579bd 100644 --- a/src/elements/mixin/beamoptic.H +++ b/src/elements/mixin/beamoptic.H @@ -21,125 +21,10 @@ #include - -namespace impactx -{ - /** A generalized ParallelFor that dispatches to amrex::ParallelFor or amrex::ParallelForSIMD - * depending on whether the element type T_Element is vectorized. - * - * @tparam T_Element the element type - * @tparam F the functor type - * @param n the number of items to iterate over - * @param f the functor to execute - */ - template - void ParallelFor (int n, F&& f) { -#ifdef AMREX_USE_SIMD - if constexpr (amrex::simd::is_vectorized) { - amrex::ParallelForSIMD(n, std::forward(f)); - } else -#endif - { - amrex::ParallelFor(n, std::forward(f)); - } - } -} // namespace impactx - namespace impactx::elements::mixin { namespace detail { - /** Load particle data from array pointers - * - * On GPU and CPU w/o SIMD, this dereferences a particle property at the - * index position i. - * On CPU with SIMD, this loads a SIMD register at the IndexType::width - * SIMD-wide index position i. - * - * @tparam T data type (amrex::ParticleReal or uint64_t) - * @tparam IndexType int or amrex::SIMDindex - * @param ptr pointer to the array data - * @param i index or SIMD index - * @return a reference to the data (scalar) or a SIMD register (SIMD) - */ - template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE - decltype(auto) load_pdata (T* ptr, IndexType const i) - { - if constexpr (std::is_integral_v) { - return ptr[i]; - } else { -#ifdef AMREX_USE_SIMD - using namespace amrex::simd; - using RealType = SIMDParticleReal; - using IdCpuType = SIMDIdCpu; - using DataType = std::conditional_t< - std::is_same_v, - RealType, - IdCpuType - >; - - // initialize vector register - // TODO stdx::vector_aligned needs alignment guarantees - // https://github.com/AMReX-Codes/amrex/issues/4592 - // https://en.cppreference.com/w/cpp/experimental/simd/simd/copy_from - DataType val; - val.copy_from(&ptr[i.index], stdx::element_aligned); - return val; - -#else - // error handling: we should never get here - amrex::ignore_unused(ptr, i); - amrex::Abort("SIMD index used but SIMD is not enabled"); - return ptr[0]; -#endif - } - } - - /** Store particle data back to array pointers - * - * On GPU and CPU without SIMD, this does nothing because we already - * modified the (global) RAM directly via pointer. - * - * On CPU with SIMD, this performs a conditional writeback of a SIMD register - * to RAM (index in pointer array), but only if the argument was not passed - * as const and thus was likely changed. - * - * Good optimizing compilers can eliminate writebacks of unchanged values - * themselves, but we better help a little for robustness. Background: - * https://github.com/AMReX-Codes/amrex/pull/4520#issuecomment-3064064215 - * - * @tparam P_Method pointer to the push method (for is_nth_arg_non_const) - * @tparam N the argument index (for is_nth_arg_non_const) - * @tparam T data type - * @tparam IndexType int or SIMD index - * @tparam ValType the type of the value to store - * @param val the value to store - * @param ptr pointer to the SoA data - * @param i index or SIMD index - */ - template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void store_pdata ( - ValType const & AMREX_RESTRICT val, - T * const AMREX_RESTRICT ptr, - IndexType const i - ) - { -#ifdef AMREX_USE_SIMD - if constexpr (!std::is_integral_v) { - if constexpr (amrex::simd::is_nth_arg_non_const(P_Method, N)) { - // write back to memory - // TODO stdx::vector_aligned needs alignment guarantees - // https://github.com/AMReX-Codes/amrex/issues/4592 - // https://en.cppreference.com/w/cpp/experimental/simd/simd/copy_from - val.copy_to(&ptr[i.index], amrex::simd::stdx::element_aligned); - } - } -#endif - amrex::ignore_unused(val, ptr, i); - } - /** Push a single particle through an element * * Note: we usually would just write a C++ lambda below in ParallelFor. But, due to restrictions @@ -216,40 +101,37 @@ namespace detail // access SoA data // note: an optimizing compiler will eliminate loads of unused parameters - decltype(auto) x = load_pdata(m_part_x, i); - decltype(auto) y = load_pdata(m_part_y, i); - decltype(auto) t = load_pdata(m_part_t, i); - decltype(auto) px = load_pdata(m_part_px, i); - decltype(auto) py = load_pdata(m_part_py, i); - decltype(auto) pt = load_pdata(m_part_pt, i); - decltype(auto) sx = load_pdata(m_part_sx, i); - decltype(auto) sy = load_pdata(m_part_sy, i); - decltype(auto) sz = load_pdata(m_part_sz, i); - decltype(auto) idcpu = load_pdata(m_part_idcpu, i); + decltype(auto) x = load_1d(m_part_x, i); + decltype(auto) y = load_1d(m_part_y, i); + decltype(auto) t = load_1d(m_part_t, i); + decltype(auto) px = load_1d(m_part_px, i); + decltype(auto) py = load_1d(m_part_py, i); + decltype(auto) pt = load_1d(m_part_pt, i); + decltype(auto) sx = load_1d(m_part_sx, i); + decltype(auto) sy = load_1d(m_part_sy, i); + decltype(auto) sz = load_1d(m_part_sz, i); + decltype(auto) idcpu = load_1d(m_part_idcpu, i); // push spin & phase space through element m_element.spin_and_phasespace_push(x, y, t, px, py, pt, sx, sy, sz, idcpu, m_ref_part); // SIMD: write back to memory -#ifdef AMREX_USE_SIMD - if constexpr (amrex::simd::is_vectorized) - { - using RealType = std::decay_t; - using IdCpuType = std::decay_t; - constexpr auto P_Method = &T_Element::template spin_and_phasespace_push; - - store_pdata(x, m_part_x, i); - store_pdata(y, m_part_y, i); - store_pdata(t, m_part_t, i); - store_pdata(px, m_part_px, i); - store_pdata(py, m_part_py, i); - store_pdata(pt, m_part_pt, i); - store_pdata(sx, m_part_sx, i); - store_pdata(sy, m_part_sy, i); - store_pdata(sz, m_part_sz, i); - store_pdata(idcpu, m_part_idcpu, i); - } -#endif + using RealType = std::decay_t; + using IdCpuType = std::decay_t; + //static_assert(std::is_same_v, "SIMD push requires ParticleReal particle data!"); + //static_assert(std::is_same_v, "SIMD push requires uint64_t particle data!"); + constexpr decltype(auto) P_Method = &T_Element::template spin_and_phasespace_push; + + store_1d(x, m_part_x, i); + store_1d(y, m_part_y, i); + store_1d(t, m_part_t, i); + store_1d(px, m_part_px, i); + store_1d(py, m_part_py, i); + store_1d(pt, m_part_pt, i); + store_1d(sx, m_part_sx, i); + store_1d(sy, m_part_sy, i); + store_1d(sz, m_part_sz, i); + store_1d(idcpu, m_part_idcpu, i); } private: @@ -336,34 +218,34 @@ namespace detail // access SoA data // note: an optimizing compiler will eliminate loads of unused parameters - decltype(auto) x = load_pdata(m_part_x, i); - decltype(auto) y = load_pdata(m_part_y, i); - decltype(auto) t = load_pdata(m_part_t, i); - decltype(auto) px = load_pdata(m_part_px, i); - decltype(auto) py = load_pdata(m_part_py, i); - decltype(auto) pt = load_pdata(m_part_pt, i); - decltype(auto) idcpu = load_pdata(m_part_idcpu, i); + decltype(auto) x = load_1d(m_part_x, i); + decltype(auto) y = load_1d(m_part_y, i); + decltype(auto) t = load_1d(m_part_t, i); + decltype(auto) px = load_1d(m_part_px, i); + decltype(auto) py = load_1d(m_part_py, i); + decltype(auto) pt = load_1d(m_part_pt, i); + decltype(auto) idcpu = load_1d(m_part_idcpu, i); // push through element m_element(x, y, t, px, py, pt, idcpu, m_ref_part); // write back to memory -#ifdef AMREX_USE_SIMD if constexpr (amrex::simd::is_vectorized) { using RealType = std::decay_t; using IdCpuType = std::decay_t; - constexpr auto P_Method = &T_Element::template operator(); - - store_pdata(x, m_part_x, i); - store_pdata(y, m_part_y, i); - store_pdata(t, m_part_t, i); - store_pdata(px, m_part_px, i); - store_pdata(py, m_part_py, i); - store_pdata(pt, m_part_pt, i); - store_pdata(idcpu, m_part_idcpu, i); + //static_assert(std::is_same_v, "SIMD push requires ParticleReal particle data!"); + //static_assert(std::is_same_v, "SIMD push requires uint64_t particle data!"); + constexpr decltype(auto) P_Method = &T_Element::template operator(); + + store_1d(x, m_part_x, i); + store_1d(y, m_part_y, i); + store_1d(t, m_part_t, i); + store_1d(px, m_part_px, i); + store_1d(py, m_part_py, i); + store_1d(pt, m_part_pt, i); + store_1d(idcpu, m_part_idcpu, i); } -#endif } private: @@ -411,7 +293,7 @@ namespace detail element, part_x, part_y, part_t, part_px, part_py, part_pt, part_sx, part_sy, part_sz, part_idcpu, ref_part); // loop over beam particles in the box - impactx::ParallelFor(np, pushSingleParticle); + amrex::ParallelForSIMD(np, pushSingleParticle); } else { throw std::runtime_error("Spin transport requested but element does not implement the `SpinTransport` interface class!"); } @@ -421,7 +303,7 @@ namespace detail element, part_x, part_y, part_t, part_px, part_py, part_pt, part_idcpu, ref_part); // loop over beam particles in the box - impactx::ParallelFor(np, pushSingleParticle); + amrex::ParallelForSIMD(np, pushSingleParticle); } } } // namespace detail From 477d648f67357e035c5beb76426a542a1d65f0e5 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Wed, 6 May 2026 23:40:39 -0700 Subject: [PATCH 2/2] Remove Commented Static Asserts Co-authored-by: Axel Huebl --- src/elements/mixin/beamoptic.H | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/elements/mixin/beamoptic.H b/src/elements/mixin/beamoptic.H index afcd579bd..c4026989f 100644 --- a/src/elements/mixin/beamoptic.H +++ b/src/elements/mixin/beamoptic.H @@ -118,8 +118,6 @@ namespace detail // SIMD: write back to memory using RealType = std::decay_t; using IdCpuType = std::decay_t; - //static_assert(std::is_same_v, "SIMD push requires ParticleReal particle data!"); - //static_assert(std::is_same_v, "SIMD push requires uint64_t particle data!"); constexpr decltype(auto) P_Method = &T_Element::template spin_and_phasespace_push; store_1d(x, m_part_x, i); @@ -234,8 +232,6 @@ namespace detail { using RealType = std::decay_t; using IdCpuType = std::decay_t; - //static_assert(std::is_same_v, "SIMD push requires ParticleReal particle data!"); - //static_assert(std::is_same_v, "SIMD push requires uint64_t particle data!"); constexpr decltype(auto) P_Method = &T_Element::template operator(); store_1d(x, m_part_x, i);