feat: add xsimd::get<I>() for compile-time element extraction
Adds a new public API `xsimd::get<I>(batch)` that extracts a compile-time
indexed lane from a batch. Unlike the runtime `batch::get(i)`, the index is
a template parameter so each arch can dispatch to the best single-op path.
Design per architecture (objdump-verified, pure -march flags, no reliance
on compiler optimization):
- SSE2: `first` for I==0; 32/64-bit (int, float, double) go through
`swizzle + first` so the xsimd permute API emits the shuffle; 8/16-bit
stay on `psrldq + movd` because sse2 swizzle expands to 2 ops for
broadcast-to-lane-0 (pshuflw/pshufhw + unpck) while srli keeps it at 1.
- SSE4.1: native `pextrb/w/d/q` for integer (1 op); float override removed
so it falls through to sse2's swizzle path (equivalent 1-op codegen).
- AVX/AVX2: half-extract + delegate to sse4_1 (1 op low half, 2 ops upper
half — hardware lower bound).
- AVX-512F: `valignd`/`valignq` rotate + extract for float/double — 1 op
for every I, including upper half (was 2). Integer keeps the extract +
pextr* split (2 ops, optimal).
- NEON/NEON64: native per-lane `mov`/`umov v.X[I]` (1 op).
- RVV: skip `vslidedown` when I==0.
Tests build `array_type { xsimd::get<Is>(res)... }` via pack expansion,
compare against the reference array, and verify that reloading the extracted
values reproduces the original batch.
Verified on sse2, sse4.1, avx2, avx-512 (sde), aarch64 (qemu), rvv (qemu).