Skip to content

Commit f30c5e0

Browse files
committed
feat: add xsimd::get<I>() for compile-time element extraction
Introduces get<I>(batch) as a top-level API for extracting a single lane at a compile-time index. Falls back to the runtime get() when per-arch overloads aren't present.

Per-arch optimal lowerings:
- SSE2: pextrw / byte-shift+movd / swizzle+first by lane width.
- SSE4.1: pextrb/w/d/q; I==0 short-circuits to first().
- AVX: I==0 short-circuits to first(); else halve + SSE4.1 path.
- AVX-512F: I==0 short-circuits to first(); 32/64-bit lanes use valignd/valignq + first() (2 ops); 8/16-bit halve through AVX.
- NEON / NEON64 / RVV: native single-lane extract intrinsics.
1 parent 7c9611e commit f30c5e0

11 files changed

Lines changed: 404 additions & 81 deletions

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,8 @@ namespace xsimd
223223
template <class A, size_t I, class T>
224224
XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<common>) noexcept
225225
{
226-
alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
226+
using value_type = typename batch<std::complex<T>, A>::value_type;
227+
alignas(A::alignment()) value_type buffer[batch<std::complex<T>, A>::size];
227228
self.store_aligned(&buffer[0]);
228229
return buffer[I];
229230
}

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 74 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,80 @@ namespace xsimd
748748
return self - batch<T, A>(mask.data);
749749
}
750750

751+
// first (must precede get for two-phase lookup)
752+
template <class A>
753+
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
754+
{
755+
return _mm256_cvtss_f32(self);
756+
}
757+
758+
template <class A>
759+
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
760+
{
761+
return _mm256_cvtsd_f64(self);
762+
}
763+
764+
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
765+
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
766+
{
767+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
768+
{
769+
return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFF);
770+
}
771+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
772+
{
773+
return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFFFF);
774+
}
775+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
776+
{
777+
return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)));
778+
}
779+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
780+
{
781+
batch<T, sse4_2> low = _mm256_castsi256_si128(self);
782+
return first(low, sse4_2 {});
783+
}
784+
else
785+
{
786+
assert(false && "unsupported arch/op combination");
787+
return {};
788+
}
789+
}
790+
791+
// get
792+
template <class A, size_t I>
793+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
794+
{
795+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
796+
constexpr size_t elements_per_lane = batch<float, sse4_1>::size;
797+
constexpr size_t lane = I / elements_per_lane;
798+
constexpr size_t sub_index = I % elements_per_lane;
799+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
800+
return kernel::get(batch<float, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
801+
}
802+
803+
template <class A, size_t I>
804+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
805+
{
806+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
807+
constexpr size_t elements_per_lane = batch<double, sse4_1>::size;
808+
constexpr size_t lane = I / elements_per_lane;
809+
constexpr size_t sub_index = I % elements_per_lane;
810+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
811+
return kernel::get(batch<double, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
812+
}
813+
814+
template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
815+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
816+
{
817+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
818+
constexpr size_t elements_per_lane = batch<T, sse4_1>::size;
819+
constexpr size_t lane = I / elements_per_lane;
820+
constexpr size_t sub_index = I % elements_per_lane;
821+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
822+
return kernel::get(batch<T, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
823+
}
824+
751825
// insert
752826
template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
753827
XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
@@ -2015,46 +2089,6 @@ namespace xsimd
20152089
return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
20162090
}
20172091

2018-
// first
2019-
template <class A>
2020-
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
2021-
{
2022-
return _mm256_cvtss_f32(self);
2023-
}
2024-
2025-
template <class A>
2026-
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
2027-
{
2028-
return _mm256_cvtsd_f64(self);
2029-
}
2030-
2031-
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
2032-
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
2033-
{
2034-
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2035-
{
2036-
return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFF);
2037-
}
2038-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2039-
{
2040-
return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFFFF);
2041-
}
2042-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2043-
{
2044-
return static_cast<T>(_mm256_cvtsi256_si32(self));
2045-
}
2046-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
2047-
{
2048-
batch<T, sse4_2> low = _mm256_castsi256_si128(self);
2049-
return first(low, sse4_2 {});
2050-
}
2051-
else
2052-
{
2053-
assert(false && "unsupported arch/op combination");
2054-
return {};
2055-
}
2056-
}
2057-
20582092
// widen
20592093
template <class A, class T>
20602094
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 91 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,6 +1346,97 @@ namespace xsimd
13461346
}
13471347
}
13481348

1349+
// first (must precede get for two-phase lookup)
1350+
template <class A>
1351+
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
1352+
{
1353+
return _mm512_cvtss_f32(self);
1354+
}
1355+
1356+
template <class A>
1357+
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
1358+
{
1359+
return _mm512_cvtsd_f64(self);
1360+
}
1361+
1362+
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
1363+
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1364+
{
1365+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1366+
{
1367+
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
1368+
}
1369+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1370+
{
1371+
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
1372+
}
1373+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1374+
{
1375+
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
1376+
}
1377+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1378+
{
1379+
batch<T, sse4_2> low = _mm512_castsi512_si128(self);
1380+
return first(low, sse4_2 {});
1381+
}
1382+
else
1383+
{
1384+
assert(false && "unsupported arch/op combination");
1385+
return {};
1386+
}
1387+
}
1388+
1389+
// get: use valignd/valignq to rotate lane I into position 0 in a single op.
1390+
template <class A, size_t I>
1391+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1392+
{
1393+
XSIMD_IF_CONSTEXPR(I == 0)
1394+
{
1395+
return first(self, avx512f {});
1396+
}
1397+
const auto rotated = _mm512_alignr_epi32(_mm512_castps_si512(self), _mm512_castps_si512(self), I);
1398+
return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated)));
1399+
}
1400+
1401+
template <class A, size_t I>
1402+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1403+
{
1404+
XSIMD_IF_CONSTEXPR(I == 0)
1405+
{
1406+
return first(self, avx512f {});
1407+
}
1408+
const auto rotated = _mm512_alignr_epi64(_mm512_castpd_si512(self), _mm512_castpd_si512(self), I);
1409+
return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated)));
1410+
}
1411+
1412+
template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
1413+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1414+
{
1415+
XSIMD_IF_CONSTEXPR(I == 0)
1416+
{
1417+
return first(self, avx512f {});
1418+
}
1419+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1420+
{
1421+
const auto rotated = _mm512_alignr_epi32(self, self, I);
1422+
return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
1423+
}
1424+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1425+
{
1426+
const auto rotated = _mm512_alignr_epi64(self, self, I);
1427+
return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
1428+
}
1429+
else
1430+
{
1431+
// 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves.
1432+
constexpr size_t elements_per_lane = batch<T, avx>::size;
1433+
constexpr size_t lane = I / elements_per_lane;
1434+
constexpr size_t sub_index = I % elements_per_lane;
1435+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
1436+
return kernel::get(batch<T, avx>(half), ::xsimd::index<sub_index> {}, avx {});
1437+
}
1438+
}
1439+
13491440
// insert
13501441
template <class A, size_t I>
13511442
XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f>) noexcept
@@ -2753,46 +2844,6 @@ namespace xsimd
27532844
2));
27542845
}
27552846

2756-
// first
2757-
template <class A>
2758-
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
2759-
{
2760-
return _mm512_cvtss_f32(self);
2761-
}
2762-
2763-
template <class A>
2764-
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
2765-
{
2766-
return _mm512_cvtsd_f64(self);
2767-
}
2768-
2769-
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
2770-
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
2771-
{
2772-
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2773-
{
2774-
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
2775-
}
2776-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2777-
{
2778-
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
2779-
}
2780-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2781-
{
2782-
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
2783-
}
2784-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
2785-
{
2786-
batch<T, sse4_2> low = _mm512_castsi512_si128(self);
2787-
return first(low, sse4_2 {});
2788-
}
2789-
else
2790-
{
2791-
assert(false && "unsupported arch/op combination");
2792-
return {};
2793-
}
2794-
}
2795-
27962847
// widen
27972848
template <class A, class T>
27982849
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2742,6 +2742,61 @@ namespace xsimd
27422742
return vshrq_n_s64(x, shift);
27432743
}
27442744

2745+
// get
2746+
template <class A, size_t I>
2747+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2748+
{
2749+
return vgetq_lane_f32(self, I);
2750+
}
2751+
2752+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2753+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2754+
{
2755+
return vgetq_lane_u8(self, I);
2756+
}
2757+
2758+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 1> = 0>
2759+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2760+
{
2761+
return vgetq_lane_s8(self, I);
2762+
}
2763+
2764+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2765+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2766+
{
2767+
return vgetq_lane_u16(self, I);
2768+
}
2769+
2770+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 2> = 0>
2771+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2772+
{
2773+
return vgetq_lane_s16(self, I);
2774+
}
2775+
2776+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2777+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2778+
{
2779+
return vgetq_lane_u32(self, I);
2780+
}
2781+
2782+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 4> = 0>
2783+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2784+
{
2785+
return vgetq_lane_s32(self, I);
2786+
}
2787+
2788+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
2789+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2790+
{
2791+
return vgetq_lane_u64(self, I);
2792+
}
2793+
2794+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 8> = 0>
2795+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2796+
{
2797+
return vgetq_lane_s64(self, I);
2798+
}
2799+
27452800
// first
27462801
template <class A>
27472802
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<neon>) noexcept

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ namespace xsimd
3131
{
3232
using namespace types;
3333

34+
// get
35+
template <class A, size_t I>
36+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<neon64>) noexcept
37+
{
38+
return vgetq_lane_f64(self, I);
39+
}
40+
3441
// first
3542
template <class A>
3643
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<neon64>) noexcept

include/xsimd/arch/xsimd_rvv.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,6 +1369,27 @@ namespace xsimd
13691369
return std::complex<T> { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) };
13701370
}
13711371

1372+
// get (compile-time index): skip the slidedown when I == 0; lane 0 maps straight to the scalar move.
1373+
template <class A, size_t I, class T, detail::rvv_enable_all_t<T> = 0>
1374+
XSIMD_INLINE T get(batch<T, A> const& arg, index<I>, requires_arch<rvv>) noexcept
1375+
{
1376+
XSIMD_IF_CONSTEXPR(I == 0)
1377+
{
1378+
return detail::rvvmv_lane0(arg);
1379+
}
1380+
return get(arg, I, rvv {});
1381+
}
1382+
1383+
template <class A, size_t I, class T, detail::rvv_enable_all_t<T> = 0>
1384+
XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, index<I>, requires_arch<rvv>) noexcept
1385+
{
1386+
XSIMD_IF_CONSTEXPR(I == 0)
1387+
{
1388+
return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
1389+
}
1390+
return get(arg, I, rvv {});
1391+
}
1392+
13721393
// all
13731394
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
13741395
XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept

0 commit comments

Comments
 (0)