Skip to content

Commit f30c5e0

Browse files
committed
feat: add xsimd::get<I>() for compile-time element extraction
Introduces get<I>(batch) as a top-level API for extracting a single lane at a compile-time index. Falls back to the runtime get() when per-arch overloads aren't present.

Per-arch optimal lowerings:
- SSE2: pextrw / byte-shift+movd / swizzle+first by lane width.
- SSE4.1: pextrb/w/d/q; I==0 short-circuits to first().
- AVX: I==0 short-circuits to first(); else halve + SSE4.1 path.
- AVX-512F: I==0 short-circuits to first(); 32/64-bit lanes use valignd/valignq + first() (2 ops); 8/16-bit halve through AVX.
- NEON / NEON64 / RVV: native single-lane extract intrinsics.
1 parent 7c9611e commit f30c5e0

11 files changed

Lines changed: 404 additions & 81 deletions

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,8 @@ namespace xsimd
223223
template <class A, size_t I, class T>
224224
XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<common>) noexcept
225225
{
226-
alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
226+
using value_type = typename batch<std::complex<T>, A>::value_type;
227+
alignas(A::alignment()) value_type buffer[batch<std::complex<T>, A>::size];
227228
self.store_aligned(&buffer[0]);
228229
return buffer[I];
229230
}

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 74 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,80 @@ namespace xsimd
748748
return self - batch<T, A>(mask.data);
749749
}
750750

751+
// first (must precede get for two-phase lookup)
752+
template <class A>
753+
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
754+
{
755+
return _mm256_cvtss_f32(self);
756+
}
757+
758+
template <class A>
759+
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
760+
{
761+
return _mm256_cvtsd_f64(self);
762+
}
763+
764+
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
765+
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
766+
{
767+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
768+
{
769+
return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFF);
770+
}
771+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
772+
{
773+
return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFFFF);
774+
}
775+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
776+
{
777+
return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)));
778+
}
779+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
780+
{
781+
batch<T, sse4_2> low = _mm256_castsi256_si128(self);
782+
return first(low, sse4_2 {});
783+
}
784+
else
785+
{
786+
assert(false && "unsupported arch/op combination");
787+
return {};
788+
}
789+
}
790+
791+
// get
792+
template <class A, size_t I>
793+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
794+
{
795+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
796+
constexpr size_t elements_per_lane = batch<float, sse4_1>::size;
797+
constexpr size_t lane = I / elements_per_lane;
798+
constexpr size_t sub_index = I % elements_per_lane;
799+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
800+
return kernel::get(batch<float, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
801+
}
802+
803+
template <class A, size_t I>
804+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
805+
{
806+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
807+
constexpr size_t elements_per_lane = batch<double, sse4_1>::size;
808+
constexpr size_t lane = I / elements_per_lane;
809+
constexpr size_t sub_index = I % elements_per_lane;
810+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
811+
return kernel::get(batch<double, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
812+
}
813+
814+
template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
815+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
816+
{
817+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
818+
constexpr size_t elements_per_lane = batch<T, sse4_1>::size;
819+
constexpr size_t lane = I / elements_per_lane;
820+
constexpr size_t sub_index = I % elements_per_lane;
821+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
822+
return kernel::get(batch<T, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
823+
}
824+
751825
// insert
752826
template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
753827
XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
@@ -2015,46 +2089,6 @@ namespace xsimd
20152089
return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
20162090
}
20172091

2018-
// first
2019-
template <class A>
2020-
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
2021-
{
2022-
return _mm256_cvtss_f32(self);
2023-
}
2024-
2025-
template <class A>
2026-
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
2027-
{
2028-
return _mm256_cvtsd_f64(self);
2029-
}
2030-
2031-
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
2032-
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
2033-
{
2034-
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2035-
{
2036-
return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFF);
2037-
}
2038-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2039-
{
2040-
return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFFFF);
2041-
}
2042-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2043-
{
2044-
return static_cast<T>(_mm256_cvtsi256_si32(self));
2045-
}
2046-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
2047-
{
2048-
batch<T, sse4_2> low = _mm256_castsi256_si128(self);
2049-
return first(low, sse4_2 {});
2050-
}
2051-
else
2052-
{
2053-
assert(false && "unsupported arch/op combination");
2054-
return {};
2055-
}
2056-
}
2057-
20582092
// widen
20592093
template <class A, class T>
20602094
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 91 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,6 +1346,97 @@ namespace xsimd
13461346
}
13471347
}
13481348

1349+
// first (must precede get for two-phase lookup)
1350+
template <class A>
1351+
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
1352+
{
1353+
return _mm512_cvtss_f32(self);
1354+
}
1355+
1356+
template <class A>
1357+
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
1358+
{
1359+
return _mm512_cvtsd_f64(self);
1360+
}
1361+
1362+
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
1363+
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1364+
{
1365+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1366+
{
1367+
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
1368+
}
1369+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1370+
{
1371+
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
1372+
}
1373+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1374+
{
1375+
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
1376+
}
1377+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1378+
{
1379+
batch<T, sse4_2> low = _mm512_castsi512_si128(self);
1380+
return first(low, sse4_2 {});
1381+
}
1382+
else
1383+
{
1384+
assert(false && "unsupported arch/op combination");
1385+
return {};
1386+
}
1387+
}
1388+
1389+
// get: use valignd/valignq to rotate lane I into position 0 in a single op.
1390+
template <class A, size_t I>
1391+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1392+
{
1393+
XSIMD_IF_CONSTEXPR(I == 0)
1394+
{
1395+
return first(self, avx512f {});
1396+
}
1397+
const auto rotated = _mm512_alignr_epi32(_mm512_castps_si512(self), _mm512_castps_si512(self), I);
1398+
return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated)));
1399+
}
1400+
1401+
template <class A, size_t I>
1402+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1403+
{
1404+
XSIMD_IF_CONSTEXPR(I == 0)
1405+
{
1406+
return first(self, avx512f {});
1407+
}
1408+
const auto rotated = _mm512_alignr_epi64(_mm512_castpd_si512(self), _mm512_castpd_si512(self), I);
1409+
return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated)));
1410+
}
1411+
1412+
template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
1413+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1414+
{
1415+
XSIMD_IF_CONSTEXPR(I == 0)
1416+
{
1417+
return first(self, avx512f {});
1418+
}
1419+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1420+
{
1421+
const auto rotated = _mm512_alignr_epi32(self, self, I);
1422+
return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
1423+
}
1424+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1425+
{
1426+
const auto rotated = _mm512_alignr_epi64(self, self, I);
1427+
return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
1428+
}
1429+
else
1430+
{
1431+
// 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves.
1432+
constexpr size_t elements_per_lane = batch<T, avx>::size;
1433+
constexpr size_t lane = I / elements_per_lane;
1434+
constexpr size_t sub_index = I % elements_per_lane;
1435+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
1436+
return kernel::get(batch<T, avx>(half), ::xsimd::index<sub_index> {}, avx {});
1437+
}
1438+
}
1439+
13491440
// insert
13501441
template <class A, size_t I>
13511442
XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f>) noexcept
@@ -2753,46 +2844,6 @@ namespace xsimd
27532844
2));
27542845
}
27552846

2756-
// first
2757-
template <class A>
2758-
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
2759-
{
2760-
return _mm512_cvtss_f32(self);
2761-
}
2762-
2763-
template <class A>
2764-
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
2765-
{
2766-
return _mm512_cvtsd_f64(self);
2767-
}
2768-
2769-
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
2770-
XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
2771-
{
2772-
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2773-
{
2774-
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
2775-
}
2776-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2777-
{
2778-
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
2779-
}
2780-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2781-
{
2782-
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
2783-
}
2784-
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
2785-
{
2786-
batch<T, sse4_2> low = _mm512_castsi512_si128(self);
2787-
return first(low, sse4_2 {});
2788-
}
2789-
else
2790-
{
2791-
assert(false && "unsupported arch/op combination");
2792-
return {};
2793-
}
2794-
}
2795-
27962847
// widen
27972848
template <class A, class T>
27982849
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2742,6 +2742,61 @@ namespace xsimd
27422742
return vshrq_n_s64(x, shift);
27432743
}
27442744

2745+
// get
2746+
template <class A, size_t I>
2747+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2748+
{
2749+
return vgetq_lane_f32(self, I);
2750+
}
2751+
2752+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2753+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2754+
{
2755+
return vgetq_lane_u8(self, I);
2756+
}
2757+
2758+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 1> = 0>
2759+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2760+
{
2761+
return vgetq_lane_s8(self, I);
2762+
}
2763+
2764+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2765+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2766+
{
2767+
return vgetq_lane_u16(self, I);
2768+
}
2769+
2770+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 2> = 0>
2771+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2772+
{
2773+
return vgetq_lane_s16(self, I);
2774+
}
2775+
2776+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2777+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2778+
{
2779+
return vgetq_lane_u32(self, I);
2780+
}
2781+
2782+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 4> = 0>
2783+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2784+
{
2785+
return vgetq_lane_s32(self, I);
2786+
}
2787+
2788+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
2789+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2790+
{
2791+
return vgetq_lane_u64(self, I);
2792+
}
2793+
2794+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 8> = 0>
2795+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2796+
{
2797+
return vgetq_lane_s64(self, I);
2798+
}
2799+
27452800
// first
27462801
template <class A>
27472802
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<neon>) noexcept

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ namespace xsimd
3131
{
3232
using namespace types;
3333

34+
// get
35+
template <class A, size_t I>
36+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<neon64>) noexcept
37+
{
38+
return vgetq_lane_f64(self, I);
39+
}
40+
3441
// first
3542
template <class A>
3643
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<neon64>) noexcept

include/xsimd/arch/xsimd_rvv.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,6 +1369,27 @@ namespace xsimd
13691369
return std::complex<T> { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) };
13701370
}
13711371

1372+
// get (compile-time index): skip the slidedown when I == 0; lane 0 maps straight to the scalar move.
1373+
template <class A, size_t I, class T, detail::rvv_enable_all_t<T> = 0>
1374+
XSIMD_INLINE T get(batch<T, A> const& arg, index<I>, requires_arch<rvv>) noexcept
1375+
{
1376+
XSIMD_IF_CONSTEXPR(I == 0)
1377+
{
1378+
return detail::rvvmv_lane0(arg);
1379+
}
1380+
return get(arg, I, rvv {});
1381+
}
1382+
1383+
template <class A, size_t I, class T, detail::rvv_enable_all_t<T> = 0>
1384+
XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, index<I>, requires_arch<rvv>) noexcept
1385+
{
1386+
XSIMD_IF_CONSTEXPR(I == 0)
1387+
{
1388+
return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
1389+
}
1390+
return get(arg, I, rvv {});
1391+
}
1392+
13721393
// all
13731394
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
13741395
XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept

0 commit comments

Comments
 (0)