Skip to content

Commit 945fce0

Browse files
committed
feat: add xsimd::get<I>() for compile-time element extraction
Introduces get<I>(batch) as a top-level API for extracting a single lane at a compile-time index. Falls back to the runtime get() when per-arch overloads aren't present.

Per-arch optimal lowerings:
- SSE2: pextrw / byte-shift+movd / swizzle+first, selected by lane width.
- SSE4.1: pextrb/w/d/q; I==0 short-circuits to first().
- AVX: I==0 short-circuits to first(); otherwise extract the relevant 128-bit half and take the SSE4.1 path.
- AVX-512F: I==0 short-circuits to first(); 32/64-bit lanes use valignd/valignq + first() (2 ops); 8/16-bit lanes halve through AVX.
- NEON / NEON64 / RVV: native single-lane extract intrinsics.
1 parent 5f1c41c commit 945fce0

11 files changed

Lines changed: 324 additions & 1 deletion

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -224,7 +224,8 @@ namespace xsimd
224224
template <class A, size_t I, class T>
225225
XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<common>) noexcept
226226
{
227-
alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
227+
using value_type = typename batch<std::complex<T>, A>::value_type;
228+
alignas(A::alignment()) value_type buffer[batch<std::complex<T>, A>::size];
228229
self.store_aligned(&buffer[0]);
229230
return buffer[I];
230231
}

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -748,6 +748,40 @@ namespace xsimd
748748
return self - batch<T, A>(mask.data);
749749
}
750750

751+
// get
752+
template <class A, size_t I>
753+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
754+
{
755+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx { }); }
756+
constexpr size_t elements_per_lane = batch<float, sse4_1>::size;
757+
constexpr size_t lane = I / elements_per_lane;
758+
constexpr size_t sub_index = I % elements_per_lane;
759+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
760+
return kernel::get(batch<float, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
761+
}
762+
763+
template <class A, size_t I>
764+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
765+
{
766+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx { }); }
767+
constexpr size_t elements_per_lane = batch<double, sse4_1>::size;
768+
constexpr size_t lane = I / elements_per_lane;
769+
constexpr size_t sub_index = I % elements_per_lane;
770+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
771+
return kernel::get(batch<double, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
772+
}
773+
774+
template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
775+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
776+
{
777+
XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx { }); }
778+
constexpr size_t elements_per_lane = batch<T, sse4_1>::size;
779+
constexpr size_t lane = I / elements_per_lane;
780+
constexpr size_t sub_index = I % elements_per_lane;
781+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
782+
return kernel::get(batch<T, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
783+
}
784+
751785
// insert
752786
template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
753787
XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1346,6 +1346,57 @@ namespace xsimd
13461346
}
13471347
}
13481348

1349+
// get: use valignd/valignq to rotate lane I into position 0 in a single op.
1350+
template <class A, size_t I>
1351+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1352+
{
1353+
XSIMD_IF_CONSTEXPR(I == 0)
1354+
{
1355+
return first(self, avx512f { });
1356+
}
1357+
const auto rotated = _mm512_alignr_epi32(_mm512_castps_si512(self), _mm512_castps_si512(self), I);
1358+
return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated)));
1359+
}
1360+
1361+
template <class A, size_t I>
1362+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1363+
{
1364+
XSIMD_IF_CONSTEXPR(I == 0)
1365+
{
1366+
return first(self, avx512f { });
1367+
}
1368+
const auto rotated = _mm512_alignr_epi64(_mm512_castpd_si512(self), _mm512_castpd_si512(self), I);
1369+
return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated)));
1370+
}
1371+
1372+
template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
1373+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
1374+
{
1375+
XSIMD_IF_CONSTEXPR(I == 0)
1376+
{
1377+
return first(self, avx512f { });
1378+
}
1379+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1380+
{
1381+
const auto rotated = _mm512_alignr_epi32(self, self, I);
1382+
return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 { });
1383+
}
1384+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1385+
{
1386+
const auto rotated = _mm512_alignr_epi64(self, self, I);
1387+
return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 { });
1388+
}
1389+
else
1390+
{
1391+
// 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves.
1392+
constexpr size_t elements_per_lane = batch<T, avx>::size;
1393+
constexpr size_t lane = I / elements_per_lane;
1394+
constexpr size_t sub_index = I % elements_per_lane;
1395+
const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
1396+
return kernel::get(batch<T, avx>(half), ::xsimd::index<sub_index> { }, avx { });
1397+
}
1398+
}
1399+
13491400
// insert
13501401
template <class A, size_t I>
13511402
XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2742,6 +2742,61 @@ namespace xsimd
27422742
return vshrq_n_s64(x, shift);
27432743
}
27442744

2745+
// get
2746+
template <class A, size_t I>
2747+
XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2748+
{
2749+
return vgetq_lane_f32(self, I);
2750+
}
2751+
2752+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2753+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2754+
{
2755+
return vgetq_lane_u8(self, I);
2756+
}
2757+
2758+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 1> = 0>
2759+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2760+
{
2761+
return vgetq_lane_s8(self, I);
2762+
}
2763+
2764+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2765+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2766+
{
2767+
return vgetq_lane_u16(self, I);
2768+
}
2769+
2770+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 2> = 0>
2771+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2772+
{
2773+
return vgetq_lane_s16(self, I);
2774+
}
2775+
2776+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2777+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2778+
{
2779+
return vgetq_lane_u32(self, I);
2780+
}
2781+
2782+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 4> = 0>
2783+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2784+
{
2785+
return vgetq_lane_s32(self, I);
2786+
}
2787+
2788+
template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
2789+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2790+
{
2791+
return vgetq_lane_u64(self, I);
2792+
}
2793+
2794+
template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 8> = 0>
2795+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
2796+
{
2797+
return vgetq_lane_s64(self, I);
2798+
}
2799+
27452800
// first
27462801
template <class A>
27472802
XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<neon>) noexcept

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,13 @@ namespace xsimd
3131
{
3232
using namespace types;
3333

34+
// get
35+
template <class A, size_t I>
36+
XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<neon64>) noexcept
37+
{
38+
return vgetq_lane_f64(self, I);
39+
}
40+
3441
// first
3542
template <class A>
3643
XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<neon64>) noexcept

include/xsimd/arch/xsimd_rvv.hpp

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1370,6 +1370,27 @@ namespace xsimd
13701370
return std::complex<T> { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) };
13711371
}
13721372

1373+
// get (compile-time index): skip the slidedown when I == 0; lane 0 maps straight to the scalar move.
1374+
template <class A, size_t I, class T, detail::rvv_enable_all_t<T> = 0>
1375+
XSIMD_INLINE T get(batch<T, A> const& arg, index<I>, requires_arch<rvv>) noexcept
1376+
{
1377+
XSIMD_IF_CONSTEXPR(I == 0)
1378+
{
1379+
return detail::rvvmv_lane0(arg);
1380+
}
1381+
return get(arg, I, rvv {});
1382+
}
1383+
1384+
template <class A, size_t I, class T, detail::rvv_enable_all_t<T> = 0>
1385+
XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, index<I>, requires_arch<rvv>) noexcept
1386+
{
1387+
XSIMD_IF_CONSTEXPR(I == 0)
1388+
{
1389+
return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
1390+
}
1391+
return get(arg, I, rvv {});
1392+
}
1393+
13731394
// all
13741395
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
13751396
XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2276,6 +2276,57 @@ namespace xsimd
22762276
}
22772277
}
22782278

2279+
// get (must appear after first and swizzle so it can delegate through the xsimd API)
2280+
namespace detail
2281+
{
2282+
// broadcast lane index I across a batch_constant<IdxT, A, I, I, ..., I> matching batch<T, A>::size
2283+
template <class T, class A, size_t I, size_t... Is>
2284+
XSIMD_INLINE auto broadcast_lane_index(std::index_sequence<Is...>) noexcept
2285+
-> batch_constant<as_unsigned_integer_t<T>, A, static_cast<as_unsigned_integer_t<T>>(Is * 0 + I)...>
2286+
{
2287+
return {};
2288+
}
2289+
2290+
template <class T, class A, size_t I>
2291+
XSIMD_INLINE auto broadcast_lane_index() noexcept
2292+
-> decltype(broadcast_lane_index<T, A, I>(std::make_index_sequence<batch<T, A>::size> {}))
2293+
{
2294+
return {};
2295+
}
2296+
}
2297+
2298+
template <class A, size_t I, class T>
2299+
XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= 2, T>::type
2300+
get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse2>) noexcept
2301+
{
2302+
XSIMD_IF_CONSTEXPR(I == 0)
2303+
{
2304+
return first(self, A {});
2305+
}
2306+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2307+
{
2308+
return static_cast<T>(_mm_extract_epi16(self, I));
2309+
}
2310+
else
2311+
{
2312+
// SSE2 has no pextrb; byte-lane shift + movd is the shortest path for I>0.
2313+
return static_cast<T>(_mm_cvtsi128_si32(_mm_srli_si128(self, I)) & 0xFF);
2314+
}
2315+
}
2316+
2317+
template <class A, size_t I, class T>
2318+
XSIMD_INLINE typename std::enable_if<(std::is_integral<T>::value && sizeof(T) >= 4) || std::is_floating_point<T>::value, T>::type
2319+
get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse2>) noexcept
2320+
{
2321+
XSIMD_IF_CONSTEXPR(I == 0)
2322+
{
2323+
return first(self, A {});
2324+
}
2325+
else
2326+
{
2327+
return first(swizzle(self, detail::broadcast_lane_index<T, A, I>(), A {}), A {});
2328+
}
2329+
}
22792330
}
22802331
}
22812332

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,41 @@ namespace xsimd
105105
return _mm_floor_pd(self);
106106
}
107107

108+
// get
109+
template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
110+
XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse4_1>) noexcept
111+
{
112+
XSIMD_IF_CONSTEXPR(I == 0)
113+
{
114+
return first(self, sse2 { });
115+
}
116+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
117+
{
118+
return static_cast<T>(_mm_extract_epi8(self, I));
119+
}
120+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
121+
{
122+
return static_cast<T>(_mm_extract_epi16(self, I));
123+
}
124+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
125+
{
126+
return static_cast<T>(_mm_extract_epi32(self, I));
127+
}
128+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
129+
{
130+
#if defined(__x86_64__)
131+
return static_cast<T>(_mm_extract_epi64(self, I));
132+
#else
133+
return get(self, ::xsimd::index<I> {}, sse2 {});
134+
#endif
135+
}
136+
else
137+
{
138+
assert(false && "unsupported arch/op combination");
139+
return {};
140+
}
141+
}
142+
108143
// insert
109144
template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
110145
XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept

include/xsimd/types/xsimd_api.hpp

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1109,6 +1109,37 @@ namespace xsimd
11091109
return x > y;
11101110
}
11111111

1112+
/**
1113+
* @ingroup batch_data_transfer
1114+
*
1115+
* Extract the scalar element at compile-time index \c I from batch \c b.
1116+
* @param b the batch to extract from.
1117+
* @return the scalar element at index \c I.
1118+
*/
1119+
template <size_t I, class T, class A>
1120+
XSIMD_INLINE T get(batch<T, A> const& b) noexcept
1121+
{
1122+
static_assert(I < batch<T, A>::size, "index out of bounds");
1123+
detail::static_check_supported_config<T, A>();
1124+
return kernel::get(b, index<I> {}, A {});
1125+
}
1126+
1127+
template <size_t I, class T, class A>
1128+
XSIMD_INLINE bool get(batch_bool<T, A> const& b) noexcept
1129+
{
1130+
static_assert(I < batch_bool<T, A>::size, "index out of bounds");
1131+
detail::static_check_supported_config<T, A>();
1132+
return kernel::get(b, index<I> {}, A {});
1133+
}
1134+
1135+
template <size_t I, class T, class A>
1136+
XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& b) noexcept
1137+
{
1138+
static_assert(I < batch<std::complex<T>, A>::size, "index out of bounds");
1139+
detail::static_check_supported_config<T, A>();
1140+
return kernel::get(b, index<I> {}, A {});
1141+
}
1142+
11121143
/**
11131144
* @ingroup batch_reducers
11141145
*

test/test_batch.cpp

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -158,6 +158,21 @@ struct batch_test
158158
CHECK_EQ(res.first(), lhs[0]);
159159
}
160160

161+
template <size_t... Is>
162+
void test_get_impl(batch_type const& res, std::index_sequence<Is...>) const
163+
{
164+
array_type extracted = { xsimd::get<Is>(res)... };
165+
CHECK_EQ(extracted, lhs);
166+
CHECK_BATCH_EQ(batch_type::load_unaligned(extracted.data()), res);
167+
}
168+
169+
void test_get() const
170+
{
171+
batch_type res = batch_lhs();
172+
CHECK_EQ(xsimd::get<0>(res), res.first());
173+
test_get_impl(res, std::make_index_sequence<size> {});
174+
}
175+
161176
void test_arithmetic() const
162177
{
163178
// +batch
@@ -986,6 +1001,11 @@ TEST_CASE_TEMPLATE("[batch]", B, BATCH_TYPES)
9861001
Test.test_first_element();
9871002
}
9881003

1004+
SUBCASE("get")
1005+
{
1006+
Test.test_get();
1007+
}
1008+
9891009
SUBCASE("arithmetic")
9901010
{
9911011
Test.test_arithmetic();

0 commit comments

Comments (0)