@@ -1346,6 +1346,97 @@ namespace xsimd
             }
         }
 
+        // first (must precede get so that two-phase lookup finds it)
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
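+            // 8/16/32-bit elements come out of the low dword of the low 128-bit
+            // lane via _mm_cvtsi128_si32, masked down to the element width;
+            // 64-bit delegates to the SSE kernel on the low lane.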
+            XSIMD_IF_CONSTEXPR (sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR (sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR (sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
+            }
+            else XSIMD_IF_CONSTEXPR (sizeof(T) == 8)
+            {
+                batch<T, sse4_2> low = _mm512_castsi512_si128(self);
+                return first(low, sse4_2 {});
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // get: use valignd/valignq to rotate lane I into position 0 in a single op.
+        template <class A, size_t I>
+        XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (I == 0)
+            {
+                return first(self, avx512f {});
+            }
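+            // Aligning self against itself shifts modulo the register width,
+            // i.e. a rotate: one valignd lands lane I at position 0, and a
+            // scalar read of lane 0 finishes the job.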
+            const auto rotated = _mm512_alignr_epi32(_mm512_castps_si512(self), _mm512_castps_si512(self), I);
+            return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated)));
+        }
+
+        template <class A, size_t I>
+        XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (I == 0)
+            {
+                return first(self, avx512f {});
+            }
+            const auto rotated = _mm512_alignr_epi64(_mm512_castpd_si512(self), _mm512_castpd_si512(self), I);
+            return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated)));
+        }
+
+        template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (I == 0)
+            {
+                return first(self, avx512f {});
+            }
+            else XSIMD_IF_CONSTEXPR (sizeof(T) == 4)
+            {
+                const auto rotated = _mm512_alignr_epi32(self, self, I);
+                return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
+            }
+            else XSIMD_IF_CONSTEXPR (sizeof(T) == 8)
+            {
+                const auto rotated = _mm512_alignr_epi64(self, self, I);
+                return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
+            }
+            else
+            {
+                // 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves.
+                constexpr size_t elements_per_lane = batch<T, avx>::size;
+                constexpr size_t lane = I / elements_per_lane;
+                constexpr size_t sub_index = I % elements_per_lane;
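+                // e.g. int16_t has 16 elements per 256-bit half, so I == 20
+                // selects the upper half and recurses with sub_index == 4.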
+                const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+                return kernel::get(batch<T, avx>(half), ::xsimd::index<sub_index> {}, avx {});
+            }
+        }
+
         // insert
         template <class A, size_t I>
         XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f>) noexcept
@@ -2753,46 +2844,6 @@ namespace xsimd
                 2));
         }
 
-        // first
-        template <class A>
-        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            return _mm512_cvtss_f32(self);
-        }
-
-        template <class A>
-        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            return _mm512_cvtsd_f64(self);
-        }
-
-        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
-        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            XSIMD_IF_CONSTEXPR (sizeof(T) == 1)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
-            }
-            else XSIMD_IF_CONSTEXPR (sizeof(T) == 2)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
-            }
-            else XSIMD_IF_CONSTEXPR (sizeof(T) == 4)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
-            }
-            else XSIMD_IF_CONSTEXPR (sizeof(T) == 8)
-            {
-                batch<T, sse4_2> low = _mm512_castsi512_si128(self);
-                return first(low, sse4_2 {});
-            }
-            else
-            {
-                assert(false && "unsupported arch/op combination");
-                return {};
-            }
-        }
-
         // widen
         template <class A, class T>
         XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept