@@ -2588,30 +2588,45 @@ namespace xsimd
25882588 I16 / 2 , I18 / 2 , I20 / 2 , I22 / 2 , I24 / 2 , I26 / 2 , I28 / 2 , I30 / 2 >;
25892589 };
25902590
2591- }
2591+ template <class A , uint16_t ... Is>
2592+ constexpr bool is_reduce_pattern () {
2593+ // The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1}
2594+ if (sizeof ...(Is) != batch<uint16_t , A>::size) return false ;
2595+ uint16_t pattern[] = {Is...};
2596+ if (pattern[0 ] != 1 )
2597+ return false ;
2598+ for (size_t i = 1 ; i < sizeof ...(Is); i += 1 ) {
2599+ if (pattern[i] != (i & 1 ))
2600+ return false ;
2601+ }
2602+ return true ;
2603+ }
25922604
2593- template <class A , uint16_t ... Idx, class = std::enable_if_t <detail::is_pair_of_contiguous_indices<uint16_t , A, Idx...>::value>>
2594- XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, Idx...>, requires_arch<avx512f>) noexcept
2595- {
2596- constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
2597- return _mm512_permutexvar_epi32 (static_cast <batch<uint32_t , A>>(mask32), self);
25982605 }
25992606
2600- template <class A >
2601- XSIMD_INLINE batch<uint16_t , A>
2602- swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, (uint16_t )1 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 >, requires_arch<avx512f>) noexcept
// Swizzle a batch of 32 uint16_t lanes according to a compile-time index
// pattern, picking the cheapest AVX512F sequence the pattern allows.
// Falls back to the generic implementation when no special case matches.
template <class A, uint16_t... Idx>
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...> mask, requires_arch<avx512f>) noexcept
{
    XSIMD_IF_CONSTEXPR (detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value)
    {
        // Each adjacent pair of 16-bit indices addresses one full 32-bit
        // lane, so the whole swizzle folds into a single 32-bit permute.
        constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
        return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
    }
    else XSIMD_IF_CONSTEXPR (detail::is_reduce_pattern<A, Idx...>())
    {
        // FIXME: this sequence is very inefficient, but it's here to catch
        // a pattern generated by detail::reduce from xsimd_common_math.hpp.
        // The whole pattern is actually decently folded by GCC and Clang,
        // so bear with it.
        // Broadcasting 32-bit lane 0 yields the {0,1,0,1,...} pattern; the
        // store/patch/load below then fixes up element 0 to produce the
        // required {1,1,0,1,0,1,...} result.
        constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
        auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);

        alignas(A::alignment()) uint16_t buffer[32];
        _mm512_store_si512((__m512i*)&buffer[0], tmp);
        buffer[0] = buffer[1];
        return _mm512_load_si512(&buffer[0]);
    }
    else
    {
        // No specialized pattern matched: defer to the common (generic)
        // swizzle implementation.
        return swizzle(self, mask, common{});
    }
}
26162631
26172632 template <class A , uint16_t ... Vs>
0 commit comments