@@ -2588,30 +2588,50 @@ namespace xsimd
25882588 I16 / 2 , I18 / 2 , I20 / 2 , I22 / 2 , I24 / 2 , I26 / 2 , I28 / 2 , I30 / 2 >;
25892589 };
25902590
2591+ template <class A , uint16_t ... Is>
2592+ constexpr bool is_reduce_pattern ()
2593+ {
2594+ // The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1}
2595+ if (sizeof ...(Is) != batch<uint16_t , A>::size)
2596+ return false ;
2597+ uint16_t pattern[] = { Is... };
2598+ if (pattern[0 ] != 1 )
2599+ return false ;
2600+ for (size_t i = 1 ; i < sizeof ...(Is); i += 1 )
2601+ {
2602+ if (pattern[i] != (i & 1 ))
2603+ return false ;
2604+ }
2605+ return true ;
2606+ }
25912607 }
25922608
2593- template <class A , uint16_t ... Idx, class = std::enable_if_t <detail::is_pair_of_contiguous_indices<uint16_t , A, Idx...>::value>>
2594- XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, Idx...>, requires_arch<avx512f>) noexcept
2595- {
2596- constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
2597- return _mm512_permutexvar_epi32 (static_cast <batch<uint32_t , A>>(mask32), self);
2598- }
2599-
2600- template <class A >
2601- XSIMD_INLINE batch<uint16_t , A>
2602- swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, (uint16_t )1 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 >, requires_arch<avx512f>) noexcept
2609+ template <class A , uint16_t ... Idx>
2610+ XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, Idx...> mask, requires_arch<avx512f>) noexcept
26032611 {
2604- // FIXME: this sequence is very inefficient, but it's here to catch
2605- // a pattern generated by detail::reduce from xsimd_common_math.hpp.
2606- // The whole pattern is actually decently folded by GCC and Clang,
2607- // so bare with it.
2608- constexpr batch_constant<uint32_t , A, 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 > mask32;
2609- auto tmp = _mm512_permutexvar_epi32 (static_cast <batch<uint32_t , A>>(mask32), self);
2612+ XSIMD_IF_CONSTEXPR (detail::is_pair_of_contiguous_indices<uint16_t , A, Idx...>::value)
2613+ {
2614+ constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
2615+ return _mm512_permutexvar_epi32 (static_cast <batch<uint32_t , A>>(mask32), self);
2616+ }
2617+ else XSIMD_IF_CONSTEXPR (detail::is_reduce_pattern<A, Idx...>())
2618+ {
2619+ // FIXME: this sequence is very inefficient, but it's here to catch
2620+ // a pattern generated by detail::reduce from xsimd_common_math.hpp.
2621+ // The whole pattern is actually decently folded by GCC and Clang,
2622+ // so bare with it.
2623+ constexpr batch_constant<uint32_t , A, 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 > mask32;
2624+ auto tmp = _mm512_permutexvar_epi32 (static_cast <batch<uint32_t , A>>(mask32), self);
26102625
2611- alignas (A::alignment ()) uint16_t buffer[32 ];
2612- _mm512_store_si512 ((__m512i*)&buffer[0 ], tmp);
2613- buffer[0 ] = buffer[1 ];
2614- return _mm512_load_si512 (&buffer[0 ]);
2626+ alignas (A::alignment ()) uint16_t buffer[32 ];
2627+ _mm512_store_si512 ((__m512i*)&buffer[0 ], tmp);
2628+ buffer[0 ] = buffer[1 ];
2629+ return _mm512_load_si512 (&buffer[0 ]);
2630+ }
2631+ else
2632+ {
2633+ return swizzle (self, mask, common {});
2634+ }
26152635 }
26162636
26172637 template <class A , uint16_t ... Vs>
0 commit comments