Skip to content

Commit 1ddbbcd

Browse files
Fix emulated arch interaction with avx512
It is possible to have a batch targeting an emulated build living alongside a batch for avx512. In such a configuration, trying to swizzle the emulated batch leads to the instantiation of the avx512 swizzle, which in turn tries to instantiate a batch constant that is invalid in avx512 terms. Fix the situation by aggregating all avx512 swizzles of uint16_t under one function and then switching on the index pattern. Based on an idea from @hdu-sdlzx
1 parent 4bdd822 commit 1ddbbcd

1 file changed

Lines changed: 40 additions & 20 deletions

File tree

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2588,30 +2588,50 @@ namespace xsimd
25882588
I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
25892589
};
25902590

2591+
template <class A, uint16_t... Is>
2592+
constexpr bool is_reduce_pattern()
2593+
{
2594+
// The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1}
2595+
if (sizeof...(Is) != batch<uint16_t, A>::size)
2596+
return false;
2597+
uint16_t pattern[] = { Is... };
2598+
if (pattern[0] != 1)
2599+
return false;
2600+
for (size_t i = 1; i < sizeof...(Is); i += 1)
2601+
{
2602+
if (pattern[i] != (i & 1))
2603+
return false;
2604+
}
2605+
return true;
2606+
}
25912607
}
25922608

2593-
template <class A, uint16_t... Idx, class = std::enable_if_t<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value>>
2594-
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
2595-
{
2596-
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
2597-
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
2598-
}
2599-
2600-
template <class A>
2601-
XSIMD_INLINE batch<uint16_t, A>
2602-
swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
2609+
template <class A, uint16_t... Idx>
2610+
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...> mask, requires_arch<avx512f>) noexcept
26032611
{
2604-
// FIXME: this sequence is very inefficient, but it's here to catch
2605-
// a pattern generated by detail::reduce from xsimd_common_math.hpp.
2606-
// The whole pattern is actually decently folded by GCC and Clang,
2607-
// so bear with it.
2608-
constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
2609-
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
2612+
XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value)
2613+
{
2614+
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
2615+
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
2616+
}
2617+
else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern<A, Idx...>())
2618+
{
2619+
// FIXME: this sequence is very inefficient, but it's here to catch
2620+
// a pattern generated by detail::reduce from xsimd_common_math.hpp.
2621+
// The whole pattern is actually decently folded by GCC and Clang,
2622+
// so bear with it.
2623+
constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
2624+
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
26102625

2611-
alignas(A::alignment()) uint16_t buffer[32];
2612-
_mm512_store_si512((__m512i*)&buffer[0], tmp);
2613-
buffer[0] = buffer[1];
2614-
return _mm512_load_si512(&buffer[0]);
2626+
alignas(A::alignment()) uint16_t buffer[32];
2627+
_mm512_store_si512((__m512i*)&buffer[0], tmp);
2628+
buffer[0] = buffer[1];
2629+
return _mm512_load_si512(&buffer[0]);
2630+
}
2631+
else
2632+
{
2633+
return swizzle(self, mask, common {});
2634+
}
26152635
}
26162636

26172637
template <class A, uint16_t... Vs>

0 commit comments

Comments
 (0)