Skip to content

Commit 65fc123

Browse files
Fix emulated arch interaction with avx512
It is possible to have a batch targeting an emulated build living alongside a batch for avx512. In such a configuration, trying to swizzle the emulated build leads to the instantiation of the avx512 swizzle, which itself tries to instantiate a batch constant that is invalid in avx512 terms. Fix the situation by aggregating all avx512 swizzles of uint16_t under one function and then switching on patterns.
1 parent 6857c86 commit 65fc123

2 files changed

Lines changed: 35 additions & 20 deletions

File tree

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2588,30 +2588,45 @@ namespace xsimd
25882588
I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
25892589
};
25902590

2591-
}
2591+
template<class A, uint16_t... Is>
2592+
constexpr bool is_reduce_pattern() {
2593+
// The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1}
2594+
if(sizeof...(Is) != batch<uint16_t, A>::size) return false;
2595+
uint16_t pattern[] = {Is...};
2596+
if(pattern[0] != 1)
2597+
return false;
2598+
for(size_t i = 1; i < sizeof...(Is); i += 1) {
2599+
if(pattern[i] != (i & 1))
2600+
return false;
2601+
}
2602+
return true;
2603+
}
25922604

2593-
template <class A, uint16_t... Idx, class = std::enable_if_t<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value>>
2594-
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
2595-
{
2596-
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
2597-
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
25982605
}
25992606

2600-
template <class A>
2601-
XSIMD_INLINE batch<uint16_t, A>
2602-
swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
2607+
template <class A, uint16_t... Idx>
2608+
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...> mask, requires_arch<avx512f>) noexcept
26032609
{
2604-
// FIXME: this sequence is very inefficient, but it's here to catch
2605-
// a pattern generated by detail::reduce from xsimd_common_math.hpp.
2606-
// The whole pattern is actually decently folded by GCC and Clang,
2607-
// so bare with it.
2608-
constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
2609-
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
2610+
XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value) {
2611+
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
2612+
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
2613+
}
2614+
else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern<A, Idx...>()) {
2615+
// FIXME: this sequence is very inefficient, but it's here to catch
2616+
// a pattern generated by detail::reduce from xsimd_common_math.hpp.
2617+
// The whole pattern is actually decently folded by GCC and Clang,
2618+
// so bare with it.
2619+
constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
2620+
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
26102621

2611-
alignas(A::alignment()) uint16_t buffer[32];
2612-
_mm512_store_si512((__m512i*)&buffer[0], tmp);
2613-
buffer[0] = buffer[1];
2614-
return _mm512_load_si512(&buffer[0]);
2622+
alignas(A::alignment()) uint16_t buffer[32];
2623+
_mm512_store_si512((__m512i*)&buffer[0], tmp);
2624+
buffer[0] = buffer[1];
2625+
return _mm512_load_si512(&buffer[0]);
2626+
}
2627+
else {
2628+
return swizzle(self, mask, common{});
2629+
}
26152630
}
26162631

26172632
template <class A, uint16_t... Vs>

test/test_arch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
****************************************************************************/
1111

1212
#include "xsimd/xsimd.hpp"
13-
#if !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && !defined(XSIMD_WITH_EMULATED)
13+
#ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
1414

1515
#include <numeric>
1616
#include <type_traits>

0 commit comments

Comments
 (0)