@@ -2588,30 +2588,45 @@ namespace xsimd
25882588 I16 / 2 , I18 / 2 , I20 / 2 , I22 / 2 , I24 / 2 , I26 / 2 , I28 / 2 , I30 / 2 >;
25892589 };
25902590
2591- }
2591+ template <class A , uint16_t ... Is>
2592+ constexpr bool is_reduce_pattern () {
2593+ // The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1}
2594+ if (sizeof ...(Is) != batch<uint16_t , A>::size) return false ;
2595+ uint16_t pattern[] = {Is...};
2596+ if (pattern[0 ] != 1 )
2597+ return false ;
2598+ for (size_t i = 1 ; i < sizeof ...(Is); i += 1 ) {
2599+ if (pattern[i] != (i & 1 ))
2600+ return false ;
2601+ }
2602+ return true ;
2603+ }
25922604
2593- template <class A , uint16_t ... Idx, class = std::enable_if_t <detail::is_pair_of_contiguous_indices<uint16_t , A, Idx...>::value>>
2594- XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, Idx...>, requires_arch<avx512f>) noexcept
2595- {
2596- constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
2597- return _mm512_permutexvar_epi32 (static_cast <batch<uint32_t , A>>(mask32), self);
25982605 }
25992606
2600- template <class A >
2601- XSIMD_INLINE batch<uint16_t , A>
2602- swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, (uint16_t )1 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 , (uint16_t )0 , (uint16_t )1 >, requires_arch<avx512f>) noexcept
// Swizzle a batch of 32 uint16_t lanes according to a compile-time index
// pattern, picking the cheapest AVX512F sequence the pattern allows.
// Falls back to the generic implementation when no special case matches.
template <class A, uint16_t... Idx>
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...> mask, requires_arch<avx512f>) noexcept
{
    XSIMD_IF_CONSTEXPR (detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value)
    {
        // Each adjacent pair of 16-bit indices addresses one full 32-bit
        // lane, so the whole swizzle folds into a single 32-bit permute.
        constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
        return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
    }
    else XSIMD_IF_CONSTEXPR (detail::is_reduce_pattern<A, Idx...>())
    {
        // FIXME: this sequence is very inefficient, but it's here to catch
        // a pattern generated by detail::reduce from xsimd_common_math.hpp.
        // The whole pattern is actually decently folded by GCC and Clang,
        // so bear with it.
        // Broadcasting 32-bit lane 0 yields the {0,1,0,1,...} pattern; the
        // store/patch/load below then fixes up element 0 to produce the
        // required {1,1,0,1,0,1,...} result.
        constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
        auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);

        alignas(A::alignment()) uint16_t buffer[32];
        _mm512_store_si512((__m512i*)&buffer[0], tmp);
        buffer[0] = buffer[1];
        return _mm512_load_si512(&buffer[0]);
    }
    else
    {
        // No specialized pattern matched: defer to the common (generic)
        // swizzle implementation.
        return swizzle(self, mask, common{});
    }
}
26162631
26172632 template <class A , uint16_t ... Vs>
0 commit comments