perf: native AVX512BW masked load/store for 8/16-bit integers

DiamonDinoia · DiamonDinoia · commit 860bb558eb2f · 2026-06-10T14:57:39.000-04:00
8/16-bit int masked load/store on AVX512BW previously fell through to the
branchy common scalar fallback because xsimd_avx512bw.hpp had no
load_masked/store_masked overloads. Add four requires_arch<avx512bw>
overloads (runtime batch_bool + compile-time batch_bool_constant, load +
store) constrained to sizeof(T)==1||2, emitting the native vmovdqu8 /
vmovdqu16 predicated moves (2 instructions, no branch).

The size branch lives only in the runtime overloads; the constant
overloads delegate via mask.as_batch_bool(), which also avoids
batch_bool_constant::mask() (return type int) truncating a 64-lane int8
compile-time mask.

32/64-bit stays on the avx512f path; SSE/AVX2 8/16-bit scalar fallback is
hardware-forced and unchanged.
diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -378,6 +378,53 @@ namespace xsimd
             }
         }
 
+        // load_masked / store_masked: native vmovdqu8 / vmovdqu16 predication for
+        // 8/16-bit integers, replacing the common scalar fallback. No aligned masked
+        // 8/16 intrinsic exists and masked-off lanes do not fault, so loadu fits both modes.
+        template <class A, class T, class Mode,
+                  class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx512bw>) noexcept
+        { // Runtime-mask load; the Mode tag is accepted but never inspected.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm512_maskz_loadu_epi8((__mmask64)mask.mask(), mem); // maskz: unselected lanes read as zero
+            }
+            else
+            {
+                return _mm512_maskz_loadu_epi16((__mmask32)mask.mask(), mem); // 16-bit lanes take a 32-bit k-mask
+            }
+        }
+
+        template <class A, class T, class Mode,
+                  class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
+        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx512bw>) noexcept
+        { // Runtime-mask store for 8/16-bit integers; the Mode tag is accepted but never inspected.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                _mm512_mask_storeu_epi8((void*)mem, (__mmask64)mask.mask(), src); // 8-bit lanes take a 64-bit k-mask
+            }
+            else
+            {
+                _mm512_mask_storeu_epi16((void*)mem, (__mmask32)mask.mask(), src); // 16-bit lanes take a 32-bit k-mask
+            }
+        }
+
+        // Compile-time masks delegate to the runtime overloads; going through
+        // as_batch_bool() also avoids batch_bool_constant::mask() truncating a 64-lane int8 mask to int.
+        template <class A, class T, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx512bw>) noexcept
+        { // The 8- vs 16-bit size branch lives in the runtime-mask overload this forwards to.
+            return load_masked(mem, mask.as_batch_bool(), convert<T> {}, Mode {}, avx512bw {}); // arch tag passed explicitly as avx512bw
+        }
+
+        template <class A, class T, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
+        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx512bw>) noexcept
+        { // Compile-time-mask store: converts the constant mask to a batch_bool and forwards.
+            store_masked(mem, src, mask.as_batch_bool(), Mode {}, avx512bw {}); // arch tag passed explicitly as avx512bw
+        }
+
         // max
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -356,7 +356,8 @@ namespace xsimd
 
         // Runtime-mask load/store: same native k-register path as the constant
         // overloads above, minus the compile-time half-forwarding. 8/16-bit
-        // elements fall back to the common scalar path.
+        // elements are handled natively by avx512bw (vmovdqu8 / vmovdqu16);
+        // without AVX512BW they fall back to the common scalar path.
         template <class A, class T, class Mode,
                   typename = std::enable_if_t<(sizeof(T) >= 4)>>
         XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx512f>) noexcept