Skip to content

Commit 860bb55

Browse files
committed
perf: native AVX512BW masked load/store for 8/16-bit integers
8/16-bit int masked load/store on AVX512BW previously fell through to the branchy common scalar fallback because xsimd_avx512bw.hpp had no load_masked/store_masked overloads. Add four requires_arch<avx512bw> overloads (runtime batch_bool + compile-time batch_bool_constant, load + store) constrained to sizeof(T)==1||2, emitting the native vmovdqu8 / vmovdqu16 predicated moves (2 instructions, no branch). The size branch lives only in the runtime overloads; the constant overloads delegate via mask.as_batch_bool(), which also avoids batch_bool_constant::mask() (return type int) truncating a 64-lane int8 compile-time mask. 32/64-bit stays on the avx512f path; SSE/AVX2 8/16-bit scalar fallback is hardware-forced and unchanged.
1 parent 262f5a7 commit 860bb55

2 files changed

Lines changed: 49 additions & 1 deletion

File tree

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,53 @@ namespace xsimd
378378
}
379379
}
380380

381+
// load_masked / store_masked: native vmovdqu8 / vmovdqu16 predication for
382+
// 8/16-bit, replacing the common scalar fallback. No aligned masked 8/16
383+
// intrinsic exists and masked moves never fault, so loadu fits both modes.
384+
template <class A, class T, class Mode,
385+
class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
386+
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx512bw>) noexcept
387+
{
388+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
389+
{
390+
return _mm512_maskz_loadu_epi8((__mmask64)mask.mask(), mem);
391+
}
392+
else
393+
{
394+
return _mm512_maskz_loadu_epi16((__mmask32)mask.mask(), mem);
395+
}
396+
}
397+
398+
template <class A, class T, class Mode,
399+
class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
400+
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx512bw>) noexcept
401+
{
402+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
403+
{
404+
_mm512_mask_storeu_epi8((void*)mem, (__mmask64)mask.mask(), src);
405+
}
406+
else
407+
{
408+
_mm512_mask_storeu_epi16((void*)mem, (__mmask32)mask.mask(), src);
409+
}
410+
}
411+
412+
// Constant masks reuse the runtime overloads; as_batch_bool() also avoids
413+
// batch_bool_constant::mask() truncating a 64-lane int8 mask to int.
414+
template <class A, class T, bool... Values, class Mode,
415+
class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
416+
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx512bw>) noexcept
417+
{
418+
return load_masked(mem, mask.as_batch_bool(), convert<T> {}, Mode {}, avx512bw {});
419+
}
420+
421+
template <class A, class T, bool... Values, class Mode,
422+
class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
423+
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx512bw>) noexcept
424+
{
425+
store_masked(mem, src, mask.as_batch_bool(), Mode {}, avx512bw {});
426+
}
427+
381428
// max
382429
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
383430
XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,8 @@ namespace xsimd
356356

357357
// Runtime-mask load/store: same native k-register path as the constant
358358
// overloads above, minus the compile-time half-forwarding. 8/16-bit
359-
// elements fall back to the common scalar path.
359+
// elements are handled natively by avx512bw (vmovdqu8 / vmovdqu16);
360+
// without AVX512BW they fall back to the common scalar path.
360361
template <class A, class T, class Mode,
361362
typename = std::enable_if_t<(sizeof(T) >= 4)>>
362363
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx512f>) noexcept

0 commit comments

Comments
 (0)