From 7b335b6c2f1d1f5019f7994e396afa59c3a376db Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 26 Nov 2025 08:35:10 +0100 Subject: [PATCH 1/2] [nits] Fix low-level lane size computation in AVX2 --- include/xsimd/arch/xsimd_avx2.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 2ee2a5241..f9628f450 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -193,7 +193,7 @@ namespace xsimd template XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - constexpr size_t lanes_per_half = sizeof(__m128i) / sizeof(T); + constexpr size_t lanes_per_half = batch::size / 2; // confined to lower 128-bit half → forward to SSE XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) From d67b318c016f16b0e8d4999cc879eb6d30989558 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 26 Nov 2025 08:35:33 +0100 Subject: [PATCH 2/2] Add support for masked load in SVE --- include/xsimd/arch/xsimd_sve.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index 3fd08be1f..4d81311f0 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -39,6 +39,17 @@ namespace xsimd template svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index {}); } + // predicate loading + template + svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); } + template + svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } + template + svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } + template + svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } + // count active lanes in a predicate XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } @@ -95,6 +106,13 @@ namespace xsimd return load_aligned(src, convert(), sve {}); } + // load_masked + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, Mode, requires_arch) noexcept + { + return svld1(detail::sve_pmask(), reinterpret_cast const*>(mem)); + } + // load_complex template = 0> XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept