
Commit 42e74b7

feat: add runtime batch_bool mask overloads for load_masked/store_masked
Add runtime-mask overloads of xsimd::load_masked and xsimd::store_masked across AVX, AVX2, AVX-512, SSE, SVE, RVV, and NEON. The generic common-path fallback is collapsed to a whole-vector select, and the unaligned page-cross fast path is dropped, since the underlying intrinsics suppress faults on masked-off lanes regardless of alignment.
1 parent 5141ff0 commit 42e74b7

11 files changed: 462 additions & 11 deletions


docs/source/api/data_transfer.rst

Lines changed: 15 additions & 2 deletions
@@ -12,7 +12,7 @@ Data Transfers
 From memory:

 +---------------------------------------+----------------------------------------------------+
-| :cpp:func:`load`                      | load values from memory (optionally masked)        |
+| :cpp:func:`load`                      | load values from memory (optionally masked) [#m]_  |
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`load_aligned`              | load values from aligned memory                    |
 +---------------------------------------+----------------------------------------------------+
@@ -32,7 +32,7 @@ From a scalar:
 To memory:

 +---------------------------------------+----------------------------------------------------+
-| :cpp:func:`store`                     | store values to memory (optionally masked)         |
+| :cpp:func:`store`                     | store values to memory (optionally masked) [#m]_   |
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`store_aligned`             | store values to aligned memory                     |
 +---------------------------------------+----------------------------------------------------+
@@ -84,3 +84,16 @@ The following empty types are used for tag dispatching:

 .. doxygenstruct:: xsimd::unaligned_mode
    :project: xsimd
+
+.. rubric:: Footnotes
+
+.. [#m] Masked ``load`` / ``store`` come in two flavours. The
+   :cpp:class:`batch_bool_constant` overload encodes the mask in the type, is
+   resolved at compile time and is always efficient. The runtime
+   :cpp:class:`batch_bool` overload, by contrast, falls back to a per-lane
+   scalar loop on architectures without a native masked load/store
+   instruction — SSE2 through SSE4.2, NEON/NEON64, VSX, S390x, and WASM.
+   AVX, AVX2, AVX-512, SVE and RVV use native masked instructions and pay no
+   such penalty. Prefer the compile-time mask whenever the selection is known
+   at compile time, and avoid runtime-mask loads/stores in hot inner loops on
+   the affected architectures.
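Aside (not part of the commit): the contract this footnote describes can be pinned down in a few lines of scalar C++. Whatever path an architecture takes — native masked instruction, select-based lowering, or per-lane loop — a runtime-mask load must behave as if each active lane were read individually and each inactive lane were zeroed. A minimal reference sketch (the function name is illustrative, not an xsimd API):

#include <array>
#include <cstddef>

// Reference semantics of a runtime-mask load: active lanes read mem[i],
// inactive lanes yield T(0), and inactive lanes never touch memory.
template <class T, std::size_t N>
std::array<T, N> masked_load_ref(T const* mem, std::array<bool, N> const& mask)
{
    std::array<T, N> out {}; // zero-initialised: inactive lanes stay 0
    for (std::size_t i = 0; i < N; ++i)
        if (mask[i])
            out[i] = mem[i]; // only active lanes read memory
    return out;
}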

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 61 additions & 0 deletions
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <array>
 #include <complex>
+#include <cstdint>

 #include "../../types/xsimd_batch_constant.hpp"
 #include "./xsimd_common_details.hpp"
@@ -374,6 +375,39 @@ namespace xsimd
             return batch<T_out, A>::load(buffer.data(), aligned_mode {});
         }

+        template <class A, class T>
+        XSIMD_INLINE batch<T, A>
+        load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, aligned_mode, requires_arch<common>) noexcept
+        {
+            // Aligned mode contract: ``mem`` is aligned to ``A::alignment()``,
+            // and ``A::alignment() >= sizeof(batch<T, A>)`` for every common-
+            // fallback arch (SSE2-SSE4.2, NEON, NEON64, VSX, S390x, WASM — all
+            // 16-byte aligned, 16-byte vectors). The whole vector therefore
+            // lives inside a single alignment unit (and a single page, since
+            // pages are >= alignment), so an unconditional load cannot fault
+            // on inactive lanes. Lower the masked load to ``select`` against a
+            // zero broadcast — collapses to ~3 SIMD ops on every fallback arch.
+            return select(mask,
+                          batch<T, A>::load_aligned(mem),
+                          batch<T, A>(T(0)));
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A>
+        load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, unaligned_mode, requires_arch<common>) noexcept
+        {
+            // Unaligned + runtime mask: ``mem`` may straddle a page boundary
+            // whose neighbour is unmapped, so an unconditional whole-vector
+            // ``load_unaligned`` is unsafe. Stay scalar.
+            constexpr std::size_t size = batch<T, A>::size;
+            alignas(A::alignment()) std::array<T, size> buffer {};
+            const uint64_t bits = mask.mask();
+            for (std::size_t i = 0; i < size; ++i)
+                if ((bits >> i) & uint64_t(1))
+                    buffer[i] = mem[i];
+            return batch<T, A>::load_aligned(buffer.data());
+        }
+
         template <class A, class T_in, class T_out, bool... Values, class alignment>
         XSIMD_INLINE void
         store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
@@ -388,6 +422,33 @@ namespace xsimd
             }
         }

+        template <class A, class T>
+        XSIMD_INLINE void
+        store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, aligned_mode, requires_arch<common>) noexcept
+        {
+            // Symmetric to load_masked: aligned ``mem`` cannot fault for any
+            // lane in the batch, so a read-modify-write through ``select`` is
+            // safe and collapses to load + select + store on every fallback
+            // arch.
+            const auto current = batch<T, A>::load_aligned(mem);
+            select(mask, src, current).store_aligned(mem);
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE void
+        store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, unaligned_mode, requires_arch<common>) noexcept
+        {
+            // Symmetric to the unaligned load: unaligned RMW could fault on a
+            // page boundary, so stay scalar.
+            constexpr std::size_t size = batch<T, A>::size;
+            alignas(A::alignment()) std::array<T, size> src_buf;
+            src.store_aligned(src_buf.data());
+            const uint64_t bits = mask.mask();
+            for (std::size_t i = 0; i < size; ++i)
+                if ((bits >> i) & uint64_t(1))
+                    mem[i] = src_buf[i];
+        }
+
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
         {
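Aside (not part of the diff): to make the "collapses to ~3 SIMD ops" claim in the aligned path concrete, here is roughly what select-against-zero lowers to on bare SSE2, written with raw intrinsics outside of xsimd. The function name is illustrative; the intrinsics are standard SSE2.

#include <emmintrin.h> // SSE2
#include <cstdint>

// What the aligned common-path load_masked boils down to on SSE2: one full
// aligned load plus one AND against the lane mask (all-ones for active
// lanes). The 16-byte aligned load cannot cross a page boundary, so the
// inactive lanes are read but can never fault.
__m128i masked_load_aligned_i32(int32_t const* mem /* 16-byte aligned */, __m128i lane_mask)
{
    __m128i v = _mm_load_si128(reinterpret_cast<__m128i const*>(mem));
    return _mm_and_si128(v, lane_mask); // select(mask, v, 0)
}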

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 33 additions & 0 deletions
@@ -1015,6 +1015,23 @@ namespace xsimd
             }
         }

+        // Runtime-mask load for float/double on AVX. Both aligned_mode and
+        // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault
+        // on masked-off lanes, so partial loads across page boundaries are safe.
+        template <class A, class Mode>
+        XSIMD_INLINE batch<float, A>
+        load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx>) noexcept
+        {
+            return _mm256_maskload_ps(mem, _mm256_castps_si256(mask));
+        }
+
+        template <class A, class Mode>
+        XSIMD_INLINE batch<double, A>
+        load_masked(double const* mem, batch_bool<double, A> mask, convert<double>, Mode, requires_arch<avx>) noexcept
+        {
+            return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask));
+        }
+
         // store_masked
         namespace detail
         {
@@ -1031,6 +1048,22 @@ namespace xsimd
             }
         }

+        // Runtime-mask store for float/double on AVX. Same fault-suppression
+        // semantics as the masked loads above; alignment mode is irrelevant.
+        template <class A, class Mode>
+        XSIMD_INLINE void
+        store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx>) noexcept
+        {
+            _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src);
+        }
+
+        template <class A, class Mode>
+        XSIMD_INLINE void
+        store_masked(double* mem, batch<double, A> const& src, batch_bool<double, A> mask, Mode, requires_arch<avx>) noexcept
+        {
+            _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src);
+        }
+
         template <class A, class T, bool... Values, class Mode>
         XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
         {
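Aside (not part of the diff): a small standalone illustration of the fault-suppression property these overloads rely on. _mm256_maskload_ps reads only the lanes whose 32-bit mask element has its most significant bit set and zeroes the rest, so a partial vector at the edge of a mapped region is safe to load. The mask construction and function name below are illustrative, not xsimd code.

#include <immintrin.h>
#include <cstdint>

// Load the first n floats of a buffer (0 <= n <= 8) without reading past it.
// Inactive lanes are zeroed and perform no memory access.
__m256 load_first_n(float const* p, int n)
{
    alignas(32) int32_t m[8];
    for (int i = 0; i < 8; ++i)
        m[i] = (i < n) ? -1 : 0; // MSB set => lane active
    __m256i mask = _mm256_load_si256(reinterpret_cast<__m256i const*>(m));
    return _mm256_maskload_ps(p, mask);
}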

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 28 additions & 5 deletions
@@ -119,7 +119,6 @@ namespace xsimd
         }

         // load_masked
-        // AVX2 low-level helpers (operate on raw SIMD registers)
         namespace detail
         {
             XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept
@@ -138,14 +137,12 @@ namespace xsimd
             }
         }

-        // single templated implementation for integer masked loads (32/64-bit)
         template <class A, class T, bool... Values, class Mode>
         XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
         load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
         {
             static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
             using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
-            // Use the raw register-level maskload helpers for the remaining cases.
             return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
         }

@@ -175,6 +172,20 @@ namespace xsimd
             return bitwise_cast<uint64_t>(r);
         }

+        // Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers
+        // fall back to the scalar common path: AVX2 has no native maskload for
+        // those widths, and a load-then-blend would break fault-suppression at
+        // page boundaries (the main reason callers ask for a masked load).
+        // Both aligned_mode and unaligned_mode route to the same intrinsic —
+        // masked-off lanes do not fault regardless of alignment.
+        template <class A, class T, class Mode>
+        XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
+        load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
+        {
+            using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
+            return detail::maskload(reinterpret_cast<const int_t*>(mem), __m256i(mask));
+        }
+
         // store_masked
         namespace detail
         {
@@ -196,14 +207,12 @@ namespace xsimd
         {
             constexpr size_t lanes_per_half = batch<T, A>::size / 2;

-            // confined to lower 128-bit half → forward to SSE
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                 const auto lo = detail::lower_half(src);
                 store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
             }
-            // confined to upper 128-bit half → forward to SSE
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
@@ -230,6 +239,20 @@ namespace xsimd
             store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
         }

+        template <class A, class T, class Mode>
+        XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
+        store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                _mm256_maskstore_epi32(reinterpret_cast<int*>(mem), __m256i(mask), __m256i(src));
+            }
+            else
+            {
+                _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), __m256i(mask), __m256i(src));
+            }
+        }
+
         // load_stream
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
         XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
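Aside (not part of the diff): for symmetry with the AVX load sketch above, an illustrative (non-xsimd) tail store built on the same AVX2 integer maskstore used in this file. Lanes whose mask MSB is clear are never written, so storing a partial vector at the end of an array neither faults nor clobbers adjacent data.

#include <immintrin.h>
#include <cstdint>

// Store the first n int32 lanes of v to p (0 <= n <= 8); the remaining
// destination elements are left untouched.
void store_first_n(int32_t* p, __m256i v, int n)
{
    alignas(32) int32_t m[8];
    for (int i = 0; i < 8; ++i)
        m[i] = (i < n) ? -1 : 0; // MSB set => lane written
    __m256i mask = _mm256_load_si256(reinterpret_cast<__m256i const*>(m));
    _mm256_maskstore_epi32(reinterpret_cast<int*>(p), mask, v);
}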

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 8 additions & 0 deletions
@@ -79,8 +79,16 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> load(T const* mem, unaligned_mode, requires_arch<A>) noexcept;
         template <class A, class T_in, class T_out, bool... Values, class alignment>
         XSIMD_INLINE batch<T_out, A> load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...> mask, convert<T_out>, alignment, requires_arch<common>) noexcept;
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, aligned_mode, requires_arch<common>) noexcept;
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, unaligned_mode, requires_arch<common>) noexcept;
         template <class A, class T_in, class T_out, bool... Values, class alignment>
         XSIMD_INLINE void store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...> mask, alignment, requires_arch<common>) noexcept;
+        template <class A, class T>
+        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, aligned_mode, requires_arch<common>) noexcept;
+        template <class A, class T>
+        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, unaligned_mode, requires_arch<common>) noexcept;
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<A>) noexcept;
         template <class A, bool... Values, class Mode>

include/xsimd/arch/xsimd_rvv.hpp

Lines changed: 24 additions & 0 deletions
@@ -409,6 +409,11 @@ namespace xsimd
         {
             XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*))
             XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec))
+            // Masked load (mask-undisturbed with zero passthrough): inactive lanes read as 0,
+            // no memory access is performed for inactive lanes (page-fault safe).
+            XSIMD_RVV_OVERLOAD(rvvle_mu, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM _mu), , vec(bvec, vec, T const*))
+            // Masked store: inactive lanes are not written.
+            XSIMD_RVV_OVERLOAD(rvvse_m, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM _m), , void(bvec, T*, vec))
         }

         template <class A, class T, detail::enable_arithmetic_t<T> = 0>
@@ -423,6 +428,16 @@ namespace xsimd
             return load_aligned<A>(src, convert<T>(), rvv {});
         }

+        // load_masked (runtime mask): native vle*.v vd, (rs1), v0.t with zero-init
+        // passthrough so inactive lanes read as 0, matching xsimd's contract.
+        template <class A, class T, class Mode, detail::enable_arithmetic_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<rvv>) noexcept
+        {
+            using proj_t = project_num_t<T>;
+            const auto zero = detail_rvv::rvvmv_splat(proj_t {});
+            return detail_rvv::rvvle_mu(mask, zero, reinterpret_cast<proj_t const*>(mem));
+        }
+
         // load_complex
         namespace detail_rvv
         {
@@ -500,6 +515,15 @@ namespace xsimd
             store_aligned<A>(dst, src, rvv {});
         }

+        // store_masked (runtime mask): native vse*.v vd, (rs1), v0.t — inactive lanes
+        // are not written (page-fault safe).
+        template <class A, class T, class Mode, detail::enable_arithmetic_t<T> = 0>
+        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<rvv>) noexcept
+        {
+            using proj_t = project_num_t<T>;
+            detail_rvv::rvvse_m(mask, reinterpret_cast<proj_t*>(mem), src);
+        }
+
         /******************
          * scatter/gather *
          ******************/
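Aside (not part of the diff): a standalone sketch of the mask-undisturbed load that the rvvle_mu overload wraps, written directly against the RVV C intrinsics (v1.0-era `_mu` policy naming; the exact spellings should be verified against your toolchain, and the function name is illustrative).

#include <riscv_vector.h>
#include <cstddef>

// Masked load of up to vl floats: inactive lanes take the zero passthrough
// and perform no memory access, so partial vectors at page edges are safe.
vfloat32m1_t masked_load_f32(float const* p, vbool32_t m, size_t vl)
{
    vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0.0f, vl); // passthrough = 0
    return __riscv_vle32_v_f32m1_mu(m, zero, p, vl);      // vle32.v ..., v0.t
}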

include/xsimd/arch/xsimd_sve.hpp

Lines changed: 23 additions & 2 deletions
@@ -101,13 +101,20 @@ namespace xsimd
             return load_aligned<A>(src, convert<T>(), sve {});
         }

-        // load_masked
+        // load_masked (compile-time mask)
         template <class A, class T, bool... Values, class Mode, detail::enable_arithmetic_t<T> = 0>
-        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<float, A, Values...>, Mode, requires_arch<sve>) noexcept
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...>, convert<T>, Mode, requires_arch<sve>) noexcept
         {
             return svld1(detail_sve::pmask<Values...>(), reinterpret_cast<map_to_sized_type_t<T> const*>(mem));
         }

+        // load_masked (runtime mask)
+        template <class A, class T, class Mode, detail::enable_arithmetic_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<sve>) noexcept
+        {
+            return svld1(mask, reinterpret_cast<project_num_t<T> const*>(mem));
+        }
+
         // load_complex
         template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
@@ -141,6 +148,20 @@ namespace xsimd
             store_aligned<A>(dst, src, sve {});
         }

+        // store_masked (compile-time mask)
+        template <class A, class T, bool... Values, class Mode, detail::enable_arithmetic_t<T> = 0>
+        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...>, Mode, requires_arch<sve>) noexcept
+        {
+            svst1(detail_sve::pmask<Values...>(), reinterpret_cast<project_num_t<T>*>(mem), src);
+        }
+
+        // store_masked (runtime mask)
+        template <class A, class T, class Mode, detail::enable_arithmetic_t<T> = 0>
+        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<sve>) noexcept
+        {
+            svst1(mask, reinterpret_cast<project_num_t<T>*>(mem), src);
+        }
+
         // store_complex
         template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
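Aside (not part of the diff): a hedged usage sketch of the ACLE primitives these overloads sit on, in plain ACLE rather than xsimd. svwhilelt builds exactly the kind of runtime predicate the new batch_bool overloads accept, and svld1/svst1 neither fault on nor write inactive lanes, which is what makes a fully predicated tail loop possible. Function name and loop are illustrative.

#include <arm_sve.h>
#include <cstdint>

// Scale n floats by 2 with a predicated tail: no scalar remainder loop.
void scale_by_two(float* dst, float const* src, int64_t n)
{
    for (int64_t i = 0; i < n; i += svcntw()) // svcntw(): 32-bit lanes per vector
    {
        svbool_t pg = svwhilelt_b32(i, n);        // active lanes: i + lane < n
        svfloat32_t v = svld1(pg, src + i);       // inactive lanes do not fault
        svst1(pg, dst + i, svmul_x(pg, v, 2.0f)); // inactive lanes not written
    }
}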
