 #include <algorithm>
 #include <array>
 #include <complex>
+#include <cstdint>
 
 #include "../../types/xsimd_batch_constant.hpp"
 #include "./xsimd_common_details.hpp"
@@ -374,6 +375,39 @@ namespace xsimd
             return batch<T_out, A>::load(buffer.data(), aligned_mode {});
         }
 
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A>
+        load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, aligned_mode, requires_arch<common>) noexcept
+        {
+            // Aligned mode contract: ``mem`` is aligned to ``A::alignment()``,
+            // and ``A::alignment() >= sizeof(batch<T, A>)`` for every
+            // common-fallback arch (SSE2 through SSE4.2, NEON, NEON64, VSX,
+            // S390x, WASM: all 16-byte alignment, 16-byte vectors). The whole
+            // vector therefore lives inside a single alignment unit (and a
+            // single page, since pages are >= alignment), so an unconditional
+            // load cannot fault on inactive lanes. Lower the masked load to
+            // ``select`` against a zero broadcast; this collapses to ~3 SIMD
+            // ops on every fallback arch.
+            return select(mask,
+                          batch<T, A>::load_aligned(mem),
+                          batch<T, A>(T(0)));
+        }
+
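The safety argument above leans on one invariant: the arch's alignment unit is at least as large as the batch, so an aligned batch can never cross an alignment (hence page) boundary. A minimal compile-time restatement of that contract, as a sketch; ``default_arch`` is simply whichever arch xsimd selects for the build, which on the fallback targets listed above satisfies the invariant:

    #include <xsimd/xsimd.hpp>

    // Sketch: restates the aligned-mode contract the kernel's comment
    // relies on. If this holds, load_aligned of a whole batch stays inside
    // one alignment unit and cannot fault on inactive lanes.
    static_assert(xsimd::default_arch::alignment()
                      >= sizeof(xsimd::batch<float, xsimd::default_arch>),
                  "a whole batch fits inside one alignment unit");
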
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A>
+        load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, unaligned_mode, requires_arch<common>) noexcept
+        {
+            // Unaligned + runtime mask: ``mem`` may straddle a page boundary
+            // whose neighbour is unmapped, so an unconditional whole-vector
+            // ``load_unaligned`` is unsafe. Stay scalar.
+            constexpr std::size_t size = batch<T, A>::size;
+            alignas(A::alignment()) std::array<T, size> buffer {};
+            const uint64_t bits = mask.mask();
+            for (std::size_t i = 0; i < size; ++i)
+                if ((bits >> i) & uint64_t(1))
+                    buffer[i] = mem[i];
+            return batch<T, A>::load_aligned(buffer.data());
+        }
+
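The scalar fallbacks here and in the unaligned store below rely on the bit convention of ``batch_bool::mask()``: lane ``i`` is packed into bit ``i`` of the returned integer. A small self-contained check of that convention, assuming only the public ``batch_bool`` API (load from a ``bool`` array, ``mask()``):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    int main()
    {
        using mask_type = xsimd::batch_bool<float>;
        constexpr std::size_t n = mask_type::size;
        bool flags[n] = {};
        for (std::size_t i = 0; i < n; i += 2)
            flags[i] = true; // activate even lanes only
        auto m = mask_type::load_unaligned(flags);
        // mask() packs lane i into bit i, which is exactly what the
        // scalar loops test with (bits >> i) & 1.
        std::uint64_t expected = 0;
        for (std::size_t i = 0; i < n; i += 2)
            expected |= std::uint64_t(1) << i;
        assert(m.mask() == expected);
    }
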
         template <class A, class T_in, class T_out, bool... Values, class alignment>
         XSIMD_INLINE void
         store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
@@ -388,6 +422,33 @@ namespace xsimd
             }
         }
 
+        template <class A, class T>
+        XSIMD_INLINE void
+        store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, aligned_mode, requires_arch<common>) noexcept
+        {
+            // Symmetric to load_masked: aligned ``mem`` cannot fault for any
+            // lane in the batch, so a read-modify-write through ``select`` is
+            // safe and collapses to load + select + store on every fallback
+            // arch.
+            const auto current = batch<T, A>::load_aligned(mem);
+            select(mask, src, current).store_aligned(mem);
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE void
+        store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, unaligned_mode, requires_arch<common>) noexcept
+        {
+            // Symmetric to the unaligned load: an unaligned RMW could fault
+            // on a page boundary, so stay scalar.
+            constexpr std::size_t size = batch<T, A>::size;
+            alignas(A::alignment()) std::array<T, size> src_buf;
+            src.store_aligned(src_buf.data());
+            const uint64_t bits = mask.mask();
+            for (std::size_t i = 0; i < size; ++i)
+                if ((bits >> i) & uint64_t(1))
+                    mem[i] = src_buf[i];
+        }
+
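Taken together, the four kernels give common-arch targets a complete masked load/store story for ragged tails. A usage sketch, assuming public entry points named ``xsimd::load_masked`` and ``xsimd::store_masked`` that dispatch to the kernels above; those wrapper names and signatures are an assumption, since only the common-arch kernels appear in this excerpt:

    #include <cstddef>
    #include <xsimd/xsimd.hpp>

    // Doubles n floats, finishing the ragged tail with one masked
    // load/store pair instead of a scalar epilogue.
    void double_all(float const* in, float* out, std::size_t n)
    {
        using batch = xsimd::batch<float>;
        constexpr std::size_t width = batch::size;
        std::size_t i = 0;
        for (; i + width <= n; i += width)
            (batch::load_unaligned(in + i) * 2.0f).store_unaligned(out + i);
        if (i < n)
        {
            bool flags[width] = {}; // true for the n - i live lanes
            for (std::size_t k = 0; k < n - i; ++k)
                flags[k] = true;
            auto mask = xsimd::batch_bool<float>::load_unaligned(flags);
            // Hypothetical wrappers over the kernels above: inactive lanes
            // read as zero on load and stay untouched in memory on store.
            auto v = xsimd::load_masked(in + i, mask, xsimd::unaligned_mode {});
            xsimd::store_masked(out + i, v * 2.0f, mask, xsimd::unaligned_mode {});
        }
    }
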
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
         {