17 changes: 15 additions & 2 deletions docs/source/api/data_transfer.rst
@@ -12,7 +12,7 @@ Data Transfers
From memory:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load` | load values from memory (optionally masked) |
| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_aligned` | load values from aligned memory |
+---------------------------------------+----------------------------------------------------+
@@ -32,7 +32,7 @@ From a scalar:
To memory:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store` | store values to memory (optionally masked) |
| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_aligned` | store values to aligned memory |
+---------------------------------------+----------------------------------------------------+
@@ -84,3 +84,16 @@ The following empty types are used for tag dispatching:

.. doxygenstruct:: xsimd::unaligned_mode
:project: xsimd

.. rubric:: Footnotes

.. [#m] Masked ``load`` / ``store`` come in two flavours. The
:cpp:class:`batch_bool_constant` overload encodes the mask in the type, is
resolved at compile time and is always efficient. The runtime
:cpp:class:`batch_bool` overload, by contrast, has no native masked load/store
instruction to map to on SSE2 through SSE4.2, NEON/NEON64, VSX, S390x, and
WASM; there it falls back to a load-plus-``select`` sequence for aligned
access and to a per-lane scalar loop for unaligned access.
AVX, AVX2, AVX-512, SVE and RVV use native masked instructions and pay no
such penalty. Prefer the compile-time mask whenever the selection is known
at compile time, and avoid runtime-mask loads/stores in hot inner loops on
the affected architectures.
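
A minimal usage sketch of the two flavours described above. Hedged: ``batch_bool_constant``, ``batch_bool`` and the mode tags are taken from this diff, but the mask-accepting ``load`` / ``store`` member overloads are an assumption, since only the internal ``load_masked`` / ``store_masked`` kernels appear here:

    #include <xsimd/xsimd.hpp>

    // Hedged sketch: the mask-accepting load()/store() overloads below follow the
    // table in the docs, but their exact public signatures are an assumption.
    void masked_head_copy(const float* src, float* dst)
    {
        using arch = xsimd::avx2;               // assumption: 8 float lanes
        using b    = xsimd::batch<float, arch>;

        // Compile-time mask: the selection lives in the type and is resolved at
        // compile time.
        constexpr auto cmask = xsimd::batch_bool_constant<float, arch,
            true, true, true, true, false, false, false, false> {};
        const b head = b::load(src, cmask, xsimd::unaligned_mode {});  // assumed overload

        // Runtime mask: native on AVX2 here; select-based or scalar fallback on
        // SSE2-SSE4.2, NEON/NEON64, VSX, S390x and WASM (see the footnote).
        const xsimd::batch_bool<float, arch> rmask(
            true, true, true, true, false, false, false, false);       // usually computed at run time
        head.store(dst, rmask, xsimd::unaligned_mode {});              // assumed overload
    }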
61 changes: 61 additions & 0 deletions include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -15,6 +15,7 @@
#include <algorithm>
#include <array>
#include <complex>
#include <cstdint>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_common_details.hpp"
@@ -374,6 +375,39 @@ namespace xsimd
return batch<T_out, A>::load(buffer.data(), aligned_mode {});
}

template <class A, class T>
XSIMD_INLINE batch<T, A>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, aligned_mode, requires_arch<common>) noexcept
{
// Aligned mode contract: ``mem`` is aligned to ``A::alignment()``,
// and ``A::alignment() >= sizeof(batch<T, A>)`` for every common-
// fallback arch (SSE2-SSE4.2, NEON, NEON64, VSX, S390x, WASM — all
// 16-byte aligned, 16-byte vectors). The whole vector therefore
// lives inside a single alignment unit (and a single page, since
// pages are >= alignment), so an unconditional load cannot fault
// on inactive lanes. Lower the masked load to ``select`` against a
// zero broadcast — collapses to ~3 SIMD ops on every fallback arch.
return select(mask,
batch<T, A>::load_aligned(mem),
batch<T, A>(T(0)));
}

template <class A, class T>
XSIMD_INLINE batch<T, A>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, unaligned_mode, requires_arch<common>) noexcept
{
// Unaligned + runtime mask: ``mem`` may straddle a page boundary
// whose neighbour is unmapped, so an unconditional whole-vector
// ``load_unaligned`` is unsafe. Stay scalar.
constexpr std::size_t size = batch<T, A>::size;
alignas(A::alignment()) std::array<T, size> buffer {};
const uint64_t bits = mask.mask();
for (std::size_t i = 0; i < size; ++i)
if ((bits >> i) & uint64_t(1))
buffer[i] = mem[i];
return batch<T, A>::load_aligned(buffer.data());
}

template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
@@ -388,6 +422,33 @@ namespace xsimd
}
}

template <class A, class T>
XSIMD_INLINE void
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, aligned_mode, requires_arch<common>) noexcept
{
// Symmetric to load_masked: aligned ``mem`` cannot fault for any
// lane in the batch, so a read-modify-write through ``select`` is
// safe and collapses to load + select + store on every fallback
// arch.
const auto current = batch<T, A>::load_aligned(mem);
select(mask, src, current).store_aligned(mem);
}

template <class A, class T>
XSIMD_INLINE void
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, unaligned_mode, requires_arch<common>) noexcept
{
// Symmetric to the unaligned load: unaligned RMW could fault on a
// page boundary, so stay scalar.
constexpr std::size_t size = batch<T, A>::size;
alignas(A::alignment()) std::array<T, size> src_buf;
src.store_aligned(src_buf.data());
const uint64_t bits = mask.mask();
for (std::size_t i = 0; i < size; ++i)
if ((bits >> i) & uint64_t(1))
mem[i] = src_buf[i];
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
{
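
The aligned-mode comments in the common fallback lean on a single address computation; a standalone sketch (plain C++, not part of the patch) of that invariant:

    #include <cassert>
    #include <cstdint>

    // Sketch of the aligned-mode fault argument: if `mem` is aligned to
    // `alignment` and the vector is no wider than `alignment`, the last byte of
    // the vector lies in the same alignment unit (hence the same page) as the
    // first, so an unconditional whole-vector load cannot fault.
    bool whole_vector_in_one_unit(const void* mem, std::size_t alignment, std::size_t vec_bytes)
    {
        const auto addr = reinterpret_cast<std::uintptr_t>(mem);
        assert(addr % alignment == 0);    // aligned-mode contract
        assert(vec_bytes <= alignment);   // holds for every common-fallback arch (16-byte / 16-byte)
        return (addr / alignment) == ((addr + vec_bytes - 1) / alignment); // always true
    }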
33 changes: 33 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1015,6 +1015,23 @@ namespace xsimd
}
}

// Runtime-mask load for float/double on AVX. Both aligned_mode and
// unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault
// on masked-off lanes, so partial loads across page boundaries are safe.
template <class A, class Mode>
XSIMD_INLINE batch<float, A>
load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx>) noexcept
{
return _mm256_maskload_ps(mem, _mm256_castps_si256(mask));
}

template <class A, class Mode>
XSIMD_INLINE batch<double, A>
load_masked(double const* mem, batch_bool<double, A> mask, convert<double>, Mode, requires_arch<avx>) noexcept
{
return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask));
}

// store_masked
namespace detail
{
@@ -1031,6 +1048,22 @@
}
}

// Runtime-mask store for float/double on AVX. Same fault-suppression
// semantics as the masked loads above; alignment mode is irrelevant.
template <class A, class Mode>
XSIMD_INLINE void
store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx>) noexcept
{
_mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src);
}

template <class A, class Mode>
XSIMD_INLINE void
store_masked(double* mem, batch<double, A> const& src, batch_bool<double, A> mask, Mode, requires_arch<avx>) noexcept
{
_mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src);
}

template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
{
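
For reference, a standalone intrinsics sketch (not part of the patch) of the fault-suppression behaviour the AVX comments rely on: `_mm256_maskload_ps` keys each lane off the most significant bit of the corresponding 32-bit mask element and performs no access for masked-off lanes:

    #include <immintrin.h>

    // Load only the first four of eight float lanes; the upper four lanes are
    // never touched, so a partial load at the very end of an array cannot fault
    // even if the following page is unmapped.
    __m256 load_first_four(const float* p)
    {
        // MSB set for lanes 0..3, clear for lanes 4..7.
        const __m256i mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
        return _mm256_maskload_ps(p, mask);
    }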
33 changes: 28 additions & 5 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -119,7 +119,6 @@ namespace xsimd
}

// load_masked
// AVX2 low-level helpers (operate on raw SIMD registers)
namespace detail
{
XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept
@@ -138,14 +137,12 @@
}
}

// single templated implementation for integer masked loads (32/64-bit)
template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
{
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
// Use the raw register-level maskload helpers for the remaining cases.
return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
}

@@ -175,6 +172,20 @@
return bitwise_cast<uint64_t>(r);
}

// Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers
// fall back to the scalar common path: AVX2 has no native maskload for
// those widths, and a load-then-blend would break fault-suppression at
// page boundaries (the main reason callers ask for a masked load).
// Both aligned_mode and unaligned_mode route to the same intrinsic —
// masked-off lanes do not fault regardless of alignment.
template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
{
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
return detail::maskload(reinterpret_cast<const int_t*>(mem), __m256i(mask));
}

// store_masked
namespace detail
{
@@ -196,14 +207,12 @@
{
constexpr size_t lanes_per_half = batch<T, A>::size / 2;

// confined to lower 128-bit half → forward to SSE
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
}
// confined to upper 128-bit half → forward to SSE
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
@@ -230,6 +239,20 @@
store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
}

template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
_mm256_maskstore_epi32(reinterpret_cast<int*>(mem), __m256i(mask), __m256i(src));
}
else
{
_mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), __m256i(mask), __m256i(src));
}
}

// load_stream
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
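
A standalone AVX2 sketch (not part of the patch) of how a caller typically builds the runtime mask these integer kernels consume, here for the first `n` of 8 int32 lanes:

    #include <immintrin.h>

    // Build a lane mask with the MSB set for lanes 0..n-1 and feed it to the
    // maskload/maskstore pair the kernels above wrap; lanes with the MSB clear
    // are neither read nor written.
    void copy_first_n_i32(const int* src, int* dst, int n) // expects 0 <= n <= 8
    {
        const __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        const __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), lane); // lane < n
        const __m256i v    = _mm256_maskload_epi32(src, mask);
        _mm256_maskstore_epi32(dst, mask, v);
    }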
8 changes: 8 additions & 0 deletions include/xsimd/arch/xsimd_common_fwd.hpp
@@ -79,8 +79,16 @@ namespace xsimd
XSIMD_INLINE batch<T, A> load(T const* mem, unaligned_mode, requires_arch<A>) noexcept;
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE batch<T_out, A> load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...> mask, convert<T_out>, alignment, requires_arch<common>) noexcept;
template <class A, class T>
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, aligned_mode, requires_arch<common>) noexcept;
template <class A, class T>
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, unaligned_mode, requires_arch<common>) noexcept;
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...> mask, alignment, requires_arch<common>) noexcept;
template <class A, class T>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, aligned_mode, requires_arch<common>) noexcept;
template <class A, class T>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, unaligned_mode, requires_arch<common>) noexcept;
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<A>) noexcept;
template <class A, bool... Values, class Mode>
24 changes: 24 additions & 0 deletions include/xsimd/arch/xsimd_rvv.hpp
@@ -409,6 +409,11 @@
{
XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*))
XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec))
// Masked load (mask-undisturbed with zero passthrough): inactive lanes read as 0,
// no memory access is performed for inactive lanes (page-fault safe).
XSIMD_RVV_OVERLOAD(rvvle_mu, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM _mu), , vec(bvec, vec, T const*))
// Masked store: inactive lanes are not written.
XSIMD_RVV_OVERLOAD(rvvse_m, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM _m), , void(bvec, T*, vec))
}

template <class A, class T, detail::enable_arithmetic_t<T> = 0>
@@ -423,6 +428,16 @@
return load_aligned<A>(src, convert<T>(), rvv {});
}

// load_masked (runtime mask): native vle*.v vd, (rs1), v0.t with zero-init
// passthrough so inactive lanes read as 0, matching xsimd's contract.
template <class A, class T, class Mode, detail::enable_arithmetic_t<T> = 0>
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<rvv>) noexcept
{
using proj_t = project_num_t<T>;
const auto zero = detail_rvv::rvvmv_splat(proj_t {});
return detail_rvv::rvvle_mu(mask, zero, reinterpret_cast<proj_t const*>(mem));
}

// load_complex
namespace detail_rvv
{
@@ -500,6 +515,15 @@
store_aligned<A>(dst, src, rvv {});
}

// store_masked (runtime mask): native vse*.v vd, (rs1), v0.t — inactive lanes
// are not written (page-fault safe).
template <class A, class T, class Mode, detail::enable_arithmetic_t<T> = 0>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<rvv>) noexcept
{
using proj_t = project_num_t<T>;
detail_rvv::rvvse_m(mask, reinterpret_cast<proj_t*>(mem), src);
}

/******************
* scatter/gather *
******************/
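
A standalone RVV sketch (not part of the patch; the intrinsic names follow the riscv-vector intrinsics and are assumptions as far as this diff goes) of the mask-undisturbed load with zero passthrough and the masked store that the rvvle_mu / rvvse_m wrappers expand to:

    #include <riscv_vector.h>

    // Masked copy of the first n float lanes (assumes n <= VLMAX). Inactive lanes
    // take the zero passthrough on load and are never accessed or written, which
    // is what makes the pattern page-fault safe.
    void copy_first_n_f32(const float* src, float* dst, size_t n)
    {
        const size_t vl = __riscv_vsetvlmax_e32m1();                       // full vector
        const vbool32_t m = __riscv_vmsltu_vx_u32m1_b32(
            __riscv_vid_v_u32m1(vl), (uint32_t)n, vl);                     // lane index < n
        const vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0.0f, vl);
        const vfloat32m1_t v = __riscv_vle32_v_f32m1_mu(m, zero, src, vl); // inactive lanes = 0
        __riscv_vse32_v_f32m1_m(m, dst, v, vl);                            // inactive lanes untouched
    }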
25 changes: 23 additions & 2 deletions include/xsimd/arch/xsimd_sve.hpp
@@ -101,13 +101,20 @@ namespace xsimd
return load_aligned<A>(src, convert<T>(), sve {});
}

// load_masked
// load_masked (compile-time mask)
template <class A, class T, bool... Values, class Mode, detail::enable_arithmetic_t<T> = 0>
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<float, A, Values...>, Mode, requires_arch<sve>) noexcept
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...>, convert<T>, Mode, requires_arch<sve>) noexcept
{
return svld1(detail_sve::pmask<Values...>(), reinterpret_cast<map_to_sized_type_t<T> const*>(mem));
}

// load_masked (runtime mask)
template <class A, class T, class Mode, detail::enable_arithmetic_t<T> = 0>
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<sve>) noexcept
{
return svld1(mask, reinterpret_cast<project_num_t<T> const*>(mem));
}

// load_complex
template <class A, class T, detail::enable_floating_point_t<T> = 0>
XSIMD_INLINE batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
@@ -141,6 +148,20 @@
store_aligned<A>(dst, src, sve {});
}

// store_masked (compile-time mask)
template <class A, class T, bool... Values, class Mode, detail::enable_arithmetic_t<T> = 0>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...>, Mode, requires_arch<sve>) noexcept
{
svst1(detail_sve::pmask<Values...>(), reinterpret_cast<project_num_t<T>*>(mem), src);
}

// store_masked (runtime mask)
template <class A, class T, class Mode, detail::enable_arithmetic_t<T> = 0>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<sve>) noexcept
{
svst1(mask, reinterpret_cast<project_num_t<T>*>(mem), src);
}

// store_complex
template <class A, class T, detail::enable_floating_point_t<T> = 0>
XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
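
A standalone SVE/ACLE sketch (not part of the patch) of the pattern the runtime-mask kernels above enable: build the governing predicate with a while-less-than comparison and let svld1/svst1 skip inactive lanes, which is how SVE loops handle tails without scalar code:

    #include <arm_sve.h>

    // Copy the first n float lanes under a predicate (requires an SVE toolchain).
    void copy_first_n_f32(const float* src, float* dst, int64_t n)
    {
        const svbool_t pg   = svwhilelt_b32(int64_t(0), n); // lanes 0..n-1 active
        const svfloat32_t v = svld1(pg, src);                // inactive lanes are not read
        svst1(pg, dst, v);                                   // inactive lanes are not written
    }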