
Commit e227346

feat: add load_head / load_tail / store_head / store_tail APIs
Sugar over the runtime-mask load/store for loop head/tail remainders. These take ``n`` directly instead of a constructed ``batch_bool``; only ``mem[0, n)`` is touched. ``head`` uses the mask ``(1 << n) - 1``; ``tail`` uses ``((1 << n) - 1) << (size - n)`` together with a base-pointer offset (computed via ``uintptr_t`` to dodge ``-Warray-bounds``), so every arch with a native predicated load/store inherits its intrinsic for free. Tested natively on sse2/sse41/avx2/avx512f/emulated256, and on neon64/rvv under qemu.
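For orientation, a minimal usage sketch (hypothetical caller code, not part of this commit; it assumes only the ``load_head`` / ``store_head`` free functions added below and the ``<xsimd/xsimd.hpp>`` umbrella header):

#include <cstddef>
#include <xsimd/xsimd.hpp>

// Scale len floats in place; the scalar remainder loop disappears.
void scale(float* data, std::size_t len, float k)
{
    using batch = xsimd::batch<float>;
    std::size_t i = 0;
    for (; i + batch::size <= len; i += batch::size)
        (batch::load_unaligned(data + i) * k).store_unaligned(data + i);

    // Remainder: only data[i, len) is read and written, even when
    // len - i == 0 (then both calls are no-ops). No overrun, no epilogue.
    const std::size_t rem = len - i;
    auto v = xsimd::load_head<float>(data + i, rem, xsimd::unaligned_mode {});
    xsimd::store_head(data + i, rem, v * k, xsimd::unaligned_mode {});
}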

6 files changed: 446 additions & 12 deletions


docs/source/api/data_transfer.rst

Lines changed: 14 additions & 0 deletions
@@ -20,6 +20,10 @@ From memory:
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`load_as`                   | load values, forcing a type conversion             |
 +---------------------------------------+----------------------------------------------------+
+| :cpp:func:`load_head`                 | load the first ``n`` contiguous elements [#h]_     |
++---------------------------------------+----------------------------------------------------+
+| :cpp:func:`load_tail`                 | load the last ``n`` contiguous elements [#h]_      |
++---------------------------------------+----------------------------------------------------+

 From a scalar:

@@ -40,6 +44,10 @@ To memory:
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`store_as`                  | store values, forcing a type conversion            |
 +---------------------------------------+----------------------------------------------------+
+| :cpp:func:`store_head`                | store the first ``n`` contiguous elements [#h]_    |
++---------------------------------------+----------------------------------------------------+
+| :cpp:func:`store_tail`                | store the last ``n`` contiguous elements [#h]_     |
++---------------------------------------+----------------------------------------------------+

 In place:

@@ -97,3 +105,9 @@ The following empty types are used for tag dispatching:
    such penalty. Prefer the compile-time mask whenever the selection is known
    at compile time, and avoid runtime-mask loads/stores in hot inner loops on
    the affected architectures.
+
+.. [#h] ``load_head`` / ``store_head`` / ``load_tail`` / ``store_tail``
+   take a runtime element count ``n`` instead of a constructed mask;
+   they are sugar for the runtime-mask ``load`` / ``store`` with a
+   contiguous-prefix or contiguous-suffix mask, and inherit its
+   contract and per-arch codegen.
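To make the footnote concrete, this is roughly what the sugar expands to (an illustrative sketch only; it assumes ``0 < n < batch::size``, the range where no clamping applies, and uses the ``batch_bool::from_mask`` and masked ``load`` entry points that appear elsewhere in this commit):

#include <cstdint>
#include <xsimd/xsimd.hpp>

template <class T, class A = xsimd::default_arch>
xsimd::batch<T, A> load_head_desugared(T const* mem, std::size_t n)
{
    // Contiguous-prefix mask: bits [0, n) set, i.e. (1 << n) - 1.
    // Assumes 0 < n < batch size, so the shift is well defined.
    const auto mask = xsimd::batch_bool<T, A>::from_mask((std::uint64_t(1) << n) - 1);
    return xsimd::batch<T, A>::load(mem, mask, xsimd::unaligned_mode {});
}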

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 56 additions & 12 deletions
@@ -379,12 +379,8 @@ namespace xsimd
         XSIMD_INLINE batch<T, A>
         load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<common>) noexcept
         {
-            // Per-lane validity contract: only active lanes of ``mem`` are
-            // required to be addressable. An unconditional whole-vector load
-            // would touch inactive lanes and trip ASan/Valgrind on partial
-            // buffers, so stay scalar. Arches with hardware predicated loads
-            // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single
-            // intrinsic that suppresses inactive-lane reads in hardware.
+            // Per-lane validity contract: only active lanes are read.
+            // Arches with hardware predicated loads override this.
             constexpr std::size_t size = batch<T, A>::size;
             alignas(A::alignment()) std::array<T, size> buffer {};
             const uint64_t bits = mask.mask();
@@ -412,12 +408,8 @@ namespace xsimd
         XSIMD_INLINE void
         store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<common>) noexcept
         {
-            // Per-lane validity contract (matches native masked-store APIs):
-            // only active lanes of ``mem`` are touched. A load+select+store
-            // RMW would both read and write inactive bytes, breaking that
-            // contract — stay scalar. Arches with hardware predicated stores
-            // override this with a single intrinsic that suppresses inactive
-            // lanes in hardware.
+            // Per-lane validity contract: only active lanes are written.
+            // Arches with hardware predicated stores override this.
             constexpr std::size_t size = batch<T, A>::size;
             alignas(A::alignment()) std::array<T, size> src_buf;
             src.store_aligned(src_buf.data());
@@ -427,6 +419,58 @@ namespace xsimd
                     mem[i] = src_buf[i];
         }

+        // Head/tail forward to the runtime-mask path. ``tail`` offsets
+        // the base pointer back by ``(size - n)`` so the active high-``n``
+        // lanes land at ``[mem, mem + n)``; the offset goes through
+        // ``uintptr_t`` to dodge ``-Warray-bounds`` on small buffers.
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE T const* offset_back(T const* p, std::size_t k) noexcept
+            {
+                return reinterpret_cast<T const*>(reinterpret_cast<std::uintptr_t>(p) - k * sizeof(T));
+            }
+            template <class T>
+            XSIMD_INLINE T* offset_back(T* p, std::size_t k) noexcept
+            {
+                return reinterpret_cast<T*>(reinterpret_cast<std::uintptr_t>(p) - k * sizeof(T));
+            }
+        }
+
+        template <class A, class T, class Mode>
+        XSIMD_INLINE batch<T, A>
+        load_head(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept
+        {
+            const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n));
+            return load_masked<A>(mem, mask, convert<T> {}, unaligned_mode {}, A {});
+        }
+
+        template <class A, class T, class Mode>
+        XSIMD_INLINE void
+        store_head(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept
+        {
+            const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n));
+            store_masked<A>(mem, src, mask, unaligned_mode {}, A {});
+        }
+
+        template <class A, class T, class Mode>
+        XSIMD_INLINE batch<T, A>
+        load_tail(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept
+        {
+            constexpr std::size_t size = batch<T, A>::size;
+            const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n) << (size - n));
+            return load_masked<A>(detail::offset_back(mem, size - n), mask, convert<T> {}, unaligned_mode {}, A {});
+        }
+
+        template <class A, class T, class Mode>
+        XSIMD_INLINE void
+        store_tail(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept
+        {
+            constexpr std::size_t size = batch<T, A>::size;
+            const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n) << (size - n));
+            store_masked<A>(detail::offset_back(mem, size - n), src, mask, unaligned_mode {}, A {});
+        }
+
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
         {
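A worked instance of the tail arithmetic above, assuming an 8-lane batch purely for illustration (lane counts are arch-dependent): for ``n = 3`` the mask activates lanes 5..7, and stepping the base back by ``size - n = 5`` lanes makes those lanes address exactly ``mem[0, 3)``.

#include <cassert>
#include <cstddef>
#include <cstdint>

int main()
{
    constexpr std::size_t size = 8, n = 3; // illustrative 8-lane batch
    const std::uint64_t mask = ((std::uint64_t(1) << n) - 1) << (size - n);
    assert(mask == 0xE0); // 0b11100000: lanes 5, 6, 7 active

    float buf[n] = { 1.0f, 2.0f, 3.0f };
    // offset_back(buf, size - n) spelled out: the subtraction happens in
    // integer space, as in the kernel above, so the compiler never sees an
    // out-of-bounds pointer expression and -Warray-bounds stays quiet.
    const float* base = reinterpret_cast<const float*>(
        reinterpret_cast<std::uintptr_t>(buf) - (size - n) * sizeof(float));
    // Active lane i of the masked load reads base[i]; for i = 5, 6, 7 that
    // is buf[0], buf[1], buf[2]: exactly the n valid elements.
    assert(&base[size - n] == &buf[0]);
    return 0;
}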

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 10 additions & 0 deletions
@@ -102,6 +102,16 @@ namespace xsimd
        template <class A, bool... Values, class Mode>
        XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value> store_masked(uint64_t*, batch<uint64_t, A> const&, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept;

+        // Head/tail: contiguous prefix/suffix variants of the masked load/store.
+        template <class A, class T, class Mode>
+        XSIMD_INLINE batch<T, A> load_head(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept;
+        template <class A, class T, class Mode>
+        XSIMD_INLINE batch<T, A> load_tail(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept;
+        template <class A, class T, class Mode>
+        XSIMD_INLINE void store_head(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept;
+        template <class A, class T, class Mode>
+        XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept;
+
        // Forward declarations for pack-level helpers
        namespace detail
        {

include/xsimd/types/xsimd_api.hpp

Lines changed: 89 additions & 0 deletions
@@ -1631,6 +1631,51 @@ namespace xsimd
         return batch<T, A>::load(ptr, mask, unaligned_mode {});
     }

+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Loads the prefix \c mem[0, n) into the low \c n lanes; remaining
+     * lanes are zero. Sugar for a runtime-mask load with mask
+     * <tt>(1 << n) - 1</tt>; same contract — only \c mem[0, n) is read.
+     * \c n is clamped to \c batch::size.
+     */
+    template <class T, class A = default_arch>
+    XSIMD_INLINE batch<T, A> load_head(T const* mem, std::size_t n, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::load_head(mem, n, aligned_mode {});
+    }
+
+    /// \overload
+    template <class T, class A = default_arch>
+    XSIMD_INLINE batch<T, A> load_head(T const* mem, std::size_t n, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::load_head(mem, n, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Loads \c mem[0, n) into the high \c n lanes (lanes
+     * <tt>[size - n, size)</tt>); remaining low lanes are zero. Same
+     * contract as \ref load_head. \c n is clamped to \c batch::size.
+     */
+    template <class T, class A = default_arch>
+    XSIMD_INLINE batch<T, A> load_tail(T const* mem, std::size_t n, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::load_tail(mem, n, aligned_mode {});
+    }
+
+    /// \overload
+    template <class T, class A = default_arch>
+    XSIMD_INLINE batch<T, A> load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::load_tail(mem, n, unaligned_mode {});
+    }
+
     /**
      * @ingroup batch_data_transfer
      *
@@ -2807,6 +2852,50 @@ namespace xsimd
         val.store(mem, mask, unaligned_mode {});
     }

+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Stores the low \c n lanes of \c val to \c mem[0, n). Sugar for a
+     * runtime-mask store with mask <tt>(1 << n) - 1</tt>; same contract —
+     * only \c mem[0, n) is written. \c n is clamped to \c batch::size.
+     */
+    template <class T, class A>
+    XSIMD_INLINE void store_head(T* mem, std::size_t n, batch<T, A> const& val, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        val.store_head(mem, n, aligned_mode {});
+    }
+
+    /// \overload
+    template <class T, class A>
+    XSIMD_INLINE void store_head(T* mem, std::size_t n, batch<T, A> const& val, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        val.store_head(mem, n, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Stores the high \c n lanes (lanes <tt>[size - n, size)</tt>) of
+     * \c val to \c mem[0, n). Same contract as \ref store_head. \c n is
+     * clamped to \c batch::size.
+     */
+    template <class T, class A>
+    XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch<T, A> const& val, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        val.store_tail(mem, n, aligned_mode {});
+    }
+
+    /// \overload
+    template <class T, class A>
+    XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch<T, A> const& val, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        val.store_tail(mem, n, unaligned_mode {});
+    }
+
     /**
      * @ingroup batch_data_transfer
      *
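A short sketch of the lane placement documented above (hypothetical caller code): ``load_tail`` reads ``mem[0, n)`` but parks it in the high lanes, mirroring the suffix mask, so a ``load_tail`` / ``store_tail`` pair round-trips a partial buffer:

#include <cstddef>
#include <xsimd/xsimd.hpp>

void copy_tail(const float* src, float* dst, std::size_t n)
{
    // Reads src[0, n) into lanes [size - n, size); the low lanes are zero.
    auto v = xsimd::load_tail<float>(src, n, xsimd::unaligned_mode {});
    // Writes lanes [size - n, size) to dst[0, n); dst beyond n is untouched.
    xsimd::store_tail(dst, n, v, xsimd::unaligned_mode {});
}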

include/xsimd/types/xsimd_batch.hpp

Lines changed: 112 additions & 0 deletions
@@ -151,6 +151,12 @@ namespace xsimd
         template <class Mode = aligned_mode>
         XSIMD_INLINE void store(T* mem, batch_bool<T, A> mask, Mode = {}) const noexcept;

+        // Head/tail: contiguous prefix/suffix variants of the runtime-mask store.
+        XSIMD_INLINE void store_head(T* mem, std::size_t n, aligned_mode) const noexcept;
+        XSIMD_INLINE void store_head(T* mem, std::size_t n, unaligned_mode) const noexcept;
+        XSIMD_INLINE void store_tail(T* mem, std::size_t n, aligned_mode) const noexcept;
+        XSIMD_INLINE void store_tail(T* mem, std::size_t n, unaligned_mode) const noexcept;
+
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept;
         template <class U>
@@ -168,6 +174,12 @@ namespace xsimd
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept;

+        // Head/tail: contiguous prefix/suffix variants of the runtime-mask load.
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load_head(T const* mem, std::size_t n, aligned_mode) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load_head(T const* mem, std::size_t n, unaligned_mode) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, aligned_mode) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept;
+
         template <class U, class V>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch<V, arch_type> const& index) noexcept;
         template <class U, class V>
@@ -794,6 +806,106 @@ namespace xsimd
         kernel::store_masked<A>(mem, *this, mask, mode, A {});
     }

+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load_head(T const* mem, std::size_t n, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return broadcast<T>(0);
+        if (n >= size)
+            return load_aligned(mem);
+        return kernel::load_head<A>(mem, n, aligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load_head(T const* mem, std::size_t n, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return broadcast<T>(0);
+        if (n >= size)
+            return load_unaligned(mem);
+        return kernel::load_head<A>(mem, n, unaligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load_tail(T const* mem, std::size_t n, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return broadcast<T>(0);
+        if (n >= size)
+            return load_aligned(mem);
+        return kernel::load_tail<A>(mem, n, aligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return broadcast<T>(0);
+        if (n >= size)
+            return load_unaligned(mem);
+        return kernel::load_tail<A>(mem, n, unaligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE void batch<T, A>::store_head(T* mem, std::size_t n, aligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return;
+        if (n >= size)
+        {
+            store_aligned(mem);
+            return;
+        }
+        kernel::store_head<A>(mem, n, *this, aligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE void batch<T, A>::store_head(T* mem, std::size_t n, unaligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return;
+        if (n >= size)
+        {
+            store_unaligned(mem);
+            return;
+        }
+        kernel::store_head<A>(mem, n, *this, unaligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE void batch<T, A>::store_tail(T* mem, std::size_t n, aligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return;
+        if (n >= size)
+        {
+            store_aligned(mem);
+            return;
+        }
+        kernel::store_tail<A>(mem, n, *this, aligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE void batch<T, A>::store_tail(T* mem, std::size_t n, unaligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return;
+        if (n >= size)
+        {
+            store_unaligned(mem);
+            return;
+        }
+        kernel::store_tail<A>(mem, n, *this, unaligned_mode {}, A {});
+    }
+
     template <class T, class A>
     template <class U>
     XSIMD_INLINE batch<T, A> batch<T, A>::load(U const* mem, stream_mode) noexcept
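The ``n == 0`` and ``n >= size`` branches above make both degenerate counts well defined, so callers need no pre-checks; a small sketch (hypothetical caller code):

#include <xsimd/xsimd.hpp>

void edge_cases(const float* mem)
{
    using batch = xsimd::batch<float>;
    // n == 0: no memory is touched and the result is a zero batch.
    auto a = batch::load_head(mem, 0, xsimd::unaligned_mode {});
    // n >= size: clamps to a plain full-width load, so in this case
    // mem[0, size) must all be addressable.
    auto b = batch::load_head(mem, batch::size + 7, xsimd::unaligned_mode {});
    (void)a;
    (void)b;
}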
