
Commit e227346

feat: add load_head / load_tail / store_head / store_tail APIs
Sugar over the runtime-mask load/store for loop head/tail remainders. These take ``n`` directly instead of a constructed ``batch_bool``; only ``mem[0, n)`` is touched. ``head`` uses the mask ``(1 << n) - 1``; ``tail`` uses ``((1 << n) - 1) << (size - n)`` together with a base-pointer offset (computed via ``uintptr_t`` to dodge ``-Warray-bounds``), so every arch with a native predicated load/store inherits its intrinsic for free. Tested natively on sse2/sse41/avx2/avx512f/emulated256, and on neon64/rvv under qemu.
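For orientation, a minimal usage sketch (hypothetical caller code, not part of this commit; it assumes only the ``load_head`` / ``store_head`` free functions added below and the ``<xsimd/xsimd.hpp>`` umbrella header):

#include <cstddef>
#include <xsimd/xsimd.hpp>

// Scale len floats in place; the scalar remainder loop disappears.
void scale(float* data, std::size_t len, float k)
{
    using batch = xsimd::batch<float>;
    std::size_t i = 0;
    for (; i + batch::size <= len; i += batch::size)
        (batch::load_unaligned(data + i) * k).store_unaligned(data + i);

    // Remainder: only data[i, len) is read and written, even when
    // len - i == 0 (then both calls are no-ops). No overrun, no epilogue.
    const std::size_t rem = len - i;
    auto v = xsimd::load_head<float>(data + i, rem, xsimd::unaligned_mode {});
    xsimd::store_head(data + i, rem, v * k, xsimd::unaligned_mode {});
}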

6 files changed: 446 additions & 12 deletions


docs/source/api/data_transfer.rst

Lines changed: 14 additions & 0 deletions
@@ -20,6 +20,10 @@ From memory:
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`load_as`                   | load values, forcing a type conversion             |
 +---------------------------------------+----------------------------------------------------+
+| :cpp:func:`load_head`                 | load the first ``n`` contiguous elements [#h]_     |
++---------------------------------------+----------------------------------------------------+
+| :cpp:func:`load_tail`                 | load the last ``n`` contiguous elements [#h]_      |
++---------------------------------------+----------------------------------------------------+

 From a scalar:

@@ -40,6 +44,10 @@ To memory:
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`store_as`                  | store values, forcing a type conversion            |
 +---------------------------------------+----------------------------------------------------+
+| :cpp:func:`store_head`                | store the first ``n`` contiguous elements [#h]_    |
++---------------------------------------+----------------------------------------------------+
+| :cpp:func:`store_tail`                | store the last ``n`` contiguous elements [#h]_     |
++---------------------------------------+----------------------------------------------------+

 In place:

@@ -97,3 +105,9 @@ The following empty types are used for tag dispatching:
    such penalty. Prefer the compile-time mask whenever the selection is known
    at compile time, and avoid runtime-mask loads/stores in hot inner loops on
    the affected architectures.
+
+.. [#h] ``load_head`` / ``store_head`` / ``load_tail`` / ``store_tail``
+   take a runtime element count ``n`` instead of a constructed mask;
+   they are sugar for the runtime-mask ``load`` / ``store`` with a
+   contiguous-prefix or contiguous-suffix mask, and inherit its
+   contract and per-arch codegen.
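To make the footnote concrete, this is roughly what the sugar expands to (an illustrative sketch only; it assumes ``0 < n < batch::size``, the range where no clamping applies, and uses the ``batch_bool::from_mask`` and masked ``load`` entry points that appear elsewhere in this commit):

#include <cstdint>
#include <xsimd/xsimd.hpp>

template <class T, class A = xsimd::default_arch>
xsimd::batch<T, A> load_head_desugared(T const* mem, std::size_t n)
{
    // Contiguous-prefix mask: bits [0, n) set, i.e. (1 << n) - 1.
    // Assumes 0 < n < batch size, so the shift is well defined.
    const auto mask = xsimd::batch_bool<T, A>::from_mask((std::uint64_t(1) << n) - 1);
    return xsimd::batch<T, A>::load(mem, mask, xsimd::unaligned_mode {});
}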

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 56 additions & 12 deletions
@@ -379,12 +379,8 @@ namespace xsimd
         XSIMD_INLINE batch<T, A>
         load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<common>) noexcept
         {
-            // Per-lane validity contract: only active lanes of ``mem`` are
-            // required to be addressable. An unconditional whole-vector load
-            // would touch inactive lanes and trip ASan/Valgrind on partial
-            // buffers, so stay scalar. Arches with hardware predicated loads
-            // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single
-            // intrinsic that suppresses inactive-lane reads in hardware.
+            // Per-lane validity contract: only active lanes are read.
+            // Arches with hardware predicated loads override this.
             constexpr std::size_t size = batch<T, A>::size;
             alignas(A::alignment()) std::array<T, size> buffer {};
             const uint64_t bits = mask.mask();
@@ -412,12 +408,8 @@ namespace xsimd
         XSIMD_INLINE void
         store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<common>) noexcept
         {
-            // Per-lane validity contract (matches native masked-store APIs):
-            // only active lanes of ``mem`` are touched. A load+select+store
-            // RMW would both read and write inactive bytes, breaking that
-            // contract — stay scalar. Arches with hardware predicated stores
-            // override this with a single intrinsic that suppresses inactive
-            // lanes in hardware.
+            // Per-lane validity contract: only active lanes are written.
+            // Arches with hardware predicated stores override this.
             constexpr std::size_t size = batch<T, A>::size;
             alignas(A::alignment()) std::array<T, size> src_buf;
             src.store_aligned(src_buf.data());
@@ -427,6 +419,58 @@ namespace xsimd
                     mem[i] = src_buf[i];
         }

+        // Head/tail forward to the runtime-mask path. ``tail`` offsets
+        // the base pointer back by ``(size - n)`` so the active high-``n``
+        // lanes land at ``[mem, mem + n)``; the offset goes through
+        // ``uintptr_t`` to dodge ``-Warray-bounds`` on small buffers.
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE T const* offset_back(T const* p, std::size_t k) noexcept
+            {
+                return reinterpret_cast<T const*>(reinterpret_cast<std::uintptr_t>(p) - k * sizeof(T));
+            }
+            template <class T>
+            XSIMD_INLINE T* offset_back(T* p, std::size_t k) noexcept
+            {
+                return reinterpret_cast<T*>(reinterpret_cast<std::uintptr_t>(p) - k * sizeof(T));
+            }
+        }
+
+        template <class A, class T, class Mode>
+        XSIMD_INLINE batch<T, A>
+        load_head(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept
+        {
+            const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n));
+            return load_masked<A>(mem, mask, convert<T> {}, unaligned_mode {}, A {});
+        }
+
+        template <class A, class T, class Mode>
+        XSIMD_INLINE void
+        store_head(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept
+        {
+            const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n));
+            store_masked<A>(mem, src, mask, unaligned_mode {}, A {});
+        }
+
+        template <class A, class T, class Mode>
+        XSIMD_INLINE batch<T, A>
+        load_tail(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept
+        {
+            constexpr std::size_t size = batch<T, A>::size;
+            const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n) << (size - n));
+            return load_masked<A>(detail::offset_back(mem, size - n), mask, convert<T> {}, unaligned_mode {}, A {});
+        }
+
+        template <class A, class T, class Mode>
+        XSIMD_INLINE void
+        store_tail(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept
+        {
+            constexpr std::size_t size = batch<T, A>::size;
+            const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n) << (size - n));
+            store_masked<A>(detail::offset_back(mem, size - n), src, mask, unaligned_mode {}, A {});
+        }
+
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
         {
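A worked instance of the tail arithmetic above, assuming an 8-lane batch purely for illustration (lane counts are arch-dependent): for ``n = 3`` the mask activates lanes 5..7, and stepping the base back by ``size - n = 5`` lanes makes those lanes address exactly ``mem[0, 3)``.

#include <cassert>
#include <cstddef>
#include <cstdint>

int main()
{
    constexpr std::size_t size = 8, n = 3; // illustrative 8-lane batch
    const std::uint64_t mask = ((std::uint64_t(1) << n) - 1) << (size - n);
    assert(mask == 0xE0); // 0b11100000: lanes 5, 6, 7 active

    float buf[n] = { 1.0f, 2.0f, 3.0f };
    // offset_back(buf, size - n) spelled out: the subtraction happens in
    // integer space, as in the kernel above, so the compiler never sees an
    // out-of-bounds pointer expression and -Warray-bounds stays quiet.
    const float* base = reinterpret_cast<const float*>(
        reinterpret_cast<std::uintptr_t>(buf) - (size - n) * sizeof(float));
    // Active lane i of the masked load reads base[i]; for i = 5, 6, 7 that
    // is buf[0], buf[1], buf[2]: exactly the n valid elements.
    assert(&base[size - n] == &buf[0]);
    return 0;
}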

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 10 additions & 0 deletions
@@ -102,6 +102,16 @@ namespace xsimd
        template <class A, bool... Values, class Mode>
        XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value> store_masked(uint64_t*, batch<uint64_t, A> const&, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept;

+        // Head/tail: contiguous prefix/suffix variants of the masked load/store.
+        template <class A, class T, class Mode>
+        XSIMD_INLINE batch<T, A> load_head(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept;
+        template <class A, class T, class Mode>
+        XSIMD_INLINE batch<T, A> load_tail(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept;
+        template <class A, class T, class Mode>
+        XSIMD_INLINE void store_head(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept;
+        template <class A, class T, class Mode>
+        XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept;
+
        // Forward declarations for pack-level helpers
        namespace detail
        {

include/xsimd/types/xsimd_api.hpp

Lines changed: 89 additions & 0 deletions
@@ -1631,6 +1631,51 @@ namespace xsimd
         return batch<T, A>::load(ptr, mask, unaligned_mode {});
     }

+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Loads the prefix \c mem[0, n) into the low \c n lanes; remaining
+     * lanes are zero. Sugar for a runtime-mask load with mask
+     * <tt>(1 << n) - 1</tt>; same contract — only \c mem[0, n) is read.
+     * \c n is clamped to \c batch::size.
+     */
+    template <class T, class A = default_arch>
+    XSIMD_INLINE batch<T, A> load_head(T const* mem, std::size_t n, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::load_head(mem, n, aligned_mode {});
+    }
+
+    /// \overload
+    template <class T, class A = default_arch>
+    XSIMD_INLINE batch<T, A> load_head(T const* mem, std::size_t n, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::load_head(mem, n, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Loads \c mem[0, n) into the high \c n lanes (lanes
+     * <tt>[size - n, size)</tt>); remaining low lanes are zero. Same
+     * contract as \ref load_head. \c n is clamped to \c batch::size.
+     */
+    template <class T, class A = default_arch>
+    XSIMD_INLINE batch<T, A> load_tail(T const* mem, std::size_t n, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::load_tail(mem, n, aligned_mode {});
+    }
+
+    /// \overload
+    template <class T, class A = default_arch>
+    XSIMD_INLINE batch<T, A> load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::load_tail(mem, n, unaligned_mode {});
+    }
+
     /**
      * @ingroup batch_data_transfer
      *
@@ -2807,6 +2852,50 @@ namespace xsimd
         val.store(mem, mask, unaligned_mode {});
     }

+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Stores the low \c n lanes of \c val to \c mem[0, n). Sugar for a
+     * runtime-mask store with mask <tt>(1 << n) - 1</tt>; same contract —
+     * only \c mem[0, n) is written. \c n is clamped to \c batch::size.
+     */
+    template <class T, class A>
+    XSIMD_INLINE void store_head(T* mem, std::size_t n, batch<T, A> const& val, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        val.store_head(mem, n, aligned_mode {});
+    }
+
+    /// \overload
+    template <class T, class A>
+    XSIMD_INLINE void store_head(T* mem, std::size_t n, batch<T, A> const& val, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        val.store_head(mem, n, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Stores the high \c n lanes (lanes <tt>[size - n, size)</tt>) of
+     * \c val to \c mem[0, n). Same contract as \ref store_head. \c n is
+     * clamped to \c batch::size.
+     */
+    template <class T, class A>
+    XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch<T, A> const& val, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        val.store_tail(mem, n, aligned_mode {});
+    }
+
+    /// \overload
+    template <class T, class A>
+    XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch<T, A> const& val, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        val.store_tail(mem, n, unaligned_mode {});
+    }
+
     /**
      * @ingroup batch_data_transfer
      *
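A short sketch of the lane placement documented above (hypothetical caller code): ``load_tail`` reads ``mem[0, n)`` but parks it in the high lanes, mirroring the suffix mask, so a ``load_tail`` / ``store_tail`` pair round-trips a partial buffer:

#include <cstddef>
#include <xsimd/xsimd.hpp>

void copy_tail(const float* src, float* dst, std::size_t n)
{
    // Reads src[0, n) into lanes [size - n, size); the low lanes are zero.
    auto v = xsimd::load_tail<float>(src, n, xsimd::unaligned_mode {});
    // Writes lanes [size - n, size) to dst[0, n); dst beyond n is untouched.
    xsimd::store_tail(dst, n, v, xsimd::unaligned_mode {});
}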

include/xsimd/types/xsimd_batch.hpp

Lines changed: 112 additions & 0 deletions
@@ -151,6 +151,12 @@ namespace xsimd
         template <class Mode = aligned_mode>
         XSIMD_INLINE void store(T* mem, batch_bool<T, A> mask, Mode = {}) const noexcept;

+        // Head/tail: contiguous prefix/suffix variants of the runtime-mask store.
+        XSIMD_INLINE void store_head(T* mem, std::size_t n, aligned_mode) const noexcept;
+        XSIMD_INLINE void store_head(T* mem, std::size_t n, unaligned_mode) const noexcept;
+        XSIMD_INLINE void store_tail(T* mem, std::size_t n, aligned_mode) const noexcept;
+        XSIMD_INLINE void store_tail(T* mem, std::size_t n, unaligned_mode) const noexcept;
+
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept;
         template <class U>
@@ -168,6 +174,12 @@ namespace xsimd
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept;

+        // Head/tail: contiguous prefix/suffix variants of the runtime-mask load.
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load_head(T const* mem, std::size_t n, aligned_mode) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load_head(T const* mem, std::size_t n, unaligned_mode) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, aligned_mode) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept;
+
         template <class U, class V>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch<V, arch_type> const& index) noexcept;
         template <class U, class V>
@@ -794,6 +806,106 @@ namespace xsimd
         kernel::store_masked<A>(mem, *this, mask, mode, A {});
     }

+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load_head(T const* mem, std::size_t n, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return broadcast<T>(0);
+        if (n >= size)
+            return load_aligned(mem);
+        return kernel::load_head<A>(mem, n, aligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load_head(T const* mem, std::size_t n, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return broadcast<T>(0);
+        if (n >= size)
+            return load_unaligned(mem);
+        return kernel::load_head<A>(mem, n, unaligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load_tail(T const* mem, std::size_t n, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return broadcast<T>(0);
+        if (n >= size)
+            return load_aligned(mem);
+        return kernel::load_tail<A>(mem, n, aligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return broadcast<T>(0);
+        if (n >= size)
+            return load_unaligned(mem);
+        return kernel::load_tail<A>(mem, n, unaligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE void batch<T, A>::store_head(T* mem, std::size_t n, aligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return;
+        if (n >= size)
+        {
+            store_aligned(mem);
+            return;
+        }
+        kernel::store_head<A>(mem, n, *this, aligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE void batch<T, A>::store_head(T* mem, std::size_t n, unaligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return;
+        if (n >= size)
+        {
+            store_unaligned(mem);
+            return;
+        }
+        kernel::store_head<A>(mem, n, *this, unaligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE void batch<T, A>::store_tail(T* mem, std::size_t n, aligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return;
+        if (n >= size)
+        {
+            store_aligned(mem);
+            return;
+        }
+        kernel::store_tail<A>(mem, n, *this, aligned_mode {}, A {});
+    }
+
+    template <class T, class A>
+    XSIMD_INLINE void batch<T, A>::store_tail(T* mem, std::size_t n, unaligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        if (n == 0)
+            return;
+        if (n >= size)
+        {
+            store_unaligned(mem);
+            return;
+        }
+        kernel::store_tail<A>(mem, n, *this, unaligned_mode {}, A {});
+    }
+
     template <class T, class A>
     template <class U>
     XSIMD_INLINE batch<T, A> batch<T, A>::load(U const* mem, stream_mode) noexcept
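The ``n == 0`` and ``n >= size`` branches above make both degenerate counts well defined, so callers need no pre-checks; a small sketch (hypothetical caller code):

#include <xsimd/xsimd.hpp>

void edge_cases(const float* mem)
{
    using batch = xsimd::batch<float>;
    // n == 0: no memory is touched and the result is a zero batch.
    auto a = batch::load_head(mem, 0, xsimd::unaligned_mode {});
    // n >= size: clamps to a plain full-width load, so in this case
    // mem[0, size) must all be addressable.
    auto b = batch::load_head(mem, batch::size + 7, xsimd::unaligned_mode {});
    (void)a;
    (void)b;
}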
