xtensor-stack
diff --git a/‎.github/workflows/doxygen.yml‎
Lines changed: 3 additions & 1 deletion b/‎.github/workflows/doxygen.yml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎include/xsimd/arch/common/xsimd_common_memory.hpp‎
Lines changed: 4 additions & 12 deletions b/‎include/xsimd/arch/common/xsimd_common_memory.hpp‎
Lines changed: 4 additions & 12 deletions
diff --git a/‎include/xsimd/arch/xsimd_avx.hpp‎
Lines changed: 4 additions & 11 deletions b/‎include/xsimd/arch/xsimd_avx.hpp‎
Lines changed: 4 additions & 11 deletions
diff --git a/‎include/xsimd/arch/xsimd_avx2.hpp‎
Lines changed: 28 additions & 80 deletions b/‎include/xsimd/arch/xsimd_avx2.hpp‎
Lines changed: 28 additions & 80 deletions
diff --git a/‎include/xsimd/arch/xsimd_avx2_128.hpp‎
Lines changed: 40 additions & 66 deletions b/‎include/xsimd/arch/xsimd_avx2_128.hpp‎
Lines changed: 40 additions & 66 deletions
@@ -9,6 +9,8 @@ jobs:
     steps:
     - uses: actions/checkout@v6
     - name: Install dependencies
-      run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y doxygen python3-breathe python3-sphinx-rtd-theme
     - name: Render
       run: make -C docs
@@ -441,12 +441,8 @@ namespace xsimd
         XSIMD_INLINE batch<T, A>
         load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<common>) noexcept
         {
-            // Per-lane validity contract: only active lanes of ``mem`` are
-            // required to be addressable. An unconditional whole-vector load
-            // would touch inactive lanes and trip ASan/Valgrind on partial
-            // buffers, so stay scalar. Arches with hardware predicated loads
-            // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single
-            // intrinsic that suppresses inactive-lane reads in hardware.
+            // Scalar fallback: only active lanes of ``mem`` are touched. Arches with
+            // hardware predicated loads override this.
             constexpr std::size_t size = batch<T, A>::size;
             alignas(A::alignment()) std::array<T, size> buffer;
             for (std::size_t i = 0; i < size; ++i)
@@ -465,12 +461,8 @@ namespace xsimd
         XSIMD_INLINE void
         store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<common>) noexcept
         {
-            // Per-lane validity contract (matches native masked-store APIs):
-            // only active lanes of ``mem`` are touched. A load+select+store
-            // RMW would both read and write inactive bytes, breaking that
-            // contract — stay scalar. Arches with hardware predicated stores
-            // override this with a single intrinsic that suppresses inactive
-            // lanes in hardware.
+            // Scalar fallback: only active lanes of ``mem`` are touched. Arches with
+            // hardware predicated stores override this.
             constexpr std::size_t size = batch<T, A>::size;
             alignas(A::alignment()) std::array<T, size> src_buf;
             src.store_aligned(src_buf.data());
 
@@ -987,9 +987,7 @@ namespace xsimd
             }
         }
 
-        // Runtime-mask load for float/double on AVX. Both aligned_mode and
-        // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault
-        // on masked-off lanes, so partial loads across page boundaries are safe.
+        // Runtime-mask load (float/double).
         template <class A, class Mode>
         XSIMD_INLINE batch<float, A>
         load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx>) noexcept
@@ -1036,12 +1034,8 @@ namespace xsimd
         // store_masked
         namespace detail
         {
-            // True when batch_bool<T, A> is the legacy VEX vector mask, i.e. it is stored
-            // in the same register as the data (__m256 / __m256d) rather than in an EVEX
-            // k-register (__mmask8) as on the avx512vl architectures. The _mm256_cast*_si256
-            // path below is only well-formed for the vector-mask representation. This names
-            // no architecture — it tests the mask's representation, in the spirit of
-            // detail::masked_memory_uses_fp_bitcast.
+            // True when batch_bool<T, A> has the same register type as the data (__m256/__m256d),
+            // not an EVEX k-register (__mmask8); the _mm256_cast*_si256 path below needs the former.
             template <class T, class A>
             using uses_vector_mask = std::is_same<typename batch_bool<T, A>::register_type,
                                                   typename batch<T, A>::register_type>;
@@ -1087,8 +1081,7 @@ namespace xsimd
             }
         }
 
-        // Runtime-mask store for float/double on AVX. Same fault-suppression
-        // semantics as the masked loads above; alignment mode is irrelevant.
+        // Runtime-mask store (float/double).
         template <class A, class Mode>
         XSIMD_INLINE void
         store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx>) noexcept
 
@@ -117,18 +117,34 @@ namespace xsimd
             }
         }
 
-        // load_masked
-        // AVX2 low-level helpers (operate on raw SIMD registers)
+        // load_masked / store_masked: AVX2 has _mm256_maskload/maskstore_epi{32,64};
+        // 8/16-bit integers fall back to the common scalar path.
         namespace detail
         {
-            XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept
+            template <class T>
+            XSIMD_INLINE __m256i maskload(T const* mem, __m256i mask) noexcept
             {
-                return _mm256_maskload_epi32(mem, mask);
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_maskload_epi32(reinterpret_cast<int const*>(mem), mask);
+                }
+                else
+                {
+                    return _mm256_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
+                }
             }
 
-            XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept
+            template <class T>
+            XSIMD_INLINE void maskstore(T* mem, __m256i mask, __m256i src) noexcept
             {
-                return _mm256_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    _mm256_maskstore_epi32(reinterpret_cast<int*>(mem), mask, src);
+                }
+                else
+                {
+                    _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), mask, src);
+                }
             }
 
             XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept
@@ -137,72 +153,22 @@ namespace xsimd
             }
         }
 
-        // single templated implementation for integer masked loads (32/64-bit)
         template <class A, class T, bool... Values, class Mode>
-        XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
+        XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
         load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
         {
-            static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
-            using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
-            // Use the raw register-level maskload helpers for the remaining cases.
-            return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
-        }
-
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2>) noexcept
-        {
-            return load_masked<A, int32_t>(mem, mask, convert<int32_t> {}, Mode {}, avx2 {});
-        }
-
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<avx2>) noexcept
-        {
-            const auto r = load_masked<A, int32_t>(reinterpret_cast<int32_t const*>(mem), batch_bool_constant<int32_t, A, Values...> {}, convert<int32_t> {}, Mode {}, avx2 {});
-            return bitwise_cast<uint32_t>(r);
+            return detail::maskload(mem, mask.as_batch());
         }
 
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2>) noexcept
-        {
-            return load_masked<A, int64_t>(mem, mask, convert<int64_t> {}, Mode {}, avx2 {});
-        }
-
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<avx2>) noexcept
-        {
-            const auto r = load_masked<A, int64_t>(reinterpret_cast<int64_t const*>(mem), batch_bool_constant<int64_t, A, Values...> {}, convert<int64_t> {}, Mode {}, avx2 {});
-            return bitwise_cast<uint64_t>(r);
-        }
-
-        // Runtime-mask load for 32/64-bit integers on AVX2; narrower widths fall
-        // back to the scalar common path. Aligned and unaligned share the same
-        // intrinsic — masked-off lanes do not fault regardless of alignment.
         template <class A, class T, class Mode>
         XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
         load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
         {
-            using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
-            return detail::maskload(reinterpret_cast<const int_t*>(mem), __m256i(mask));
-        }
-
-        // store_masked
-        namespace detail
-        {
-            template <class T, class A>
-            XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept
-            {
-                _mm256_maskstore_epi32(reinterpret_cast<int*>(mem), mask, src);
-            }
-
-            template <class T, class A>
-            XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept
-            {
-                _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), mask, src);
-            }
+            return detail::maskload(mem, __m256i(mask));
         }
 
         template <class A, class T, bool... Values, class Mode,
-                  typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4)>>
+                  typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8)>>
         XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
         {
             constexpr size_t lanes_per_half = batch<T, A>::size / 2;
@@ -225,33 +191,15 @@ namespace xsimd
             }
             else
             {
-                detail::maskstore<T, A>(mem, mask.as_batch(), src);
+                detail::maskstore(mem, mask.as_batch(), src);
             }
         }
 
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
-        {
-            const auto s32 = bitwise_cast<int32_t>(src);
-            store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});
-        }
-
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
-        {
-            const auto s64 = bitwise_cast<int64_t>(src);
-            store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
-        }
-
-        // Runtime-mask store for 32/64-bit integers on AVX2; narrower widths fall
-        // back to the scalar common path. Same fault-suppression semantics as the
-        // masked loads above; alignment mode is irrelevant.
         template <class A, class T, class Mode>
         XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
         store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2>) noexcept
         {
-            using int_t = std::conditional_t<sizeof(T) == 4, int32_t, int64_t>;
-            detail::maskstore<int_t, A>(reinterpret_cast<int_t*>(mem), __m256i(mask), __m256i(src));
+            detail::maskstore(mem, __m256i(mask), __m256i(src));
         }
 
         // load_stream
 
@@ -89,91 +89,65 @@ namespace xsimd
             }
         }
 
-        // load_masked — native 128-bit integer masked loads. Tagged on avx2_128
-        // because the vpmaskmov* intrinsics require AVX2; an AVX1-only build routes
-        // integer masked memory through the float path in xsimd_common_memory.hpp.
-        // Any arch with a native masked path provides its own exact-tag overload that
-        // out-ranks this one, so no cross-arch exclusion is needed here.
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2_128>) noexcept
-        {
-            return _mm_maskload_epi32(mem, mask.as_batch());
-        }
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx2_128>) noexcept
-        {
-            return _mm_maskload_epi32(reinterpret_cast<int32_t const*>(mem), mask.as_batch());
-        }
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2_128>) noexcept
-        {
-            return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask.as_batch());
-        }
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...> mask, convert<uint64_t>, Mode, requires_arch<avx2_128>) noexcept
+        // load_masked / store_masked: native 128-bit integer masked memory.
+        // Tagged on avx2_128 because vpmaskmov* needs AVX2; an AVX1-only build
+        // routes integer masked memory through the float path in
+        // xsimd_common_memory.hpp. 8/16-bit integers fall back to the common scalar path.
+        namespace detail
         {
-            return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask.as_batch());
-        }
+            template <class T>
+            XSIMD_INLINE __m128i maskload_avx2_128(T const* mem, __m128i mask) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_maskload_epi32(reinterpret_cast<int const*>(mem), mask);
+                }
+                else
+                {
+                    return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
+                }
+            }
 
-        // store_masked — native 128-bit integer masked stores (see load note above).
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
-        {
-            return _mm_maskstore_epi32(mem, mask.as_batch(), src);
-        }
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
-        {
-            return _mm_maskstore_epi32(reinterpret_cast<int32_t*>(mem), mask.as_batch(), src);
-        }
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
-        {
-            return _mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask.as_batch(), src);
+            template <class T>
+            XSIMD_INLINE void maskstore_avx2_128(T* mem, __m128i mask, __m128i src) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    _mm_maskstore_epi32(reinterpret_cast<int*>(mem), mask, src);
+                }
+                else
+                {
+                    _mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask, src);
+                }
+            }
         }
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
+
+        template <class A, class T, bool... Values, class Mode,
+                  typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8)>>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2_128>) noexcept
         {
-            return _mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask.as_batch(), src);
+            return detail::maskload_avx2_128(mem, mask.as_batch());
         }
 
-        // Runtime-mask path for 32/64-bit integers; narrower widths fall back to
-        // the common scalar path. Aligned and unaligned share the same intrinsic
-        // — masked-off lanes do not fault regardless of alignment.
-        namespace detail
+        template <class A, class T, bool... Values, class Mode,
+                  typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8)>>
+        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
         {
-            XSIMD_INLINE __m128i maskload_128(int32_t const* mem, __m128i mask) noexcept
-            {
-                return _mm_maskload_epi32(mem, mask);
-            }
-            XSIMD_INLINE __m128i maskload_128(long long const* mem, __m128i mask) noexcept
-            {
-                return _mm_maskload_epi64(mem, mask);
-            }
-            XSIMD_INLINE void maskstore_128(int32_t* mem, __m128i mask, __m128i src) noexcept
-            {
-                _mm_maskstore_epi32(mem, mask, src);
-            }
-            XSIMD_INLINE void maskstore_128(long long* mem, __m128i mask, __m128i src) noexcept
-            {
-                _mm_maskstore_epi64(mem, mask, src);
-            }
+            detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src));
         }
 
         template <class A, class T, class Mode>
         XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
         load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2_128>) noexcept
         {
-            using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
-            return detail::maskload_128(reinterpret_cast<int_t const*>(mem), __m128i(mask));
+            return detail::maskload_avx2_128(mem, __m128i(mask));
         }
 
         template <class A, class T, class Mode>
         XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
         store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2_128>) noexcept
         {
-            using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
-            detail::maskstore_128(reinterpret_cast<int_t*>(mem), __m128i(mask), __m128i(src));
+            detail::maskstore_avx2_128(mem, __m128i(mask), __m128i(src));
         }
 
         // gather