xtensor-stack
diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp
Lines changed: 15 additions & 8 deletions
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
Lines changed: 21 additions & 17 deletions
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
Lines changed: 28 additions & 18 deletions
@@ -388,57 +388,64 @@ namespace xsimd
                 }
         }
 
+        // Integer→float reinterpret bridges. Excluded for AVX-512VL archs, which provide
+        // their own EVEX masked integer overloads; without the exclusion, gcc-10 sees the bridge
+        // and the VL-native overload as equally specialized for A=avx512vl_* (see bridge_not_vl in fwd.hpp).
         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
+        XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<int32_t, A>>
+        load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
         {
             const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
             return bitwise_cast<int32_t>(f);
         }
 
         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
+        XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<uint32_t, A>>
+        load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
         {
             const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
             return bitwise_cast<uint32_t>(f);
         }
 
         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>>
+        XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<int64_t, A>>
         load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept
         {
             const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
             return bitwise_cast<int64_t>(d);
         }
 
         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>>
+        XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<uint64_t, A>>
         load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept
         {
             const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
             return bitwise_cast<uint64_t>(d);
         }
 
         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
+        XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>
+        store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
         {
             store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
         }
 
         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
+        XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>
+        store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
         {
             store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
         }
 
         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
+        XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>
         store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept
         {
             store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
         }
 
         template <class A, bool... Values, class Mode>
-        XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
+        XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>
         store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept
         {
             store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
 
@@ -993,19 +993,20 @@ namespace xsimd
         {
             using int_t = as_integer_t<T>;
             constexpr size_t half_size = batch<T, A>::size / 2;
+            using half_arch = typename ::xsimd::make_sized_batch_t<T, half_size>::arch_type;
 
-            // confined to lower 128-bit half → forward to 128 bit
+            // lower 128-bit half
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
-                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
-                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
+                constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(batch_bool_constant<int_t, A, Values...> {});
+                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, half_arch {});
                 return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
             }
-            // confined to upper 128-bit half → forward to 128 bit
+            // upper 128-bit half
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
-                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
-                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
+                constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
+                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, half_arch {});
                 return detail::zero_extend<A>(hi);
             }
             else
@@ -1019,36 +1020,39 @@ namespace xsimd
         namespace detail
         {
             template <class A>
-            XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept
+            XSIMD_INLINE void maskstore(float* mem, batch<as_integer_t<float>, A> const& mask, batch<float, A> const& src) noexcept
             {
                 _mm256_maskstore_ps(mem, mask, src);
             }
 
             template <class A>
-            XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept
+            XSIMD_INLINE void maskstore(double* mem, batch<as_integer_t<double>, A> const& mask, batch<double, A> const& src) noexcept
             {
                 _mm256_maskstore_pd(mem, mask, src);
             }
         }
 
-        template <class A, class T, bool... Values, class Mode>
+        template <class A, class T, bool... Values, class Mode,
+                  typename = std::enable_if_t<std::is_floating_point<T>::value>>
         XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
         {
             constexpr size_t half_size = batch<T, A>::size / 2;
+            using half_batch = ::xsimd::make_sized_batch_t<T, half_size>;
+            using half_arch = typename half_batch::arch_type;
 
-            // confined to lower 128-bit half → forward to 128 bit
+            // lower 128-bit half
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
-                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
-                const auto lo = detail::lower_half(src);
-                store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
+                constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
+                const half_batch lo = detail::lower_half(src);
+                store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
             }
-            // confined to upper 128-bit half → forward to 128 bit
+            // upper 128-bit half
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
-                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
-                const auto hi = detail::upper_half(src);
-                store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
+                constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
+                const half_batch hi = detail::upper_half(src);
+                store_masked<half_arch>(mem + half_size, hi, mhi, Mode {}, half_arch {});
             }
             else
             {
 
@@ -138,7 +138,8 @@ namespace xsimd
         }
 
         // single templated implementation for integer masked loads (32/64-bit)
-        template <class A, class T, bool... Values, class Mode>
+        template <class A, class T, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
         XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
         load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
         {
@@ -148,26 +149,30 @@ namespace xsimd
             return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
         }
 
-        template <class A, bool... Values, class Mode>
+        template <class A, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
         XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2>) noexcept
         {
             return load_masked<A, int32_t>(mem, mask, convert<int32_t> {}, Mode {}, avx2 {});
         }
 
-        template <class A, bool... Values, class Mode>
+        template <class A, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
         XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<avx2>) noexcept
         {
             const auto r = load_masked<A, int32_t>(reinterpret_cast<int32_t const*>(mem), batch_bool_constant<int32_t, A, Values...> {}, convert<int32_t> {}, Mode {}, avx2 {});
             return bitwise_cast<uint32_t>(r);
         }
 
-        template <class A, bool... Values, class Mode>
+        template <class A, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
         XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2>) noexcept
         {
             return load_masked<A, int64_t>(mem, mask, convert<int64_t> {}, Mode {}, avx2 {});
         }
 
-        template <class A, bool... Values, class Mode>
+        template <class A, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
         XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<avx2>) noexcept
         {
             const auto r = load_masked<A, int64_t>(reinterpret_cast<int64_t const*>(mem), batch_bool_constant<int64_t, A, Values...> {}, convert<int64_t> {}, Mode {}, avx2 {});
@@ -190,39 +195,44 @@ namespace xsimd
             }
         }
 
-        template <class A, class T, bool... Values, class Mode>
+        template <class A, class T, bool... Values, class Mode,
+                  typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4) && std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
         XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
         {
             constexpr size_t lanes_per_half = batch<T, A>::size / 2;
+            using half_batch = ::xsimd::make_sized_batch_t<T, lanes_per_half>;
+            using half_arch = typename half_batch::arch_type;
 
-            // confined to lower 128-bit half → forward to SSE
+            // lower 128-bit half
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
             {
-                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
-                const auto lo = detail::lower_half(src);
-                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+                constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
+                const half_batch lo = detail::lower_half(src);
+                store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
             }
-            // confined to upper 128-bit half → forward to SSE
+            // upper 128-bit half
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
             {
-                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
-                const auto hi = detail::upper_half(src);
-                store_masked<sse4_2>(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {});
+                constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
+                const half_batch hi = detail::upper_half(src);
+                store_masked<half_arch>(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {});
             }
             else
             {
                 detail::maskstore<T, A>(mem, mask.as_batch(), src);
             }
         }
 
-        template <class A, bool... Values, class Mode>
-        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
+        template <class A, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
+        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
         {
             const auto s32 = bitwise_cast<int32_t>(src);
-            store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, mask, Mode {}, avx2 {});
+            store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});
         }
 
-        template <class A, bool... Values, class Mode>
+        template <class A, bool... Values, class Mode,
+                  class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
         XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
         {
             const auto s64 = bitwise_cast<int64_t>(src);
Original file line number	Diff line number	Diff line change
`@@ -388,57 +388,64 @@ namespace xsimd`
`388`	`388`	`}`
`389`	`389`	`}`
`390`	`390`
	`391`	`+ // Integer→float reinterpret bridges. Excluded for AVX-512VL archs, which provide`
	`392`	`+ // their own EVEX masked integer overloads; without the exclusion, gcc-10 sees the bridge`
	`393`	`+ // and the VL-native overload as equally specialized for A=avx512vl_* (see bridge_not_vl in fwd.hpp).`
`391`	`394`	`template <class A, bool... Values, class Mode>`
`392`		`- XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept`
	`395`	`+ XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<int32_t, A>>`
	`396`	`+ load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept`
`393`	`397`	`{`
`394`	`398`	`const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});`
`395`	`399`	`return bitwise_cast<int32_t>(f);`
`396`	`400`	`}`
`397`	`401`
`398`	`402`	`template <class A, bool... Values, class Mode>`
`399`		`- XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept`
	`403`	`+ XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<uint32_t, A>>`
	`404`	`+ load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept`
`400`	`405`	`{`
`401`	`406`	`const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});`
`402`	`407`	`return bitwise_cast<uint32_t>(f);`
`403`	`408`	`}`
`404`	`409`
`405`	`410`	`template <class A, bool... Values, class Mode>`
`406`		`- XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>>`
	`411`	`+ XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<int64_t, A>>`
`407`	`412`	`load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept`
`408`	`413`	`{`
`409`	`414`	`const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});`
`410`	`415`	`return bitwise_cast<int64_t>(d);`
`411`	`416`	`}`
`412`	`417`
`413`	`418`	`template <class A, bool... Values, class Mode>`
`414`		`- XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>>`
	`419`	`+ XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<uint64_t, A>>`
`415`	`420`	`load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept`
`416`	`421`	`{`
`417`	`422`	`const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});`
`418`	`423`	`return bitwise_cast<uint64_t>(d);`
`419`	`424`	`}`
`420`	`425`
`421`	`426`	`template <class A, bool... Values, class Mode>`
`422`		`- XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept`
	`427`	`+ XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>`
	`428`	`+ store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept`
`423`	`429`	`{`
`424`	`430`	`store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});`
`425`	`431`	`}`
`426`	`432`
`427`	`433`	`template <class A, bool... Values, class Mode>`
`428`		`- XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept`
	`434`	`+ XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>`
	`435`	`+ store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept`
`429`	`436`	`{`
`430`	`437`	`store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});`
`431`	`438`	`}`
`432`	`439`
`433`	`440`	`template <class A, bool... Values, class Mode>`
`434`		`- XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>`
	`441`	`+ XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>`
`435`	`442`	`store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept`
`436`	`443`	`{`
`437`	`444`	`store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});`
`438`	`445`	`}`
`439`	`446`
`440`	`447`	`template <class A, bool... Values, class Mode>`
`441`		`- XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>`
	`448`	`+ XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>`
`442`	`449`	`store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept`
`443`	`450`	`{`
`444`	`451`	`store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});`
Original file line number	Diff line number	Diff line change
`@@ -993,19 +993,20 @@ namespace xsimd`
`993`	`993`	`{`
`994`	`994`	`using int_t = as_integer_t<T>;`
`995`	`995`	`constexpr size_t half_size = batch<T, A>::size / 2;`
	`996`	`+ using half_arch = typename ::xsimd::make_sized_batch_t<T, half_size>::arch_type;`
`996`	`997`
`997`		`- // confined to lower 128-bit half → forward to 128 bit`
	`998`	`+ // lower 128-bit half`
`998`	`999`	`XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)`
`999`	`1000`	`{`
`1000`		`- constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});`
`1001`		`- const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});`
	`1001`	`+ constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(batch_bool_constant<int_t, A, Values...> {});`
	`1002`	`+ const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, half_arch {});`
`1002`	`1003`	`return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));`
`1003`	`1004`	`}`
`1004`		`- // confined to upper 128-bit half → forward to 128 bit`
	`1005`	`+ // upper 128-bit half`
`1005`	`1006`	`else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)`
`1006`	`1007`	`{`
`1007`		`- constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);`
`1008`		`- const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});`
	`1008`	`+ constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);`
	`1009`	`+ const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, half_arch {});`
`1009`	`1010`	`return detail::zero_extend<A>(hi);`
`1010`	`1011`	`}`
`1011`	`1012`	`else`
`@@ -1019,36 +1020,39 @@ namespace xsimd`
`1019`	`1020`	`namespace detail`
`1020`	`1021`	`{`
`1021`	`1022`	`template <class A>`
`1022`		`- XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept`
	`1023`	`+ XSIMD_INLINE void maskstore(float* mem, batch<as_integer_t<float>, A> const& mask, batch<float, A> const& src) noexcept`
`1023`	`1024`	`{`
`1024`	`1025`	`_mm256_maskstore_ps(mem, mask, src);`
`1025`	`1026`	`}`
`1026`	`1027`
`1027`	`1028`	`template <class A>`
`1028`		`- XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept`
	`1029`	`+ XSIMD_INLINE void maskstore(double* mem, batch<as_integer_t<double>, A> const& mask, batch<double, A> const& src) noexcept`
`1029`	`1030`	`{`
`1030`	`1031`	`_mm256_maskstore_pd(mem, mask, src);`
`1031`	`1032`	`}`
`1032`	`1033`	`}`
`1033`	`1034`
`1034`		`- template <class A, class T, bool... Values, class Mode>`
	`1035`	`+ template <class A, class T, bool... Values, class Mode,`
	`1036`	`+ typename = std::enable_if_t<std::is_floating_point<T>::value>>`
`1035`	`1037`	`XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept`
`1036`	`1038`	`{`
`1037`	`1039`	`constexpr size_t half_size = batch<T, A>::size / 2;`
	`1040`	`+ using half_batch = ::xsimd::make_sized_batch_t<T, half_size>;`
	`1041`	`+ using half_arch = typename half_batch::arch_type;`
`1038`	`1042`
`1039`		`- // confined to lower 128-bit half → forward to 128 bit`
	`1043`	`+ // lower 128-bit half`
`1040`	`1044`	`XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)`
`1041`	`1045`	`{`
`1042`		`- constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);`
`1043`		`- const auto lo = detail::lower_half(src);`
`1044`		`- store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});`
	`1046`	`+ constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);`
	`1047`	`+ const half_batch lo = detail::lower_half(src);`
	`1048`	`+ store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});`
`1045`	`1049`	`}`
`1046`		`- // confined to upper 128-bit half → forward to 128 bit`
	`1050`	`+ // upper 128-bit half`
`1047`	`1051`	`else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)`
`1048`	`1052`	`{`
`1049`		`- constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);`
`1050`		`- const auto hi = detail::upper_half(src);`
`1051`		`- store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});`
	`1053`	`+ constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);`
	`1054`	`+ const half_batch hi = detail::upper_half(src);`
	`1055`	`+ store_masked<half_arch>(mem + half_size, hi, mhi, Mode {}, half_arch {});`
`1052`	`1056`	`}`
`1053`	`1057`	`else`
`1054`	`1058`	`{`
Original file line number	Diff line number	Diff line change
`@@ -138,7 +138,8 @@ namespace xsimd`
`138`	`138`	`}`
`139`	`139`
`140`	`140`	`// single templated implementation for integer masked loads (32/64-bit)`
`141`		`- template <class A, class T, bool... Values, class Mode>`
	`141`	`+ template <class A, class T, bool... Values, class Mode,`
	`142`	`+ class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>`
`142`	`143`	`XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>`
`143`	`144`	`load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept`
`144`	`145`	`{`
`@@ -148,26 +149,30 @@ namespace xsimd`
`148`	`149`	`return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());`
`149`	`150`	`}`
`150`	`151`
`151`		`- template <class A, bool... Values, class Mode>`
	`152`	`+ template <class A, bool... Values, class Mode,`
	`153`	`+ class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>`
`152`	`154`	`XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2>) noexcept`
`153`	`155`	`{`
`154`	`156`	`return load_masked<A, int32_t>(mem, mask, convert<int32_t> {}, Mode {}, avx2 {});`
`155`	`157`	`}`
`156`	`158`
`157`		`- template <class A, bool... Values, class Mode>`
	`159`	`+ template <class A, bool... Values, class Mode,`
	`160`	`+ class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>`
`158`	`161`	`XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<avx2>) noexcept`
`159`	`162`	`{`
`160`	`163`	`const auto r = load_masked<A, int32_t>(reinterpret_cast<int32_t const*>(mem), batch_bool_constant<int32_t, A, Values...> {}, convert<int32_t> {}, Mode {}, avx2 {});`
`161`	`164`	`return bitwise_cast<uint32_t>(r);`
`162`	`165`	`}`
`163`	`166`
`164`		`- template <class A, bool... Values, class Mode>`
	`167`	`+ template <class A, bool... Values, class Mode,`
	`168`	`+ class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>`
`165`	`169`	`XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2>) noexcept`
`166`	`170`	`{`
`167`	`171`	`return load_masked<A, int64_t>(mem, mask, convert<int64_t> {}, Mode {}, avx2 {});`
`168`	`172`	`}`
`169`	`173`
`170`		`- template <class A, bool... Values, class Mode>`
	`174`	`+ template <class A, bool... Values, class Mode,`
	`175`	`+ class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>`
`171`	`176`	`XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<avx2>) noexcept`
`172`	`177`	`{`
`173`	`178`	`const auto r = load_masked<A, int64_t>(reinterpret_cast<int64_t const*>(mem), batch_bool_constant<int64_t, A, Values...> {}, convert<int64_t> {}, Mode {}, avx2 {});`
`@@ -190,39 +195,44 @@ namespace xsimd`
`190`	`195`	`}`
`191`	`196`	`}`
`192`	`197`
`193`		`- template <class A, class T, bool... Values, class Mode>`
	`198`	`+ template <class A, class T, bool... Values, class Mode,`
	`199`	`+ typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4) && std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>`
`194`	`200`	`XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept`
`195`	`201`	`{`
`196`	`202`	`constexpr size_t lanes_per_half = batch<T, A>::size / 2;`
	`203`	`+ using half_batch = ::xsimd::make_sized_batch_t<T, lanes_per_half>;`
	`204`	`+ using half_arch = typename half_batch::arch_type;`
`197`	`205`
`198`		`- // confined to lower 128-bit half → forward to SSE`
	`206`	`+ // lower 128-bit half`
`199`	`207`	`XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)`
`200`	`208`	`{`
`201`		`- constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);`
`202`		`- const auto lo = detail::lower_half(src);`
`203`		`- store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});`
	`209`	`+ constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);`
	`210`	`+ const half_batch lo = detail::lower_half(src);`
	`211`	`+ store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});`
`204`	`212`	`}`
`205`		`- // confined to upper 128-bit half → forward to SSE`
	`213`	`+ // upper 128-bit half`
`206`	`214`	`else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)`
`207`	`215`	`{`
`208`		`- constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);`
`209`		`- const auto hi = detail::upper_half(src);`
`210`		`- store_masked<sse4_2>(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {});`
	`216`	`+ constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);`
	`217`	`+ const half_batch hi = detail::upper_half(src);`
	`218`	`+ store_masked<half_arch>(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {});`
`211`	`219`	`}`
`212`	`220`	`else`
`213`	`221`	`{`
`214`	`222`	`detail::maskstore<T, A>(mem, mask.as_batch(), src);`
`215`	`223`	`}`
`216`	`224`	`}`
`217`	`225`
`218`		`- template <class A, bool... Values, class Mode>`
`219`		`- XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2>) noexcept`
	`226`	`+ template <class A, bool... Values, class Mode,`
	`227`	`+ class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>`
	`228`	`+ XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept`
`220`	`229`	`{`
`221`	`230`	`const auto s32 = bitwise_cast<int32_t>(src);`
`222`		`- store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, mask, Mode {}, avx2 {});`
	`231`	`+ store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});`
`223`	`232`	`}`
`224`	`233`
`225`		`- template <class A, bool... Values, class Mode>`
	`234`	`+ template <class A, bool... Values, class Mode,`
	`235`	`+ class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>`
`226`	`236`	`XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<avx2>) noexcept`
`227`	`237`	`{`
`228`	`238`	`const auto s64 = bitwise_cast<int64_t>(src);`