xtensor-stack · serge-sans-paille · Apr 27, 2026
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -29,6 +29,7 @@ jobs:
           - { compiler: 'clang', version: '17', flags: 'avx' }
           - { compiler: 'clang', version: '17', flags: 'sse3' }
           - { compiler: 'clang', version: '18', flags: 'avx512' }
+          - { compiler: 'clang', version: '18', flags: 'avx_128' }
     steps:
     - name: Setup compiler
       if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -76,6 +77,9 @@ jobs:
         if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge"
         fi
+        if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then
+          CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge -DXSIMD_DEFAULT_ARCH=avx_128"
+        fi
         if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
         fi

diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
@@ -994,18 +994,18 @@ namespace xsimd
             using int_t = as_integer_t<T>;
             constexpr size_t half_size = batch<T, A>::size / 2;
 
-            // confined to lower 128-bit half → forward to SSE2
+            // confined to lower 128-bit half → forward to 128 bit
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
-                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
+                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
                 return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
             }
-            // confined to upper 128-bit half → forward to SSE2
+            // confined to upper 128-bit half → forward to 128 bit
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
-                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, sse4_2 {});
+                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
                 return detail::zero_extend<A>(hi);
             }
             else
@@ -1036,19 +1036,19 @@ namespace xsimd
         {
             constexpr size_t half_size = batch<T, A>::size / 2;
 
-            // confined to lower 128-bit half → forward to SSE2
+            // confined to lower 128-bit half → forward to 128 bit
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                 const auto lo = detail::lower_half(src);
-                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+                store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
             }
-            // confined to upper 128-bit half → forward to SSE2
+            // confined to upper 128-bit half → forward to 128 bit
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
                 const auto hi = detail::upper_half(src);
-                store_masked<sse4_2>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
+                store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
             }
             else
             {

diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp
@@ -0,0 +1,163 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ * Copyright (c) Marco Barbone                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX_128_HPP
+#define XSIMD_AVX_128_HPP
+
+#include <type_traits>
+
+#include "../types/xsimd_avx_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        using namespace types;
+
+        // broadcast
+        template <class A, class T, class = std::enable_if_t<std::is_same<T, float>::value>>
+        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx_128>) noexcept
+        {
+            return _mm_broadcast_ss(&val);
+        }
+
+        // eq
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_EQ_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_EQ_OQ);
+        }
+
+        // gt
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_GT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_GT_OQ);
+        }
+
+        // ge
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_GE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_GE_OQ);
+        }
+
+        // lt
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_LT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_LT_OQ);
+        }
+
+        // le
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_LE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_LE_OQ);
+        }
+
+        // neq
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_NEQ_UQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_NEQ_UQ);
+        }
+
+        // load_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskload_ps(mem, mask.as_batch());
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskload_pd(mem, mask.as_batch());
+        }
+
+        // store_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskstore_ps(mem, mask.as_batch(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskstore_pd(mem, mask.as_batch(), src);
+        }
+
+        // swizzle (dynamic mask)
+        template <class A, class T, class ITy, class = std::enable_if_t<std::is_floating_point<T>::value && sizeof(T) == sizeof(ITy)>>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_same<T, float>::value)
+            {
+                return _mm_permutevar_ps(self, mask);
+            }
+            else
+            {
+                // FIXME: _mm_permutevar_pd fails validation, but it shouldn't o_O
+                return swizzle(self, mask, sse4_2 {});
+                // return _mm_permutevar_pd(self, mask);
+            }
+        }
+
+        // swizzle (constant mask)
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<avx_128>) noexcept
+        {
+            return _mm_permute_ps(self, detail::mod_shuffle(V0, V1, V2, V3));
+        }
+
+        template <class A, uint32_t V0, uint32_t V1>
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<avx_128>) noexcept
+        {
+            return _mm_permute_pd(self, detail::mod_shuffle(V0, V1));
+        }
+
+    }
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp
@@ -50,6 +50,7 @@
 
 #if XSIMD_WITH_AVX
 #include "./xsimd_avx.hpp"
+#include "./xsimd_avx_128.hpp"
 #endif
 
 #if XSIMD_WITH_FMA3_AVX

diff --git a/include/xsimd/types/xsimd_avx_register.hpp b/include/xsimd/types/xsimd_avx_register.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_AVX_REGISTER_HPP
 
 #include "./xsimd_common_arch.hpp"
+#include "./xsimd_sse4_2_register.hpp"
 
 namespace xsimd
 {
@@ -30,6 +31,18 @@ namespace xsimd
         static constexpr bool requires_alignment() noexcept { return true; }
         static constexpr char const* name() noexcept { return "avx"; }
     };
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX instructions extension for 128 bits registers
+     */
+    struct avx_128 : sse4_2
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx/128"; }
+    };
 }
 
 #if XSIMD_WITH_AVX
@@ -58,6 +71,8 @@ namespace xsimd
         XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
         XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
         XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx_128, sse4_2);
     }
 }
 #endif