Skip to content

Commit 3a61c90

Browse files
Tentative support for AVX2 extensions to 128-bit registers
1 parent 9748316 commit 3a61c90

5 files changed

Lines changed: 193 additions & 1 deletion

File tree

.github/workflows/linux.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ jobs:
1717
- { compiler: 'gcc', version: '12', flags: 'force_no_instr_set' }
1818
- { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' }
1919
- { compiler: 'gcc', version: '14', flags: 'avx' }
20+
- { compiler: 'gcc', version: '14', flags: 'avx2' }
2021
- { compiler: 'gcc', version: '13', flags: 'avx512' }
2122
- { compiler: 'gcc', version: '10', flags: 'avx512' }
2223
- { compiler: 'gcc', version: '12', flags: 'i386' }
@@ -30,6 +31,7 @@ jobs:
3031
- { compiler: 'clang', version: '17', flags: 'sse3' }
3132
- { compiler: 'clang', version: '18', flags: 'avx512' }
3233
- { compiler: 'clang', version: '18', flags: 'avx_128' }
34+
- { compiler: 'clang', version: '18', flags: 'avx2_128' }
3335
steps:
3436
- name: Setup compiler
3537
if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -80,6 +82,12 @@ jobs:
8082
if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then
8183
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge -DXSIMD_DEFAULT_ARCH=avx_128"
8284
fi
85+
if [[ '${{ matrix.sys.flags }}' == 'avx2' ]]; then
86+
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell"
87+
fi
88+
if [[ '${{ matrix.sys.flags }}' == 'avx2_128' ]]; then
89+
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell -DXSIMD_DEFAULT_ARCH=avx2_128"
90+
fi
8391
if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
8492
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
8593
fi
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
/***************************************************************************
2+
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3+
* Martin Renou *
4+
* Copyright (c) QuantStack *
5+
* Copyright (c) Serge Guelton *
6+
* Copyright (c) Marco Barbone *
7+
* *
8+
* Distributed under the terms of the BSD 3-Clause License. *
9+
* *
10+
* The full license is in the file LICENSE, distributed with this software. *
11+
****************************************************************************/
12+
13+
#ifndef XSIMD_AVX2_128_HPP
14+
#define XSIMD_AVX2_128_HPP
15+
16+
#include <type_traits>
17+
18+
#include "../types/xsimd_avx2_register.hpp"
19+
#include "../types/xsimd_batch_constant.hpp"
20+
21+
namespace xsimd
22+
{
23+
namespace kernel
24+
{
25+
using namespace types;
26+
27+
        // select
        // Blend between two integral 128-bit batches using a compile-time mask.
        template <class A, class T, bool... Values, class = std::enable_if_t<std::is_integral<T>::value>>
        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2_128>) noexcept
        {
            // One bit per element, derived from the Values... pack at compile time.
            constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                // AVX2 immediate blend on 32-bit lanes: a set mask bit picks true_br.
                return _mm_blend_epi32(false_br, true_br, mask);
            }
            else
            {
                // Other element widths: defer to the plain AVX 128-bit implementation.
                return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, avx_128 {});
            }
        }
41+
42+
// bitwise_lshift
43+
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
44+
XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2_128>) noexcept
45+
{
46+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
47+
{
48+
return _mm_sllv_epi32(self, other);
49+
}
50+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
51+
{
52+
return _mm_sllv_epi64(self, other);
53+
}
54+
else
55+
{
56+
return bitwise_lshift(self, other, avx {});
57+
}
58+
}
59+
60+
        // bitwise_rshift
        // Element-wise variable right shift (per-lane shift counts) for integral
        // 128-bit batches. Signed types use an arithmetic shift (sign-extending),
        // unsigned types a logical shift (zero-filling).
        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2_128>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    // Arithmetic shift; AVX2 only provides this for 32-bit lanes.
                    return _mm_srav_epi32(self, other);
                }
                else
                {
                    // No _mm_srav_epi64 in AVX2 — defer to the avx_128 kernel.
                    return bitwise_rshift(self, other, avx_128 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_srlv_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm_srlv_epi64(self, other);
                }
                else
                {
                    // 8/16-bit lanes have no variable-shift intrinsic here.
                    return bitwise_rshift(self, other, avx_128 {});
                }
            }
        }
91+
92+
// load_masked
93+
template <class A, bool... Values, class Mode>
94+
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2_128>) noexcept
95+
{
96+
return _mm_maskload_epi32(mem, mask.as_batch());
97+
}
98+
template <class A, bool... Values, class Mode>
99+
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx2_128>) noexcept
100+
{
101+
return _mm_maskload_epi32((int32_t*)mem, mask.as_batch());
102+
}
103+
template <class A, bool... Values, class Mode>
104+
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
105+
{
106+
return _mm_maskload_epi64(mem, mask.as_batch());
107+
}
108+
template <class A, bool... Values, class Mode>
109+
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
110+
{
111+
return _mm_maskload_epi64((int64_t*)mem, mask.as_batch());
112+
}
113+
114+
// store_masked
115+
template <class A, bool... Values, class Mode>
116+
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
117+
{
118+
return _mm_maskstore_epi32(mem, mask.as_batch(), src);
119+
}
120+
template <class A, bool... Values, class Mode>
121+
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
122+
{
123+
return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src);
124+
}
125+
template <class A, bool... Values, class Mode>
126+
XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
127+
{
128+
return _mm_maskstore_epi64(mem, mask.as_batch(), src);
129+
}
130+
template <class A, bool... Values, class Mode>
131+
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
132+
{
133+
return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src);
134+
}
135+
136+
        // gather
        // Hardware gathers: load each lane from src[index[i]] using the AVX2
        // gather intrinsics. The first (ignored) batch argument only carries the
        // destination type for overload resolution; the scale is sizeof(T) so
        // indices are element indices, not byte offsets.
        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                        kernel::requires_arch<avx2_128>) noexcept
        {
            return _mm_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
        }

        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                        kernel::requires_arch<avx2_128>) noexcept
        {
            // 64-bit indices gathering 64-bit elements (two lanes per register).
            return _mm_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
        }

        template <class A, class U,
                  detail::enable_sized_integral_t<U, 4> = 0>
        XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
                                            batch<U, A> const& index,
                                            kernel::requires_arch<avx2_128>) noexcept
        {
            return _mm_i32gather_ps(src, index, sizeof(float));
        }

        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
        XSIMD_INLINE batch<double, A> gather(batch<double, A> const&, double const* src,
                                             batch<U, A> const& index,
                                             requires_arch<avx2_128>) noexcept
        {
            return _mm_i64gather_pd(src, index, sizeof(double));
        }
167+
}
168+
}
169+
170+
#endif

include/xsimd/arch/xsimd_isa.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363

6464
#if XSIMD_WITH_AVX2
6565
#include "./xsimd_avx2.hpp"
66+
#include "./xsimd_avx2_128.hpp"
6667
#endif
6768

6869
#if XSIMD_WITH_FMA3_AVX2

include/xsimd/config/xsimd_arch.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ namespace xsimd
163163

164164
using all_x86_architectures = arch_list<
165165
avx512vnni<avx512vbmi2>, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
166-
avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, avx_128, fma4, fma3<sse4_2>,
166+
avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, avx2_128, avx_128, fma4, fma3<sse4_2>,
167167
sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
168168

169169
using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;

include/xsimd/types/xsimd_avx2_register.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,18 @@ namespace xsimd
2828
static constexpr char const* name() noexcept { return "avx2"; }
2929
};
3030

31+
    /**
     * @ingroup architectures
     *
     * AVX2 instruction set extension operating on 128-bit registers.
     */
    struct avx2_128 : avx_128
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx2/128"; }
    };
42+
3143
#if XSIMD_WITH_AVX2
3244

3345
#if !XSIMD_WITH_AVX
@@ -37,6 +49,7 @@ namespace xsimd
3749
namespace types
3850
{
3951
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
52+
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2_128, avx_128);
4053
}
4154
#endif
4255
}

0 commit comments

Comments
 (0)