Skip to content

Commit 3a61c90

Browse files
Tentative support for AVX2 extensions to 128-bit registers
1 parent 9748316 commit 3a61c90

5 files changed

Lines changed: 193 additions & 1 deletion

File tree

.github/workflows/linux.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ jobs:
1717
- { compiler: 'gcc', version: '12', flags: 'force_no_instr_set' }
1818
- { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' }
1919
- { compiler: 'gcc', version: '14', flags: 'avx' }
20+
- { compiler: 'gcc', version: '14', flags: 'avx2' }
2021
- { compiler: 'gcc', version: '13', flags: 'avx512' }
2122
- { compiler: 'gcc', version: '10', flags: 'avx512' }
2223
- { compiler: 'gcc', version: '12', flags: 'i386' }
@@ -30,6 +31,7 @@ jobs:
3031
- { compiler: 'clang', version: '17', flags: 'sse3' }
3132
- { compiler: 'clang', version: '18', flags: 'avx512' }
3233
- { compiler: 'clang', version: '18', flags: 'avx_128' }
34+
- { compiler: 'clang', version: '18', flags: 'avx2_128' }
3335
steps:
3436
- name: Setup compiler
3537
if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -80,6 +82,12 @@ jobs:
8082
if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then
8183
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge -DXSIMD_DEFAULT_ARCH=avx_128"
8284
fi
85+
if [[ '${{ matrix.sys.flags }}' == 'avx2' ]]; then
86+
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell"
87+
fi
88+
if [[ '${{ matrix.sys.flags }}' == 'avx2_128' ]]; then
89+
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell -DXSIMD_DEFAULT_ARCH=avx2_128"
90+
fi
8391
if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
8492
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
8593
fi
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
/***************************************************************************
2+
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3+
* Martin Renou *
4+
* Copyright (c) QuantStack *
5+
* Copyright (c) Serge Guelton *
6+
* Copyright (c) Marco Barbone *
7+
* *
8+
* Distributed under the terms of the BSD 3-Clause License. *
9+
* *
10+
* The full license is in the file LICENSE, distributed with this software. *
11+
****************************************************************************/
12+
13+
#ifndef XSIMD_AVX2_128_HPP
14+
#define XSIMD_AVX2_128_HPP
15+
16+
#include <type_traits>
17+
18+
#include "../types/xsimd_avx2_register.hpp"
19+
#include "../types/xsimd_batch_constant.hpp"
20+
21+
namespace xsimd
22+
{
23+
namespace kernel
24+
{
25+
using namespace types;
26+
27+
        // select
        // Blend between two integral 128-bit batches using a compile-time mask.
        template <class A, class T, bool... Values, class = std::enable_if_t<std::is_integral<T>::value>>
        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2_128>) noexcept
        {
            // One bit per element, derived from the Values... pack at compile time.
            constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                // AVX2 immediate blend on 32-bit lanes: a set mask bit picks true_br.
                return _mm_blend_epi32(false_br, true_br, mask);
            }
            else
            {
                // Other element widths: defer to the plain AVX 128-bit implementation.
                return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, avx_128 {});
            }
        }
41+
42+
// bitwise_lshift
43+
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
44+
XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2_128>) noexcept
45+
{
46+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
47+
{
48+
return _mm_sllv_epi32(self, other);
49+
}
50+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
51+
{
52+
return _mm_sllv_epi64(self, other);
53+
}
54+
else
55+
{
56+
return bitwise_lshift(self, other, avx {});
57+
}
58+
}
59+
60+
        // bitwise_rshift
        // Element-wise variable right shift (per-lane shift counts) for integral
        // 128-bit batches. Signed types use an arithmetic shift (sign-extending),
        // unsigned types a logical shift (zero-filling).
        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2_128>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    // Arithmetic shift; AVX2 only provides this for 32-bit lanes.
                    return _mm_srav_epi32(self, other);
                }
                else
                {
                    // No _mm_srav_epi64 in AVX2 — defer to the avx_128 kernel.
                    return bitwise_rshift(self, other, avx_128 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_srlv_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm_srlv_epi64(self, other);
                }
                else
                {
                    // 8/16-bit lanes have no variable-shift intrinsic here.
                    return bitwise_rshift(self, other, avx_128 {});
                }
            }
        }
91+
92+
// load_masked
93+
template <class A, bool... Values, class Mode>
94+
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2_128>) noexcept
95+
{
96+
return _mm_maskload_epi32(mem, mask.as_batch());
97+
}
98+
template <class A, bool... Values, class Mode>
99+
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx2_128>) noexcept
100+
{
101+
return _mm_maskload_epi32((int32_t*)mem, mask.as_batch());
102+
}
103+
template <class A, bool... Values, class Mode>
104+
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
105+
{
106+
return _mm_maskload_epi64(mem, mask.as_batch());
107+
}
108+
template <class A, bool... Values, class Mode>
109+
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
110+
{
111+
return _mm_maskload_epi64((int64_t*)mem, mask.as_batch());
112+
}
113+
114+
// store_masked
115+
template <class A, bool... Values, class Mode>
116+
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
117+
{
118+
return _mm_maskstore_epi32(mem, mask.as_batch(), src);
119+
}
120+
template <class A, bool... Values, class Mode>
121+
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
122+
{
123+
return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src);
124+
}
125+
template <class A, bool... Values, class Mode>
126+
XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
127+
{
128+
return _mm_maskstore_epi64(mem, mask.as_batch(), src);
129+
}
130+
template <class A, bool... Values, class Mode>
131+
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
132+
{
133+
return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src);
134+
}
135+
136+
        // gather
        // Hardware gathers: load each lane from src[index[i]] using the AVX2
        // gather intrinsics. The first (ignored) batch argument only carries the
        // destination type for overload resolution; the scale is sizeof(T) so
        // indices are element indices, not byte offsets.
        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                        kernel::requires_arch<avx2_128>) noexcept
        {
            return _mm_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
        }

        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                        kernel::requires_arch<avx2_128>) noexcept
        {
            // 64-bit indices gathering 64-bit elements (two lanes per register).
            return _mm_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
        }

        template <class A, class U,
                  detail::enable_sized_integral_t<U, 4> = 0>
        XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
                                            batch<U, A> const& index,
                                            kernel::requires_arch<avx2_128>) noexcept
        {
            return _mm_i32gather_ps(src, index, sizeof(float));
        }

        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
        XSIMD_INLINE batch<double, A> gather(batch<double, A> const&, double const* src,
                                             batch<U, A> const& index,
                                             requires_arch<avx2_128>) noexcept
        {
            return _mm_i64gather_pd(src, index, sizeof(double));
        }
167+
}
168+
}
169+
170+
#endif

include/xsimd/arch/xsimd_isa.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363

6464
#if XSIMD_WITH_AVX2
6565
#include "./xsimd_avx2.hpp"
66+
#include "./xsimd_avx2_128.hpp"
6667
#endif
6768

6869
#if XSIMD_WITH_FMA3_AVX2

include/xsimd/config/xsimd_arch.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ namespace xsimd
163163

164164
using all_x86_architectures = arch_list<
165165
avx512vnni<avx512vbmi2>, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
166-
avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, avx_128, fma4, fma3<sse4_2>,
166+
avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, avx2_128, avx_128, fma4, fma3<sse4_2>,
167167
sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
168168

169169
using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;

include/xsimd/types/xsimd_avx2_register.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,18 @@ namespace xsimd
2828
static constexpr char const* name() noexcept { return "avx2"; }
2929
};
3030

31+
    /**
     * @ingroup architectures
     *
     * AVX2 instruction set extension operating on 128-bit registers.
     */
    struct avx2_128 : avx_128
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx2/128"; }
    };
42+
3143
#if XSIMD_WITH_AVX2
3244

3345
#if !XSIMD_WITH_AVX
@@ -37,6 +49,7 @@ namespace xsimd
3749
namespace types
3850
{
3951
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
52+
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2_128, avx_128);
4053
}
4154
#endif
4255
}

0 commit comments

Comments
 (0)