Skip to content

Commit db9038c

Browse files
Tentative support for AVX extensions to 128-bit registers
1 parent e80ac4f commit db9038c

5 files changed

Lines changed: 190 additions & 8 deletions

File tree

.github/workflows/linux.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ jobs:
2929
- { compiler: 'clang', version: '17', flags: 'avx' }
3030
- { compiler: 'clang', version: '17', flags: 'sse3' }
3131
- { compiler: 'clang', version: '18', flags: 'avx512' }
32+
- { compiler: 'clang', version: '18', flags: 'avx_128' }
3233
steps:
3334
- name: Setup compiler
3435
if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -76,6 +77,9 @@ jobs:
7677
if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then
7778
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge"
7879
fi
80+
if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then
81+
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge -DXSIMD_DEFAULT_ARCH=avx_128"
82+
fi
7983
if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
8084
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
8185
fi

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -994,18 +994,18 @@ namespace xsimd
994994
using int_t = as_integer_t<T>;
995995
constexpr size_t half_size = batch<T, A>::size / 2;
996996

997-
// confined to lower 128-bit half → forward to SSE2
997+
// confined to lower 128-bit half → forward to 128 bit
998998
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
999999
{
10001000
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
1001-
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
1001+
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
10021002
return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
10031003
}
1004-
// confined to upper 128-bit half → forward to SSE2
1004+
// confined to upper 128-bit half → forward to 128 bit
10051005
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
10061006
{
10071007
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
1008-
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, sse4_2 {});
1008+
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
10091009
return detail::zero_extend<A>(hi);
10101010
}
10111011
else
@@ -1036,19 +1036,19 @@ namespace xsimd
10361036
{
10371037
constexpr size_t half_size = batch<T, A>::size / 2;
10381038

1039-
// confined to lower 128-bit half → forward to SSE2
1039+
// confined to lower 128-bit half → forward to 128 bit
10401040
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
10411041
{
10421042
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
10431043
const auto lo = detail::lower_half(src);
1044-
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
1044+
store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
10451045
}
1046-
// confined to upper 128-bit half → forward to SSE2
1046+
// confined to upper 128-bit half → forward to 128 bit
10471047
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
10481048
{
10491049
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
10501050
const auto hi = detail::upper_half(src);
1051-
store_masked<sse4_2>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
1051+
store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
10521052
}
10531053
else
10541054
{
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
/***************************************************************************
2+
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3+
* Martin Renou *
4+
* Copyright (c) QuantStack *
5+
* Copyright (c) Serge Guelton *
6+
* Copyright (c) Marco Barbone *
7+
* *
8+
* Distributed under the terms of the BSD 3-Clause License. *
9+
* *
10+
* The full license is in the file LICENSE, distributed with this software. *
11+
****************************************************************************/
12+
13+
#ifndef XSIMD_AVX_128_HPP
14+
#define XSIMD_AVX_128_HPP
15+
16+
#include <complex>
17+
#include <limits>
18+
#include <type_traits>
19+
20+
#include "../types/xsimd_avx_register.hpp"
21+
#include "../types/xsimd_batch_constant.hpp"
22+
23+
namespace xsimd
24+
{
25+
namespace kernel
26+
{
27+
using namespace types;
28+
29+
// broadcast
30+
template<class A, class T, class = std::enable_if_t<std::is_same<T, float>::value>>
31+
XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx_128>) noexcept {
32+
return _mm_broadcast_ss(&val);
33+
}
34+
35+
// eq
36+
template <class A>
37+
XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
38+
{
39+
return _mm_cmp_ps(self, other, _CMP_EQ_OQ);
40+
}
41+
template <class A>
42+
XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
43+
{
44+
return _mm_cmp_pd(self, other, _CMP_EQ_OQ);
45+
}
46+
47+
// gt
48+
template <class A>
49+
XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
50+
{
51+
return _mm_cmp_ps(self, other, _CMP_GT_OQ);
52+
}
53+
template <class A>
54+
XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
55+
{
56+
return _mm_cmp_pd(self, other, _CMP_GT_OQ);
57+
}
58+
59+
// ge
60+
template <class A>
61+
XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
62+
{
63+
return _mm_cmp_ps(self, other, _CMP_GE_OQ);
64+
}
65+
template <class A>
66+
XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
67+
{
68+
return _mm_cmp_pd(self, other, _CMP_GE_OQ);
69+
}
70+
71+
// lt
72+
template <class A>
73+
XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
74+
{
75+
return _mm_cmp_ps(self, other, _CMP_LT_OQ);
76+
}
77+
template <class A>
78+
XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
79+
{
80+
return _mm_cmp_pd(self, other, _CMP_LT_OQ);
81+
}
82+
83+
// le
84+
template <class A>
85+
XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
86+
{
87+
return _mm_cmp_ps(self, other, _CMP_LE_OQ);
88+
}
89+
template <class A>
90+
XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
91+
{
92+
return _mm_cmp_pd(self, other, _CMP_LE_OQ);
93+
}
94+
95+
// neq
96+
template <class A>
97+
XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
98+
{
99+
return _mm_cmp_ps(self, other, _CMP_NEQ_UQ);
100+
}
101+
template <class A>
102+
XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
103+
{
104+
return _mm_cmp_pd(self, other, _CMP_NEQ_UQ);
105+
}
106+
107+
// load_masked
108+
template <class A, bool... Values, class Mode>
109+
XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
110+
{
111+
return _mm_maskload_ps(mem, mask.as_batch());
112+
}
113+
template <class A, bool... Values, class Mode>
114+
XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
115+
{
116+
return _mm_maskload_pd(mem, mask.as_batch());
117+
}
118+
119+
// store_masked
120+
template <class A, bool... Values, class Mode>
121+
XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
122+
{
123+
return _mm_maskstore_ps(mem, mask.as_batch(), src);
124+
}
125+
126+
template <class A, bool... Values, class Mode>
127+
XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
128+
{
129+
return _mm_maskstore_pd(mem, mask.as_batch(), src);
130+
}
131+
132+
// swizzle (dynamic mask)
133+
template <class A, class T, class ITy, class = std::enable_if_t<std::is_floating_point<T>::value && sizeof(T) == sizeof(ITy)>>
134+
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept
135+
{
136+
XSIMD_IF_CONSTEXPR(std::is_same<T, float>::value) {
137+
return _mm_permutevar_ps(self, mask);
138+
}
139+
else {
140+
// FIXME: _mm_permutevar_pd fails validation, but it shouldn't o_O
141+
return swizzle(self, mask, sse4_2{});
142+
//return _mm_permutevar_pd(self, mask);
143+
}
144+
}
145+
146+
// swizzle (constant mask)
147+
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
148+
XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<avx_128>) noexcept
149+
{
150+
return _mm_permute_ps(self, detail::mod_shuffle(V0, V1, V2, V3));
151+
}
152+
153+
template <class A, uint32_t V0, uint32_t V1>
154+
XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<avx_128>) noexcept
155+
{
156+
return _mm_permute_pd(self, detail::mod_shuffle(V0, V1));
157+
}
158+
159+
}
160+
}
161+
162+
#endif

include/xsimd/arch/xsimd_isa.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050

5151
#if XSIMD_WITH_AVX
5252
#include "./xsimd_avx.hpp"
53+
#include "./xsimd_avx_128.hpp"
5354
#endif
5455

5556
#if XSIMD_WITH_FMA3_AVX

include/xsimd/types/xsimd_avx_register.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#define XSIMD_AVX_REGISTER_HPP
1414

1515
#include "./xsimd_common_arch.hpp"
16+
#include "./xsimd_sse4_2_register.hpp"
1617

1718
namespace xsimd
1819
{
@@ -30,6 +31,18 @@ namespace xsimd
3031
static constexpr bool requires_alignment() noexcept { return true; }
3132
static constexpr char const* name() noexcept { return "avx"; }
3233
};
34+
35+
    /**
     * @ingroup architectures
     *
     * AVX instruction set extensions restricted to 128-bit registers.
     *
     * Derives from sse4_2 so every kernel not explicitly specialized for
     * avx_128 falls back to the SSE implementation; the register type is
     * likewise aliased to the sse4_2 one. Availability only requires that
     * the library was built with AVX support (XSIMD_WITH_AVX), since any
     * AVX-capable CPU can execute the 128-bit encodings.
     */
    struct avx_128 : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
        static constexpr bool available() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx/128"; }
    };
3346
}
3447

3548
#if XSIMD_WITH_AVX
@@ -58,6 +71,8 @@ namespace xsimd
5871
XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
5972
XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
6073
XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
74+
75+
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx_128, sse4_2);
6176
}
6277
}
6378
#endif

0 commit comments

Comments
 (0)