Commit 90c6d8c

Tentative support for avx extensions to 128 bit registers
1 parent e80ac4f commit 90c6d8c

5 files changed

Lines changed: 191 additions & 8 deletions

.github/workflows/linux.yml

Lines changed: 4 additions & 0 deletions
@@ -29,6 +29,7 @@ jobs:
           - { compiler: 'clang', version: '17', flags: 'avx' }
           - { compiler: 'clang', version: '17', flags: 'sse3' }
           - { compiler: 'clang', version: '18', flags: 'avx512' }
+          - { compiler: 'clang', version: '18', flags: 'avx_128' }
     steps:
       - name: Setup compiler
         if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -76,6 +77,9 @@ jobs:
         if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge"
         fi
+        if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then
+          CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge -DXSIMD_DEFAULT_ARCH=avx_128"
+        fi
         if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
         fi

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 8 additions & 8 deletions
@@ -994,18 +994,18 @@ namespace xsimd
             using int_t = as_integer_t<T>;
             constexpr size_t half_size = batch<T, A>::size / 2;

-            // confined to lower 128-bit half → forward to SSE2
+            // confined to lower 128-bit half → forward to 128 bit
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
-                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
+                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
                 return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
             }
-            // confined to upper 128-bit half → forward to SSE2
+            // confined to upper 128-bit half → forward to 128 bit
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
-                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, sse4_2 {});
+                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
                 return detail::zero_extend<A>(hi);
             }
             else
@@ -1036,19 +1036,19 @@
         {
             constexpr size_t half_size = batch<T, A>::size / 2;

-            // confined to lower 128-bit half → forward to SSE2
+            // confined to lower 128-bit half → forward to 128 bit
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                 const auto lo = detail::lower_half(src);
-                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+                store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
             }
-            // confined to upper 128-bit half → forward to SSE2
+            // confined to upper 128-bit half → forward to 128 bit
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
                 const auto hi = detail::upper_half(src);
-                store_masked<sse4_2>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
+                store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
             }
             else
             {
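
For context on the two branches rewritten above: when every active lane of the compile-time mask falls inside one 128-bit half of the 256-bit batch, the AVX kernel now delegates that half to the new avx_128 kernel instead of the plain sse4_2 one. Below is a rough sketch of a call that would take the lower-half branch. It drives the internal kernel entry point directly, mirroring the calls in the hunk; the use of xsimd::aligned_mode as the Mode argument and the exact overload shape are assumptions for illustration, not part of the commit.

    #include "xsimd/xsimd.hpp"

    int main()
    {
        alignas(32) float data[8] = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f };

        // Only lanes 0 and 1 are active, so countl_zero() >= half_size (4):
        // the load is confined to the lower 128-bit half, forwarded to the
        // avx_128 kernel, then zero-extended back to a 256-bit batch.
        using mask_t = xsimd::batch_bool_constant<float, xsimd::avx,
                                                  true, true, false, false,
                                                  false, false, false, false>;

        auto v = xsimd::kernel::load_masked(data, mask_t {},
                                            xsimd::kernel::convert<float> {},
                                            xsimd::aligned_mode {}, xsimd::avx {});
        return static_cast<int>(v.get(1)); // expected 1; inactive lanes read as 0
    }
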
include/xsimd/arch/xsimd_avx_128.hpp

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 * Copyright (c) Marco Barbone                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ****************************************************************************/

#ifndef XSIMD_AVX_128_HPP
#define XSIMD_AVX_128_HPP

#include <type_traits>

#include "../types/xsimd_avx_register.hpp"
#include "../types/xsimd_batch_constant.hpp"

namespace xsimd
{
    namespace kernel
    {
        using namespace types;

        // broadcast
        template <class A, class T, class = std::enable_if_t<std::is_same<T, float>::value>>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx_128>) noexcept
        {
            return _mm_broadcast_ss(&val);
        }

        // eq
        template <class A>
        XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_ps(self, other, _CMP_EQ_OQ);
        }
        template <class A>
        XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_pd(self, other, _CMP_EQ_OQ);
        }

        // gt
        template <class A>
        XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_ps(self, other, _CMP_GT_OQ);
        }
        template <class A>
        XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_pd(self, other, _CMP_GT_OQ);
        }

        // ge
        template <class A>
        XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_ps(self, other, _CMP_GE_OQ);
        }
        template <class A>
        XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_pd(self, other, _CMP_GE_OQ);
        }

        // lt
        template <class A>
        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_ps(self, other, _CMP_LT_OQ);
        }
        template <class A>
        XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_pd(self, other, _CMP_LT_OQ);
        }

        // le
        template <class A>
        XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_ps(self, other, _CMP_LE_OQ);
        }
        template <class A>
        XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_pd(self, other, _CMP_LE_OQ);
        }

        // neq
        template <class A>
        XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_ps(self, other, _CMP_NEQ_UQ);
        }
        template <class A>
        XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
        {
            return _mm_cmp_pd(self, other, _CMP_NEQ_UQ);
        }

        // load_masked
        template <class A, bool... Values, class Mode>
        XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
        {
            return _mm_maskload_ps(mem, mask.as_batch());
        }
        template <class A, bool... Values, class Mode>
        XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
        {
            return _mm_maskload_pd(mem, mask.as_batch());
        }

        // store_masked
        template <class A, bool... Values, class Mode>
        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
        {
            return _mm_maskstore_ps(mem, mask.as_batch(), src);
        }

        template <class A, bool... Values, class Mode>
        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
        {
            return _mm_maskstore_pd(mem, mask.as_batch(), src);
        }

        // swizzle (dynamic mask)
        template <class A, class T, class ITy, class = std::enable_if_t<std::is_floating_point<T>::value && sizeof(T) == sizeof(ITy)>>
        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept
        {
            XSIMD_IF_CONSTEXPR(std::is_same<T, float>::value)
            {
                return _mm_permutevar_ps(self, mask);
            }
            else
            {
                // FIXME: _mm_permutevar_pd fails validation, but it shouldn't o_O
                return swizzle(self, mask, sse4_2 {});
                // return _mm_permutevar_pd(self, mask);
            }
        }

        // swizzle (constant mask)
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<avx_128>) noexcept
        {
            return _mm_permute_ps(self, detail::mod_shuffle(V0, V1, V2, V3));
        }

        template <class A, uint32_t V0, uint32_t V1>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<avx_128>) noexcept
        {
            return _mm_permute_pd(self, detail::mod_shuffle(V0, V1));
        }

    }
}

#endif
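
A short usage sketch of the kernels above, driven through the public batch API. Whether the public swizzle and comparison operators dispatch to exactly these avx_128 overloads is an assumption; only the kernel signatures come from the new file itself.

    #include <cstdint>
    #include <iostream>
    #include "xsimd/xsimd.hpp"

    int main()
    {
        using arch = xsimd::avx_128;
        xsimd::batch<float, arch> v(0.f, 1.f, 2.f, 3.f);

        // Constant-mask swizzle: expected to reach the _mm_permute_ps overload above.
        auto rev = xsimd::swizzle(v, xsimd::batch_constant<uint32_t, arch, 3, 2, 1, 0> {});

        // Comparison: expected to reach the VEX-encoded _mm_cmp_ps path above.
        auto hit = (rev == xsimd::batch<float, arch>(3.f, 2.f, 1.f, 0.f));

        std::cout << rev.get(0) << ' ' << hit.mask() << '\n'; // expected "3 15"
    }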

include/xsimd/arch/xsimd_isa.hpp

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@

 #if XSIMD_WITH_AVX
 #include "./xsimd_avx.hpp"
+#include "./xsimd_avx_128.hpp"
 #endif

 #if XSIMD_WITH_FMA3_AVX

include/xsimd/types/xsimd_avx_register.hpp

Lines changed: 15 additions & 0 deletions
@@ -13,6 +13,7 @@
 #define XSIMD_AVX_REGISTER_HPP

 #include "./xsimd_common_arch.hpp"
+#include "./xsimd_sse4_2_register.hpp"

 namespace xsimd
 {
@@ -30,6 +31,18 @@ namespace xsimd
         static constexpr bool requires_alignment() noexcept { return true; }
         static constexpr char const* name() noexcept { return "avx"; }
     };
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX instructions extension for 128 bits registers
+     */
+    struct avx_128 : sse4_2
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx/128"; }
+    };
 }

 #if XSIMD_WITH_AVX
@@ -58,6 +71,8 @@
         XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
         XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
         XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx_128, sse4_2);
     }
 }
 #endif
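
The register header above turns avx_128 into a regular architecture tag: it derives from sse4_2, so kernels that lack an avx_128 overload can still bind to the sse4_2 ones, and XSIMD_DECLARE_SIMD_REGISTER_ALIAS makes its batches reuse the 128-bit register types. A minimal sketch of that surface, using only the members shown in the hunks:

    #include <iostream>
    #include <type_traits>
    #include "xsimd/xsimd.hpp"

    int main()
    {
        // avx_128 extends the sse4_2 tag, so sse4_2 kernels remain reachable
        // as fallbacks during overload resolution.
        static_assert(std::is_base_of<xsimd::sse4_2, xsimd::avx_128>::value,
                      "avx_128 extends the sse4_2 architecture tag");

        std::cout << xsimd::avx_128::name() << '\n';                      // "avx/128"
        std::cout << xsimd::batch<double, xsimd::avx_128>::size << '\n';  // 2 lanes in a 128-bit register
    }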
