Skip to content

Commit fe2938e

Browse files
committed
fix: avx512vl masked load/store reach per-arch EVEX intrinsics
1 parent 7d30b9c commit fe2938e

16 files changed

Lines changed: 370 additions & 159 deletions

.github/workflows/linux.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ jobs:
100100
fi
101101
if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then
102102
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
103-
CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128"
103+
CXXFLAGS="$CXXFLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128"
104104
fi
105105
if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then
106106
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#ifndef XSIMD_COMMON_MEMORY_HPP
1313
#define XSIMD_COMMON_MEMORY_HPP
1414

15+
#include "../../types/xsimd_avx512vl_register.hpp"
1516
#include "../../types/xsimd_batch_constant.hpp"
1617
#include "./xsimd_common_details.hpp"
1718

@@ -388,57 +389,64 @@ namespace xsimd
388389
}
389390
}
390391

392+
// Integer→float reinterpret bridges. Excluded for AVX-512VL archs which provide
393+
// their own EVEX masked integer ovlds; without the exclusion gcc-10 sees the bridge
394+
// and the VL native as equally specialized for A=avx512vl_*. (bridge_not_vl in fwd.hpp)
391395
template <class A, bool... Values, class Mode>
392-
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
396+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<int32_t, A>>
397+
load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
393398
{
394399
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
395400
return bitwise_cast<int32_t>(f);
396401
}
397402

398403
template <class A, bool... Values, class Mode>
399-
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
404+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<uint32_t, A>>
405+
load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
400406
{
401407
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
402408
return bitwise_cast<uint32_t>(f);
403409
}
404410

405411
template <class A, bool... Values, class Mode>
406-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>>
412+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<int64_t, A>>
407413
load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept
408414
{
409415
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
410416
return bitwise_cast<int64_t>(d);
411417
}
412418

413419
template <class A, bool... Values, class Mode>
414-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>>
420+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<uint64_t, A>>
415421
load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept
416422
{
417423
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
418424
return bitwise_cast<uint64_t>(d);
419425
}
420426

421427
template <class A, bool... Values, class Mode>
422-
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
428+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>
429+
store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
423430
{
424431
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
425432
}
426433

427434
template <class A, bool... Values, class Mode>
428-
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
435+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>
436+
store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
429437
{
430438
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
431439
}
432440

433441
template <class A, bool... Values, class Mode>
434-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
442+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>
435443
store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept
436444
{
437445
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
438446
}
439447

440448
template <class A, bool... Values, class Mode>
441-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
449+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>
442450
store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept
443451
{
444452
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -993,19 +993,20 @@ namespace xsimd
993993
{
994994
using int_t = as_integer_t<T>;
995995
constexpr size_t half_size = batch<T, A>::size / 2;
996+
using half_arch = typename ::xsimd::make_sized_batch_t<T, half_size>::arch_type;
996997

997-
// confined to lower 128-bit half → forward to 128 bit
998+
// lower 128-bit half
998999
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
9991000
{
1000-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
1001-
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
1001+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(batch_bool_constant<int_t, A, Values...> {});
1002+
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, half_arch {});
10021003
return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
10031004
}
1004-
// confined to upper 128-bit half → forward to 128 bit
1005+
// upper 128-bit half
10051006
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
10061007
{
1007-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
1008-
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
1008+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
1009+
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, half_arch {});
10091010
return detail::zero_extend<A>(hi);
10101011
}
10111012
else
@@ -1019,36 +1020,39 @@ namespace xsimd
10191020
namespace detail
10201021
{
10211022
template <class A>
1022-
XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept
1023+
XSIMD_INLINE void maskstore(float* mem, batch<as_integer_t<float>, A> const& mask, batch<float, A> const& src) noexcept
10231024
{
10241025
_mm256_maskstore_ps(mem, mask, src);
10251026
}
10261027

10271028
template <class A>
1028-
XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept
1029+
XSIMD_INLINE void maskstore(double* mem, batch<as_integer_t<double>, A> const& mask, batch<double, A> const& src) noexcept
10291030
{
10301031
_mm256_maskstore_pd(mem, mask, src);
10311032
}
10321033
}
10331034

1034-
template <class A, class T, bool... Values, class Mode>
1035+
template <class A, class T, bool... Values, class Mode,
1036+
typename = std::enable_if_t<std::is_floating_point<T>::value>>
10351037
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
10361038
{
10371039
constexpr size_t half_size = batch<T, A>::size / 2;
1040+
using half_batch = ::xsimd::make_sized_batch_t<T, half_size>;
1041+
using half_arch = typename half_batch::arch_type;
10381042

1039-
// confined to lower 128-bit half → forward to 128 bit
1043+
// lower 128-bit half
10401044
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
10411045
{
1042-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
1043-
const auto lo = detail::lower_half(src);
1044-
store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
1046+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
1047+
const half_batch lo = detail::lower_half(src);
1048+
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
10451049
}
1046-
// confined to upper 128-bit half → forward to 128 bit
1050+
// upper 128-bit half
10471051
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
10481052
{
1049-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
1050-
const auto hi = detail::upper_half(src);
1051-
store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
1053+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
1054+
const half_batch hi = detail::upper_half(src);
1055+
store_masked<half_arch>(mem + half_size, hi, mhi, Mode {}, half_arch {});
10521056
}
10531057
else
10541058
{

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#define XSIMD_AVX2_HPP
1414

1515
#include "../types/xsimd_avx2_register.hpp"
16+
#include "../types/xsimd_avx512vl_register.hpp"
1617
#include "../types/xsimd_batch_constant.hpp"
1718
#include "./utils/shifts.hpp"
1819

@@ -138,7 +139,8 @@ namespace xsimd
138139
}
139140

140141
// single templated implementation for integer masked loads (32/64-bit)
141-
template <class A, class T, bool... Values, class Mode>
142+
template <class A, class T, bool... Values, class Mode,
143+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
142144
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
143145
load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
144146
{
@@ -148,26 +150,30 @@ namespace xsimd
148150
return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
149151
}
150152

151-
template <class A, bool... Values, class Mode>
153+
template <class A, bool... Values, class Mode,
154+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
152155
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2>) noexcept
153156
{
154157
return load_masked<A, int32_t>(mem, mask, convert<int32_t> {}, Mode {}, avx2 {});
155158
}
156159

157-
template <class A, bool... Values, class Mode>
160+
template <class A, bool... Values, class Mode,
161+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
158162
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<avx2>) noexcept
159163
{
160164
const auto r = load_masked<A, int32_t>(reinterpret_cast<int32_t const*>(mem), batch_bool_constant<int32_t, A, Values...> {}, convert<int32_t> {}, Mode {}, avx2 {});
161165
return bitwise_cast<uint32_t>(r);
162166
}
163167

164-
template <class A, bool... Values, class Mode>
168+
template <class A, bool... Values, class Mode,
169+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
165170
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2>) noexcept
166171
{
167172
return load_masked<A, int64_t>(mem, mask, convert<int64_t> {}, Mode {}, avx2 {});
168173
}
169174

170-
template <class A, bool... Values, class Mode>
175+
template <class A, bool... Values, class Mode,
176+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
171177
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<avx2>) noexcept
172178
{
173179
const auto r = load_masked<A, int64_t>(reinterpret_cast<int64_t const*>(mem), batch_bool_constant<int64_t, A, Values...> {}, convert<int64_t> {}, Mode {}, avx2 {});
@@ -190,39 +196,44 @@ namespace xsimd
190196
}
191197
}
192198

193-
template <class A, class T, bool... Values, class Mode>
199+
template <class A, class T, bool... Values, class Mode,
200+
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4) && std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
194201
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
195202
{
196203
constexpr size_t lanes_per_half = batch<T, A>::size / 2;
204+
using half_batch = ::xsimd::make_sized_batch_t<T, lanes_per_half>;
205+
using half_arch = typename half_batch::arch_type;
197206

198-
// confined to lower 128-bit half → forward to SSE
207+
// lower 128-bit half
199208
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
200209
{
201-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
202-
const auto lo = detail::lower_half(src);
203-
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
210+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
211+
const half_batch lo = detail::lower_half(src);
212+
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
204213
}
205-
// confined to upper 128-bit half → forward to SSE
214+
// upper 128-bit half
206215
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
207216
{
208-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
209-
const auto hi = detail::upper_half(src);
210-
store_masked<sse4_2>(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {});
217+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
218+
const half_batch hi = detail::upper_half(src);
219+
store_masked<half_arch>(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {});
211220
}
212221
else
213222
{
214223
detail::maskstore<T, A>(mem, mask.as_batch(), src);
215224
}
216225
}
217226

218-
template <class A, bool... Values, class Mode>
219-
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
227+
template <class A, bool... Values, class Mode,
228+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
229+
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
220230
{
221231
const auto s32 = bitwise_cast<int32_t>(src);
222-
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, mask, Mode {}, avx2 {});
232+
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});
223233
}
224234

225-
template <class A, bool... Values, class Mode>
235+
template <class A, bool... Values, class Mode,
236+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
226237
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
227238
{
228239
const auto s64 = bitwise_cast<int64_t>(src);

0 commit comments

Comments
 (0)