Skip to content

Commit ea882e6

Browse files
committed
fix(avx-512vl): masked load/store reach per-arch EVEX intrinsics
Three coupled changes; gcc-10 partial-ordering forces them into a single commit: 1) Rewrite avx512vl_128 / avx512vl_256 masked load/store. Adds the missing int64/uint64/float/double load_masked ovlds, corrects the batch_bool_constant typing on store_masked (was uint32_t/uint64_t for signed-int/float/double stores, now matches the value type), and branches aligned vs. unaligned to the right EVEX intrinsic. Unsigned ovlds delegate to the signed one via bitwise_cast. 2) Constrain the non-VL master ovlds (avx_128 float/double, avx2_128 int32/uint32 + int64/uint64, avx2 templated and int32/uint32/int64/ uint64) and the common-memory int<->float bridges with !is_base_of<avx512vl_*, A>. gcc-10's partial ordering otherwise sees a concrete requires_arch<X> and the inherited concrete requires_arch<Y> (Y a base of X) as equally specialized, likewise for templated bridge<A> vs. native<avx512vl_*> when A is VL. gcc-14 handles both cases naturally so this is a no-op there. The avx native gains an is_floating_point<T> SFINAE and the avx2 templated gains is_integral<T> && sizeof>=4 so the new half-fold dispatch (half_arch = avx for floats, avx2 for ints in a 512-bit batch) is unambiguous on gcc-10. 3) Resolve the half-fold target arch in avx / avx2 / avx512f through make_sized_batch_t<T, half>::arch_type so the dispatch picks avx512vl_128 / avx512vl_256 when available and emits EVEX vmovdqu32{k}{z} instead of VEX vpmaskmovd / vmaskmovps. (Without (3), (2)'s is_integral SFINAE on the avx2 templated form leaves the pre-existing avx512f.hpp:339 'store_masked<avx2>(float*, __m256, ...)' callsite with no matching ovld on gcc-10.) The xsimd_batch dispatch drops the explicit <A, T, U, Values...> args on the kernel::store_masked call so the SFINAE'd overload set can be resolved by ADL, and adds a fwd decl of make_sized_batch ahead of xsimd_isa.hpp so the half-fold sites can see the type at parse time. bridge_not_vl lives in xsimd_common_fwd next to the bridge fwd-decls; fwd.hpp now pulls xsimd_avx512vl_register so the trait sees complete types. The 4 redundant register.hpp includes that would otherwise be added at the point-of-use are dropped — they're reachable transitively through fwd.hpp.
1 parent 7c36cbc commit ea882e6

10 files changed

Lines changed: 333 additions & 140 deletions

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -388,57 +388,64 @@ namespace xsimd
388388
}
389389
}
390390

391+
// Integer→float reinterpret bridges. Excluded for AVX-512VL archs which provide
392+
// their own EVEX masked integer ovlds; without the exclusion gcc-10 sees the bridge
393+
// and the VL native as equally specialized for A=avx512vl_*. (bridge_not_vl in fwd.hpp)
391394
template <class A, bool... Values, class Mode>
392-
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
395+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<int32_t, A>>
396+
load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
393397
{
394398
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
395399
return bitwise_cast<int32_t>(f);
396400
}
397401

398402
template <class A, bool... Values, class Mode>
399-
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
403+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<uint32_t, A>>
404+
load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
400405
{
401406
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
402407
return bitwise_cast<uint32_t>(f);
403408
}
404409

405410
template <class A, bool... Values, class Mode>
406-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>>
411+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<int64_t, A>>
407412
load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept
408413
{
409414
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
410415
return bitwise_cast<int64_t>(d);
411416
}
412417

413418
template <class A, bool... Values, class Mode>
414-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>>
419+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<uint64_t, A>>
415420
load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept
416421
{
417422
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
418423
return bitwise_cast<uint64_t>(d);
419424
}
420425

421426
template <class A, bool... Values, class Mode>
422-
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
427+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>
428+
store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
423429
{
424430
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
425431
}
426432

427433
template <class A, bool... Values, class Mode>
428-
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
434+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>
435+
store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
429436
{
430437
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
431438
}
432439

433440
template <class A, bool... Values, class Mode>
434-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
441+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>
435442
store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept
436443
{
437444
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
438445
}
439446

440447
template <class A, bool... Values, class Mode>
441-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
448+
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>
442449
store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept
443450
{
444451
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -993,19 +993,20 @@ namespace xsimd
993993
{
994994
using int_t = as_integer_t<T>;
995995
constexpr size_t half_size = batch<T, A>::size / 2;
996+
using half_arch = typename ::xsimd::make_sized_batch_t<T, half_size>::arch_type;
996997

997-
// confined to lower 128-bit half → forward to 128 bit
998+
// lower 128-bit half
998999
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
9991000
{
1000-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
1001-
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
1001+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(batch_bool_constant<int_t, A, Values...> {});
1002+
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, half_arch {});
10021003
return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
10031004
}
1004-
// confined to upper 128-bit half → forward to 128 bit
1005+
// upper 128-bit half
10051006
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
10061007
{
1007-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
1008-
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
1008+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
1009+
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, half_arch {});
10091010
return detail::zero_extend<A>(hi);
10101011
}
10111012
else
@@ -1019,36 +1020,39 @@ namespace xsimd
10191020
namespace detail
10201021
{
10211022
template <class A>
1022-
XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept
1023+
XSIMD_INLINE void maskstore(float* mem, batch<as_integer_t<float>, A> const& mask, batch<float, A> const& src) noexcept
10231024
{
10241025
_mm256_maskstore_ps(mem, mask, src);
10251026
}
10261027

10271028
template <class A>
1028-
XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept
1029+
XSIMD_INLINE void maskstore(double* mem, batch<as_integer_t<double>, A> const& mask, batch<double, A> const& src) noexcept
10291030
{
10301031
_mm256_maskstore_pd(mem, mask, src);
10311032
}
10321033
}
10331034

1034-
template <class A, class T, bool... Values, class Mode>
1035+
template <class A, class T, bool... Values, class Mode,
1036+
typename = std::enable_if_t<std::is_floating_point<T>::value>>
10351037
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
10361038
{
10371039
constexpr size_t half_size = batch<T, A>::size / 2;
1040+
using half_batch = ::xsimd::make_sized_batch_t<T, half_size>;
1041+
using half_arch = typename half_batch::arch_type;
10381042

1039-
// confined to lower 128-bit half → forward to 128 bit
1043+
// lower 128-bit half
10401044
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
10411045
{
1042-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
1043-
const auto lo = detail::lower_half(src);
1044-
store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
1046+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
1047+
const half_batch lo = detail::lower_half(src);
1048+
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
10451049
}
1046-
// confined to upper 128-bit half → forward to 128 bit
1050+
// upper 128-bit half
10471051
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
10481052
{
1049-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
1050-
const auto hi = detail::upper_half(src);
1051-
store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
1053+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
1054+
const half_batch hi = detail::upper_half(src);
1055+
store_masked<half_arch>(mem + half_size, hi, mhi, Mode {}, half_arch {});
10521056
}
10531057
else
10541058
{

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ namespace xsimd
138138
}
139139

140140
// single templated implementation for integer masked loads (32/64-bit)
141-
template <class A, class T, bool... Values, class Mode>
141+
template <class A, class T, bool... Values, class Mode,
142+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
142143
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
143144
load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
144145
{
@@ -148,26 +149,30 @@ namespace xsimd
148149
return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
149150
}
150151

151-
template <class A, bool... Values, class Mode>
152+
template <class A, bool... Values, class Mode,
153+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
152154
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2>) noexcept
153155
{
154156
return load_masked<A, int32_t>(mem, mask, convert<int32_t> {}, Mode {}, avx2 {});
155157
}
156158

157-
template <class A, bool... Values, class Mode>
159+
template <class A, bool... Values, class Mode,
160+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
158161
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<avx2>) noexcept
159162
{
160163
const auto r = load_masked<A, int32_t>(reinterpret_cast<int32_t const*>(mem), batch_bool_constant<int32_t, A, Values...> {}, convert<int32_t> {}, Mode {}, avx2 {});
161164
return bitwise_cast<uint32_t>(r);
162165
}
163166

164-
template <class A, bool... Values, class Mode>
167+
template <class A, bool... Values, class Mode,
168+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
165169
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2>) noexcept
166170
{
167171
return load_masked<A, int64_t>(mem, mask, convert<int64_t> {}, Mode {}, avx2 {});
168172
}
169173

170-
template <class A, bool... Values, class Mode>
174+
template <class A, bool... Values, class Mode,
175+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
171176
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<avx2>) noexcept
172177
{
173178
const auto r = load_masked<A, int64_t>(reinterpret_cast<int64_t const*>(mem), batch_bool_constant<int64_t, A, Values...> {}, convert<int64_t> {}, Mode {}, avx2 {});
@@ -190,39 +195,44 @@ namespace xsimd
190195
}
191196
}
192197

193-
template <class A, class T, bool... Values, class Mode>
198+
template <class A, class T, bool... Values, class Mode,
199+
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4) && std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
194200
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
195201
{
196202
constexpr size_t lanes_per_half = batch<T, A>::size / 2;
203+
using half_batch = ::xsimd::make_sized_batch_t<T, lanes_per_half>;
204+
using half_arch = typename half_batch::arch_type;
197205

198-
// confined to lower 128-bit half → forward to SSE
206+
// lower 128-bit half
199207
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
200208
{
201-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
202-
const auto lo = detail::lower_half(src);
203-
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
209+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
210+
const half_batch lo = detail::lower_half(src);
211+
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
204212
}
205-
// confined to upper 128-bit half → forward to SSE
213+
// upper 128-bit half
206214
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
207215
{
208-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
209-
const auto hi = detail::upper_half(src);
210-
store_masked<sse4_2>(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {});
216+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
217+
const half_batch hi = detail::upper_half(src);
218+
store_masked<half_arch>(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {});
211219
}
212220
else
213221
{
214222
detail::maskstore<T, A>(mem, mask.as_batch(), src);
215223
}
216224
}
217225

218-
template <class A, bool... Values, class Mode>
219-
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
226+
template <class A, bool... Values, class Mode,
227+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
228+
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
220229
{
221230
const auto s32 = bitwise_cast<int32_t>(src);
222-
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, mask, Mode {}, avx2 {});
231+
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});
223232
}
224233

225-
template <class A, bool... Values, class Mode>
234+
template <class A, bool... Values, class Mode,
235+
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
226236
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
227237
{
228238
const auto s64 = bitwise_cast<int64_t>(src);

0 commit comments

Comments
 (0)