Skip to content

Commit fa06792

Browse files
committed
refactor(masked-memory): dispatch via overload conversion ranking
Drop the cross-arch SFINAE/tag mechanism: a concrete requires_arch<avx512vl_128|256> overload now beats the inherited avx2/avx2_128 one by overload conversion ranking, so no arch file knows about another. xsimd_common_memory.hpp keeps only requires_arch<common> and dispatches on the arch-agnostic trait masked_memory_uses_fp_bitcast (integral with a same-width float register -> reuse that float vmaskmov* path, else a scalar buffer). avx/avx2/avx2_128 drop every is_base_of<avx512vl_*, A> guard; avx2_128 routes native 128-bit integer masked memory through vpmaskmov* (long long* cast for 64-bit) and tags int64/uint64 on avx2_128 (those intrinsics need AVX2). detail::maskstore takes a bool mask and casts internally; xsimd_batch.hpp keeps a make_sized_batch fwd-decl and simplifies the store_masked call; xsimd_isa.hpp documents the _128-first include order; sse2.hpp adapts to the new store_masked(common) signature.
1 parent 34bca15 commit fa06792

8 files changed

Lines changed: 140 additions & 129 deletions

File tree

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 68 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#define XSIMD_COMMON_MEMORY_HPP
1414

1515
#include "../../types/xsimd_batch_constant.hpp"
16+
#include "../../utils/xsimd_type_traits.hpp"
1617
#include "./xsimd_common_details.hpp"
1718

1819
#include <algorithm>
@@ -360,88 +361,87 @@ namespace xsimd
360361
return load_unaligned<A>(mem, convert<T> {}, A {});
361362
}
362363

363-
template <class A, class T_in, class T_out, bool... Values, class alignment>
364-
XSIMD_INLINE batch<T_out, A>
365-
load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...>, convert<T_out>, alignment, requires_arch<common>) noexcept
366-
{
367-
constexpr std::size_t size = batch<T_out, A>::size;
368-
alignas(A::alignment()) std::array<T_out, size> buffer {};
369-
constexpr bool mask[size] = { Values... };
370-
371-
for (std::size_t i = 0; i < size; ++i)
372-
buffer[i] = mask[i] ? static_cast<T_out>(mem[i]) : T_out(0);
373-
374-
return batch<T_out, A>::load(buffer.data(), aligned_mode {});
375-
}
376-
377-
template <class A, class T_in, class T_out, bool... Values, class alignment>
378-
XSIMD_INLINE void
379-
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
364+
// Masked-memory dispatch idiom. To give an arch a native masked path, add a
365+
// `requires_arch<that-arch>` overload in its arch file; conversion ranking makes
366+
// it beat the inherited one. Keep this base layer arch-agnostic:
367+
// (a) specialize via a concrete `requires_arch<arch>` overload -- no register
368+
// tag, no `enable_if` on `A`;
369+
// (b) base overloads use the `requires_arch<common>` tag only; a generic
370+
// `requires_arch<A>` here ties with an arch's own overload (gcc-10 ambiguity);
371+
// (c) capability decisions go through arch-agnostic traits (see below).
372+
namespace detail
380373
{
381-
constexpr std::size_t size = batch<T_in, A>::size;
382-
constexpr bool mask[size] = { Values... };
374+
// True when an integer access can borrow the same-width float `vmaskmov*` path
375+
// (integral type, same-size float exists, arch has that float register);
376+
// otherwise the scalar-buffer fallback is used. Names no architecture.
377+
template <class A, class T_in, class T_out>
378+
using masked_memory_uses_fp_bitcast = std::integral_constant<bool,
379+
std::is_same<T_in, T_out>::value
380+
&& std::is_integral<T_out>::value
381+
&& !std::is_void<sized_fp_t<sizeof(T_out)>>::value
382+
&& types::has_simd_register<sized_fp_t<sizeof(T_out)>, A>::value>;
383383

384-
for (std::size_t i = 0; i < size; ++i)
385-
if (mask[i])
386-
{
387-
mem[i] = static_cast<T_out>(src.get(i));
388-
}
389-
}
384+
// Scalar-buffer fallback: materialize masked-off lanes as zero, then load.
385+
template <class A, class T_in, class T_out, bool... Values, class alignment>
386+
XSIMD_INLINE batch<T_out, A>
387+
load_masked_common(T_in const* mem, batch_bool_constant<T_out, A, Values...>, convert<T_out>, alignment, std::false_type /* uses_fp_bitcast */) noexcept
388+
{
389+
constexpr std::size_t size = batch<T_out, A>::size;
390+
alignas(A::alignment()) std::array<T_out, size> buffer {};
391+
constexpr bool mask[size] = { Values... };
390392

391-
template <class A, bool... Values, class Mode>
392-
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
393-
{
394-
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
395-
return bitwise_cast<int32_t>(f);
396-
}
393+
for (std::size_t i = 0; i < size; ++i)
394+
buffer[i] = mask[i] ? static_cast<T_out>(mem[i]) : T_out(0);
397395

398-
template <class A, bool... Values, class Mode>
399-
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
400-
{
401-
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
402-
return bitwise_cast<uint32_t>(f);
403-
}
396+
return batch<T_out, A>::load(buffer.data(), aligned_mode {});
397+
}
404398

405-
template <class A, bool... Values, class Mode>
406-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>>
407-
load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept
408-
{
409-
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
410-
return bitwise_cast<int64_t>(d);
411-
}
399+
// Integer-via-float path: reinterpret to the same-width float type, reuse the
400+
// floating-point masked load (e.g. `vmaskmovps`), then bitcast the result back.
401+
template <class A, class T, bool... Values, class Mode>
402+
XSIMD_INLINE batch<T, A>
403+
load_masked_common(T const* mem, batch_bool_constant<T, A, Values...>, convert<T>, Mode, std::true_type /* uses_fp_bitcast */) noexcept
404+
{
405+
using fp_t = sized_fp_t<sizeof(T)>;
406+
const auto f = ::xsimd::kernel::load_masked<A>(reinterpret_cast<const fp_t*>(mem), batch_bool_constant<fp_t, A, Values...> {}, convert<fp_t> {}, Mode {}, A {});
407+
return bitwise_cast<T>(f);
408+
}
412409

413-
template <class A, bool... Values, class Mode>
414-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>>
415-
load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept
416-
{
417-
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
418-
return bitwise_cast<uint64_t>(d);
419-
}
410+
template <class A, class T_in, class T_out, bool... Values, class alignment>
411+
XSIMD_INLINE void
412+
store_masked_common(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, std::false_type /* uses_fp_bitcast */) noexcept
413+
{
414+
constexpr std::size_t size = batch<T_in, A>::size;
415+
constexpr bool mask[size] = { Values... };
420416

421-
template <class A, bool... Values, class Mode>
422-
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
423-
{
424-
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
425-
}
417+
for (std::size_t i = 0; i < size; ++i)
418+
if (mask[i])
419+
{
420+
mem[i] = static_cast<T_out>(src.get(i));
421+
}
422+
}
426423

427-
template <class A, bool... Values, class Mode>
428-
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
429-
{
430-
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
424+
template <class A, class T, bool... Values, class Mode>
425+
XSIMD_INLINE void
426+
store_masked_common(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...>, Mode, std::true_type /* uses_fp_bitcast */) noexcept
427+
{
428+
using fp_t = sized_fp_t<sizeof(T)>;
429+
::xsimd::kernel::store_masked<A>(reinterpret_cast<fp_t*>(mem), bitwise_cast<fp_t>(src), batch_bool_constant<fp_t, A, Values...> {}, Mode {}, A {});
430+
}
431431
}
432432

433-
template <class A, bool... Values, class Mode>
434-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
435-
store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept
433+
template <class A, class T_in, class T_out, bool... Values, class alignment>
434+
XSIMD_INLINE batch<T_out, A>
435+
load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...> mask, convert<T_out> cvt, alignment mode, requires_arch<common>) noexcept
436436
{
437-
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
437+
return detail::load_masked_common(mem, mask, cvt, mode, detail::masked_memory_uses_fp_bitcast<A, T_in, T_out> {});
438438
}
439439

440-
template <class A, bool... Values, class Mode>
441-
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
442-
store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept
440+
template <class A, class T_in, class T_out, bool... Values, class alignment>
441+
XSIMD_INLINE void
442+
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...> mask, alignment mode, requires_arch<common>) noexcept
443443
{
444-
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
444+
detail::store_masked_common(mem, src, mask, mode, detail::masked_memory_uses_fp_bitcast<A, T_in, T_out> {});
445445
}
446446

447447
template <class A, class T_in, class T_out>

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -993,19 +993,20 @@ namespace xsimd
993993
{
994994
using int_t = as_integer_t<T>;
995995
constexpr size_t half_size = batch<T, A>::size / 2;
996+
using half_arch = typename ::xsimd::make_sized_batch_t<T, half_size>::arch_type;
996997

997-
// confined to lower 128-bit half → forward to 128 bit
998+
// lower 128-bit half
998999
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
9991000
{
1000-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
1001-
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
1001+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(batch_bool_constant<int_t, A, Values...> {});
1002+
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, half_arch {});
10021003
return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
10031004
}
1004-
// confined to upper 128-bit half → forward to 128 bit
1005+
// upper 128-bit half
10051006
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
10061007
{
1007-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
1008-
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
1008+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
1009+
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, half_arch {});
10091010
return detail::zero_extend<A>(hi);
10101011
}
10111012
else
@@ -1021,38 +1022,41 @@ namespace xsimd
10211022
template <class A>
10221023
XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept
10231024
{
1024-
_mm256_maskstore_ps(mem, mask, src);
1025+
_mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src);
10251026
}
10261027

10271028
template <class A>
10281029
XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept
10291030
{
1030-
_mm256_maskstore_pd(mem, mask, src);
1031+
_mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src);
10311032
}
10321033
}
10331034

1034-
template <class A, class T, bool... Values, class Mode>
1035+
template <class A, class T, bool... Values, class Mode,
1036+
typename = std::enable_if_t<std::is_floating_point<T>::value>>
10351037
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
10361038
{
10371039
constexpr size_t half_size = batch<T, A>::size / 2;
1040+
using half_batch = ::xsimd::make_sized_batch_t<T, half_size>;
1041+
using half_arch = typename half_batch::arch_type;
10381042

1039-
// confined to lower 128-bit half → forward to 128 bit
1043+
// lower 128-bit half
10401044
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
10411045
{
1042-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
1043-
const auto lo = detail::lower_half(src);
1044-
store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
1046+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
1047+
const half_batch lo = detail::lower_half(src);
1048+
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
10451049
}
1046-
// confined to upper 128-bit half → forward to 128 bit
1050+
// upper 128-bit half
10471051
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
10481052
{
1049-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
1050-
const auto hi = detail::upper_half(src);
1051-
store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
1053+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
1054+
const half_batch hi = detail::upper_half(src);
1055+
store_masked<half_arch>(mem + half_size, hi, mhi, Mode {}, half_arch {});
10521056
}
10531057
else
10541058
{
1055-
detail::maskstore(mem, mask.as_batch(), src);
1059+
detail::maskstore(mem, mask.as_batch_bool(), src);
10561060
}
10571061
}
10581062

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -190,24 +190,27 @@ namespace xsimd
190190
}
191191
}
192192

193-
template <class A, class T, bool... Values, class Mode>
193+
template <class A, class T, bool... Values, class Mode,
194+
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4)>>
194195
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
195196
{
196197
constexpr size_t lanes_per_half = batch<T, A>::size / 2;
198+
using half_batch = ::xsimd::make_sized_batch_t<T, lanes_per_half>;
199+
using half_arch = typename half_batch::arch_type;
197200

198-
// confined to lower 128-bit half → forward to SSE
201+
// lower 128-bit half
199202
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
200203
{
201-
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
202-
const auto lo = detail::lower_half(src);
203-
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
204+
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
205+
const half_batch lo = detail::lower_half(src);
206+
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
204207
}
205-
// confined to upper 128-bit half → forward to SSE
208+
// upper 128-bit half
206209
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
207210
{
208-
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
209-
const auto hi = detail::upper_half(src);
210-
store_masked<sse4_2>(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {});
211+
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
212+
const half_batch hi = detail::upper_half(src);
213+
store_masked<half_arch>(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {});
211214
}
212215
else
213216
{
@@ -216,10 +219,10 @@ namespace xsimd
216219
}
217220

218221
template <class A, bool... Values, class Mode>
219-
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
222+
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
220223
{
221224
const auto s32 = bitwise_cast<int32_t>(src);
222-
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, mask, Mode {}, avx2 {});
225+
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});
223226
}
224227

225228
template <class A, bool... Values, class Mode>

0 commit comments

Comments
 (0)