Skip to content

Commit 397718b

Browse files
committed
Disambiguate avx512f load_masked
1 parent 33ebd43 commit 397718b

1 file changed

Lines changed: 114 additions & 9 deletions

File tree

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 114 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -298,11 +298,25 @@ namespace xsimd
298298

299299
} // namespace detail
300300

301-
template <class A, class T, bool... Values, class Mode,
302-
typename = std::enable_if_t<(sizeof(T) >= 4)>>
303-
XSIMD_INLINE batch<T, A> load_masked(T const* mem,
304-
batch_bool_constant<T, A, Values...> mask,
305-
convert<T>, Mode, requires_arch<avx512f>) noexcept
301+
// The AVX512F masked-load logic lives in this plain `*_avx512f` helper
302+
// (no `requires_arch` tag) and is exposed through the concrete
303+
// element-type overloads below.
304+
//
305+
// Why not a single generic `load_masked(T const*, ..., requires_arch<avx512f>)`?
306+
// It is ambiguous against the concrete-type / generic-arch overloads in
307+
// xsimd_common_memory.hpp (e.g. `load_masked(int32_t const*, ...,
308+
// requires_arch<A>)`): the avx512f overload is more specialized on the
309+
// architecture while the common one is more specialized on the pointer
310+
// type, so partial ordering cannot pick a winner. When AVX512DQ/BW is
311+
// available a fully concrete `requires_arch<avx512dq>` overload is the
312+
// unique best match and hides this, but a pure-AVX512F target (the
313+
// `avx512f` preset) has no such tie-breaker and the call fails to
314+
// compile. Concrete element-type `requires_arch<avx512f>` overloads make
315+
// the avx512f candidate the unique best match for the 32- and 64-bit integer types (int32_t, uint32_t, int64_t, uint64_t).
316+
template <class A, class T, bool... Values, class Mode>
317+
XSIMD_INLINE batch<T, A> load_masked_avx512f(T const* mem,
318+
batch_bool_constant<T, A, Values...> mask,
319+
Mode) noexcept
306320
{
307321
constexpr auto half = batch<T, A>::size / 2;
308322
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding
@@ -324,12 +338,59 @@ namespace xsimd
324338
}
325339
}
326340

341+
// AVX512F masked load for `int32_t`. The element type is spelled out instead of
// being a template parameter, so this overload is concrete in both the pointer
// type and the `requires_arch<avx512f>` tag. `convert<int32_t>` and the arch tag
// are unnamed and take part in overload selection only; `mem`, `mask` and a
// value-initialized `Mode` are forwarded to `load_masked_avx512f`.
template <class A, bool... Values, class Mode>
342+
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem,
343+
batch_bool_constant<int32_t, A, Values...> mask,
344+
convert<int32_t>, Mode, requires_arch<avx512f>) noexcept
345+
{
346+
return load_masked_avx512f(mem, mask, Mode {});
347+
}
348+
349+
// AVX512F masked load for `uint32_t`. The element type is written out rather
// than deduced, so the overload is concrete in both the pointer type and the
// `requires_arch<avx512f>` tag. The unnamed `convert<uint32_t>` and arch-tag
// parameters are not forwarded; only `mem`, `mask` and a value-initialized
// `Mode` are handed to `load_masked_avx512f`.
template <class A, bool... Values, class Mode>
350+
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem,
351+
batch_bool_constant<uint32_t, A, Values...> mask,
352+
convert<uint32_t>, Mode, requires_arch<avx512f>) noexcept
353+
{
354+
return load_masked_avx512f(mem, mask, Mode {});
355+
}
356+
357+
// AVX512F masked load for `int64_t`. The element type is written out rather
// than deduced, so the overload is concrete in both the pointer type and the
// `requires_arch<avx512f>` tag. The unnamed `convert<int64_t>` and arch-tag
// parameters are not forwarded; only `mem`, `mask` and a value-initialized
// `Mode` are handed to `load_masked_avx512f`.
template <class A, bool... Values, class Mode>
358+
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem,
359+
batch_bool_constant<int64_t, A, Values...> mask,
360+
convert<int64_t>, Mode, requires_arch<avx512f>) noexcept
361+
{
362+
return load_masked_avx512f(mem, mask, Mode {});
363+
}
364+
365+
// AVX512F masked load for `uint64_t`. The element type is written out rather
// than deduced, so the overload is concrete in both the pointer type and the
// `requires_arch<avx512f>` tag. The unnamed `convert<uint64_t>` and arch-tag
// parameters are not forwarded; only `mem`, `mask` and a value-initialized
// `Mode` are handed to `load_masked_avx512f`.
template <class A, bool... Values, class Mode>
366+
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem,
367+
batch_bool_constant<uint64_t, A, Values...> mask,
368+
convert<uint64_t>, Mode, requires_arch<avx512f>) noexcept
369+
{
370+
return load_masked_avx512f(mem, mask, Mode {});
371+
}
372+
373+
// float/double (and any other >=4-byte type) have no concrete-type
374+
// generic-arch competitor in xsimd_common_memory.hpp, so a single
375+
// generic avx512f overload stays unambiguous for them.
327376
template <class A, class T, bool... Values, class Mode,
328377
typename = std::enable_if_t<(sizeof(T) >= 4)>>
329-
XSIMD_INLINE void store_masked(T* mem,
330-
batch<T, A> const& src,
331-
batch_bool_constant<T, A, Values...> mask,
332-
Mode, requires_arch<avx512f>) noexcept
378+
XSIMD_INLINE batch<T, A> load_masked(T const* mem,
379+
batch_bool_constant<T, A, Values...> mask,
380+
convert<T>, Mode, requires_arch<avx512f>) noexcept
381+
{
382+
// Only `mem`, `mask` and a value-initialized `Mode` reach the helper; the
// unnamed `convert<T>` and arch-tag parameters are not forwarded.
return load_masked_avx512f(mem, mask, Mode {});
383+
}
384+
385+
// Same ambiguity as load_masked above (see comment there): factor the
386+
// AVX512F store logic into a plain helper and expose it via concrete
387+
// element-type `requires_arch<avx512f>` overloads so a pure-AVX512F
388+
// target has a unique best match.
389+
template <class A, class T, bool... Values, class Mode>
390+
XSIMD_INLINE void store_masked_avx512f(T* mem,
391+
batch<T, A> const& src,
392+
batch_bool_constant<T, A, Values...> mask,
393+
Mode) noexcept
333394
{
334395
constexpr auto half = batch<T, A>::size / 2;
335396
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding
@@ -351,6 +412,50 @@ namespace xsimd
351412
}
352413
}
353414

415+
// AVX512F masked store for `int32_t`. The element type is spelled out instead
// of being a template parameter, so this overload is concrete in both the
// pointer type and the `requires_arch<avx512f>` tag. The unnamed arch tag is
// not forwarded; `mem`, `src`, `mask` and a value-initialized `Mode` are passed
// to `store_masked_avx512f`.
template <class A, bool... Values, class Mode>
416+
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src,
417+
batch_bool_constant<int32_t, A, Values...> mask,
418+
Mode, requires_arch<avx512f>) noexcept
419+
{
420+
store_masked_avx512f(mem, src, mask, Mode {});
421+
}
422+
423+
// AVX512F masked store for `uint32_t`. The element type is written out rather
// than deduced, so the overload is concrete in both the pointer type and the
// `requires_arch<avx512f>` tag. The unnamed arch tag is not forwarded; `mem`,
// `src`, `mask` and a value-initialized `Mode` are passed to
// `store_masked_avx512f`.
template <class A, bool... Values, class Mode>
424+
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src,
425+
batch_bool_constant<uint32_t, A, Values...> mask,
426+
Mode, requires_arch<avx512f>) noexcept
427+
{
428+
store_masked_avx512f(mem, src, mask, Mode {});
429+
}
430+
431+
// AVX512F masked store for `int64_t`. The element type is written out rather
// than deduced, so the overload is concrete in both the pointer type and the
// `requires_arch<avx512f>` tag. The unnamed arch tag is not forwarded; `mem`,
// `src`, `mask` and a value-initialized `Mode` are passed to
// `store_masked_avx512f`.
template <class A, bool... Values, class Mode>
432+
XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src,
433+
batch_bool_constant<int64_t, A, Values...> mask,
434+
Mode, requires_arch<avx512f>) noexcept
435+
{
436+
store_masked_avx512f(mem, src, mask, Mode {});
437+
}
438+
439+
// AVX512F masked store for `uint64_t`. The element type is written out rather
// than deduced, so the overload is concrete in both the pointer type and the
// `requires_arch<avx512f>` tag. The unnamed arch tag is not forwarded; `mem`,
// `src`, `mask` and a value-initialized `Mode` are passed to
// `store_masked_avx512f`.
template <class A, bool... Values, class Mode>
440+
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src,
441+
batch_bool_constant<uint64_t, A, Values...> mask,
442+
Mode, requires_arch<avx512f>) noexcept
443+
{
444+
store_masked_avx512f(mem, src, mask, Mode {});
445+
}
446+
447+
// float/double (and any other >=4-byte type) have no concrete-type
448+
// generic-arch competitor, so a single generic overload is unambiguous.
449+
// Generic AVX512F masked store, enabled only when `sizeof(T) >= 4`. It forwards
// `mem`, `src`, `mask` and a value-initialized `Mode` to `store_masked_avx512f`;
// the unnamed `requires_arch<avx512f>` tag is not passed on.
template <class A, class T, bool... Values, class Mode,
450+
typename = std::enable_if_t<(sizeof(T) >= 4)>>
451+
XSIMD_INLINE void store_masked(T* mem,
452+
batch<T, A> const& src,
453+
batch_bool_constant<T, A, Values...> mask,
454+
Mode, requires_arch<avx512f>) noexcept
455+
{
456+
store_masked_avx512f(mem, src, mask, Mode {});
457+
}
458+
354459
// abs
355460
template <class A>
356461
XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx512f>) noexcept

0 commit comments

Comments
 (0)