@@ -298,11 +298,25 @@ namespace xsimd
298298
299299 } // namespace detail
300300
301- template <class A , class T , bool ... Values, class Mode ,
302- typename = std::enable_if_t <(sizeof (T) >= 4 )>>
303- XSIMD_INLINE batch<T, A> load_masked (T const * mem,
304- batch_bool_constant<T, A, Values...> mask,
305- convert<T>, Mode, requires_arch<avx512f>) noexcept
301+ // The AVX512F masked-load logic lives in this plain `*_avx512f` helper
302+ // (no `requires_arch` tag) and is exposed through the concrete
303+ // element-type overloads below.
304+ //
305+ // Why not a single generic `load_masked(T const*, ..., requires_arch<avx512f>)`?
306+ // It is ambiguous against the concrete-type / generic-arch overloads in
307+ // xsimd_common_memory.hpp (e.g. `load_masked(int32_t const*, ...,
308+ // requires_arch<A>)`): the avx512f overload is more specialized on the
309+ // architecture while the common one is more specialized on the pointer
310+ // type, so partial ordering cannot pick a winner. When AVX512DQ/BW is
311+ // available a fully concrete `requires_arch<avx512dq>` overload is the
312+ // unique best match and hides this, but a pure-AVX512F target (the
313+ // `avx512f` preset) has no such tie-breaker and the call fails to
314+ // compile. Concrete element-type `requires_arch<avx512f>` overloads make
315+ // the avx512f candidate the unique best match for every integer type.
316+ template <class A , class T , bool ... Values, class Mode >
317+ XSIMD_INLINE batch<T, A> load_masked_avx512f (T const * mem,
318+ batch_bool_constant<T, A, Values...> mask,
319+ Mode) noexcept
306320 {
307321 constexpr auto half = batch<T, A>::size / 2 ;
308322 XSIMD_IF_CONSTEXPR (mask.countl_zero () >= half) // lower-half AVX2 forwarding
@@ -324,12 +338,59 @@ namespace xsimd
324338 }
325339 }
326340
341+ template <class A , bool ... Values, class Mode >
342+ XSIMD_INLINE batch<int32_t , A> load_masked (int32_t const * mem,
343+ batch_bool_constant<int32_t , A, Values...> mask,
344+ convert<int32_t >, Mode, requires_arch<avx512f>) noexcept
345+ {
346+ return load_masked_avx512f (mem, mask, Mode {});
347+ }
348+
349+ template <class A , bool ... Values, class Mode >
350+ XSIMD_INLINE batch<uint32_t , A> load_masked (uint32_t const * mem,
351+ batch_bool_constant<uint32_t , A, Values...> mask,
352+ convert<uint32_t >, Mode, requires_arch<avx512f>) noexcept
353+ {
354+ return load_masked_avx512f (mem, mask, Mode {});
355+ }
356+
357+ template <class A , bool ... Values, class Mode >
358+ XSIMD_INLINE batch<int64_t , A> load_masked (int64_t const * mem,
359+ batch_bool_constant<int64_t , A, Values...> mask,
360+ convert<int64_t >, Mode, requires_arch<avx512f>) noexcept
361+ {
362+ return load_masked_avx512f (mem, mask, Mode {});
363+ }
364+
365+ template <class A , bool ... Values, class Mode >
366+ XSIMD_INLINE batch<uint64_t , A> load_masked (uint64_t const * mem,
367+ batch_bool_constant<uint64_t , A, Values...> mask,
368+ convert<uint64_t >, Mode, requires_arch<avx512f>) noexcept
369+ {
370+ return load_masked_avx512f (mem, mask, Mode {});
371+ }
372+
373+ // float/double (and any other >=4-byte type) have no concrete-type
374+ // generic-arch competitor in xsimd_common_memory.hpp, so a single
375+ // generic avx512f overload stays unambiguous for them.
327376 template <class A , class T , bool ... Values, class Mode ,
328377 typename = std::enable_if_t <(sizeof (T) >= 4 )>>
329- XSIMD_INLINE void store_masked (T* mem,
330- batch<T, A> const & src,
331- batch_bool_constant<T, A, Values...> mask,
332- Mode, requires_arch<avx512f>) noexcept
378+ XSIMD_INLINE batch<T, A> load_masked (T const * mem,
379+ batch_bool_constant<T, A, Values...> mask,
380+ convert<T>, Mode, requires_arch<avx512f>) noexcept
381+ {
382+ return load_masked_avx512f (mem, mask, Mode {});
383+ }
384+
385+ // Same ambiguity as load_masked above (see comment there): factor the
386+ // AVX512F store logic into a plain helper and expose it via concrete
387+ // element-type `requires_arch<avx512f>` overloads so a pure-AVX512F
388+ // target has a unique best match.
389+ template <class A , class T , bool ... Values, class Mode >
390+ XSIMD_INLINE void store_masked_avx512f (T* mem,
391+ batch<T, A> const & src,
392+ batch_bool_constant<T, A, Values...> mask,
393+ Mode) noexcept
333394 {
334395 constexpr auto half = batch<T, A>::size / 2 ;
335396 XSIMD_IF_CONSTEXPR (mask.countl_zero () >= half) // lower-half AVX2 forwarding
@@ -351,6 +412,50 @@ namespace xsimd
351412 }
352413 }
353414
415+ template <class A , bool ... Values, class Mode >
416+ XSIMD_INLINE void store_masked (int32_t * mem, batch<int32_t , A> const & src,
417+ batch_bool_constant<int32_t , A, Values...> mask,
418+ Mode, requires_arch<avx512f>) noexcept
419+ {
420+ store_masked_avx512f (mem, src, mask, Mode {});
421+ }
422+
423+ template <class A , bool ... Values, class Mode >
424+ XSIMD_INLINE void store_masked (uint32_t * mem, batch<uint32_t , A> const & src,
425+ batch_bool_constant<uint32_t , A, Values...> mask,
426+ Mode, requires_arch<avx512f>) noexcept
427+ {
428+ store_masked_avx512f (mem, src, mask, Mode {});
429+ }
430+
431+ template <class A , bool ... Values, class Mode >
432+ XSIMD_INLINE void store_masked (int64_t * mem, batch<int64_t , A> const & src,
433+ batch_bool_constant<int64_t , A, Values...> mask,
434+ Mode, requires_arch<avx512f>) noexcept
435+ {
436+ store_masked_avx512f (mem, src, mask, Mode {});
437+ }
438+
439+ template <class A , bool ... Values, class Mode >
440+ XSIMD_INLINE void store_masked (uint64_t * mem, batch<uint64_t , A> const & src,
441+ batch_bool_constant<uint64_t , A, Values...> mask,
442+ Mode, requires_arch<avx512f>) noexcept
443+ {
444+ store_masked_avx512f (mem, src, mask, Mode {});
445+ }
446+
447+ // float/double (and any other >=4-byte type) have no concrete-type
448+ // generic-arch competitor, so a single generic overload is unambiguous.
449+ template <class A , class T , bool ... Values, class Mode ,
450+ typename = std::enable_if_t <(sizeof (T) >= 4 )>>
451+ XSIMD_INLINE void store_masked (T* mem,
452+ batch<T, A> const & src,
453+ batch_bool_constant<T, A, Values...> mask,
454+ Mode, requires_arch<avx512f>) noexcept
455+ {
456+ store_masked_avx512f (mem, src, mask, Mode {});
457+ }
458+
354459 // abs
355460 template <class A >
356461 XSIMD_INLINE batch<float , A> abs (batch<float , A> const & self, requires_arch<avx512f>) noexcept
0 commit comments