|
13 | 13 | #define XSIMD_COMMON_MEMORY_HPP |
14 | 14 |
|
15 | 15 | #include "../../types/xsimd_batch_constant.hpp" |
| 16 | +#include "../../utils/xsimd_type_traits.hpp" |
16 | 17 | #include "./xsimd_common_details.hpp" |
17 | 18 |
|
18 | 19 | #include <algorithm> |
@@ -360,88 +361,87 @@ namespace xsimd |
360 | 361 | return load_unaligned<A>(mem, convert<T> {}, A {}); |
361 | 362 | } |
362 | 363 |
|
363 | | - template <class A, class T_in, class T_out, bool... Values, class alignment> |
364 | | - XSIMD_INLINE batch<T_out, A> |
365 | | - load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...>, convert<T_out>, alignment, requires_arch<common>) noexcept |
366 | | - { |
367 | | - constexpr std::size_t size = batch<T_out, A>::size; |
368 | | - alignas(A::alignment()) std::array<T_out, size> buffer {}; |
369 | | - constexpr bool mask[size] = { Values... }; |
370 | | - |
371 | | - for (std::size_t i = 0; i < size; ++i) |
372 | | - buffer[i] = mask[i] ? static_cast<T_out>(mem[i]) : T_out(0); |
373 | | - |
374 | | - return batch<T_out, A>::load(buffer.data(), aligned_mode {}); |
375 | | - } |
376 | | - |
377 | | - template <class A, class T_in, class T_out, bool... Values, class alignment> |
378 | | - XSIMD_INLINE void |
379 | | - store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept |
| 364 | + // Masked-memory dispatch idiom. To give an arch a native masked path, add a |
| 365 | + // `requires_arch<that-arch>` overload in its arch file; conversion ranking makes |
| 366 | + // it beat the inherited one. Keep this base layer arch-agnostic: |
| 367 | + // (a) specialize via a concrete `requires_arch<arch>` overload -- no register |
| 368 | + // tag, no `enable_if` on `A`; |
| 369 | + // (b) base overloads use the `requires_arch<common>` tag only; a generic |
| 370 | + // `requires_arch<A>` here ties with an arch's own overload (gcc-10 ambiguity); |
| 371 | + // (c) capability decisions go through arch-agnostic traits (see below). |
| 372 | + namespace detail |
380 | 373 | { |
381 | | - constexpr std::size_t size = batch<T_in, A>::size; |
382 | | - constexpr bool mask[size] = { Values... }; |
| 374 | + // True when an integer access can borrow the same-width float `vmaskmov*` path |
| 375 | + // (no conversion, i.e. T_in == T_out; integral type; same-size float exists; arch has |
| 376 | + // that float register); otherwise the scalar-buffer fallback is used. Names no architecture. |
| 377 | + template <class A, class T_in, class T_out> |
| 378 | + using masked_memory_uses_fp_bitcast = std::integral_constant<bool, |
| 379 | + std::is_same<T_in, T_out>::value |
| 380 | + && std::is_integral<T_out>::value |
| 381 | + && !std::is_void<sized_fp_t<sizeof(T_out)>>::value |
| 382 | + && types::has_simd_register<sized_fp_t<sizeof(T_out)>, A>::value>; |
383 | 383 |
|
384 | | - for (std::size_t i = 0; i < size; ++i) |
385 | | - if (mask[i]) |
386 | | - { |
387 | | - mem[i] = static_cast<T_out>(src.get(i)); |
388 | | - } |
389 | | - } |
| 384 | + // Scalar-buffer fallback: copy the selected lanes (converted to T_out) into an aligned buffer, leave masked-off lanes zero, then do an aligned load. |
| 385 | + template <class A, class T_in, class T_out, bool... Values, class alignment> |
| 386 | + XSIMD_INLINE batch<T_out, A> |
| 387 | + load_masked_common(T_in const* mem, batch_bool_constant<T_out, A, Values...>, convert<T_out>, alignment, std::false_type /* uses_fp_bitcast */) noexcept |
| 388 | + { |
| 389 | + constexpr std::size_t size = batch<T_out, A>::size; |
| 390 | + alignas(A::alignment()) std::array<T_out, size> buffer {}; |
| 391 | + constexpr bool mask[size] = { Values... }; |
390 | 392 |
|
391 | | - template <class A, bool... Values, class Mode> |
392 | | - XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept |
393 | | - { |
394 | | - const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {}); |
395 | | - return bitwise_cast<int32_t>(f); |
396 | | - } |
| 393 | + for (std::size_t i = 0; i < size; ++i) |
| 394 | + buffer[i] = mask[i] ? static_cast<T_out>(mem[i]) : T_out(0); |
397 | 395 |
|
398 | | - template <class A, bool... Values, class Mode> |
399 | | - XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept |
400 | | - { |
401 | | - const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {}); |
402 | | - return bitwise_cast<uint32_t>(f); |
403 | | - } |
| 396 | + return batch<T_out, A>::load(buffer.data(), aligned_mode {}); |
| 397 | + } |
404 | 398 |
|
405 | | - template <class A, bool... Values, class Mode> |
406 | | - XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>> |
407 | | - load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept |
408 | | - { |
409 | | - const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {}); |
410 | | - return bitwise_cast<int64_t>(d); |
411 | | - } |
| 399 | + // Integer-via-float path: reinterpret to the same-width float type, reuse the |
| 400 | + // floating-point masked load (e.g. `vmaskmovps`), then bitcast the result back. |
| 401 | + template <class A, class T, bool... Values, class Mode> |
| 402 | + XSIMD_INLINE batch<T, A> |
| 403 | + load_masked_common(T const* mem, batch_bool_constant<T, A, Values...>, convert<T>, Mode, std::true_type /* uses_fp_bitcast */) noexcept |
| 404 | + { |
| 405 | + using fp_t = sized_fp_t<sizeof(T)>; |
| 406 | + const auto f = ::xsimd::kernel::load_masked<A>(reinterpret_cast<const fp_t*>(mem), batch_bool_constant<fp_t, A, Values...> {}, convert<fp_t> {}, Mode {}, A {}); |
| 407 | + return bitwise_cast<T>(f); |
| 408 | + } |
412 | 409 |
|
413 | | - template <class A, bool... Values, class Mode> |
414 | | - XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>> |
415 | | - load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept |
416 | | - { |
417 | | - const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {}); |
418 | | - return bitwise_cast<uint64_t>(d); |
419 | | - } |
| 410 | + template <class A, class T_in, class T_out, bool... Values, class alignment> |
| 411 | + XSIMD_INLINE void |
| 412 | + store_masked_common(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, std::false_type /* uses_fp_bitcast */) noexcept |
| 413 | + { |
| 414 | + constexpr std::size_t size = batch<T_in, A>::size; |
| 415 | + constexpr bool mask[size] = { Values... }; |
420 | 416 |
|
421 | | - template <class A, bool... Values, class Mode> |
422 | | - XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept |
423 | | - { |
424 | | - store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {}); |
425 | | - } |
| 417 | + for (std::size_t i = 0; i < size; ++i) |
| 418 | + if (mask[i]) |
| 419 | + { |
| 420 | + mem[i] = static_cast<T_out>(src.get(i)); |
| 421 | + } |
| 422 | + } |
426 | 423 |
|
427 | | - template <class A, bool... Values, class Mode> |
428 | | - XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept |
429 | | - { |
430 | | - store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {}); |
| 424 | + template <class A, class T, bool... Values, class Mode> |
| 425 | + XSIMD_INLINE void |
| 426 | + store_masked_common(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...>, Mode, std::true_type /* uses_fp_bitcast */) noexcept |
| 427 | + { |
| 428 | + using fp_t = sized_fp_t<sizeof(T)>; |
| 429 | + ::xsimd::kernel::store_masked<A>(reinterpret_cast<fp_t*>(mem), bitwise_cast<fp_t>(src), batch_bool_constant<fp_t, A, Values...> {}, Mode {}, A {}); |
| 430 | + } |
431 | 431 | } |
432 | 432 |
|
433 | | - template <class A, bool... Values, class Mode> |
434 | | - XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value> |
435 | | - store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept |
| 433 | + template <class A, class T_in, class T_out, bool... Values, class alignment> |
| 434 | + XSIMD_INLINE batch<T_out, A> |
| 435 | + load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...> mask, convert<T_out> cvt, alignment mode, requires_arch<common>) noexcept |
436 | 436 | { |
437 | | - store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {}); |
| 437 | + return detail::load_masked_common(mem, mask, cvt, mode, detail::masked_memory_uses_fp_bitcast<A, T_in, T_out> {}); |
438 | 438 | } |
439 | 439 |
|
440 | | - template <class A, bool... Values, class Mode> |
441 | | - XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value> |
442 | | - store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept |
| 440 | + template <class A, class T_in, class T_out, bool... Values, class alignment> |
| 441 | + XSIMD_INLINE void |
| 442 | + store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...> mask, alignment mode, requires_arch<common>) noexcept |
443 | 443 | { |
444 | | - store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {}); |
| 444 | + detail::store_masked_common(mem, src, mask, mode, detail::masked_memory_uses_fp_bitcast<A, T_in, T_out> {}); |
445 | 445 | } |
446 | 446 |
|
447 | 447 | template <class A, class T_in, class T_out> |
|
0 commit comments