|
18 | 18 | #include <tuple> |
19 | 19 | #include <type_traits> |
20 | 20 |
|
21 | | -#include "../types/xsimd_bit.hpp" |
22 | 21 | #include "../types/xsimd_neon_register.hpp" |
23 | 22 | #include "../types/xsimd_utils.hpp" |
| 23 | +#include "./common/xsimd_common_bit.hpp" |
24 | 24 | #include "./common/xsimd_common_cast.hpp" |
25 | 25 |
|
26 | 26 | // Wrap intrinsics so we can pass them as function pointers |
@@ -3362,36 +3362,12 @@ namespace xsimd |
3362 | 3362 | /********* |
3363 | 3363 | * count * |
3364 | 3364 | *********/ |
3365 | | - template <class A, class T, detail::enable_sized_t<T, 1> = 0> |
3366 | | - XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon>) noexcept |
3367 | | - { |
3368 | | - uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(self), 4); |
3369 | | - uint64_t result = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0); |
3370 | | - return xsimd::detail::popcount(result) / 4; |
3371 | | - } |
3372 | | - |
3373 | | - template <class A, class T, detail::enable_sized_t<T, 2> = 0> |
3374 | | - XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon>) noexcept |
3375 | | - { |
3376 | | - uint8x8_t narrowed = vmovn_u16(self); |
3377 | | - uint64_t result = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0); |
3378 | | - return xsimd::detail::popcount(result) / 8; |
3379 | | - } |
3380 | | - |
3381 | | - template <class A, class T, detail::enable_sized_t<T, 4> = 0> |
3382 | | - XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon>) noexcept |
3383 | | - { |
3384 | | - uint16x4_t narrowed = vmovn_u32(self); |
3385 | | - uint64_t result = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0); |
3386 | | - return xsimd::detail::popcount(result) / 16; |
3387 | | - } |
3388 | | - |
3389 | | - template <class A, class T, detail::enable_sized_t<T, 8> = 0> |
| 3365 | + template <class A, class T> |
3390 | 3366 | XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon>) noexcept |
3391 | 3367 | { |
3392 | | - uint32x2_t narrowed = vmovn_u64(self); |
3393 | | - uint64_t result = vget_lane_u64(vreinterpret_u64_u32(narrowed), 0); |
3394 | | - return xsimd::detail::popcount(result) / 32; |
| 3368 | + uint8x16_t popcnts = vcntq_u8(bitwise_cast<uint8_t, T, A>(bitwise_cast<T, A>(self))); // set-bit count of every byte of the mask |
| 3369 | + uint64x2_t total = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(popcnts))); // widening pairwise adds: 16 x u8 -> 2 x u64 |
| 3370 | + return vget_lane_u64(vadd_u64(vget_low_u64(total), vget_high_u64(total)), 0) / (sizeof(T) * 8); // sum both halves, then divide total set bits by bits per lane |
3395 | 3371 | } |
3396 | 3372 |
|
3397 | 3373 | #define WRAP_MASK_OP(OP) \ |
|
0 commit comments