Skip to content

Commit 83914de

Browse files
committed
Introduce count{l,r}_{zero,one} for batch_bool
In #1236, it was mentioned that variable-sized bit groups for certain `batch_bool` reductions would be slightly more efficient than extracting a proper bitmask. To achieve this, the xsimd API is extended with the functions `xsimd::count{l,r}_{zero,one}`, and `count` is revised to allow per-platform kernels. The default implementations for each function simply apply the corresponding scalar operation (for which `__cpp_lib_bitops == 201907L` is partially backported) on `batch_bool::mask`. This is specialized for NEON(64) by instead applying the scalar operation to the narrowed batch, then scaling the result by the bit group size.
1 parent c3a8d37 commit 83914de

File tree

9 files changed

+804
-47
lines changed

9 files changed

+804
-47
lines changed
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
/****************************************************************
2+
* Partial backport of `__cpp_lib_bitops == 201907L` from C++20 *
3+
****************************************************************/
4+
5+
#ifndef XSIMD_BIT_HPP
6+
#define XSIMD_BIT_HPP
7+
8+
// <version> was only introduced in C++20: older standard libraries do not ship
// it, and including it unconditionally is a hard error there. Probe for the
// header first. When it is skipped, __cpp_lib_bitops stays undefined, evaluates
// to 0 in the feature check below, and the hand-written backport is selected.
#ifdef __has_include
#if __has_include(<version>)
#include <version>
#endif
#endif
9+
10+
#if __cpp_lib_bitops >= 201907L
11+
12+
#include <bit>
13+
14+
namespace xsimd
15+
{
16+
namespace detail
17+
{
18+
using std::countl_one;
19+
using std::countl_zero;
20+
using std::countr_one;
21+
using std::countr_zero;
22+
using std::popcount;
23+
}
24+
}
25+
26+
#else
27+
28+
#include <climits>
29+
#include <type_traits>
30+
31+
#ifdef __has_builtin
32+
#define XSIMD_HAS_BUILTIN(x) __has_builtin(x)
33+
#else
34+
#define XSIMD_HAS_BUILTIN(x) 0
35+
#endif
36+
37+
#ifdef _MSC_VER
38+
#include <intrin.h>
39+
#endif
40+
41+
namespace xsimd
42+
{
43+
namespace detail
44+
{
45+
// FIXME: We could do better by dispatching to the appropriate popcount instruction
46+
// depending on the arch.
47+
48+
template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
49+
XSIMD_INLINE int popcount(T x) noexcept
50+
{
51+
#if XSIMD_HAS_BUILTIN(__builtin_popcountg)
52+
return __builtin_popcountg(x);
53+
#else
54+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
55+
{
56+
#if XSIMD_HAS_BUILTIN(__builtin_popcount)
57+
return __builtin_popcount(x);
58+
#elif defined(_MSC_VER)
59+
return __popcnt(x);
60+
#else
61+
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64
62+
return ((uint64_t)x * 0x200040008001ULL & 0x111111111111111ULL) % 0xf;
63+
#endif
64+
}
65+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
66+
{
67+
#if XSIMD_HAS_BUILTIN(__builtin_popcount)
68+
return __builtin_popcount(x);
69+
#elif defined(_MSC_VER)
70+
return __popcnt16(x);
71+
#else
72+
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64
73+
constexpr unsigned long long msb12 = 0x1001001001001ULL;
74+
constexpr unsigned long long mask5 = 0x84210842108421ULL;
75+
76+
unsigned int v = (unsigned int)x;
77+
78+
return ((v & 0xfff) * msb12 & mask5) % 0x1f
79+
+ (((v & 0xfff000) >> 12) * msb12 & mask5) % 0x1f;
80+
#endif
81+
}
82+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
83+
{
84+
#if XSIMD_HAS_BUILTIN(__builtin_popcount)
85+
return __builtin_popcount(x);
86+
#elif defined(_MSC_VER)
87+
return __popcnt(x);
88+
#else
89+
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
90+
x = x - ((x >> 1) & (T) ~(T)0 / 3);
91+
x = (x & (T) ~(T)0 / 15 * 3) + ((x >> 2) & (T) ~(T)0 / 15 * 3);
92+
x = (x + (x >> 4)) & (T) ~(T)0 / 255 * 15;
93+
return (x * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * CHAR_BIT;
94+
#endif
95+
}
96+
else
97+
{
98+
// sizeof(T) == 8
99+
#if XSIMD_HAS_BUILTIN(__builtin_popcountll)
100+
return __builtin_popcountll(x);
101+
#elif XSIMD_HAS_BUILTIN(__builtin_popcount)
102+
return __builtin_popcount((unsigned int)x) + __builtin_popcount((unsigned int)(x >> 32));
103+
#elif defined(_MSC_VER)
104+
#ifdef _M_X64
105+
return (int)__popcnt64(x);
106+
#else
107+
return (int)(__popcnt((unsigned int)x) + __popcnt((unsigned int)(x >> 32)));
108+
#endif
109+
#else
110+
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
111+
x = x - ((x >> 1) & (T) ~(T)0 / 3);
112+
x = (x & (T) ~(T)0 / 15 * 3) + ((x >> 2) & (T) ~(T)0 / 15 * 3);
113+
x = (x + (x >> 4)) & (T) ~(T)0 / 255 * 15;
114+
return (x * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * CHAR_BIT;
115+
#endif
116+
}
117+
#endif
118+
}
119+
120+
template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
121+
XSIMD_INLINE int countl_zero(T x) noexcept
122+
{
123+
#if XSIMD_HAS_BUILTIN(__builtin_clzg)
124+
return __builtin_clzg(x, (int)(sizeof(T) * CHAR_BIT));
125+
#else
126+
if (x == 0)
127+
return sizeof(T) * CHAR_BIT;
128+
129+
XSIMD_IF_CONSTEXPR(sizeof(T) <= 4)
130+
{
131+
#if XSIMD_HAS_BUILTIN(__builtin_clz)
132+
return __builtin_clz((unsigned int)x) - (4 - sizeof(T)) * CHAR_BIT;
133+
#elif defined(_MSC_VER)
134+
unsigned long index;
135+
_BitScanReverse(&index, (unsigned long)x);
136+
return sizeof(T) * CHAR_BIT - index - 1;
137+
#else
138+
x |= x >> 1;
139+
x |= x >> 2;
140+
x |= x >> 4;
141+
XSIMD_IF_CONSTEXPR(sizeof(T) >= 2)
142+
{
143+
x |= x >> 8;
144+
}
145+
XSIMD_IF_CONSTEXPR(sizeof(T) >= 4)
146+
{
147+
x |= x >> 16;
148+
}
149+
return sizeof(T) * CHAR_BIT - popcount(x);
150+
#endif
151+
}
152+
else
153+
{
154+
// sizeof(T) == 8
155+
#if XSIMD_HAS_BUILTIN(__builtin_clzll)
156+
return __builtin_clzll((unsigned long long)x);
157+
#elif defined(_MSC_VER) && defined(_M_X64)
158+
unsigned long index;
159+
_BitScanReverse64(&index, (unsigned long long)x);
160+
return sizeof(T) * CHAR_BIT - index - 1;
161+
#else
162+
x |= x >> 1;
163+
x |= x >> 2;
164+
x |= x >> 4;
165+
x |= x >> 8;
166+
x |= x >> 16;
167+
x |= x >> 32;
168+
return sizeof(T) * CHAR_BIT - popcount(x);
169+
#endif
170+
}
171+
#endif
172+
}
173+
174+
template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
175+
XSIMD_INLINE int countl_one(T x) noexcept
176+
{
177+
return countl_zero(T(~x));
178+
}
179+
180+
template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
181+
XSIMD_INLINE int countr_zero(T x) noexcept
182+
{
183+
#if XSIMD_HAS_BUILTIN(__builtin_ctzg)
184+
return __builtin_ctzg(x, (int)(sizeof(T) * CHAR_BIT));
185+
#else
186+
if (x == 0)
187+
return sizeof(T) * CHAR_BIT;
188+
189+
XSIMD_IF_CONSTEXPR(sizeof(T) <= 4)
190+
{
191+
#if XSIMD_HAS_BUILTIN(__builtin_ctz)
192+
return __builtin_ctz((unsigned int)x);
193+
#elif defined(_MSC_VER)
194+
unsigned long index;
195+
_BitScanForward(&index, (unsigned long)x);
196+
return index;
197+
#endif
198+
}
199+
else
200+
{
201+
// sizeof(T) == 8
202+
#if XSIMD_HAS_BUILTIN(__builtin_ctzll)
203+
return __builtin_ctzll((unsigned long long)x);
204+
#elif defined(_MSC_VER) && defined(_M_X64)
205+
unsigned long index;
206+
_BitScanForward64(&index, (unsigned long long)x);
207+
return index;
208+
#endif
209+
}
210+
211+
// https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup
212+
return popcount((T)((x & -x) - 1));
213+
#endif
214+
}
215+
216+
template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
217+
XSIMD_INLINE int countr_one(T x) noexcept
218+
{
219+
return countr_zero(T(~x));
220+
}
221+
222+
}
223+
}
224+
225+
#endif
226+
#endif

include/xsimd/arch/common/xsimd_common_logical.hpp

Lines changed: 31 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#ifndef XSIMD_COMMON_LOGICAL_HPP
1313
#define XSIMD_COMMON_LOGICAL_HPP
1414

15+
#include "./xsimd_common_bit.hpp"
1516
#include "./xsimd_common_details.hpp"
1617

1718
#include <climits>
@@ -28,43 +29,37 @@ namespace xsimd
2829
// Common (architecture-independent) kernel for `count` on a batch_bool:
// returns the population count of the bitmask produced by self.mask(), as
// computed by xsimd::detail::popcount.
// NOTE(review): this is a rendered diff. The lines following an `NN-` marker
// are the removed previous implementation (inline bit-twiddling popcount);
// the lines following an `NN+` marker form the new body.
template <class A, class T>
2930
XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<common>) noexcept
3031
{
31-
uint64_t m = self.mask();
32-
XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size < 14)
33-
{
34-
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64
35-
return (m * 0x200040008001ULL & 0x111111111111111ULL) % 0xf;
36-
}
37-
else
38-
{
39-
#if defined __has_builtin
40-
#if __has_builtin(__builtin_popcountg)
41-
#define builtin_popcount(v) __builtin_popcountg(v)
42-
#endif
43-
#endif
32+
// New implementation: delegate to the shared scalar popcount helper.
return xsimd::detail::popcount(self.mask());
33+
}
4434

45-
#ifdef builtin_popcount
46-
return builtin_popcount(m);
47-
#else
48-
// FIXME: we could do better by dispatching to the appropriate
49-
// popcount instruction depending on the arch...
50-
XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size <= 32)
51-
{
52-
uint32_t m32 = static_cast<uint32_t>(m);
53-
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
54-
m32 = m32 - ((m32 >> 1) & 0x55555555); // reuse input as temporary
55-
m32 = (m32 & 0x33333333) + ((m32 >> 2) & 0x33333333); // temp
56-
return (((m32 + (m32 >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; // count
57-
}
58-
else
59-
{
60-
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
61-
m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp
62-
m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp
63-
m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp
64-
return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count
65-
}
66-
#endif
67-
}
35+
template <class A, class T>
36+
XSIMD_INLINE size_t countl_zero(batch_bool<T, A> const& self, requires_arch<common>) noexcept
37+
{
38+
constexpr size_t unused_bits = 64 - batch_bool<T, A>::size;
39+
constexpr uint64_t lower_mask = batch_bool<T, A>::size < 64 ? ((uint64_t)1 << (batch_bool<T, A>::size % 64)) - 1 : (uint64_t)-1;
40+
return xsimd::detail::countl_zero(self.mask() & lower_mask) - unused_bits;
41+
}
42+
43+
template <class A, class T>
44+
XSIMD_INLINE size_t countl_one(batch_bool<T, A> const& self, requires_arch<common>) noexcept
45+
{
46+
constexpr size_t unused_bits = 64 - batch_bool<T, A>::size;
47+
constexpr uint64_t upper_mask = batch_bool<T, A>::size < 64 ? ~(((uint64_t)1 << (batch_bool<T, A>::size % 64)) - 1) : (uint64_t)0;
48+
return xsimd::detail::countl_one(self.mask() | upper_mask) - unused_bits;
49+
}
50+
51+
template <class A, class T>
52+
XSIMD_INLINE size_t countr_zero(batch_bool<T, A> const& self, requires_arch<common>) noexcept
53+
{
54+
constexpr uint64_t stop = batch_bool<T, A>::size < 64 ? (uint64_t)1 << (batch_bool<T, A>::size % 64) : 0;
55+
return xsimd::detail::countr_zero(self.mask() | stop);
56+
}
57+
58+
template <class A, class T>
59+
XSIMD_INLINE size_t countr_one(batch_bool<T, A> const& self, requires_arch<common>) noexcept
60+
{
61+
constexpr uint64_t stop = batch_bool<T, A>::size < 64 ? ~((uint64_t)1 << (batch_bool<T, A>::size % 64)) : (uint64_t)-1;
62+
return xsimd::detail::countr_one(self.mask() & stop);
6863
}
6964

7065
// from mask

include/xsimd/arch/xsimd_common.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#define XSIMD_COMMON_HPP
1414

1515
#include "./common/xsimd_common_arithmetic.hpp"
16+
#include "./common/xsimd_common_bit.hpp"
1617
#include "./common/xsimd_common_cast.hpp"
1718
#include "./common/xsimd_common_complex.hpp"
1819
#include "./common/xsimd_common_logical.hpp"

0 commit comments

Comments
 (0)