Skip to content

Commit 11e5f59

Browse files
committed
Introduce powerOf2Floor() and popCount8() functions
This is a prerequisite for the feature "Graph meter coloring (with GraphData structure rework)". powerOf2Floor() will utilize __builtin_clz() or stdc_bit_floor_ui() (__builtin_clz() is preferred) if either is supported. popCount8() will utilize ARM NEON instructions and x86 POPCNT instruction if the machine supports either of them. I am not adopting the C23 standard interface stdc_count_ones_uc() yet, as I am not sure C libraries would implement it as fast as our version. Signed-off-by: Kang-Che Sung <explorer09@gmail.com>
1 parent 017a71a commit 11e5f59

3 files changed

Lines changed: 108 additions & 2 deletions

File tree

XUtils.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ in the source distribution for its full text.
1212
#include <assert.h>
1313
#include <errno.h>
1414
#include <fcntl.h>
15+
#include <limits.h> // IWYU pragma: keep
1516
#include <math.h>
1617
#include <stdarg.h>
1718
#include <stdint.h>
@@ -387,3 +388,14 @@ unsigned int countTrailingZeros(unsigned int x) {
387388
return mod37BitPosition[(-x & x) % 37];
388389
}
389390
#endif
391+
392+
#if !defined(HAVE_BUILTIN_CLZ) && !defined(HAVE_STDC_BIT_FLOOR)
393+
/* Returns the largest power of two that is less than or equal to x.
   If x is 0, returns 0.

   Portable fallback, compiled only when neither HAVE_BUILTIN_CLZ nor
   HAVE_STDC_BIT_FLOOR is defined (see the enclosing #if). */
unsigned int powerOf2Floor(unsigned int x) {
   /* Smear the highest set bit into every lower bit position by OR-ing in
      right-shifted copies of x with doubling shift amounts (1, 2, 4, ...).
      Afterwards x has the form 0...01...1 (or is still 0). */
   for (unsigned int shift = 1; shift < sizeof(x) * CHAR_BIT; shift <<= 1)
      x |= x >> shift;

   /* Subtract the lower run of ones so only the top set bit remains. */
   return x - (x >> 1);
}
401+
#endif

XUtils.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,27 @@ in the source distribution for its full text.
1515
#endif
1616

1717
#include <dirent.h>
18+
#include <limits.h> // IWYU pragma: keep
1819
#include <stdbool.h>
1920
#include <stddef.h> // IWYU pragma: keep
21+
#include <stdint.h> // IWYU pragma: keep
2022
#include <stdio.h>
2123
#include <stdlib.h> // IWYU pragma: keep
2224
#include <string.h> // IWYU pragma: keep
2325

2426
#include "Compat.h"
2527
#include "Macros.h"
2628

29+
#ifdef HAVE_STDBIT_H
30+
#include <stdbit.h>
31+
#endif
32+
33+
#if defined(HAVE_ARM_NEON_H) && defined(__ARM_NEON)
34+
// ARM C Language Extensions (ACLE) recommends checking __ARM_NEON before
35+
// including <arm_neon.h>
36+
#include <arm_neon.h>
37+
#endif
38+
2739

2840
ATTR_NORETURN
2941
void fail(void);
@@ -150,6 +162,49 @@ static inline unsigned int countTrailingZeros(unsigned int x) {
150162
unsigned int countTrailingZeros(unsigned int x);
151163
#endif
152164

165+
/* Returns the nearest power of two that is not greater than x.
166+
If x is 0, returns 0. */
167+
#if defined(HAVE_BUILTIN_CLZ)
168+
/* __builtin_clz() variant: the index of the highest set bit of x is
   (bit width of unsigned int) - 1 - (number of leading zero bits), and the
   result is 1 shifted left by that index. */
static inline unsigned int powerOf2Floor(unsigned int x) {
   /* __builtin_clz(0) has an undefined result, so handle zero up front. */
   if (x == 0)
      return 0;

   return 1U << ((int)(sizeof(x) * CHAR_BIT) - 1 - __builtin_clz(x));
}
174+
#elif defined(HAVE_STDC_BIT_FLOOR)
175+
/* <stdbit.h> variant, selected when HAVE_STDC_BIT_FLOOR is defined and
   HAVE_BUILTIN_CLZ is not: passes x to stdc_bit_floor_ui() and returns its
   result unchanged. */
static inline unsigned int powerOf2Floor(unsigned int x) {
176+
return stdc_bit_floor_ui(x);
177+
}
178+
#else
179+
unsigned int powerOf2Floor(unsigned int x);
180+
#endif
181+
182+
/* Returns the number of set bits (population count) of the 8-bit value x.

   One of three implementations is selected at compile time:
   1. ARM NEON intrinsics, when HAVE_ARM_NEON_H and __ARM_NEON are defined;
   2. __builtin_popcount(), when HAVE_BUILTIN_POPCOUNT and __POPCNT__ are
      defined;
   3. a portable multiply-and-mask routine otherwise. */
static inline unsigned int popCount8(uint8_t x) {
#if defined(HAVE_ARM_NEON_H) && defined(__ARM_NEON)
   // With ARM Advanced SIMD extension (NEON), this generates smaller code
   // than __builtin_popcount.
   //
   // Broadcast x into all lanes at once so that the compiler will not emit
   // an instruction to zero-initialize the other lanes.
   uint8x8_t lanes = vdup_n_u8(x);
   // Count the set bits of each 8-bit lane in the vector.
   lanes = vcnt_u8(lanes);
   // Keep lane 0 and discard lanes 1 to 7. The intrinsic returns uint8_t,
   // which converts implicitly to the unsigned int return type.
   return vget_lane_u8(lanes, 0);
#elif defined(HAVE_BUILTIN_POPCOUNT) && defined(__POPCNT__)
   // __builtin_popcount translates to the x86 POPCNT instruction when that
   // instruction is enabled ("-mpopcnt"). The builtin returns int, hence
   // the cast.
   return (unsigned int)__builtin_popcount(x);
#else
   // This code is optimized for uint8_t input and is smaller than the
   // subroutine call the compiler emits for __builtin_popcount (which is
   // tuned for unsigned int input and not uint8_t).
   //
   // The multiplication places copies of x at bit offsets 0, 9, 18 and 27.
   // The copies never overlap, so no carries occur; bits beyond bit 31 are
   // dropped by the conversion to uint32_t.
   uint32_t spread = (uint32_t)(x * 0x08040201U);
   // After the shift, the lowest bit of each of the eight nibbles holds a
   // different bit of x; the mask discards everything else.
   uint32_t nibbles = (spread >> 3) & 0x11111111U;
   // Multiplying by 0x11111111 adds all eight nibbles together in the top
   // nibble. The sum is at most 8, so it fits in those four bits.
   return (uint32_t)(nibbles * 0x11111111U) >> 28;
#endif
}
207+
153208
/* IEC unit prefixes */
154209
static const char unitPrefixes[] = { 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y', 'R', 'Q' };
155210

configure.ac

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,10 @@ fi])
182182

183183
# Optional Section
184184

185-
AC_CHECK_HEADERS([execinfo.h])
185+
AC_CHECK_HEADERS([ \
186+
execinfo.h \
187+
stdbit.h \
188+
])
186189

187190
if test "$my_htop_platform" = darwin; then
188191
AC_CHECK_HEADERS([mach/mach_time.h])
@@ -308,11 +311,47 @@ AC_LINK_IFELSE([
308311

309312
dnl Probe for the compiler builtin __builtin_ctz. A link test is used rather
dnl than a compile-only test: a compiler that lacks the builtin may accept
dnl the call as an implicitly declared function, and the failure then only
dnl shows up when the symbol cannot be resolved by the linker.
AC_MSG_CHECKING([for __builtin_ctz])
AC_LINK_IFELSE([
   AC_LANG_PROGRAM([], [[return __builtin_ctz(1U); /* Supported in GCC 3.4 or later */]])],
   [AC_DEFINE([HAVE_BUILTIN_CTZ], 1, [Define to 1 if the compiler supports '__builtin_ctz' function.])
   AC_MSG_RESULT([yes])],
   [AC_MSG_RESULT([no])])
315318

319+
dnl Probe for the compiler builtin __builtin_clz. A link test is used rather
dnl than a compile-only test: a compiler that lacks the builtin may accept
dnl the call as an implicitly declared function, and the failure then only
dnl shows up when the symbol cannot be resolved by the linker.
AC_MSG_CHECKING([for __builtin_clz])
AC_LINK_IFELSE([
   AC_LANG_PROGRAM([], [[return __builtin_clz(-1U); /* Supported in GCC 3.4 or later */]])],
   [AC_DEFINE([HAVE_BUILTIN_CLZ], 1, [Define to 1 if the compiler supports '__builtin_clz' function.])
   AC_MSG_RESULT([yes])],
   [AC_MSG_RESULT([no])])
325+
326+
dnl Probe for the compiler builtin __builtin_popcount. A link test is used
dnl rather than a compile-only test: a compiler that lacks the builtin may
dnl accept the call as an implicitly declared function, and the failure then
dnl only shows up when the symbol cannot be resolved by the linker.
AC_MSG_CHECKING([for __builtin_popcount])
AC_LINK_IFELSE([
   AC_LANG_PROGRAM([], [[return __builtin_popcount(0U); /* Supported in GCC 3.4 or later */]])],
   [AC_DEFINE([HAVE_BUILTIN_POPCOUNT], 1, [Define to 1 if the compiler supports '__builtin_popcount' function.])
   AC_MSG_RESULT([yes])],
   [AC_MSG_RESULT([no])])
332+
333+
dnl Probe for the <stdbit.h> bit-floor interface by linking a program that
dnl calls both the type-generic and the unsigned-int-specific form.
AC_MSG_CHECKING([for stdc_bit_floor])
AC_LINK_IFELSE([
   AC_LANG_PROGRAM(
      [[
#include <stdbit.h>
      ]],
      [[
         /* Both the type-generic and type-specific versions should exist.
            htop uses the type-specific version. */
         return stdc_bit_floor(0U) || stdc_bit_floor_ui(0U);
      ]])],
   [AC_DEFINE([HAVE_STDC_BIT_FLOOR], 1, [Define to 1 if stdc_bit_floor functions are supported.])
   AC_MSG_RESULT([yes])],
   [AC_MSG_RESULT([no])])
347+
348+
dnl Look for <arm_neon.h> only when the host CPU name starts with "arm" or
dnl "aarch64"; other hosts skip the check entirely.
case "$host_cpu" in
arm*|aarch64*)
   dnl ARM NEON intrinsics
   AC_CHECK_HEADERS([arm_neon.h])
   ;;
esac
354+
316355
# ----------------------------------------------------------------------
317356

318357

0 commit comments

Comments
 (0)