Skip to content

Commit 3f3b9f3

Browse files
committed
perf: AVX2 vectorized prefix-sum in percentile scan (manually applied).
2 parents aa4eed7 + 64dd6fa commit 3f3b9f3

3 files changed

Lines changed: 117 additions & 7 deletions

File tree

src/hdr_histogram.c

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@
3434
# define HDR_UNLIKELY(x) (x)
3535
#endif
3636

37+
/* Runtime-dispatched AVX2 path: keep the rest of this TU at the project's
38+
baseline ISA so the shipped binary does not silently require AVX2. */
39+
#if (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) \
40+
&& (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
41+
# define HDR_HAS_AVX2_DISPATCH 1
42+
# include <immintrin.h>
43+
#endif
44+
3745
/* ###### ####### ## ## ## ## ######## ###### */
3846
/* ## ## ## ## ## ## ### ## ## ## ## */
3947
/* ## ## ## ## ## #### ## ## ## */
@@ -700,22 +708,65 @@ int64_t hdr_min(const struct hdr_histogram* h)
700708
return non_zero_min(h);
701709
}
702710

703-
static int64_t get_value_from_idx_up_to_count(const struct hdr_histogram* h, int64_t count_at_percentile)
711+
static int64_t get_value_from_idx_up_to_count_scalar(
712+
const struct hdr_histogram* h, int64_t count_at_percentile)
704713
{
705714
int64_t count_to_idx = 0;
706-
707-
count_at_percentile = 0 < count_at_percentile ? count_at_percentile : 1;
708-
for (int32_t idx = 0; idx < h->counts_len; idx++)
709-
{
715+
for (int32_t idx = 0; idx < h->counts_len; idx++) {
710716
count_to_idx += h->counts[idx];
711717
if (count_to_idx >= count_at_percentile)
712-
{
713718
return hdr_value_at_index(h, idx);
714-
}
715719
}
720+
return 0;
721+
}
716722

723+
#ifdef HDR_HAS_AVX2_DISPATCH
724+
__attribute__((target("avx2")))
725+
static int64_t get_value_from_idx_up_to_count_avx2(
726+
const struct hdr_histogram* h, int64_t count_at_percentile)
727+
{
728+
int64_t running = 0;
729+
int32_t idx = 0;
730+
const int32_t limit = h->counts_len & ~3;
731+
732+
for (; idx < limit; idx += 4) {
733+
__m256i v = _mm256_loadu_si256((const __m256i*)&h->counts[idx]);
734+
__m128i lo = _mm256_castsi256_si128(v);
735+
__m128i hi = _mm256_extracti128_si256(v, 1);
736+
__m128i s = _mm_add_epi64(lo, hi);
737+
/* Lanes are non-negative counts whose total fits in int64_t (total_count
738+
invariant), so the chunk sum cannot overflow under valid state. Use
739+
unsigned add to avoid signed-overflow UB if invariants are violated. */
740+
int64_t chunk = (int64_t)((uint64_t)_mm_extract_epi64(s, 0)
741+
+ (uint64_t)_mm_extract_epi64(s, 1));
742+
743+
if (__builtin_expect(running + chunk >= count_at_percentile, 0)) {
744+
for (int32_t j = idx; j < idx + 4; j++) {
745+
running += h->counts[j];
746+
if (running >= count_at_percentile)
747+
return hdr_value_at_index(h, j);
748+
}
749+
}
750+
running += chunk;
751+
}
752+
for (; idx < h->counts_len; idx++) {
753+
running += h->counts[idx];
754+
if (running >= count_at_percentile)
755+
return hdr_value_at_index(h, idx);
756+
}
717757
return 0;
718758
}
759+
#endif
760+
761+
/*
 * Entry point for the percentile scan.
 *
 * Raises non-positive targets to 1, then hands off to the AVX2 scan when the
 * build enables runtime dispatch and the CPU reports AVX2 support; otherwise
 * falls back to the scalar scan.
 */
static int64_t get_value_from_idx_up_to_count(const struct hdr_histogram* h, int64_t count_at_percentile)
{
    if (count_at_percentile <= 0)
    {
        count_at_percentile = 1;
    }

#ifdef HDR_HAS_AVX2_DISPATCH
    if (__builtin_cpu_supports("avx2"))
    {
        return get_value_from_idx_up_to_count_avx2(h, count_at_percentile);
    }
#endif

    return get_value_from_idx_up_to_count_scalar(h, count_at_percentile);
}
719770

720771

721772
int64_t hdr_value_at_percentile(const struct hdr_histogram* h, double percentile)

test/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
option(HDR_HISTOGRAM_BUILD_BENCHMARK "Build benchmark" OFF)
22
if(HDR_HISTOGRAM_BUILD_BENCHMARK)
3+
add_executable(hdr_percentile_bench hdr_percentile_bench.c)
4+
target_link_libraries(hdr_percentile_bench hdr_histogram_static)
5+
36
if(UNIX)
47
enable_language(CXX)
58

test/hdr_percentile_bench.c

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#include <stdint.h>
2+
#include <stdlib.h>
3+
#include <stdio.h>
4+
#include <hdr/hdr_histogram.h>
5+
#include <hdr/hdr_time.h>
6+
7+
/*
 * Returns e - s as a timespec, borrowing one second when the nanosecond
 * field of e is smaller than that of s.
 */
static hdr_timespec diff(hdr_timespec s, hdr_timespec e)
{
    hdr_timespec delta;
    const int borrow = e.tv_nsec < s.tv_nsec;

    if (borrow)
    {
        delta.tv_sec = e.tv_sec - s.tv_sec - 1;
        delta.tv_nsec = 1000000000 + e.tv_nsec - s.tv_nsec;
        return delta;
    }

    delta.tv_sec = e.tv_sec - s.tv_sec;
    delta.tv_nsec = e.tv_nsec - s.tv_nsec;
    return delta;
}
13+
14+
int main(void) {
15+
struct hdr_histogram* h;
16+
hdr_init(1, INT64_C(3600000000), 3, &h);
17+
18+
/* Spread non-zero entries across the full bucket range so the percentile
19+
scan actually walks past the prologue. Fibonacci-hash spread (constant
20+
2654435761) gives an even distribution over a coprime modulus. */
21+
for (int64_t v = 1; v <= 1000000; v++) {
22+
int64_t value = (int64_t)(((uint64_t)v * 2654435761u) % 1000000000u) + 1;
23+
hdr_record_value(h, value);
24+
}
25+
26+
const double percentiles[] = {50.0, 75.0, 90.0, 95.0, 99.0, 99.9, 99.99};
27+
const int n_percentiles = (int)(sizeof(percentiles) / sizeof(percentiles[0]));
28+
const int64_t iters = 1000000;
29+
const int warmup_runs = 3;
30+
const int n_runs = 20;
31+
32+
double best_secs = 1e18, total_secs = 0;
33+
int64_t sink_total = 0;
34+
for (int run = 0; run < warmup_runs + n_runs; run++) {
35+
hdr_timespec t0, t1;
36+
hdr_gettime(&t0);
37+
volatile int64_t sink = 0;
38+
for (int64_t i = 0; i < iters; i++)
39+
sink += hdr_value_at_percentile(h, percentiles[i % n_percentiles]);
40+
hdr_gettime(&t1);
41+
hdr_timespec taken = diff(t0, t1);
42+
double secs = taken.tv_sec + taken.tv_nsec / 1e9;
43+
if (run >= warmup_runs) {
44+
if (secs < best_secs) best_secs = secs;
45+
total_secs += secs;
46+
sink_total += sink;
47+
}
48+
}
49+
double best_qps = iters / best_secs / 1e6;
50+
double mean_qps = iters / (total_secs / n_runs) / 1e6;
51+
printf("best: %.2f M queries/sec mean: %.2f M queries/sec (sink=%lld)\n",
52+
best_qps, mean_qps, (long long)sink_total);
53+
54+
hdr_close(h);
55+
return 0;
56+
}

0 commit comments

Comments
 (0)