|
34 | 34 | # define HDR_UNLIKELY(x) (x) |
35 | 35 | #endif |
36 | 36 |
|
| 37 | +/* Runtime-dispatched AVX2 path: keep the rest of this TU at the project's |
| 38 | + baseline ISA so the shipped binary does not silently require AVX2. */ |
| 39 | +#if (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) \ |
| 40 | + && (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER) |
| 41 | +# define HDR_HAS_AVX2_DISPATCH 1 |
| 42 | +# include <immintrin.h> |
| 43 | +#endif |
| 44 | + |
37 | 45 | /* ###### ####### ## ## ## ## ######## ###### */ |
38 | 46 | /* ## ## ## ## ## ## ### ## ## ## ## */ |
39 | 47 | /* ## ## ## ## ## #### ## ## ## */ |
@@ -700,22 +708,65 @@ int64_t hdr_min(const struct hdr_histogram* h) |
700 | 708 | return non_zero_min(h); |
701 | 709 | } |
702 | 710 |
|
703 | | -static int64_t get_value_from_idx_up_to_count(const struct hdr_histogram* h, int64_t count_at_percentile) |
| 711 | +static int64_t get_value_from_idx_up_to_count_scalar( |
| 712 | + const struct hdr_histogram* h, int64_t count_at_percentile) |
704 | 713 | { |
705 | 714 | int64_t count_to_idx = 0; |
706 | | - |
707 | | - count_at_percentile = 0 < count_at_percentile ? count_at_percentile : 1; |
708 | | - for (int32_t idx = 0; idx < h->counts_len; idx++) |
709 | | - { |
| 715 | + for (int32_t idx = 0; idx < h->counts_len; idx++) { |
710 | 716 | count_to_idx += h->counts[idx]; |
711 | 717 | if (count_to_idx >= count_at_percentile) |
712 | | - { |
713 | 718 | return hdr_value_at_index(h, idx); |
714 | | - } |
715 | 719 | } |
| 720 | + return 0; |
| 721 | +} |
716 | 722 |
|
| 723 | +#ifdef HDR_HAS_AVX2_DISPATCH |
| 724 | +__attribute__((target("avx2"))) |
| 725 | +static int64_t get_value_from_idx_up_to_count_avx2( |
| 726 | + const struct hdr_histogram* h, int64_t count_at_percentile) |
| 727 | +{ |
| 728 | + int64_t running = 0; |
| 729 | + int32_t idx = 0; |
| 730 | + const int32_t limit = h->counts_len & ~3; |
| 731 | + |
| 732 | + for (; idx < limit; idx += 4) { |
| 733 | + __m256i v = _mm256_loadu_si256((const __m256i*)&h->counts[idx]); |
| 734 | + __m128i lo = _mm256_castsi256_si128(v); |
| 735 | + __m128i hi = _mm256_extracti128_si256(v, 1); |
| 736 | + __m128i s = _mm_add_epi64(lo, hi); |
| 737 | + /* Lanes are non-negative counts whose total fits in int64_t (total_count |
| 738 | + invariant), so the chunk sum cannot overflow under valid state. Use |
| 739 | + unsigned add to avoid signed-overflow UB if invariants are violated. */ |
| 740 | + int64_t chunk = (int64_t)((uint64_t)_mm_extract_epi64(s, 0) |
| 741 | + + (uint64_t)_mm_extract_epi64(s, 1)); |
| 742 | + |
| 743 | + if (__builtin_expect(running + chunk >= count_at_percentile, 0)) { |
| 744 | + for (int32_t j = idx; j < idx + 4; j++) { |
| 745 | + running += h->counts[j]; |
| 746 | + if (running >= count_at_percentile) |
| 747 | + return hdr_value_at_index(h, j); |
| 748 | + } |
| 749 | + } |
| 750 | + running += chunk; |
| 751 | + } |
| 752 | + for (; idx < h->counts_len; idx++) { |
| 753 | + running += h->counts[idx]; |
| 754 | + if (running >= count_at_percentile) |
| 755 | + return hdr_value_at_index(h, idx); |
| 756 | + } |
717 | 757 | return 0; |
718 | 758 | } |
| 759 | +#endif |
| 760 | + |
| 761 | +static int64_t get_value_from_idx_up_to_count(const struct hdr_histogram* h, int64_t count_at_percentile) |
| 762 | +{ |
| 763 | + count_at_percentile = count_at_percentile > 0 ? count_at_percentile : 1; |
| 764 | +#ifdef HDR_HAS_AVX2_DISPATCH |
| 765 | + if (__builtin_cpu_supports("avx2")) |
| 766 | + return get_value_from_idx_up_to_count_avx2(h, count_at_percentile); |
| 767 | +#endif |
| 768 | + return get_value_from_idx_up_to_count_scalar(h, count_at_percentile); |
| 769 | +} |
719 | 770 |
|
720 | 771 |
|
721 | 772 | int64_t hdr_value_at_percentile(const struct hdr_histogram* h, double percentile) |
|
0 commit comments