Skip to content

Commit 5f9d5bf

Browse files
TocarIP authored and copybara-github committed
Optimize crc32 on AMD Milan+
We have AVX-encoded vector PCLMULQDQ on Milan, so use it to make crc32c computations ~10% faster. We need to use inline asm, since building this twice with different compiler flags for dynamic dispatch performed worse due to missing inlining.

BM_Calculate/0 1.136n ± 0% 1.136n ± 1% ~ (p=0.968 n=6)
BM_Calculate/1 1.420n ± 0% 1.421n ± 1% ~ (p=0.870 n=6)
BM_Calculate/100 9.089n ± 0% 9.660n ± 1% +6.29% (p=0.002 n=6)
BM_Calculate/2048 75.30n ± 1% 67.67n ± 1% -10.13% (p=0.002 n=6)
BM_Calculate/10000 313.1n ± 0% 286.1n ± 0% -8.63% (p=0.002 n=6)
BM_Calculate/500000 14.91µ ± 4% 13.49µ ± 1% -9.48% (p=0.002 n=6)
BM_Extend/0 1.136n ± 1% 1.136n ± 1% ~ (p=0.636 n=6)
BM_Extend/1 1.420n ± 0% 1.420n ± 1% ~ (p=0.636 n=6)
BM_Extend/100 9.247n ± 2% 9.800n ± 2% +5.99% (p=0.002 n=6)
BM_Extend/2048 75.73n ± 1% 67.37n ± 1% -11.04% (p=0.002 n=6)
BM_Extend/10000 313.2n ± 1% 286.2n ± 0% -8.62% (p=0.002 n=6)
BM_Extend/500000 14.87µ ± 1% 13.57µ ± 1% -8.74% (p=0.002 n=6)
BM_Extend/100000000 3.185m ± 2% 2.816m ± 3% -11.60% (p=0.002 n=6)
BM_ExtendCacheMiss/10 26.07m ± 1% 26.06m ± 1% ~ (p=1.000 n=6)
BM_ExtendCacheMiss/100 13.86m ± 4% 14.36m ± 2% +3.61% (p=0.026 n=6)
BM_ExtendCacheMiss/1000 27.02m ± 4% 27.28m ± 4% ~ (p=0.699 n=6)
BM_ExtendCacheMiss/100000 5.114m ± 5% 4.600m ± 8% -10.07% (p=0.002 n=6)
BM_ExtendByZeroes/1 1.420n ± 0% 1.420n ± 0% ~ (p=0.670 n=12)
BM_ExtendByZeroes/10 1.704n ± 1% 1.704n ± 0% ~ (p=1.000 n=6)
BM_ExtendByZeroes/100 3.128n ± 0% 3.128n ± 0% ~ (p=1.000 n=6)
BM_ExtendByZeroes/1000 6.758n ± 0% 6.638n ± 1% -1.78% (p=0.002 n=6)
BM_ExtendByZeroes/10000 6.619n ± 1% 6.503n ± 0% -1.75% (p=0.002 n=6)
BM_ExtendByZeroes/100000 8.537n ± 1% 8.479n ± 0% -0.67% (p=0.019 n=6)
BM_ExtendByZeroes/1000000 9.766n ± 1% 9.692n ± 1% -0.75% (p=0.002 n=6)

PiperOrigin-RevId: 900870516
Change-Id: I1382ae2ffeed35e1d55a0916290144cae5256fe0
1 parent cd0423d commit 5f9d5bf

2 files changed

Lines changed: 182 additions & 80 deletions

File tree

absl/crc/internal/crc32_x86_arm_combined_simd.h

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
#ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
1616
#define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
1717

18+
#include <array>
1819
#include <cstdint>
1920

2021
#include "absl/base/config.h"
@@ -65,6 +66,13 @@ using V128 = uint64x2_t;
6566
using V128 = __m128i;
6667
#endif
6768

69+
// V256 names the 256-bit vector type; which underlying type it aliases is
// selected at compile time by whether AVX is enabled.
#if defined(__AVX__)
70+
// AVX is available: alias the native 256-bit integer vector type.
using V256 = __m256i;
71+
#else
72+
// Placeholder for V256 when AVX is not available.
// Four uint64_t elements, i.e. 4 * 64 = 256 bits of storage.
73+
using V256 = std::array<uint64_t, 4>;
74+
#endif
75+
6876
// Starting with the initial value in |crc|, accumulates a CRC32 value for
6977
// unsigned integers of different sizes.
7078
uint32_t CRC32_u8(uint32_t crc, uint8_t v);
@@ -119,6 +127,17 @@ int64_t V128_Low64(const V128 l);
119127
// Add packed 64-bit integers in |l| and |r|.
120128
V128 V128_Add64(const V128 l, const V128 r);
121129

130+
// Declarations of the V256 helpers. With AVX they are plain inline functions
// operating on the native V256 type; otherwise they are declared as templates
// whose type parameter defaults to the placeholder V256.
#if defined(__AVX__)
131+
// Loads a V256 from |src|.
inline V256 V256_LoadU(const V256* src);
132+
// Produces a V256 from the 128-bit value at |src|.
inline V256 V256_Broadcast128(const V128* src);
133+
#else
134+
// Template form of V256_LoadU used when AVX is not available; T defaults to
// V256.
template <typename T = V256>
135+
T V256_LoadU(const T* src);
136+
137+
// Template form of V256_Broadcast128 used when AVX is not available. T cannot
// be deduced from the argument, so it falls back to the default (V256) unless
// spelled out by the caller.
template <typename T = V256>
138+
T V256_Broadcast128(const V128* src);
139+
#endif
140+
122141
#endif
123142

124143
#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
@@ -271,6 +290,26 @@ inline V128 V128_Add64(const V128 l, const V128 r) { return vaddq_u64(l, r); }
271290

272291
#endif
273292

293+
#if defined(__AVX__)
294+
// Reads a full V256 from |src| with the unaligned-load intrinsic
// _mm256_loadu_si256 and returns it by value.
inline V256 V256_LoadU(const V256* src) {
  const V256 loaded = _mm256_loadu_si256(src);
  return loaded;
}
295+
296+
// Builds a V256 from the 128-bit value at |src| using _mm256_broadcast_ps.
// The pointer and result casts only reinterpret types so that the float-typed
// broadcast intrinsic can be applied to integer vectors.
inline V256 V256_Broadcast128(const V128* src) {
  const __m128* as_float_lanes = reinterpret_cast<const __m128*>(src);
  const __m256 duplicated = _mm256_broadcast_ps(as_float_lanes);
  return _mm256_castps_si256(duplicated);
}
300+
#elif defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) || \
301+
defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
302+
// Placeholder V256_LoadU for builds without AVX. It never reads through |src|
// and always returns a value-initialized T.
template <typename T>
inline T V256_LoadU(const T* src) {
  // |src| is intentionally ignored; the cast keeps -Wunused-parameter quiet
  // for translation units that include this header with strict warnings.
  static_cast<void>(src);
  return T{};
}
306+
307+
template <typename T>
308+
inline T V256_Broadcast128(const V128* src) {
309+
return T{};
310+
}
311+
#endif
312+
274313
} // namespace crc_internal
275314
ABSL_NAMESPACE_END
276315
} // namespace absl

0 commit comments

Comments
 (0)