|
26 | 26 | #include "arrow/util/bpacking_scalar_internal.h" |
27 | 27 | #include "arrow/util/bpacking_simd_internal.h" |
28 | 28 |
|
29 | | -#if defined(ARROW_HAVE_RUNTIME_AVX2) |
| 29 | +#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE128) |
30 | 30 | # include "arrow/util/cpu_info.h" |
31 | 31 | #endif |
32 | 32 |
|
@@ -107,10 +107,10 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo |
107 | 107 | // will not emit runs larger than 512 (though other implementations might), so we biased |
108 | 108 | // the benchmarks towards a rather small scale. |
109 | 109 | static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2); |
110 | | -constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8}; |
111 | | -constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13}; |
112 | | -constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20}; |
113 | | -constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47}; |
| 110 | +constexpr auto kBitWidths8 = std::initializer_list<int64_t>{1, 2, 8}; |
| 111 | +constexpr auto kBitWidths16 = std::initializer_list<int64_t>{1, 2, 8, 13}; |
| 112 | +constexpr auto kBitWidths32 = std::initializer_list<int64_t>{1, 2, 8, 20}; |
| 113 | +constexpr auto kBitWidths64 = std::initializer_list<int64_t>{1, 2, 8, 20, 47}; |
114 | 114 |
|
115 | 115 | static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = { |
116 | 116 | {0, 1}, |
@@ -159,125 +159,69 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t> |
159 | 159 | return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg)); |
160 | 160 | } |
161 | 161 |
|
162 | | -BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &bpacking::unpack_scalar<bool>) |
163 | | - ->ArgsProduct(kBitWidthsNumValuesBool); |
164 | | -BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false, |
165 | | - &bpacking::unpack_scalar<uint8_t>) |
166 | | - ->ArgsProduct(kBitWidthsNumValues8); |
167 | | -BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, |
168 | | - &bpacking::unpack_scalar<uint16_t>) |
169 | | - ->ArgsProduct(kBitWidthsNumValues16); |
170 | | -BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, |
171 | | - &bpacking::unpack_scalar<uint32_t>) |
172 | | - ->ArgsProduct(kBitWidthsNumValues32); |
173 | | -BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, |
174 | | - &bpacking::unpack_scalar<uint64_t>) |
175 | | - ->ArgsProduct(kBitWidthsNumValues64); |
| 162 | +// Register BM_Unpack{Bool,Uint8,Uint16,Uint32,Uint64} benchmarks for a given |
| 163 | +// UNPACK_FUNC instantiated with each of those types; SKIP and SKIP_MSG are forwarded to each. |
| 164 | +#define BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, SKIP, SKIP_MSG) \ |
| 165 | + BENCHMARK_CAPTURE(BM_UnpackBool, LABEL, ALIGNED, &UNPACK_FUNC<bool>, SKIP, SKIP_MSG) \ |
| 166 | + ->ArgsProduct(kBitWidthsNumValuesBool); \ |
| 167 | + BENCHMARK_CAPTURE(BM_UnpackUint8, LABEL, ALIGNED, &UNPACK_FUNC<uint8_t>, SKIP, \ |
| 168 | + SKIP_MSG) \ |
| 169 | + ->ArgsProduct(kBitWidthsNumValues8); \ |
| 170 | + BENCHMARK_CAPTURE(BM_UnpackUint16, LABEL, ALIGNED, &UNPACK_FUNC<uint16_t>, SKIP, \ |
| 171 | + SKIP_MSG) \ |
| 172 | + ->ArgsProduct(kBitWidthsNumValues16); \ |
| 173 | + BENCHMARK_CAPTURE(BM_UnpackUint32, LABEL, ALIGNED, &UNPACK_FUNC<uint32_t>, SKIP, \ |
| 174 | + SKIP_MSG) \ |
| 175 | + ->ArgsProduct(kBitWidthsNumValues32); \ |
| 176 | + BENCHMARK_CAPTURE(BM_UnpackUint64, LABEL, ALIGNED, &UNPACK_FUNC<uint64_t>, SKIP, \ |
| 177 | + SKIP_MSG) \ |
| 178 | + ->ArgsProduct(kBitWidthsNumValues64) |
| 179 | + |
| 180 | +#define BENCHMARK_UNPACK_ALL_TYPES(LABEL, ALIGNED, UNPACK_FUNC) \ |
| 181 | + BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, false, "") |
| 182 | + |
| 183 | +#define BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(LABEL, ALIGNED, UNPACK_FUNC, CPU_FEATURE, \ |
| 184 | + SKIP_MSG) \ |
| 185 | + BENCHMARK_UNPACK_ALL_TYPES_SKIP( \ |
| 186 | + LABEL, ALIGNED, UNPACK_FUNC, \ |
| 187 | + !CpuInfo::GetInstance()->IsSupported(CpuInfo::CPU_FEATURE), SKIP_MSG) |
| 188 | + |
| 189 | +BENCHMARK_UNPACK_ALL_TYPES(ScalarUnaligned, false, bpacking::unpack_scalar); |
176 | 190 |
|
177 | 191 | #if defined(ARROW_HAVE_SSE4_2) |
178 | | -BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &bpacking::unpack_sse4_2<bool>) |
179 | | - ->ArgsProduct(kBitWidthsNumValuesBool); |
180 | | -BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false, |
181 | | - &bpacking::unpack_sse4_2<uint8_t>) |
182 | | - ->ArgsProduct(kBitWidthsNumValues8); |
183 | | -BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, |
184 | | - &bpacking::unpack_sse4_2<uint16_t>) |
185 | | - ->ArgsProduct(kBitWidthsNumValues16); |
186 | | -BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, |
187 | | - &bpacking::unpack_sse4_2<uint32_t>) |
188 | | - ->ArgsProduct(kBitWidthsNumValues32); |
189 | | -BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, |
190 | | - &bpacking::unpack_sse4_2<uint64_t>) |
191 | | - ->ArgsProduct(kBitWidthsNumValues64); |
| 192 | +BENCHMARK_UNPACK_ALL_TYPES(Sse42Unaligned, false, bpacking::unpack_sse4_2); |
192 | 193 | #endif |
193 | 194 |
|
194 | 195 | #if defined(ARROW_HAVE_RUNTIME_AVX2) |
195 | | -BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &bpacking::unpack_avx2<bool>, |
196 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), |
197 | | - "Avx2 not available") |
198 | | - ->ArgsProduct(kBitWidthsNumValuesBool); |
199 | | -BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &bpacking::unpack_avx2<uint8_t>, |
200 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), |
201 | | - "Avx2 not available") |
202 | | - ->ArgsProduct(kBitWidthsNumValues8); |
203 | | -BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &bpacking::unpack_avx2<uint16_t>, |
204 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), |
205 | | - "Avx2 not available") |
206 | | - ->ArgsProduct(kBitWidthsNumValues16); |
207 | | -BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, &bpacking::unpack_avx2<uint32_t>, |
208 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), |
209 | | - "Avx2 not available") |
210 | | - ->ArgsProduct(kBitWidthsNumValues32); |
211 | | -BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &bpacking::unpack_avx2<uint64_t>, |
212 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), |
213 | | - "Avx2 not available") |
214 | | - ->ArgsProduct(kBitWidthsNumValues64); |
| 196 | +BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx2Unaligned, false, bpacking::unpack_avx2, AVX2, |
| 197 | + "Avx2 not available"); |
215 | 198 | #endif |
216 | 199 |
|
217 | 200 | #if defined(ARROW_HAVE_RUNTIME_AVX512) |
218 | | -BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &bpacking::unpack_avx512<bool>, |
219 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), |
220 | | - "Avx512 not available") |
221 | | - ->ArgsProduct(kBitWidthsNumValuesBool); |
222 | | -BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false, |
223 | | - &bpacking::unpack_avx512<uint8_t>, |
224 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), |
225 | | - "Avx512 not available") |
226 | | - ->ArgsProduct(kBitWidthsNumValues8); |
227 | | -BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, |
228 | | - &bpacking::unpack_avx512<uint16_t>, |
229 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), |
230 | | - "Avx512 not available") |
231 | | - ->ArgsProduct(kBitWidthsNumValues16); |
232 | | -BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, |
233 | | - &bpacking::unpack_avx512<uint32_t>, |
234 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), |
235 | | - "Avx512 not available") |
236 | | - ->ArgsProduct(kBitWidthsNumValues32); |
237 | | -BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, |
238 | | - &bpacking::unpack_avx512<uint64_t>, |
239 | | - !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), |
240 | | - "Avx512 not available") |
241 | | - ->ArgsProduct(kBitWidthsNumValues64); |
| 201 | +BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx512Unaligned, false, bpacking::unpack_avx512, |
| 202 | + AVX512, "Avx512 not available"); |
242 | 203 | #endif |
243 | 204 |
|
244 | 205 | #if defined(ARROW_HAVE_NEON) |
245 | | -BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &bpacking::unpack_neon<bool>) |
246 | | - ->ArgsProduct(kBitWidthsNumValuesBool); |
247 | | -BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &bpacking::unpack_neon<uint8_t>) |
248 | | - ->ArgsProduct(kBitWidthsNumValues8); |
249 | | -BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &bpacking::unpack_neon<uint16_t>) |
250 | | - ->ArgsProduct(kBitWidthsNumValues16); |
251 | | -BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &bpacking::unpack_neon<uint32_t>) |
252 | | - ->ArgsProduct(kBitWidthsNumValues32); |
253 | | -BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &bpacking::unpack_neon<uint64_t>) |
254 | | - ->ArgsProduct(kBitWidthsNumValues64); |
| 206 | +BENCHMARK_UNPACK_ALL_TYPES(NeonUnaligned, false, bpacking::unpack_neon); |
| 207 | +#endif |
| 208 | + |
| 209 | +#if defined(ARROW_HAVE_RUNTIME_SVE128) |
| 210 | +BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve128Unaligned, false, bpacking::unpack_sve128, |
| 211 | + SVE128, "Sve128 not available"); |
255 | 212 | #endif |
256 | 213 |
|
257 | | -BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>) |
258 | | - ->ArgsProduct(kBitWidthsNumValuesBool); |
259 | | -BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>) |
260 | | - ->ArgsProduct(kBitWidthsNumValuesBool); |
261 | | - |
262 | | -BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>) |
263 | | - ->ArgsProduct(kBitWidthsNumValues8); |
264 | | -BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>) |
265 | | - ->ArgsProduct(kBitWidthsNumValues8); |
266 | | - |
267 | | -BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>) |
268 | | - ->ArgsProduct(kBitWidthsNumValues16); |
269 | | -BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>) |
270 | | - ->ArgsProduct(kBitWidthsNumValues16); |
271 | | - |
272 | | -BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, &unpack<uint32_t>) |
273 | | - ->ArgsProduct(kBitWidthsNumValues32); |
274 | | -BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, &unpack<uint32_t>) |
275 | | - ->ArgsProduct(kBitWidthsNumValues32); |
276 | | - |
277 | | -BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, &unpack<uint64_t>) |
278 | | - ->ArgsProduct(kBitWidthsNumValues64); |
279 | | -BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, &unpack<uint64_t>) |
280 | | - ->ArgsProduct(kBitWidthsNumValues64); |
| 214 | +#if defined(ARROW_HAVE_RUNTIME_SVE256) |
| 215 | +BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve256Unaligned, false, bpacking::unpack_sve256, |
| 216 | + SVE256, "Sve256 not available"); |
| 217 | +#endif |
| 218 | + |
| 219 | +BENCHMARK_UNPACK_ALL_TYPES(DynamicAligned, true, unpack); |
| 220 | +BENCHMARK_UNPACK_ALL_TYPES(DynamicUnaligned, false, unpack); |
| 221 | + |
| 222 | +#undef BENCHMARK_UNPACK_ALL_TYPES_RUNTIME |
| 223 | +#undef BENCHMARK_UNPACK_ALL_TYPES |
| 224 | +#undef BENCHMARK_UNPACK_ALL_TYPES_SKIP |
281 | 225 |
|
282 | 226 | } // namespace |
283 | 227 | } // namespace arrow::internal |
0 commit comments