Skip to content

Commit d1a8cf4

Browse files
AntoinePrv and pitrou authored
GH-47769: [C++] SVE dynamic dispatch (#49756)
### Rationale for this change

Just like we dynamically dispatch to AVX2 on x86 CPUs, we want to dynamically dispatch to more advanced SIMD extensions on ARM64 chips.

### What changes are included in this PR?

- A new macro to enable selecting the runtime SVE version
- Detection of the ARM64 CPU features available at runtime
- Adding SVE to the dynamic dispatch for bit unpacking algorithms.

### Are these changes tested?

### Are there any user-facing changes?

No.

* GitHub Issue: #47769

Lead-authored-by: Antoine Prouvost <AntoinePrv@users.noreply.github.com>
Co-authored-by: AntoinePrv <AntoinePrv@users.noreply.github.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 6198adc commit d1a8cf4

18 files changed

Lines changed: 535 additions & 377 deletions

cpp/cmake_modules/DefineOptions.cmake

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -191,6 +191,9 @@ takes precedence over ccache if a storage backend is configured" ON)
191191
"SSE4_2"
192192
"AVX2"
193193
"AVX512"
194+
"SVE128" # fixed size SVE
195+
"SVE256" # "
196+
"SVE512" # "
194197
"MAX")
195198

196199
define_option(ARROW_ALTIVEC "Build with Altivec if compiler has support" ON)

cpp/cmake_modules/SetupCxxFlags.cmake

Lines changed: 28 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,31 @@ elseif(ARROW_CPU_FLAG STREQUAL "ppc")
134134
elseif(ARROW_CPU_FLAG STREQUAL "aarch64")
135135
# Arm64 compiler flags, gcc/clang only
136136
set(ARROW_ARMV8_MARCH "armv8-a")
137-
check_cxx_compiler_flag("-march=${ARROW_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE)
137+
set(ARROW_SVE_FLAGS "-march=${ARROW_ARMV8_MARCH}+sve")
138+
set(ARROW_SVE128_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=128")
139+
set(ARROW_SVE256_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=256")
140+
set(ARROW_SVE512_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=512")
141+
# We only have a way to do SVE dynamic dispatch on Linux (BSD may be possible
142+
# but is currently not implemented).
143+
# We still support explicitly setting runtime SIMD level to some SVE values
144+
# on these platforms as this can be useful in development for building SVE
145+
# code locally. The compiler supports it but the code won't run.
146+
if((APPLE OR WIN32) AND ARROW_RUNTIME_SIMD_LEVEL STREQUAL "MAX")
147+
set(ARROW_RUNTIME_SIMD_LEVEL "NONE")
148+
endif()
149+
check_cxx_compiler_flag("${ARROW_SVE_FLAGS}" CXX_SUPPORTS_SVE)
150+
if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE128|SVE256|SVE512|MAX)$")
151+
set(ARROW_HAVE_RUNTIME_SVE128 ON)
152+
add_definitions(-DARROW_HAVE_RUNTIME_SVE128)
153+
endif()
154+
if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE256|SVE512|MAX)$")
155+
set(ARROW_HAVE_RUNTIME_SVE256 ON)
156+
add_definitions(-DARROW_HAVE_RUNTIME_SVE256)
157+
endif()
158+
if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE512|MAX)$")
159+
set(ARROW_HAVE_RUNTIME_SVE512 ON)
160+
add_definitions(-DARROW_HAVE_RUNTIME_SVE512)
161+
endif()
138162
if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT")
139163
set(ARROW_SIMD_LEVEL "NEON")
140164
endif()
@@ -528,8 +552,7 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64")
528552
if(NOT CXX_SUPPORTS_SVE)
529553
message(FATAL_ERROR "SVE required but compiler doesn't support it.")
530554
endif()
531-
# -march=armv8-a+sve
532-
set(ARROW_ARMV8_MARCH "${ARROW_ARMV8_MARCH}+sve")
555+
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_SVE_FLAGS}")
533556
string(REGEX MATCH "[0-9]+" SVE_VECTOR_BITS ${ARROW_SIMD_LEVEL})
534557
if(SVE_VECTOR_BITS)
535558
set(ARROW_HAVE_SVE${SVE_VECTOR_BITS} ON)
@@ -540,8 +563,9 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64")
540563
set(ARROW_HAVE_SVE_SIZELESS ON)
541564
add_definitions(-DARROW_HAVE_SVE_SIZELESS)
542565
endif()
566+
else() # ARM v8 without SVE
567+
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}")
543568
endif()
544-
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}")
545569
elseif(NOT ARROW_SIMD_LEVEL STREQUAL "NONE")
546570
message(WARNING "ARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} not supported by Arm.")
547571
endif()

cpp/src/arrow/CMakeLists.txt

Lines changed: 26 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -343,6 +343,27 @@ macro(append_runtime_avx512_src SRCS SRC)
343343
endif()
344344
endmacro()
345345

346+
macro(append_runtime_sve128_src SRCS SRC)
347+
if(ARROW_HAVE_RUNTIME_SVE128)
348+
list(APPEND ${SRCS} ${SRC})
349+
set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS "${ARROW_SVE128_FLAGS}")
350+
endif()
351+
endmacro()
352+
353+
macro(append_runtime_sve256_src SRCS SRC)
354+
if(ARROW_HAVE_RUNTIME_SVE256)
355+
list(APPEND ${SRCS} ${SRC})
356+
set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS "${ARROW_SVE256_FLAGS}")
357+
endif()
358+
endmacro()
359+
360+
macro(append_runtime_sve512_src SRCS SRC)
361+
if(ARROW_HAVE_RUNTIME_SVE512)
362+
list(APPEND ${SRCS} ${SRC})
363+
set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS "${ARROW_SVE512_FLAGS}")
364+
endif()
365+
endmacro()
366+
346367
# Write out compile-time configuration constants
347368
string(REPLACE "${CMAKE_SOURCE_DIR}" "<CMAKE_SOURCE_DIR>" REDACTED_CXX_FLAGS
348369
${CMAKE_CXX_FLAGS})
@@ -498,7 +519,7 @@ set(ARROW_UTIL_SRCS
498519
util/bitmap_ops.cc
499520
util/bpacking.cc
500521
util/bpacking_scalar.cc
501-
util/bpacking_simd_default.cc
522+
util/bpacking_simd_128.cc
502523
util/byte_size.cc
503524
util/byte_stream_split_internal.cc
504525
util/cancel.cc
@@ -543,9 +564,12 @@ set(ARROW_UTIL_SRCS
543564

544565
append_runtime_avx2_src(ARROW_UTIL_SRCS util/byte_stream_split_internal_avx2.cc)
545566

546-
append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_avx2.cc)
567+
append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
547568
append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_simd_avx512.cc)
548569

570+
append_runtime_sve128_src(ARROW_UTIL_SRCS util/bpacking_simd_128_alt.cc)
571+
append_runtime_sve256_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
572+
549573
if(ARROW_WITH_BROTLI)
550574
list(APPEND ARROW_UTIL_SRCS util/compression_brotli.cc)
551575
endif()

cpp/src/arrow/meson.build

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,7 @@ arrow_util_srcs = [
177177
'util/bitmap_ops.cc',
178178
'util/bpacking.cc',
179179
'util/bpacking_scalar.cc',
180-
'util/bpacking_simd_default.cc',
180+
'util/bpacking_simd_128.cc',
181181
'util/byte_size.cc',
182182
'util/byte_stream_split_internal.cc',
183183
'util/cancel.cc',

cpp/src/arrow/util/bpacking.cc

Lines changed: 27 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -33,16 +33,29 @@ struct UnpackDynamicFunction {
3333

3434
static constexpr auto implementations() {
3535
return std::array{
36+
// x86 implementations
3637
#if defined(ARROW_HAVE_SSE4_2)
3738
Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2<Uint>},
38-
#else
39-
Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
40-
#endif
41-
#if defined(ARROW_HAVE_RUNTIME_AVX2)
39+
# if defined(ARROW_HAVE_RUNTIME_AVX2)
4240
Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2<Uint>},
43-
#endif
44-
#if defined(ARROW_HAVE_RUNTIME_AVX512)
41+
# endif
42+
# if defined(ARROW_HAVE_RUNTIME_AVX512)
4543
Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512<Uint>},
44+
# endif
45+
46+
// ARM implementations
47+
#elif defined(ARROW_HAVE_NEON)
48+
Implementation{DispatchLevel::NONE, &bpacking::unpack_neon<Uint>},
49+
# if defined(ARROW_HAVE_RUNTIME_SVE128)
50+
Implementation{DispatchLevel::SVE128, &bpacking::unpack_sve128<Uint>},
51+
# endif
52+
# if defined(ARROW_HAVE_RUNTIME_SVE256)
53+
Implementation{DispatchLevel::SVE256, &bpacking::unpack_sve256<Uint>},
54+
# endif
55+
56+
// Other implementations
57+
#else
58+
Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
4659
#endif
4760
};
4861
}
@@ -52,12 +65,14 @@ struct UnpackDynamicFunction {
5265

5366
template <typename Uint>
5467
void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
55-
#if defined(ARROW_HAVE_NEON)
56-
return bpacking::unpack_neon(in, out, opts);
57-
#else
58-
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
59-
return dispatch.func(in, out, opts);
60-
#endif
68+
auto constexpr kImplementations = UnpackDynamicFunction<Uint>::implementations();
69+
if constexpr (kImplementations.size() == 1) {
70+
constexpr auto func = kImplementations.front().second;
71+
func(in, out, opts);
72+
} else {
73+
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
74+
return dispatch.func(in, out, opts);
75+
}
6176
}
6277

6378
template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);

cpp/src/arrow/util/bpacking_benchmark.cc

Lines changed: 55 additions & 111 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@
2626
#include "arrow/util/bpacking_scalar_internal.h"
2727
#include "arrow/util/bpacking_simd_internal.h"
2828

29-
#if defined(ARROW_HAVE_RUNTIME_AVX2)
29+
#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE128)
3030
# include "arrow/util/cpu_info.h"
3131
#endif
3232

@@ -107,10 +107,10 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
107107
// will not emit runs larger than 512 (though other implementation might), so we biased
108108
// the benchmarks towards a rather small scale.
109109
static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
110-
constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
111-
constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
112-
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
113-
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};
110+
constexpr auto kBitWidths8 = std::initializer_list<int64_t>{1, 2, 8};
111+
constexpr auto kBitWidths16 = std::initializer_list<int64_t>{1, 2, 8, 13};
112+
constexpr auto kBitWidths32 = std::initializer_list<int64_t>{1, 2, 8, 20};
113+
constexpr auto kBitWidths64 = std::initializer_list<int64_t>{1, 2, 8, 20, 47};
114114

115115
static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
116116
{0, 1},
@@ -159,125 +159,69 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t>
159159
return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
160160
}
161161

162-
BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &bpacking::unpack_scalar<bool>)
163-
->ArgsProduct(kBitWidthsNumValuesBool);
164-
BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false,
165-
&bpacking::unpack_scalar<uint8_t>)
166-
->ArgsProduct(kBitWidthsNumValues8);
167-
BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false,
168-
&bpacking::unpack_scalar<uint16_t>)
169-
->ArgsProduct(kBitWidthsNumValues16);
170-
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false,
171-
&bpacking::unpack_scalar<uint32_t>)
172-
->ArgsProduct(kBitWidthsNumValues32);
173-
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false,
174-
&bpacking::unpack_scalar<uint64_t>)
175-
->ArgsProduct(kBitWidthsNumValues64);
162+
// Register BM_Unpack{Bool,Uint8,Uint16,Uint32,Uint64} benchmarks for a given
163+
// UNPACK_FUNC templated on each of those types, with explicit skip args.
164+
#define BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, SKIP, SKIP_MSG) \
165+
BENCHMARK_CAPTURE(BM_UnpackBool, LABEL, ALIGNED, &UNPACK_FUNC<bool>, SKIP, SKIP_MSG) \
166+
->ArgsProduct(kBitWidthsNumValuesBool); \
167+
BENCHMARK_CAPTURE(BM_UnpackUint8, LABEL, ALIGNED, &UNPACK_FUNC<uint8_t>, SKIP, \
168+
SKIP_MSG) \
169+
->ArgsProduct(kBitWidthsNumValues8); \
170+
BENCHMARK_CAPTURE(BM_UnpackUint16, LABEL, ALIGNED, &UNPACK_FUNC<uint16_t>, SKIP, \
171+
SKIP_MSG) \
172+
->ArgsProduct(kBitWidthsNumValues16); \
173+
BENCHMARK_CAPTURE(BM_UnpackUint32, LABEL, ALIGNED, &UNPACK_FUNC<uint32_t>, SKIP, \
174+
SKIP_MSG) \
175+
->ArgsProduct(kBitWidthsNumValues32); \
176+
BENCHMARK_CAPTURE(BM_UnpackUint64, LABEL, ALIGNED, &UNPACK_FUNC<uint64_t>, SKIP, \
177+
SKIP_MSG) \
178+
->ArgsProduct(kBitWidthsNumValues64)
179+
180+
#define BENCHMARK_UNPACK_ALL_TYPES(LABEL, ALIGNED, UNPACK_FUNC) \
181+
BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, false, "")
182+
183+
#define BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(LABEL, ALIGNED, UNPACK_FUNC, CPU_FEATURE, \
184+
SKIP_MSG) \
185+
BENCHMARK_UNPACK_ALL_TYPES_SKIP( \
186+
LABEL, ALIGNED, UNPACK_FUNC, \
187+
!CpuInfo::GetInstance()->IsSupported(CpuInfo::CPU_FEATURE), SKIP_MSG)
188+
189+
BENCHMARK_UNPACK_ALL_TYPES(ScalarUnaligned, false, bpacking::unpack_scalar);
176190

177191
#if defined(ARROW_HAVE_SSE4_2)
178-
BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &bpacking::unpack_sse4_2<bool>)
179-
->ArgsProduct(kBitWidthsNumValuesBool);
180-
BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false,
181-
&bpacking::unpack_sse4_2<uint8_t>)
182-
->ArgsProduct(kBitWidthsNumValues8);
183-
BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false,
184-
&bpacking::unpack_sse4_2<uint16_t>)
185-
->ArgsProduct(kBitWidthsNumValues16);
186-
BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false,
187-
&bpacking::unpack_sse4_2<uint32_t>)
188-
->ArgsProduct(kBitWidthsNumValues32);
189-
BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false,
190-
&bpacking::unpack_sse4_2<uint64_t>)
191-
->ArgsProduct(kBitWidthsNumValues64);
192+
BENCHMARK_UNPACK_ALL_TYPES(Sse42Unaligned, false, bpacking::unpack_sse4_2);
192193
#endif
193194

194195
#if defined(ARROW_HAVE_RUNTIME_AVX2)
195-
BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &bpacking::unpack_avx2<bool>,
196-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
197-
"Avx2 not available")
198-
->ArgsProduct(kBitWidthsNumValuesBool);
199-
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &bpacking::unpack_avx2<uint8_t>,
200-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
201-
"Avx2 not available")
202-
->ArgsProduct(kBitWidthsNumValues8);
203-
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &bpacking::unpack_avx2<uint16_t>,
204-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
205-
"Avx2 not available")
206-
->ArgsProduct(kBitWidthsNumValues16);
207-
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, &bpacking::unpack_avx2<uint32_t>,
208-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
209-
"Avx2 not available")
210-
->ArgsProduct(kBitWidthsNumValues32);
211-
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &bpacking::unpack_avx2<uint64_t>,
212-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
213-
"Avx2 not available")
214-
->ArgsProduct(kBitWidthsNumValues64);
196+
BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx2Unaligned, false, bpacking::unpack_avx2, AVX2,
197+
"Avx2 not available");
215198
#endif
216199

217200
#if defined(ARROW_HAVE_RUNTIME_AVX512)
218-
BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &bpacking::unpack_avx512<bool>,
219-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
220-
"Avx512 not available")
221-
->ArgsProduct(kBitWidthsNumValuesBool);
222-
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false,
223-
&bpacking::unpack_avx512<uint8_t>,
224-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
225-
"Avx512 not available")
226-
->ArgsProduct(kBitWidthsNumValues8);
227-
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false,
228-
&bpacking::unpack_avx512<uint16_t>,
229-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
230-
"Avx512 not available")
231-
->ArgsProduct(kBitWidthsNumValues16);
232-
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false,
233-
&bpacking::unpack_avx512<uint32_t>,
234-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
235-
"Avx512 not available")
236-
->ArgsProduct(kBitWidthsNumValues32);
237-
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false,
238-
&bpacking::unpack_avx512<uint64_t>,
239-
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
240-
"Avx512 not available")
241-
->ArgsProduct(kBitWidthsNumValues64);
201+
BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx512Unaligned, false, bpacking::unpack_avx512,
202+
AVX512, "Avx512 not available");
242203
#endif
243204

244205
#if defined(ARROW_HAVE_NEON)
245-
BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &bpacking::unpack_neon<bool>)
246-
->ArgsProduct(kBitWidthsNumValuesBool);
247-
BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &bpacking::unpack_neon<uint8_t>)
248-
->ArgsProduct(kBitWidthsNumValues8);
249-
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &bpacking::unpack_neon<uint16_t>)
250-
->ArgsProduct(kBitWidthsNumValues16);
251-
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &bpacking::unpack_neon<uint32_t>)
252-
->ArgsProduct(kBitWidthsNumValues32);
253-
BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &bpacking::unpack_neon<uint64_t>)
254-
->ArgsProduct(kBitWidthsNumValues64);
206+
BENCHMARK_UNPACK_ALL_TYPES(NeonUnaligned, false, bpacking::unpack_neon);
207+
#endif
208+
209+
#if defined(ARROW_HAVE_RUNTIME_SVE128)
210+
BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve128Unaligned, false, bpacking::unpack_sve128,
211+
SVE128, "Sve128 not available");
255212
#endif
256213

257-
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
258-
->ArgsProduct(kBitWidthsNumValuesBool);
259-
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
260-
->ArgsProduct(kBitWidthsNumValuesBool);
261-
262-
BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
263-
->ArgsProduct(kBitWidthsNumValues8);
264-
BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
265-
->ArgsProduct(kBitWidthsNumValues8);
266-
267-
BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
268-
->ArgsProduct(kBitWidthsNumValues16);
269-
BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
270-
->ArgsProduct(kBitWidthsNumValues16);
271-
272-
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, &unpack<uint32_t>)
273-
->ArgsProduct(kBitWidthsNumValues32);
274-
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, &unpack<uint32_t>)
275-
->ArgsProduct(kBitWidthsNumValues32);
276-
277-
BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, &unpack<uint64_t>)
278-
->ArgsProduct(kBitWidthsNumValues64);
279-
BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, &unpack<uint64_t>)
280-
->ArgsProduct(kBitWidthsNumValues64);
214+
#if defined(ARROW_HAVE_RUNTIME_SVE256)
215+
BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve256Unaligned, false, bpacking::unpack_sve256,
216+
SVE256, "Sve256 not available");
217+
#endif
218+
219+
BENCHMARK_UNPACK_ALL_TYPES(DynamicAligned, true, unpack);
220+
BENCHMARK_UNPACK_ALL_TYPES(DynamicUnaligned, false, unpack);
221+
222+
#undef BENCHMARK_UNPACK_ALL_TYPES_RUNTIME
223+
#undef BENCHMARK_UNPACK_ALL_TYPES
224+
#undef BENCHMARK_UNPACK_ALL_TYPES_SKIP
281225

282226
} // namespace
283227
} // namespace arrow::internal

0 commit comments

Comments
 (0)