Skip to content

Commit 0b981a8

Browse files
authored
Add bit transpose operations (#6928)
Logic to convert bit buffers into transpose layout. This is useful where intermediary arrays are in transpose layout; in particular, it is necessary to handle DeltaArray validity correctly.

- On Zen 5 the BMI2 instructions are 20-30% faster, and VBMI is ~10-20x faster.
- On Zen 3 the BMI2 instructions are still around 20-30% faster.
- On M4 the NEON version is ~60-100% faster, while untranspose stays the same.

Full benchmark results for posterity.

On a Zen 5 machine (m8azn):
```
Compiling fastlanes v0.5.0 (/home/ubuntu/fastlanes)
Finished `bench` profile [optimized] target(s) in 0.56s
Running benches/bit_transpose.rs (target/release/deps/bit_transpose-dd4b19170a0386ff)
Timer precision: 10 ns
bit_transpose fastest │ slowest │ median │ mean │ samples │ iters
├─ transpose_scalar 59.8 ns │ 1.059 µs │ 69.8 ns │ 75.7 ns │ 100 │ 100
├─ transpose_scalar_throughput 8.349 µs │ 15.75 µs │ 8.369 µs │ 8.443 µs │ 100 │ 100
├─ untranspose_scalar 39.8 ns │ 40.73 ns │ 40.11 ns │ 39.99 ns │ 100 │ 3200
├─ untranspose_scalar_throughput 8.889 µs │ 14.26 µs │ 8.909 µs │ 8.985 µs │ 100 │ 100
╰─ x86 │ │ │ │ │
   ├─ transpose_bmi2 29.64 ns │ 30.26 ns │ 29.95 ns │ 29.94 ns │ 100 │ 6400
   ├─ transpose_bmi2_throughput 6.309 µs │ 10.92 µs │ 6.339 µs │ 6.386 µs │ 100 │ 100
   ├─ transpose_vbmi 3.608 ns │ 3.667 ns │ 3.628 ns │ 3.631 ns │ 100 │ 51200
   ├─ transpose_vbmi_throughput 504.8 ns │ 524.8 ns │ 509.8 ns │ 508 ns │ 100 │ 200
   ├─ untranspose_bmi2 27.61 ns │ 70.42 ns │ 27.76 ns │ 28.24 ns │ 100 │ 6400
   ├─ untranspose_bmi2_throughput 6.199 µs │ 10.57 µs │ 6.219 µs │ 6.262 µs │ 100 │ 100
   ├─ untranspose_vbmi 3.589 ns │ 3.628 ns │ 3.608 ns │ 3.604 ns │ 100 │ 51200
   ╰─ untranspose_vbmi_throughput 489.8 ns │ 504.8 ns │ 494.8 ns │ 496.7 ns │ 100 │ 200
```

On a Zen 3 machine (c6a):
```
Compiling fastlanes v0.5.0 (/home/ubuntu/fastlanes)
Finished `bench` profile [optimized] target(s) in 1.02s
Running benches/bit_transpose.rs (target/release/deps/bit_transpose-dd4b19170a0386ff)
Timer precision: 20 ns
bit_transpose fastest │ slowest │ median │ mean │ samples │ iters
├─ transpose_scalar 70.65 ns │ 962.9 ns │ 70.97 ns │ 81.99 ns │ 100 │ 3200
├─ transpose_scalar_throughput 15.68 µs │ 37.39 µs │ 15.72 µs │ 16.04 µs │ 100 │ 100
├─ untranspose_scalar 72.84 ns │ 242.2 ns │ 75.04 ns │ 77.92 ns │ 100 │ 3200
├─ untranspose_scalar_throughput 16.13 µs │ 32.4 µs │ 16.26 µs │ 16.69 µs │ 100 │ 100
╰─ x86 │ │ │ │ │
   ├─ transpose_bmi2 58.75 ns │ 229.4 ns │ 59.09 ns │ 61.18 ns │ 100 │ 3200
   ├─ transpose_bmi2_throughput 12.33 µs │ 27.84 µs │ 12.41 µs │ 12.68 µs │ 100 │ 100
   ├─ transpose_vbmi warning: No benchmark function registered for 'transpose_vbmi'
   ├─ transpose_vbmi_throughput warning: No benchmark function registered for 'transpose_vbmi_throughput'
   ├─ untranspose_bmi2 57.22 ns │ 452.2 ns │ 58.78 ns │ 63.58 ns │ 100 │ 3200
   ├─ untranspose_bmi2_throughput 13.23 µs │ 25.47 µs │ 13.29 µs │ 13.61 µs │ 100 │ 100
   ├─ untranspose_vbmi warning: No benchmark function registered for 'untranspose_vbmi'
   ╰─ untranspose_vbmi_throughput warning: No benchmark function registered for 'untranspose_vbmi_throughput'
```

On an M4 Max:
```
Finished `bench` profile [optimized] target(s) in 0.02s
Running benches/bit_transpose.rs (target/release/deps/bit_transpose-9780c0103c6d2dc3)
Timer precision: 41 ns
bit_transpose fastest │ slowest │ median │ mean │ samples │ iters
├─ transpose_scalar 24.48 ns │ 25.78 ns │ 24.97 ns │ 25.03 ns │ 100 │ 25600
├─ transpose_scalar_throughput 5.249 µs │ 9.166 µs │ 5.332 µs │ 5.545 µs │ 100 │ 100
├─ untranspose_scalar 25.79 ns │ 37.5 ns │ 26.44 ns │ 28.07 ns │ 100 │ 12800
├─ untranspose_scalar_throughput 5.457 µs │ 7.999 µs │ 5.583 µs │ 5.821 µs │ 100 │ 100
╰─ aarch64 │ │ │ │ │
   ├─ transpose_neon 14.23 ns │ 20.57 ns │ 14.55 ns │ 15.22 ns │ 100 │ 25600
   ├─ transpose_neon_throughput 3.187 µs │ 4.52 µs │ 3.228 µs │ 3.262 µs │ 100 │ 200
   ├─ untranspose_neon 23.67 ns │ 35.06 ns │ 24.32 ns │ 25.19 ns │ 100 │ 25600
   ╰─ untranspose_neon_throughput 4.874 µs │ 7.082 µs │ 4.958 µs │ 5.174 µs │ 100 │ 100
```

Signed-off-by: Robert Kruszewski <github@robertk.io>

---------

Signed-off-by: Robert Kruszewski <github@robertk.io>
1 parent ad7b09f commit 0b981a8

8 files changed

Lines changed: 1744 additions & 0 deletions

File tree

encodings/fastlanes/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,8 @@ required-features = ["_test-harness"]
5555
name = "compute_between"
5656
harness = false
5757
required-features = ["_test-harness"]
58+
59+
[[bench]]
60+
name = "bit_transpose"
61+
harness = false
62+
required-features = ["_test-harness"]
Lines changed: 312 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,312 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
#![allow(clippy::unwrap_used)]
5+
6+
use divan::Bencher;
7+
use vortex_fastlanes::bit_transpose::scalar::transpose_bits_scalar;
8+
use vortex_fastlanes::bit_transpose::scalar::untranspose_bits_scalar;
9+
10+
/// Benchmark binary entry point.
///
/// Hands control to `divan::main()`; presumably that is what discovers and runs the
/// `#[divan::bench]` functions declared in this file (confirm against the divan docs).
fn main() {
11+
divan::main();
12+
}
13+
14+
/// Generate deterministic test data.
///
/// Byte `i` of the result is the least-significant byte of `(seed * 17 + i) * 31`, where
/// every step uses wrapping `usize` arithmetic. The same `seed` therefore always yields
/// the same 128-byte block, and no seed value can trigger an overflow panic.
fn generate_test_data(seed: usize) -> [u8; 128] {
    // The seed's contribution is identical for every byte, so compute it once.
    let base = seed.wrapping_mul(17);
    // `to_le_bytes()[0]` is an explicit truncation to the low byte (same value as an
    // `as u8` cast) that does not need a clippy lint suppression.
    std::array::from_fn(|i| base.wrapping_add(i).wrapping_mul(31).to_le_bytes()[0])
}
23+
24+
/// Number of 128-byte arrays each `*_throughput` benchmark builds and processes per run.
const BATCH_SIZE: usize = 1000;
25+
26+
// ============================================================================
27+
// Transpose: single array
28+
// ============================================================================
29+
30+
/// Benchmarks `transpose_bits_scalar` on a single 128-byte block generated from seed 42.
#[divan::bench]
31+
fn transpose_scalar(bencher: Bencher) {
32+
// One fixed input block, built once and borrowed by the input closure below.
let input = generate_test_data(42);
33+
34+
bencher
35+
// Pair a borrow of the shared input with a freshly zeroed 128-byte output buffer.
.with_inputs(|| (&input, [0u8; 128]))
36+
// The benchmarked closure receives the pair by reference and writes into `output`.
.bench_refs(|(input, output)| {
37+
transpose_bits_scalar(input, output);
38+
});
39+
}
40+
41+
// ============================================================================
42+
// Transpose: throughput (1000 arrays)
43+
// ============================================================================
44+
45+
/// Benchmarks `transpose_bits_scalar` over a batch of `BATCH_SIZE` distinct 128-byte blocks.
#[divan::bench]
46+
fn transpose_scalar_throughput(bencher: Bencher) {
47+
// Seeds 0..BATCH_SIZE give one distinct deterministic block per batch slot.
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
48+
49+
bencher
50+
// The zeroed output batch is allocated in the input closure, not in the benchmarked closure.
.with_inputs(|| (&inputs, vec![[0u8; 128]; BATCH_SIZE]))
51+
.bench_refs(|(inputs, outputs)| {
52+
// Transpose every input block into the output slot at the same index.
for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
53+
transpose_bits_scalar(input, output);
54+
}
55+
});
56+
}
57+
58+
// ============================================================================
59+
// Untranspose: single array
60+
// ============================================================================
61+
62+
/// Benchmarks `untranspose_bits_scalar` on a single 128-byte block generated from seed 42.
///
/// The input is the raw generated block, not the output of a prior transpose.
#[divan::bench]
62+
fn untranspose_scalar(bencher: Bencher) {
63+
// One fixed input block, built once and borrowed by the input closure below.
let input = generate_test_data(42);
64+
65+
bencher
66+
// Pair a borrow of the shared input with a freshly zeroed 128-byte output buffer.
.with_inputs(|| (&input, [0u8; 128]))
67+
.bench_refs(|(input, output)| {
68+
untranspose_bits_scalar(input, output);
69+
});
70+
}
72+
73+
// ============================================================================
74+
// Untranspose: throughput (1000 arrays)
75+
// ============================================================================
76+
77+
/// Benchmarks `untranspose_bits_scalar` over a batch of `BATCH_SIZE` distinct 128-byte blocks.
#[divan::bench]
77+
fn untranspose_scalar_throughput(bencher: Bencher) {
78+
// Seeds 0..BATCH_SIZE give one distinct deterministic block per batch slot.
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
79+
80+
bencher
81+
// The zeroed output batch is allocated in the input closure, not in the benchmarked closure.
.with_inputs(|| (&inputs, vec![[0u8; 128]; BATCH_SIZE]))
82+
.bench_refs(|(inputs, outputs)| {
83+
// Untranspose every input block into the output slot at the same index.
for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
84+
untranspose_bits_scalar(input, output);
85+
}
86+
});
87+
}
89+
90+
// ============================================================================
91+
// x86_64 benchmarks
92+
// ============================================================================
93+
94+
/// Benchmarks for the x86_64 BMI2 and VBMI transpose/untranspose routines.
///
/// Compiled only on x86_64. Every benchmark first consults `has_bmi2()` / `has_vbmi()`
/// (presumably runtime CPU-feature checks; confirm in `bit_transpose::x86`) and returns
/// without running anything when the check fails.
#[cfg(target_arch = "x86_64")]
95+
mod x86 {
96+
use divan::Bencher;
97+
use vortex_fastlanes::bit_transpose::x86::has_bmi2;
98+
use vortex_fastlanes::bit_transpose::x86::has_vbmi;
99+
use vortex_fastlanes::bit_transpose::x86::transpose_bits_bmi2;
100+
use vortex_fastlanes::bit_transpose::x86::transpose_bits_vbmi;
101+
use vortex_fastlanes::bit_transpose::x86::untranspose_bits_bmi2;
102+
use vortex_fastlanes::bit_transpose::x86::untranspose_bits_vbmi;
103+
104+
use super::BATCH_SIZE;
105+
use super::generate_test_data;
106+
107+
// --- Transpose: single array ---
108+
109+
/// Benchmarks `transpose_bits_bmi2` on a single 128-byte block (seed 42).
#[divan::bench]
110+
fn transpose_bmi2(bencher: Bencher) {
111+
// Bail out before touching `bencher` when the BMI2 check fails, so the unsafe
// call below is never reached on such a machine.
if !has_bmi2() {
112+
return;
113+
}
114+
115+
let input = generate_test_data(42);
116+
117+
bencher
118+
// Pair a borrow of the shared input with a freshly zeroed 128-byte output buffer.
.with_inputs(|| (&input, [0u8; 128]))
119+
.bench_refs(|(input, output)| {
120+
// SAFETY: only reached after `has_bmi2()` returned true above.
// NOTE(review): confirm that is the routine's complete safety contract.
unsafe { transpose_bits_bmi2(input, output) };
121+
});
122+
}
123+
124+
/// Benchmarks `transpose_bits_vbmi` on a single 128-byte block (seed 42).
#[divan::bench]
125+
fn transpose_vbmi(bencher: Bencher) {
126+
// Bail out before touching `bencher` when the VBMI check fails; divan then
// reports "No benchmark function registered" for this entry.
if !has_vbmi() {
127+
return;
128+
}
129+
130+
let input = generate_test_data(42);
131+
132+
bencher
133+
.with_inputs(|| (&input, [0u8; 128]))
134+
.bench_refs(|(input, output)| {
135+
// SAFETY: only reached after `has_vbmi()` returned true above.
// NOTE(review): confirm that is the routine's complete safety contract.
unsafe { transpose_bits_vbmi(input, output) };
136+
});
137+
}
138+
139+
// --- Untranspose: single array ---
140+
141+
/// Benchmarks `untranspose_bits_bmi2` on a single 128-byte block (seed 42).
#[divan::bench]
142+
fn untranspose_bmi2(bencher: Bencher) {
143+
// Skip entirely when the BMI2 check fails.
if !has_bmi2() {
144+
return;
145+
}
146+
147+
let input = generate_test_data(42);
148+
149+
bencher
150+
.with_inputs(|| (&input, [0u8; 128]))
151+
.bench_refs(|(input, output)| {
152+
// SAFETY: only reached after `has_bmi2()` returned true above.
// NOTE(review): confirm that is the routine's complete safety contract.
unsafe { untranspose_bits_bmi2(input, output) };
153+
});
154+
}
155+
156+
/// Benchmarks `untranspose_bits_vbmi` on a single 128-byte block (seed 42).
#[divan::bench]
157+
fn untranspose_vbmi(bencher: Bencher) {
158+
// Skip entirely when the VBMI check fails.
if !has_vbmi() {
159+
return;
160+
}
161+
162+
let input = generate_test_data(42);
163+
164+
bencher
165+
.with_inputs(|| (&input, [0u8; 128]))
166+
.bench_refs(|(input, output)| {
167+
// SAFETY: only reached after `has_vbmi()` returned true above.
// NOTE(review): confirm that is the routine's complete safety contract.
unsafe { untranspose_bits_vbmi(input, output) };
168+
});
169+
}
170+
171+
// --- Transpose: throughput (1000 arrays) ---
172+
173+
/// Benchmarks `transpose_bits_bmi2` over a batch of `BATCH_SIZE` distinct blocks.
#[divan::bench]
174+
fn transpose_bmi2_throughput(bencher: Bencher) {
175+
// Skip entirely when the BMI2 check fails.
if !has_bmi2() {
176+
return;
177+
}
178+
179+
// Seeds 0..BATCH_SIZE give one distinct deterministic block per batch slot.
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
180+
181+
bencher
182+
// The zeroed output batch is allocated in the input closure, not in the benchmarked closure.
.with_inputs(|| (&inputs, vec![[0u8; 128]; BATCH_SIZE]))
183+
.bench_refs(|(inputs, outputs)| {
184+
for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
185+
// SAFETY: only reached after `has_bmi2()` returned true above.
// NOTE(review): confirm that is the routine's complete safety contract.
unsafe { transpose_bits_bmi2(input, output) };
186+
}
187+
});
188+
}
189+
190+
/// Benchmarks `transpose_bits_vbmi` over a batch of `BATCH_SIZE` distinct blocks.
#[divan::bench]
191+
fn transpose_vbmi_throughput(bencher: Bencher) {
192+
// Skip entirely when the VBMI check fails.
if !has_vbmi() {
193+
return;
194+
}
195+
196+
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
197+
198+
bencher
199+
.with_inputs(|| (&inputs, vec![[0u8; 128]; BATCH_SIZE]))
200+
.bench_refs(|(inputs, outputs)| {
201+
for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
202+
// SAFETY: only reached after `has_vbmi()` returned true above.
// NOTE(review): confirm that is the routine's complete safety contract.
unsafe { transpose_bits_vbmi(input, output) };
203+
}
204+
});
205+
}
206+
207+
// --- Untranspose: throughput (1000 arrays) ---
208+
209+
/// Benchmarks `untranspose_bits_bmi2` over a batch of `BATCH_SIZE` distinct blocks.
#[divan::bench]
210+
fn untranspose_bmi2_throughput(bencher: Bencher) {
211+
// Skip entirely when the BMI2 check fails.
if !has_bmi2() {
212+
return;
213+
}
214+
215+
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
216+
217+
bencher
218+
.with_inputs(|| (&inputs, vec![[0u8; 128]; BATCH_SIZE]))
219+
.bench_refs(|(inputs, outputs)| {
220+
for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
221+
// SAFETY: only reached after `has_bmi2()` returned true above.
// NOTE(review): confirm that is the routine's complete safety contract.
unsafe { untranspose_bits_bmi2(input, output) };
222+
}
223+
});
224+
}
225+
226+
/// Benchmarks `untranspose_bits_vbmi` over a batch of `BATCH_SIZE` distinct blocks.
#[divan::bench]
227+
fn untranspose_vbmi_throughput(bencher: Bencher) {
228+
// Skip entirely when the VBMI check fails.
if !has_vbmi() {
229+
return;
230+
}
231+
232+
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
233+
234+
bencher
235+
.with_inputs(|| (&inputs, vec![[0u8; 128]; BATCH_SIZE]))
236+
.bench_refs(|(inputs, outputs)| {
237+
for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
238+
// SAFETY: only reached after `has_vbmi()` returned true above.
// NOTE(review): confirm that is the routine's complete safety contract.
unsafe { untranspose_bits_vbmi(input, output) };
239+
}
240+
});
241+
}
242+
}
243+
244+
// ============================================================================
245+
// aarch64 benchmarks
246+
// ============================================================================
247+
248+
/// Benchmarks for the aarch64 NEON transpose/untranspose routines.
///
/// Compiled only on aarch64. Unlike the x86 benchmarks, nothing here performs a runtime
/// CPU-feature check before the `unsafe` calls.
#[cfg(target_arch = "aarch64")]
249+
mod aarch64 {
250+
use vortex_fastlanes::bit_transpose::aarch64::transpose_bits_neon;
251+
use vortex_fastlanes::bit_transpose::aarch64::untranspose_bits_neon;
252+
253+
use super::BATCH_SIZE;
254+
use super::Bencher;
255+
use super::generate_test_data;
256+
257+
// --- Transpose: single array ---
258+
259+
/// Benchmarks `transpose_bits_neon` on a single 128-byte block (seed 42).
#[divan::bench]
260+
fn transpose_neon(bencher: Bencher) {
261+
let input = generate_test_data(42);
262+
263+
bencher
264+
// Pair a borrow of the shared input with a freshly zeroed 128-byte output buffer.
.with_inputs(|| (&input, [0u8; 128]))
265+
.bench_refs(|(input, output)| {
266+
// SAFETY: NOTE(review): no feature check guards this call; presumably NEON is
// assumed on every aarch64 target. Confirm against the routine's safety contract.
unsafe { transpose_bits_neon(input, output) };
267+
});
268+
}
269+
270+
// --- Untranspose: single array ---
271+
272+
/// Benchmarks `untranspose_bits_neon` on a single 128-byte block (seed 42).
#[divan::bench]
273+
fn untranspose_neon(bencher: Bencher) {
274+
let input = generate_test_data(42);
275+
276+
bencher
277+
.with_inputs(|| (&input, [0u8; 128]))
278+
.bench_refs(|(input, output)| {
279+
// SAFETY: NOTE(review): no feature check guards this call; presumably NEON is
// assumed on every aarch64 target. Confirm against the routine's safety contract.
unsafe { untranspose_bits_neon(input, output) };
280+
});
281+
}
282+
283+
// --- Transpose: throughput (1000 arrays) ---
284+
285+
/// Benchmarks `transpose_bits_neon` over a batch of `BATCH_SIZE` distinct blocks.
#[divan::bench]
286+
fn transpose_neon_throughput(bencher: Bencher) {
287+
// Seeds 0..BATCH_SIZE give one distinct deterministic block per batch slot.
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
288+
289+
bencher
290+
// The zeroed output batch is allocated in the input closure, not in the benchmarked closure.
.with_inputs(|| (&inputs, vec![[0u8; 128]; BATCH_SIZE]))
291+
.bench_refs(|(inputs, outputs)| {
292+
for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
293+
// SAFETY: NOTE(review): no feature check guards this call; presumably NEON is
// assumed on every aarch64 target. Confirm against the routine's safety contract.
unsafe { transpose_bits_neon(input, output) };
294+
}
295+
});
296+
}
297+
298+
// --- Untranspose: throughput (1000 arrays) ---
299+
300+
/// Benchmarks `untranspose_bits_neon` over a batch of `BATCH_SIZE` distinct blocks.
#[divan::bench]
301+
fn untranspose_neon_throughput(bencher: Bencher) {
302+
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
303+
304+
bencher
305+
.with_inputs(|| (&inputs, vec![[0u8; 128]; BATCH_SIZE]))
306+
.bench_refs(|(inputs, outputs)| {
307+
for (input, output) in inputs.iter().zip(outputs.iter_mut()) {
308+
// SAFETY: NOTE(review): no feature check guards this call; presumably NEON is
// assumed on every aarch64 target. Confirm against the routine's safety contract.
unsafe { untranspose_bits_neon(input, output) };
309+
}
310+
});
311+
}
312+
}

encodings/fastlanes/public-api.lock

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
pub mod vortex_fastlanes
22

3+
pub mod vortex_fastlanes::bit_transpose
4+
5+
pub fn vortex_fastlanes::bit_transpose::transpose_bits(input: &[u8; 128], output: &mut [u8; 128])
6+
7+
pub fn vortex_fastlanes::bit_transpose::untranspose_bits(input: &[u8; 128], output: &mut [u8; 128])
8+
39
pub mod vortex_fastlanes::bitpack_compress
410

511
pub fn vortex_fastlanes::bitpack_compress::bit_width_histogram(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> vortex_error::VortexResult<alloc::vec::Vec<usize>>

0 commit comments

Comments
 (0)