Skip to content

Commit 9b5447a

Browse files
perf(buffer): read byte-aligned bit operands directly as words (#8436)
## Summary Split out of #8425 so each perf change can be looked at on its own, as suggested by @joseph-isaacs. This is the "bit collect" half, no count fusion. `bitwise_binary_op` only took the direct word path when both operands were 64-bit aligned, otherwise falling back to `iter_padded`. It now reads the backing bytes as `u64` via `as_chunks::<8>` for any byte-aligned offset, leaving `iter_padded` for sub-byte offsets only. This also drops the `from_trusted_len_iter` unsafe for a safe `BufferMut` push. --------- Signed-off-by: Han Damin <miniex@daminstudio.net> Co-authored-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent fb5cbbe commit 9b5447a

1 file changed

Lines changed: 78 additions & 28 deletions

File tree

vortex-buffer/src/bit/ops.rs

Lines changed: 78 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@ use std::mem::MaybeUninit;
55

66
use crate::BitBuffer;
77
use crate::BitBufferMut;
8-
use crate::Buffer;
8+
use crate::BufferMut;
99
use crate::ByteBufferMut;
10-
use crate::trusted_len::TrustedLenExt;
1110

1211
/// Read up to 8 bytes as a little-endian `u64`, zero-padding the high bytes when fewer than 8 are
1312
/// supplied. Using [`u64::from_le_bytes`] keeps the bit-numbering identical on little- and
@@ -188,36 +187,52 @@ pub(super) fn bitwise_binary_op<F: FnMut(u64, u64) -> u64>(
188187
mut op: F,
189188
) -> BitBuffer {
190189
assert_eq!(left.len(), right.len());
191-
192-
// If the buffers are aligned, we can use the fast path.
193-
if left.offset().is_multiple_of(8) && right.offset().is_multiple_of(8) {
194-
let left_chunks = left.unaligned_chunks();
195-
let right_chunks = right.unaligned_chunks();
196-
if left_chunks.lead_padding() == 0
197-
&& left_chunks.trailing_padding() == 0
198-
&& right_chunks.lead_padding() == 0
199-
&& right_chunks.trailing_padding() == 0
200-
{
201-
let iter = left_chunks
202-
.iter()
203-
.zip(right_chunks.iter())
204-
.map(|(l, r)| op(l, r));
205-
let iter = unsafe { iter.trusted_len() };
206-
let result = Buffer::<u64>::from_trusted_len_iter(iter).into_byte_buffer();
207-
return BitBuffer::new(result, left.len());
208-
}
190+
let len = left.len();
191+
if len == 0 {
192+
return BitBuffer::empty();
209193
}
210194

211-
let iter = left
212-
.chunks()
213-
.iter_padded()
214-
.zip(right.chunks().iter_padded())
215-
.map(|(l, r)| op(l, r));
216-
let iter = unsafe { iter.trusted_len() };
195+
let n_bytes = len.div_ceil(8);
196+
let out = if left.offset().is_multiple_of(8) && right.offset().is_multiple_of(8) {
197+
// Byte-aligned operands: logical bits map onto physical `u64` words, so read the backing
198+
// bytes straight as words and build the result from a `TrustedLen` iterator.
199+
let l_start = left.offset() / 8;
200+
let r_start = right.offset() / 8;
201+
let lhs = &left.inner().as_slice()[l_start..l_start + n_bytes];
202+
let rhs = &right.inner().as_slice()[r_start..r_start + n_bytes];
217203

218-
let result = Buffer::<u64>::from_trusted_len_iter(iter).into_byte_buffer();
204+
let (lhs_words, lhs_tail) = lhs.as_chunks::<8>();
205+
let (rhs_words, rhs_tail) = rhs.as_chunks::<8>();
206+
207+
let mut out = BufferMut::<u64>::from_trusted_len_iter(
208+
lhs_words
209+
.iter()
210+
.zip(rhs_words)
211+
.map(|(l, r)| op(u64::from_le_bytes(*l), u64::from_le_bytes(*r))),
212+
);
213+
if !lhs_tail.is_empty() {
214+
out.push(op(read_u64_le(lhs_tail), read_u64_le(rhs_tail)));
215+
}
216+
out
217+
} else {
218+
// Sub-byte offset: `iter_padded` realigns the bits and appends one pad word, so take
219+
// exactly `ceil(len / 64)` words.
220+
let n_words = len.div_ceil(64);
221+
let mut out = BufferMut::<u64>::with_capacity(n_words);
222+
for (l, r) in left
223+
.chunks()
224+
.iter_padded()
225+
.zip(right.chunks().iter_padded())
226+
.take(n_words)
227+
{
228+
out.push(op(l, r));
229+
}
230+
out
231+
};
219232

220-
BitBuffer::new(result, left.len())
233+
let mut bytes = out.into_byte_buffer();
234+
bytes.truncate(n_bytes);
235+
BitBuffer::new(bytes.freeze(), len)
221236
}
222237

223238
#[cfg(test)]
@@ -317,6 +332,41 @@ mod tests {
317332
assert_eq!(result, bitbuffer![false, true, true, false]);
318333
}
319334

335+
/// `bitwise_binary_op` must match a naive per-bit reference for every op, offset and length,
336+
/// independent of the chunked kernels.
337+
#[rstest]
338+
#[case::aligned(0, 0)]
339+
#[case::byte_aligned(8, 16)]
340+
#[case::byte_aligned_mismatch(16, 0)]
341+
#[case::sub_byte(3, 3)]
342+
#[case::sub_byte_mismatch(0, 5)]
343+
fn binary_op_matches_naive(#[case] left_offset: usize, #[case] right_offset: usize) {
344+
#[allow(clippy::cast_possible_truncation)]
345+
let make = |offset: usize, len: usize, salt: u8| -> BitBuffer {
346+
let bytes: ByteBufferMut = (0..(offset + len).div_ceil(8).max(1))
347+
.map(|i| (i as u8).wrapping_mul(31).wrapping_add(salt))
348+
.collect();
349+
BitBufferMut::from_buffer(bytes, offset, len).freeze()
350+
};
351+
let ops: [fn(u64, u64) -> u64; 4] =
352+
[|a, b| a & b, |a, b| a | b, |a, b| a ^ b, |a, b| a & !b];
353+
354+
for len in [1usize, 5, 8, 63, 64, 65, 127, 128, 200, 256] {
355+
let left = make(left_offset, len, 0xC3);
356+
let right = make(right_offset, len, 0x5A);
357+
for op in ops {
358+
let got = bitwise_binary_op(&left, &right, op);
359+
let expected: BitBuffer = (0..len)
360+
.map(|i| op(u64::from(left.value(i)), u64::from(right.value(i))) & 1 == 1)
361+
.collect();
362+
assert_eq!(
363+
got, expected,
364+
"loff={left_offset} roff={right_offset} len={len}"
365+
);
366+
}
367+
}
368+
}
369+
320370
/// Regression test for a bug where [`bitwise_unary_op`] produced corrupt results when
321371
/// the [`BitBuffer`]'s underlying byte pointer was not u64-aligned. Slicing a buffer by
322372
/// a non-multiple-of-8 number of bytes can cause this misalignment. The bug only

0 commit comments

Comments
 (0)