Skip to content

Commit 967dce5

Browse files
authored
Update DeltaArray to always pad values to 1024 elements and use bit transpose functions (#7007)
This pr aims to fix the long standing issue with pushing down computation over DecimalArrays where their validity wasn't matching the value layout. Now the validity is also transposed. At the same time we avoid scalar encoding and pad values to 1024 elements fix #4973 --------- Signed-off-by: Robert Kruszewski <github@robertk.io>
1 parent 4a0ed9b commit 967dce5

17 files changed

Lines changed: 556 additions & 271 deletions

File tree

encodings/fastlanes/public-api.lock

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,18 @@ pub mod vortex_fastlanes
22

33
pub mod vortex_fastlanes::bit_transpose
44

5+
pub fn vortex_fastlanes::bit_transpose::transpose_bitbuffer(bits: vortex_buffer::bit::buf::BitBuffer) -> vortex_buffer::bit::buf::BitBuffer
6+
57
pub fn vortex_fastlanes::bit_transpose::transpose_bits(input: &[u8; 128], output: &mut [u8; 128])
68

9+
pub fn vortex_fastlanes::bit_transpose::transpose_validity(validity: &vortex_array::validity::Validity, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::validity::Validity>
10+
11+
pub fn vortex_fastlanes::bit_transpose::untranspose_bitbuffer(bits: vortex_buffer::bit::buf::BitBuffer) -> vortex_buffer::bit::buf::BitBuffer
12+
713
pub fn vortex_fastlanes::bit_transpose::untranspose_bits(input: &[u8; 128], output: &mut [u8; 128])
814

15+
pub fn vortex_fastlanes::bit_transpose::untranspose_validity(validity: &vortex_array::validity::Validity, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::validity::Validity>
16+
917
pub mod vortex_fastlanes::bitpack_compress
1018

1119
pub fn vortex_fastlanes::bitpack_compress::bit_width_histogram(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> vortex_error::VortexResult<alloc::vec::Vec<usize>>
@@ -296,7 +304,7 @@ pub type vortex_fastlanes::Delta::Metadata = vortex_array::metadata::ProstMetada
296304

297305
pub type vortex_fastlanes::Delta::OperationsVTable = vortex_fastlanes::Delta
298306

299-
pub type vortex_fastlanes::Delta::ValidityVTable = vortex_array::vtable::validity::ValidityVTableFromChildSliceHelper
307+
pub type vortex_fastlanes::Delta::ValidityVTable = vortex_fastlanes::Delta
300308

301309
pub fn vortex_fastlanes::Delta::array_eq(array: &vortex_fastlanes::DeltaArray, other: &vortex_fastlanes::DeltaArray, precision: vortex_array::hash::Precision) -> bool
302310

@@ -340,6 +348,10 @@ impl vortex_array::vtable::operations::OperationsVTable<vortex_fastlanes::Delta>
340348

341349
pub fn vortex_fastlanes::Delta::scalar_at(array: &vortex_fastlanes::DeltaArray, index: usize) -> vortex_error::VortexResult<vortex_array::scalar::Scalar>
342350

351+
impl vortex_array::vtable::validity::ValidityVTable<vortex_fastlanes::Delta> for vortex_fastlanes::Delta
352+
353+
pub fn vortex_fastlanes::Delta::validity(array: &vortex_fastlanes::DeltaArray) -> vortex_error::VortexResult<vortex_array::validity::Validity>
354+
343355
pub struct vortex_fastlanes::DeltaArray
344356

345357
impl vortex_fastlanes::DeltaArray
@@ -358,9 +370,7 @@ pub fn vortex_fastlanes::DeltaArray::offset(&self) -> usize
358370

359371
pub fn vortex_fastlanes::DeltaArray::try_from_delta_compress_parts(bases: vortex_array::array::ArrayRef, deltas: vortex_array::array::ArrayRef) -> vortex_error::VortexResult<Self>
360372

361-
pub fn vortex_fastlanes::DeltaArray::try_from_primitive_array(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> vortex_error::VortexResult<Self>
362-
363-
pub fn vortex_fastlanes::DeltaArray::try_from_vec<T: vortex_array::dtype::ptype::NativePType>(vec: alloc::vec::Vec<T>) -> vortex_error::VortexResult<Self>
373+
pub fn vortex_fastlanes::DeltaArray::try_from_primitive_array(array: &vortex_array::arrays::primitive::array::PrimitiveArray, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<Self>
364374

365375
pub fn vortex_fastlanes::DeltaArray::try_new(bases: vortex_array::array::ArrayRef, deltas: vortex_array::array::ArrayRef, offset: usize, logical_len: usize) -> vortex_error::VortexResult<Self>
366376

@@ -394,10 +404,6 @@ impl vortex_array::array::IntoArray for vortex_fastlanes::DeltaArray
394404

395405
pub fn vortex_fastlanes::DeltaArray::into_array(self) -> vortex_array::array::ArrayRef
396406

397-
impl vortex_array::vtable::validity::ValidityChildSliceHelper for vortex_fastlanes::DeltaArray
398-
399-
pub fn vortex_fastlanes::DeltaArray::unsliced_child_and_slice(&self) -> (&vortex_array::array::ArrayRef, usize, usize)
400-
401407
pub struct vortex_fastlanes::FoR
402408

403409
impl vortex_fastlanes::FoR
@@ -668,6 +674,6 @@ impl vortex_array::vtable::validity::ValidityChildSliceHelper for vortex_fastlan
668674

669675
pub fn vortex_fastlanes::RLEArray::unsliced_child_and_slice(&self) -> (&vortex_array::array::ArrayRef, usize, usize)
670676

671-
pub fn vortex_fastlanes::delta_compress(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> vortex_error::VortexResult<(vortex_array::arrays::primitive::array::PrimitiveArray, vortex_array::arrays::primitive::array::PrimitiveArray)>
677+
pub fn vortex_fastlanes::delta_compress(array: &vortex_array::arrays::primitive::array::PrimitiveArray, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<(vortex_array::arrays::primitive::array::PrimitiveArray, vortex_array::arrays::primitive::array::PrimitiveArray)>
672678

673679
pub fn vortex_fastlanes::initialize(session: &mut vortex_session::VortexSession)

encodings/fastlanes/src/bit_transpose/mod.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ mod scalar;
2525
#[cfg(not(feature = "_test-harness"))]
2626
mod x86;
2727

28+
mod validity;
29+
30+
pub use validity::*;
31+
2832
/// Base indices for the first 64 output bytes (lanes 0-7).
2933
/// Each entry indicates the starting input byte index for that output byte group.
3034
/// Pattern: [0*2, 4*2, 2*2, 6*2, 1*2, 5*2, 3*2, 7*2] = [0, 8, 4, 12, 2, 10, 6, 14]
@@ -39,6 +43,8 @@ const TRANSPOSE_2X2: u64 = 0x00AA_00AA_00AA_00AA;
3943
const TRANSPOSE_4X4: u64 = 0x0000_CCCC_0000_CCCC;
4044
const TRANSPOSE_8X8: u64 = 0x0000_0000_F0F0_F0F0;
4145

46+
/// Transpose 1024-bits into FastLanes layout.
47+
///
4248
/// Dispatch to the best available implementation at runtime.
4349
#[inline]
4450
pub fn transpose_bits(input: &[u8; 128], output: &mut [u8; 128]) {
@@ -64,6 +70,8 @@ pub fn transpose_bits(input: &[u8; 128], output: &mut [u8; 128]) {
6470
scalar::transpose_bits_scalar(input, output);
6571
}
6672

73+
/// Untranspose 1024-bits from FastLanes layout.
74+
///
6775
/// Dispatch untranspose to the best available implementation at runtime.
6876
#[inline]
6977
pub fn untranspose_bits(input: &[u8; 128], output: &mut [u8; 128]) {
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::mem;
5+
use std::mem::MaybeUninit;
6+
7+
use vortex_array::Canonical;
8+
use vortex_array::ExecutionCtx;
9+
use vortex_array::IntoArray;
10+
use vortex_array::arrays::BoolArray;
11+
use vortex_array::validity::Validity;
12+
use vortex_buffer::BitBuffer;
13+
use vortex_buffer::ByteBuffer;
14+
use vortex_buffer::ByteBufferMut;
15+
use vortex_error::VortexExpect;
16+
use vortex_error::VortexResult;
17+
18+
use crate::bit_transpose::transpose_bits;
19+
use crate::bit_transpose::untranspose_bits;
20+
21+
pub fn transpose_validity(validity: &Validity, ctx: &mut ExecutionCtx) -> VortexResult<Validity> {
22+
match validity {
23+
Validity::Array(mask) => {
24+
let bools = mask
25+
.clone()
26+
.execute::<Canonical>(ctx)?
27+
.into_bool()
28+
.into_bit_buffer();
29+
30+
Ok(Validity::Array(
31+
BoolArray::new(transpose_bitbuffer(bools), Validity::NonNullable).into_array(),
32+
))
33+
}
34+
v @ Validity::AllValid | v @ Validity::AllInvalid | v @ Validity::NonNullable => {
35+
Ok(v.clone())
36+
}
37+
}
38+
}
39+
40+
#[inline]
41+
pub fn transpose_bitbuffer(bits: BitBuffer) -> BitBuffer {
42+
let (offset, len, bytes) = bits.into_inner();
43+
44+
if bytes.len().is_multiple_of(128) {
45+
match bytes.try_into_mut() {
46+
Ok(mut bytes_mut) => {
47+
// We can ignore the spare trailer capacity that can be an artifact of allocator as we requested 128 multiple chunks
48+
let (chunks, _) = bytes_mut.as_chunks_mut::<128>();
49+
let mut tmp = [0u8; 128];
50+
for chunk in chunks {
51+
transpose_bits(chunk, &mut tmp);
52+
chunk.copy_from_slice(&tmp);
53+
}
54+
BitBuffer::new_with_offset(bytes_mut.freeze().into_byte_buffer(), len, offset)
55+
}
56+
Err(bytes) => bits_op_with_copy(bytes, len, offset, transpose_bits),
57+
}
58+
} else {
59+
bits_op_with_copy(bytes, len, offset, transpose_bits)
60+
}
61+
}
62+
63+
pub fn untranspose_validity(validity: &Validity, ctx: &mut ExecutionCtx) -> VortexResult<Validity> {
64+
match validity {
65+
Validity::Array(mask) => {
66+
let bools = mask
67+
.clone()
68+
.execute::<Canonical>(ctx)?
69+
.into_bool()
70+
.into_bit_buffer();
71+
72+
Ok(Validity::Array(
73+
BoolArray::new(untranspose_bitbuffer(bools), Validity::NonNullable).into_array(),
74+
))
75+
}
76+
v @ Validity::AllValid | v @ Validity::AllInvalid | v @ Validity::NonNullable => {
77+
Ok(v.clone())
78+
}
79+
}
80+
}
81+
82+
#[inline]
83+
pub fn untranspose_bitbuffer(bits: BitBuffer) -> BitBuffer {
84+
assert!(
85+
bits.inner().len().is_multiple_of(128),
86+
"Transpose BitBuffer must be 128-byte aligned"
87+
);
88+
let (offset, len, bytes) = bits.into_inner();
89+
match bytes.try_into_mut() {
90+
Ok(mut bytes_mut) => {
91+
let (chunks, _) = bytes_mut.as_chunks_mut::<128>();
92+
let mut tmp = [0u8; 128];
93+
for chunk in chunks {
94+
untranspose_bits(chunk, &mut tmp);
95+
chunk.copy_from_slice(&tmp);
96+
}
97+
BitBuffer::new_with_offset(bytes_mut.freeze().into_byte_buffer(), len, offset)
98+
}
99+
Err(bytes) => bits_op_with_copy(bytes, len, offset, untranspose_bits),
100+
}
101+
}
102+
103+
fn bits_op_with_copy<F: Fn(&[u8; 128], &mut [u8; 128])>(
104+
bytes: ByteBuffer,
105+
len: usize,
106+
offset: usize,
107+
op: F,
108+
) -> BitBuffer {
109+
let output_len = bytes.len().next_multiple_of(128);
110+
let mut output = ByteBufferMut::with_capacity(output_len);
111+
let (input_chunks, input_trailer) = bytes.as_chunks::<128>();
112+
// We can ignore the spare trailer capacity that can be an artifact of allocator as we requested 128 multiple chunks
113+
let (output_chunks, _) = output.spare_capacity_mut().as_chunks_mut::<128>();
114+
115+
for (input, output) in input_chunks.iter().zip(output_chunks.iter_mut()) {
116+
op(input, unsafe {
117+
mem::transmute::<&mut [MaybeUninit<u8>; 128], &mut [u8; 128]>(output)
118+
});
119+
}
120+
121+
if !input_trailer.is_empty() {
122+
let mut padded_input = [0u8; 128];
123+
padded_input[0..input_trailer.len()].clone_from_slice(input_trailer);
124+
op(&padded_input, unsafe {
125+
mem::transmute::<&mut [MaybeUninit<u8>; 128], &mut [u8; 128]>(
126+
output_chunks
127+
.last_mut()
128+
.vortex_expect("Output wasn't a multiple of 128 bytes"),
129+
)
130+
});
131+
}
132+
133+
unsafe { output.set_len(output_len) };
134+
BitBuffer::new_with_offset(
135+
output.freeze().into_byte_buffer(),
136+
len.next_multiple_of(1024),
137+
offset,
138+
)
139+
}

0 commit comments

Comments
 (0)