Skip to content

Commit 4d73f97

Browse files
authored
fix potential UB in ByteBool encoding (#7518)
## Summary Our existing ByteBool constructor accepts arbitrary bytes as input, and will attempt to transmute it to a `&[bool]`. If those bytes are not `0x00` or `0x01` that can trigger UB on release builds. ## API Changes We eliminate the `ByteBool::as_slice() -> &[bool]` method, replacing it with an accessor to the truthy byte values. We don't really need to access this as a bool-slice anyway since we can do all operations on the bytes and wrap them back up as a ByteBool. ## Tests No additional tests needed --------- Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent 102de51 commit 4d73f97

6 files changed

Lines changed: 92 additions & 79 deletions

File tree

encodings/bytebool/public-api.lock

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -76,32 +76,22 @@ pub struct vortex_bytebool::ByteBoolData
7676

7777
impl vortex_bytebool::ByteBoolData
7878

79-
pub fn vortex_bytebool::ByteBoolData::as_slice(&self) -> &[bool]
80-
8179
pub fn vortex_bytebool::ByteBoolData::buffer(&self) -> &vortex_array::buffer::BufferHandle
8280

83-
pub fn vortex_bytebool::ByteBoolData::from_vec<V: core::convert::Into<vortex_array::validity::Validity>>(data: alloc::vec::Vec<bool>, validity: V) -> Self
84-
8581
pub fn vortex_bytebool::ByteBoolData::is_empty(&self) -> bool
8682

8783
pub fn vortex_bytebool::ByteBoolData::len(&self) -> usize
8884

89-
pub fn vortex_bytebool::ByteBoolData::new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> Self
85+
pub fn vortex_bytebool::ByteBoolData::new(buffer: vortex_array::buffer::BufferHandle) -> Self
86+
87+
pub fn vortex_bytebool::ByteBoolData::truthy_bytes(&self) -> &[u8]
9088

9189
pub fn vortex_bytebool::ByteBoolData::validate(buffer: &vortex_array::buffer::BufferHandle, validity: &vortex_array::validity::Validity, dtype: &vortex_array::dtype::DType, len: usize) -> vortex_error::VortexResult<()>
9290

9391
impl core::clone::Clone for vortex_bytebool::ByteBoolData
9492

9593
pub fn vortex_bytebool::ByteBoolData::clone(&self) -> vortex_bytebool::ByteBoolData
9694

97-
impl core::convert::From<alloc::vec::Vec<bool>> for vortex_bytebool::ByteBoolData
98-
99-
pub fn vortex_bytebool::ByteBoolData::from(value: alloc::vec::Vec<bool>) -> Self
100-
101-
impl core::convert::From<alloc::vec::Vec<core::option::Option<bool>>> for vortex_bytebool::ByteBoolData
102-
103-
pub fn vortex_bytebool::ByteBoolData::from(value: alloc::vec::Vec<core::option::Option<bool>>) -> Self
104-
10595
impl core::fmt::Debug for vortex_bytebool::ByteBoolData
10696

10797
pub fn vortex_bytebool::ByteBoolData::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result

encodings/bytebool/src/array.rs

Lines changed: 30 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use vortex_array::vtable::VTable;
2929
use vortex_array::vtable::ValidityVTable;
3030
use vortex_array::vtable::child_to_validity;
3131
use vortex_array::vtable::validity_to_child;
32-
use vortex_buffer::BitBuffer;
32+
use vortex_buffer::BitBufferMut;
3333
use vortex_buffer::ByteBuffer;
3434
use vortex_error::VortexResult;
3535
use vortex_error::vortex_bail;
@@ -131,7 +131,7 @@ impl VTable for ByteBool {
131131
}
132132
let buffer = buffers[0].clone();
133133

134-
let data = ByteBoolData::new(buffer, validity.clone());
134+
let data = ByteBoolData::new(buffer);
135135
let slots = ByteBoolData::make_slots(&validity, len);
136136
Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots))
137137
}
@@ -149,7 +149,8 @@ impl VTable for ByteBool {
149149
}
150150

151151
fn execute(array: Array<Self>, _ctx: &mut ExecutionCtx) -> VortexResult<ExecutionResult> {
152-
let boolean_buffer = BitBuffer::from(array.as_slice());
152+
// convert truthy values to set/unset bits
153+
let boolean_buffer = BitBufferMut::from(array.truthy_bytes()).freeze();
153154
let validity = array.validity()?;
154155
Ok(ExecutionResult::done(
155156
BoolArray::new(boolean_buffer, validity).into_array(),
@@ -198,9 +199,17 @@ pub struct ByteBool;
198199

199200
impl ByteBool {
200201
pub fn new(buffer: BufferHandle, validity: Validity) -> ByteBoolArray {
202+
if let Some(len) = validity.maybe_len() {
203+
assert_eq!(
204+
buffer.len(),
205+
len,
206+
"ByteBool validity and bytes must have same length"
207+
);
208+
}
201209
let dtype = DType::Bool(validity.nullability());
210+
202211
let slots = ByteBoolData::make_slots(&validity, buffer.len());
203-
let data = ByteBoolData::new(buffer, validity);
212+
let data = ByteBoolData::new(buffer);
204213
let len = data.len();
205214
unsafe {
206215
Array::from_parts_unchecked(
@@ -212,29 +221,22 @@ impl ByteBool {
212221
/// Construct a [`ByteBoolArray`] from a `Vec<bool>` and validity.
213222
pub fn from_vec<V: Into<Validity>>(data: Vec<bool>, validity: V) -> ByteBoolArray {
214223
let validity = validity.into();
215-
let data = ByteBoolData::from_vec(data, validity.clone());
216-
let dtype = DType::Bool(validity.nullability());
217-
let len = data.len();
218-
let slots = ByteBoolData::make_slots(&validity, len);
219-
unsafe {
220-
Array::from_parts_unchecked(
221-
ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots),
222-
)
223-
}
224+
// NOTE: this will not cause allocation on release builds
225+
let bytes: Vec<u8> = data.into_iter().map(|b| b as u8).collect();
226+
let handle = BufferHandle::new_host(ByteBuffer::from(bytes));
227+
ByteBool::new(handle, validity)
224228
}
225229

226230
/// Construct a [`ByteBoolArray`] from optional bools.
227231
pub fn from_option_vec(data: Vec<Option<bool>>) -> ByteBoolArray {
228232
let validity = Validity::from_iter(data.iter().map(|v| v.is_some()));
229-
let data = ByteBoolData::from(data);
230-
let dtype = DType::Bool(validity.nullability());
231-
let len = data.len();
232-
let slots = ByteBoolData::make_slots(&validity, len);
233-
unsafe {
234-
Array::from_parts_unchecked(
235-
ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots),
236-
)
237-
}
233+
// NOTE: this will not cause allocation on release builds
234+
let bytes: Vec<u8> = data
235+
.into_iter()
236+
.map(|b| b.unwrap_or_default() as u8)
237+
.collect();
238+
let handle = BufferHandle::new_host(ByteBuffer::from(bytes));
239+
ByteBool::new(handle, validity)
238240
}
239241
}
240242

@@ -265,17 +267,7 @@ impl ByteBoolData {
265267
vec![validity_to_child(validity, len)]
266268
}
267269

268-
pub fn new(buffer: BufferHandle, validity: Validity) -> Self {
269-
let length = buffer.len();
270-
if let Some(vlen) = validity.maybe_len()
271-
&& length != vlen
272-
{
273-
vortex_panic!(
274-
"Buffer length ({}) does not match validity length ({})",
275-
length,
276-
vlen
277-
);
278-
}
270+
pub fn new(buffer: BufferHandle) -> Self {
279271
Self { buffer }
280272
}
281273

@@ -289,21 +281,15 @@ impl ByteBoolData {
289281
self.buffer.len() == 0
290282
}
291283

292-
// TODO(ngates): deprecate construction from vec
293-
pub fn from_vec<V: Into<Validity>>(data: Vec<bool>, validity: V) -> Self {
294-
let validity = validity.into();
295-
// SAFETY: we are transmuting a Vec<bool> into a Vec<u8>
296-
let data: Vec<u8> = unsafe { std::mem::transmute(data) };
297-
Self::new(BufferHandle::new_host(ByteBuffer::from(data)), validity)
298-
}
299-
300284
pub fn buffer(&self) -> &BufferHandle {
301285
&self.buffer
302286
}
303287

304-
pub fn as_slice(&self) -> &[bool] {
305-
// Safety: The internal buffer contains byte-sized bools
306-
unsafe { std::mem::transmute(self.buffer().as_host().as_slice()) }
288+
/// Get access to the underlying 8-bit truthy values.
289+
///
290+
/// The zero byte indicates `false`, and any non-zero byte is a `true`.
291+
pub fn truthy_bytes(&self) -> &[u8] {
292+
self.buffer().as_host().as_slice()
307293
}
308294
}
309295

@@ -326,23 +312,6 @@ impl OperationsVTable<ByteBool> for ByteBool {
326312
}
327313
}
328314

329-
impl From<Vec<bool>> for ByteBoolData {
330-
fn from(value: Vec<bool>) -> Self {
331-
Self::from_vec(value, Validity::AllValid)
332-
}
333-
}
334-
335-
impl From<Vec<Option<bool>>> for ByteBoolData {
336-
fn from(value: Vec<Option<bool>>) -> Self {
337-
let validity = Validity::from_iter(value.iter().map(|v| v.is_some()));
338-
339-
// This doesn't reallocate, and the compiler even vectorizes it
340-
let data = value.into_iter().map(Option::unwrap_or_default).collect();
341-
342-
Self::from_vec(data, validity)
343-
}
344-
}
345-
346315
#[cfg(test)]
347316
mod tests {
348317
use vortex_array::ArrayContext;

encodings/bytebool/src/compute.rs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@ use vortex_array::ExecutionCtx;
88
use vortex_array::IntoArray;
99
use vortex_array::arrays::PrimitiveArray;
1010
use vortex_array::arrays::dict::TakeExecute;
11+
use vortex_array::buffer::BufferHandle;
1112
use vortex_array::dtype::DType;
1213
use vortex_array::match_each_integer_ptype;
1314
use vortex_array::scalar_fn::fns::cast::CastReduce;
1415
use vortex_array::scalar_fn::fns::mask::MaskReduce;
1516
use vortex_array::validity::Validity;
17+
use vortex_buffer::ByteBuffer;
1618
use vortex_error::VortexResult;
1719

1820
use super::ByteBool;
@@ -58,23 +60,25 @@ impl TakeExecute for ByteBool {
5860
ctx: &mut ExecutionCtx,
5961
) -> VortexResult<Option<ArrayRef>> {
6062
let indices = indices.clone().execute::<PrimitiveArray>(ctx)?;
61-
let bools = array.as_slice();
63+
let values = array.truthy_bytes();
6264

6365
// This handles combining validity from both source array and nullable indices
6466
let validity = array.validity()?.take(&indices.clone().into_array())?;
6567

66-
let taken_bools = match_each_integer_ptype!(indices.ptype(), |I| {
68+
let taken = match_each_integer_ptype!(indices.ptype(), |I| {
6769
indices
6870
.as_slice::<I>()
6971
.iter()
7072
.map(|&idx| {
7173
let idx: usize = idx.as_();
72-
bools[idx]
74+
values[idx]
7375
})
74-
.collect::<Vec<bool>>()
76+
.collect::<ByteBuffer>()
7577
});
7678

77-
Ok(Some(ByteBool::from_vec(taken_bools, validity).into_array()))
79+
Ok(Some(
80+
ByteBool::new(BufferHandle::new_host(taken), validity).into_array(),
81+
))
7882
}
7983
}
8084

encodings/bytebool/src/lib.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,45 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
//! A Vortex encoding that mirrors Arrow's [8-bit Boolean canonical extension type][spec].
5+
//!
6+
//! Each element is stored as a single byte. The zero byte represents `false` and any
7+
//! non-zero byte represents `true`, matching the truthy semantics of the Arrow spec. This
8+
//! trades 8x the storage of the bit-packed `Bool` layout for cheaper per-byte access —
9+
//! useful when data arrives from a C ABI or other source that already emits byte-wide
10+
//! booleans. On execution the array materializes into the standard bit-packed
11+
//! [`BoolArray`][vortex_array::arrays::BoolArray].
12+
//!
13+
//! # Examples
14+
//!
15+
//! Any non-zero byte in the backing buffer is treated as `true` when the array executes
16+
//! to a canonical [`BoolArray`][vortex_array::arrays::BoolArray]:
17+
//!
18+
//! ```
19+
//! # use vortex_array::{IntoArray, LEGACY_SESSION, VortexSessionExecute};
20+
//! # use vortex_array::arrays::BoolArray;
21+
//! # use vortex_array::arrays::bool::BoolArrayExt;
22+
//! # use vortex_array::buffer::BufferHandle;
23+
//! # use vortex_array::validity::Validity;
24+
//! # use vortex_buffer::ByteBuffer;
25+
//! # use vortex_bytebool::ByteBool;
26+
//! # use vortex_error::VortexResult;
27+
//! # fn main() -> VortexResult<()> {
28+
//! # let mut ctx = LEGACY_SESSION.create_execution_ctx();
29+
//! let handle = BufferHandle::new_host(ByteBuffer::from(vec![0u8, 1, 42, 0]));
30+
//! let array = ByteBool::new(handle, Validity::NonNullable);
31+
//!
32+
//! let bits = array.into_array().execute::<BoolArray>(&mut ctx)?.to_bit_buffer();
33+
//! assert!(!bits.value(0));
34+
//! assert!(bits.value(1));
35+
//! assert!(bits.value(2)); // byte 42 is truthy
36+
//! assert!(!bits.value(3));
37+
//! # Ok(())
38+
//! # }
39+
//! ```
40+
//!
41+
//! [spec]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean
42+
443
pub use array::*;
544

645
mod array;

vortex-buffer/public-api.lock

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,10 @@ impl core::convert::From<&[bool]> for vortex_buffer::BitBufferMut
504504

505505
pub fn vortex_buffer::BitBufferMut::from(value: &[bool]) -> Self
506506

507+
impl core::convert::From<&[u8]> for vortex_buffer::BitBufferMut
508+
509+
pub fn vortex_buffer::BitBufferMut::from(value: &[u8]) -> Self
510+
507511
impl core::convert::From<alloc::vec::Vec<bool>> for vortex_buffer::BitBufferMut
508512

509513
pub fn vortex_buffer::BitBufferMut::from(value: alloc::vec::Vec<bool>) -> Self

vortex-buffer/src/bit/buf_mut.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,13 @@ impl From<&[bool]> for BitBufferMut {
573573
}
574574
}
575575

576+
// allow building a buffer from a set of truthy byte values.
577+
impl From<&[u8]> for BitBufferMut {
578+
fn from(value: &[u8]) -> Self {
579+
BitBufferMut::collect_bool(value.len(), |i| value[i] > 0)
580+
}
581+
}
582+
576583
impl From<Vec<bool>> for BitBufferMut {
577584
fn from(value: Vec<bool>) -> Self {
578585
value.as_slice().into()

0 commit comments

Comments
 (0)