Skip to content

Commit 8eee959

Browse files
feat[vortex-runend]: handle canonicalizing Ree<Utf8> (#6788)
This allows writing these kinds of types. I'm unaware of the motivation behind restricting ree values to just primitive and bool prior to this commit. ## Summary The motivation for this PR is that I want to write RunEndArrays with these value types. ## Testing Unit tests for each new canonicalization path. --------- Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com> Signed-off-by: Robert Kruszewski <github@robertk.io> Co-authored-by: Robert Kruszewski <github@robertk.io>
1 parent 2fd4df3 commit 8eee959

5 files changed

Lines changed: 133 additions & 43 deletions

File tree

encodings/runend/benches/run_end_compress.rs

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
use divan::Bencher;
77
use itertools::repeat_n;
8-
use vortex_array::DynArray;
98
use vortex_array::IntoArray;
109
use vortex_array::LEGACY_SESSION;
1110
use vortex_array::RecursiveCanonical;
1211
use vortex_array::VortexSessionExecute;
1312
use vortex_array::arrays::PrimitiveArray;
13+
use vortex_array::arrays::VarBinViewArray;
1414
use vortex_array::compute::warm_up_vtables;
1515
use vortex_array::dtype::IntegerPType;
1616
use vortex_array::validity::Validity;
@@ -76,8 +76,12 @@ fn decompress<T: IntegerPType>(bencher: Bencher, (length, run_step): (usize, usi
7676
let array = run_end_array.into_array();
7777

7878
bencher
79-
.with_inputs(|| &array)
80-
.bench_refs(|array| array.to_canonical());
79+
.with_inputs(|| (array.clone(), LEGACY_SESSION.create_execution_ctx()))
80+
.bench_values(|(array, mut execution_ctx)| {
81+
array
82+
.execute::<RecursiveCanonical>(&mut execution_ctx)
83+
.unwrap()
84+
});
8185
}
8286

8387
#[divan::bench(args = BENCH_ARGS)]
@@ -113,3 +117,26 @@ fn take_indices(bencher: Bencher, (length, run_step): (usize, usize)) {
113117
.unwrap()
114118
});
115119
}
120+
121+
#[divan::bench(args = BENCH_ARGS)]
122+
fn decompress_utf8(bencher: Bencher, (length, run_step): (usize, usize)) {
123+
let num_runs = length.div_ceil(run_step);
124+
let ends = (0..num_runs)
125+
.map(|i| ((i + 1) * run_step).min(length) as u64)
126+
.collect::<Buffer<_>>()
127+
.into_array();
128+
129+
let values = VarBinViewArray::from_iter_str((0..num_runs).map(|i| format!("run_value_{i}")))
130+
.into_array();
131+
132+
let run_end_array = RunEndArray::new(ends, values);
133+
let array = run_end_array.into_array();
134+
135+
bencher
136+
.with_inputs(|| (array.clone(), LEGACY_SESSION.create_execution_ctx()))
137+
.bench_values(|(array, mut execution_ctx)| {
138+
array
139+
.execute::<RecursiveCanonical>(&mut execution_ctx)
140+
.unwrap()
141+
});
142+
}

encodings/runend/public-api.lock

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ pub fn vortex_runend::compress::runend_decode_typed_bool(run_ends: impl core::it
1010

1111
pub fn vortex_runend::compress::runend_decode_typed_primitive<T: vortex_array::dtype::ptype::NativePType>(run_ends: impl core::iter::traits::iterator::Iterator<Item = usize>, values: &[T], values_validity: vortex_mask::Mask, values_nullability: vortex_array::dtype::nullability::Nullability, length: usize) -> vortex_array::arrays::primitive::array::PrimitiveArray
1212

13+
pub fn vortex_runend::compress::runend_decode_varbinview(ends: vortex_array::arrays::primitive::array::PrimitiveArray, values: vortex_array::arrays::varbinview::array::VarBinViewArray, offset: usize, length: usize) -> vortex_error::VortexResult<vortex_array::arrays::varbinview::array::VarBinViewArray>
14+
1315
pub fn vortex_runend::compress::runend_encode(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> (vortex_array::arrays::primitive::array::PrimitiveArray, vortex_array::array::ArrayRef)
1416

1517
pub struct vortex_runend::RunEndArray

encodings/runend/src/arbitrary.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ pub struct ArbitraryRunEndArray(pub RunEndArray);
2222

2323
impl<'a> Arbitrary<'a> for ArbitraryRunEndArray {
2424
fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
25-
// RunEnd supports Bool or Primitive types for values
26-
// Pick a random primitive type for values
25+
// Pick a random primitive type for values.
2726
let ptype: PType = u.arbitrary()?;
2827
let nullability: Nullability = u.arbitrary()?;
2928
let dtype = DType::Primitive(ptype, nullability);

encodings/runend/src/array.rs

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use vortex_array::Precision;
1515
use vortex_array::ProstMetadata;
1616
use vortex_array::SerializeMetadata;
1717
use vortex_array::arrays::PrimitiveVTable;
18+
use vortex_array::arrays::VarBinViewArray;
1819
use vortex_array::buffer::BufferHandle;
1920
use vortex_array::dtype::DType;
2021
use vortex_array::dtype::Nullability;
@@ -39,6 +40,7 @@ use vortex_session::VortexSession;
3940

4041
use crate::compress::runend_decode_bools;
4142
use crate::compress::runend_decode_primitive;
43+
use crate::compress::runend_decode_varbinview;
4244
use crate::compress::runend_encode;
4345
use crate::kernel::PARENT_KERNELS;
4446
use crate::rules::RULES;
@@ -239,12 +241,6 @@ impl RunEndArray {
239241
"run ends must be unsigned integers, was {}",
240242
ends.dtype(),
241243
);
242-
vortex_ensure!(
243-
values.dtype().is_primitive() || values.dtype().is_boolean(),
244-
"RunEnd array can only have Bool or Primitive values, {} given",
245-
values.dtype()
246-
);
247-
248244
vortex_ensure!(
249245
ends.len() == values.len(),
250246
"run ends len != run values len, {} != {}",
@@ -342,32 +338,7 @@ impl RunEndArray {
342338
///
343339
/// # Validation
344340
///
345-
/// The `ends` must be non-nullable unsigned integers. The values may be `Bool` or `Primitive`
346-
/// types.
347-
///
348-
/// # Examples
349-
///
350-
/// ```
351-
/// # use vortex_array::arrays::{BoolArray, VarBinViewArray};
352-
/// # use vortex_array::IntoArray;
353-
/// # use vortex_buffer::buffer;
354-
/// # use vortex_runend::RunEndArray;
355-
///
356-
/// // Error to provide incorrectly-typed values!
357-
/// let result = RunEndArray::try_new(
358-
/// buffer![1u8, 2u8].into_array(),
359-
/// VarBinViewArray::from_iter_str(["bad", "dtype"]).into_array(),
360-
/// );
361-
/// assert!(result.is_err());
362-
///
363-
/// // This array is happy
364-
/// let result = RunEndArray::try_new(
365-
/// buffer![1u8, 2u8].into_array(),
366-
/// BoolArray::from_iter([false, true]).into_array(),
367-
/// );
368-
///
369-
/// assert!(result.is_ok());
370-
/// ```
341+
/// The `ends` must be non-nullable unsigned integers.
371342
pub fn try_new(ends: ArrayRef, values: ArrayRef) -> VortexResult<Self> {
372343
let length: usize = if ends.is_empty() {
373344
0
@@ -510,6 +481,7 @@ pub(super) fn run_end_canonicalize(
510481
ctx: &mut ExecutionCtx,
511482
) -> VortexResult<ArrayRef> {
512483
let pends = array.ends().clone().execute_as("ends", ctx)?;
484+
513485
Ok(match array.dtype() {
514486
DType::Bool(_) => {
515487
let bools = array.values().clone().execute_as("values", ctx)?;
@@ -519,13 +491,22 @@ pub(super) fn run_end_canonicalize(
519491
let pvalues = array.values().clone().execute_as("values", ctx)?;
520492
runend_decode_primitive(pends, pvalues, array.offset(), array.len())?.into_array()
521493
}
522-
_ => vortex_panic!("Only Primitive and Bool values are supported"),
494+
DType::Utf8(_) | DType::Binary(_) => {
495+
let values = array
496+
.values()
497+
.clone()
498+
.execute_as::<VarBinViewArray>("values", ctx)?;
499+
runend_decode_varbinview(pends, values, array.offset(), array.len())?.into_array()
500+
}
501+
_ => vortex_bail!("Unsupported RunEnd value type: {}", array.dtype()),
523502
})
524503
}
525504

526505
#[cfg(test)]
527506
mod tests {
528507
use vortex_array::IntoArray;
508+
use vortex_array::arrays::DictArray;
509+
use vortex_array::arrays::VarBinViewArray;
529510
use vortex_array::assert_arrays_eq;
530511
use vortex_array::dtype::DType;
531512
use vortex_array::dtype::Nullability;
@@ -552,4 +533,33 @@ mod tests {
552533
let expected = buffer![1, 1, 2, 2, 2, 3, 3, 3, 3, 3].into_array();
553534
assert_arrays_eq!(arr.into_array(), expected);
554535
}
536+
537+
#[test]
538+
fn test_runend_utf8() {
539+
let values = VarBinViewArray::from_iter_str(["a", "b", "c"]).into_array();
540+
let arr = RunEndArray::new(buffer![2u32, 5, 10].into_array(), values);
541+
assert_eq!(arr.len(), 10);
542+
assert_eq!(arr.dtype(), &DType::Utf8(Nullability::NonNullable));
543+
544+
let expected =
545+
VarBinViewArray::from_iter_str(["a", "a", "b", "b", "b", "c", "c", "c", "c", "c"])
546+
.into_array();
547+
assert_arrays_eq!(arr.into_array(), expected);
548+
}
549+
550+
#[test]
551+
fn test_runend_dict() {
552+
let dict_values = VarBinViewArray::from_iter_str(["x", "y", "z"]).into_array();
553+
let dict_codes = buffer![0u32, 1, 2].into_array();
554+
let dict = DictArray::try_new(dict_codes, dict_values).unwrap();
555+
556+
let arr =
557+
RunEndArray::try_new(buffer![2u32, 5, 10].into_array(), dict.into_array()).unwrap();
558+
assert_eq!(arr.len(), 10);
559+
560+
let expected =
561+
VarBinViewArray::from_iter_str(["x", "x", "y", "y", "y", "z", "z", "z", "z", "z"])
562+
.into_array();
563+
assert_arrays_eq!(arr.into_array(), expected);
564+
}
555565
}

encodings/runend/src/compress.rs

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ use vortex_array::ToCanonical;
88
use vortex_array::arrays::BoolArray;
99
use vortex_array::arrays::ConstantArray;
1010
use vortex_array::arrays::PrimitiveArray;
11+
use vortex_array::arrays::VarBinViewArray;
12+
use vortex_array::buffer::BufferHandle;
1113
use vortex_array::dtype::NativePType;
1214
use vortex_array::dtype::Nullability;
1315
use vortex_array::expr::stats::Precision;
@@ -204,13 +206,16 @@ pub fn runend_decode_bools(
204206
}))
205207
}
206208

207-
pub fn runend_decode_typed_primitive<T: NativePType>(
209+
/// Decode a run-end encoded slice of values into a flat `Buffer<T>` and `Validity`.
210+
///
211+
/// This is the core decode loop shared by primitive and varbinview run-end decoding.
212+
fn runend_decode_slice<T: Copy + Default>(
208213
run_ends: impl Iterator<Item = usize>,
209214
values: &[T],
210215
values_validity: Mask,
211216
values_nullability: Nullability,
212217
length: usize,
213-
) -> PrimitiveArray {
218+
) -> (Buffer<T>, Validity) {
214219
match values_validity {
215220
Mask::AllTrue(_) => {
216221
let mut decoded: BufferMut<T> = BufferMut::with_capacity(length);
@@ -225,9 +230,9 @@ pub fn runend_decode_typed_primitive<T: NativePType>(
225230
// We preallocate enough capacity because we know the total length
226231
unsafe { decoded.push_n_unchecked(*value, end - decoded.len()) };
227232
}
228-
PrimitiveArray::new(decoded, values_nullability.into())
233+
(decoded.into(), values_nullability.into())
229234
}
230-
Mask::AllFalse(_) => PrimitiveArray::new(Buffer::<T>::zeroed(length), Validity::AllInvalid),
235+
Mask::AllFalse(_) => (Buffer::<T>::zeroed(length), Validity::AllInvalid),
231236
Mask::Values(mask) => {
232237
let mut decoded = BufferMut::with_capacity(length);
233238
let mut decoded_validity = BitBufferMut::with_capacity(length);
@@ -258,11 +263,28 @@ pub fn runend_decode_typed_primitive<T: NativePType>(
258263
}
259264
}
260265
}
261-
PrimitiveArray::new(decoded, Validity::from(decoded_validity.freeze()))
266+
(decoded.into(), Validity::from(decoded_validity.freeze()))
262267
}
263268
}
264269
}
265270

271+
pub fn runend_decode_typed_primitive<T: NativePType>(
272+
run_ends: impl Iterator<Item = usize>,
273+
values: &[T],
274+
values_validity: Mask,
275+
values_nullability: Nullability,
276+
length: usize,
277+
) -> PrimitiveArray {
278+
let (decoded, validity) = runend_decode_slice(
279+
run_ends,
280+
values,
281+
values_validity,
282+
values_nullability,
283+
length,
284+
);
285+
PrimitiveArray::new(decoded, validity)
286+
}
287+
266288
pub fn runend_decode_typed_bool(
267289
run_ends: impl Iterator<Item = usize>,
268290
values: &BitBuffer,
@@ -304,6 +326,36 @@ pub fn runend_decode_typed_bool(
304326
}
305327
}
306328

329+
/// Decode a run-end encoded VarBinView array by expanding views directly.
330+
pub fn runend_decode_varbinview(
331+
ends: PrimitiveArray,
332+
values: VarBinViewArray,
333+
offset: usize,
334+
length: usize,
335+
) -> VortexResult<VarBinViewArray> {
336+
let validity_mask = values.validity_mask()?;
337+
let views = values.views();
338+
339+
let (decoded_views, validity) = match_each_unsigned_integer_ptype!(ends.ptype(), |E| {
340+
runend_decode_slice(
341+
trimmed_ends_iter(ends.as_slice::<E>(), offset, length),
342+
views,
343+
validity_mask,
344+
values.dtype().nullability(),
345+
length,
346+
)
347+
});
348+
349+
let parts = values.into_parts();
350+
let view_handle = BufferHandle::new_host(decoded_views.into_byte_buffer());
351+
352+
// SAFETY: we are expanding views from a valid VarBinViewArray with the same
353+
// buffers, so all buffer indices and offsets remain valid.
354+
Ok(unsafe {
355+
VarBinViewArray::new_handle_unchecked(view_handle, parts.buffers, parts.dtype, validity)
356+
})
357+
}
358+
307359
#[cfg(test)]
308360
mod test {
309361
use vortex_array::ToCanonical;

0 commit comments

Comments
 (0)