Skip to content

Commit 7dcab2c

Browse files
committed
fix[vortex-array]: handle lazily sliced REE to_arrow
This previous code was short-circuiting with a zero offset, with the assumption that ends were already correctly physically sliced. However, slicing is a lazy operation using metadata, so this commit fixes a couple of edge cases by executing the fast path only if the ends are correcly sliced. Otherwise, ends are trimmed and transformed with the offset in for correct arrow array construction. Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com>
1 parent 12fe78e commit 7dcab2c

2 files changed

Lines changed: 79 additions & 17 deletions

File tree

encodings/runend/src/arrow.rs

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ mod tests {
6464
use arrow_array::types::Int64Type;
6565
use arrow_schema::DataType;
6666
use arrow_schema::Field;
67+
use rstest::rstest;
6768
use vortex_array::IntoArray as _;
6869
use vortex_array::VortexSessionExecute as _;
6970
use vortex_array::arrays::PrimitiveArray;
@@ -251,19 +252,37 @@ mod tests {
251252
Ok(())
252253
}
253254

254-
#[test]
255-
fn test_sliced_runend_to_arrow_ree() -> VortexResult<()> {
256-
let array = RunEndArray::encode(
257-
PrimitiveArray::from_iter(vec![10i32, 10, 20, 20, 20, 30, 30]).into_array(),
258-
)?;
259-
// Slicing from index 1 produces a non-zero offset in the RunEndArray.
260-
let sliced = array.into_array().slice(1..5)?;
255+
/// Slicing a RunEndArray and converting to Arrow REE must produce
256+
/// correctly trimmed and adjusted run ends for both zero and non-zero offsets.
257+
#[rstest]
258+
#[case::nonzero_offset(
259+
&[10i32, 10, 20, 20, 20, 30, 30],
260+
1..5usize,
261+
&[1i32, 4],
262+
&[10i32, 20],
263+
)]
264+
#[case::zero_offset_excess_runs(
265+
&[10i32, 10, 10, 20, 20, 30, 30, 30, 30, 30],
266+
0..4usize,
267+
&[3i32, 4],
268+
&[10i32, 20],
269+
)]
270+
fn sliced_runend_to_arrow_ree(
271+
#[case] input: &[i32],
272+
#[case] slice_range: std::ops::Range<usize>,
273+
#[case] expected_ends: &[i32],
274+
#[case] expected_values: &[i32],
275+
) -> VortexResult<()> {
276+
let array =
277+
RunEndArray::encode(PrimitiveArray::from_iter(input.iter().copied()).into_array())?;
278+
let sliced = array.into_array().slice(slice_range.clone())?;
261279
let target = ree_type(DataType::Int32, DataType::Int32);
262280
let result = execute(sliced, &target)?;
263281

282+
assert_eq!(result.len(), slice_range.len());
264283
let expected = RunArray::<Int32Type>::try_new(
265-
&Int32Array::from(vec![1, 4]),
266-
&Int32Array::from(vec![10, 20]),
284+
&Int32Array::from(expected_ends.to_vec()),
285+
&Int32Array::from(expected_values.to_vec()),
267286
)?;
268287
assert_eq!(result.as_ref(), &expected);
269288
Ok(())

vortex-array/src/arrow/executor/run_end.rs

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,22 +118,35 @@ fn build_run_array<R: RunEndIndexType>(
118118
where
119119
R::Native: std::ops::Sub<Output = R::Native> + Ord,
120120
{
121-
if offset == 0 {
122-
return Ok(
123-
Arc::new(RunArray::<R>::try_new(ends.as_primitive::<R>(), values)?) as ArrowArrayRef,
124-
);
125-
}
126-
127121
let offset_native = R::Native::from_usize(offset)
128122
.ok_or_else(|| vortex_err!("Offset {offset} exceeds run-end index capacity"))?;
129123
let length_native = R::Native::from_usize(length)
130124
.ok_or_else(|| vortex_err!("Length {length} exceeds run-end index capacity"))?;
131125

132-
let adjusted = ends
126+
let ends_prim = ends.as_primitive::<R>();
127+
if offset == 0 && ends_prim.values().last() == Some(&length_native) {
128+
// Fast path: no trimming or adjustment needed.
129+
return Ok(Arc::new(RunArray::<R>::try_new(ends_prim, values)?) as ArrowArrayRef);
130+
}
131+
132+
// Trim to only include runs covering the [offset, offset+length) range.
133+
// Runs beyond this would produce duplicate adjusted ends, violating
134+
// Arrow's strict-ordering requirement for RunArray.
135+
let num_runs = ends_prim
136+
.values()
137+
.iter()
138+
.position(|&e| e - offset_native >= length_native)
139+
.map(|i| i + 1)
140+
.unwrap_or(ends_prim.len());
141+
142+
let trimmed_ends = ends.slice(0, num_runs);
143+
let trimmed_values = values.slice(0, num_runs);
144+
145+
let adjusted = trimmed_ends
133146
.as_primitive::<R>()
134147
.unary(|end| (end - offset_native).min(length_native));
135148

136-
Ok(Arc::new(RunArray::<R>::try_new(&adjusted, values)?) as ArrowArrayRef)
149+
Ok(Arc::new(RunArray::<R>::try_new(&adjusted, &trimmed_values)?) as ArrowArrayRef)
137150
}
138151

139152
/// Convert a constant array to a run-end encoded array with a single run.
@@ -275,4 +288,34 @@ mod tests {
275288
assert_eq!(result.as_ref(), &expected);
276289
Ok(())
277290
}
291+
292+
/// Regression: build_run_array must trim excess trailing runs and
293+
/// respect the `length` parameter. This happens when a vortex
294+
/// RunEndArray is sliced to fewer rows than the physical run_ends cover.
295+
#[rstest]
296+
#[case::offset_zero(0, 5, &[3, 5], &[100, 200])]
297+
#[case::nonzero_offset(2, 3, &[1, 3], &[100, 200])]
298+
#[case::all_runs_needed_but_last_exceeds(0, 8, &[3, 5, 8], &[100, 200, 300])]
299+
fn build_run_array_trims_excess_runs(
300+
#[case] offset: usize,
301+
#[case] length: usize,
302+
#[case] expected_ends: &[i32],
303+
#[case] expected_values: &[i64],
304+
) -> VortexResult<()> {
305+
// 3 runs covering 10 rows: [0..3), [3..5), [5..10)
306+
let ends: arrow_array::ArrayRef = Arc::new(Int32Array::from(vec![3i32, 5, 10]));
307+
let values: arrow_array::ArrayRef = Arc::new(Int64Array::from(vec![100i64, 200, 300]));
308+
309+
let result = super::build_run_array::<Int32Type>(&ends, &values, offset, length)?;
310+
assert_eq!(result.len(), length);
311+
312+
let ree = result
313+
.as_any()
314+
.downcast_ref::<RunArray<Int32Type>>()
315+
.unwrap();
316+
assert_eq!(ree.run_ends().values(), expected_ends);
317+
let values = ree.values().as_any().downcast_ref::<Int64Array>().unwrap();
318+
assert_eq!(values.values(), expected_values);
319+
Ok(())
320+
}
278321
}

0 commit comments

Comments
 (0)