Skip to content

Commit 002f49e

Browse files
authored
fix[vortex-array]: handle lazily sliced REE to_arrow (#7011)
## Summary This previous code was short-circuiting with a zero offset, with the assumption that ends were already correctly physically sliced. However, slicing is a lazy operation using metadata, so this commit fixes a couple of edge cases by executing the fast path only if the ends are correcly sliced. Otherwise, ends are trimmed and transformed with the offset in for correct arrow array construction. <!-- Thank you for submitting a pull request! We appreciate your time and effort. Please make sure to provide enough information so that we can review your pull request. The Summary and Testing sections below contain guidance on what to include. --> <!-- If this PR is related to a tracked effort, please link to the relevant issue here (e.g., `Closes: #123`). Otherwise, feel free to ignore / delete this. In this section, please: 1. Explain the rationale for this change. 2. Summarize the changes included in this PR. A general rule of thumb is that larger PRs should have larger summaries. If there are a lot of changes, please help us review the code by explaining what was changed and why. If there is an issue or discussion attached, there is no need to duplicate all the details, but clarity is always preferred over brevity. --> <!-- ## API Changes Uncomment this section if there are any user-facing changes. Consider whether the change affects users in one of the following ways: 1. Breaks public APIs in some way. 2. Changes the underlying behavior of one of the engine integrations. 3. Should some documentation be updated to reflect this change? If a public API is changed in a breaking manner, make sure to add the appropriate label. You can run `./scripts/public-api.sh` locally to see if there are any public API changes (and this also runs in our CI). --> ## Testing <!-- Please describe how this change was tested. Here are some common categories for testing in Vortex: 1. Verifying existing behavior is maintained. 2. Verifying new behavior and functionality works correctly. 3. Serialization compatibility (backwards and forwards) should be maintained or explicitly broken. --> Enhanced tests to reproduce e2e arrow execution error and added some unit tests to build_run_array Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com>
1 parent c63a800 commit 002f49e

2 files changed

Lines changed: 79 additions & 17 deletions

File tree

encodings/runend/src/arrow.rs

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ mod tests {
6464
use arrow_array::types::Int64Type;
6565
use arrow_schema::DataType;
6666
use arrow_schema::Field;
67+
use rstest::rstest;
6768
use vortex_array::IntoArray as _;
6869
use vortex_array::VortexSessionExecute as _;
6970
use vortex_array::arrays::PrimitiveArray;
@@ -251,19 +252,37 @@ mod tests {
251252
Ok(())
252253
}
253254

254-
#[test]
255-
fn test_sliced_runend_to_arrow_ree() -> VortexResult<()> {
256-
let array = RunEndArray::encode(
257-
PrimitiveArray::from_iter(vec![10i32, 10, 20, 20, 20, 30, 30]).into_array(),
258-
)?;
259-
// Slicing from index 1 produces a non-zero offset in the RunEndArray.
260-
let sliced = array.into_array().slice(1..5)?;
255+
/// Slicing a RunEndArray and converting to Arrow REE must produce
256+
/// correctly trimmed and adjusted run ends for both zero and non-zero offsets.
257+
#[rstest]
258+
#[case::nonzero_offset(
259+
&[10i32, 10, 20, 20, 20, 30, 30],
260+
1..5usize,
261+
&[1i32, 4],
262+
&[10i32, 20],
263+
)]
264+
#[case::zero_offset_excess_runs(
265+
&[10i32, 10, 10, 20, 20, 30, 30, 30, 30, 30],
266+
0..4usize,
267+
&[3i32, 4],
268+
&[10i32, 20],
269+
)]
270+
fn sliced_runend_to_arrow_ree(
271+
#[case] input: &[i32],
272+
#[case] slice_range: std::ops::Range<usize>,
273+
#[case] expected_ends: &[i32],
274+
#[case] expected_values: &[i32],
275+
) -> VortexResult<()> {
276+
let array =
277+
RunEndArray::encode(PrimitiveArray::from_iter(input.iter().copied()).into_array())?;
278+
let sliced = array.into_array().slice(slice_range.clone())?;
261279
let target = ree_type(DataType::Int32, DataType::Int32);
262280
let result = execute(sliced, &target)?;
263281

282+
assert_eq!(result.len(), slice_range.len());
264283
let expected = RunArray::<Int32Type>::try_new(
265-
&Int32Array::from(vec![1, 4]),
266-
&Int32Array::from(vec![10, 20]),
284+
&Int32Array::from(expected_ends.to_vec()),
285+
&Int32Array::from(expected_values.to_vec()),
267286
)?;
268287
assert_eq!(result.as_ref(), &expected);
269288
Ok(())

vortex-array/src/arrow/executor/run_end.rs

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,22 +118,35 @@ fn build_run_array<R: RunEndIndexType>(
118118
where
119119
R::Native: std::ops::Sub<Output = R::Native> + Ord,
120120
{
121-
if offset == 0 {
122-
return Ok(
123-
Arc::new(RunArray::<R>::try_new(ends.as_primitive::<R>(), values)?) as ArrowArrayRef,
124-
);
125-
}
126-
127121
let offset_native = R::Native::from_usize(offset)
128122
.ok_or_else(|| vortex_err!("Offset {offset} exceeds run-end index capacity"))?;
129123
let length_native = R::Native::from_usize(length)
130124
.ok_or_else(|| vortex_err!("Length {length} exceeds run-end index capacity"))?;
131125

132-
let adjusted = ends
126+
let ends_prim = ends.as_primitive::<R>();
127+
if offset == 0 && ends_prim.values().last() == Some(&length_native) {
128+
// Fast path: no trimming or adjustment needed.
129+
return Ok(Arc::new(RunArray::<R>::try_new(ends_prim, values)?) as ArrowArrayRef);
130+
}
131+
132+
// Trim to only include runs covering the [offset, offset+length) range.
133+
// Runs beyond this would produce duplicate adjusted ends, violating
134+
// Arrow's strict-ordering requirement for RunArray.
135+
// Run ends are strictly increasing, so we can binary search.
136+
let num_runs = (ends_prim
137+
.values()
138+
.partition_point(|&e| e - offset_native < length_native)
139+
+ 1)
140+
.min(ends_prim.len());
141+
142+
let trimmed_ends = ends.slice(0, num_runs);
143+
let trimmed_values = values.slice(0, num_runs);
144+
145+
let adjusted = trimmed_ends
133146
.as_primitive::<R>()
134147
.unary(|end| (end - offset_native).min(length_native));
135148

136-
Ok(Arc::new(RunArray::<R>::try_new(&adjusted, values)?) as ArrowArrayRef)
149+
Ok(Arc::new(RunArray::<R>::try_new(&adjusted, &trimmed_values)?) as ArrowArrayRef)
137150
}
138151

139152
/// Convert a constant array to a run-end encoded array with a single run.
@@ -275,4 +288,34 @@ mod tests {
275288
assert_eq!(result.as_ref(), &expected);
276289
Ok(())
277290
}
291+
292+
/// Regression: build_run_array must trim excess trailing runs and
293+
/// respect the `length` parameter. This happens when a vortex
294+
/// RunEndArray is sliced to fewer rows than the physical run_ends cover.
295+
#[rstest]
296+
#[case::offset_zero(0, 5, &[3, 5], &[100, 200])]
297+
#[case::nonzero_offset(2, 3, &[1, 3], &[100, 200])]
298+
#[case::all_runs_needed_but_last_exceeds(0, 8, &[3, 5, 8], &[100, 200, 300])]
299+
fn build_run_array_trims_excess_runs(
300+
#[case] offset: usize,
301+
#[case] length: usize,
302+
#[case] expected_ends: &[i32],
303+
#[case] expected_values: &[i64],
304+
) -> VortexResult<()> {
305+
// 3 runs covering 10 rows: [0..3), [3..5), [5..10)
306+
let ends: arrow_array::ArrayRef = Arc::new(Int32Array::from(vec![3i32, 5, 10]));
307+
let values: arrow_array::ArrayRef = Arc::new(Int64Array::from(vec![100i64, 200, 300]));
308+
309+
let result = super::build_run_array::<Int32Type>(&ends, &values, offset, length)?;
310+
assert_eq!(result.len(), length);
311+
312+
let ree = result
313+
.as_any()
314+
.downcast_ref::<RunArray<Int32Type>>()
315+
.unwrap();
316+
assert_eq!(ree.run_ends().values(), expected_ends);
317+
let values = ree.values().as_any().downcast_ref::<Int64Array>().unwrap();
318+
assert_eq!(values.values(), expected_values);
319+
Ok(())
320+
}
278321
}

0 commit comments

Comments
 (0)