perf[arrow-select]: add specialized REE interleave

asubiotto · asubiotto · commit f103e47aa333 · 2026-04-30T14:17:02.000+02:00
The specialized interleave preserves runs as much as possible: adjacent logical
indices that resolve to the same physical run of the same source are coalesced
into a single output run, and interleave is then called on the runs' values arrays.

Future work could additionally coalesce values across sources, but this
requires a value equality check.

Signed-off-by: Alfonso Subiotto Marques &lt;alfonso.subiotto@polarsignals.com&gt;
diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs
@@ -108,6 +108,12 @@ pub fn interleave(
         DataType::Struct(fields) => interleave_struct(fields, values, indices),
         DataType::List(field) => interleave_list::<i32>(values, indices, field),
         DataType::LargeList(field) => interleave_list::<i64>(values, indices, field),
+        DataType::RunEndEncoded(r, _) => match r.data_type() {
+            DataType::Int16 => interleave_run_end::<Int16Type>(values, indices),
+            DataType::Int32 => interleave_run_end::<Int32Type>(values, indices),
+            DataType::Int64 => interleave_run_end::<Int64Type>(values, indices),
+            t => unreachable!("illegal run-end type {t}"),
+        },
         _ => interleave_fallback(values, indices)
     }
 }
@@ -411,6 +417,70 @@ fn interleave_list<O: OffsetSizeTrait>(
     Ok(Arc::new(list_array))
 }
 
+/// Specialized [`interleave`] for [`RunArray`].
+///
+/// Resolves each `(array, logical_row)` in `indices` to a physical run; adjacent output
+/// rows on the same run of the same input share one output run, whose values come from
+/// calling [`interleave`] on the inputs' values arrays. Returns an error if the output
+/// length or a row index does not fit `R::Native`, or if a fallible call below fails.
+fn interleave_run_end<R: RunEndIndexType>(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+) -> Result<ArrayRef, ArrowError> {
+    let n = indices.len();
+    if n == 0 {
+        return Ok(new_empty_array(values[0].data_type()));
+    }
+    if R::Native::from_usize(n).is_none() {
+        return Err(ArrowError::ComputeError(format!(
+            "interleave_run_end: output length {n} does not fit run-end type"
+        )));
+    }
+
+    let sources: Vec<&RunArray<R>> = values.iter().map(|v| v.as_run::<R>()).collect();
+    let children: Vec<&dyn Array> = sources.iter().map(|s| s.values().as_ref()).collect();
+
+    // Bucket the requested logical rows per input (remembering each row's output slot),
+    // resolve physical indices with one call per input, then scatter into output order.
+    let mut logical_by_src: Vec<Vec<R::Native>> = vec![Vec::new(); sources.len()];
+    let mut slots_by_src: Vec<Vec<usize>> = vec![Vec::new(); sources.len()];
+    for (slot, &(src, row)) in indices.iter().enumerate() {
+        let Some(native_row) = R::Native::from_usize(row) else {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "interleave_run_end: row index {row} out of range"
+            )));
+        };
+        logical_by_src[src].push(native_row);
+        slots_by_src[src].push(slot);
+    }
+    let mut resolved: Vec<(usize, usize)> = vec![(0, 0); n];
+    for (src, run) in sources.iter().enumerate() {
+        let physical = run.get_physical_indices(&logical_by_src[src])?;
+        for (&slot, &phys) in slots_by_src[src].iter().zip(physical.iter()) {
+            resolved[slot] = (src, phys);
+        }
+    }
+
+    // Start a new output run whenever the (input, physical index) pair changes.
+    // TODO: We could perform an equality check across sources to extend the
+    // output run, but we can't call make_comparator from this crate.
+    let mut ends: Vec<R::Native> = Vec::with_capacity(n);
+    let mut run_pairs: Vec<(usize, usize)> = Vec::with_capacity(n);
+    run_pairs.push(resolved[0]);
+    for (i, w) in resolved.windows(2).enumerate() {
+        if w[0] != w[1] {
+            ends.push(R::Native::from_usize(i + 1).unwrap());
+            run_pairs.push(w[1]);
+        }
+    }
+    ends.push(R::Native::from_usize(n).unwrap());
+
+    let run_values = interleave(&children, &run_pairs)?;
+    let run_ends = PrimitiveArray::<R>::from_iter_values(ends);
+    let encoded = RunArray::<R>::try_new(&run_ends, run_values.as_ref())?;
+    Ok(Arc::new(encoded))
+}
+
 /// Fallback implementation of interleave using [`MutableArrayData`]
 fn interleave_fallback(
     values: &[&dyn Array],