Skip to content

Commit dab39bf

Browse files
Optimize interleave boolean gather (#8350)
## Summary

This PR optimizes the boolean gather path in `InterleaveArray` and adds comprehensive benchmarks comparing Vortex's implementation across access patterns, child counts, and nullability.

---------

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
Co-authored-by: Claude <noreply@anthropic.com>
1 parent 809e2a4 commit dab39bf

3 files changed

Lines changed: 302 additions & 127 deletions

File tree

vortex-array/benches/interleave.rs

Lines changed: 94 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,30 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
//! Benchmarks the Vortex [`Interleave`](vortex_array::arrays::Interleave) boolean execute path on a
5+
//! focused set of configurations:
6+
//!
7+
//! - `round_robin`, 2 children: a merge — `array_index = i % N`, `row_index = i / N`.
8+
//! - `random`, 2 children: fully random `(array_index, row_index)` per output row.
9+
//! - `random`, 64 children: the same random gather spread over many value arrays.
10+
//!
11+
//! Each is run nullable and non-nullable.
12+
413
#![expect(clippy::unwrap_used)]
514

15+
use std::fmt::Display;
16+
use std::fmt::Formatter;
17+
618
use divan::Bencher;
719
use rand::RngExt;
820
use rand::SeedableRng;
921
use rand::distr::Uniform;
1022
use rand::prelude::StdRng;
23+
use vortex_array::ArrayRef;
1124
use vortex_array::Canonical;
1225
use vortex_array::IntoArray;
1326
use vortex_array::VortexSessionExecute;
27+
use vortex_array::array_session;
1428
use vortex_array::arrays::BoolArray;
1529
use vortex_array::arrays::InterleaveArray;
1630
use vortex_buffer::Buffer;
@@ -21,18 +35,74 @@ fn main() {
2135

2236
const ARRAY_SIZE: usize = 8_192;
2337

24-
/// Builds `num_branches` boolean value arrays plus random `(array_indices, row_indices)` selectors
25-
/// describing a full random-access gather of `ARRAY_SIZE` output rows.
26-
fn inputs(
27-
num_branches: usize,
38+
/// The access pattern used to generate the `(array_index, row_index)` selectors.
39+
#[derive(Clone, Copy)]
40+
enum Pattern {
41+
/// A merge: `array_index = i % N`, `row_index = i / N`.
42+
RoundRobin,
43+
/// Fully random `(array_index, row_index)` per output row.
44+
Random,
45+
}
46+
47+
impl Display for Pattern {
48+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
49+
f.write_str(match self {
50+
Pattern::RoundRobin => "round_robin",
51+
Pattern::Random => "random",
52+
})
53+
}
54+
}
55+
56+
/// A single benchmark configuration: access pattern, branch count, and nullability.
57+
#[derive(Clone, Copy)]
58+
struct Combo {
59+
pattern: Pattern,
60+
branches: usize,
2861
nullable: bool,
29-
) -> (Vec<vortex_array::ArrayRef>, Buffer<u32>, Buffer<u32>) {
62+
}
63+
64+
impl Display for Combo {
65+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
66+
write!(
67+
f,
68+
"{}/n{}/{}",
69+
self.pattern,
70+
self.branches,
71+
if self.nullable { "null" } else { "nonnull" }
72+
)
73+
}
74+
}
75+
76+
/// The configurations the benchmark covers: 2-child round-robin, 2-child random, and 64-child
77+
/// random — each nullable and non-nullable.
78+
fn combos() -> Vec<Combo> {
79+
let mut out = Vec::new();
80+
for nullable in [false, true] {
81+
for (pattern, branches) in [
82+
(Pattern::RoundRobin, 2),
83+
(Pattern::Random, 2),
84+
(Pattern::Random, 64),
85+
] {
86+
out.push(Combo {
87+
pattern,
88+
branches,
89+
nullable,
90+
});
91+
}
92+
}
93+
out
94+
}
95+
96+
/// Builds the Vortex value arrays and the `u32` selector buffers for a [`Combo`].
97+
///
98+
/// Uses a fixed RNG seed, so a run is deterministic and comparable across revisions.
99+
fn vortex_inputs(combo: Combo) -> (Vec<ArrayRef>, Buffer<u32>, Buffer<u32>) {
30100
let mut rng = StdRng::seed_from_u64(0);
31101
let bit = Uniform::new(0u8, 2).unwrap();
32102

33-
let values = (0..num_branches)
103+
let values = (0..combo.branches)
34104
.map(|_| {
35-
if nullable {
105+
if combo.nullable {
36106
BoolArray::from_iter(
37107
(0..ARRAY_SIZE).map(|_| (rng.sample(bit) == 0).then_some(rng.sample(bit) == 0)),
38108
)
@@ -43,37 +113,27 @@ fn inputs(
43113
})
44114
.collect();
45115

46-
let branch = Uniform::new(0u32, u32::try_from(num_branches).unwrap()).unwrap();
116+
let branch = Uniform::new(0u32, u32::try_from(combo.branches).unwrap()).unwrap();
47117
let row = Uniform::new(0u32, u32::try_from(ARRAY_SIZE).unwrap()).unwrap();
48-
let array_indices: Buffer<u32> = (0..ARRAY_SIZE).map(|_| rng.sample(branch)).collect();
49-
let row_indices: Buffer<u32> = (0..ARRAY_SIZE).map(|_| rng.sample(row)).collect();
50-
(values, array_indices, row_indices)
51-
}
52-
53-
#[divan::bench(args = [2, 4])]
54-
fn interleave_bool(bencher: Bencher, num_branches: usize) {
55-
let (values, array_indices, row_indices) = inputs(num_branches, false);
56-
let session = vortex_array::array_session();
57-
bencher
58-
.with_inputs(|| {
59-
(
60-
InterleaveArray::try_new(
61-
values.clone(),
62-
array_indices.clone().into_array(),
63-
row_indices.clone().into_array(),
64-
)
65-
.unwrap()
66-
.into_array(),
67-
session.create_execution_ctx(),
68-
)
118+
let array_indices: Buffer<u32> = (0..ARRAY_SIZE)
119+
.map(|i| match combo.pattern {
120+
Pattern::Random => rng.sample(branch),
121+
Pattern::RoundRobin => u32::try_from(i % combo.branches).unwrap(),
69122
})
70-
.bench_refs(|(array, ctx)| array.clone().execute::<Canonical>(ctx));
123+
.collect();
124+
let row_indices: Buffer<u32> = (0..ARRAY_SIZE)
125+
.map(|i| match combo.pattern {
126+
Pattern::Random => rng.sample(row),
127+
Pattern::RoundRobin => u32::try_from((i / combo.branches) % ARRAY_SIZE).unwrap(),
128+
})
129+
.collect();
130+
(values, array_indices, row_indices)
71131
}
72132

73-
#[divan::bench(args = [2, 4])]
74-
fn interleave_bool_nullable(bencher: Bencher, num_branches: usize) {
75-
let (values, array_indices, row_indices) = inputs(num_branches, true);
76-
let session = vortex_array::array_session();
133+
#[divan::bench(args = combos())]
134+
fn vortex(bencher: Bencher, combo: Combo) {
135+
let (values, array_indices, row_indices) = vortex_inputs(combo);
136+
let session = array_session();
77137
bencher
78138
.with_inputs(|| {
79139
(

vortex-array/src/arrays/interleave/execute/bool.rs

Lines changed: 64 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ use vortex_buffer::BitBuffer;
88
use vortex_buffer::BitBufferMut;
99
use vortex_error::VortexResult;
1010
use vortex_error::vortex_ensure;
11-
use vortex_mask::Mask;
1211

1312
use super::super::Interleave;
1413
use super::super::InterleaveArrayExt;
@@ -21,62 +20,44 @@ use crate::executor::ExecutionCtx;
2120
use crate::executor::ExecutionResult;
2221
use crate::match_each_unsigned_integer_ptype;
2322
use crate::require_child;
24-
use crate::validity::Validity;
2523

2624
/// Gathers `N` boolean values under unsigned `array_indices` / `row_indices` selectors, scattering
27-
/// each selected bit (and its validity) into the output position it routes to.
25+
/// each selected bit into the output position it routes to.
2826
pub(super) fn execute(
2927
array: Array<Interleave>,
30-
ctx: &mut ExecutionCtx,
28+
_ctx: &mut ExecutionCtx,
3129
) -> VortexResult<ExecutionResult> {
3230
let num_values = array.num_values();
3331

34-
// Drive every value and both selectors to canonical encodings so we can operate on raw bits.
32+
// Drive both selectors and every value to canonical encodings so we can operate on raw bits.
3533
let mut array = array;
34+
array = require_child!(array, array.array_indices(), 0 => Primitive);
35+
array = require_child!(array, array.row_indices(), 1 => Primitive);
3636
for i in 0..num_values {
37-
array = require_child!(array, array.value(i), i => Bool);
37+
array = require_child!(array, array.value(i), i + 2 => Bool);
3838
}
39-
array = require_child!(array, array.array_indices(), num_values => Primitive);
40-
array = require_child!(array, array.row_indices(), num_values + 1 => Primitive);
4139

42-
let dtype = array.as_ref().dtype().clone();
43-
let len = array.as_ref().len();
44-
let nullable = dtype.is_nullable();
45-
46-
// Materialize each value's bits, and its validity mask only when the output can be null.
40+
// Materialize each value's bits; the selectors gather one bit per output below.
4741
let mut value_bits = Vec::with_capacity(num_values);
48-
let mut value_validity = Vec::with_capacity(num_values);
4942
for i in 0..num_values {
50-
let value = array.value(i).as_::<Bool>();
51-
let bits = value.to_bit_buffer();
52-
let validity = nullable
53-
.then(|| value.validity()?.execute_mask(bits.len(), ctx))
54-
.transpose()?;
55-
value_bits.push(bits);
56-
value_validity.push(validity);
43+
value_bits.push(array.value(i).as_::<Bool>().to_bit_buffer());
5744
}
5845

46+
let validity = array.as_ref().validity()?;
47+
5948
// Scatter directly from the typed selector buffers — no intermediate `usize` materialization.
6049
let array_indices = array.array_indices().as_::<Primitive>();
6150
let row_indices = array.row_indices().as_::<Primitive>();
62-
let (values, validity) = match_each_unsigned_integer_ptype!(array_indices.ptype(), |A| {
51+
let values = match_each_unsigned_integer_ptype!(array_indices.ptype(), |A| {
6352
match_each_unsigned_integer_ptype!(row_indices.ptype(), |R| {
6453
gather(
65-
len,
66-
num_values,
6754
&value_bits,
68-
&value_validity,
6955
array_indices.as_slice::<A>(),
7056
row_indices.as_slice::<R>(),
71-
nullable,
7257
)?
7358
})
7459
});
7560

76-
let validity = match validity {
77-
Some(bits) => Validity::from(bits.freeze()),
78-
None => Validity::NonNullable,
79-
};
8061
Ok(ExecutionResult::done(BoolArray::try_new(
8162
values.freeze(),
8263
validity,
@@ -85,43 +66,70 @@ pub(super) fn execute(
8566

8667
/// The scatter, monomorphized on the selector integer widths so each `(array_index, row_index)`
8768
/// pair is read straight from its packed buffer.
88-
///
89-
/// Output bits (and validity) are produced with [`BitBufferMut::collect_bool`], which packs 64
90-
/// results per word: every output bit is written branchlessly, avoiding a per-row `set`/`unset`
91-
/// (each of which would bounds-check and branch on the random bit value).
92-
#[allow(clippy::too_many_arguments)]
9369
fn gather<A: AsPrimitive<usize>, R: AsPrimitive<usize>>(
94-
len: usize,
95-
num_values: usize,
9670
value_bits: &[BitBuffer],
97-
value_validity: &[Option<Mask>],
9871
branches: &[A],
9972
rows: &[R],
100-
nullable: bool,
101-
) -> VortexResult<(BitBufferMut, Option<BitBufferMut>)> {
102-
// Validate the per-row bounds once up front (returning an error rather than panicking), so the
103-
// word-packing passes below are tight branchless loops.
73+
) -> VortexResult<BitBufferMut> {
74+
let len = validate_selectors(value_bits, branches, rows)?;
75+
76+
// SAFETY: `validate_selectors` proved `branches.len() == rows.len() == len`, and for every
77+
// `i < len` that `branches[i] < value_bits.len()` and `rows[i] < value_bits[branches[i]].len()`.
78+
Ok(unsafe { gather_bits(len, value_bits, branches, rows) })
79+
}
80+
81+
/// Validates the per-row selector bounds, returning the output length (`branches.len()`).
82+
///
83+
/// On success, `rows.len() == branches.len() == len` and, for every `i < len`,
84+
/// `branches[i] < value_bits.len()` and `rows[i] < value_bits[branches[i]].len()` — exactly the
85+
/// preconditions of [`gather_bits`]. Errors (rather than panics) on any out-of-bounds selector.
86+
fn validate_selectors<A: AsPrimitive<usize>, R: AsPrimitive<usize>>(
87+
value_bits: &[BitBuffer],
88+
branches: &[A],
89+
rows: &[R],
90+
) -> VortexResult<usize> {
91+
// The two selectors are validated to equal length at construction, which is the output length.
92+
let len = branches.len();
93+
vortex_ensure!(
94+
rows.len() == len,
95+
"interleave selectors differ in length: array_indices {len}, row_indices {}",
96+
rows.len()
97+
);
98+
10499
for i in 0..len {
105100
let branch = branches[i].as_();
106-
vortex_ensure!(branch < num_values, "interleave array index out of bounds");
101+
vortex_ensure!(
102+
branch < value_bits.len(),
103+
"interleave array index out of bounds"
104+
);
107105
vortex_ensure!(
108106
rows[i].as_() < value_bits[branch].len(),
109107
"interleave row index out of bounds"
110108
);
111109
}
112110

113-
let values =
114-
BitBufferMut::collect_bool(len, |i| value_bits[branches[i].as_()].value(rows[i].as_()));
115-
116-
// A missing per-value mask means every row of that value is valid; only materialized when the
117-
// output can be null.
118-
let validity = nullable.then(|| {
119-
BitBufferMut::collect_bool(len, |i| {
120-
value_validity[branches[i].as_()]
121-
.as_ref()
122-
.is_none_or(|mask| mask.value(rows[i].as_()))
123-
})
124-
});
111+
Ok(len)
112+
}
125113

126-
Ok((values, validity))
114+
/// Gathers one bit per output from `bits[branches[i]]` at position `rows[i]`, packing 64 results per
115+
/// word with [`BitBufferMut::collect_bool`].
116+
///
117+
/// Reads are unchecked (the caller validated them); bounds-checked `BitBuffer::value` is slower.
118+
///
119+
/// # Safety
120+
///
121+
/// `branches` and `rows` must both contain at least `len` elements. For every `i < len`,
122+
/// `branches[i] < bits.len()` and `rows[i] < bits[branches[i]].len()`.
123+
unsafe fn gather_bits<A: AsPrimitive<usize>, R: AsPrimitive<usize>>(
124+
len: usize,
125+
bits: &[BitBuffer],
126+
branches: &[A],
127+
rows: &[R],
128+
) -> BitBufferMut {
129+
// SAFETY: `collect_bool` calls this for `i < len`, and the caller guarantees `branches[i]` and
130+
// `rows[i]` are in bounds for `bits` / the selected buffer.
131+
BitBufferMut::collect_bool(len, |i| unsafe {
132+
bits.get_unchecked(branches.get_unchecked(i).as_())
133+
.value_unchecked(rows.get_unchecked(i).as_())
134+
})
127135
}

0 commit comments

Comments
 (0)