Skip to content

Commit 4b089af

Browse files
perf: aggregate min/max (#8061)
Adds a divan benchmark exercising the min/max aggregation over primitive arrays (i32/i64/f64, with and without nulls) so we can measure and inspect the codegen of the max reduction path. Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk> --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk> Co-authored-by: Claude <noreply@anthropic.com>
1 parent d7134a9 commit 4b089af

6 files changed

Lines changed: 381 additions & 35 deletions

File tree

vortex-array/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,14 @@ serde_json = { workspace = true }
9797
serde_test = { workspace = true }
9898
vortex-array = { path = ".", features = ["_test-harness", "table-display"] }
9999

100+
[[bench]]
101+
name = "aggregate_max"
102+
harness = false
103+
104+
[[bench]]
105+
name = "aggregate_sum"
106+
harness = false
107+
100108
[[bench]]
101109
name = "cast_primitive"
102110
harness = false
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::sync::LazyLock;
5+
6+
use divan::Bencher;
7+
use rand::prelude::*;
8+
use vortex_array::IntoArray;
9+
use vortex_array::VortexSessionExecute;
10+
use vortex_array::arrays::PrimitiveArray;
11+
use vortex_array::session::ArraySession;
12+
use vortex_session::VortexSession;
13+
14+
fn main() {
15+
divan::main();
16+
}
17+
18+
const N: usize = 100_000;
19+
20+
static SESSION: LazyLock<VortexSession> =
21+
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
22+
23+
#[divan::bench]
24+
fn max_i32(bencher: Bencher) {
25+
let mut rng = StdRng::seed_from_u64(1);
26+
let data: Vec<i32> = (0..N).map(|_| rng.random::<i32>()).collect();
27+
bencher
28+
.with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
29+
.bench_refs(|a| {
30+
a.statistics()
31+
.compute_max::<i32>(&mut SESSION.create_execution_ctx())
32+
});
33+
}
34+
35+
#[divan::bench]
36+
fn max_i64(bencher: Bencher) {
37+
let mut rng = StdRng::seed_from_u64(2);
38+
let data: Vec<i64> = (0..N).map(|_| rng.random::<i64>()).collect();
39+
bencher
40+
.with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
41+
.bench_refs(|a| {
42+
a.statistics()
43+
.compute_max::<i64>(&mut SESSION.create_execution_ctx())
44+
});
45+
}
46+
47+
#[divan::bench]
48+
fn max_f64(bencher: Bencher) {
49+
let mut rng = StdRng::seed_from_u64(3);
50+
let data: Vec<f64> = (0..N).map(|_| rng.random::<f64>()).collect();
51+
bencher
52+
.with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
53+
.bench_refs(|a| {
54+
a.statistics()
55+
.compute_max::<f64>(&mut SESSION.create_execution_ctx())
56+
});
57+
}
58+
59+
// Clustered nulls: long valid runs broken up by null blocks (run-based path's best case).
60+
#[divan::bench]
61+
fn max_i32_nulls_clustered(bencher: Bencher) {
62+
let mut rng = StdRng::seed_from_u64(4);
63+
let data: Vec<Option<i32>> = (0..N)
64+
.map(|i| {
65+
if (i / 64) % 10 == 0 {
66+
None
67+
} else {
68+
Some(rng.random::<i32>())
69+
}
70+
})
71+
.collect();
72+
bencher
73+
.with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
74+
.bench_refs(|a| {
75+
a.statistics()
76+
.compute_max::<i32>(&mut SESSION.create_execution_ctx())
77+
});
78+
}
79+
80+
// Scattered nulls: ~50% random nulls producing many short runs (run-based path's worst case).
81+
#[divan::bench]
82+
fn max_i32_nulls_scattered(bencher: Bencher) {
83+
let mut rng = StdRng::seed_from_u64(5);
84+
let data: Vec<Option<i32>> = (0..N)
85+
.map(|_| rng.random_bool(0.5).then(|| rng.random::<i32>()))
86+
.collect();
87+
bencher
88+
.with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
89+
.bench_refs(|a| {
90+
a.statistics()
91+
.compute_max::<i32>(&mut SESSION.create_execution_ctx())
92+
});
93+
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::sync::LazyLock;
5+
6+
use divan::Bencher;
7+
use rand::prelude::*;
8+
use vortex_array::IntoArray;
9+
use vortex_array::VortexSessionExecute;
10+
use vortex_array::arrays::PrimitiveArray;
11+
use vortex_array::expr::stats::Stat;
12+
use vortex_array::session::ArraySession;
13+
use vortex_session::VortexSession;
14+
15+
fn main() {
16+
divan::main();
17+
}
18+
19+
const N: usize = 100_000;
20+
21+
static SESSION: LazyLock<VortexSession> =
22+
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
23+
24+
#[divan::bench]
25+
fn sum_i32(bencher: Bencher) {
26+
let mut rng = StdRng::seed_from_u64(1);
27+
let data: Vec<i32> = (0..N).map(|_| rng.random_range(-1000..1000)).collect();
28+
bencher
29+
.with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
30+
.bench_refs(|a| {
31+
a.statistics()
32+
.compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
33+
});
34+
}
35+
36+
#[divan::bench]
37+
fn sum_u32(bencher: Bencher) {
38+
let mut rng = StdRng::seed_from_u64(2);
39+
let data: Vec<u32> = (0..N).map(|_| rng.random_range(0..2000)).collect();
40+
bencher
41+
.with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
42+
.bench_refs(|a| {
43+
a.statistics()
44+
.compute_as::<u64>(Stat::Sum, &mut SESSION.create_execution_ctx())
45+
});
46+
}
47+
48+
#[divan::bench]
49+
fn sum_i64(bencher: Bencher) {
50+
let mut rng = StdRng::seed_from_u64(3);
51+
let data: Vec<i64> = (0..N).map(|_| rng.random_range(-1000..1000)).collect();
52+
bencher
53+
.with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
54+
.bench_refs(|a| {
55+
a.statistics()
56+
.compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
57+
});
58+
}
59+
60+
// Clustered nulls: long runs of valid values broken up by occasional null blocks. This is the
61+
// case the run-based valid path is expected to accelerate.
62+
#[divan::bench]
63+
fn sum_i32_nulls_clustered(bencher: Bencher) {
64+
let mut rng = StdRng::seed_from_u64(4);
65+
let data: Vec<Option<i32>> = (0..N)
66+
.map(|i| {
67+
if (i / 64) % 10 == 0 {
68+
None
69+
} else {
70+
Some(rng.random_range(-1000..1000))
71+
}
72+
})
73+
.collect();
74+
bencher
75+
.with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
76+
.bench_refs(|a| {
77+
a.statistics()
78+
.compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
79+
});
80+
}
81+
82+
// Scattered nulls: ~50% nulls placed at random, producing many short runs. This is the worst case
83+
// for a run-based valid path, used to guard against regressions versus a per-element loop.
84+
#[divan::bench]
85+
fn sum_i32_nulls_scattered(bencher: Bencher) {
86+
let mut rng = StdRng::seed_from_u64(5);
87+
let data: Vec<Option<i32>> = (0..N)
88+
.map(|_| rng.random_bool(0.5).then(|| rng.random_range(-1000..1000)))
89+
.collect();
90+
bencher
91+
.with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
92+
.bench_refs(|a| {
93+
a.statistics()
94+
.compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
95+
});
96+
}

vortex-array/src/aggregate_fn/fns/min_max/mod.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,32 @@ mod tests {
324324
Ok(())
325325
}
326326

327+
#[test]
328+
fn test_prim_min_max_multiple_null_runs() -> VortexResult<()> {
329+
// Several disjoint valid runs separated by nulls exercise the per-run fold; the extrema
330+
// (min 1, max 9) fall in different runs.
331+
let p = PrimitiveArray::from_option_iter([
332+
Some(5i32),
333+
Some(3),
334+
None,
335+
None,
336+
Some(9),
337+
None,
338+
Some(1),
339+
Some(7),
340+
])
341+
.into_array();
342+
let mut ctx = LEGACY_SESSION.create_execution_ctx();
343+
assert_eq!(
344+
min_max(&p, &mut ctx)?,
345+
Some(MinMaxResult {
346+
min: 1.into(),
347+
max: 9.into()
348+
})
349+
);
350+
Ok(())
351+
}
352+
327353
#[test]
328354
fn test_bool_min_max() -> VortexResult<()> {
329355
let mut ctx = LEGACY_SESSION.create_execution_ctx();

vortex-array/src/aggregate_fn/fns/min_max/primitive.rs

Lines changed: 64 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,75 @@ where
4141
.validity()?
4242
.execute_mask(array.as_ref().len(), ctx)?
4343
{
44-
Mask::AllTrue(_) => compute_min_max(array.as_slice::<T>().iter()),
44+
Mask::AllTrue(_) => {
45+
let slice = array.as_slice::<T>();
46+
// Integers have no NaNs, so a plain min/max reduction is correct and, unlike the
47+
// `itertools::minmax_by` + NaN-filter path, autovectorizes to packed min/max.
48+
if T::PTYPE.is_int() {
49+
integer_min_max_raw(slice).map(min_max_result)
50+
} else {
51+
compute_min_max(slice.iter())
52+
}
53+
}
4554
Mask::AllFalse(_) => None,
46-
Mask::Values(v) => compute_min_max(
47-
array
48-
.as_slice::<T>()
49-
.iter()
50-
.zip(v.bit_buffer().iter())
51-
.filter_map(|(v, m)| m.then_some(v)),
52-
),
55+
Mask::Values(v) => {
56+
let slice = array.as_slice::<T>();
57+
// Each `[start, end)` run is fully valid, so integers can reuse the vectorized
58+
// packed min/max per run and fold the run results; floats chain the runs through
59+
// the NaN-filtering reduction.
60+
if T::PTYPE.is_int() {
61+
v.slices()
62+
.iter()
63+
.filter_map(|&(start, end)| integer_min_max_raw(&slice[start..end]))
64+
.reduce(|(amin, amax), (rmin, rmax)| {
65+
(
66+
if rmin.is_lt(amin) { rmin } else { amin },
67+
if rmax.is_gt(amax) { rmax } else { amax },
68+
)
69+
})
70+
.map(min_max_result)
71+
} else {
72+
compute_min_max(
73+
v.slices()
74+
.iter()
75+
.flat_map(|&(start, end)| slice[start..end].iter()),
76+
)
77+
}
78+
}
5379
},
5480
)
5581
}
5682

83+
/// Min/max of an all-valid integer slice as native values. Autovectorizes to packed min/max.
84+
fn integer_min_max_raw<T>(slice: &[T]) -> Option<(T, T)>
85+
where
86+
T: NativePType,
87+
{
88+
let (&first, rest) = slice.split_first()?;
89+
let mut min = first;
90+
let mut max = first;
91+
for &v in rest {
92+
if v.is_lt(min) {
93+
min = v;
94+
}
95+
if v.is_gt(max) {
96+
max = v;
97+
}
98+
}
99+
Some((min, max))
100+
}
101+
102+
fn min_max_result<T>((min, max): (T, T)) -> MinMaxResult
103+
where
104+
T: NativePType,
105+
PValue: From<T>,
106+
{
107+
MinMaxResult {
108+
min: Scalar::primitive(min, NonNullable),
109+
max: Scalar::primitive(max, NonNullable),
110+
}
111+
}
112+
57113
fn compute_min_max<'a, T>(iter: impl Iterator<Item = &'a T>) -> Option<MinMaxResult>
58114
where
59115
T: NativePType,

0 commit comments

Comments
 (0)