perf: aggregate min/max (#8061)

joseph-isaacs · claude · web-flow · commit 4b089afa5a24 · 2026-05-27T11:24:47.000+01:00
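Adds divan benchmarks exercising the min/max aggregation over primitive
arrays (i32/i64/f64, with and without nulls) so we can measure and inspect
the codegen of the max reduction path. A matching sum benchmark is added
alongside it, and integer min/max now uses a plain, autovectorizable
reduction that is applied per valid run when nulls are present.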

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>

---------

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
@@ -97,6 +97,14 @@ serde_json = { workspace = true }
 serde_test = { workspace = true }
 vortex-array = { path = ".", features = ["_test-harness", "table-display"] }
 
+[[bench]]
+name = "aggregate_max"
+harness = false
+
+[[bench]]
+name = "aggregate_sum"
+harness = false
+
 [[bench]]
 name = "cast_primitive"
 harness = false
diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::LazyLock;
+
+use divan::Bencher;
+use rand::prelude::*;
+use vortex_array::IntoArray;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::session::ArraySession;
+use vortex_session::VortexSession;
+
+fn main() {
+    divan::main();
+}
+
+const N: usize = 100_000;
+
+static SESSION: LazyLock<VortexSession> =
+    LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+
+#[divan::bench]
+fn max_i32(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(1);
+    let data: Vec<i32> = (0..N).map(|_| rng.random::<i32>()).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i32>(&mut SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn max_i64(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(2);
+    let data: Vec<i64> = (0..N).map(|_| rng.random::<i64>()).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i64>(&mut SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn max_f64(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(3);
+    let data: Vec<f64> = (0..N).map(|_| rng.random::<f64>()).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<f64>(&mut SESSION.create_execution_ctx())
+        });
+}
+
+// Clustered nulls: long valid runs broken up by null blocks (run-based path's best case).
+#[divan::bench]
+fn max_i32_nulls_clustered(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(4);
+    let data: Vec<Option<i32>> = (0..N)
+        .map(|i| {
+            if (i / 64) % 10 == 0 {
+                None
+            } else {
+                Some(rng.random::<i32>())
+            }
+        })
+        .collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i32>(&mut SESSION.create_execution_ctx())
+        });
+}
+
+// Scattered nulls: ~50% random nulls producing many short runs (run-based path's worst case).
+#[divan::bench]
+fn max_i32_nulls_scattered(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(5);
+    let data: Vec<Option<i32>> = (0..N)
+        .map(|_| rng.random_bool(0.5).then(|| rng.random::<i32>()))
+        .collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i32>(&mut SESSION.create_execution_ctx())
+        });
+}
diff --git a/vortex-array/benches/aggregate_sum.rs b/vortex-array/benches/aggregate_sum.rs
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::LazyLock;
+
+use divan::Bencher;
+use rand::prelude::*;
+use vortex_array::IntoArray;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::expr::stats::Stat;
+use vortex_array::session::ArraySession;
+use vortex_session::VortexSession;
+
+fn main() {
+    divan::main();
+}
+
+const N: usize = 100_000;
+
+static SESSION: LazyLock<VortexSession> =
+    LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+
+#[divan::bench]
+fn sum_i32(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(1);
+    let data: Vec<i32> = (0..N).map(|_| rng.random_range(-1000..1000)).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn sum_u32(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(2);
+    let data: Vec<u32> = (0..N).map(|_| rng.random_range(0..2000)).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<u64>(Stat::Sum, &mut SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn sum_i64(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(3);
+    let data: Vec<i64> = (0..N).map(|_| rng.random_range(-1000..1000)).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
+        });
+}
+
+// Clustered nulls: long runs of valid values broken up by occasional null blocks. This is the
+// case the run-based valid path is expected to accelerate.
+#[divan::bench]
+fn sum_i32_nulls_clustered(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(4);
+    let data: Vec<Option<i32>> = (0..N)
+        .map(|i| {
+            if (i / 64) % 10 == 0 {
+                None
+            } else {
+                Some(rng.random_range(-1000..1000))
+            }
+        })
+        .collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
+        });
+}
+
+// Scattered nulls: ~50% nulls placed at random, producing many short runs. This is the worst case
+// for a run-based valid path, used to guard against regressions versus a per-element loop.
+#[divan::bench]
+fn sum_i32_nulls_scattered(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(5);
+    let data: Vec<Option<i32>> = (0..N)
+        .map(|_| rng.random_bool(0.5).then(|| rng.random_range(-1000..1000)))
+        .collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
+        });
+}
diff --git a/vortex-array/src/aggregate_fn/fns/min_max/mod.rs b/vortex-array/src/aggregate_fn/fns/min_max/mod.rs
@@ -324,6 +324,32 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_prim_min_max_multiple_null_runs() -> VortexResult<()> {
+        // Several disjoint valid runs separated by nulls exercise the per-run fold; the extrema
+        // (min 1, max 9) fall in different runs.
+        let p = PrimitiveArray::from_option_iter([
+            Some(5i32),
+            Some(3),
+            None,
+            None,
+            Some(9),
+            None,
+            Some(1),
+            Some(7),
+        ])
+        .into_array();
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        assert_eq!(
+            min_max(&p, &mut ctx)?,
+            Some(MinMaxResult {
+                min: 1.into(),
+                max: 9.into()
+            })
+        );
+        Ok(())
+    }
+
     #[test]
     fn test_bool_min_max() -> VortexResult<()> {
         let mut ctx = LEGACY_SESSION.create_execution_ctx();
diff --git a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs
@@ -41,19 +41,75 @@ where
             .validity()?
             .execute_mask(array.as_ref().len(), ctx)?
         {
-            Mask::AllTrue(_) => compute_min_max(array.as_slice::<T>().iter()),
+            Mask::AllTrue(_) => {
+                let slice = array.as_slice::<T>();
+                // Integers have no NaNs, so a plain min/max reduction is correct and, unlike the
+                // `itertools::minmax_by` + NaN-filter path, autovectorizes to packed min/max.
+                if T::PTYPE.is_int() {
+                    integer_min_max_raw(slice).map(min_max_result)
+                } else {
+                    compute_min_max(slice.iter())
+                }
+            }
             Mask::AllFalse(_) => None,
-            Mask::Values(v) => compute_min_max(
-                array
-                    .as_slice::<T>()
-                    .iter()
-                    .zip(v.bit_buffer().iter())
-                    .filter_map(|(v, m)| m.then_some(v)),
-            ),
+            Mask::Values(v) => {
+                let slice = array.as_slice::<T>();
+                // Each `[start, end)` run is fully valid, so integers can reuse the vectorized
+                // packed min/max per run and fold the run results; floats chain the runs through
+                // the NaN-filtering reduction.
+                if T::PTYPE.is_int() {
+                    v.slices()
+                        .iter()
+                        .filter_map(|&(start, end)| integer_min_max_raw(&slice[start..end]))
+                        .reduce(|(amin, amax), (rmin, rmax)| {
+                            (
+                                if rmin.is_lt(amin) { rmin } else { amin },
+                                if rmax.is_gt(amax) { rmax } else { amax },
+                            )
+                        })
+                        .map(min_max_result)
+                } else {
+                    compute_min_max(
+                        v.slices()
+                            .iter()
+                            .flat_map(|&(start, end)| slice[start..end].iter()),
+                    )
+                }
+            }
         },
     )
 }
 
+/// Min/max of an all-valid integer slice as native values. Autovectorizes to packed min/max.
+fn integer_min_max_raw<T>(slice: &[T]) -> Option<(T, T)>
+where
+    T: NativePType,
+{
+    let (&first, rest) = slice.split_first()?;
+    let mut min = first;
+    let mut max = first;
+    for &v in rest {
+        if v.is_lt(min) {
+            min = v;
+        }
+        if v.is_gt(max) {
+            max = v;
+        }
+    }
+    Some((min, max))
+}
+
+fn min_max_result<T>((min, max): (T, T)) -> MinMaxResult
+where
+    T: NativePType,
+    PValue: From<T>,
+{
+    MinMaxResult {
+        min: Scalar::primitive(min, NonNullable),
+        max: Scalar::primitive(max, NonNullable),
+    }
+}
+
 fn compute_min_max<'a, T>(iter: impl Iterator<Item = &'a T>) -> Option<MinMaxResult>
 where
     T: NativePType,
diff --git a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs