Skip to content

Commit 079e0da

Browse files
committed
Add divan benchmark comparing scalar vs AVX2 vs Mojo take kernel
Adds a `take_primitive_simd` benchmark that calls all three gather implementations through identical `fn(&[T], &[u32]) -> Buffer<T>` signatures on raw buffers. No Vortex Array overhead.

Results on AVX2 (65K values, random u32 indices, median):
u32, n=100K: scalar=66.9µs, avx2=46.0µs (1.45x), mojo=44.0µs (1.52x)
u64, n=100K: scalar=67.1µs, avx2=55.6µs (1.21x), mojo=55.4µs (1.21x)

Signed-off-by: Claude <noreply@anthropic.com>
https://claude.ai/code/session_01EVcJZP4ZmfvWRRg2CsgvST
1 parent 3ad4a64 commit 079e0da

7 files changed

Lines changed: 146 additions & 3 deletions

File tree

vortex-array/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ harness = false
165165
name = "take_primitive"
166166
harness = false
167167

168+
[[bench]]
169+
name = "take_primitive_simd"
170+
harness = false
171+
168172
[[bench]]
169173
name = "take_struct"
170174
harness = false
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Raw buffer-level benchmarks for the primitive take (gather) kernel.
5+
//!
6+
//! Compares scalar, AVX2, and Mojo SIMD gather. All three are called through
7+
//! the same `fn(&[T], &[u32]) -> Buffer<T>` Rust signature on raw slices.
8+
//!
9+
//! Run with: `cargo bench -p vortex-array --bench take_primitive_simd`
10+
11+
#![allow(clippy::cast_possible_truncation)]
12+
#![allow(clippy::unwrap_used)]
13+
14+
use divan::Bencher;
15+
use rand::distr::Uniform;
16+
use rand::prelude::*;
17+
use vortex_array::arrays::primitive::{bench_take_avx2, bench_take_mojo, bench_take_scalar};
18+
19+
// Benchmark entry point. The `[[bench]]` entry for this target sets
// `harness = false`, so this file supplies its own `main` and hands control
// to divan's runner.
fn main() {
20+
divan::main();
21+
}
22+
23+
const NUM_INDICES: &[usize] = &[1_000, 10_000, 100_000];
24+
const NUM_VALUES: usize = 65_536;
25+
26+
/// Builds `num_indices` pseudo-random `u32` gather indices.
///
/// Indices are sampled from `Uniform::new(0, NUM_VALUES)`, i.e. the half-open
/// range `[0, NUM_VALUES)`, so each one is a valid position in a values slice
/// of length `NUM_VALUES`. The generator is seeded with a fixed constant, so
/// every call with the same `num_indices` returns the same sequence.
fn make_u32_indices(num_indices: usize) -> Vec<u32> {
27+
// Fixed seed: all benchmark variants gather through identical indices.
let rng = StdRng::seed_from_u64(42);
28+
let range = Uniform::new(0u32, NUM_VALUES as u32).unwrap();
29+
rng.sample_iter(range).take(num_indices).collect()
30+
}
31+
32+
// ---------------------------------------------------------------------------
33+
// u32 values
34+
// ---------------------------------------------------------------------------
35+
36+
/// Benchmarks `bench_take_scalar` on `u32` values.
///
/// Runs once per entry of `NUM_INDICES`; `n` is the number of indices gathered.
/// The values are `0..NUM_VALUES` in order and the indices come from
/// `make_u32_indices(n)`.
#[divan::bench(args = NUM_INDICES, sample_count = 10_000)]
37+
fn gather_u32_scalar(bencher: Bencher, n: usize) {
38+
// Inputs are built once, outside the closure handed to `bencher.bench`.
let values: Vec<u32> = (0..NUM_VALUES as u32).collect();
39+
let indices = make_u32_indices(n);
40+
// `black_box` keeps the gathered buffer from being optimized away.
bencher.bench(|| divan::black_box(bench_take_scalar(&values, &indices)));
41+
}
42+
43+
/// Benchmarks `bench_take_avx2` on `u32` values.
///
/// Uses the same inputs as the scalar variant: values `0..NUM_VALUES` in order
/// and indices from `make_u32_indices(n)`, with `n` taken from `NUM_INDICES`.
#[divan::bench(args = NUM_INDICES, sample_count = 10_000)]
44+
fn gather_u32_avx2(bencher: Bencher, n: usize) {
45+
// Inputs are built once, outside the closure handed to `bencher.bench`.
let values: Vec<u32> = (0..NUM_VALUES as u32).collect();
46+
let indices = make_u32_indices(n);
47+
// `black_box` keeps the gathered buffer from being optimized away.
bencher.bench(|| divan::black_box(bench_take_avx2(&values, &indices)));
48+
}
49+
50+
/// Benchmarks `bench_take_mojo` on `u32` values.
///
/// Uses the same inputs as the scalar variant: values `0..NUM_VALUES` in order
/// and indices from `make_u32_indices(n)`, with `n` taken from `NUM_INDICES`.
#[divan::bench(args = NUM_INDICES, sample_count = 10_000)]
51+
fn gather_u32_mojo(bencher: Bencher, n: usize) {
52+
// Inputs are built once, outside the closure handed to `bencher.bench`.
let values: Vec<u32> = (0..NUM_VALUES as u32).collect();
53+
let indices = make_u32_indices(n);
54+
// `black_box` keeps the gathered buffer from being optimized away.
bencher.bench(|| divan::black_box(bench_take_mojo(&values, &indices)));
55+
}
56+
57+
// ---------------------------------------------------------------------------
58+
// u64 values
59+
// ---------------------------------------------------------------------------
60+
61+
/// Benchmarks `bench_take_scalar` on `u64` values.
///
/// Runs once per entry of `NUM_INDICES`; `n` is the number of indices gathered.
/// The values are `i * 100` for `i` in `0..NUM_VALUES`; the indices are still
/// `u32`, produced by `make_u32_indices(n)`.
#[divan::bench(args = NUM_INDICES, sample_count = 10_000)]
62+
fn gather_u64_scalar(bencher: Bencher, n: usize) {
63+
// Inputs are built once, outside the closure handed to `bencher.bench`.
let values: Vec<u64> = (0..NUM_VALUES as u64).map(|i| i * 100).collect();
64+
let indices = make_u32_indices(n);
65+
// `black_box` keeps the gathered buffer from being optimized away.
bencher.bench(|| divan::black_box(bench_take_scalar(&values, &indices)));
66+
}
67+
68+
/// Benchmarks `bench_take_avx2` on `u64` values.
///
/// Uses the same inputs as the `u64` scalar variant: values `i * 100` for `i`
/// in `0..NUM_VALUES`, and `u32` indices from `make_u32_indices(n)`.
#[divan::bench(args = NUM_INDICES, sample_count = 10_000)]
69+
fn gather_u64_avx2(bencher: Bencher, n: usize) {
70+
// Inputs are built once, outside the closure handed to `bencher.bench`.
let values: Vec<u64> = (0..NUM_VALUES as u64).map(|i| i * 100).collect();
71+
let indices = make_u32_indices(n);
72+
// `black_box` keeps the gathered buffer from being optimized away.
bencher.bench(|| divan::black_box(bench_take_avx2(&values, &indices)));
73+
}
74+
75+
/// Benchmarks `bench_take_mojo` on `u64` values.
///
/// Uses the same inputs as the `u64` scalar variant: values `i * 100` for `i`
/// in `0..NUM_VALUES`, and `u32` indices from `make_u32_indices(n)`.
#[divan::bench(args = NUM_INDICES, sample_count = 10_000)]
76+
fn gather_u64_mojo(bencher: Bencher, n: usize) {
77+
// Inputs are built once, outside the closure handed to `bencher.bench`.
let values: Vec<u64> = (0..NUM_VALUES as u64).map(|i| i * 100).collect();
78+
let indices = make_u32_indices(n);
79+
// `black_box` keeps the gathered buffer from being optimized away.
bencher.bench(|| divan::black_box(bench_take_mojo(&values, &indices)));
80+
}

vortex-array/src/arrays/primitive/compute/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ mod fill_null;
77
mod mask;
88
pub(crate) mod rules;
99
mod slice;
10-
mod take;
10+
pub(crate) mod take;
1111

1212
#[cfg(test)]
1313
mod tests {

vortex-array/src/arrays/primitive/compute/take/avx2.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,10 @@ where
121121
/// The caller must ensure the `avx2` feature is enabled.
122122
#[target_feature(enable = "avx2")]
123123
#[doc(hidden)]
124-
unsafe fn take_avx2<V: NativePType, I: UnsignedPType>(buffer: &[V], indices: &[I]) -> Buffer<V> {
124+
pub(super) unsafe fn take_avx2<V: NativePType, I: UnsignedPType>(
125+
buffer: &[V],
126+
indices: &[I],
127+
) -> Buffer<V> {
125128
macro_rules! dispatch_avx2 {
126129
($indices:ty, $values:ty) => {
127130
{ let result = dispatch_avx2!($indices, $values, cast: $values); result }

vortex-array/src/arrays/primitive/compute/take/mod.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,53 @@ fn take_primitive_scalar<T: NativePType, I: IntegerPType>(
144144
result.freeze()
145145
}
146146

147+
// ---------------------------------------------------------------------------
148+
// Benchmark-visible helpers — expose the raw scalar and Mojo gather kernels
149+
// with identical signatures so benchmarks can compare them directly.
150+
// ---------------------------------------------------------------------------
151+
152+
/// Scalar gather: `result[i] = buffer[indices[i]]`. No SIMD.
///
/// Public, doc-hidden wrapper that forwards both slices unchanged to
/// `take_primitive_scalar` and returns its result, so benchmarks can reach
/// the scalar kernel.
153+
#[doc(hidden)]
154+
pub fn bench_take_scalar<T: NativePType, I: IntegerPType>(
155+
buffer: &[T],
156+
indices: &[I],
157+
) -> Buffer<T> {
158+
take_primitive_scalar(buffer, indices)
159+
}
160+
161+
/// AVX2 gather via hand-written intrinsics. Falls back to scalar on non-x86 or when AVX2
161+
/// is unavailable at runtime.
///
/// Public, doc-hidden wrapper for benchmarks. On x86/x86_64 builds it checks
/// for AVX2 at runtime on every call and, when present, returns the result of
/// `avx2::take_avx2`; otherwise it returns the result of
/// `take_primitive_scalar` for the same inputs.
162+
#[doc(hidden)]
163+
pub fn bench_take_avx2<T: NativePType, I: crate::dtype::UnsignedPType>(
164+
buffer: &[T],
165+
indices: &[I],
166+
) -> Buffer<T> {
167+
// The AVX2 path is only compiled for x86 targets.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
168+
{
169+
if is_x86_feature_detected!("avx2") {
170+
// SAFETY: We just checked AVX2 is available.
171+
return unsafe { avx2::take_avx2(buffer, indices) };
172+
}
173+
}
174+
// Reached on non-x86 targets, or on x86 when AVX2 was not detected.
take_primitive_scalar(buffer, indices)
175+
}
177+
178+
/// SIMD gather via the Mojo AOT kernel. Falls back to scalar when Mojo is not available.
///
/// Public, doc-hidden wrapper for benchmarks. The choice is made at compile
/// time by the `vortex_mojo` cfg: with it set, the call goes to
/// `mojo::take_mojo`; without it, the call goes to `take_primitive_scalar`.
///
/// NOTE: in a build without `vortex_mojo`, any benchmark labelled "mojo" is
/// therefore timing the scalar kernel.
178+
#[doc(hidden)]
179+
pub fn bench_take_mojo<T: NativePType, I: crate::dtype::UnsignedPType>(
180+
buffer: &[T],
181+
indices: &[I],
182+
) -> Buffer<T> {
183+
// Exactly one of the two cfg-gated blocks below is compiled in, and it
// forms the function's tail expression.
#[cfg(vortex_mojo)]
184+
{
185+
mojo::take_mojo(buffer, indices)
186+
}
187+
#[cfg(not(vortex_mojo))]
188+
{
189+
take_primitive_scalar(buffer, indices)
190+
}
191+
}
193+
147194
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
148195
#[cfg(test)]
149196
mod test {

vortex-array/src/arrays/primitive/compute/take/mojo.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,10 @@ impl TakeImpl for TakeKernelMojo {
8080
}
8181

8282
/// Dispatch to the appropriate Mojo kernel based on value byte width and index type.
83-
fn take_mojo<V: NativePType, I: UnsignedPType>(values: &[V], indices: &[I]) -> Buffer<V> {
83+
pub(super) fn take_mojo<V: NativePType, I: UnsignedPType>(
84+
values: &[V],
85+
indices: &[I],
86+
) -> Buffer<V> {
8487
let len = indices.len();
8588
let mut buffer = BufferMut::<V>::with_capacity(len);
8689

vortex-array/src/arrays/primitive/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ pub(crate) mod compute;
1313

1414
mod vtable;
1515
pub use compute::rules::PrimitiveMaskedValidityRule;
16+
#[doc(hidden)]
17+
pub use compute::take::bench_take_avx2;
18+
#[doc(hidden)]
19+
pub use compute::take::bench_take_mojo;
20+
#[doc(hidden)]
21+
pub use compute::take::bench_take_scalar;
1622
pub use vtable::Primitive;
1723

1824
mod native_value;

0 commit comments

Comments
 (0)