Skip to content

Commit f0b8fb9

Browse files
perf[buffer]: iteration for fallible operations with validity (#8120)
Currently use (and arrow) handle fallible operations with scalar (non-SIMD) code. This PR add a trait and methods to have fast SIMD checked operations (includes cast) but verified else where that `checked_add` benefits --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent 9f494a1 commit f0b8fb9

15 files changed

Lines changed: 1729 additions & 66 deletions

File tree

Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ members = [
44
"vortex-array-macros",
55
"vortex-error",
66
"vortex-buffer",
7+
"vortex-compute",
78
"vortex-mask",
89
"vortex-utils",
910
"vortex-session",
@@ -282,6 +283,7 @@ vortex-btrblocks = { version = "0.1.0", path = "./vortex-btrblocks", default-fea
282283
vortex-buffer = { version = "0.1.0", path = "./vortex-buffer", default-features = false }
283284
vortex-bytebool = { version = "0.1.0", path = "./encodings/bytebool", default-features = false }
284285
vortex-compressor = { version = "0.1.0", path = "./vortex-compressor", default-features = false }
286+
vortex-compute = { version = "0.1.0", path = "./vortex-compute", default-features = false }
285287
vortex-datafusion = { version = "0.1.0", path = "./vortex-datafusion", default-features = false }
286288
vortex-datetime-parts = { version = "0.1.0", path = "./encodings/datetime-parts", default-features = false }
287289
vortex-decimal-byte-parts = { version = "0.1.0", path = "encodings/decimal-byte-parts", default-features = false }

vortex-array/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ tracing = { workspace = true }
7070
uuid = { workspace = true }
7171
vortex-array-macros = { workspace = true }
7272
vortex-buffer = { workspace = true, features = ["arrow"] }
73+
vortex-compute = { workspace = true }
7374
vortex-error = { workspace = true, features = ["flatbuffers"] }
7475
vortex-flatbuffers = { workspace = true, features = ["array", "dtype"] }
7576
vortex-mask = { workspace = true }

vortex-array/benches/cast_primitive.rs

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
#![expect(clippy::unwrap_used)]
5+
46
use std::sync::LazyLock;
57

68
use divan::Bencher;
@@ -20,20 +22,18 @@ fn main() {
2022
divan::main();
2123
}
2224

23-
const N: usize = 100_000;
25+
// Sizes used for the fallible-path benches below. Kept small enough to fit in L2 so
26+
// the kernel cost shows up clearly rather than being hidden by DRAM bandwidth.
27+
const SIZES: &[usize] = &[65_536];
2428

2529
static SESSION: LazyLock<VortexSession> = LazyLock::new(vortex_array::array_session);
2630

27-
#[divan::bench]
28-
fn cast_u16_to_u32(bencher: Bencher) {
31+
#[divan::bench(args = SIZES)]
32+
fn cast_u16_to_u32(bencher: Bencher, n: usize) {
2933
let mut rng = StdRng::seed_from_u64(42);
30-
#[expect(clippy::cast_possible_truncation)]
31-
let arr = PrimitiveArray::from_option_iter((0..N).map(|i| {
32-
if rng.random_bool(0.5) {
33-
None
34-
} else {
35-
Some(i as u16)
36-
}
34+
let arr = PrimitiveArray::from_option_iter((0..n).map(|i| {
35+
#[expect(clippy::cast_possible_truncation)]
36+
rng.random_bool(0.5).then_some(i as u16)
3737
}))
3838
.into_array();
3939
// Pre-compute min/max so values_fit_in is a cache hit during the benchmark.
@@ -43,7 +43,43 @@ fn cast_u16_to_u32(bencher: Bencher) {
4343
bencher
4444
.with_inputs(|| (arr.clone(), SESSION.create_execution_ctx()))
4545
.bench_refs(|(a, ctx)| {
46-
#[expect(clippy::unwrap_used)]
46+
a.cast(DType::Primitive(PType::U32, Nullability::Nullable))
47+
.unwrap()
48+
.execute::<Canonical>(ctx)
49+
});
50+
}
51+
52+
/// Narrowing fallible cast that goes through `try_map_with_mask`. Inputs are bounded
53+
/// so every value fits, isolating the kernel's per-lane checked-cast overhead.
54+
#[divan::bench(args = SIZES)]
55+
fn cast_u32_to_u8(bencher: Bencher, n: usize) {
56+
let mut rng = StdRng::seed_from_u64(42);
57+
let arr = PrimitiveArray::from_option_iter((0..n).map(|_| {
58+
rng.random_bool(0.7)
59+
.then(|| rng.random_range(0..u8::MAX) as u32)
60+
}))
61+
.into_array();
62+
bencher
63+
.with_inputs(|| (arr.clone(), SESSION.create_execution_ctx()))
64+
.bench_refs(|(a, ctx)| {
65+
a.cast(DType::Primitive(PType::U8, Nullability::Nullable))
66+
.unwrap()
67+
.execute::<Canonical>(ctx)
68+
});
69+
}
70+
71+
/// Sign-change cast i32 → u32. Values are non-negative so the kernel succeeds
72+
/// but still pays the per-lane `try_from` check.
73+
#[divan::bench(args = SIZES)]
74+
fn cast_i32_to_u32(bencher: Bencher, n: usize) {
75+
let mut rng = StdRng::seed_from_u64(42);
76+
let arr = PrimitiveArray::from_option_iter(
77+
(0..n).map(|_| rng.random_bool(0.7).then(|| rng.random_range(0..i32::MAX))),
78+
)
79+
.into_array();
80+
bencher
81+
.with_inputs(|| (arr.clone(), SESSION.create_execution_ctx()))
82+
.bench_refs(|(a, ctx)| {
4783
a.cast(DType::Primitive(PType::U32, Nullability::Nullable))
4884
.unwrap()
4985
.execute::<Canonical>(ctx)

vortex-array/src/arrays/primitive/array/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,9 @@ impl PrimitiveData {
596596
}
597597

598598
/// Try to extract a mutable buffer from the PrimitiveData with zero copy.
599+
///
600+
/// # Panic
601+
/// If the buffer is not of type T this will panic
599602
pub fn try_into_buffer_mut<T: NativePType>(self) -> Result<BufferMut<T>, Buffer<T>> {
600603
if T::PTYPE != self.ptype() {
601604
vortex_panic!(

vortex-array/src/arrays/primitive/compute/cast.rs

Lines changed: 85 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ use num_traits::AsPrimitive;
55
use num_traits::NumCast;
66
use vortex_buffer::Buffer;
77
use vortex_buffer::BufferMut;
8+
use vortex_compute::lane_kernels::IndexedSinkExt;
9+
use vortex_compute::lane_kernels::IndexedSourceExt;
10+
use vortex_compute::lane_kernels::ReinterpretSink;
811
use vortex_error::VortexResult;
912
use vortex_error::vortex_bail;
1013
use vortex_error::vortex_err;
@@ -102,9 +105,7 @@ impl CastKernel for Primitive {
102105
}
103106
}
104107

105-
/// Cast values from `F` to `T`. For infallible casts this is a pure pass; for fallible casts
106-
/// each valid value goes through a checked `NumCast::from` and the kernel bails if any of them
107-
/// overflow `T`. Invalid positions use the wrapping `as` cast since their values are masked out.
108+
/// Cast Primitive values from `F` to `T`.
108109
fn cast_values<F, T>(
109110
array: ArrayView<'_, Primitive>,
110111
new_validity: Validity,
@@ -114,53 +115,99 @@ where
114115
F: NativePType + AsPrimitive<T>,
115116
T: NativePType,
116117
{
117-
let values = array.as_slice::<F>();
118-
119-
// Fast path: statically infallible, or cached min/max prove every valid value fits in `T`.
120-
// The cached check never triggers a stats computation — if the bounds aren't already known
121-
// we fall through to the per-lane loop below.
122-
if values_always_fit(F::PTYPE, T::PTYPE) || values_fit_in(array, T::PTYPE, ctx, false) {
123-
return Ok(PrimitiveArray::new(cast::<F, T>(values), new_validity).into_array());
124-
}
125-
126-
// TODO(joe): if the values source and target have the same bit-width we can
127-
// mutate in place.
128-
129-
// Fallible: invalid lanes are pre-multiplied to zero so the checked cast always succeeds for
130-
// them; valid lanes go through `NumCast::from` and the whole cast bails on the first overflow.
131-
let mask = array.validity()?.execute_mask(array.len(), ctx)?;
132118
let overflow = || {
133119
vortex_err!(
134120
Compute: "Cannot cast {} to {} — value exceeds target range",
135121
F::PTYPE, T::PTYPE,
136122
)
137123
};
138-
let buffer: Buffer<T> = match &mask {
139-
Mask::AllTrue(_) => BufferMut::try_from_trusted_len_iter(
124+
125+
// Returns `true` if every value of `from` is representable in `to` without loss.
126+
fn casts_losslessly_to(from: PType, to: PType) -> bool {
127+
from.least_supertype(to) == Some(to)
128+
}
129+
130+
// Skip the fallible kernel when type widening or (cached) min/max prove every value fits.
131+
let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable);
132+
let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE)
133+
|| cached_values_fit_in(array, &target_dtype).unwrap_or(false);
134+
135+
let len = array.len();
136+
137+
// If F and T have the same byte width, try to take unique ownership of the buffer.
138+
let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width();
139+
let owned: Option<BufferMut<F>> = same_bit_width
140+
.then(|| array.into_owned().try_into_buffer_mut::<F>().ok())
141+
.flatten();
142+
let values: &[F] = array.as_slice::<F>();
143+
144+
if infallible {
145+
return match owned {
146+
Some(mut buf) => {
147+
ReinterpretSink::<F, T>::new(buf.as_mut_slice()).map_into_in_place(|v: F| v.as_());
148+
// SAFETY: same size + alignment for NativePType
149+
let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
150+
Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array())
151+
}
152+
None => {
153+
let mut buffer = BufferMut::<T>::with_capacity(len);
154+
values.map_into(&mut buffer.spare_capacity_mut()[..len], |v| v.as_());
155+
// SAFETY: map_into initializes every lane.
156+
unsafe { buffer.set_len(len) };
157+
Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array())
158+
}
159+
};
160+
}
161+
162+
let mask = array.validity()?.execute_mask(len, ctx)?;
163+
164+
let buffer: Buffer<T> = match (&mask, owned) {
165+
(Mask::AllTrue(_), Some(mut buf)) => {
166+
ReinterpretSink::<F, T>::new(buf.as_mut_slice())
167+
.try_map_in_place(|v: F| <T as NumCast>::from(v))
168+
.map_err(|_| overflow())?;
169+
// SAFETY: same size + alignment for NativePType
170+
let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
171+
result.freeze()
172+
}
173+
(Mask::AllTrue(_), None) => {
174+
let mut buffer = BufferMut::<T>::with_capacity(len);
175+
values
176+
.try_map_into(&mut buffer.spare_capacity_mut()[..len], |v| {
177+
<T as NumCast>::from(v)
178+
})
179+
.map_err(|_| overflow())?;
180+
// SAFETY: initialized every lane.
181+
unsafe { buffer.set_len(len) };
182+
buffer.freeze()
183+
}
184+
(Mask::AllFalse(_), _) => BufferMut::<T>::zeroed(len).freeze(),
185+
(Mask::Values(m), Some(mut buf)) => {
186+
ReinterpretSink::<F, T>::new(buf.as_mut_slice())
187+
.try_map_masked_in_place(m.bit_buffer(), |v: F| <T as NumCast>::from(v))
188+
.map_err(|_| overflow())?;
189+
// SAFETY: same size + alignment for NativePType
190+
let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
191+
result.freeze()
192+
}
193+
(Mask::Values(m), None) => {
194+
let mut buffer = BufferMut::<T>::with_capacity(len);
140195
values
141-
.iter()
142-
.map(|&v| <T as NumCast>::from(v).ok_or_else(overflow)),
143-
)?
144-
.freeze(),
145-
Mask::AllFalse(_) => BufferMut::<T>::zeroed(values.len()).freeze(),
146-
Mask::Values(m) => BufferMut::try_from_trusted_len_iter(
147-
values.iter().zip(m.bit_buffer().iter()).map(|(&v, valid)| {
148-
let factor = if valid { F::one() } else { F::zero() };
149-
<T as NumCast>::from(v * factor).ok_or_else(overflow)
150-
}),
151-
)?
152-
.freeze(),
196+
.try_map_masked_into(
197+
m.bit_buffer(),
198+
&mut buffer.spare_capacity_mut()[..len],
199+
|v| <T as NumCast>::from(v),
200+
)
201+
.map_err(|_| overflow())?;
202+
// SAFETY: initialized every lane.
203+
unsafe { buffer.set_len(len) };
204+
buffer.freeze()
205+
}
153206
};
154207

155208
Ok(PrimitiveArray::new(buffer, new_validity).into_array())
156209
}
157210

158-
/// Out-of-range values at invalid positions are truncated/wrapped by `as`, which is fine because
159-
/// they are masked out by validity.
160-
fn cast<F: NativePType + AsPrimitive<T>, T: NativePType>(array: &[F]) -> Buffer<T> {
161-
BufferMut::from_trusted_len_iter(array.iter().map(|&src| src.as_())).freeze()
162-
}
163-
164211
fn reinterpret(
165212
array: ArrayView<'_, Primitive>,
166213
new_ptype: PType,
@@ -178,23 +225,6 @@ fn reinterpret(
178225
.into_array()
179226
}
180227

181-
/// Returns `true` if every value of `src` is guaranteed representable in `target` without
182-
/// overflow. Precision may be lost (e.g. large integers cast to `f32`), but the cast can never
183-
/// produce an out-of-range result.
184-
fn values_always_fit(src: PType, target: PType) -> bool {
185-
if src == target {
186-
return true;
187-
}
188-
if src.is_int() && target.is_int() {
189-
return target.byte_width() > src.byte_width()
190-
&& (src.is_unsigned_int() || target.is_signed_int());
191-
}
192-
if src.is_float() && target.is_float() {
193-
return target.byte_width() > src.byte_width();
194-
}
195-
src.is_int() && matches!(target, PType::F32 | PType::F64)
196-
}
197-
198228
/// Returns `true` if all valid values in `array` are representable as `target_ptype`.
199229
///
200230
/// Cached min/max statistics are consulted first. If either bound is missing, the function either

vortex-buffer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ workspace = true
3939
[dev-dependencies]
4040
divan = { workspace = true }
4141
num-traits = { workspace = true }
42+
rand = { workspace = true }
4243
rstest = { workspace = true }
4344

4445
[[bench]]

vortex-compute/Cargo.toml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
[package]
2+
name = "vortex-compute"
3+
authors = { workspace = true }
4+
categories = { workspace = true }
5+
description = "Lane-level compute kernels for Vortex buffers"
6+
edition = { workspace = true }
7+
homepage = { workspace = true }
8+
include = { workspace = true }
9+
keywords = { workspace = true }
10+
license = { workspace = true }
11+
readme = { workspace = true }
12+
repository = { workspace = true }
13+
rust-version = { workspace = true }
14+
version = { workspace = true }
15+
16+
[package.metadata.docs.rs]
17+
all-features = true
18+
19+
[dependencies]
20+
vortex-buffer = { workspace = true }
21+
22+
[dev-dependencies]
23+
arrow-arith = { workspace = true }
24+
arrow-array = { workspace = true }
25+
arrow-buffer = { workspace = true }
26+
arrow-cast = { workspace = true }
27+
arrow-schema = { workspace = true }
28+
divan = { workspace = true }
29+
num-traits = { workspace = true }
30+
rand = { workspace = true }
31+
32+
[lints]
33+
workspace = true
34+
35+
[[bench]]
36+
name = "lane_kernels"
37+
harness = false

0 commit comments

Comments
 (0)