Skip to content

Commit a41f261

Browse files
committed
vortex-row: convert overflow panics to errors, add reference-byte test
Follow-up to the PR #8253 review pass: - Make the size pass fully fallible: add_size_* now return VortexResult and use checked arithmetic, so an input whose per-row encoding exceeds u32::MAX surfaces a VortexError instead of panicking via vortex_expect. encoded_size_for_non_empty_varlen and encode_non_empty_varlen_body likewise return VortexResult for their byte-total overflow checks. - Drop the #[allow(cast_possible_truncation)] on byte_width_u32; use u32::try_from with an infallible-invariant expect instead of a bare cast. - Add reference_row_bytes_match_spec: encodes the worked-example row from docs/specs/row-encoding.md and asserts the exact encoded bytes, pinning the byte layout and keeping the spec honest. Signed-off-by: Claude <noreply@anthropic.com> https://claude.ai/code/session_019GXtsg21qhpxDVD9ZUpFTx
1 parent b914967 commit a41f261

4 files changed

Lines changed: 156 additions & 51 deletions

File tree

vortex-row/src/codec/encoding.rs

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ pub(super) fn encode_varbinview(
258258
&buffers[r.buffer_index as usize][off..off + len]
259259
};
260260
out[pos] = non_empty_byte;
261-
let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending);
261+
let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending)?;
262262
col_offset[i] += 1 + written;
263263
}
264264
}
@@ -284,7 +284,7 @@ pub(super) fn encode_varbinview(
284284
&buffers[r.buffer_index as usize][off..off + len]
285285
};
286286
out[pos] = non_empty_byte;
287-
let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending);
287+
let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending)?;
288288
col_offset[i] += 1 + written;
289289
}
290290
}
@@ -376,7 +376,7 @@ pub(super) fn encode_fsl(
376376
debug_assert_eq!(elements.len(), nrows * list_size);
377377
let row_body_bytes = w
378378
.checked_mul(list_size_u32)
379-
.vortex_expect("FSL body width overflow");
379+
.ok_or_else(|| vortex_err!("FSL body width overflow"))?;
380380
let mut elem_offsets = vec![0u32; nrows * list_size];
381381
for i in 0..nrows {
382382
let base = row_offsets[i] + col_offset[i];
@@ -389,7 +389,7 @@ pub(super) fn encode_fsl(
389389
for i in 0..nrows {
390390
col_offset[i] = col_offset[i]
391391
.checked_add(row_body_bytes)
392-
.vortex_expect("FSL row body overflow");
392+
.ok_or_else(|| vortex_err!("FSL row body overflow"))?;
393393
}
394394
// Canonical null body for null parent rows: one null encoding per element.
395395
let null_byte = child_canonical_null_byte(&elem_dtype, field);
@@ -427,7 +427,7 @@ pub(super) fn encode_fsl(
427427
scratch_offsets.push(acc);
428428
acc = acc
429429
.checked_add(s)
430-
.vortex_expect("FSL scratch offset overflow");
430+
.ok_or_else(|| vortex_err!("FSL scratch offset overflow"))?;
431431
}
432432
let mut scratch_cursors = vec![0u32; nrows * list_size];
433433
field_encode(
@@ -451,18 +451,18 @@ pub(super) fn encode_fsl(
451451
.copy_from_slice(&scratch[src..src + sz]);
452452
body_bytes = body_bytes
453453
.checked_add(elem_sizes[k])
454-
.vortex_expect("FSL body bytes overflow");
454+
.ok_or_else(|| vortex_err!("FSL body bytes overflow"))?;
455455
}
456456
col_offset[i] = col_offset[i]
457457
.checked_add(body_bytes)
458-
.vortex_expect("FSL row offset overflow");
458+
.ok_or_else(|| vortex_err!("FSL row offset overflow"))?;
459459
} else {
460460
for offset in 0..list_size {
461461
out[dst + offset] = null_byte;
462462
}
463463
col_offset[i] = col_offset[i]
464464
.checked_add(list_size_u32)
465-
.vortex_expect("FSL row offset overflow");
465+
.ok_or_else(|| vortex_err!("FSL row offset overflow"))?;
466466
}
467467
}
468468
}
@@ -498,7 +498,7 @@ fn encode_variable_child(
498498
scratch_offsets.push(acc);
499499
acc = acc
500500
.checked_add(s)
501-
.vortex_expect("child scratch offset overflow");
501+
.ok_or_else(|| vortex_err!("child scratch offset overflow"))?;
502502
}
503503
let mut scratch_cursors = vec![0u32; n];
504504
field_encode(
@@ -519,12 +519,12 @@ fn encode_variable_child(
519519
out[dst..dst + sz].copy_from_slice(&scratch[src..src + sz]);
520520
col_offset[i] = col_offset[i]
521521
.checked_add(child_sizes[i])
522-
.vortex_expect("col_offset overflow");
522+
.ok_or_else(|| vortex_err!("col_offset overflow"))?;
523523
} else {
524524
out[dst] = null_byte;
525525
col_offset[i] = col_offset[i]
526526
.checked_add(1)
527-
.vortex_expect("col_offset overflow");
527+
.ok_or_else(|| vortex_err!("col_offset overflow"))?;
528528
}
529529
}
530530
Ok(())
@@ -599,7 +599,11 @@ fn encode_primitive_arith_typed<T: NativePType + RowEncode>(
599599
/// For the ascending path the hot loop is a `copy_nonoverlapping` of 32 bytes per block
600600
/// plus one stamped continuation byte. For the descending path it reads a u64 at a time and
601601
/// XORs with `0xFF`, giving LLVM a vectorizable inner loop.
602-
fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
602+
fn encode_non_empty_varlen_body(
603+
bytes: &[u8],
604+
out: &mut [u8],
605+
descending: bool,
606+
) -> VortexResult<u32> {
603607
debug_assert!(!bytes.is_empty());
604608
let len = bytes.len();
605609
let full_blocks = len / VARLEN_BLOCK_SIZE;
@@ -612,6 +616,10 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool)
612616
(full_blocks, partial)
613617
};
614618
let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL;
619+
// The caller reserved this slot from `encoded_size_for_non_empty_varlen`, which already
620+
// verified the byte total fits `u32`; re-check here so the conversion never panics.
621+
let total_u32 =
622+
u32::try_from(total).map_err(|_| vortex_err!("encoded varlen size overflows u32"))?;
615623
debug_assert!(out.len() >= total);
616624
// The final block's continuation byte encodes its content length (1..=32).
617625
let len_byte =
@@ -662,7 +670,7 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool)
662670
*dst.add(VARLEN_BLOCK_SIZE) = len_byte ^ 0xFF;
663671
}
664672
}
665-
u32::try_from(total).vortex_expect("encoded varlen byte length fits u32")
673+
Ok(total_u32)
666674
}
667675

668676
/// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the

vortex-row/src/codec/mod.rs

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,22 @@ pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1;
6969
///
7070
/// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1
7171
/// continuation/length byte). Callers must use [`VARLEN_NULL_SIZE`] for null values and
72-
/// [`VARLEN_EMPTY_SIZE`] for empty values. A `u32` always suffices because a `BinaryView`
73-
/// length is itself a `u32`, so `blocks <= ceil(u32::MAX / 32) < 2^27`.
72+
/// [`VARLEN_EMPTY_SIZE`] for empty values.
73+
///
74+
/// # Errors
75+
///
76+
/// Returns an error if the encoded length overflows `u32`. The block count itself always fits
77+
/// (a `BinaryView` length is a `u32`, so `blocks <= ceil(u32::MAX / 32) < 2^27`), but the
78+
/// `blocks * 33 + 1` byte total can exceed `u32::MAX` for multi-gigabyte values.
7479
#[inline]
75-
fn encoded_size_for_non_empty_varlen(len: usize) -> u32 {
80+
fn encoded_size_for_non_empty_varlen(len: usize) -> VortexResult<u32> {
7681
debug_assert!(len > 0);
7782
let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE))
7883
.vortex_expect("varlen block count must fit in u32");
79-
1 + blocks * VARLEN_BLOCK_TOTAL_U32
84+
blocks
85+
.checked_mul(VARLEN_BLOCK_TOTAL_U32)
86+
.and_then(|b| b.checked_add(1))
87+
.ok_or_else(|| vortex_err!("varlen encoded size overflows u32"))
8088
}
8189

8290
/// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel).
@@ -85,14 +93,10 @@ const fn encoded_size_for_fixed(value_bytes: u32) -> u32 {
8593
1 + value_bytes
8694
}
8795

88-
/// A native byte width (at most 32 for `i256`) always fits in a `u32`, so a plain cast is fine.
96+
/// A native byte width (at most 32 for `i256`) always fits in a `u32`.
8997
#[inline]
90-
#[allow(
91-
clippy::cast_possible_truncation,
92-
reason = "native byte widths are at most 32"
93-
)]
9498
fn byte_width_u32(width: usize) -> u32 {
95-
width as u32
99+
u32::try_from(width).vortex_expect("native byte width must fit in u32")
96100
}
97101

98102
/// Pre-resolved per-row validity for the row encoders.
@@ -279,10 +283,10 @@ pub(crate) fn field_size(
279283
ctx: &mut ExecutionCtx,
280284
) -> VortexResult<()> {
281285
match canonical {
282-
Canonical::Null(arr) => add_size_null(arr, sizes),
283-
Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1)),
284-
Canonical::Primitive(arr) => add_size_primitive(arr, sizes),
285-
Canonical::Decimal(arr) => add_size_decimal(arr, sizes),
286+
Canonical::Null(arr) => add_size_null(arr, sizes)?,
287+
Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1))?,
288+
Canonical::Primitive(arr) => add_size_primitive(arr, sizes)?,
289+
Canonical::Decimal(arr) => add_size_decimal(arr, sizes)?,
286290
Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
287291
Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?,
288292
Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?,

vortex-row/src/codec/sizing.rs

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,40 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
//! Size pass leaf kernels: per-row byte-size accumulation for each canonical variant.
5+
//!
6+
//! Every accumulator returns [`VortexResult`] and uses checked arithmetic, so an input whose
7+
//! per-row encoding would exceed `u32::MAX` bytes surfaces a [`VortexError`](vortex_error::VortexError)
8+
//! instead of overflowing or panicking.
59
610
use super::*;
711

8-
pub(super) fn add_size_const(sizes: &mut [u32], add: u32) {
9-
sizes.iter_mut().for_each(|s| *s += add);
12+
pub(super) fn add_size_const(sizes: &mut [u32], add: u32) -> VortexResult<()> {
13+
for s in sizes.iter_mut() {
14+
*s = s
15+
.checked_add(add)
16+
.ok_or_else(|| vortex_err!("per-row size overflow"))?;
17+
}
18+
Ok(())
1019
}
1120

12-
pub(super) fn add_size_null(arr: &NullArray, sizes: &mut [u32]) {
21+
pub(super) fn add_size_null(arr: &NullArray, sizes: &mut [u32]) -> VortexResult<()> {
1322
debug_assert_eq!(arr.len(), sizes.len());
1423
// Just a sentinel byte per row.
15-
sizes.iter_mut().for_each(|s| *s += 1);
24+
add_size_const(sizes, 1)
1625
}
1726

18-
pub(super) fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) {
27+
pub(super) fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) -> VortexResult<()> {
1928
let width = byte_width_u32(arr.ptype().byte_width());
20-
add_size_const(sizes, encoded_size_for_fixed(width));
29+
add_size_const(sizes, encoded_size_for_fixed(width))
2130
}
2231

23-
pub(super) fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) {
32+
pub(super) fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) -> VortexResult<()> {
2433
// Size from the precision-minimal type, not the physical `values_type`, so the size pass
2534
// agrees with `row_width_for_dtype` (and the encode pass) regardless of how the producer
2635
// stored the values. See `narrow_decimal_to_smallest`.
2736
let vt = DecimalType::smallest_decimal_value_type(&arr.decimal_dtype());
2837
let width = byte_width_u32(vt.byte_width());
29-
add_size_const(sizes, encoded_size_for_fixed(width));
38+
add_size_const(sizes, encoded_size_for_fixed(width))
3039
}
3140

3241
pub(super) fn add_size_varbinview(
@@ -41,11 +50,11 @@ pub(super) fn add_size_varbinview(
4150
let contribution = if view.is_empty() {
4251
VARLEN_EMPTY_SIZE
4352
} else {
44-
encoded_size_for_non_empty_varlen(view.len() as usize)
53+
encoded_size_for_non_empty_varlen(view.len() as usize)?
4554
};
4655
sizes[i] = sizes[i]
4756
.checked_add(contribution)
48-
.vortex_expect("per-row size overflow");
57+
.ok_or_else(|| vortex_err!("per-row size overflow"))?;
4958
}
5059
}
5160
ValidityKind::Mask(mask) => {
@@ -55,11 +64,11 @@ pub(super) fn add_size_varbinview(
5564
} else if view.is_empty() {
5665
VARLEN_EMPTY_SIZE
5766
} else {
58-
encoded_size_for_non_empty_varlen(view.len() as usize)
67+
encoded_size_for_non_empty_varlen(view.len() as usize)?
5968
};
6069
sizes[i] = sizes[i]
6170
.checked_add(contribution)
62-
.vortex_expect("per-row size overflow");
71+
.ok_or_else(|| vortex_err!("per-row size overflow"))?;
6372
}
6473
}
6574
}
@@ -75,16 +84,14 @@ pub(super) fn add_size_struct(
7584
let n = arr.len();
7685
let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
7786
// Outer sentinel: 1 byte per row.
78-
sizes
79-
.iter_mut()
80-
.for_each(|s| *s = s.checked_add(1).vortex_expect("per-row size overflow"));
87+
add_size_const(sizes, 1)?;
8188
// Each child contributes its per-row size when the parent is non-null, and a canonical
8289
// null contribution when the parent is null. For fixed-width children both are equal,
8390
// so we can simply add the fixed width to every row. For variable-width children the
8491
// null contribution collapses to 1 byte, ensuring null parent rows have a constant body.
8592
for child in arr.iter_unmasked_fields() {
8693
match row_width_for_dtype(child.dtype())? {
87-
RowWidth::Fixed(w) => add_size_const(sizes, w),
94+
RowWidth::Fixed(w) => add_size_const(sizes, w)?,
8895
RowWidth::Variable => {
8996
let canonical = child.clone().execute::<Canonical>(ctx)?;
9097
let mut child_sizes = vec![0u32; n];
@@ -93,7 +100,7 @@ pub(super) fn add_size_struct(
93100
let contribution = if mask.value(i) { child_sizes[i] } else { 1u32 };
94101
sizes[i] = sizes[i]
95102
.checked_add(contribution)
96-
.vortex_expect("per-row size overflow");
103+
.ok_or_else(|| vortex_err!("per-row size overflow"))?;
97104
}
98105
}
99106
}
@@ -116,16 +123,14 @@ pub(super) fn add_size_fsl(
116123
let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
117124
let elem_dtype = arr.elements().dtype();
118125
// Outer sentinel: 1 byte per row.
119-
sizes
120-
.iter_mut()
121-
.for_each(|s| *s = s.checked_add(1).vortex_expect("per-row size overflow"));
126+
add_size_const(sizes, 1)?;
122127
match row_width_for_dtype(elem_dtype)? {
123128
RowWidth::Fixed(w) => {
124129
// Each row has `list_size` fixed-width elements regardless of null parent mask.
125130
let body = w
126131
.checked_mul(list_size_u32)
127-
.vortex_expect("FSL body width overflow");
128-
add_size_const(sizes, body);
132+
.ok_or_else(|| vortex_err!("FSL body width overflow"))?;
133+
add_size_const(sizes, body)?;
129134
}
130135
RowWidth::Variable => {
131136
let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
@@ -139,7 +144,7 @@ pub(super) fn add_size_fsl(
139144
for j in 0..list_size {
140145
sum = sum
141146
.checked_add(elem_sizes[base + j])
142-
.vortex_expect("FSL row body overflow");
147+
.ok_or_else(|| vortex_err!("FSL row body overflow"))?;
143148
}
144149
sum
145150
} else {
@@ -149,7 +154,7 @@ pub(super) fn add_size_fsl(
149154
};
150155
sizes[i] = sizes[i]
151156
.checked_add(body)
152-
.vortex_expect("FSL per-row size overflow");
157+
.ok_or_else(|| vortex_err!("FSL per-row size overflow"))?;
153158
}
154159
}
155160
}

0 commit comments

Comments
 (0)