vortex-row: convert overflow panics to errors, add reference-byte test

claude · claude · commit a41f261355e7 · 2026-06-05T17:38:12.000Z
Follow-up to the PR #8253 review pass:

- Make the size pass fully fallible: add_size_* now return VortexResult and use checked arithmetic, so an input whose per-row encoding exceeds u32::MAX bytes surfaces a VortexError instead of panicking via vortex_expect. encoded_size_for_non_empty_varlen and encode_non_empty_varlen_body likewise return VortexResult for their byte-total overflow checks.
- Drop the #[allow(clippy::cast_possible_truncation)] on byte_width_u32; use u32::try_from with an expect that documents the invariant (native byte widths are at most 32) instead of a bare cast.
- Add reference_row_bytes_match_spec: encodes the worked-example row from docs/specs/row-encoding.md and asserts the exact encoded bytes, pinning the byte layout and keeping the spec honest.

Signed-off-by: Claude <noreply@anthropic.com>
https://claude.ai/code/session_019GXtsg21qhpxDVD9ZUpFTx
diff --git a/vortex-row/src/codec/encoding.rs b/vortex-row/src/codec/encoding.rs
@@ -258,7 +258,7 @@ pub(super) fn encode_varbinview(
                     &buffers[r.buffer_index as usize][off..off + len]
                 };
                 out[pos] = non_empty_byte;
-                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending);
+                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending)?;
                 col_offset[i] += 1 + written;
             }
         }
@@ -284,7 +284,7 @@ pub(super) fn encode_varbinview(
                     &buffers[r.buffer_index as usize][off..off + len]
                 };
                 out[pos] = non_empty_byte;
-                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending);
+                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending)?;
                 col_offset[i] += 1 + written;
             }
         }
@@ -376,7 +376,7 @@ pub(super) fn encode_fsl(
             debug_assert_eq!(elements.len(), nrows * list_size);
             let row_body_bytes = w
                 .checked_mul(list_size_u32)
-                .vortex_expect("FSL body width overflow");
+                .ok_or_else(|| vortex_err!("FSL body width overflow"))?;
             let mut elem_offsets = vec![0u32; nrows * list_size];
             for i in 0..nrows {
                 let base = row_offsets[i] + col_offset[i];
@@ -389,7 +389,7 @@ pub(super) fn encode_fsl(
             for i in 0..nrows {
                 col_offset[i] = col_offset[i]
                     .checked_add(row_body_bytes)
-                    .vortex_expect("FSL row body overflow");
+                    .ok_or_else(|| vortex_err!("FSL row body overflow"))?;
             }
             // Canonical null body for null parent rows: one null encoding per element.
             let null_byte = child_canonical_null_byte(&elem_dtype, field);
@@ -427,7 +427,7 @@ pub(super) fn encode_fsl(
                 scratch_offsets.push(acc);
                 acc = acc
                     .checked_add(s)
-                    .vortex_expect("FSL scratch offset overflow");
+                    .ok_or_else(|| vortex_err!("FSL scratch offset overflow"))?;
             }
             let mut scratch_cursors = vec![0u32; nrows * list_size];
             field_encode(
@@ -451,18 +451,18 @@ pub(super) fn encode_fsl(
                             .copy_from_slice(&scratch[src..src + sz]);
                         body_bytes = body_bytes
                             .checked_add(elem_sizes[k])
-                            .vortex_expect("FSL body bytes overflow");
+                            .ok_or_else(|| vortex_err!("FSL body bytes overflow"))?;
                     }
                     col_offset[i] = col_offset[i]
                         .checked_add(body_bytes)
-                        .vortex_expect("FSL row offset overflow");
+                        .ok_or_else(|| vortex_err!("FSL row offset overflow"))?;
                 } else {
                     for offset in 0..list_size {
                         out[dst + offset] = null_byte;
                     }
                     col_offset[i] = col_offset[i]
                         .checked_add(list_size_u32)
-                        .vortex_expect("FSL row offset overflow");
+                        .ok_or_else(|| vortex_err!("FSL row offset overflow"))?;
                 }
             }
         }
@@ -498,7 +498,7 @@ fn encode_variable_child(
         scratch_offsets.push(acc);
         acc = acc
             .checked_add(s)
-            .vortex_expect("child scratch offset overflow");
+            .ok_or_else(|| vortex_err!("child scratch offset overflow"))?;
     }
     let mut scratch_cursors = vec![0u32; n];
     field_encode(
@@ -519,12 +519,12 @@ fn encode_variable_child(
             out[dst..dst + sz].copy_from_slice(&scratch[src..src + sz]);
             col_offset[i] = col_offset[i]
                 .checked_add(child_sizes[i])
-                .vortex_expect("col_offset overflow");
+                .ok_or_else(|| vortex_err!("col_offset overflow"))?;
         } else {
             out[dst] = null_byte;
             col_offset[i] = col_offset[i]
                 .checked_add(1)
-                .vortex_expect("col_offset overflow");
+                .ok_or_else(|| vortex_err!("col_offset overflow"))?;
         }
     }
     Ok(())
@@ -599,7 +599,11 @@ fn encode_primitive_arith_typed<T: NativePType + RowEncode>(
 /// For the ascending path the hot loop is a `copy_nonoverlapping` of 32 bytes per block
 /// plus one stamped continuation byte. For the descending path it reads a u64 at a time and
 /// XORs with `0xFF`, giving LLVM a vectorizable inner loop.
-fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
+fn encode_non_empty_varlen_body(
+    bytes: &[u8],
+    out: &mut [u8],
+    descending: bool,
+) -> VortexResult<u32> {
     debug_assert!(!bytes.is_empty());
     let len = bytes.len();
     let full_blocks = len / VARLEN_BLOCK_SIZE;
@@ -612,6 +616,10 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool)
         (full_blocks, partial)
     };
     let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL;
+    // The caller reserved this slot from `encoded_size_for_non_empty_varlen`, which already
+    // verified the byte total fits `u32`; re-check here so the conversion never panics.
+    let total_u32 =
+        u32::try_from(total).map_err(|_| vortex_err!("encoded varlen size overflows u32"))?;
     debug_assert!(out.len() >= total);
     // The final block's continuation byte encodes its content length (1..=32).
     let len_byte =
@@ -662,7 +670,7 @@ fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool)
             *dst.add(VARLEN_BLOCK_SIZE) = len_byte ^ 0xFF;
         }
     }
-    u32::try_from(total).vortex_expect("encoded varlen byte length fits u32")
+    Ok(total_u32)
 }
 
 /// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the
diff --git a/vortex-row/src/codec/mod.rs b/vortex-row/src/codec/mod.rs
@@ -69,14 +69,22 @@ pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1;
 ///
 /// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1
 /// continuation/length byte). Callers must use [`VARLEN_NULL_SIZE`] for null values and
-/// [`VARLEN_EMPTY_SIZE`] for empty values. A `u32` always suffices because a `BinaryView`
-/// length is itself a `u32`, so `blocks <= ceil(u32::MAX / 32) < 2^27`.
+/// [`VARLEN_EMPTY_SIZE`] for empty values.
+///
+/// # Errors
+///
+/// Returns an error if the encoded length overflows `u32`. The block count itself always fits
+/// (a `BinaryView` length is a `u32`, so `blocks <= ceil(u32::MAX / 32) = 2^27`), but the
+/// `blocks * 33 + 1` byte total can exceed `u32::MAX` for multi-gigabyte values.
 #[inline]
-fn encoded_size_for_non_empty_varlen(len: usize) -> u32 {
+fn encoded_size_for_non_empty_varlen(len: usize) -> VortexResult<u32> {
     debug_assert!(len > 0);
     let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE))
         .vortex_expect("varlen block count must fit in u32");
-    1 + blocks * VARLEN_BLOCK_TOTAL_U32
+    blocks
+        .checked_mul(VARLEN_BLOCK_TOTAL_U32)
+        .and_then(|b| b.checked_add(1))
+        .ok_or_else(|| vortex_err!("varlen encoded size overflows u32"))
 }
 
 /// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel).
@@ -85,14 +93,10 @@ const fn encoded_size_for_fixed(value_bytes: u32) -> u32 {
     1 + value_bytes
 }
 
-/// A native byte width (at most 32 for `i256`) always fits in a `u32`, so a plain cast is fine.
+/// A native byte width (at most 32 for `i256`) always fits in a `u32`.
 #[inline]
-#[allow(
-    clippy::cast_possible_truncation,
-    reason = "native byte widths are at most 32"
-)]
 fn byte_width_u32(width: usize) -> u32 {
-    width as u32
+    u32::try_from(width).vortex_expect("native byte width must fit in u32")
 }
 
 /// Pre-resolved per-row validity for the row encoders.
@@ -279,10 +283,10 @@ pub(crate) fn field_size(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
     match canonical {
-        Canonical::Null(arr) => add_size_null(arr, sizes),
-        Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1)),
-        Canonical::Primitive(arr) => add_size_primitive(arr, sizes),
-        Canonical::Decimal(arr) => add_size_decimal(arr, sizes),
+        Canonical::Null(arr) => add_size_null(arr, sizes)?,
+        Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1))?,
+        Canonical::Primitive(arr) => add_size_primitive(arr, sizes)?,
+        Canonical::Decimal(arr) => add_size_decimal(arr, sizes)?,
         Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
         Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?,
         Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?,
diff --git a/vortex-row/src/codec/sizing.rs b/vortex-row/src/codec/sizing.rs
@@ -2,31 +2,40 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 //! Size pass leaf kernels: per-row byte-size accumulation for each canonical variant.
+//!
+//! Every accumulator returns [`VortexResult`] and uses checked arithmetic, so an input whose
+//! per-row encoding would exceed `u32::MAX` bytes surfaces a [`VortexError`](vortex_error::VortexError)
+//! instead of overflowing or panicking.
 
 use super::*;
 
-pub(super) fn add_size_const(sizes: &mut [u32], add: u32) {
-    sizes.iter_mut().for_each(|s| *s += add);
+pub(super) fn add_size_const(sizes: &mut [u32], add: u32) -> VortexResult<()> {
+    for s in sizes.iter_mut() {
+        *s = s
+            .checked_add(add)
+            .ok_or_else(|| vortex_err!("per-row size overflow"))?;
+    }
+    Ok(())
 }
 
-pub(super) fn add_size_null(arr: &NullArray, sizes: &mut [u32]) {
+pub(super) fn add_size_null(arr: &NullArray, sizes: &mut [u32]) -> VortexResult<()> {
     debug_assert_eq!(arr.len(), sizes.len());
     // Just a sentinel byte per row.
-    sizes.iter_mut().for_each(|s| *s += 1);
+    add_size_const(sizes, 1)
 }
 
-pub(super) fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) {
+pub(super) fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) -> VortexResult<()> {
     let width = byte_width_u32(arr.ptype().byte_width());
-    add_size_const(sizes, encoded_size_for_fixed(width));
+    add_size_const(sizes, encoded_size_for_fixed(width))
 }
 
-pub(super) fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) {
+pub(super) fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) -> VortexResult<()> {
     // Size from the precision-minimal type, not the physical `values_type`, so the size pass
     // agrees with `row_width_for_dtype` (and the encode pass) regardless of how the producer
     // stored the values. See `narrow_decimal_to_smallest`.
     let vt = DecimalType::smallest_decimal_value_type(&arr.decimal_dtype());
     let width = byte_width_u32(vt.byte_width());
-    add_size_const(sizes, encoded_size_for_fixed(width));
+    add_size_const(sizes, encoded_size_for_fixed(width))
 }
 
 pub(super) fn add_size_varbinview(
@@ -41,11 +50,11 @@ pub(super) fn add_size_varbinview(
                 let contribution = if view.is_empty() {
                     VARLEN_EMPTY_SIZE
                 } else {
-                    encoded_size_for_non_empty_varlen(view.len() as usize)
+                    encoded_size_for_non_empty_varlen(view.len() as usize)?
                 };
                 sizes[i] = sizes[i]
                     .checked_add(contribution)
-                    .vortex_expect("per-row size overflow");
+                    .ok_or_else(|| vortex_err!("per-row size overflow"))?;
             }
         }
         ValidityKind::Mask(mask) => {
@@ -55,11 +64,11 @@ pub(super) fn add_size_varbinview(
                 } else if view.is_empty() {
                     VARLEN_EMPTY_SIZE
                 } else {
-                    encoded_size_for_non_empty_varlen(view.len() as usize)
+                    encoded_size_for_non_empty_varlen(view.len() as usize)?
                 };
                 sizes[i] = sizes[i]
                     .checked_add(contribution)
-                    .vortex_expect("per-row size overflow");
+                    .ok_or_else(|| vortex_err!("per-row size overflow"))?;
             }
         }
     }
@@ -75,16 +84,14 @@ pub(super) fn add_size_struct(
     let n = arr.len();
     let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
     // Outer sentinel: 1 byte per row.
-    sizes
-        .iter_mut()
-        .for_each(|s| *s = s.checked_add(1).vortex_expect("per-row size overflow"));
+    add_size_const(sizes, 1)?;
     // Each child contributes its per-row size when the parent is non-null, and a canonical
     // null contribution when the parent is null. For fixed-width children both are equal,
     // so we can simply add the fixed width to every row. For variable-width children the
     // null contribution collapses to 1 byte, ensuring null parent rows have a constant body.
     for child in arr.iter_unmasked_fields() {
         match row_width_for_dtype(child.dtype())? {
-            RowWidth::Fixed(w) => add_size_const(sizes, w),
+            RowWidth::Fixed(w) => add_size_const(sizes, w)?,
             RowWidth::Variable => {
                 let canonical = child.clone().execute::<Canonical>(ctx)?;
                 let mut child_sizes = vec![0u32; n];
@@ -93,7 +100,7 @@ pub(super) fn add_size_struct(
                     let contribution = if mask.value(i) { child_sizes[i] } else { 1u32 };
                     sizes[i] = sizes[i]
                         .checked_add(contribution)
-                        .vortex_expect("per-row size overflow");
+                        .ok_or_else(|| vortex_err!("per-row size overflow"))?;
                 }
             }
         }
@@ -116,16 +123,14 @@ pub(super) fn add_size_fsl(
     let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
     let elem_dtype = arr.elements().dtype();
     // Outer sentinel: 1 byte per row.
-    sizes
-        .iter_mut()
-        .for_each(|s| *s = s.checked_add(1).vortex_expect("per-row size overflow"));
+    add_size_const(sizes, 1)?;
     match row_width_for_dtype(elem_dtype)? {
         RowWidth::Fixed(w) => {
             // Each row has `list_size` fixed-width elements regardless of null parent mask.
             let body = w
                 .checked_mul(list_size_u32)
-                .vortex_expect("FSL body width overflow");
-            add_size_const(sizes, body);
+                .ok_or_else(|| vortex_err!("FSL body width overflow"))?;
+            add_size_const(sizes, body)?;
         }
         RowWidth::Variable => {
             let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
@@ -139,7 +144,7 @@ pub(super) fn add_size_fsl(
                     for j in 0..list_size {
                         sum = sum
                             .checked_add(elem_sizes[base + j])
-                            .vortex_expect("FSL row body overflow");
+                            .ok_or_else(|| vortex_err!("FSL row body overflow"))?;
                     }
                     sum
                 } else {
@@ -149,7 +154,7 @@ pub(super) fn add_size_fsl(
                 };
                 sizes[i] = sizes[i]
                     .checked_add(body)
-                    .vortex_expect("FSL per-row size overflow");
+                    .ok_or_else(|| vortex_err!("FSL per-row size overflow"))?;
             }
         }
     }
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs