vortex-row: arithmetic-write fast path for fixed-before-varlen columns

joseph-isaacs · joseph-isaacs · commit 2711504490e8 · 2026-06-04T18:57:45.000Z
Classify each column in the size pass (`ColKind` + `first_varlen_idx`): a fixed-width
column with no varlen column before it has a constant within-row offset, so its write
position is pure arithmetic (`i * fixed_per_row + prefix + var_prefix[i]`) with no
per-row cursor. Route those columns through `field_encode_fixed_arithmetic`; the cursor
path is seeded to start at the first varlen column. Primitive columns in the pure-fixed
case use a `chunks_exact_mut` hot loop (matching arrow-row's not-null path); all other
fixed types reuse the cursor encoder at the computed offsets, so output is byte-identical.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
@@ -269,6 +269,59 @@ pub(crate) fn field_size(
     Ok(())
 }
 
+/// Encode a fixed-width column at arithmetically computed offsets, leaving the caller's
+/// per-row cursors untouched.
+///
+/// Row `i` is written at byte `i * row_stride + col_prefix` of `out`, plus `var_prefix[i]`
+/// when `var_prefix` is `Some`. The caller passes the exclusive prefix sum of the per-row
+/// varlen lengths, or `None` when the row layout has no variable-length column at all.
+/// Only the first `nrows` entries of `var_prefix` are read; `nrows` also sizes the scratch
+/// offset and cursor vectors used by the general path.
+///
+/// Dispatch: with `var_prefix == None` and a `Canonical::Primitive` column, the work goes to
+/// `encode_primitive_arith`. Every other case materializes the per-row start offsets and
+/// calls [`field_encode`] with throwaway zeroed cursors, so the bytes come from the same
+/// encoder the cursor path uses. Offsets are computed with wrapping `u32` arithmetic.
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn field_encode_fixed_arithmetic(
+    canonical: &Canonical,
+    field: RowSortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    nrows: usize,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    if var_prefix.is_none()
+        && let Canonical::Primitive(arr) = canonical
+    {
+        return encode_primitive_arith(arr, field, col_prefix, row_stride, out, ctx);
+    }
+
+    // General path: build each row's absolute start offset, then run the cursor encoder
+    // with scratch cursors that all start at zero, so row `i` is written at `offsets[i]`.
+    // NOTE(review): `offsets` gets min(vp.len(), nrows) entries; assumes vp.len() >= nrows.
+    let mut offsets: Vec<u32> = Vec::with_capacity(nrows);
+    let mut base = col_prefix;
+    match var_prefix {
+        None => {
+            for _ in 0..nrows {
+                offsets.push(base);
+                base = base.wrapping_add(row_stride);
+            }
+        }
+        Some(vp) => {
+            for &p in vp.iter().take(nrows) {
+                offsets.push(base.wrapping_add(p));
+                base = base.wrapping_add(row_stride);
+            }
+        }
+    }
+    let mut cursors = vec![0u32; nrows];
+    field_encode(canonical, field, &offsets, &mut cursors, out, ctx)
+}
+
 /// Encode each row's bytes for the given canonical view into `out`, writing starting at
 /// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of
 /// bytes written.
@@ -958,6 +1011,68 @@ fn encode_extension(
     field_encode(&storage, field, row_offsets, col_offset, out, ctx)
 }
 
+/// Arithmetic-write primitive encoder: dispatches on `arr.ptype()` to the typed encoder,
+/// which writes each row's one-byte sentinel plus encoded value at `col_prefix` inside that
+/// row's `row_stride`-byte chunk of `out`. Errors from the typed encoder are propagated.
+fn encode_primitive_arith(
+    arr: &PrimitiveArray,
+    field: RowSortField,
+    col_prefix: u32,
+    row_stride: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| {
+        encode_primitive_arith_typed::<T>(arr, field, col_prefix, row_stride, out, ctx)?;
+    });
+    Ok(())
+}
+
+fn encode_primitive_arith_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: RowSortField,
+    col_prefix: u32,
+    row_stride: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let slice: &[T] = arr.as_slice();
+    let non_null = field.non_null_sentinel();
+    let value_bytes = size_of::<T>();
+    let slot_size = 1 + value_bytes; // one sentinel byte followed by the encoded value
+    let stride = row_stride as usize;
+    let prefix = col_prefix as usize;
+    let descending = field.descending;
+
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            // Hot path (all rows valid): one `stride`-byte chunk per row; `zip` stops at
+            // the shorter side. `chunks_exact_mut` panics when `stride` is zero.
+            for (chunk, &v) in out.chunks_exact_mut(stride).zip(slice.iter()) {
+                let slot = &mut chunk[prefix..prefix + slot_size];
+                slot[0] = non_null;
+                v.encode_to(&mut slot[1..], descending);
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = field.null_sentinel();
+            for (i, (chunk, &v)) in out.chunks_exact_mut(stride).zip(slice.iter()).enumerate() {
+                let slot = &mut chunk[prefix..prefix + slot_size];
+                if mask.value(i) {
+                    slot[0] = non_null;
+                    v.encode_to(&mut slot[1..], descending);
+                } else {
+                    slot[0] = null; // null row: sentinel byte, then the value bytes are zeroed
+                    for b in &mut slot[1..] {
+                        *b = 0;
+                    }
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
 /// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with
 /// continuation/length markers. Returns the number of bytes written. Empty values are
 /// encoded by the caller as a single sentinel byte and never reach this function.
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
@@ -34,6 +34,7 @@ use crate::codec;
 use crate::options::RowEncodingOptions;
 use crate::options::deserialize_row_encoding_options;
 use crate::options::serialize_row_encoding_options;
+use crate::size::ColKind;
 use crate::size::compute_sizes;
 
 /// Variadic scalar function that encodes N input columns into a single `List<u8>`
@@ -112,6 +113,8 @@ fn execute_row_encode(
     let crate::size::SizePassResult {
         fixed_per_row,
         var_lengths,
+        col_kinds,
+        first_varlen_idx,
         columns,
     } = compute_sizes(options, args, ctx)?;
 
@@ -149,53 +152,107 @@ fn execute_row_encode(
     // listview_offsets[i] is the absolute byte offset where row `i` begins.
     // For pure-fixed: i * fixed_per_row.
     // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths.
+    //
+    // When fixed-before-varlen columns coexist with a varlen column, we additionally build
+    // `var_prefix_for_arith[i] = exclusive cumsum of var_lengths[..i]` and hand it to the
+    // arithmetic encoders so they can compute per-row write positions without a cursor.
+    let need_arith_prefix = first_varlen_idx.is_some()
+        && col_kinds.iter().any(|k| {
+            matches!(
+                k,
+                ColKind::Fixed {
+                    before_varlen: true,
+                    ..
+                }
+            )
+        });
+
     // Build directly into a BufferMut to avoid a Vec→Buffer copy at the end.
     let mut listview_offsets: BufferMut<u32> = BufferMut::with_capacity(nrows);
     // SAFETY: `nrows` of capacity reserved above; every index in `[0, nrows)` is written
     // before the buffer is read out. `nrows` was validated to fit `u32` at function entry,
-    // so `i as u32` below is exact and the multiplications can't overflow.
+    // so the `0u32..` counters below are exact and the multiplications can't overflow.
     unsafe { listview_offsets.set_len(nrows) };
     let off = listview_offsets.as_mut_slice();
+    let mut var_prefix_for_arith: Option<Vec<u32>> = None;
     match var_lengths.as_ref() {
         None => {
             // Pure-fixed: offsets[i] = i * fixed_per_row. Zipping against a `u32` counter
-            // elides per-element bounds checks (and avoids a per-element `usize as u32`
-            // cast), so LLVM auto-vectorizes this multiply. `nrows` fits u32, so the counter
-            // never overflows.
+            // elides per-element bounds checks, so LLVM auto-vectorizes this multiply.
             for (slot, i) in off.iter_mut().zip(0u32..) {
                 *slot = i * fixed_per_row;
             }
         }
         Some(v) => {
             // Mixed: offsets[i] = i * fixed_per_row + var_prefix[i], where var_prefix is the
-            // exclusive cumsum of varlen lengths. `iter_mut().zip` elides per-element bounds
-            // checks; the total was validated to fit u32 upstream so the wrapping arithmetic
-            // is exact (it never actually wraps).
+            // exclusive cumsum of varlen lengths. The total was validated to fit u32 upstream
+            // so the wrapping arithmetic is exact (it never actually wraps).
+            let mut vp: Option<Vec<u32>> = need_arith_prefix.then(|| Vec::with_capacity(nrows));
             let mut acc: u32 = 0;
             for ((slot, &l), i) in off.iter_mut().zip(v.iter()).zip(0u32..) {
+                if let Some(p) = vp.as_mut() {
+                    p.push(acc);
+                }
                 *slot = i.wrapping_mul(fixed_per_row).wrapping_add(acc);
                 acc = acc.wrapping_add(l);
             }
+            var_prefix_for_arith = vp;
         }
     }
     let listview_offsets_slice: &[u32] = listview_offsets.as_slice();
 
     // Per-row write cursor (also doubles as the ListView `sizes` slot when done). We build
     // it as a BufferMut so we can hand it directly to the output PrimitiveArray.
+    //
+    // The cursor path begins at the first cursor-path column. Fixed-before-varlen columns
+    // are written by the arithmetic path and do not touch the cursor, so the cursor is
+    // pre-seeded with the within-row offset of the first varlen column (its `fixed_prefix`).
+    // When there are no varlen columns at all, every column takes the arithmetic path and
+    // the cursor loop runs zero iterations; seeding with `fixed_per_row` then leaves the
+    // cursors already correct as per-row sizes.
+    let initial_cursor: u32 = match first_varlen_idx {
+        Some(idx) => match col_kinds[idx] {
+            ColKind::Variable { fixed_prefix } => fixed_prefix,
+            ColKind::Fixed { .. } => unreachable!("first_varlen_idx points at a varlen column"),
+        },
+        None => fixed_per_row,
+    };
     let mut row_cursors: BufferMut<u32> = BufferMut::with_capacity(nrows);
-    row_cursors.push_n(0u32, nrows);
+    row_cursors.push_n(initial_cursor, nrows);
 
-    // ===== Phase 4: encode columns via the cursor path =====
-    // Each column was canonicalized once during the size pass; reuse that canonical form.
+    // ===== Phase 4: encode columns =====
+    // Fixed-before-varlen columns take the arithmetic-write path (constant within-row
+    // offset, no cursor mutation). Fixed-after-varlen and varlen columns take the cursor
+    // path. Each column was canonicalized once during the size pass; reuse that form.
     for (i, canonical) in columns.iter().enumerate() {
-        codec::field_encode(
-            canonical,
-            options.fields[i],
-            listview_offsets_slice,
-            row_cursors.as_mut_slice(),
-            &mut out_buf,
-            ctx,
-        )?;
+        match col_kinds[i] {
+            ColKind::Fixed {
+                prefix,
+                before_varlen: true,
+                ..
+            } => {
+                codec::field_encode_fixed_arithmetic(
+                    canonical,
+                    options.fields[i],
+                    prefix,
+                    fixed_per_row,
+                    var_prefix_for_arith.as_deref(),
+                    nrows,
+                    &mut out_buf,
+                    ctx,
+                )?;
+            }
+            ColKind::Fixed { .. } | ColKind::Variable { .. } => {
+                codec::field_encode(
+                    canonical,
+                    options.fields[i],
+                    listview_offsets_slice,
+                    row_cursors.as_mut_slice(),
+                    &mut out_buf,
+                    ctx,
+                )?;
+            }
+        }
     }
 
     // ===== Phase 5: build ListView output =====
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
@@ -36,6 +36,24 @@ use crate::options::RowEncodingOptions;
 use crate::options::deserialize_row_encoding_options;
 use crate::options::serialize_row_encoding_options;
 
+/// Classification of a single input column for the size pass.
+///
+/// Records, per column, the summed width of all preceding fixed-width columns and, for
+/// fixed columns, whether any variable-length column has appeared yet. The encode pass
+/// uses this to choose between the arithmetic-write fast path (no varlen before this
+/// column, so the within-row position is the same for every row) and the cursor-write
+/// path.
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum ColKind {
+    /// Fixed-width column. `prefix` is the summed width of all preceding fixed columns.
+    /// When `before_varlen` is true no variable-length column precedes this one, so
+    /// `prefix` is also the column's exact within-row byte offset in every row.
+    Fixed { prefix: u32, before_varlen: bool },
+    /// Variable-width column. `fixed_prefix` is the summed width of all preceding fixed
+    /// columns; the contribution of earlier varlen columns is added per row.
+    Variable { fixed_prefix: u32 },
+}
+
 /// Result of the size pass: enough information for both [`RowSize::execute`] and the
 /// downstream [`RowEncode`](super::encode::RowEncode) pipeline.
 ///
@@ -45,6 +63,8 @@ use crate::options::serialize_row_encoding_options;
 pub(crate) struct SizePassResult {
     pub fixed_per_row: u32,
     pub var_lengths: Option<Vec<u32>>,
+    pub col_kinds: Vec<ColKind>, // one entry per input column, in input order
+    pub first_varlen_idx: Option<usize>, // index of the first variable-width column, if any
     pub columns: Vec<Canonical>,
 }
 
@@ -77,8 +97,11 @@ pub(crate) fn compute_sizes(
     let nrows = args.row_count();
 
     let mut columns: Vec<Canonical> = Vec::with_capacity(n_inputs);
+    let mut col_kinds: Vec<ColKind> = Vec::with_capacity(n_inputs);
     let mut fixed_per_row: u32 = 0;
     let mut var_lengths: Option<Vec<u32>> = None;
+    let mut first_varlen_idx: Option<usize> = None;
+    let mut running_fixed_prefix: u32 = 0;
 
     for i in 0..n_inputs {
         let col = args.get(i)?;
@@ -95,13 +118,24 @@ pub(crate) fn compute_sizes(
         let canonical = col.execute::<Canonical>(ctx)?;
         match width {
             RowWidth::Fixed(w) => {
-                fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(|| {
-                    vortex_error::vortex_err!("per-row fixed width overflows u32 at column {}", i)
-                })?;
+                col_kinds.push(ColKind::Fixed {
+                    prefix: running_fixed_prefix,
+                    before_varlen: first_varlen_idx.is_none(),
+                });
+                let overflow =
+                    || vortex_error::vortex_err!("per-row fixed width overflows u32 at column {i}");
+                fixed_per_row = fixed_per_row.checked_add(w).ok_or_else(overflow)?;
+                running_fixed_prefix = running_fixed_prefix.checked_add(w).ok_or_else(overflow)?;
             }
             RowWidth::Variable => {
+                if first_varlen_idx.is_none() {
+                    first_varlen_idx = Some(i);
+                }
                 let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]);
                 codec::field_size(&canonical, options.fields[i], v, ctx)?;
+                col_kinds.push(ColKind::Variable {
+                    fixed_prefix: running_fixed_prefix,
+                });
             }
         }
         columns.push(canonical);
@@ -110,6 +144,8 @@ pub(crate) fn compute_sizes(
     Ok(SizePassResult {
         fixed_per_row,
         var_lengths,
+        col_kinds,
+        first_varlen_idx,
         columns,
     })
 }