From 658e954af754fd4c0694df73c8c3ea160bd9abff Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Mon, 8 Jun 2026 10:17:18 +0100
Subject: [PATCH 1/3] vortex-row: split codec into submodules; move per-type layout docs into crate docs

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 docs/specs/row-encoding.md       |  378 +--------
 vortex-row/src/codec.rs          | 1257 ------------------------------
 vortex-row/src/codec/encoding.rs |  593 ++++++++++++++
 vortex-row/src/codec/mod.rs      |  407 ++++++++++
 vortex-row/src/codec/native.rs   |  113 +++
 vortex-row/src/codec/sizing.rs   |  162 ++++
 vortex-row/src/codec/varlen.rs   |  104 +++
 vortex-row/src/encode.rs         |   38 +-
 vortex-row/src/encoder.rs        |   47 +-
 vortex-row/src/lib.rs            |  124 ++-
 vortex-row/src/options.rs        |   54 +-
 vortex-row/src/size.rs           |   46 +-
 vortex-row/src/tests.rs          |  269 +++++--
 13 files changed, 1768 insertions(+), 1824 deletions(-)
 delete mode 100644 vortex-row/src/codec.rs
 create mode 100644 vortex-row/src/codec/encoding.rs
 create mode 100644 vortex-row/src/codec/mod.rs
 create mode 100644 vortex-row/src/codec/native.rs
 create mode 100644 vortex-row/src/codec/sizing.rs
 create mode 100644 vortex-row/src/codec/varlen.rs

diff --git a/docs/specs/row-encoding.md b/docs/specs/row-encoding.md
index 8fc3288f82b..c12b6a72712 100644
--- a/docs/specs/row-encoding.md
+++ b/docs/specs/row-encoding.md
@@ -7,7 +7,7 @@ logical tuple comparison of the input values under the configured row sort optio
 
 This is a schema-aware row-key format. The bytes do not contain type tags, field names, or
 sort options. Two encoded rows are comparable only when they were produced with the same
-input schema and the same per-column `RowSortField` settings.
+input schema and the same per-column `RowSortFieldOptions` settings.
 
 The row encoding is not the Vortex file format or scalar IPC format. It is an internal
 comparison representation used for sort keys and row-key operations.
@@ -18,6 +18,11 @@ semantics may change between Vortex releases. Do not persist these bytes or depe
 a stable interchange format.
 :::
 
+The **per-type byte layout** — sentinel tables, field options, and the encoding rules for each
+supported type — lives in the `vortex-row` crate's module-level documentation, so it stays next
+to the implementation. This page gives the order property, the notation, the order-preservation
+argument, and a fully worked example row.
+
 ## Order Property
 
 For a fixed schema with columns `c0, c1, ..., cn` and per-column sort fields
@@ -70,374 +75,6 @@ For example:
 | `-5_i32`, before the signed sign-bit transform | `FF FF FF FB` |
 | `ordered = 0x80000000_u32` | `80 00 00 00` |
 
-## Field Options
-
-Each input column has a `RowSortField`:
-
-```text
-RowSortField {
-    descending: bool,
-    nulls_first: bool,
-}
-```
-
-`descending` reverses the order of non-null values. `nulls_first` is independent of
-`descending`, so nulls can sort before or after non-nulls in either direction.
-
-## Sentinel Summary
-
-Sentinels are single bytes that classify nullness and, for variable-width values, whether a
-value is empty or non-empty. They are chosen so byte comparison can decide those categories
-before comparing any value bytes.
-
-| Encoding family | Case | Ascending, nulls first | Descending, nulls first | Ascending, nulls last | Descending, nulls last |
-| --- | --- | --- | --- | --- | --- |
-| Fixed-width | Null | `0x00` | `0x00` | `0x02` | `0x02` |
-| Fixed-width | Non-null | `0x01` | `0x01` | `0x01` | `0x01` |
-| Variable-width | Null | `0x00` | `0x00` | `0xFF` | `0xFF` |
-| Variable-width | Empty | `0x01` | `0xFE` | `0x01` | `0xFE` |
-| Variable-width | Non-empty | `0x02` | `0xFD` | `0x02` | `0xFD` |
-
-Fixed-width sentinels are used by null, boolean, primitive, decimal, struct, and fixed-size
-list values. Variable-width sentinels are used by UTF-8 and binary values.
-
-## Fixed-Width Sentinels
-
-Every fixed-width value starts with a one-byte sentinel:
-
-| Case | Sentinel |
-| --- | --- |
-| Null, `nulls_first = true` | `0x00` |
-| Non-null | `0x01` |
-| Null, `nulls_first = false` | `0x02` |
-
-The sentinel is not inverted for descending order. Only the non-null value bytes are
-inverted. This keeps null placement independent from sort direction.
-
-For fixed-width nulls, the sentinel is followed by zero-filled value bytes. This gives fixed
-types a constant encoded width for every row.
-
-## Variable-Width Sentinels
-
-UTF-8 and binary values use three leading sentinels. The separate empty and non-empty
-sentinels are important: they ensure the first byte decides null, empty, or non-empty before
-later columns can affect comparison.
-
-| Case | Ascending | Descending |
-| --- | --- | --- |
-| Null, `nulls_first = true` | `0x00` | `0x00` |
-| Empty | `0x01` | `0xFE` |
-| Non-empty | `0x02` | `0xFD` |
-| Null, `nulls_first = false` | `0xFF` | `0xFF` |
-
-The null sentinel is not inverted by descending order. Empty and non-empty sentinels are
-inverted so non-null value order is reversed while null placement stays fixed.
-
-## Null
-
-`Null` values have no body:
-
-```text
-fixed_null_sentinel
-```
-
-The sentinel is `0x00` for nulls-first and `0x02` for nulls-last.
-
-## Boolean
-
-Booleans are fixed-width and use one value byte:
-
-```text
-sentinel || value_byte
-```
-
-For ascending order:
-
-| Value | Value byte |
-| --- | --- |
-| `false` | `0x01` |
-| `true` | `0x02` |
-
-For descending order, the value byte is inverted:
-
-| Value | Value byte |
-| --- | --- |
-| `true` | `0xFD` |
-| `false` | `0xFE` |
-
-Null booleans encode as:
-
-```text
-null_sentinel || 0x00
-```
-
-## Unsigned Integers
-
-Supported unsigned primitive types are `u8`, `u16`, `u32`, and `u64`.
-
-Ascending encoding:
-
-```text
-0x01 || BE(value)
-```
-
-Descending encoding:
-
-```text
-0x01 || !BE(value)
-```
-
-Big-endian byte order makes lexicographic byte order match numeric order for fixed-width
-unsigned integers. Bitwise complement reverses that order for descending fields.
-
-Null unsigned integers encode as:
-
-```text
-null_sentinel || zero(width(T))
-```
-
-## Signed Integers
-
-Supported signed primitive PTypes are `i8`, `i16`, `i32`, and `i64`. The same signed
-integer transform is also used for `i128` decimal storage.
-
-Signed integers first flip the sign bit of their big-endian two's-complement
-representation:
-
-```text
-ordered = BE(value)
-ordered[0] = ordered[0] XOR 0x80
-```
-
-Ascending encoding:
-
-```text
-0x01 || ordered
-```
-
-Descending encoding:
-
-```text
-0x01 || !ordered
-```
-
-Flipping the sign bit maps the signed numeric range into unsigned byte order:
-
-```text
-negative values -> 0x00..0x7F prefix range
-non-negative values -> 0x80..0xFF prefix range
-```
-
-Null signed integers encode as:
-
-```text
-null_sentinel || zero(width(T))
-```
-
-## Floating Point
-
-Supported floating primitive types are `f16`, `f32`, and `f64`.
-
-The encoder treats the IEEE bit pattern as an unsigned integer and applies a sign-aware
-transform before writing big-endian bytes.
-
-For a floating value with raw bits `bits`:
-
-```text
-if sign_bit(bits) == 0:
-    ordered = bits XOR sign_bit_mask
-else:
-    ordered = bits XOR all_ones
-```
-
-Ascending encoding:
-
-```text
-0x01 || BE(ordered)
-```
-
-Descending encoding:
-
-```text
-0x01 || !BE(ordered)
-```
-
-This produces a total-order-style byte ordering where negative values sort before positive
-values, and `-0.0` sorts before `+0.0`. NaN values are ordered by their raw bit patterns
-under the same transform; they are not canonicalized by row encoding.
-
-Null floats encode as:
-
-```text
-null_sentinel || zero(width(T))
-```
-
-## Decimal
-
-Decimals are encoded as their scaled signed integer storage value. The selected storage
-width is the smallest decimal value type for the decimal precision:
-
-| Precision | Storage |
-| --- | --- |
-| `1..=2` | `i8` |
-| `3..=4` | `i16` |
-| `5..=9` | `i32` |
-| `10..=18` | `i64` |
-| `19..=38` | `i128` |
-
-The storage integer is encoded with the signed integer encoding described above. Decimal
-columns have one precision and scale, so ordering the scaled integer storage values matches
-ordering the decimal values in that column.
-
-`Decimal256` is not supported by row encoding.
-
-## UTF-8 and Binary
-
-UTF-8 and binary values use the variable-width sentinels described above.
-
-Null:
-
-```text
-varlen_null_sentinel
-```
-
-Empty:
-
-```text
-varlen_empty_sentinel
-```
-
-Non-empty:
-
-```text
-varlen_non_empty_sentinel || varlen_body(bytes)
-```
-
-For UTF-8, `bytes` are the UTF-8 bytes of the string. For binary, `bytes` are the raw binary
-bytes. The byte ordering is therefore UTF-8 byte lexicographic order for strings and raw byte
-lexicographic order for binary.
-
-### Variable-Length Body
-
-Non-empty variable-length values are encoded in blocks. Each block contains 32 data bytes
-followed by one marker byte:
-
-```text
-data[0..32] || marker
-```
-
-For ascending order:
-
-- Every non-final full block uses marker `0xFF`.
-- The final block is padded with zeros to 32 data bytes.
-- The final marker is the number of real data bytes in the final block, in `1..=32`.
-
-For descending order:
-
-- Every data byte is inverted.
-- Every non-final full-block marker is `0x00`, the inverse of `0xFF`.
-- The final block is padded with `0xFF`, the inverse of ascending zero padding.
-- The final marker is inverted: `final_len XOR 0xFF`.
-
-If the input length is exactly a multiple of 32, the final block has marker `32`, and earlier
-blocks, if any, use the continuation marker.
-
-This block structure preserves prefix order. For example, in ascending order a shorter value
-that is a prefix of a longer value reaches its final marker before the longer value reaches
-the continuation marker. Since final length markers in `1..=32` are less than `0xFF`, the
-shorter prefix sorts first. Descending order inverts the same bytes and reverses that result.
-
-## Struct
-
-A struct is encoded as:
-
-```text
-struct_sentinel || field_0 || field_1 || ... || field_n
-```
-
-The outer sentinel is the fixed-width sentinel:
-
-- `0x01` for a non-null struct
-- `0x00` or `0x02` for a null struct, depending on null placement
-
-For a non-null struct, each field is encoded recursively in schema order using the same
-`RowSortField` as the parent struct column.
-
-For a null struct, the body is canonicalized so two null parent rows produce byte-equal
-output even if their physical child arrays contain different values:
-
-- Fixed-width children contribute their fixed-width null encoding.
-- Variable-width children contribute exactly one child null sentinel byte.
-
-A struct has fixed row width only when all of its fields have fixed row width. If any child
-is variable-width, the struct is variable-width.
-
-## Fixed-Size List
-
-A fixed-size list with `N` elements is encoded as:
-
-```text
-list_sentinel || element_0 || element_1 || ... || element_N-1
-```
-
-The outer sentinel is the fixed-width sentinel:
-
-- `0x01` for a non-null list
-- `0x00` or `0x02` for a null list, depending on null placement
-
-For a non-null fixed-size list, elements are encoded recursively in element order using the
-same `RowSortField` as the parent list column.
-
-For a null fixed-size list, the body is canonicalized:
-
-- Fixed-width elements contribute their fixed-width null encoding, repeated `N` times.
-- Variable-width elements contribute one child null sentinel byte per element.
-
-A fixed-size list has fixed row width only when its element type has fixed row width.
-
-## Nested Values
-
-Nested structs and fixed-size lists apply the same rules recursively. Each nullable parent
-adds its own outer sentinel. Null parents canonicalize their child body before comparison can
-observe underlying child values.
-
-## Unsupported Types
-
-The current row encoder rejects types for which it does not define byte-sort semantics:
-
-| Type | Reason |
-| --- | --- |
-| Variable-size `List` | No row encoding order is defined. |
-| `Variant` | No row encoding order is defined. |
-| `Union` | No row encoding order is defined. |
-| `Extension` | No row encoding order is defined. |
-| `Decimal256` | Encoding is not implemented. |
-
-The absence of these encodings is intentional. Adding one requires defining both the logical
-ordering and the exact byte representation that preserves that ordering.
-
-Temporal extensions could be added later by normalizing them to storage arrays at the
-row-encoder boundary, once the supported temporal ordering contract is made explicit.
-
-## Size and Output Layout
-
-The encoded output is a `ListView<u8>`:
-
-```text
-elements: contiguous u8 buffer containing all row bytes
-offsets:  per-row start offset into elements
-sizes:    per-row byte length
-```
-
-Rows are not self-describing without their `sizes`. A variable-width field can make one row
-longer than another, and the enclosing `ListView` supplies the row boundary.
-
-The encoder computes sizes before writing bytes:
-
-- Fixed-width columns contribute a constant width per row.
-- Variable-width columns contribute data-dependent widths per row.
-- The final `sizes` array is also used as the per-row write cursor during encoding.
-
 ## Why Concatenation Works
 
 For each supported field type, the field encoder is an order embedding from logical values to
@@ -469,7 +106,8 @@ controlled solely by `nulls_first`.
 ## Example Row
 
 This example shows one row that contains every supported encoding family. All columns use
-ascending order with nulls first.
+ascending order with nulls first. (This row is locked in by the `reference_row_bytes_match_spec`
+test in `vortex-row`.)
 
 Schema:
 
diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
deleted file mode 100644
index 4848a750e52..00000000000
--- a/vortex-row/src/codec.rs
+++ /dev/null
@@ -1,1257 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-//! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants.
-//!
-//! The encoded byte format produces a lexicographically byte-comparable representation:
-//! comparing the byte slices of two encoded rows yields the same ordering as the
-//! original logical (tuple) comparison of their values, modulo nulls placement and
-//! descending-ness as configured by [`RowSortField`].
-//!
-//! Conventions:
-//! - Every fixed-width value is preceded by a 1-byte sentinel that orders nulls relative to
-//!   non-nulls. For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF),
-//!   not the sentinel.
-//! - Variable-length (Utf8, Binary) values use **three** distinct leading sentinels — one each
-//!   for null, empty, and non-empty — so byte comparison at position 0 fully categorizes the
-//!   value and column-byte boundaries stay aligned across rows. See
-//!   [`varlen_null_sentinel`], [`varlen_empty_sentinel`], [`varlen_non_empty_sentinel`].
-//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types.
-//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top
-//!   bit; negative flips all bits.
-//! - Nullable structs and fixed-size lists encode null parent rows with a **canonical null
-//!   body** so two null parent rows produce byte-equal encodings: fixed-width children
-//!   contribute their fixed null encoding, and variable-width children collapse to a single
-//!   null sentinel byte.
-
-use vortex_array::Canonical;
-use vortex_array::ExecutionCtx;
-use vortex_array::arrays::BoolArray;
-use vortex_array::arrays::DecimalArray;
-use vortex_array::arrays::FixedSizeListArray;
-use vortex_array::arrays::NullArray;
-use vortex_array::arrays::PrimitiveArray;
-use vortex_array::arrays::StructArray;
-use vortex_array::arrays::VarBinViewArray;
-use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt;
-use vortex_array::arrays::struct_::StructArrayExt;
-use vortex_array::dtype::DType;
-use vortex_array::dtype::DecimalType;
-use vortex_array::dtype::NativePType;
-use vortex_array::dtype::half::f16;
-use vortex_array::match_each_native_ptype;
-use vortex_array::validity::Validity;
-use vortex_error::VortexExpect;
-use vortex_error::VortexResult;
-use vortex_error::vortex_bail;
-
-use crate::options::RowSortField;
-
-/// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte).
-pub(crate) const BOOL_ENCODED_SIZE: u32 = 2;
-
-/// Block size used in the variable-length encoding.
-pub(crate) const VARLEN_BLOCK_SIZE: usize = 32;
-/// Total bytes per varlen block including the trailing continuation marker.
-pub(crate) const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1;
-const VARLEN_BLOCK_TOTAL_U32: u32 = 33;
-
-/// Size in bytes of an encoded null varlen value (just the sentinel byte).
-pub(crate) const VARLEN_NULL_SIZE: u32 = 1;
-/// Size in bytes of an encoded empty varlen value (just the sentinel byte).
-pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1;
-
-/// Returns the size in bytes of the encoded form of a non-empty variable-length value.
-///
-/// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1
-/// continuation/length byte). Callers must use [`VARLEN_NULL_SIZE`] for null values and
-/// [`VARLEN_EMPTY_SIZE`] for empty values. A `u32` always suffices because a `BinaryView`
-/// length is itself a `u32`, so `blocks <= ceil(u32::MAX / 32) < 2^27`.
-#[inline]
-fn encoded_size_for_non_empty_varlen(len: usize) -> u32 {
-    debug_assert!(len > 0);
-    let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE))
-        .vortex_expect("varlen block count must fit in u32");
-    1 + blocks * VARLEN_BLOCK_TOTAL_U32
-}
-
-/// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel).
-#[inline]
-const fn encoded_size_for_fixed(value_bytes: u32) -> u32 {
-    1 + value_bytes
-}
-
-fn byte_width_u32(width: usize) -> u32 {
-    u32::try_from(width).vortex_expect("native byte width must fit in u32")
-}
-
-/// Pre-resolved per-row validity for the row encoders.
-///
-/// Encoders pattern-match on this once before their inner loop so the no-nulls fast path
-/// avoids per-row `mask.value(i)` branches entirely, and the nullable path materializes the
-/// mask exactly once.
-pub(crate) enum ValidityKind {
-    /// Column statically has no nulls (`Validity::NonNullable` or `AllValid`); no mask needed.
-    AllValid,
-    /// Column may have nulls; carries the materialized per-row mask.
-    Mask(vortex_mask::Mask),
-}
-
-/// Resolve a [`Validity`] into a [`ValidityKind`], materializing the mask only when the column
-/// may actually have nulls.
-#[inline]
-pub(crate) fn resolve_validity(
-    validity: Validity,
-    len: usize,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<ValidityKind> {
-    Ok(match validity {
-        Validity::NonNullable | Validity::AllValid => ValidityKind::AllValid,
-        other => ValidityKind::Mask(other.execute_mask(len, ctx)?),
-    })
-}
-
-/// Returns the sentinel byte for a null varlen value.
-///
-/// The choice is positional (0x00 when nulls sort first, 0xFF when nulls sort last) and
-/// independent of `descending`, matching the convention used by `arrow-row`.
-#[inline]
-fn varlen_null_sentinel(field: RowSortField) -> u8 {
-    if field.nulls_first { 0x00 } else { 0xFF }
-}
-
-/// Returns the sentinel byte for an empty varlen value.
-///
-/// Equal to `0x01` in ascending mode and `!0x01 = 0xFE` in descending mode.
-#[inline]
-fn varlen_empty_sentinel(field: RowSortField) -> u8 {
-    if field.descending { !0x01u8 } else { 0x01u8 }
-}
-
-/// Returns the sentinel byte for a non-empty varlen value.
-///
-/// Equal to `0x02` in ascending mode and `!0x02 = 0xFD` in descending mode.
-#[inline]
-fn varlen_non_empty_sentinel(field: RowSortField) -> u8 {
-    if field.descending { !0x02u8 } else { 0x02u8 }
-}
-
-/// Returns the single-byte null sentinel used when a child contributes its canonical null
-/// encoding inside a null parent struct/FSL row.
-///
-/// For varlen children that is the varlen null sentinel; for everything else (including
-/// nested struct/FSL when used as a variable-width child) it is the fixed-width null sentinel.
-fn child_canonical_null_byte(child_dtype: &DType, field: RowSortField) -> u8 {
-    match child_dtype {
-        DType::Utf8(_) | DType::Binary(_) => varlen_null_sentinel(field),
-        _ => field.null_sentinel(),
-    }
-}
-
-/// Per-row width classification for a column.
-///
-/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless
-/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary,
-/// List, or any composite that recurses through a variable-width field).
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub(crate) enum RowWidth {
-    /// Per-row width is the same constant for every row in the column.
-    Fixed(u32),
-    /// Per-row width is data-dependent.
-    Variable,
-}
-
-/// Classify a column's per-row encoded width by inspecting only its [`DType`].
-///
-/// Returns `Fixed(w)` when every row encodes to exactly `w` bytes (sentinel + value),
-/// regardless of null-ness or value. Returns `Variable` when per-row sizes depend on the
-/// data.
-///
-/// Classification does not depend on the [`RowSortField`]: null-vs-non-null encoding width is
-/// the same for fixed-width types (the sentinel byte plus zero-fill for nulls).
-///
-/// # Errors
-///
-/// Returns an error for dtypes that the row encoder does not support. Width arithmetic that
-/// would overflow `u32` is also reported as an error rather than silently saturating.
-pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
-    match dtype {
-        DType::Null => Ok(RowWidth::Fixed(1)),
-        DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)),
-        DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32(
-            ptype.byte_width(),
-        )))),
-        DType::Decimal(dt, _) => {
-            let vt = DecimalType::smallest_decimal_value_type(dt);
-            if matches!(vt, DecimalType::I256) {
-                vortex_bail!("row encoding for Decimal256 is not yet implemented");
-            }
-            Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32(
-                vt.byte_width(),
-            ))))
-        }
-        DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable),
-        DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? {
-            // FSL is fixed iff its element type is fixed. Add a sentinel byte for the FSL
-            // itself, then `n` copies of the element width.
-            RowWidth::Fixed(w) => {
-                let body = w
-                    .checked_mul(*n)
-                    .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?;
-                let total = body
-                    .checked_add(1)
-                    .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?;
-                Ok(RowWidth::Fixed(total))
-            }
-            RowWidth::Variable => Ok(RowWidth::Variable),
-        },
-        DType::Struct(fields, _) => {
-            // Struct is fixed iff all its fields are fixed; sum their widths plus a sentinel.
-            let mut total: u32 = 1; // outer sentinel
-            for field_dtype in fields.fields() {
-                match row_width_for_dtype(&field_dtype)? {
-                    RowWidth::Fixed(w) => {
-                        total = total.checked_add(w).ok_or_else(|| {
-                            vortex_error::vortex_err!("Struct row width overflows u32")
-                        })?;
-                    }
-                    RowWidth::Variable => return Ok(RowWidth::Variable),
-                }
-            }
-            Ok(RowWidth::Fixed(total))
-        }
-        DType::List(..) => {
-            vortex_bail!(
-                "row encoding does not support variable-size List arrays (no well-defined ordering)"
-            )
-        }
-        DType::Variant(_) => {
-            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
-        }
-        DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"),
-        dtype => vortex_bail!("row encoding does not support dtype: {dtype:?}"),
-    }
-}
-
-/// Compute the per-row size in bytes for the given canonical view, adding into `sizes`.
-///
-/// `sizes` is expected to be initialized (typically zeroed). This function *adds* the
-/// per-row size to each entry so multiple columns can accumulate into the same buffer.
-///
-/// # Errors
-///
-/// Returns an error for unsupported canonical variants.
-pub(crate) fn field_size(
-    canonical: &Canonical,
-    field: RowSortField,
-    sizes: &mut [u32],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    match canonical {
-        Canonical::Null(arr) => add_size_null(arr, sizes),
-        Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1)),
-        Canonical::Primitive(arr) => add_size_primitive(arr, sizes),
-        Canonical::Decimal(arr) => add_size_decimal(arr, sizes),
-        Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
-        Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?,
-        Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?,
-        Canonical::List(_) => vortex_bail!(
-            "row encoding does not support canonical List arrays: {:?}",
-            canonical.dtype()
-        ),
-        Canonical::Variant(_) => {
-            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
-        }
-        unsupported => {
-            vortex_bail!(
-                "row encoding does not support canonical array: {:?}",
-                unsupported.dtype()
-            )
-        }
-    }
-    Ok(())
-}
-
-/// Encode a fixed-width column at arithmetic offsets, without reading or writing any per-row
-/// cursor.
-///
-/// For row `i`, the column's bytes are written starting at `i * row_stride + col_prefix
-/// (+ var_prefix[i])`, where `var_prefix` is the exclusive prefix sum of the varlen
-/// contributions (`None` when the row layout has no variable-length columns). This is the
-/// fast path for fixed-width columns that appear before any varlen column, so their
-/// within-row position is a constant offset rather than a running cursor.
-///
-/// For primitive columns in the pure-fixed case it uses a `chunks_exact_mut` hot loop that
-/// removes the per-row offset/cursor indirection (matching `arrow-row`'s `encode_not_null`).
-/// All other types reuse [`field_encode`] at the materialized offsets, so the bytes written
-/// are byte-identical to the cursor path.
-#[allow(clippy::too_many_arguments)]
-pub(crate) fn field_encode_fixed_arithmetic(
-    canonical: &Canonical,
-    field: RowSortField,
-    col_prefix: u32,
-    row_stride: u32,
-    var_prefix: Option<&[u32]>,
-    nrows: usize,
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    if var_prefix.is_none()
-        && let Canonical::Primitive(arr) = canonical
-    {
-        return encode_primitive_arith(arr, field, col_prefix, row_stride, out, ctx);
-    }
-
-    // General path: materialize this column's per-row start offsets and reuse the cursor
-    // encoder with zero-initialized cursors, so every row is written at its arithmetic
-    // offset with the exact same bytes the cursor path would produce.
-    let mut offsets: Vec<u32> = Vec::with_capacity(nrows);
-    let mut base = col_prefix;
-    match var_prefix {
-        None => {
-            for _ in 0..nrows {
-                offsets.push(base);
-                base = base.wrapping_add(row_stride);
-            }
-        }
-        Some(vp) => {
-            for &p in vp.iter().take(nrows) {
-                offsets.push(base.wrapping_add(p));
-                base = base.wrapping_add(row_stride);
-            }
-        }
-    }
-    let mut cursors = vec![0u32; nrows];
-    field_encode(canonical, field, &offsets, &mut cursors, out, ctx)
-}
-
-/// Encode each row's bytes for the given canonical view into `out`, writing starting at
-/// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of
-/// bytes written.
-///
-/// After this call returns successfully, `cursors[i]` will have advanced by exactly the
-/// per-row contribution previously computed by [`field_size`] for the same column.
-pub(crate) fn field_encode(
-    canonical: &Canonical,
-    field: RowSortField,
-    offsets: &[u32],
-    cursors: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    match canonical {
-        Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out),
-        Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::List(_) => vortex_bail!(
-            "row encoding does not support canonical List arrays: {:?}",
-            canonical.dtype()
-        ),
-        Canonical::Variant(_) => {
-            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
-        }
-        unsupported => {
-            vortex_bail!(
-                "row encoding does not support canonical array: {:?}",
-                unsupported.dtype()
-            )
-        }
-    }
-    Ok(())
-}
-
-fn add_size_const(sizes: &mut [u32], add: u32) {
-    for s in sizes.iter_mut() {
-        *s += add;
-    }
-}
-
-fn add_size_null(arr: &NullArray, sizes: &mut [u32]) {
-    debug_assert_eq!(arr.len(), sizes.len());
-    // Just a sentinel byte per row.
-    for s in sizes.iter_mut() {
-        *s += 1;
-    }
-}
-
-fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) {
-    let width = byte_width_u32(arr.ptype().byte_width());
-    add_size_const(sizes, encoded_size_for_fixed(width));
-}
-
-fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) {
-    let width = byte_width_u32(arr.values_type().byte_width());
-    add_size_const(sizes, encoded_size_for_fixed(width));
-}
-
-fn add_size_varbinview(
-    arr: &VarBinViewArray,
-    sizes: &mut [u32],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let views = arr.views();
-    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
-        ValidityKind::AllValid => {
-            for (i, view) in views.iter().enumerate() {
-                let contribution = if view.is_empty() {
-                    VARLEN_EMPTY_SIZE
-                } else {
-                    encoded_size_for_non_empty_varlen(view.len() as usize)
-                };
-                sizes[i] = sizes[i]
-                    .checked_add(contribution)
-                    .vortex_expect("per-row size overflow");
-            }
-        }
-        ValidityKind::Mask(mask) => {
-            for (i, view) in views.iter().enumerate() {
-                let contribution = if !mask.value(i) {
-                    VARLEN_NULL_SIZE
-                } else if view.is_empty() {
-                    VARLEN_EMPTY_SIZE
-                } else {
-                    encoded_size_for_non_empty_varlen(view.len() as usize)
-                };
-                sizes[i] = sizes[i]
-                    .checked_add(contribution)
-                    .vortex_expect("per-row size overflow");
-            }
-        }
-    }
-    Ok(())
-}
-
-fn add_size_struct(
-    arr: &StructArray,
-    field: RowSortField,
-    sizes: &mut [u32],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let n = arr.len();
-    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
-    // Outer sentinel: 1 byte per row.
-    for s in sizes.iter_mut() {
-        *s = s.checked_add(1).vortex_expect("per-row size overflow");
-    }
-    // Each child contributes its per-row size when the parent is non-null, and a canonical
-    // null contribution when the parent is null. For fixed-width children both are equal,
-    // so we can simply add the fixed width to every row. For variable-width children the
-    // null contribution collapses to 1 byte, ensuring null parent rows have a constant body.
-    for child in arr.iter_unmasked_fields() {
-        match row_width_for_dtype(child.dtype())? {
-            RowWidth::Fixed(w) => add_size_const(sizes, w),
-            RowWidth::Variable => {
-                let canonical = child.clone().execute::<Canonical>(ctx)?;
-                let mut child_sizes = vec![0u32; n];
-                field_size(&canonical, field, &mut child_sizes, ctx)?;
-                for i in 0..n {
-                    let contribution = if mask.value(i) { child_sizes[i] } else { 1u32 };
-                    sizes[i] = sizes[i]
-                        .checked_add(contribution)
-                        .vortex_expect("per-row size overflow");
-                }
-            }
-        }
-    }
-    Ok(())
-}
-
-fn add_size_fsl(
-    arr: &FixedSizeListArray,
-    field: RowSortField,
-    sizes: &mut [u32],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let n = arr.len();
-    debug_assert_eq!(n, sizes.len());
-    let list_size = arr.list_size() as usize;
-    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
-    let elem_dtype = arr.elements().dtype();
-    // Outer sentinel: 1 byte per row.
-    for s in sizes.iter_mut() {
-        *s = s.checked_add(1).vortex_expect("per-row size overflow");
-    }
-    match row_width_for_dtype(elem_dtype)? {
-        RowWidth::Fixed(w) => {
-            // Each row has `list_size` fixed-width elements regardless of null parent mask.
-            let body = w
-                .checked_mul(u32::try_from(list_size).vortex_expect("list_size fits u32"))
-                .vortex_expect("FSL body width overflow");
-            add_size_const(sizes, body);
-        }
-        RowWidth::Variable => {
-            let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
-            debug_assert_eq!(elements.len(), n * list_size);
-            let mut elem_sizes = vec![0u32; n * list_size];
-            field_size(&elements, field, &mut elem_sizes, ctx)?;
-            for i in 0..n {
-                let body: u32 = if mask.value(i) {
-                    let base = i * list_size;
-                    let mut sum: u32 = 0;
-                    for j in 0..list_size {
-                        sum = sum
-                            .checked_add(elem_sizes[base + j])
-                            .vortex_expect("FSL row body overflow");
-                    }
-                    sum
-                } else {
-                    // Canonical null body for FSL with variable element: one null sentinel
-                    // per element. (Each element contributes `child_null_width = 1`.)
-                    u32::try_from(list_size).vortex_expect("list_size fits u32")
-                };
-                sizes[i] = sizes[i]
-                    .checked_add(body)
-                    .vortex_expect("FSL per-row size overflow");
-            }
-        }
-    }
-    Ok(())
-}
-
-fn encode_null(
-    arr: &NullArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-) {
-    let sentinel = field.null_sentinel();
-    for i in 0..arr.len() {
-        let pos = (row_offsets[i] + col_offset[i]) as usize;
-        out[pos] = sentinel;
-        col_offset[i] += 1;
-    }
-}
-
-fn encode_bool(
-    arr: &BoolArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let bits = arr.clone().into_bit_buffer();
-    let non_null = field.non_null_sentinel();
-    let xor = if field.descending { 0xFF } else { 0x00 };
-    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
-        ValidityKind::AllValid => {
-            for i in 0..bits.len() {
-                let pos = (row_offsets[i] + col_offset[i]) as usize;
-                out[pos] = non_null;
-                // false=0x01, true=0x02 so false < true; XOR for descending
-                let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
-                out[pos + 1] = raw ^ xor;
-                col_offset[i] += BOOL_ENCODED_SIZE;
-            }
-        }
-        ValidityKind::Mask(mask) => {
-            let null = field.null_sentinel();
-            for i in 0..bits.len() {
-                let pos = (row_offsets[i] + col_offset[i]) as usize;
-                if mask.value(i) {
-                    out[pos] = non_null;
-                    let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
-                    out[pos + 1] = raw ^ xor;
-                } else {
-                    out[pos] = null;
-                    out[pos + 1] = 0;
-                }
-                col_offset[i] += BOOL_ENCODED_SIZE;
-            }
-        }
-    }
-    Ok(())
-}
-
-fn encode_primitive(
-    arr: &PrimitiveArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    match_each_native_ptype!(arr.ptype(), |T| {
-        encode_primitive_typed::<T>(arr, field, row_offsets, col_offset, out, ctx)?;
-    });
-    Ok(())
-}
-
-fn encode_primitive_typed<T: NativePType + RowEncode>(
-    arr: &PrimitiveArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let slice: &[T] = arr.as_slice();
-    let non_null = field.non_null_sentinel();
-    let value_bytes = size_of::<T>();
-    let stride = encoded_size_for_fixed(byte_width_u32(value_bytes));
-    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
-        ValidityKind::AllValid => {
-            for (i, &v) in slice.iter().enumerate() {
-                let pos = (row_offsets[i] + col_offset[i]) as usize;
-                out[pos] = non_null;
-                v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
-                col_offset[i] += stride;
-            }
-        }
-        ValidityKind::Mask(mask) => {
-            let null = field.null_sentinel();
-            for (i, &v) in slice.iter().enumerate() {
-                let pos = (row_offsets[i] + col_offset[i]) as usize;
-                if mask.value(i) {
-                    out[pos] = non_null;
-                    v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
-                } else {
-                    out[pos] = null;
-                    // Zero-fill the value bytes.
-                    for b in &mut out[pos + 1..pos + 1 + value_bytes] {
-                        *b = 0;
-                    }
-                }
-                col_offset[i] += stride;
-            }
-        }
-    }
-    Ok(())
-}
-
-fn encode_decimal(
-    arr: &DecimalArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
-    match arr.values_type() {
-        DecimalType::I8 => {
-            encode_decimal_typed::<i8>(arr, &mask, field, row_offsets, col_offset, out)
-        }
-        DecimalType::I16 => {
-            encode_decimal_typed::<i16>(arr, &mask, field, row_offsets, col_offset, out)
-        }
-        DecimalType::I32 => {
-            encode_decimal_typed::<i32>(arr, &mask, field, row_offsets, col_offset, out)
-        }
-        DecimalType::I64 => {
-            encode_decimal_typed::<i64>(arr, &mask, field, row_offsets, col_offset, out)
-        }
-        DecimalType::I128 => {
-            encode_decimal_typed::<i128>(arr, &mask, field, row_offsets, col_offset, out)
-        }
-        DecimalType::I256 => {
-            vortex_bail!("row encoding for Decimal256 is not yet implemented")
-        }
-    }
-    Ok(())
-}
-
-fn encode_decimal_typed<T>(
-    arr: &DecimalArray,
-    mask: &vortex_mask::Mask,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-) where
-    T: vortex_array::dtype::NativeDecimalType + RowEncode,
-{
-    let non_null = field.non_null_sentinel();
-    let null = field.null_sentinel();
-    let value_bytes = size_of::<T>();
-    let total = encoded_size_for_fixed(byte_width_u32(value_bytes));
-    let slice = arr.buffer::<T>();
-    for i in 0..slice.len() {
-        let pos = (row_offsets[i] + col_offset[i]) as usize;
-        if mask.value(i) {
-            out[pos] = non_null;
-            slice[i].encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
-        } else {
-            out[pos] = null;
-            for b in &mut out[pos + 1..pos + 1 + value_bytes] {
-                *b = 0;
-            }
-        }
-        col_offset[i] += total;
-    }
-}
-
-fn encode_varbinview(
-    arr: &VarBinViewArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let null_byte = varlen_null_sentinel(field);
-    let empty_byte = varlen_empty_sentinel(field);
-    let non_empty_byte = varlen_non_empty_sentinel(field);
-    let descending = field.descending;
-
-    let views = arr.views();
-    // Cache the data-buffer slices once. Inlined views (len <= 12) carry their bytes inline,
-    // so they never touch `buffers`; referenced views index into the pre-validated buffer at
-    // `offset..offset + len`. Walking views directly avoids the per-row bounds and branch work
-    // of `with_iterator`.
-    let buffers: smallvec::SmallVec<[&[u8]; 4]> = (0..arr.data_buffers().len())
-        .map(|i| arr.buffer(i).as_slice())
-        .collect();
-
-    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
-        ValidityKind::AllValid => {
-            for (i, view) in views.iter().enumerate() {
-                let pos = (row_offsets[i] + col_offset[i]) as usize;
-                let len = view.len() as usize;
-                if len == 0 {
-                    out[pos] = empty_byte;
-                    col_offset[i] += VARLEN_EMPTY_SIZE;
-                    continue;
-                }
-                let bytes: &[u8] = if view.is_inlined() {
-                    view.as_inlined().value()
-                } else {
-                    let r = view.as_view();
-                    let off = r.offset as usize;
-                    &buffers[r.buffer_index as usize][off..off + len]
-                };
-                out[pos] = non_empty_byte;
-                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending);
-                col_offset[i] += 1 + written;
-            }
-        }
-        ValidityKind::Mask(mask) => {
-            for (i, view) in views.iter().enumerate() {
-                let pos = (row_offsets[i] + col_offset[i]) as usize;
-                if !mask.value(i) {
-                    out[pos] = null_byte;
-                    col_offset[i] += VARLEN_NULL_SIZE;
-                    continue;
-                }
-                let len = view.len() as usize;
-                if len == 0 {
-                    out[pos] = empty_byte;
-                    col_offset[i] += VARLEN_EMPTY_SIZE;
-                    continue;
-                }
-                let bytes: &[u8] = if view.is_inlined() {
-                    view.as_inlined().value()
-                } else {
-                    let r = view.as_view();
-                    let off = r.offset as usize;
-                    &buffers[r.buffer_index as usize][off..off + len]
-                };
-                out[pos] = non_empty_byte;
-                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending);
-                col_offset[i] += 1 + written;
-            }
-        }
-    }
-    Ok(())
-}
-
-fn encode_struct(
-    arr: &StructArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let n = arr.len();
-    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
-    let non_null = field.non_null_sentinel();
-    let null = field.null_sentinel();
-
-    // Write the outer sentinel for each row.
-    for i in 0..n {
-        let pos = (row_offsets[i] + col_offset[i]) as usize;
-        out[pos] = if mask.value(i) { non_null } else { null };
-        col_offset[i] += 1;
-    }
-
-    // Encode each child. For non-null parent rows the child contributes its actual encoding;
-    // for null parent rows the child contributes its canonical null encoding so that two null
-    // parent rows produce byte-equal output regardless of underlying child values.
-    for child in arr.iter_unmasked_fields() {
-        match row_width_for_dtype(child.dtype())? {
-            RowWidth::Fixed(w) => {
-                let canonical = child.clone().execute::<Canonical>(ctx)?;
-                field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?;
-                // Replace null parent rows with the canonical null encoding (the same as a
-                // child-level null: null sentinel followed by zero-padded value bytes).
-                let null_byte = child_canonical_null_byte(child.dtype(), field);
-                for i in 0..n {
-                    if !mask.value(i) {
-                        let end = (row_offsets[i] + col_offset[i]) as usize;
-                        let start = end - w as usize;
-                        out[start] = null_byte;
-                        for b in &mut out[start + 1..end] {
-                            *b = 0;
-                        }
-                    }
-                }
-            }
-            RowWidth::Variable => {
-                encode_variable_child(child, field, &mask, row_offsets, col_offset, out, ctx)?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-fn encode_fsl(
-    arr: &FixedSizeListArray,
-    field: RowSortField,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let nrows = arr.len();
-    let list_size = arr.list_size() as usize;
-    let mask = arr.as_ref().validity()?.execute_mask(nrows, ctx)?;
-    let non_null = field.non_null_sentinel();
-    let null = field.null_sentinel();
-    let elem_dtype = arr.elements().dtype().clone();
-
-    // Outer sentinel.
-    for i in 0..nrows {
-        let pos = (row_offsets[i] + col_offset[i]) as usize;
-        out[pos] = if mask.value(i) { non_null } else { null };
-        col_offset[i] += 1;
-    }
-
-    match row_width_for_dtype(&elem_dtype)? {
-        RowWidth::Fixed(w) => {
-            // Fixed-width elements: encode the elements array directly (its length is
-            // nrows * list_size) using a derived (offsets, cursors) pair. Then overwrite
-            // the body of null parent rows with the canonical null encoding per element.
-            let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
-            debug_assert_eq!(elements.len(), nrows * list_size);
-            let list_size_u32 = arr.list_size();
-            let row_body_bytes = w
-                .checked_mul(list_size_u32)
-                .vortex_expect("FSL body width overflow");
-            let mut elem_offsets = vec![0u32; nrows * list_size];
-            for i in 0..nrows {
-                let base = row_offsets[i] + col_offset[i];
-                for j in 0u32..list_size_u32 {
-                    elem_offsets[i * list_size + j as usize] = base + j * w;
-                }
-            }
-            let mut elem_cursors = vec![0u32; nrows * list_size];
-            field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?;
-            for i in 0..nrows {
-                col_offset[i] = col_offset[i]
-                    .checked_add(row_body_bytes)
-                    .vortex_expect("FSL row body overflow");
-            }
-            // Canonical null body for null parent rows: one null encoding per element.
-            let null_byte = child_canonical_null_byte(&elem_dtype, field);
-            let elem_width = w as usize;
-            for i in 0..nrows {
-                if !mask.value(i) {
-                    let end = (row_offsets[i] + col_offset[i]) as usize;
-                    let start = end - row_body_bytes as usize;
-                    let mut pos = start;
-                    for _ in 0..list_size {
-                        out[pos] = null_byte;
-                        for b in &mut out[pos + 1..pos + elem_width] {
-                            *b = 0;
-                        }
-                        pos += elem_width;
-                    }
-                }
-            }
-        }
-        RowWidth::Variable => {
-            // Variable-width elements: for null parent rows the canonical body is exactly
-            // `list_size` null sentinel bytes (one per element). For non-null parent rows,
-            // encode each element via a scratch buffer and copy into out.
-            let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
-            debug_assert_eq!(elements.len(), nrows * list_size);
-            let mut elem_sizes = vec![0u32; nrows * list_size];
-            field_size(&elements, field, &mut elem_sizes, ctx)?;
-            let total: u64 = elem_sizes.iter().map(|&s| u64::from(s)).sum();
-            let total_usize =
-                usize::try_from(total).vortex_expect("FSL scratch buffer size fits usize");
-            let mut scratch = vec![0u8; total_usize];
-            let mut scratch_offsets = Vec::with_capacity(nrows * list_size);
-            let mut acc: u32 = 0;
-            for &s in &elem_sizes {
-                scratch_offsets.push(acc);
-                acc = acc
-                    .checked_add(s)
-                    .vortex_expect("FSL scratch offset overflow");
-            }
-            let mut scratch_cursors = vec![0u32; nrows * list_size];
-            field_encode(
-                &elements,
-                field,
-                &scratch_offsets,
-                &mut scratch_cursors,
-                &mut scratch,
-                ctx,
-            )?;
-            let null_byte = child_canonical_null_byte(&elem_dtype, field);
-            for i in 0..nrows {
-                let dst = (row_offsets[i] + col_offset[i]) as usize;
-                if mask.value(i) {
-                    let mut body_bytes: u32 = 0;
-                    for j in 0..list_size {
-                        let k = i * list_size + j;
-                        let src = scratch_offsets[k] as usize;
-                        let sz = elem_sizes[k] as usize;
-                        out[dst + body_bytes as usize..dst + body_bytes as usize + sz]
-                            .copy_from_slice(&scratch[src..src + sz]);
-                        body_bytes = body_bytes
-                            .checked_add(elem_sizes[k])
-                            .vortex_expect("FSL body bytes overflow");
-                    }
-                    col_offset[i] = col_offset[i]
-                        .checked_add(body_bytes)
-                        .vortex_expect("FSL row offset overflow");
-                } else {
-                    for offset in 0..list_size {
-                        out[dst + offset] = null_byte;
-                    }
-                    col_offset[i] = col_offset[i]
-                        .checked_add(u32::try_from(list_size).vortex_expect("list_size fits u32"))
-                        .vortex_expect("FSL row offset overflow");
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Encode one variable-width child of a struct: for non-null parent rows, copy the child's
-/// natural encoding from a scratch buffer; for null parent rows, write a single
-/// `child_canonical_null_byte`.
-fn encode_variable_child(
-    child: &vortex_array::ArrayRef,
-    field: RowSortField,
-    parent_mask: &vortex_mask::Mask,
-    row_offsets: &[u32],
-    col_offset: &mut [u32],
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let n = child.len();
-    let canonical = child.clone().execute::<Canonical>(ctx)?;
-
-    // Size and encode the child into a sequential scratch buffer.
-    let mut child_sizes = vec![0u32; n];
-    field_size(&canonical, field, &mut child_sizes, ctx)?;
-    let total: u64 = child_sizes.iter().map(|&s| u64::from(s)).sum();
-    let total_usize = usize::try_from(total).vortex_expect("child scratch buffer size fits usize");
-    let mut scratch = vec![0u8; total_usize];
-    let mut scratch_offsets = Vec::with_capacity(n);
-    let mut acc: u32 = 0;
-    for &s in &child_sizes {
-        scratch_offsets.push(acc);
-        acc = acc
-            .checked_add(s)
-            .vortex_expect("child scratch offset overflow");
-    }
-    let mut scratch_cursors = vec![0u32; n];
-    field_encode(
-        &canonical,
-        field,
-        &scratch_offsets,
-        &mut scratch_cursors,
-        &mut scratch,
-        ctx,
-    )?;
-
-    let null_byte = child_canonical_null_byte(child.dtype(), field);
-    for i in 0..n {
-        let dst = (row_offsets[i] + col_offset[i]) as usize;
-        if parent_mask.value(i) {
-            let src = scratch_offsets[i] as usize;
-            let sz = child_sizes[i] as usize;
-            out[dst..dst + sz].copy_from_slice(&scratch[src..src + sz]);
-            col_offset[i] = col_offset[i]
-                .checked_add(child_sizes[i])
-                .vortex_expect("col_offset overflow");
-        } else {
-            out[dst] = null_byte;
-            col_offset[i] = col_offset[i]
-                .checked_add(1)
-                .vortex_expect("col_offset overflow");
-        }
-    }
-    Ok(())
-}
-
-/// Arithmetic-write primitive encoder: writes each row's `sentinel + value` slot at a
-/// constant within-row offset, iterating the output in `row_stride`-sized chunks so the
-/// compiler can drop the per-row offset/cursor indirection.
-fn encode_primitive_arith(
-    arr: &PrimitiveArray,
-    field: RowSortField,
-    col_prefix: u32,
-    row_stride: u32,
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    match_each_native_ptype!(arr.ptype(), |T| {
-        encode_primitive_arith_typed::<T>(arr, field, col_prefix, row_stride, out, ctx)?;
-    });
-    Ok(())
-}
-
-fn encode_primitive_arith_typed<T: NativePType + RowEncode>(
-    arr: &PrimitiveArray,
-    field: RowSortField,
-    col_prefix: u32,
-    row_stride: u32,
-    out: &mut [u8],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<()> {
-    let slice: &[T] = arr.as_slice();
-    let non_null = field.non_null_sentinel();
-    let value_bytes = size_of::<T>();
-    let slot_size = 1 + value_bytes;
-    let stride = row_stride as usize;
-    let prefix = col_prefix as usize;
-    let descending = field.descending;
-
-    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
-        ValidityKind::AllValid => {
-            // Hot path: each row's slot is a fixed window inside its `stride`-sized chunk,
-            // so the inner write vectorizes the same way as `arrow-row`'s not-null path.
-            for (chunk, &v) in out.chunks_exact_mut(stride).zip(slice.iter()) {
-                let slot = &mut chunk[prefix..prefix + slot_size];
-                slot[0] = non_null;
-                v.encode_to(&mut slot[1..], descending);
-            }
-        }
-        ValidityKind::Mask(mask) => {
-            let null = field.null_sentinel();
-            for (i, (chunk, &v)) in out.chunks_exact_mut(stride).zip(slice.iter()).enumerate() {
-                let slot = &mut chunk[prefix..prefix + slot_size];
-                if mask.value(i) {
-                    slot[0] = non_null;
-                    v.encode_to(&mut slot[1..], descending);
-                } else {
-                    slot[0] = null;
-                    for b in &mut slot[1..] {
-                        *b = 0;
-                    }
-                }
-            }
-        }
-    }
-    Ok(())
-}
-
-/// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with
-/// continuation/length markers. Returns the number of bytes written. Empty values are
-/// encoded by the caller as a single sentinel byte and never reach this function.
-///
-/// For the ascending path the hot loop is a `copy_nonoverlapping` of 32 bytes per block
-/// plus one stamped continuation byte. For the descending path it reads a u64 at a time and
-/// XORs with `0xFF`, giving LLVM a vectorizable inner loop.
-fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
-    debug_assert!(!bytes.is_empty());
-    let len = bytes.len();
-    let full_blocks = len / VARLEN_BLOCK_SIZE;
-    let partial = len % VARLEN_BLOCK_SIZE;
-    let (full_to_write, partial_block_len) = if partial == 0 {
-        // Length is an exact multiple of 32: emit (full_blocks - 1) full blocks with the
-        // 0xFF continuation marker, then a final block whose continuation byte is 32.
-        (full_blocks - 1, VARLEN_BLOCK_SIZE)
-    } else {
-        (full_blocks, partial)
-    };
-    let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL;
-    debug_assert!(out.len() >= total);
-    // The final block's continuation byte encodes its content length (1..=32).
-    let len_byte =
-        u8::try_from(partial_block_len).vortex_expect("varlen final block length (1..=32) fits u8");
-
-    // SAFETY: `out` has at least `total` bytes — the caller sizes every varlen slot via
-    // `encoded_size_for_non_empty_varlen` (which equals `1 + total`, the extra byte being the
-    // leading sentinel that the caller wrote and that is not part of `out`). `bytes` is valid
-    // for `len` reads, and every pointer advance below stays within `[0, total)` for `dst`
-    // and `[0, len)` for `src`.
-    unsafe {
-        let mut src = bytes.as_ptr();
-        let mut dst = out.as_mut_ptr();
-
-        if !descending {
-            // Ascending fast path: each full block is a 32-byte memcpy + a single 0xFF stamp.
-            for _ in 0..full_to_write {
-                std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE);
-                *dst.add(VARLEN_BLOCK_SIZE) = 0xFF;
-                src = src.add(VARLEN_BLOCK_SIZE);
-                dst = dst.add(VARLEN_BLOCK_TOTAL);
-            }
-            // Final block: copy the partial data, zero-pad the tail, write the length byte.
-            std::ptr::copy_nonoverlapping(src, dst, partial_block_len);
-            std::ptr::write_bytes(
-                dst.add(partial_block_len),
-                0,
-                VARLEN_BLOCK_SIZE - partial_block_len,
-            );
-            *dst.add(VARLEN_BLOCK_SIZE) = len_byte;
-        } else {
-            // Descending: invert every value byte. A u64-stride XOR gives LLVM a vectorizable
-            // inner loop; the tail handles the partial block byte-wise.
-            for _ in 0..full_to_write {
-                xor_copy_block(src, dst);
-                *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF
-                src = src.add(VARLEN_BLOCK_SIZE);
-                dst = dst.add(VARLEN_BLOCK_TOTAL);
-            }
-            for i in 0..partial_block_len {
-                *dst.add(i) = *src.add(i) ^ 0xFF;
-            }
-            std::ptr::write_bytes(
-                dst.add(partial_block_len),
-                0xFF, // 0x00 XOR 0xFF
-                VARLEN_BLOCK_SIZE - partial_block_len,
-            );
-            *dst.add(VARLEN_BLOCK_SIZE) = len_byte ^ 0xFF;
-        }
-    }
-    u32::try_from(total).vortex_expect("encoded varlen byte length fits u32")
-}
-
-/// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the
-/// four u64-wide iterations into SIMD on x86.
-///
-/// # Safety
-/// `src` must be valid for 32 reads, `dst` valid for 32 writes, and the regions must not
-/// overlap.
-#[inline(always)]
-unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) {
-    // Four u64 lanes of 8 bytes each = 32 bytes total.
-    for i in 0..4 {
-        let off = i * 8;
-        // SAFETY: the caller guarantees src/dst are valid for the full 32-byte block.
-        let v = unsafe { std::ptr::read_unaligned(src.add(off) as *const u64) };
-        unsafe { std::ptr::write_unaligned(dst.add(off) as *mut u64, v ^ u64::MAX) };
-    }
-}
-
-/// Internal trait for encoding a fixed-width native value into byte slots.
-///
-/// Implementations must produce a sequence of `size_of::<Self>()` bytes that is
-/// lexicographically byte-comparable according to the natural ordering of the type.
-pub(crate) trait RowEncode: Copy {
-    /// Encode this value into `out`, inverting the bytes for descending order.
-    fn encode_to(self, out: &mut [u8], descending: bool);
-}
-
-macro_rules! impl_row_encode_unsigned {
-    ($t:ty) => {
-        impl RowEncode for $t {
-            #[inline]
-            fn encode_to(self, out: &mut [u8], descending: bool) {
-                let bytes = self.to_be_bytes();
-                if descending {
-                    for (i, b) in bytes.iter().enumerate() {
-                        out[i] = b ^ 0xFF;
-                    }
-                } else {
-                    out.copy_from_slice(&bytes);
-                }
-            }
-        }
-    };
-}
-
-macro_rules! impl_row_encode_signed {
-    ($t:ty) => {
-        impl RowEncode for $t {
-            #[inline]
-            fn encode_to(self, out: &mut [u8], descending: bool) {
-                let mut bytes = self.to_be_bytes();
-                // Flip sign bit so negatives < non-negatives lexicographically.
-                bytes[0] ^= 0x80;
-                if descending {
-                    for (i, b) in bytes.iter().enumerate() {
-                        out[i] = b ^ 0xFF;
-                    }
-                } else {
-                    out.copy_from_slice(&bytes);
-                }
-            }
-        }
-    };
-}
-
-impl_row_encode_unsigned!(u8);
-impl_row_encode_unsigned!(u16);
-impl_row_encode_unsigned!(u32);
-impl_row_encode_unsigned!(u64);
-impl_row_encode_signed!(i8);
-impl_row_encode_signed!(i16);
-impl_row_encode_signed!(i32);
-impl_row_encode_signed!(i64);
-impl_row_encode_signed!(i128);
-
-impl RowEncode for f32 {
-    fn encode_to(self, out: &mut [u8], descending: bool) {
-        let bits = self.to_bits();
-        let mask: u32 = if (bits >> 31) == 0 {
-            0x8000_0000
-        } else {
-            0xFFFF_FFFF
-        };
-        let mut bytes = (bits ^ mask).to_be_bytes();
-        if descending {
-            for b in bytes.iter_mut() {
-                *b ^= 0xFF;
-            }
-        }
-        out.copy_from_slice(&bytes);
-    }
-}
-
-impl RowEncode for f64 {
-    fn encode_to(self, out: &mut [u8], descending: bool) {
-        let bits = self.to_bits();
-        let mask: u64 = if (bits >> 63) == 0 {
-            0x8000_0000_0000_0000
-        } else {
-            0xFFFF_FFFF_FFFF_FFFF
-        };
-        let mut bytes = (bits ^ mask).to_be_bytes();
-        if descending {
-            for b in bytes.iter_mut() {
-                *b ^= 0xFF;
-            }
-        }
-        out.copy_from_slice(&bytes);
-    }
-}
-
-impl RowEncode for f16 {
-    fn encode_to(self, out: &mut [u8], descending: bool) {
-        let bits = self.to_bits();
-        let mask: u16 = if (bits >> 15) == 0 { 0x8000 } else { 0xFFFF };
-        let mut bytes = (bits ^ mask).to_be_bytes();
-        if descending {
-            for b in bytes.iter_mut() {
-                *b ^= 0xFF;
-            }
-        }
-        out.copy_from_slice(&bytes);
-    }
-}
diff --git a/vortex-row/src/codec/encoding.rs b/vortex-row/src/codec/encoding.rs
new file mode 100644
index 00000000000..c3e90641b2c
--- /dev/null
+++ b/vortex-row/src/codec/encoding.rs
@@ -0,0 +1,593 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Encode pass leaf kernels: per-row byte writers for each canonical variant, plus the
+//! variable-length block body encoder.
+
+use super::*;
+
+pub(super) fn encode_null(
+    arr: &NullArray,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+) {
+    let sentinel = fixed_null_sentinel(field);
+    for i in 0..arr.len() {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        out[pos] = sentinel;
+        col_offset[i] += 1;
+    }
+}
+
+/// Encodes a bool column as two bytes per row: a validity sentinel followed by a value byte.
+/// Null rows store the null sentinel and a zero value byte; every row advances its cursor by
+/// `BOOL_ENCODED_SIZE`.
+pub(super) fn encode_bool(
+    arr: &BoolArray,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let bits = arr.clone().into_bit_buffer();
+    // false=0x01, true=0x02 so false < true; the value byte is XOR-ed with 0xFF for descending.
+    let flip: u8 = if field.descending { 0xFF } else { 0x00 };
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for i in 0..bits.len() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                out[pos] = FIXED_NON_NULL_SENTINEL;
+                out[pos + 1] = (u8::from(bits.value(i)) + 1) ^ flip;
+                col_offset[i] += BOOL_ENCODED_SIZE;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = fixed_null_sentinel(field);
+            for i in 0..bits.len() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                let (lead, value) = if mask.value(i) {
+                    (FIXED_NON_NULL_SENTINEL, (u8::from(bits.value(i)) + 1) ^ flip)
+                } else {
+                    (null, 0)
+                };
+                out[pos] = lead;
+                out[pos + 1] = value;
+                col_offset[i] += BOOL_ENCODED_SIZE;
+            }
+        }
+    }
+    Ok(())
+}
+
+pub(super) fn encode_primitive(
+    arr: &PrimitiveArray,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| {
+        encode_primitive_typed::<T>(arr, field, row_offsets, col_offset, out, ctx)?;
+    });
+    Ok(())
+}
+
+/// Writes `sentinel + encoded value` for every row of a primitive column at the row's current
+/// cursor (`row_offsets[i] + col_offset[i]`), then advances the cursor by `1 + size_of::<T>()`.
+fn encode_primitive_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let slice: &[T] = arr.as_slice();
+    let non_null = FIXED_NON_NULL_SENTINEL;
+    let value_bytes = size_of::<T>();
+    let stride = encoded_size_for_fixed(byte_width_u32(value_bytes));
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (i, &v) in slice.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                out[pos] = non_null;
+                v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+                col_offset[i] += stride;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = fixed_null_sentinel(field);
+            for (i, &v) in slice.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                if mask.value(i) {
+                    out[pos] = non_null;
+                    v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+                } else {
+                    out[pos] = null;
+                    // Null rows keep the full slot width: zero the value bytes.
+                    out[pos + 1..pos + 1 + value_bytes].fill(0);
+                }
+                col_offset[i] += stride;
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Narrow a decimal array whose physical `values_type` is wider than its precision-minimal
+/// type down to that minimal type, returning `None` when it already uses the minimal width.
+///
+/// Row-encoded widths are a pure function of the logical dtype: [`row_width_for_dtype`] sizes a
+/// decimal column from [`DecimalType::smallest_decimal_value_type`] (the smallest physical type
+/// that can hold the declared precision), independent of how the producer happened to store the
+/// values. A `DecimalArray` may legally carry a wider `values_type` than its precision requires,
+/// so without this normalization the encode pass would write more bytes than the size pass
+/// reserved. The narrowing is always lossless because a decimal's precision bounds the magnitude
+/// of every valid value, so the precision-minimal type can represent it.
+fn narrow_decimal_to_smallest(arr: &DecimalArray) -> VortexResult<Option<DecimalArray>> {
+    let decimal_dtype = arr.decimal_dtype();
+    let target = DecimalType::smallest_decimal_value_type(&decimal_dtype);
+    if arr.values_type() == target {
+        return Ok(None);
+    }
+    let validity = arr.as_ref().validity()?;
+    let narrowed = match_each_decimal_value_type!(arr.values_type(), |P| {
+        let from = arr.buffer::<P>();
+        match_each_decimal_value_type!(target, |Q| {
+            DecimalArray::new::<Q>(narrow_decimal_buffer::<P, Q>(from), decimal_dtype, validity)
+        })
+    });
+    Ok(Some(narrowed))
+}
+
+/// Narrow a buffer of decimal values from type `F` to a smaller type `T`. Lossless because the
+/// caller only narrows to the precision-minimal type, which can represent every valid value.
+fn narrow_decimal_buffer<F: NativeDecimalType, T: NativeDecimalType>(from: Buffer<F>) -> Buffer<T> {
+    from.iter()
+        .map(|&v| T::from(v).vortex_expect("decimal value must fit its precision-minimal type"))
+        .collect()
+}
+
+pub(super) fn encode_decimal(
+    arr: &DecimalArray,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // Normalize to the precision-minimal physical type so the bytes we write match the width the
+    // size pass reserved (see `narrow_decimal_to_smallest`).
+    let narrowed = narrow_decimal_to_smallest(arr)?;
+    let arr = narrowed.as_ref().unwrap_or(arr);
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    match arr.values_type() {
+        DecimalType::I8 => {
+            encode_decimal_typed::<i8>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I16 => {
+            encode_decimal_typed::<i16>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I32 => {
+            encode_decimal_typed::<i32>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I64 => {
+            encode_decimal_typed::<i64>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I128 => {
+            encode_decimal_typed::<i128>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I256 => {
+            vortex_bail!("row encoding for Decimal256 is not yet implemented")
+        }
+    }
+    Ok(())
+}
+
+/// Writes `sentinel + encoded value` for every row of a decimal column whose values are stored
+/// as `T`; null rows get the null sentinel plus zeroed value bytes so the slot width is constant.
+fn encode_decimal_typed<T>(
+    arr: &DecimalArray,
+    mask: &vortex_mask::Mask,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+) where
+    T: NativeDecimalType + RowEncode,
+{
+    let non_null = FIXED_NON_NULL_SENTINEL;
+    let null = fixed_null_sentinel(field);
+    let value_bytes = size_of::<T>();
+    let total = encoded_size_for_fixed(byte_width_u32(value_bytes));
+    let slice = arr.buffer::<T>();
+    for i in 0..slice.len() {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        if mask.value(i) {
+            out[pos] = non_null;
+            slice[i].encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+        } else {
+            out[pos] = null;
+            out[pos + 1..pos + 1 + value_bytes].fill(0);
+        }
+        col_offset[i] += total;
+    }
+}
+
+pub(super) fn encode_varbinview(
+    arr: &VarBinViewArray,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let null_byte = varlen_null_sentinel(field);
+    let empty_byte = varlen_empty_sentinel(field);
+    let non_empty_byte = varlen_non_empty_sentinel(field);
+    let descending = field.descending;
+
+    let views = arr.views();
+    // Cache the data-buffer slices once. Inlined views (len <= 12) carry their bytes inline,
+    // so they never touch `buffers`; referenced views index into the pre-validated buffer at
+    // `offset..offset + len`. Walking views directly avoids the per-row bounds and branch work
+    // of `with_iterator`.
+    let buffers: smallvec::SmallVec<[&[u8]; 4]> = (0..arr.data_buffers().len())
+        .map(|i| arr.buffer(i).as_slice())
+        .collect();
+
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (i, view) in views.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                let len = view.len() as usize;
+                if len == 0 {
+                    out[pos] = empty_byte;
+                    col_offset[i] += VARLEN_EMPTY_SIZE;
+                    continue;
+                }
+                let bytes: &[u8] = if view.is_inlined() {
+                    view.as_inlined().value()
+                } else {
+                    let r = view.as_view();
+                    let off = r.offset as usize;
+                    &buffers[r.buffer_index as usize][off..off + len]
+                };
+                out[pos] = non_empty_byte;
+                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending)?;
+                col_offset[i] += 1 + written;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            for (i, view) in views.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                if !mask.value(i) {
+                    out[pos] = null_byte;
+                    col_offset[i] += VARLEN_NULL_SIZE;
+                    continue;
+                }
+                let len = view.len() as usize;
+                if len == 0 {
+                    out[pos] = empty_byte;
+                    col_offset[i] += VARLEN_EMPTY_SIZE;
+                    continue;
+                }
+                let bytes: &[u8] = if view.is_inlined() {
+                    view.as_inlined().value()
+                } else {
+                    let r = view.as_view();
+                    let off = r.offset as usize;
+                    &buffers[r.buffer_index as usize][off..off + len]
+                };
+                out[pos] = non_empty_byte;
+                let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending)?;
+                col_offset[i] += 1 + written;
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Encodes a struct column: one validity sentinel per row followed by each child's encoding,
+/// with null parent rows rewritten to a canonical null body per child.
+pub(super) fn encode_struct(
+    arr: &StructArray,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let n = arr.len();
+    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
+    let non_null = FIXED_NON_NULL_SENTINEL;
+    let null = fixed_null_sentinel(field);
+
+    // Write the outer sentinel for each row.
+    for i in 0..n {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        out[pos] = if mask.value(i) { non_null } else { null };
+        col_offset[i] += 1;
+    }
+
+    // Encode each child. For non-null parent rows the child contributes its actual encoding;
+    // for null parent rows the child contributes its canonical null encoding so that two null
+    // parent rows produce byte-equal output regardless of underlying child values.
+    for child in arr.iter_unmasked_fields() {
+        match row_width_for_dtype(child.dtype())? {
+            RowWidth::Fixed(w) => {
+                let canonical = child.clone().execute::<Canonical>(ctx)?;
+                field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?;
+                // Replace null parent rows with the canonical null encoding (the same as a
+                // child-level null: null sentinel followed by zero-padded value bytes).
+                let null_byte = child_canonical_null_byte(child.dtype(), field);
+                for i in 0..n {
+                    if !mask.value(i) {
+                        let end = (row_offsets[i] + col_offset[i]) as usize;
+                        let start = end - w as usize;
+                        out[start] = null_byte;
+                        out[start + 1..end].fill(0);
+                    }
+                }
+            }
+            RowWidth::Variable => {
+                encode_variable_child(child, field, &mask, row_offsets, col_offset, out, ctx)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Encodes a fixed-size-list column: one validity sentinel per row followed by the encodings of
+/// its `list_size` elements; null parent rows get one canonical null encoding per element.
+pub(super) fn encode_fsl(
+    arr: &FixedSizeListArray,
+    field: RowSortFieldOptions,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let nrows = arr.len();
+    // `list_size` is natively a `u32`; keep both forms (see `add_size_fsl`).
+    let list_size_u32 = arr.list_size();
+    let list_size = list_size_u32 as usize;
+    let mask = arr.as_ref().validity()?.execute_mask(nrows, ctx)?;
+    let non_null = FIXED_NON_NULL_SENTINEL;
+    let null = fixed_null_sentinel(field);
+    let elem_dtype = arr.elements().dtype().clone();
+
+    // Outer sentinel.
+    for i in 0..nrows {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        out[pos] = if mask.value(i) { non_null } else { null };
+        col_offset[i] += 1;
+    }
+
+    match row_width_for_dtype(&elem_dtype)? {
+        RowWidth::Fixed(w) => {
+            // Fixed-width elements: encode the elements array directly (its length is
+            // nrows * list_size) using a derived (offsets, cursors) pair. Then overwrite
+            // the body of null parent rows with the canonical null encoding per element.
+            let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+            debug_assert_eq!(elements.len(), nrows * list_size);
+            let row_body_bytes = w
+                .checked_mul(list_size_u32)
+                .ok_or_else(|| vortex_err!("FSL body width overflow"))?;
+            let mut elem_offsets = vec![0u32; nrows * list_size];
+            for i in 0..nrows {
+                let base = row_offsets[i] + col_offset[i];
+                for j in 0u32..list_size_u32 {
+                    elem_offsets[i * list_size + j as usize] = base + j * w;
+                }
+            }
+            let mut elem_cursors = vec![0u32; nrows * list_size];
+            field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?;
+            for i in 0..nrows {
+                col_offset[i] = col_offset[i]
+                    .checked_add(row_body_bytes)
+                    .ok_or_else(|| vortex_err!("FSL row body overflow"))?;
+            }
+            // Canonical null body for null parent rows: one null encoding per element.
+            let null_byte = child_canonical_null_byte(&elem_dtype, field);
+            let elem_width = w as usize;
+            for i in 0..nrows {
+                if !mask.value(i) {
+                    let end = (row_offsets[i] + col_offset[i]) as usize;
+                    let start = end - row_body_bytes as usize;
+                    let mut pos = start;
+                    for _ in 0..list_size {
+                        out[pos] = null_byte;
+                        out[pos + 1..pos + elem_width].fill(0);
+                        pos += elem_width;
+                    }
+                }
+            }
+        }
+        RowWidth::Variable => {
+            // Variable-width elements: for null parent rows the canonical body is exactly
+            // `list_size` null sentinel bytes (one per element). For non-null parent rows,
+            // encode each element via a scratch buffer and copy into out.
+            let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+            debug_assert_eq!(elements.len(), nrows * list_size);
+            let mut elem_sizes = vec![0u32; nrows * list_size];
+            field_size(&elements, field, &mut elem_sizes, ctx)?;
+            let total: u64 = elem_sizes.iter().map(|&s| u64::from(s)).sum();
+            let total_usize =
+                usize::try_from(total).vortex_expect("FSL scratch buffer size fits usize");
+            let mut scratch = vec![0u8; total_usize];
+            let mut scratch_offsets = Vec::with_capacity(nrows * list_size);
+            let mut acc: u32 = 0;
+            for &s in &elem_sizes {
+                scratch_offsets.push(acc);
+                acc = acc
+                    .checked_add(s)
+                    .ok_or_else(|| vortex_err!("FSL scratch offset overflow"))?;
+            }
+            let mut scratch_cursors = vec![0u32; nrows * list_size];
+            field_encode(
+                &elements,
+                field,
+                &scratch_offsets,
+                &mut scratch_cursors,
+                &mut scratch,
+                ctx,
+            )?;
+            let null_byte = child_canonical_null_byte(&elem_dtype, field);
+            for i in 0..nrows {
+                let dst = (row_offsets[i] + col_offset[i]) as usize;
+                if mask.value(i) {
+                    let mut body_bytes: u32 = 0;
+                    for j in 0..list_size {
+                        let k = i * list_size + j;
+                        let src = scratch_offsets[k] as usize;
+                        let sz = elem_sizes[k] as usize;
+                        out[dst + body_bytes as usize..dst + body_bytes as usize + sz]
+                            .copy_from_slice(&scratch[src..src + sz]);
+                        body_bytes = body_bytes
+                            .checked_add(elem_sizes[k])
+                            .ok_or_else(|| vortex_err!("FSL body bytes overflow"))?;
+                    }
+                    col_offset[i] = col_offset[i]
+                        .checked_add(body_bytes)
+                        .ok_or_else(|| vortex_err!("FSL row offset overflow"))?;
+                } else {
+                    for offset in 0..list_size {
+                        out[dst + offset] = null_byte;
+                    }
+                    col_offset[i] = col_offset[i]
+                        .checked_add(list_size_u32)
+                        .ok_or_else(|| vortex_err!("FSL row offset overflow"))?;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Encode one variable-width child of a struct: for non-null parent rows, copy the child's
+/// natural encoding from a scratch buffer; for null parent rows, write a single
+/// `child_canonical_null_byte`.
+fn encode_variable_child(
+    child: &vortex_array::ArrayRef,
+    field: RowSortFieldOptions,
+    parent_mask: &vortex_mask::Mask,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let n = child.len();
+    let canonical = child.clone().execute::<Canonical>(ctx)?;
+
+    // Size and encode the child into a sequential scratch buffer.
+    let mut child_sizes = vec![0u32; n];
+    field_size(&canonical, field, &mut child_sizes, ctx)?;
+    let total: u64 = child_sizes.iter().map(|&s| u64::from(s)).sum();
+    let total_usize = usize::try_from(total).vortex_expect("child scratch buffer size fits usize");
+    let mut scratch = vec![0u8; total_usize];
+    let mut scratch_offsets = Vec::with_capacity(n);
+    let mut acc: u32 = 0;
+    for &s in &child_sizes {
+        scratch_offsets.push(acc);
+        acc = acc
+            .checked_add(s)
+            .ok_or_else(|| vortex_err!("child scratch offset overflow"))?;
+    }
+    let mut scratch_cursors = vec![0u32; n];
+    field_encode(
+        &canonical,
+        field,
+        &scratch_offsets,
+        &mut scratch_cursors,
+        &mut scratch,
+        ctx,
+    )?;
+
+    let null_byte = child_canonical_null_byte(child.dtype(), field);
+    for i in 0..n {
+        let dst = (row_offsets[i] + col_offset[i]) as usize;
+        if parent_mask.value(i) {
+            let src = scratch_offsets[i] as usize;
+            let sz = child_sizes[i] as usize;
+            out[dst..dst + sz].copy_from_slice(&scratch[src..src + sz]);
+            col_offset[i] = col_offset[i]
+                .checked_add(child_sizes[i])
+                .ok_or_else(|| vortex_err!("col_offset overflow"))?;
+        } else {
+            out[dst] = null_byte;
+            col_offset[i] = col_offset[i]
+                .checked_add(1)
+                .ok_or_else(|| vortex_err!("col_offset overflow"))?;
+        }
+    }
+    Ok(())
+}
+
+/// Arithmetic-write primitive encoder: writes each row's `sentinel + value` slot at a
+/// constant within-row offset, iterating the output in `row_stride`-sized chunks so the
+/// compiler can drop the per-row offset/cursor indirection.
+pub(super) fn encode_primitive_arith(
+    arr: &PrimitiveArray,
+    field: RowSortFieldOptions,
+    col_prefix: u32,
+    row_stride: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| {
+        encode_primitive_arith_typed::<T>(arr, field, col_prefix, row_stride, out, ctx)?;
+    });
+    Ok(())
+}
+
+/// Typed kernel behind `encode_primitive_arith`: row `i` lives in the `i`-th `row_stride`-byte
+/// chunk of `out`, and this column's `sentinel + value` slot starts `col_prefix` bytes into it.
+fn encode_primitive_arith_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: RowSortFieldOptions,
+    col_prefix: u32,
+    row_stride: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let slice: &[T] = arr.as_slice();
+    let non_null = FIXED_NON_NULL_SENTINEL;
+    let value_bytes = size_of::<T>();
+    let slot_size = 1 + value_bytes;
+    let stride = row_stride as usize;
+    let prefix = col_prefix as usize;
+    let descending = field.descending;
+
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            // Hot path: each row's slot is a fixed window inside its `stride`-sized chunk,
+            // so the inner write vectorizes the same way as `arrow-row`'s not-null path.
+            for (chunk, &v) in out.chunks_exact_mut(stride).zip(slice.iter()) {
+                let slot = &mut chunk[prefix..prefix + slot_size];
+                slot[0] = non_null;
+                v.encode_to(&mut slot[1..], descending);
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = fixed_null_sentinel(field);
+            for (i, (chunk, &v)) in out.chunks_exact_mut(stride).zip(slice.iter()).enumerate() {
+                let slot = &mut chunk[prefix..prefix + slot_size];
+                if mask.value(i) {
+                    slot[0] = non_null;
+                    v.encode_to(&mut slot[1..], descending);
+                } else {
+                    slot[0] = null;
+                    slot[1..].fill(0);
+                }
+            }
+        }
+    }
+    Ok(())
+}
diff --git a/vortex-row/src/codec/mod.rs b/vortex-row/src/codec/mod.rs
new file mode 100644
index 00000000000..d5c8afbcfb7
--- /dev/null
+++ b/vortex-row/src/codec/mod.rs
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants.
+//!
+//! The encoded byte format produces a lexicographically byte-comparable representation:
+//! comparing the byte slices of two encoded rows yields the same ordering as the
+//! original logical (tuple) comparison of their values, modulo nulls placement and
+//! descending-ness as configured by [`RowSortFieldOptions`].
+//!
+//! Conventions:
+//! - Every fixed-width value is preceded by a 1-byte sentinel that orders nulls relative to
+//!   non-nulls. For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF),
+//!   not the sentinel.
+//! - Variable-length (Utf8, Binary) values use **three** distinct leading sentinels — one each
+//!   for null, empty, and non-empty — so byte comparison at position 0 fully categorizes the
+//!   value and column-byte boundaries stay aligned across rows. See
+//!   [`varlen_null_sentinel`], [`varlen_empty_sentinel`], [`varlen_non_empty_sentinel`].
+//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types.
+//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top
+//!   bit; negative flips all bits.
+//! - Nullable structs and fixed-size lists encode null parent rows with a **canonical null
+//!   body** so two null parent rows produce byte-equal encodings: fixed-width children
+//!   contribute their fixed null encoding, and variable-width children collapse to a single
+//!   null sentinel byte.
+
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::DecimalArray;
+use vortex_array::arrays::FixedSizeListArray;
+use vortex_array::arrays::NullArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt;
+use vortex_array::arrays::struct_::StructArrayExt;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::DecimalType;
+use vortex_array::dtype::NativeDecimalType;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::half::f16;
+use vortex_array::match_each_decimal_value_type;
+use vortex_array::match_each_native_ptype;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_error::vortex_err;
+
+use crate::options::RowSortFieldOptions;
+
+/// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte).
+pub(crate) const BOOL_ENCODED_SIZE: u32 = 2;
+
+/// Block size used in the variable-length encoding.
+pub(crate) const VARLEN_BLOCK_SIZE: usize = 32;
+/// Total bytes per varlen block including the trailing continuation marker.
+pub(crate) const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1;
+const VARLEN_BLOCK_TOTAL_U32: u32 = 33;
+
+/// Size in bytes of an encoded null varlen value (just the sentinel byte).
+pub(crate) const VARLEN_NULL_SIZE: u32 = 1;
+/// Size in bytes of an encoded empty varlen value (just the sentinel byte).
+pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1;
+
+/// Returns the size in bytes of the encoded form of a non-empty variable-length value.
+///
+/// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1
+/// continuation/length byte). Callers must use [`VARLEN_NULL_SIZE`] for null values and
+/// [`VARLEN_EMPTY_SIZE`] for empty values.
+///
+/// # Errors
+///
+/// Returns an error if the encoded length overflows `u32`. The block count itself always fits
+/// (a `BinaryView` length is a `u32`, so `blocks <= ceil(u32::MAX / 32) = 2^27`), but the
+/// `blocks * 33 + 1` byte total can exceed `u32::MAX` for multi-gigabyte values.
+#[inline]
+fn encoded_size_for_non_empty_varlen(len: usize) -> VortexResult<u32> {
+    debug_assert!(len > 0);
+    let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE))
+        .vortex_expect("varlen block count must fit in u32");
+    blocks
+        .checked_mul(VARLEN_BLOCK_TOTAL_U32)
+        .and_then(|b| b.checked_add(1))
+        .ok_or_else(|| vortex_err!("varlen encoded size overflows u32"))
+}
+
+/// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel).
+#[inline]
+const fn encoded_size_for_fixed(value_bytes: u32) -> u32 {
+    1 + value_bytes
+}
+
+/// A native byte width (at most 32 for `i256`) always fits in a `u32`.
+#[inline]
+fn byte_width_u32(width: usize) -> u32 {
+    u32::try_from(width).vortex_expect("native byte width must fit in u32")
+}
+
+/// Pre-resolved per-row validity for the row encoders.
+///
+/// Encoders pattern-match on this once before their inner loop so the no-nulls fast path
+/// avoids per-row `mask.value(i)` branches entirely, and the nullable path materializes the
+/// mask exactly once.
+pub(crate) enum ValidityKind {
+    /// Column statically has no nulls (`Validity::NonNullable` or `AllValid`); no mask needed.
+    AllValid,
+    /// Column may have nulls; carries the materialized per-row mask.
+    Mask(vortex_mask::Mask),
+}
+
+/// Resolve a [`Validity`] into a [`ValidityKind`], materializing the mask only when the column
+/// may actually have nulls.
+#[inline]
+pub(crate) fn resolve_validity(
+    validity: Validity,
+    len: usize,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ValidityKind> {
+    Ok(match validity {
+        Validity::NonNullable | Validity::AllValid => ValidityKind::AllValid,
+        other => ValidityKind::Mask(other.execute_mask(len, ctx)?),
+    })
+}
+
+/// Returns the sentinel byte for a null varlen value.
+///
+/// The choice is positional (0x00 when nulls sort first, 0xFF when nulls sort last) and
+/// independent of `descending`, matching the convention used by `arrow-row`.
+#[inline]
+fn varlen_null_sentinel(field: RowSortFieldOptions) -> u8 {
+    if field.nulls_first { 0x00 } else { 0xFF }
+}
+
+/// Returns the sentinel byte for an empty varlen value.
+///
+/// Equal to `0x01` in ascending mode and `!0x01 = 0xFE` in descending mode.
+#[inline]
+fn varlen_empty_sentinel(field: RowSortFieldOptions) -> u8 {
+    if field.descending { !0x01u8 } else { 0x01u8 }
+}
+
+/// Returns the sentinel byte for a non-empty varlen value.
+///
+/// Equal to `0x02` in ascending mode and `!0x02 = 0xFD` in descending mode.
+#[inline]
+fn varlen_non_empty_sentinel(field: RowSortFieldOptions) -> u8 {
+    if field.descending { !0x02u8 } else { 0x02u8 }
+}
+
+/// The sentinel byte that precedes a non-null fixed-width value.
+///
+/// Fixed-width values always lead with `0x01`. Null values use a sentinel that sorts either
+/// below (`0x00`) or above (`0x02`) it (see [`fixed_null_sentinel`]), so a single leading-byte
+/// comparison orders nulls relative to non-nulls. Unlike the value bytes, this sentinel is never
+/// inverted for `descending`: null placement is positional and independent of sort direction.
+const FIXED_NON_NULL_SENTINEL: u8 = 0x01;
+
+/// Returns the sentinel byte that precedes a null fixed-width value.
+///
+/// `nulls_first` writes `0x00` (sorts before the [`FIXED_NON_NULL_SENTINEL`] `0x01`); otherwise
+/// `0x02` (sorts after). Like the non-null sentinel, the choice is positional and independent of
+/// `descending`, matching the convention used by `arrow-row`.
+#[inline]
+fn fixed_null_sentinel(field: RowSortFieldOptions) -> u8 {
+    if field.nulls_first { 0x00 } else { 0x02 }
+}
+
+/// Picks the one-byte null sentinel a child writes as its canonical null encoding when its
+/// parent struct/FSL row is null.
+///
+/// `Utf8` and `Binary` children use the varlen null sentinel. Every other dtype, including a
+/// nested struct/FSL acting as a variable-width child, uses the fixed-width null sentinel.
+fn child_canonical_null_byte(child_dtype: &DType, field: RowSortFieldOptions) -> u8 {
+    if matches!(child_dtype, DType::Utf8(_) | DType::Binary(_)) {
+        return varlen_null_sentinel(field);
+    }
+    fixed_null_sentinel(field)
+}
+
+/// Per-row width classification for a column.
+///
+/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless
+/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary, or
+/// any struct / fixed-size list that recurses through a variable-width field).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum RowWidth {
+    /// Per-row width is the same constant for every row in the column.
+    Fixed(u32),
+    /// Per-row width is data-dependent.
+    Variable,
+}
+
+/// Classify a column's per-row encoded width from its [`DType`] alone.
+///
+/// The result is `Fixed(w)` when every row takes exactly `w` bytes (sentinel + value) whether
+/// or not it is null, and `Variable` when the per-row size depends on the data.
+///
+/// `Null`, `Bool`, primitive and decimal columns are fixed-width. `Utf8` and `Binary` are
+/// variable. A fixed-size list or struct is fixed-width only if everything nested inside it
+/// is, in which case it adds one sentinel byte of its own to the nested widths.
+///
+/// No [`RowSortFieldOptions`] is taken: the classification is a function of the dtype only.
+///
+/// # Errors
+///
+/// Fails for dtypes the row encoder does not support (`List`, `Variant`, `Union`, decimals
+/// that need a 256-bit value type, and any other dtype not listed above), and when a
+/// fixed-size list or struct width would overflow `u32`.
+pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
+    match dtype {
+        DType::Null => Ok(RowWidth::Fixed(1)),
+        DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)),
+        DType::Primitive(ptype, _) => {
+            let value_width = byte_width_u32(ptype.byte_width());
+            Ok(RowWidth::Fixed(encoded_size_for_fixed(value_width)))
+        }
+        DType::Decimal(dt, _) => {
+            let vt = DecimalType::smallest_decimal_value_type(dt);
+            if matches!(vt, DecimalType::I256) {
+                vortex_bail!("row encoding for Decimal256 is not yet implemented");
+            }
+            let value_width = byte_width_u32(vt.byte_width());
+            Ok(RowWidth::Fixed(encoded_size_for_fixed(value_width)))
+        }
+        DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable),
+        DType::FixedSizeList(elem, n, _) => {
+            // A fixed-size list is fixed-width only when its element type is.
+            let RowWidth::Fixed(w) = row_width_for_dtype(elem)? else {
+                return Ok(RowWidth::Variable);
+            };
+            // One sentinel byte for the list itself plus `n` copies of the element width.
+            let total = w
+                .checked_mul(*n)
+                .and_then(|body| body.checked_add(1))
+                .ok_or_else(|| vortex_err!("FSL row width overflows u32"))?;
+            Ok(RowWidth::Fixed(total))
+        }
+        DType::Struct(fields, _) => {
+            // A struct is fixed-width only when every field is; its width is one outer
+            // sentinel byte plus the sum of the field widths.
+            let mut total: u32 = 1;
+            for field_dtype in fields.fields() {
+                let RowWidth::Fixed(w) = row_width_for_dtype(&field_dtype)? else {
+                    return Ok(RowWidth::Variable);
+                };
+                total = total
+                    .checked_add(w)
+                    .ok_or_else(|| vortex_err!("Struct row width overflows u32"))?;
+            }
+            Ok(RowWidth::Fixed(total))
+        }
+        DType::List(..) => {
+            vortex_bail!(
+                "row encoding does not support variable-size List arrays (no well-defined ordering)"
+            )
+        }
+        DType::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+        DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"),
+        dtype => vortex_bail!("row encoding does not support dtype: {dtype:?}"),
+    }
+}
+
+/// Add this column's per-row encoded size, in bytes, to each entry of `sizes`.
+///
+/// `sizes` must be initialized (typically zeroed): sizes are *added* so columns can share one
+/// buffer. Bool is sized with `BOOL_ENCODED_SIZE`, matching [`row_width_for_dtype`].
+///
+/// # Errors
+///
+/// Returns an error for unsupported canonical variants or if a per-row size overflows `u32`.
+pub(crate) fn field_size(
+    canonical: &Canonical,
+    field: RowSortFieldOptions,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Null(arr) => add_size_null(arr, sizes)?,
+        Canonical::Bool(_) => add_size_const(sizes, BOOL_ENCODED_SIZE)?,
+        Canonical::Primitive(arr) => add_size_primitive(arr, sizes)?,
+        Canonical::Decimal(arr) => add_size_decimal(arr, sizes)?,
+        Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
+        Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?,
+        Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?,
+        Canonical::List(_) => vortex_bail!(
+            "row encoding does not support canonical List arrays: {:?}",
+            canonical.dtype()
+        ),
+        Canonical::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+        unsupported => {
+            vortex_bail!(
+                "row encoding does not support canonical array: {:?}",
+                unsupported.dtype()
+            )
+        }
+    }
+    Ok(())
+}
+
+/// Encode a fixed-width column whose within-row position is known arithmetically, so no
+/// per-row cursor has to be read or advanced.
+///
+/// Row `i` is written starting at `i * row_stride + col_prefix (+ var_prefix[i])`, where
+/// `var_prefix` is the exclusive prefix sum of the varlen contributions and is `None` when
+/// the row layout has no variable-length columns. This is the fast path for fixed-width
+/// columns that appear before any varlen column: their offset inside the row is a constant
+/// rather than a running cursor.
+///
+/// Primitive columns in the pure-fixed case (`var_prefix` is `None`) are handed to
+/// `encode_primitive_arith`, a `chunks_exact_mut` hot loop without per-row offset/cursor
+/// indirection (matching `arrow-row`'s `encode_not_null`). Every other case materializes the
+/// per-row start offsets and reuses [`field_encode`], so the bytes written are identical to
+/// the cursor path.
+///
+/// At most `nrows` entries of `var_prefix` are read.
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn field_encode_fixed_arithmetic(
+    canonical: &Canonical,
+    field: RowSortFieldOptions,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    nrows: usize,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // Pure-fixed primitive fast path: no offsets need to be materialized.
+    if let (None, Canonical::Primitive(arr)) = (var_prefix, canonical) {
+        return encode_primitive_arith(arr, field, col_prefix, row_stride, out, ctx);
+    }
+
+    // General path. Row starts are `col_prefix`, `col_prefix + row_stride`, ... accumulated
+    // with wrapping adds, exactly as a running `base` would be.
+    let row_starts =
+        std::iter::successors(Some(col_prefix), |start| Some(start.wrapping_add(row_stride)));
+    let mut offsets: Vec<u32> = Vec::with_capacity(nrows);
+    // `var_prefix[i]`, when present, shifts row `i` by the varlen prefix sum for that row.
+    match var_prefix {
+        None => offsets.extend(row_starts.take(nrows)),
+        Some(vp) => offsets.extend(
+            vp.iter()
+                .take(nrows)
+                .zip(row_starts)
+                .map(|(&p, start)| start.wrapping_add(p)),
+        ),
+    }
+    // Zero-initialized cursors make `field_encode` write each row at exactly `offsets[i]`,
+    // producing the same bytes the cursor path would.
+    let mut cursors = vec![0u32; nrows];
+    field_encode(canonical, field, &offsets, &mut cursors, out, ctx)
+}
+
+/// Encode each row of the given canonical view into `out`: row `i` is written starting at
+/// `offsets[i] + cursors[i]`, and `cursors[i]` is advanced by the number of bytes written.
+///
+/// On success every `cursors[i]` has advanced by exactly the per-row contribution that
+/// [`field_size`] computes for the same column. `List`, `Variant` and any other canonical
+/// variant without an encoder below are rejected with an error.
+pub(crate) fn field_encode(
+    canonical: &Canonical,
+    field: RowSortFieldOptions,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out),
+        Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::List(_) => vortex_bail!(
+            "row encoding does not support canonical List arrays: {:?}",
+            canonical.dtype()
+        ),
+        Canonical::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+        unsupported => {
+            vortex_bail!(
+                "row encoding does not support canonical array: {:?}",
+                unsupported.dtype()
+            )
+        }
+    }
+    Ok(())
+}
+
+mod encoding;
+mod native;
+mod sizing;
+mod varlen;
+
+use encoding::*;
+use native::*;
+use sizing::*;
+use varlen::*;
diff --git a/vortex-row/src/codec/native.rs b/vortex-row/src/codec/native.rs
new file mode 100644
index 00000000000..b09df81adcf
--- /dev/null
+++ b/vortex-row/src/codec/native.rs
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! The `RowEncode` trait and its implementations for fixed-width native value types.
+
+use super::*;
+
+/// Internal trait for encoding a fixed-width native value into byte slots.
+///
+/// Implementations write `size_of::<Self>()` bytes whose lexicographic byte order matches the
+/// natural ordering of the type; `out` is expected to be exactly that long.
+pub(crate) trait RowEncode: Copy {
+    /// Encode this value into `out`, inverting the bytes for descending order.
+    fn encode_to(self, out: &mut [u8], descending: bool);
+}
+
+macro_rules! impl_row_encode_unsigned {
+    ($t:ty) => {
+        impl RowEncode for $t {
+            #[inline]
+            fn encode_to(self, out: &mut [u8], descending: bool) {
+                let mut bytes = self.to_be_bytes();
+                if descending {
+                    for b in bytes.iter_mut() {
+                        *b ^= 0xFF;
+                    }
+                }
+                // Panics unless `out` is exactly `size_of::<$t>()` bytes, in either direction.
+                out.copy_from_slice(&bytes);
+            }
+        }
+    };
+}
+
+macro_rules! impl_row_encode_signed {
+    ($t:ty) => {
+        impl RowEncode for $t {
+            #[inline]
+            fn encode_to(self, out: &mut [u8], descending: bool) {
+                let mut bytes = self.to_be_bytes();
+                // Flip sign bit so negatives < non-negatives lexicographically.
+                bytes[0] ^= 0x80;
+                if descending {
+                    for b in bytes.iter_mut() {
+                        *b ^= 0xFF;
+                    }
+                }
+                // Panics unless `out` is exactly `size_of::<$t>()` bytes, in either direction.
+                out.copy_from_slice(&bytes);
+            }
+        }
+    };
+}
+
+impl_row_encode_unsigned!(u8);
+impl_row_encode_unsigned!(u16);
+impl_row_encode_unsigned!(u32);
+impl_row_encode_unsigned!(u64);
+impl_row_encode_signed!(i8);
+impl_row_encode_signed!(i16);
+impl_row_encode_signed!(i32);
+impl_row_encode_signed!(i64);
+impl_row_encode_signed!(i128);
+
+impl RowEncode for f32 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        // Sign bit clear: flip only the sign bit. Sign bit set: flip every bit.
+        let flip: u32 = if (bits >> 31) == 0 {
+            0x8000_0000
+        } else {
+            u32::MAX
+        };
+        let key = if descending {
+            !(bits ^ flip)
+        } else {
+            bits ^ flip
+        };
+        out.copy_from_slice(&key.to_be_bytes());
+    }
+}
+
+impl RowEncode for f64 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        // Sign bit clear: flip only the sign bit. Sign bit set: flip every bit.
+        let flip: u64 = if (bits >> 63) == 0 {
+            0x8000_0000_0000_0000
+        } else {
+            u64::MAX
+        };
+        let key = if descending {
+            !(bits ^ flip)
+        } else {
+            bits ^ flip
+        };
+        out.copy_from_slice(&key.to_be_bytes());
+    }
+}
+
+impl RowEncode for f16 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        // Sign bit clear: flip only the sign bit. Sign bit set: flip every bit.
+        let flip: u16 = if (bits >> 15) == 0 { 0x8000 } else { 0xFFFF };
+        let key = if descending {
+            !(bits ^ flip)
+        } else {
+            bits ^ flip
+        };
+        out.copy_from_slice(&key.to_be_bytes());
+    }
+}
diff --git a/vortex-row/src/codec/sizing.rs b/vortex-row/src/codec/sizing.rs
new file mode 100644
index 00000000000..8621d966f2b
--- /dev/null
+++ b/vortex-row/src/codec/sizing.rs
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Size pass leaf kernels: per-row byte-size accumulation for each canonical variant.
+//!
+//! Every accumulator returns [`VortexResult`] and uses checked arithmetic, so an input whose
+//! per-row encoding would exceed `u32::MAX` bytes surfaces a [`VortexError`](vortex_error::VortexError)
+//! instead of overflowing or panicking.
+
+use super::*;
+
+pub(super) fn add_size_const(sizes: &mut [u32], add: u32) -> VortexResult<()> {
+    sizes.iter_mut().try_for_each(|slot| {
+        *slot = slot
+            .checked_add(add)
+            .ok_or_else(|| vortex_err!("per-row size overflow"))?;
+        Ok(())
+    })
+}
+
+pub(super) fn add_size_null(arr: &NullArray, sizes: &mut [u32]) -> VortexResult<()> {
+    debug_assert_eq!(arr.len(), sizes.len());
+    // A null column carries no value bytes: each row is a single sentinel byte.
+    add_size_const(sizes, 1)
+}
+
+pub(super) fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) -> VortexResult<()> {
+    let per_row = encoded_size_for_fixed(byte_width_u32(arr.ptype().byte_width()));
+    add_size_const(sizes, per_row)
+}
+
+pub(super) fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) -> VortexResult<()> {
+    // Size from the smallest value type that fits the precision, not the physical
+    // `values_type`, so this pass agrees with `row_width_for_dtype` (and the encode pass)
+    // however the producer stored the values. See `narrow_decimal_to_smallest`.
+    let smallest = DecimalType::smallest_decimal_value_type(&arr.decimal_dtype());
+    let per_row = encoded_size_for_fixed(byte_width_u32(smallest.byte_width()));
+    add_size_const(sizes, per_row)
+}
+
+pub(super) fn add_size_varbinview(
+    arr: &VarBinViewArray,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // Adds one row's contribution. Null and empty values have constant sizes; any other value
+    // is sized from its byte length. The two loops below differ only in where the validity
+    // bit comes from, so the all-valid case never touches a mask.
+    let mut add_row = |row: usize, valid: bool, empty: bool, len: usize| -> VortexResult<()> {
+        let contribution = if !valid {
+            VARLEN_NULL_SIZE
+        } else if empty {
+            VARLEN_EMPTY_SIZE
+        } else {
+            encoded_size_for_non_empty_varlen(len)?
+        };
+        sizes[row] = sizes[row]
+            .checked_add(contribution)
+            .ok_or_else(|| vortex_err!("per-row size overflow"))?;
+        Ok(())
+    };
+    let views = arr.views();
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (row, view) in views.iter().enumerate() {
+                add_row(row, true, view.is_empty(), view.len() as usize)?;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            for (row, view) in views.iter().enumerate() {
+                add_row(row, mask.value(row), view.is_empty(), view.len() as usize)?;
+            }
+        }
+    }
+    Ok(())
+}
+
+pub(super) fn add_size_struct(
+    arr: &StructArray,
+    field: RowSortFieldOptions,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let n = arr.len();
+    debug_assert_eq!(n, sizes.len());
+    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
+    // Outer sentinel: 1 byte per row.
+    add_size_const(sizes, 1)?;
+    // Fixed-width children occupy the same width in null and non-null parent rows, so their
+    // width is added to every row. Variable-width children contribute their real size under a
+    // non-null parent and a single canonical null byte under a null parent.
+    for child in arr.iter_unmasked_fields() {
+        match row_width_for_dtype(child.dtype())? {
+            RowWidth::Fixed(w) => add_size_const(sizes, w)?,
+            RowWidth::Variable => {
+                let canonical = child.clone().execute::<Canonical>(ctx)?;
+                let mut child_sizes = vec![0u32; n];
+                field_size(&canonical, field, &mut child_sizes, ctx)?;
+                for i in 0..n {
+                    let contribution = if mask.value(i) { child_sizes[i] } else { 1u32 };
+                    sizes[i] = sizes[i]
+                        .checked_add(contribution)
+                        .ok_or_else(|| vortex_err!("per-row size overflow"))?;
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Add the per-row size of a fixed-size-list column to `sizes`.
+///
+/// Every row gets one outer sentinel byte plus a body. The body is the sum of the row's
+/// element sizes; for a null row whose elements are variable-width it is instead one byte
+/// per element.
+pub(super) fn add_size_fsl(
+    arr: &FixedSizeListArray,
+    field: RowSortFieldOptions,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let n = arr.len();
+    debug_assert_eq!(n, sizes.len());
+    // `list_size` is natively a `u32`; keep both forms so element indexing stays `usize` while
+    // width arithmetic avoids a fallible `usize -> u32` conversion.
+    let list_size_u32 = arr.list_size();
+    let list_size = list_size_u32 as usize;
+    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
+    let elem_dtype = arr.elements().dtype();
+    // Outer sentinel: 1 byte per row.
+    add_size_const(sizes, 1)?;
+    if let RowWidth::Fixed(w) = row_width_for_dtype(elem_dtype)? {
+        // Fixed-width elements: every row holds `list_size` of them, null parent or not.
+        let body = w
+            .checked_mul(list_size_u32)
+            .ok_or_else(|| vortex_err!("FSL body width overflow"))?;
+        return add_size_const(sizes, body);
+    }
+    // Variable-width elements: size the flattened element array once, then fold each row's
+    // `list_size` consecutive element sizes into that row's body.
+    let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+    debug_assert_eq!(elements.len(), n * list_size);
+    let mut elem_sizes = vec![0u32; n * list_size];
+    field_size(&elements, field, &mut elem_sizes, ctx)?;
+    for i in 0..n {
+        let body = if mask.value(i) {
+            let start = i * list_size;
+            elem_sizes[start..start + list_size]
+                .iter()
+                .try_fold(0u32, |sum, &s| sum.checked_add(s))
+                .ok_or_else(|| vortex_err!("FSL row body overflow"))?
+        } else {
+            // Canonical null body for FSL with variable element: one null sentinel
+            // per element. (Each element contributes `child_null_width = 1`.)
+            list_size_u32
+        };
+        sizes[i] = sizes[i]
+            .checked_add(body)
+            .ok_or_else(|| vortex_err!("FSL per-row size overflow"))?;
+    }
+    Ok(())
+}
diff --git a/vortex-row/src/codec/varlen.rs b/vortex-row/src/codec/varlen.rs
new file mode 100644
index 00000000000..fdc077d608a
--- /dev/null
+++ b/vortex-row/src/codec/varlen.rs
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Variable-length value body encoder: 32-byte blocks with continuation/length markers.
+
+use super::*;
+
+/// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with
+/// continuation/length markers. Returns the number of bytes written. Empty values are
+/// encoded by the caller as a single sentinel byte; passing one here, or an `out` too small
+/// for the encoded body, returns an error instead of writing out of bounds.
+///
+/// Ascending: each block is a `copy_nonoverlapping` of 32 bytes plus one stamped continuation
+/// byte. Descending: each full block is XORed with `0xFF` a u64 at a time.
+pub(super) fn encode_non_empty_varlen_body(
+    bytes: &[u8],
+    out: &mut [u8],
+    descending: bool,
+) -> VortexResult<u32> {
+    let len = bytes.len();
+    // One output block per started `VARLEN_BLOCK_SIZE` chunk of input.
+    let blocks = len.div_ceil(VARLEN_BLOCK_SIZE);
+    let total = blocks * VARLEN_BLOCK_TOTAL;
+    // Enforced in release builds too: these preconditions bound the raw-pointer writes below,
+    // and an empty `bytes` would underflow `blocks - 1`.
+    if len == 0 || out.len() < total {
+        return Err(vortex_err!(
+            "varlen body needs non-empty input and at least {total} output bytes"
+        ));
+    }
+    // Every block but the last is full and gets the continuation marker; the last holds the
+    // remaining 1..=32 bytes (a full 32 when `len` is an exact multiple of the block size).
+    let full_to_write = blocks - 1;
+    let partial_block_len = len - full_to_write * VARLEN_BLOCK_SIZE;
+    let total_u32 =
+        u32::try_from(total).map_err(|_| vortex_err!("encoded varlen size overflows u32"))?;
+    // The final block's continuation byte encodes its content length (1..=32).
+    let len_byte =
+        u8::try_from(partial_block_len).vortex_expect("varlen final block length (1..=32) fits u8");
+
+    // SAFETY: the check above guarantees `out` has at least `total` bytes and `bytes` is
+    // non-empty, so `blocks >= 1`. `dst` advances `full_to_write` whole blocks and then writes
+    // one more block, staying within `[0, total)`; `src` reads `full_to_write` full chunks
+    // plus `partial_block_len` bytes, which is exactly `len`. `bytes` and `out` cannot
+    // overlap because one is a shared and the other an exclusive borrow.
+    unsafe {
+        let mut src = bytes.as_ptr();
+        let mut dst = out.as_mut_ptr();
+
+        if !descending {
+            // Ascending fast path: each full block is a 32-byte memcpy + a single 0xFF stamp.
+            for _ in 0..full_to_write {
+                std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0xFF;
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            // Final block: copy the partial data, zero-pad the tail, write the length byte.
+            std::ptr::copy_nonoverlapping(src, dst, partial_block_len);
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0,
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = len_byte;
+        } else {
+            // Descending: invert every value byte. A u64-stride XOR gives LLVM a vectorizable
+            // inner loop; the tail handles the partial block byte-wise.
+            for _ in 0..full_to_write {
+                xor_copy_block(src, dst);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            for i in 0..partial_block_len {
+                *dst.add(i) = *src.add(i) ^ 0xFF;
+            }
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0xFF, // 0x00 XOR 0xFF
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = len_byte ^ 0xFF;
+        }
+    }
+    Ok(total_u32)
+}
+
+/// XOR-copy one 32-byte block (`dst[i] = src[i] ^ 0xFF`) as four unaligned u64 lanes, a
+/// shape LLVM can auto-vectorize.
+///
+/// # Safety
+/// `src` must be valid for 32 reads, `dst` valid for 32 writes, and the regions must not overlap.
+#[inline(always)]
+unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) {
+    for lane in 0..4 {
+        let off = lane * 8;
+        // SAFETY: `off + 8 <= 32`, and the caller guarantees both pointers cover 32 bytes.
+        unsafe {
+            let word = std::ptr::read_unaligned(src.add(off).cast::<u64>());
+            std::ptr::write_unaligned(dst.add(off).cast::<u64>(), !word);
+        }
+    }
+}
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 04feec89415..77afec25330 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -34,7 +34,7 @@ use crate::codec;
 use crate::options::RowEncodingOptions;
 use crate::options::deserialize_row_encoding_options;
 use crate::options::serialize_row_encoding_options;
-use crate::size::ColKind;
+use crate::size::ColumnKind;
 use crate::size::compute_sizes;
 
 /// Variadic scalar function that encodes N input columns into a single `List<u8>`
@@ -135,15 +135,7 @@ fn execute_row_encode(
         usize::try_from(total).vortex_expect("validated row-encoded output size must fit usize");
 
     let mut out_buf: BufferMut<u8> = BufferMut::with_capacity(total_len);
-    // Every encoder writes every byte in its row range: fixed-width values write
-    // sentinel + value (null rows write sentinel + explicit zero-fill); varlen blocks
-    // zero-pad their final partial block; struct/FSL fixed children are written for all
-    // rows then null parent rows are overwritten with the canonical null body. So the
-    // size-pass + encoder contract guarantees `[0, total_len)` is fully written before
-    // the buffer is read out, making the pre-zero-init redundant. Skipping it saves a
-    // `total_len`-byte memset per call (significant for varlen-heavy inputs, where
-    // `total_len` reaches multiple MB).
-    //
+
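-    // SAFETY: `total_len` bytes of capacity were just reserved, and by the contract above
-    // every byte in that range is written before `out_buf` is frozen and read.
+    // SAFETY: `total_len` bytes of capacity were just reserved, and the size-pass + encoder
+    // contract is that every byte in that range is written before `out_buf` is frozen and read.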
     unsafe { out_buf.set_len(total_len) };
@@ -160,7 +152,7 @@ fn execute_row_encode(
         && col_kinds.iter().any(|k| {
             matches!(
                 k,
-                ColKind::Fixed {
+                ColumnKind::Fixed {
                     before_varlen: true,
                     ..
                 }
@@ -203,17 +195,10 @@ fn execute_row_encode(
 
     // Per-row write cursor (also doubles as the ListView `sizes` slot when done). We build
     // it as a BufferMut so we can hand it directly to the output PrimitiveArray.
-    //
-    // The cursor path begins at the first cursor-path column. Fixed-before-varlen columns
-    // are written by the arithmetic path and do not touch the cursor, so the cursor is
-    // pre-seeded with the within-row offset of the first varlen column (its `fixed_prefix`).
-    // When there are no varlen columns at all, every column takes the arithmetic path and
-    // the cursor loop runs zero iterations; seeding with `fixed_per_row` then leaves the
-    // cursors already correct as per-row sizes.
     let initial_cursor: u32 = match first_varlen_idx {
         Some(idx) => match col_kinds[idx] {
-            ColKind::Variable { fixed_prefix } => fixed_prefix,
-            ColKind::Fixed { .. } => unreachable!("first_varlen_idx points at a varlen column"),
+            ColumnKind::Variable { fixed_prefix } => fixed_prefix,
+            ColumnKind::Fixed { .. } => unreachable!("first_varlen_idx points at a varlen column"),
         },
         None => fixed_per_row,
     };
@@ -226,7 +211,7 @@ fn execute_row_encode(
     // path. Each column was canonicalized once during the size pass; reuse that form.
     for (i, canonical) in columns.iter().enumerate() {
         match col_kinds[i] {
-            ColKind::Fixed {
+            ColumnKind::Fixed {
                 prefix,
                 before_varlen: true,
                 ..
@@ -242,7 +227,7 @@ fn execute_row_encode(
                     ctx,
                 )?;
             }
-            ColKind::Fixed { .. } | ColKind::Variable { .. } => {
+            ColumnKind::Fixed { .. } | ColumnKind::Variable { .. } => {
                 codec::field_encode(
                     canonical,
                     options.fields[i],
@@ -260,14 +245,7 @@ fn execute_row_encode(
     let offsets_arr =
         PrimitiveArray::new(listview_offsets.freeze(), Validity::NonNullable).into_array();
     let sizes_arr = PrimitiveArray::new(row_cursors.freeze(), Validity::NonNullable).into_array();
-    // SAFETY: this encoder constructs `elements`, `offsets_arr`, and `sizes_arr` itself:
-    // - `elements` is a `PrimitiveArray<u8>` of length `total_len`.
-    // - `offsets_arr[i]` is `i * fixed_per_row + var_prefix[i]`, monotonically increasing and
-    //   in `0..=total_len`.
-    // - `offsets_arr[i] + sizes_arr[i] <= total_len` by construction, and each row's slice is
-    //   disjoint from every other row's.
-    // `try_new`'s validation re-walks every row to check exactly these invariants, which we
-    // already guarantee by construction, so we skip it.
+    // SAFETY: this encoder constructs `elements`, `offsets_arr`, and `sizes_arr` itself.
     Ok(unsafe {
         ListViewArray::new_unchecked(elements, offsets_arr, sizes_arr, Validity::NonNullable)
     }
diff --git a/vortex-row/src/encoder.rs b/vortex-row/src/encoder.rs
index 7bcd3e05627..47c067d8107 100644
--- a/vortex-row/src/encoder.rs
+++ b/vortex-row/src/encoder.rs
@@ -14,12 +14,12 @@ use vortex_error::vortex_bail;
 
 use crate::encode::RowEncode;
 use crate::options::RowEncodingOptions;
-use crate::options::RowSortField;
+use crate::options::RowSortFieldOptions;
 use crate::size::RowSize;
 
 /// Encodes N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose row
 /// byte slices compare lexicographically in the same order as a tuple comparison of the input
-/// values under the configured [`RowSortField`]s.
+/// values under the configured [`RowSortFieldOptions`]s.
 ///
 /// Construct with [`RowEncoder::new`] or [`RowEncoder::with_options`] to pin the per-column
 /// sort options, or use [`RowEncoder::default`] to apply ascending, nulls-first ordering to
@@ -30,8 +30,8 @@ pub struct RowEncoder {
 }
 
 impl RowEncoder {
-    /// Construct a `RowEncoder` from one [`RowSortField`] per input column.
-    pub fn new(fields: impl IntoIterator<Item = RowSortField>) -> Self {
+    /// Construct a `RowEncoder` from one [`RowSortFieldOptions`] per input column.
+    pub fn new(fields: impl IntoIterator<Item = RowSortFieldOptions>) -> Self {
         Self {
             options: Some(RowEncodingOptions::new(fields)),
         }
@@ -120,42 +120,3 @@ fn reject_extension_dtype(dtype: &DType) -> VortexResult<()> {
     }
     Ok(())
 }
-
-/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose bytes
-/// are lexicographically comparable in the same order as a tuple comparison of the input
-/// values according to `fields`. Convenience wrapper over [`RowEncoder::encode`].
-pub fn convert_columns(
-    cols: &[ArrayRef],
-    fields: &[RowSortField],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<ListViewArray> {
-    RowEncoder::new(fields.iter().copied()).encode(cols, ctx)
-}
-
-/// Like [`convert_columns`] but takes a prebuilt [`RowEncodingOptions`].
-pub fn convert_columns_with_options(
-    cols: &[ArrayRef],
-    options: &RowEncodingOptions,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<ListViewArray> {
-    RowEncoder::with_options(options.clone()).encode(cols, ctx)
-}
-
-/// Compute only the per-row sizes (in bytes) of the row-encoded form for N columns.
-/// Convenience wrapper over [`RowEncoder::row_sizes`].
-pub fn compute_row_sizes(
-    cols: &[ArrayRef],
-    fields: &[RowSortField],
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<ArrayRef> {
-    RowEncoder::new(fields.iter().copied()).row_sizes(cols, ctx)
-}
-
-/// Like [`compute_row_sizes`] but takes a prebuilt [`RowEncodingOptions`].
-pub fn compute_row_sizes_with_options(
-    cols: &[ArrayRef],
-    options: &RowEncodingOptions,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<ArrayRef> {
-    RowEncoder::with_options(options.clone()).row_sizes(cols, ctx)
-}
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index b36121f0da2..60c06a916d9 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -5,8 +5,9 @@
 //!
 //! This crate converts one or more columnar arrays into a single `ListView<u8>` array whose
 //! row byte slices can be compared lexicographically. The byte ordering matches tuple
-//! ordering of the input values under the requested [`RowSortField`] settings, making the
-//! representation useful for sort keys and other row-key operations.
+//! ordering of the input values under the requested [`RowSortFieldOptions`] settings, making the
+//! representation useful for sort keys and other row-key operations. It is the Vortex analogue
+//! of `arrow-row`.
 //!
 //! The public entry points are:
 //! - [`RowEncoder`], the primary API for encoding columns into row bytes.
@@ -22,10 +23,99 @@
 //! those sizes to allocate one contiguous elements buffer, then writes each column's bytes
 //! into the per-row slots from left to right.
 //!
+//! <div class="warning">
+//!
+//! The row encoding format is **experimental**. Its byte layout, supported type set, and
+//! edge-case semantics may change between Vortex releases. Do not persist these bytes or
+//! depend on them as a stable interchange format.
+//!
+//! </div>
+//!
+//! # Byte-layout reference
+//!
+//! This is a schema-aware row-key format: the bytes carry no type tags, field names, or sort
+//! options, so two encoded rows are comparable only when produced from the same schema and the
+//! same per-column [`RowSortFieldOptions`].
+//!
+//! ## Order property
+//!
+//! For a fixed schema with columns `c0..cn` and per-column sort fields `f0..fn`:
+//!
+//! ```text
+//! encode(row_a) < encode(row_b)
+//!   <=>  (row_a.c0, .., row_a.cn) < (row_b.c0, .., row_b.cn)
+//! ```
+//!
+//! under the requested direction and null placement of each column. This holds because (1)
+//! every supported value is encoded so its bytes sort in the same order as the value, and (2)
+//! fields are concatenated left to right, so lexicographic byte comparison performs tuple
+//! comparison. `||` below means byte concatenation, `BE(x)` the fixed-width big-endian bytes of
+//! `x`, and `!bytes` the bitwise complement of every byte.
+//!
+//! ## Field options
+//!
+//! Each input column carries a [`RowSortFieldOptions`] `{ descending, nulls_first }`.
+//! `descending` reverses the order of non-null values; `nulls_first` is independent of
+//! `descending`, so nulls can sort before or after non-nulls in either direction.
+//!
+//! ## Sentinels
+//!
+//! A leading sentinel byte classifies nullness (and, for variable-width values, empty vs
+//! non-empty) before any value bytes are compared. Null sentinels are never inverted for
+//! `descending` (variable-width empty/non-empty ones are), so null placement ignores direction.
+//!
+//! | Family | Case | Asc, nulls first | Desc, nulls first | Asc, nulls last | Desc, nulls last |
+//! | --- | --- | --- | --- | --- | --- |
+//! | Fixed-width | Null | `0x00` | `0x00` | `0x02` | `0x02` |
+//! | Fixed-width | Non-null | `0x01` | `0x01` | `0x01` | `0x01` |
+//! | Variable-width | Null | `0x00` | `0x00` | `0xFF` | `0xFF` |
+//! | Variable-width | Empty | `0x01` | `0xFE` | `0x01` | `0xFE` |
+//! | Variable-width | Non-empty | `0x02` | `0xFD` | `0x02` | `0xFD` |
+//!
+//! Fixed-width sentinels are used by null, boolean, primitive, decimal, struct, and fixed-size
+//! list values; variable-width sentinels by UTF-8 and binary values.
+//!
+//! ## Per-type encoding
+//!
+//! - **Null**: just the fixed-width sentinel, no body.
+//! - **Boolean**: `sentinel || value_byte`, where `false = 0x01`, `true = 0x02` (inverted for
+//!   descending). Null bodies are a single zero byte.
+//! - **Unsigned integer** (`u8`–`u64`): `0x01 || BE(value)` (`!BE(value)` descending). Null
+//!   bodies are `width(T)` zero bytes.
+//! - **Signed integer** (`i8`–`i64`, and `i128` decimal storage): flip the sign bit of
+//!   `BE(value)` so negatives sort before non-negatives, then apply the descending complement.
+//! - **Floating point** (`f16`/`f32`/`f64`): treat the IEEE bits as unsigned; flip the top bit
+//!   for non-negative values and all bits for negative, then big-endian. Yields total-ordering
+//!   semantics (`-0.0 < +0.0`, NaNs ordered by bit pattern).
+//! - **Decimal**: encoded as its scaled signed-integer storage value at the *precision-minimal*
+//!   width (`1..=2 -> i8`, `3..=4 -> i16`, `5..=9 -> i32`, `10..=18 -> i64`, `19..=38 -> i128`),
+//!   using the signed-integer encoding. `Decimal256` is unsupported. The width is a pure
+//!   function of the precision, so storage physically wider than the precision requires is
+//!   narrowed losslessly before encoding (precision bounds the magnitude of every valid value).
+//! - **UTF-8 / Binary**: a variable-width sentinel, and for non-empty values a block-structured
+//!   body. Each block is 32 data bytes plus a marker: non-final full blocks use marker `0xFF`,
+//!   the final block is zero-padded to 32 bytes with a marker giving its real length (`1..=32`).
+//!   Descending inverts the data bytes, padding, and markers. This preserves prefix order.
+//! - **Struct / Fixed-size list**: an outer fixed-width sentinel followed by the children
+//!   encoded recursively in order with the parent's options. A null parent emits a *canonical
+//!   null body* (fixed-width children contribute their fixed null encoding; variable-width
+//!   children collapse to one null sentinel byte) so two null parents are byte-equal regardless
+//!   of underlying child data. A composite is fixed-width only when all of its children are.
+//!
+//! ## Output layout
+//!
+//! The result is a `ListView<u8>`: a single contiguous `elements` buffer holding every row's
+//! bytes, with per-row `offsets` and `sizes`. Rows are not self-describing without `sizes`,
+//! since a variable-width field can make one row longer than another. The sizing pass computes
+//! `sizes` before writing, and the same array doubles as the per-row write cursor.
+//!
 //! Supported logical types are nulls, booleans, primitive integers and floats, decimals up to
 //! 128 bits, UTF-8 and binary values, structs, and fixed-size lists. Extension, variant,
 //! union, and variable-size list arrays are rejected because this crate does not define an
 //! ordering for them.
+//!
+//! See `docs/specs/row-encoding.md` for the formal specification and a fully worked example
+//! row.
 
 mod codec;
 mod encode;
@@ -38,14 +128,14 @@ mod tests;
 
 pub use encode::RowEncode;
 pub use encoder::RowEncoder;
-pub use encoder::compute_row_sizes;
-pub use encoder::compute_row_sizes_with_options;
-pub use encoder::convert_columns;
-pub use encoder::convert_columns_with_options;
 pub use options::RowEncodingOptions;
-pub use options::RowSortField;
+pub use options::RowSortFieldOptions;
 pub use size::RowSize;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::ListViewArray;
 use vortex_array::scalar_fn::session::ScalarFnSessionExt;
+use vortex_error::VortexResult;
 use vortex_session::VortexSession;
 
 /// Register the row-encoding scalar functions ([`RowSize`] and [`RowEncode`]) on the given
@@ -58,3 +148,23 @@ pub fn initialize(session: &VortexSession) {
     session.scalar_fns().register(RowSize);
     session.scalar_fns().register(RowEncode);
 }
+
+/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose bytes
+/// are lexicographically comparable in the same order as a tuple comparison of the input
+/// values according to `fields`. Convenience wrapper over [`RowEncoder::encode`].
+pub fn convert_columns(
+    cols: &[ArrayRef],
+    fields: &[RowSortFieldOptions],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ListViewArray> {
+    RowEncoder::new(fields.iter().copied()).encode(cols, ctx)
+}
+
+/// Like [`convert_columns`] but takes a prebuilt [`RowEncodingOptions`].
+pub fn convert_columns_with_options(
+    cols: &[ArrayRef],
+    options: &RowEncodingOptions,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ListViewArray> {
+    RowEncoder::with_options(options.clone()).encode(cols, ctx)
+}
diff --git a/vortex-row/src/options.rs b/vortex-row/src/options.rs
index 380c9a3827f..1997512941d 100644
--- a/vortex-row/src/options.rs
+++ b/vortex-row/src/options.rs
@@ -8,24 +8,24 @@ use smallvec::SmallVec;
 
 /// Per-column ordering options for row-oriented encoding.
 ///
-/// A `RowSortField` describes how one input column contributes to a row key. Descending order
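+/// A `RowSortFieldOptions` describes how one input column contributes to a row key. Descending
-/// reverses the encoded value bytes for that column. Null placement is controlled separately,
-/// so nulls keep the requested position relative to non-null values in either direction.
+/// order bitwise-inverts the encoded value bytes for that column. Null placement is controlled
+/// separately, so nulls keep their requested position relative to non-nulls in either direction.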
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub struct RowSortField {
+pub struct RowSortFieldOptions {
     /// If true, this column sorts in descending order.
     pub descending: bool,
     /// If true, nulls sort before non-null values.
     pub nulls_first: bool,
 }
 
-impl Default for RowSortField {
+impl Default for RowSortFieldOptions {
     fn default() -> Self {
         Self::ascending()
     }
 }
 
-impl Display for RowSortField {
+impl Display for RowSortFieldOptions {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
@@ -35,8 +35,8 @@ impl Display for RowSortField {
     }
 }
 
-impl RowSortField {
-    /// Construct a new `RowSortField` with explicit options.
+impl RowSortFieldOptions {
+    /// Construct a new `RowSortFieldOptions` with explicit options.
     pub const fn new(descending: bool, nulls_first: bool) -> Self {
         Self {
             descending,
@@ -65,43 +65,24 @@ impl RowSortField {
         self.nulls_first = false;
         self
     }
-
-    /// Returns the sentinel byte to write for a non-null value.
-    #[inline]
-    pub(crate) fn non_null_sentinel(&self) -> u8 {
-        // Non-null is always 0x01. Null choices are < or > 0x01.
-        0x01
-    }
-
-    /// Returns the sentinel byte to write for a null value.
-    #[inline]
-    pub(crate) fn null_sentinel(&self) -> u8 {
-        if self.nulls_first {
-            // Nulls before non-nulls (smaller byte sorts first).
-            0x00
-        } else {
-            // Nulls after non-nulls (larger byte sorts later).
-            0x02
-        }
-    }
 }
 
 const FIELDS_INLINE: usize = 4;
 
 /// Ordering options for row-oriented encoding.
 ///
-/// The options contain one [`RowSortField`] per input column, in the same order as the columns
+/// The options contain one [`RowSortFieldOptions`] per input column, in the same order as the columns
 /// passed to [`convert_columns`](crate::convert_columns),
-/// [`compute_row_sizes`](crate::compute_row_sizes), [`RowSize`](crate::RowSize), or
+/// [`RowEncoder::row_sizes`](crate::RowEncoder::row_sizes), [`RowSize`](crate::RowSize), or
 /// [`RowEncode`](crate::RowEncode).
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct RowEncodingOptions {
-    pub(crate) fields: SmallVec<[RowSortField; FIELDS_INLINE]>,
+    pub(crate) fields: SmallVec<[RowSortFieldOptions; FIELDS_INLINE]>,
 }
 
 impl RowEncodingOptions {
-    /// Construct a new `RowEncodingOptions` from any iterator of [`RowSortField`]s.
-    pub fn new(fields: impl IntoIterator<Item = RowSortField>) -> Self {
+    /// Construct a new `RowEncodingOptions` from any iterator of [`RowSortFieldOptions`]s.
+    pub fn new(fields: impl IntoIterator<Item = RowSortFieldOptions>) -> Self {
         Self {
             fields: fields.into_iter().collect(),
         }
@@ -109,11 +90,14 @@ impl RowEncodingOptions {
 
     /// Construct default ascending, nulls-first options for `column_count` input columns.
     pub fn default_for_columns(column_count: usize) -> Self {
-        Self::new(std::iter::repeat_n(RowSortField::default(), column_count))
+        Self::new(std::iter::repeat_n(
+            RowSortFieldOptions::default(),
+            column_count,
+        ))
     }
 
     /// Borrow the per-column sort fields.
-    pub fn fields(&self) -> &[RowSortField] {
+    pub fn fields(&self) -> &[RowSortFieldOptions] {
         &self.fields
     }
 
@@ -128,8 +112,8 @@ impl RowEncodingOptions {
     }
 }
 
-impl FromIterator<RowSortField> for RowEncodingOptions {
-    fn from_iter<T: IntoIterator<Item = RowSortField>>(iter: T) -> Self {
+impl FromIterator<RowSortFieldOptions> for RowEncodingOptions {
+    fn from_iter<T: IntoIterator<Item = RowSortFieldOptions>>(iter: T) -> Self {
         Self::new(iter)
     }
 }
@@ -180,10 +164,10 @@ pub(crate) fn deserialize_row_encoding_options(
             expected
         );
     }
-    let mut fields: SmallVec<[RowSortField; FIELDS_INLINE]> = SmallVec::with_capacity(n);
+    let mut fields: SmallVec<[RowSortFieldOptions; FIELDS_INLINE]> = SmallVec::with_capacity(n);
     let mut i = 4;
     for _ in 0..n {
-        fields.push(RowSortField {
+        fields.push(RowSortFieldOptions {
             descending: bytes[i] != 0,
             nulls_first: bytes[i + 1] != 0,
         });
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
index 9112379a6f4..a9b3255227d 100644
--- a/vortex-row/src/size.rs
+++ b/vortex-row/src/size.rs
@@ -4,6 +4,7 @@
 //! `RowSize` variadic scalar function: aggregate per-row byte sizes for N input columns.
 
 use std::sync::Arc;
+use std::sync::LazyLock;
 
 use vortex_array::ArrayRef;
 use vortex_array::Canonical;
@@ -13,6 +14,7 @@ use vortex_array::arrays::ConstantArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::StructArray;
 use vortex_array::dtype::DType;
+use vortex_array::dtype::FieldDType;
 use vortex_array::dtype::FieldName;
 use vortex_array::dtype::FieldNames;
 use vortex_array::dtype::Nullability;
@@ -44,7 +46,7 @@ use crate::options::serialize_row_encoding_options;
 /// path (no varlen before this column, so the within-row position is constant per row) and
 /// the cursor-write path.
 #[derive(Clone, Copy, Debug)]
-pub(crate) enum ColKind {
+pub(crate) enum ColumnKind {
     /// Fixed-width column. `prefix` is the within-row byte offset of this column's first
     /// byte. When `before_varlen` is true no variable-length column precedes this one, so the
     /// within-row offset is constant for every row.
@@ -63,7 +65,7 @@ pub(crate) enum ColKind {
 pub(crate) struct SizePassResult {
     pub fixed_per_row: u32,
     pub var_lengths: Option<Vec<u32>>,
-    pub col_kinds: Vec<ColKind>,
+    pub col_kinds: Vec<ColumnKind>,
     pub first_varlen_idx: Option<usize>,
     pub columns: Vec<Canonical>,
 }
@@ -97,7 +99,7 @@ pub(crate) fn compute_sizes(
     let nrows = args.row_count();
 
     let mut columns: Vec<Canonical> = Vec::with_capacity(n_inputs);
-    let mut col_kinds: Vec<ColKind> = Vec::with_capacity(n_inputs);
+    let mut col_kinds: Vec<ColumnKind> = Vec::with_capacity(n_inputs);
     let mut fixed_per_row: u32 = 0;
     let mut var_lengths: Option<Vec<u32>> = None;
     let mut first_varlen_idx: Option<usize> = None;
@@ -118,7 +120,7 @@ pub(crate) fn compute_sizes(
         let canonical = col.execute::<Canonical>(ctx)?;
         match width {
             RowWidth::Fixed(w) => {
-                col_kinds.push(ColKind::Fixed {
+                col_kinds.push(ColumnKind::Fixed {
                     prefix: running_fixed_prefix,
                     before_varlen: first_varlen_idx.is_none(),
                 });
@@ -133,7 +135,7 @@ pub(crate) fn compute_sizes(
                 }
                 let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]);
                 codec::field_size(&canonical, options.fields[i], v, ctx)?;
-                col_kinds.push(ColKind::Variable {
+                col_kinds.push(ColumnKind::Variable {
                     fixed_prefix: running_fixed_prefix,
                 });
             }
@@ -151,7 +153,7 @@ pub(crate) fn compute_sizes(
 }
 
 /// Variadic scalar function that, given N input columns and per-column
-/// [`RowSortField`](crate::RowSortField)s,
+/// [`RowSortFieldOptions`](crate::RowSortFieldOptions)s,
 /// returns a `Struct { fixed: U32, var: U32 }` array of per-row byte sizes for the
 /// row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode).
 ///
@@ -169,22 +171,17 @@ pub(crate) fn compute_sizes(
 pub struct RowSize;
 
 /// Returns the [`FieldNames`] used by the [`RowSize`] output struct.
-pub(crate) fn row_size_field_names() -> FieldNames {
-    FieldNames::from([FieldName::from("fixed"), FieldName::from("var")])
-}
-
-/// Returns the output [`DType`] of [`RowSize`].
-pub(crate) fn row_size_struct_dtype() -> DType {
-    DType::Struct(
-        StructFields::new(
-            row_size_field_names(),
+pub(crate) fn row_size_struct_fields() -> StructFields {
+    static FIELDS: LazyLock<StructFields> = LazyLock::new(|| {
+        StructFields::from_fields(
+            FieldNames::from([FieldName::from("fixed"), FieldName::from("var")]),
             vec![
-                DType::Primitive(PType::U32, Nullability::NonNullable),
-                DType::Primitive(PType::U32, Nullability::NonNullable),
+                FieldDType::from(DType::Primitive(PType::U32, Nullability::NonNullable)),
+                FieldDType::from(DType::Primitive(PType::U32, Nullability::NonNullable)),
             ],
-        ),
-        Nullability::NonNullable,
-    )
+        )
+    });
+    *FIELDS
 }
 
 impl ScalarFnVTable for RowSize {
@@ -215,7 +212,10 @@ impl ScalarFnVTable for RowSize {
     }
 
     fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult<DType> {
-        Ok(row_size_struct_dtype())
+        Ok(DType::Struct(
+            row_size_struct_fields(),
+            Nullability::NonNullable,
+        ))
     }
 
     fn execute(
@@ -233,9 +233,9 @@ impl ScalarFnVTable for RowSize {
                 .into_array(),
             None => ConstantArray::new(Scalar::from(0u32), nrows).into_array(),
         };
-        Ok(StructArray::try_new(
-            row_size_field_names(),
+        Ok(StructArray::try_new_with_dtype(
             vec![fixed_array, var_array],
+            row_size_struct_fields(),
             nrows,
             Validity::NonNullable,
         )?
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
index 5c85c911154..8c36c66a95c 100644
--- a/vortex-row/src/tests.rs
+++ b/vortex-row/src/tests.rs
@@ -6,6 +6,8 @@
 use std::f64::consts::PI;
 
 use rstest::rstest;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
 use vortex_array::LEGACY_SESSION;
 use vortex_array::VortexSessionExecute;
@@ -22,11 +24,16 @@ use vortex_array::extension::datetime::TimeUnit;
 use vortex_error::VortexResult;
 
 use crate::RowEncoder;
-use crate::RowEncodingOptions;
-use crate::RowSortField;
-use crate::compute_row_sizes_with_options;
+use crate::RowSortFieldOptions;
 use crate::convert_columns;
-use crate::convert_columns_with_options;
+
+fn compute_row_sizes(
+    cols: &[ArrayRef],
+    fields: &[RowSortFieldOptions],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    RowEncoder::new(fields.iter().copied()).row_sizes(cols, ctx)
+}
 
 fn collect_row_bytes(array: &ListViewArray) -> Vec<Vec<u8>> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();
@@ -45,7 +52,7 @@ fn collect_row_bytes(array: &ListViewArray) -> Vec<Vec<u8>> {
 fn assert_sort_order_i64(values: Vec<i64>, descending: bool) -> VortexResult<()> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();
     let col = PrimitiveArray::from_iter(values.clone()).into_array();
-    let field = RowSortField::new(descending, true);
+    let field = RowSortFieldOptions::new(descending, true);
     let encoded = convert_columns(&[col], &[field], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
 
@@ -80,7 +87,7 @@ fn primitive_u32_sort_order() -> VortexResult<()> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();
     let values: Vec<u32> = vec![0, 1, 100, u32::MAX, 42, 17];
     let col = PrimitiveArray::from_iter(values.clone()).into_array();
-    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
 
     let mut sorted_rows = rows.clone();
@@ -100,7 +107,7 @@ fn reject_temporal_extension_dtype_early() -> VortexResult<()> {
     let ext_dtype = Date::new(TimeUnit::Days, Nullability::NonNullable).erased();
     let col = ExtensionArray::new(ext_dtype, storage).into_array();
 
-    let err = convert_columns(&[col], &[RowSortField::ascending()], &mut ctx)
+    let err = convert_columns(&[col], &[RowSortFieldOptions::ascending()], &mut ctx)
         .expect_err("temporal extensions should be rejected");
     assert!(
         err.to_string().contains("Extension arrays yet"),
@@ -119,7 +126,7 @@ fn reject_nested_temporal_extension_dtype_early() -> VortexResult<()> {
     let struct_col =
         StructArray::from_fields(&[("date", date_col), ("tag", tag_col)])?.into_array();
 
-    let err = convert_columns(&[struct_col], &[RowSortField::ascending()], &mut ctx)
+    let err = convert_columns(&[struct_col], &[RowSortFieldOptions::ascending()], &mut ctx)
         .expect_err("nested temporal extensions should be rejected");
     assert!(
         err.to_string().contains("Extension arrays yet"),
@@ -136,7 +143,7 @@ fn primitive_f64_sort_order() -> VortexResult<()> {
     // -0.0 == 0.0.
     let values: Vec<f64> = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, PI];
     let col = PrimitiveArray::from_iter(values.clone()).into_array();
-    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
 
     let mut sorted_rows = rows.clone();
@@ -153,7 +160,7 @@ fn primitive_f64_sort_order() -> VortexResult<()> {
 fn bool_sort_order() -> VortexResult<()> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();
     let col = BoolArray::from_iter([true, false, true, false]).into_array();
-    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
 
     let mut sorted = rows.clone();
@@ -178,7 +185,7 @@ fn utf8_sort_order() -> VortexResult<()> {
         "banana_loaf_for_test",
     ];
     let col = VarBinViewArray::from_iter_str(values.clone()).into_array();
-    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
 
     let mut sorted = rows.clone();
@@ -200,7 +207,10 @@ fn multi_column_sort() -> VortexResult<()> {
     let col1 = VarBinViewArray::from_iter_str(strs.clone()).into_array();
     let encoded = convert_columns(
         &[col0, col1],
-        &[RowSortField::default(), RowSortField::default()],
+        &[
+            RowSortFieldOptions::default(),
+            RowSortFieldOptions::default(),
+        ],
         &mut ctx,
     )?;
     let rows = collect_row_bytes(&encoded);
@@ -223,7 +233,7 @@ fn nulls_first_and_last() -> VortexResult<()> {
     // nulls_first=true
     let encoded = convert_columns(
         std::slice::from_ref(&col),
-        &[RowSortField::ascending()],
+        &[RowSortFieldOptions::ascending()],
         &mut ctx,
     )?;
     let rows = collect_row_bytes(&encoded);
@@ -236,7 +246,11 @@ fn nulls_first_and_last() -> VortexResult<()> {
         assert_eq!(sorted[i][0], 0x00);
     }
     // nulls_first=false
-    let encoded = convert_columns(&[col], &[RowSortField::ascending().nulls_last()], &mut ctx)?;
+    let encoded = convert_columns(
+        &[col],
+        &[RowSortFieldOptions::ascending().nulls_last()],
+        &mut ctx,
+    )?;
     let rows = collect_row_bytes(&encoded);
     let mut sorted = rows;
     sorted.sort();
@@ -248,42 +262,10 @@ fn nulls_first_and_last() -> VortexResult<()> {
     Ok(())
 }
 
-#[test]
-fn reusable_options_helpers() -> VortexResult<()> {
-    let mut ctx = LEGACY_SESSION.create_execution_ctx();
-    let options = RowEncodingOptions::new([RowSortField::descending().nulls_last()]);
-    assert_eq!(options.len(), 1);
-    assert!(!options.is_empty());
-    assert_eq!(
-        options.fields(),
-        &[RowSortField {
-            descending: true,
-            nulls_first: false
-        }]
-    );
-
-    let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array();
-    let encoder = RowEncoder::with_options(options.clone());
-    assert_eq!(encoder.options(), Some(&options));
-
-    let encoded = encoder.encode(std::slice::from_ref(&col), &mut ctx)?;
-    assert_eq!(encoded.len(), 3);
-
-    let sizes = encoder.row_sizes(std::slice::from_ref(&col), &mut ctx)?;
-    assert_eq!(sizes.len(), 3);
-
-    let encoded = convert_columns_with_options(std::slice::from_ref(&col), &options, &mut ctx)?;
-    assert_eq!(encoded.len(), 3);
-
-    let sizes = compute_row_sizes_with_options(std::slice::from_ref(&col), &options, &mut ctx)?;
-    assert_eq!(sizes.len(), 3);
-    Ok(())
-}
-
 #[test]
 fn row_encoder_new_accepts_sort_fields() -> VortexResult<()> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();
-    let encoder = RowEncoder::new([RowSortField::ascending()]);
+    let encoder = RowEncoder::new([RowSortFieldOptions::ascending()]);
     let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array();
 
     let encoded = encoder.encode(std::slice::from_ref(&col), &mut ctx)?;
@@ -312,7 +294,7 @@ fn struct_sort_order() -> VortexResult<()> {
     let name_arr = VarBinViewArray::from_iter_str(names.clone()).into_array();
     let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)])?.into_array();
 
-    let encoded = convert_columns(&[struct_arr], &[RowSortField::default()], &mut ctx)?;
+    let encoded = convert_columns(&[struct_arr], &[RowSortFieldOptions::default()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
 
     let mut sorted = rows.clone();
@@ -330,8 +312,6 @@ fn row_size_struct_shape() -> VortexResult<()> {
     use vortex_array::arrays::StructArray;
     use vortex_array::arrays::struct_::StructArrayExt;
 
-    use crate::compute_row_sizes;
-
     let mut ctx = LEGACY_SESSION.create_execution_ctx();
     let ints: Vec<i32> = vec![1, 2, 3, 4, 5];
     let strs = vec!["a", "bb", "ccc", "", "eeeee"];
@@ -340,7 +320,10 @@ fn row_size_struct_shape() -> VortexResult<()> {
 
     let sizes = compute_row_sizes(
         &[col0, col1],
-        &[RowSortField::default(), RowSortField::default()],
+        &[
+            RowSortFieldOptions::default(),
+            RowSortFieldOptions::default(),
+        ],
         &mut ctx,
     )?;
     // Shape must be Struct { fixed, var }
@@ -384,7 +367,10 @@ fn single_buffer_invariant() -> VortexResult<()> {
     let col1 = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
     let encoded = convert_columns(
         &[col0, col1],
-        &[RowSortField::default(), RowSortField::default()],
+        &[
+            RowSortFieldOptions::default(),
+            RowSortFieldOptions::default(),
+        ],
         &mut ctx,
     )?;
 
@@ -417,7 +403,10 @@ fn multi_column_varlen_empty_vs_nul_byte_string() -> VortexResult<()> {
     let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1]).into_array();
     let encoded = convert_columns(
         &[col1, col2],
-        &[RowSortField::default(), RowSortField::default()],
+        &[
+            RowSortFieldOptions::default(),
+            RowSortFieldOptions::default(),
+        ],
         &mut ctx,
     )?;
     let rows = collect_row_bytes(&encoded);
@@ -454,7 +443,10 @@ fn multi_column_varlen_null_vs_empty() -> VortexResult<()> {
     let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1, 1]).into_array();
     let encoded = convert_columns(
         &[col1, col2],
-        &[RowSortField::ascending(), RowSortField::ascending()],
+        &[
+            RowSortFieldOptions::ascending(),
+            RowSortFieldOptions::ascending(),
+        ],
         &mut ctx,
     )?;
     let rows = collect_row_bytes(&encoded);
@@ -507,7 +499,7 @@ fn multi_column_varlen_null_vs_empty() -> VortexResult<()> {
 fn varlen_descending_empty_vs_non_empty() -> VortexResult<()> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();
     let col = VarBinViewArray::from_iter_str(["a", "", "abc"]).into_array();
-    let encoded = convert_columns(&[col], &[RowSortField::descending()], &mut ctx)?;
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::descending()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
 
     // Natural order: "" < "a" < "abc"; descending byte sort: "abc" first, "" last.
@@ -541,7 +533,7 @@ fn null_struct_rows_with_varying_child_lengths_are_byte_equal() -> VortexResult<
     let validity = Validity::from(bits);
     let struct_arr = StructArray::try_new(field_names, vec![names], 3, validity)?.into_array();
 
-    let encoded = convert_columns(&[struct_arr], &[RowSortField::ascending()], &mut ctx)?;
+    let encoded = convert_columns(&[struct_arr], &[RowSortFieldOptions::ascending()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
     assert_eq!(rows.len(), 3);
     // Both null parent rows must produce identical bytes despite the divergent children.
@@ -559,7 +551,7 @@ fn primitive_f32_sort_order() -> VortexResult<()> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();
     let values: Vec<f32> = vec![-1.5, 0.0, 1.5, f32::INFINITY, f32::NEG_INFINITY];
     let col = PrimitiveArray::from_iter(values.clone()).into_array();
-    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
     let mut sorted_rows = rows.clone();
     sorted_rows.sort();
@@ -582,7 +574,7 @@ fn primitive_f16_sort_order() -> VortexResult<()> {
         f16::NEG_INFINITY,
     ];
     let col = PrimitiveArray::from_iter(values.clone()).into_array();
-    let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?;
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?;
     let rows = collect_row_bytes(&encoded);
     let mut sorted_rows = rows.clone();
     sorted_rows.sort();
@@ -593,6 +585,165 @@ fn primitive_f16_sort_order() -> VortexResult<()> {
     Ok(())
 }
 
+#[test]
+fn decimal_nullable_sort_order() -> VortexResult<()> {
+    use vortex_array::arrays::DecimalArray;
+    use vortex_array::dtype::DecimalDType;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::BitBuffer;
+    use vortex_buffer::Buffer;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // precision=9 -> minimal physical type I32; rows 1 and 3 are null.
+    let dt = DecimalDType::new(9, 3);
+    let values: Vec<i32> = vec![5, 0, -7, 0, 123];
+    let validity = Validity::from(BitBuffer::from_iter([true, false, true, false, true]));
+    let col =
+        DecimalArray::new::<i32>(Buffer::<i32>::copy_from(&values), dt, validity).into_array();
+
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::ascending()], &mut ctx)?;
+    let mut sorted = collect_row_bytes(&encoded);
+    sorted.sort();
+    // nulls_first: the two null rows sort to the front and are byte-equal.
+    assert_eq!(sorted[0][0], 0x00, "null sentinel sorts first");
+    assert_eq!(sorted[0], sorted[1], "null decimal rows are byte-equal");
+    assert_eq!(sorted[1][0], 0x00);
+    assert_eq!(sorted[2][0], 0x01, "non-null sentinel");
+    Ok(())
+}
+
+/// Regression: the physical `values_type` of a decimal column may be wider than its precision
+/// needs (here precision 5 fits in `I32` but the values are stored as `i64`). The size pass
+/// reserves the precision-minimal width, so the encode pass has to narrow the physical values
+/// to that width; writing the wider physical bytes previously overran the per-row slot. The
+/// encoded byte order must still agree with the natural value order.
+#[rstest]
+#[case::ascending(false)]
+#[case::descending(true)]
+fn decimal_wide_physical_storage_sort_order(#[case] descending: bool) -> VortexResult<()> {
+    use vortex_array::arrays::DecimalArray;
+    use vortex_array::dtype::DecimalDType;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::Buffer;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // precision=5 fits in I32 (4 bytes), but store physically as i64 (8 bytes).
+    let dt = DecimalDType::new(5, 2);
+    let values: Vec<i64> = vec![1, -4, 0, 99_999, -99_999, 42, -42];
+    let col =
+        DecimalArray::new::<i64>(Buffer::<i64>::copy_from(&values), dt, Validity::NonNullable)
+            .into_array();
+    let field = RowSortFieldOptions::new(descending, true);
+    let encoded = convert_columns(&[col], &[field], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Each encoded row is the precision-minimal width: sentinel(1) + I32(4) = 5 bytes.
+    assert!(rows.iter().all(|r| r.len() == 5), "row lens: {:?}", rows);
+
+    let mut idx: Vec<usize> = (0..values.len()).collect();
+    idx.sort_by_key(|&i| values[i]);
+    if descending {
+        // The values are distinct, so the descending order is the exact reverse.
+        idx.reverse();
+    }
+    let expected: Vec<Vec<u8>> = idx.iter().map(|&i| rows[i].clone()).collect();
+    let mut sorted = rows;
+    sorted.sort();
+    assert_eq!(
+        sorted, expected,
+        "decimal byte order must match value order"
+    );
+    Ok(())
+}
+
+/// Lock-in reference test: encode the worked-example row from `docs/specs/row-encoding.md`
+/// (one row with every supported encoding family, all columns ascending nulls-first) and
+/// assert the exact encoded bytes. This pins the byte layout so any accidental change to the
+/// format is caught, and keeps the spec document honest.
+#[test]
+fn reference_row_bytes_match_spec() -> VortexResult<()> {
+    use vortex_array::arrays::DecimalArray;
+    use vortex_array::arrays::FixedSizeListArray;
+    use vortex_array::arrays::NullArray;
+    use vortex_array::dtype::DecimalDType;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::Buffer;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+
+    let null_col = NullArray::new(1).into_array();
+    let bool_col = BoolArray::from_iter([true]).into_array();
+    let uint_col = PrimitiveArray::from_iter([258u16]).into_array();
+    let int_col = PrimitiveArray::from_iter([-5i16]).into_array();
+    let float_col = PrimitiveArray::from_iter([1.5f32]).into_array();
+    let decimal_col = DecimalArray::new::<i32>(
+        Buffer::<i32>::copy_from([12345i32]),
+        DecimalDType::new(9, 2),
+        Validity::NonNullable,
+    )
+    .into_array();
+    let utf8_col = VarBinViewArray::from_iter_str(["a"]).into_array();
+    let binary_col = VarBinViewArray::from_iter_bin([[0xDEu8, 0xAD, 0xBE, 0xEF]]).into_array();
+    let struct_col = StructArray::from_fields(&[
+        ("x", PrimitiveArray::from_iter([1i8]).into_array()),
+        ("y", VarBinViewArray::from_iter_str([""]).into_array()),
+    ])?
+    .into_array();
+    let fsl_col = FixedSizeListArray::try_new(
+        PrimitiveArray::from_iter([1u8, 2, 3]).into_array(),
+        3,
+        Validity::NonNullable,
+        1,
+    )?
+    .into_array();
+
+    let cols = [
+        null_col,
+        bool_col,
+        uint_col,
+        int_col,
+        float_col,
+        decimal_col,
+        utf8_col,
+        binary_col,
+        struct_col,
+        fsl_col,
+    ];
+    let fields = vec![RowSortFieldOptions::default(); cols.len()];
+    let encoded = convert_columns(&cols, &fields, &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    assert_eq!(rows.len(), 1);
+
+    // Per-column encodings from the spec's worked example.
+    let mut expected: Vec<u8> = Vec::new();
+    expected.extend_from_slice(&[0x00]); // null_col
+    expected.extend_from_slice(&[0x01, 0x02]); // bool_col: true
+    expected.extend_from_slice(&[0x01, 0x01, 0x02]); // uint_col: 258 u16
+    expected.extend_from_slice(&[0x01, 0x7F, 0xFB]); // int_col: -5 i16 (sign-bit flipped)
+    expected.extend_from_slice(&[0x01, 0xBF, 0xC0, 0x00, 0x00]); // float_col: 1.5 f32
+    expected.extend_from_slice(&[0x01, 0x80, 0x00, 0x30, 0x39]); // decimal_col: 12345 i32
+    // utf8 "a": non-empty sentinel, 'a', zero pad to 32, length marker 1.
+    expected.push(0x02);
+    expected.push(b'a');
+    expected.extend(std::iter::repeat_n(0u8, 31));
+    expected.push(0x01);
+    // binary DE AD BE EF: non-empty sentinel, data, zero pad to 32, length marker 4.
+    expected.push(0x02);
+    expected.extend_from_slice(&[0xDE, 0xAD, 0xBE, 0xEF]);
+    expected.extend(std::iter::repeat_n(0u8, 28));
+    expected.push(0x04);
+    // struct { x: 1 i8, y: "" }: outer sentinel, x = 0x01 || 0x81, y = empty sentinel 0x01.
+    expected.extend_from_slice(&[0x01, 0x01, 0x81, 0x01]);
+    // fsl [1, 2, 3] u8: outer sentinel, then per element 0x01 || BE(value).
+    expected.extend_from_slice(&[0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x03]);
+
+    assert_eq!(
+        rows[0], expected,
+        "encoded reference row does not match the documented byte layout"
+    );
+    Ok(())
+}
+
 #[test]
 fn reject_list_dtype_early() {
     use vortex_array::ArrayRef;
@@ -606,7 +757,7 @@ fn reject_list_dtype_early() {
     let list: ArrayRef = ListArray::try_new(elements, offsets, Validity::NonNullable)
         .unwrap()
         .into_array();
-    let err = convert_columns(&[list], &[RowSortField::default()], &mut ctx)
+    let err = convert_columns(&[list], &[RowSortFieldOptions::default()], &mut ctx)
         .expect_err("List should not be accepted");
     assert!(
         err.to_string().contains("List"),

From 9f134b4b6568bf0e487e8f82b1ed69e1f34df7c4 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Mon, 8 Jun 2026 13:23:48 +0100
Subject: [PATCH 2/3] Clone FIELDS in row_size_struct_fields instead of dereferencing it

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-row/src/size.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
index a9b3255227d..5465ce34357 100644
--- a/vortex-row/src/size.rs
+++ b/vortex-row/src/size.rs
@@ -181,7 +181,7 @@ pub(crate) fn row_size_struct_fields() -> StructFields {
             ],
         )
     });
-    *FIELDS
+    FIELDS.clone()
 }
 
 impl ScalarFnVTable for RowSize {

From d0913ffab48257f086bee41f320fd60e2996a131 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Thu, 11 Jun 2026 18:34:30 +0100
Subject: [PATCH 3/3] Zero null slots when narrowing decimals for row encoding

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/src/arrays/decimal/utils.rs | 86 ++++++++++++++++++++++++
 vortex-row/src/codec/encoding.rs         | 35 ++++------
 vortex-row/src/codec/mod.rs              |  2 -
 vortex-row/src/tests.rs                  | 37 ++++++++++
 4 files changed, 136 insertions(+), 24 deletions(-)

diff --git a/vortex-array/src/arrays/decimal/utils.rs b/vortex-array/src/arrays/decimal/utils.rs
index a1eafde0fe3..93660f943cf 100644
--- a/vortex-array/src/arrays/decimal/utils.rs
+++ b/vortex-array/src/arrays/decimal/utils.rs
@@ -3,11 +3,17 @@
 
 use itertools::Itertools;
 use itertools::MinMaxResult;
+use vortex_buffer::Buffer;
 use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_err;
+use vortex_mask::Mask;
 
 use crate::arrays::DecimalArray;
+use crate::dtype::BigCast;
 use crate::dtype::DecimalType;
 use crate::dtype::i256;
+use crate::match_each_decimal_value_type;
 
 macro_rules! try_downcast {
     ($array:expr, from: $src:ty, to: $($dst:ty),*) => {{
@@ -41,6 +47,44 @@ macro_rules! try_downcast {
     }};
 }
 
+/// Cast the array's physical values to `target`, preserving the logical decimal dtype and
+/// validity.
+///
+/// `mask` must be the materialized validity of `array`. Null slots are unconstrained by the
+/// [`DecimalArray`] invariants (only *non-null* values must fit the precision) and may hold
+/// bytes that do not fit `target`, so they are replaced with zero rather than cast.
+///
+/// # Errors
+///
+/// Returns an error if `array.validity()` fails or a non-null value does not fit `target`.
+pub fn cast_decimal_values(
+    array: &DecimalArray,
+    target: DecimalType,
+    mask: &Mask,
+) -> VortexResult<DecimalArray> {
+    let decimal_dtype = array.decimal_dtype();
+    let validity = array.validity()?;
+    match_each_decimal_value_type!(array.values_type(), |F| {
+        let from = array.buffer::<F>();
+        match_each_decimal_value_type!(target, |T| {
+            let values = from
+                .iter()
+                .enumerate()
+                .map(|(i, &v)| {
+                    if mask.value(i) {
+                        <T as BigCast>::from(v).ok_or_else(|| {
+                            vortex_err!("decimal value {v} does not fit values type {target}")
+                        })
+                    } else {
+                        Ok(T::default())
+                    }
+                })
+                .collect::<VortexResult<Buffer<T>>>()?;
+            Ok(DecimalArray::new::<T>(values, decimal_dtype, validity))
+        })
+    })
+}
+
 /// Attempt to narrow the decimal array to any smaller supported type.
 pub fn narrowed_decimal(decimal_array: DecimalArray) -> DecimalArray {
     match decimal_array.values_type() {
@@ -63,3 +107,45 @@ pub fn narrowed_decimal(decimal_array: DecimalArray) -> DecimalArray {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use vortex_buffer::BitBuffer;
+    use vortex_buffer::Buffer;
+    use vortex_error::VortexResult;
+    use vortex_mask::Mask;
+
+    use super::cast_decimal_values;
+    use crate::arrays::DecimalArray;
+    use crate::dtype::DecimalDType;
+    use crate::dtype::DecimalType;
+    use crate::validity::Validity;
+
+    #[test]
+    fn cast_zeroes_garbage_null_slots() -> VortexResult<()> {
+        let dt = DecimalDType::new(5, 2);
+        let validity = Validity::from(BitBuffer::from_iter([true, false, true]));
+        let arr = DecimalArray::new::<i64>(
+            Buffer::<i64>::copy_from([7i64, i64::MAX, -99_999]),
+            dt,
+            validity,
+        );
+        let mask = Mask::from_iter([true, false, true]);
+        let narrowed = cast_decimal_values(&arr, DecimalType::I32, &mask)?;
+        assert_eq!(narrowed.values_type(), DecimalType::I32);
+        assert_eq!(narrowed.buffer::<i32>().as_slice(), &[7, 0, -99_999]);
+        Ok(())
+    }
+
+    #[test]
+    fn cast_rejects_non_null_value_that_does_not_fit() {
+        let dt = DecimalDType::new(5, 2);
+        let arr = DecimalArray::new::<i64>(
+            Buffer::<i64>::copy_from([i64::MAX]),
+            dt,
+            Validity::NonNullable,
+        );
+        let mask = Mask::new_true(1);
+        assert!(cast_decimal_values(&arr, DecimalType::I32, &mask).is_err());
+    }
+}
diff --git a/vortex-row/src/codec/encoding.rs b/vortex-row/src/codec/encoding.rs
index c3e90641b2c..153f14c6015 100644
--- a/vortex-row/src/codec/encoding.rs
+++ b/vortex-row/src/codec/encoding.rs
@@ -4,6 +4,8 @@
 //! Encode pass leaf kernels: per-row byte writers for each canonical variant, plus the
 //! variable-length block body encoder.
 
+use vortex_array::arrays::decimal::cast_decimal_values;
+
 use super::*;
 
 pub(super) fn encode_null(
@@ -127,29 +129,18 @@ fn encode_primitive_typed<T: NativePType + RowEncode>(
 /// values. A `DecimalArray` may legally carry a wider `values_type` than its precision requires,
 /// so without this normalization the encode pass would write more bytes than the size pass
 /// reserved. The narrowing is always lossless because a decimal's precision bounds the magnitude
-/// of every valid value, so the precision-minimal type can represent it.
-fn narrow_decimal_to_smallest(arr: &DecimalArray) -> VortexResult<Option<DecimalArray>> {
-    let decimal_dtype = arr.decimal_dtype();
-    let target = DecimalType::smallest_decimal_value_type(&decimal_dtype);
+/// of every valid *non-null* value, so the precision-minimal type can represent it. Null slots
+/// are unconstrained and may hold values that do not fit; [`cast_decimal_values`] narrows them
+/// to zero instead of casting (the encoder zero-fills null bodies anyway).
+fn narrow_decimal_to_smallest(
+    arr: &DecimalArray,
+    mask: &vortex_mask::Mask,
+) -> VortexResult<Option<DecimalArray>> {
+    let target = DecimalType::smallest_decimal_value_type(&arr.decimal_dtype());
     if arr.values_type() == target {
         return Ok(None);
     }
-    let validity = arr.as_ref().validity()?;
-    let narrowed = match_each_decimal_value_type!(arr.values_type(), |P| {
-        let from = arr.buffer::<P>();
-        match_each_decimal_value_type!(target, |Q| {
-            DecimalArray::new::<Q>(narrow_decimal_buffer::<P, Q>(from), decimal_dtype, validity)
-        })
-    });
-    Ok(Some(narrowed))
-}
-
-/// Narrow a buffer of decimal values from type `F` to a smaller type `T`. Lossless because the
-/// caller only narrows to the precision-minimal type, which can represent every valid value.
-fn narrow_decimal_buffer<F: NativeDecimalType, T: NativeDecimalType>(from: Buffer<F>) -> Buffer<T> {
-    from.iter()
-        .map(|&v| T::from(v).vortex_expect("decimal value must fit its precision-minimal type"))
-        .collect()
+    cast_decimal_values(arr, target, mask).map(Some)
 }
 
 pub(super) fn encode_decimal(
@@ -162,9 +153,9 @@ pub(super) fn encode_decimal(
 ) -> VortexResult<()> {
     // Normalize to the precision-minimal physical type so the bytes we write match the width the
     // size pass reserved (see `narrow_decimal_to_smallest`).
-    let narrowed = narrow_decimal_to_smallest(arr)?;
-    let arr = narrowed.as_ref().unwrap_or(arr);
     let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let narrowed = narrow_decimal_to_smallest(arr, &mask)?;
+    let arr = narrowed.as_ref().unwrap_or(arr);
     match arr.values_type() {
         DecimalType::I8 => {
             encode_decimal_typed::<i8>(arr, &mask, field, row_offsets, col_offset, out)
diff --git a/vortex-row/src/codec/mod.rs b/vortex-row/src/codec/mod.rs
index d5c8afbcfb7..e1f63da2852 100644
--- a/vortex-row/src/codec/mod.rs
+++ b/vortex-row/src/codec/mod.rs
@@ -40,10 +40,8 @@ use vortex_array::dtype::DecimalType;
 use vortex_array::dtype::NativeDecimalType;
 use vortex_array::dtype::NativePType;
 use vortex_array::dtype::half::f16;
-use vortex_array::match_each_decimal_value_type;
 use vortex_array::match_each_native_ptype;
 use vortex_array::validity::Validity;
-use vortex_buffer::Buffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
index 8c36c66a95c..90720b818fb 100644
--- a/vortex-row/src/tests.rs
+++ b/vortex-row/src/tests.rs
@@ -656,6 +656,43 @@ fn decimal_wide_physical_storage_sort_order(#[case] descending: bool) -> VortexR
     Ok(())
 }
 
+/// Regression: a nullable decimal stored wider than its precision requires may hold arbitrary
+/// garbage in null slots (`DecimalArray` only constrains *non-null* values to the precision).
+/// The narrowing pass must zero null slots rather than cast them, so garbage that does not fit
+/// the precision-minimal type cannot make the encode panic.
+#[test]
+fn decimal_wide_storage_with_garbage_null_slot() -> VortexResult<()> {
+    use vortex_array::arrays::DecimalArray;
+    use vortex_array::dtype::DecimalDType;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::BitBuffer;
+    use vortex_buffer::Buffer;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // precision=5 -> minimal physical type I32, but stored as i64. Row 1 is null and its slot
+    // holds a value that fits neither i32 nor precision 5.
+    let dt = DecimalDType::new(5, 2);
+    let values: Vec<i64> = vec![7, i64::MAX, -99_999];
+    let validity = Validity::from(BitBuffer::from_iter([true, false, true]));
+    let col =
+        DecimalArray::new::<i64>(Buffer::<i64>::copy_from(&values), dt, validity).into_array();
+
+    let encoded = convert_columns(&[col], &[RowSortFieldOptions::ascending()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+    // sentinel(1) + i32(4) per row.
+    assert!(rows.iter().all(|r| r.len() == 5), "row lens: {rows:?}");
+    // The null row encodes as the canonical null (sentinel 0x00, zero body), so it sorts first
+    // and carries no trace of the garbage slot value.
+    assert_eq!(rows[1], vec![0x00, 0, 0, 0, 0]);
+    let mut sorted = rows.clone();
+    sorted.sort();
+    assert_eq!(
+        sorted,
+        vec![rows[1].clone(), rows[2].clone(), rows[0].clone()]
+    );
+    Ok(())
+}
+
 /// Lock-in reference test: encode the worked-example row from `docs/specs/row-encoding.md`
 /// (one row with every supported encoding family, all columns ascending nulls-first) and
 /// assert the exact encoded bytes. This pins the byte layout so any accidental change to the