From 658e954af754fd4c0694df73c8c3ea160bd9abff Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 8 Jun 2026 10:17:18 +0100 Subject: [PATCH 1/3] fix Signed-off-by: Joe Isaacs --- docs/specs/row-encoding.md | 378 +-------- vortex-row/src/codec.rs | 1257 ------------------------------ vortex-row/src/codec/encoding.rs | 593 ++++++++++++++ vortex-row/src/codec/mod.rs | 407 ++++++++++ vortex-row/src/codec/native.rs | 113 +++ vortex-row/src/codec/sizing.rs | 162 ++++ vortex-row/src/codec/varlen.rs | 104 +++ vortex-row/src/encode.rs | 38 +- vortex-row/src/encoder.rs | 47 +- vortex-row/src/lib.rs | 124 ++- vortex-row/src/options.rs | 54 +- vortex-row/src/size.rs | 46 +- vortex-row/src/tests.rs | 269 +++++-- 13 files changed, 1768 insertions(+), 1824 deletions(-) delete mode 100644 vortex-row/src/codec.rs create mode 100644 vortex-row/src/codec/encoding.rs create mode 100644 vortex-row/src/codec/mod.rs create mode 100644 vortex-row/src/codec/native.rs create mode 100644 vortex-row/src/codec/sizing.rs create mode 100644 vortex-row/src/codec/varlen.rs diff --git a/docs/specs/row-encoding.md b/docs/specs/row-encoding.md index 8fc3288f82b..c12b6a72712 100644 --- a/docs/specs/row-encoding.md +++ b/docs/specs/row-encoding.md @@ -7,7 +7,7 @@ logical tuple comparison of the input values under the configured row sort optio This is a schema-aware row-key format. The bytes do not contain type tags, field names, or sort options. Two encoded rows are comparable only when they were produced with the same -input schema and the same per-column `RowSortField` settings. +input schema and the same per-column `RowSortFieldOptions` settings. The row encoding is not the Vortex file format or scalar IPC format. It is an internal comparison representation used for sort keys and row-key operations. @@ -18,6 +18,11 @@ semantics may change between Vortex releases. Do not persist these bytes or depe a stable interchange format. 
::: +The **per-type byte layout** — sentinel tables, field options, and the encoding rules for each +supported type — lives in the `vortex-row` crate's module-level documentation, so it stays next +to the implementation. This page gives the order property, the notation, the order-preservation +argument, and a fully worked example row. + ## Order Property For a fixed schema with columns `c0, c1, ..., cn` and per-column sort fields @@ -70,374 +75,6 @@ For example: | `-5_i32`, before the signed sign-bit transform | `FF FF FF FB` | | `ordered = 0x80000000_u32` | `80 00 00 00` | -## Field Options - -Each input column has a `RowSortField`: - -```text -RowSortField { - descending: bool, - nulls_first: bool, -} -``` - -`descending` reverses the order of non-null values. `nulls_first` is independent of -`descending`, so nulls can sort before or after non-nulls in either direction. - -## Sentinel Summary - -Sentinels are single bytes that classify nullness and, for variable-width values, whether a -value is empty or non-empty. They are chosen so byte comparison can decide those categories -before comparing any value bytes. - -| Encoding family | Case | Ascending, nulls first | Descending, nulls first | Ascending, nulls last | Descending, nulls last | -| --- | --- | --- | --- | --- | --- | -| Fixed-width | Null | `0x00` | `0x00` | `0x02` | `0x02` | -| Fixed-width | Non-null | `0x01` | `0x01` | `0x01` | `0x01` | -| Variable-width | Null | `0x00` | `0x00` | `0xFF` | `0xFF` | -| Variable-width | Empty | `0x01` | `0xFE` | `0x01` | `0xFE` | -| Variable-width | Non-empty | `0x02` | `0xFD` | `0x02` | `0xFD` | - -Fixed-width sentinels are used by null, boolean, primitive, decimal, struct, and fixed-size -list values. Variable-width sentinels are used by UTF-8 and binary values. 
- -## Fixed-Width Sentinels - -Every fixed-width value starts with a one-byte sentinel: - -| Case | Sentinel | -| --- | --- | -| Null, `nulls_first = true` | `0x00` | -| Non-null | `0x01` | -| Null, `nulls_first = false` | `0x02` | - -The sentinel is not inverted for descending order. Only the non-null value bytes are -inverted. This keeps null placement independent from sort direction. - -For fixed-width nulls, the sentinel is followed by zero-filled value bytes. This gives fixed -types a constant encoded width for every row. - -## Variable-Width Sentinels - -UTF-8 and binary values use three leading sentinels. The separate empty and non-empty -sentinels are important: they ensure the first byte decides null, empty, or non-empty before -later columns can affect comparison. - -| Case | Ascending | Descending | -| --- | --- | --- | -| Null, `nulls_first = true` | `0x00` | `0x00` | -| Empty | `0x01` | `0xFE` | -| Non-empty | `0x02` | `0xFD` | -| Null, `nulls_first = false` | `0xFF` | `0xFF` | - -The null sentinel is not inverted by descending order. Empty and non-empty sentinels are -inverted so non-null value order is reversed while null placement stays fixed. - -## Null - -`Null` values have no body: - -```text -fixed_null_sentinel -``` - -The sentinel is `0x00` for nulls-first and `0x02` for nulls-last. - -## Boolean - -Booleans are fixed-width and use one value byte: - -```text -sentinel || value_byte -``` - -For ascending order: - -| Value | Value byte | -| --- | --- | -| `false` | `0x01` | -| `true` | `0x02` | - -For descending order, the value byte is inverted: - -| Value | Value byte | -| --- | --- | -| `true` | `0xFD` | -| `false` | `0xFE` | - -Null booleans encode as: - -```text -null_sentinel || 0x00 -``` - -## Unsigned Integers - -Supported unsigned primitive types are `u8`, `u16`, `u32`, and `u64`. 
- -Ascending encoding: - -```text -0x01 || BE(value) -``` - -Descending encoding: - -```text -0x01 || !BE(value) -``` - -Big-endian byte order makes lexicographic byte order match numeric order for fixed-width -unsigned integers. Bitwise complement reverses that order for descending fields. - -Null unsigned integers encode as: - -```text -null_sentinel || zero(width(T)) -``` - -## Signed Integers - -Supported signed primitive PTypes are `i8`, `i16`, `i32`, and `i64`. The same signed -integer transform is also used for `i128` decimal storage. - -Signed integers first flip the sign bit of their big-endian two's-complement -representation: - -```text -ordered = BE(value) -ordered[0] = ordered[0] XOR 0x80 -``` - -Ascending encoding: - -```text -0x01 || ordered -``` - -Descending encoding: - -```text -0x01 || !ordered -``` - -Flipping the sign bit maps the signed numeric range into unsigned byte order: - -```text -negative values -> 0x00..0x7F prefix range -non-negative values -> 0x80..0xFF prefix range -``` - -Null signed integers encode as: - -```text -null_sentinel || zero(width(T)) -``` - -## Floating Point - -Supported floating primitive types are `f16`, `f32`, and `f64`. - -The encoder treats the IEEE bit pattern as an unsigned integer and applies a sign-aware -transform before writing big-endian bytes. - -For a floating value with raw bits `bits`: - -```text -if sign_bit(bits) == 0: - ordered = bits XOR sign_bit_mask -else: - ordered = bits XOR all_ones -``` - -Ascending encoding: - -```text -0x01 || BE(ordered) -``` - -Descending encoding: - -```text -0x01 || !BE(ordered) -``` - -This produces a total-order-style byte ordering where negative values sort before positive -values, and `-0.0` sorts before `+0.0`. NaN values are ordered by their raw bit patterns -under the same transform; they are not canonicalized by row encoding. 
- -Null floats encode as: - -```text -null_sentinel || zero(width(T)) -``` - -## Decimal - -Decimals are encoded as their scaled signed integer storage value. The selected storage -width is the smallest decimal value type for the decimal precision: - -| Precision | Storage | -| --- | --- | -| `1..=2` | `i8` | -| `3..=4` | `i16` | -| `5..=9` | `i32` | -| `10..=18` | `i64` | -| `19..=38` | `i128` | - -The storage integer is encoded with the signed integer encoding described above. Decimal -columns have one precision and scale, so ordering the scaled integer storage values matches -ordering the decimal values in that column. - -`Decimal256` is not supported by row encoding. - -## UTF-8 and Binary - -UTF-8 and binary values use the variable-width sentinels described above. - -Null: - -```text -varlen_null_sentinel -``` - -Empty: - -```text -varlen_empty_sentinel -``` - -Non-empty: - -```text -varlen_non_empty_sentinel || varlen_body(bytes) -``` - -For UTF-8, `bytes` are the UTF-8 bytes of the string. For binary, `bytes` are the raw binary -bytes. The byte ordering is therefore UTF-8 byte lexicographic order for strings and raw byte -lexicographic order for binary. - -### Variable-Length Body - -Non-empty variable-length values are encoded in blocks. Each block contains 32 data bytes -followed by one marker byte: - -```text -data[0..32] || marker -``` - -For ascending order: - -- Every non-final full block uses marker `0xFF`. -- The final block is padded with zeros to 32 data bytes. -- The final marker is the number of real data bytes in the final block, in `1..=32`. - -For descending order: - -- Every data byte is inverted. -- Every non-final full-block marker is `0x00`, the inverse of `0xFF`. -- The final block is padded with `0xFF`, the inverse of ascending zero padding. -- The final marker is inverted: `final_len XOR 0xFF`. - -If the input length is exactly a multiple of 32, the final block has marker `32`, and earlier -blocks, if any, use the continuation marker. 
- -This block structure preserves prefix order. For example, in ascending order a shorter value -that is a prefix of a longer value reaches its final marker before the longer value reaches -the continuation marker. Since final length markers in `1..=32` are less than `0xFF`, the -shorter prefix sorts first. Descending order inverts the same bytes and reverses that result. - -## Struct - -A struct is encoded as: - -```text -struct_sentinel || field_0 || field_1 || ... || field_n -``` - -The outer sentinel is the fixed-width sentinel: - -- `0x01` for a non-null struct -- `0x00` or `0x02` for a null struct, depending on null placement - -For a non-null struct, each field is encoded recursively in schema order using the same -`RowSortField` as the parent struct column. - -For a null struct, the body is canonicalized so two null parent rows produce byte-equal -output even if their physical child arrays contain different values: - -- Fixed-width children contribute their fixed-width null encoding. -- Variable-width children contribute exactly one child null sentinel byte. - -A struct has fixed row width only when all of its fields have fixed row width. If any child -is variable-width, the struct is variable-width. - -## Fixed-Size List - -A fixed-size list with `N` elements is encoded as: - -```text -list_sentinel || element_0 || element_1 || ... || element_N-1 -``` - -The outer sentinel is the fixed-width sentinel: - -- `0x01` for a non-null list -- `0x00` or `0x02` for a null list, depending on null placement - -For a non-null fixed-size list, elements are encoded recursively in element order using the -same `RowSortField` as the parent list column. - -For a null fixed-size list, the body is canonicalized: - -- Fixed-width elements contribute their fixed-width null encoding, repeated `N` times. -- Variable-width elements contribute one child null sentinel byte per element. - -A fixed-size list has fixed row width only when its element type has fixed row width. 
- -## Nested Values - -Nested structs and fixed-size lists apply the same rules recursively. Each nullable parent -adds its own outer sentinel. Null parents canonicalize their child body before comparison can -observe underlying child values. - -## Unsupported Types - -The current row encoder rejects types for which it does not define byte-sort semantics: - -| Type | Reason | -| --- | --- | -| Variable-size `List` | No row encoding order is defined. | -| `Variant` | No row encoding order is defined. | -| `Union` | No row encoding order is defined. | -| `Extension` | No row encoding order is defined. | -| `Decimal256` | Encoding is not implemented. | - -The absence of these encodings is intentional. Adding one requires defining both the logical -ordering and the exact byte representation that preserves that ordering. - -Temporal extensions could be added later by normalizing them to storage arrays at the -row-encoder boundary, once the supported temporal ordering contract is made explicit. - -## Size and Output Layout - -The encoded output is a `ListView`: - -```text -elements: contiguous u8 buffer containing all row bytes -offsets: per-row start offset into elements -sizes: per-row byte length -``` - -Rows are not self-describing without their `sizes`. A variable-width field can make one row -longer than another, and the enclosing `ListView` supplies the row boundary. - -The encoder computes sizes before writing bytes: - -- Fixed-width columns contribute a constant width per row. -- Variable-width columns contribute data-dependent widths per row. -- The final `sizes` array is also used as the per-row write cursor during encoding. - ## Why Concatenation Works For each supported field type, the field encoder is an order embedding from logical values to @@ -469,7 +106,8 @@ controlled solely by `nulls_first`. ## Example Row This example shows one row that contains every supported encoding family. All columns use -ascending order with nulls first. 
+ascending order with nulls first. (This row is locked in by the `reference_row_bytes_match_spec` +test in `vortex-row`.) Schema: diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs deleted file mode 100644 index 4848a750e52..00000000000 --- a/vortex-row/src/codec.rs +++ /dev/null @@ -1,1257 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants. -//! -//! The encoded byte format produces a lexicographically byte-comparable representation: -//! comparing the byte slices of two encoded rows yields the same ordering as the -//! original logical (tuple) comparison of their values, modulo nulls placement and -//! descending-ness as configured by [`RowSortField`]. -//! -//! Conventions: -//! - Every fixed-width value is preceded by a 1-byte sentinel that orders nulls relative to -//! non-nulls. For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF), -//! not the sentinel. -//! - Variable-length (Utf8, Binary) values use **three** distinct leading sentinels — one each -//! for null, empty, and non-empty — so byte comparison at position 0 fully categorizes the -//! value and column-byte boundaries stay aligned across rows. See -//! [`varlen_null_sentinel`], [`varlen_empty_sentinel`], [`varlen_non_empty_sentinel`]. -//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types. -//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top -//! bit; negative flips all bits. -//! - Nullable structs and fixed-size lists encode null parent rows with a **canonical null -//! body** so two null parent rows produce byte-equal encodings: fixed-width children -//! contribute their fixed null encoding, and variable-width children collapse to a single -//! null sentinel byte. 
- -use vortex_array::Canonical; -use vortex_array::ExecutionCtx; -use vortex_array::arrays::BoolArray; -use vortex_array::arrays::DecimalArray; -use vortex_array::arrays::FixedSizeListArray; -use vortex_array::arrays::NullArray; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::StructArray; -use vortex_array::arrays::VarBinViewArray; -use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; -use vortex_array::arrays::struct_::StructArrayExt; -use vortex_array::dtype::DType; -use vortex_array::dtype::DecimalType; -use vortex_array::dtype::NativePType; -use vortex_array::dtype::half::f16; -use vortex_array::match_each_native_ptype; -use vortex_array::validity::Validity; -use vortex_error::VortexExpect; -use vortex_error::VortexResult; -use vortex_error::vortex_bail; - -use crate::options::RowSortField; - -/// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte). -pub(crate) const BOOL_ENCODED_SIZE: u32 = 2; - -/// Block size used in the variable-length encoding. -pub(crate) const VARLEN_BLOCK_SIZE: usize = 32; -/// Total bytes per varlen block including the trailing continuation marker. -pub(crate) const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1; -const VARLEN_BLOCK_TOTAL_U32: u32 = 33; - -/// Size in bytes of an encoded null varlen value (just the sentinel byte). -pub(crate) const VARLEN_NULL_SIZE: u32 = 1; -/// Size in bytes of an encoded empty varlen value (just the sentinel byte). -pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1; - -/// Returns the size in bytes of the encoded form of a non-empty variable-length value. -/// -/// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1 -/// continuation/length byte). Callers must use [`VARLEN_NULL_SIZE`] for null values and -/// [`VARLEN_EMPTY_SIZE`] for empty values. A `u32` always suffices because a `BinaryView` -/// length is itself a `u32`, so `blocks <= ceil(u32::MAX / 32) < 2^27`. 
-#[inline] -fn encoded_size_for_non_empty_varlen(len: usize) -> u32 { - debug_assert!(len > 0); - let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE)) - .vortex_expect("varlen block count must fit in u32"); - 1 + blocks * VARLEN_BLOCK_TOTAL_U32 -} - -/// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel). -#[inline] -const fn encoded_size_for_fixed(value_bytes: u32) -> u32 { - 1 + value_bytes -} - -fn byte_width_u32(width: usize) -> u32 { - u32::try_from(width).vortex_expect("native byte width must fit in u32") -} - -/// Pre-resolved per-row validity for the row encoders. -/// -/// Encoders pattern-match on this once before their inner loop so the no-nulls fast path -/// avoids per-row `mask.value(i)` branches entirely, and the nullable path materializes the -/// mask exactly once. -pub(crate) enum ValidityKind { - /// Column statically has no nulls (`Validity::NonNullable` or `AllValid`); no mask needed. - AllValid, - /// Column may have nulls; carries the materialized per-row mask. - Mask(vortex_mask::Mask), -} - -/// Resolve a [`Validity`] into a [`ValidityKind`], materializing the mask only when the column -/// may actually have nulls. -#[inline] -pub(crate) fn resolve_validity( - validity: Validity, - len: usize, - ctx: &mut ExecutionCtx, -) -> VortexResult { - Ok(match validity { - Validity::NonNullable | Validity::AllValid => ValidityKind::AllValid, - other => ValidityKind::Mask(other.execute_mask(len, ctx)?), - }) -} - -/// Returns the sentinel byte for a null varlen value. -/// -/// The choice is positional (0x00 when nulls sort first, 0xFF when nulls sort last) and -/// independent of `descending`, matching the convention used by `arrow-row`. -#[inline] -fn varlen_null_sentinel(field: RowSortField) -> u8 { - if field.nulls_first { 0x00 } else { 0xFF } -} - -/// Returns the sentinel byte for an empty varlen value. -/// -/// Equal to `0x01` in ascending mode and `!0x01 = 0xFE` in descending mode. 
-#[inline] -fn varlen_empty_sentinel(field: RowSortField) -> u8 { - if field.descending { !0x01u8 } else { 0x01u8 } -} - -/// Returns the sentinel byte for a non-empty varlen value. -/// -/// Equal to `0x02` in ascending mode and `!0x02 = 0xFD` in descending mode. -#[inline] -fn varlen_non_empty_sentinel(field: RowSortField) -> u8 { - if field.descending { !0x02u8 } else { 0x02u8 } -} - -/// Returns the single-byte null sentinel used when a child contributes its canonical null -/// encoding inside a null parent struct/FSL row. -/// -/// For varlen children that is the varlen null sentinel; for everything else (including -/// nested struct/FSL when used as a variable-width child) it is the fixed-width null sentinel. -fn child_canonical_null_byte(child_dtype: &DType, field: RowSortField) -> u8 { - match child_dtype { - DType::Utf8(_) | DType::Binary(_) => varlen_null_sentinel(field), - _ => field.null_sentinel(), - } -} - -/// Per-row width classification for a column. -/// -/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless -/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary, -/// List, or any composite that recurses through a variable-width field). -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) enum RowWidth { - /// Per-row width is the same constant for every row in the column. - Fixed(u32), - /// Per-row width is data-dependent. - Variable, -} - -/// Classify a column's per-row encoded width by inspecting only its [`DType`]. -/// -/// Returns `Fixed(w)` when every row encodes to exactly `w` bytes (sentinel + value), -/// regardless of null-ness or value. Returns `Variable` when per-row sizes depend on the -/// data. -/// -/// Classification does not depend on the [`RowSortField`]: null-vs-non-null encoding width is -/// the same for fixed-width types (the sentinel byte plus zero-fill for nulls). 
-/// -/// # Errors -/// -/// Returns an error for dtypes that the row encoder does not support. Width arithmetic that -/// would overflow `u32` is also reported as an error rather than silently saturating. -pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { - match dtype { - DType::Null => Ok(RowWidth::Fixed(1)), - DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)), - DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( - ptype.byte_width(), - )))), - DType::Decimal(dt, _) => { - let vt = DecimalType::smallest_decimal_value_type(dt); - if matches!(vt, DecimalType::I256) { - vortex_bail!("row encoding for Decimal256 is not yet implemented"); - } - Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( - vt.byte_width(), - )))) - } - DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable), - DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? { - // FSL is fixed iff its element type is fixed. Add a sentinel byte for the FSL - // itself, then `n` copies of the element width. - RowWidth::Fixed(w) => { - let body = w - .checked_mul(*n) - .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?; - let total = body - .checked_add(1) - .ok_or_else(|| vortex_error::vortex_err!("FSL row width overflows u32"))?; - Ok(RowWidth::Fixed(total)) - } - RowWidth::Variable => Ok(RowWidth::Variable), - }, - DType::Struct(fields, _) => { - // Struct is fixed iff all its fields are fixed; sum their widths plus a sentinel. - let mut total: u32 = 1; // outer sentinel - for field_dtype in fields.fields() { - match row_width_for_dtype(&field_dtype)? { - RowWidth::Fixed(w) => { - total = total.checked_add(w).ok_or_else(|| { - vortex_error::vortex_err!("Struct row width overflows u32") - })?; - } - RowWidth::Variable => return Ok(RowWidth::Variable), - } - } - Ok(RowWidth::Fixed(total)) - } - DType::List(..) 
=> { - vortex_bail!( - "row encoding does not support variable-size List arrays (no well-defined ordering)" - ) - } - DType::Variant(_) => { - vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") - } - DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"), - dtype => vortex_bail!("row encoding does not support dtype: {dtype:?}"), - } -} - -/// Compute the per-row size in bytes for the given canonical view, adding into `sizes`. -/// -/// `sizes` is expected to be initialized (typically zeroed). This function *adds* the -/// per-row size to each entry so multiple columns can accumulate into the same buffer. -/// -/// # Errors -/// -/// Returns an error for unsupported canonical variants. -pub(crate) fn field_size( - canonical: &Canonical, - field: RowSortField, - sizes: &mut [u32], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - match canonical { - Canonical::Null(arr) => add_size_null(arr, sizes), - Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1)), - Canonical::Primitive(arr) => add_size_primitive(arr, sizes), - Canonical::Decimal(arr) => add_size_decimal(arr, sizes), - Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?, - Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?, - Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?, - Canonical::List(_) => vortex_bail!( - "row encoding does not support canonical List arrays: {:?}", - canonical.dtype() - ), - Canonical::Variant(_) => { - vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") - } - unsupported => { - vortex_bail!( - "row encoding does not support canonical array: {:?}", - unsupported.dtype() - ) - } - } - Ok(()) -} - -/// Encode a fixed-width column at arithmetic offsets, without reading or writing any per-row -/// cursor. 
-/// -/// For row `i`, the column's bytes are written starting at `i * row_stride + col_prefix -/// (+ var_prefix[i])`, where `var_prefix` is the exclusive prefix sum of the varlen -/// contributions (`None` when the row layout has no variable-length columns). This is the -/// fast path for fixed-width columns that appear before any varlen column, so their -/// within-row position is a constant offset rather than a running cursor. -/// -/// For primitive columns in the pure-fixed case it uses a `chunks_exact_mut` hot loop that -/// removes the per-row offset/cursor indirection (matching `arrow-row`'s `encode_not_null`). -/// All other types reuse [`field_encode`] at the materialized offsets, so the bytes written -/// are byte-identical to the cursor path. -#[allow(clippy::too_many_arguments)] -pub(crate) fn field_encode_fixed_arithmetic( - canonical: &Canonical, - field: RowSortField, - col_prefix: u32, - row_stride: u32, - var_prefix: Option<&[u32]>, - nrows: usize, - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - if var_prefix.is_none() - && let Canonical::Primitive(arr) = canonical - { - return encode_primitive_arith(arr, field, col_prefix, row_stride, out, ctx); - } - - // General path: materialize this column's per-row start offsets and reuse the cursor - // encoder with zero-initialized cursors, so every row is written at its arithmetic - // offset with the exact same bytes the cursor path would produce. 
- let mut offsets: Vec = Vec::with_capacity(nrows); - let mut base = col_prefix; - match var_prefix { - None => { - for _ in 0..nrows { - offsets.push(base); - base = base.wrapping_add(row_stride); - } - } - Some(vp) => { - for &p in vp.iter().take(nrows) { - offsets.push(base.wrapping_add(p)); - base = base.wrapping_add(row_stride); - } - } - } - let mut cursors = vec![0u32; nrows]; - field_encode(canonical, field, &offsets, &mut cursors, out, ctx) -} - -/// Encode each row's bytes for the given canonical view into `out`, writing starting at -/// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of -/// bytes written. -/// -/// After this call returns successfully, `cursors[i]` will have advanced by exactly the -/// per-row contribution previously computed by [`field_size`] for the same column. -pub(crate) fn field_encode( - canonical: &Canonical, - field: RowSortField, - offsets: &[u32], - cursors: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - match canonical { - Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out), - Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?, - Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?, - Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?, - Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?, - Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?, - Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?, - Canonical::List(_) => vortex_bail!( - "row encoding does not support canonical List arrays: {:?}", - canonical.dtype() - ), - Canonical::Variant(_) => { - vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") - } - unsupported => { - vortex_bail!( - "row encoding does not support canonical array: {:?}", - 
unsupported.dtype() - ) - } - } - Ok(()) -} - -fn add_size_const(sizes: &mut [u32], add: u32) { - for s in sizes.iter_mut() { - *s += add; - } -} - -fn add_size_null(arr: &NullArray, sizes: &mut [u32]) { - debug_assert_eq!(arr.len(), sizes.len()); - // Just a sentinel byte per row. - for s in sizes.iter_mut() { - *s += 1; - } -} - -fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) { - let width = byte_width_u32(arr.ptype().byte_width()); - add_size_const(sizes, encoded_size_for_fixed(width)); -} - -fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) { - let width = byte_width_u32(arr.values_type().byte_width()); - add_size_const(sizes, encoded_size_for_fixed(width)); -} - -fn add_size_varbinview( - arr: &VarBinViewArray, - sizes: &mut [u32], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let views = arr.views(); - match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? { - ValidityKind::AllValid => { - for (i, view) in views.iter().enumerate() { - let contribution = if view.is_empty() { - VARLEN_EMPTY_SIZE - } else { - encoded_size_for_non_empty_varlen(view.len() as usize) - }; - sizes[i] = sizes[i] - .checked_add(contribution) - .vortex_expect("per-row size overflow"); - } - } - ValidityKind::Mask(mask) => { - for (i, view) in views.iter().enumerate() { - let contribution = if !mask.value(i) { - VARLEN_NULL_SIZE - } else if view.is_empty() { - VARLEN_EMPTY_SIZE - } else { - encoded_size_for_non_empty_varlen(view.len() as usize) - }; - sizes[i] = sizes[i] - .checked_add(contribution) - .vortex_expect("per-row size overflow"); - } - } - } - Ok(()) -} - -fn add_size_struct( - arr: &StructArray, - field: RowSortField, - sizes: &mut [u32], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let n = arr.len(); - let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; - // Outer sentinel: 1 byte per row. 
- for s in sizes.iter_mut() { - *s = s.checked_add(1).vortex_expect("per-row size overflow"); - } - // Each child contributes its per-row size when the parent is non-null, and a canonical - // null contribution when the parent is null. For fixed-width children both are equal, - // so we can simply add the fixed width to every row. For variable-width children the - // null contribution collapses to 1 byte, ensuring null parent rows have a constant body. - for child in arr.iter_unmasked_fields() { - match row_width_for_dtype(child.dtype())? { - RowWidth::Fixed(w) => add_size_const(sizes, w), - RowWidth::Variable => { - let canonical = child.clone().execute::(ctx)?; - let mut child_sizes = vec![0u32; n]; - field_size(&canonical, field, &mut child_sizes, ctx)?; - for i in 0..n { - let contribution = if mask.value(i) { child_sizes[i] } else { 1u32 }; - sizes[i] = sizes[i] - .checked_add(contribution) - .vortex_expect("per-row size overflow"); - } - } - } - } - Ok(()) -} - -fn add_size_fsl( - arr: &FixedSizeListArray, - field: RowSortField, - sizes: &mut [u32], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let n = arr.len(); - debug_assert_eq!(n, sizes.len()); - let list_size = arr.list_size() as usize; - let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; - let elem_dtype = arr.elements().dtype(); - // Outer sentinel: 1 byte per row. - for s in sizes.iter_mut() { - *s = s.checked_add(1).vortex_expect("per-row size overflow"); - } - match row_width_for_dtype(elem_dtype)? { - RowWidth::Fixed(w) => { - // Each row has `list_size` fixed-width elements regardless of null parent mask. 
- let body = w - .checked_mul(u32::try_from(list_size).vortex_expect("list_size fits u32")) - .vortex_expect("FSL body width overflow"); - add_size_const(sizes, body); - } - RowWidth::Variable => { - let elements = arr.elements().clone().execute::(ctx)?; - debug_assert_eq!(elements.len(), n * list_size); - let mut elem_sizes = vec![0u32; n * list_size]; - field_size(&elements, field, &mut elem_sizes, ctx)?; - for i in 0..n { - let body: u32 = if mask.value(i) { - let base = i * list_size; - let mut sum: u32 = 0; - for j in 0..list_size { - sum = sum - .checked_add(elem_sizes[base + j]) - .vortex_expect("FSL row body overflow"); - } - sum - } else { - // Canonical null body for FSL with variable element: one null sentinel - // per element. (Each element contributes `child_null_width = 1`.) - u32::try_from(list_size).vortex_expect("list_size fits u32") - }; - sizes[i] = sizes[i] - .checked_add(body) - .vortex_expect("FSL per-row size overflow"); - } - } - } - Ok(()) -} - -fn encode_null( - arr: &NullArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], -) { - let sentinel = field.null_sentinel(); - for i in 0..arr.len() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - out[pos] = sentinel; - col_offset[i] += 1; - } -} - -fn encode_bool( - arr: &BoolArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let bits = arr.clone().into_bit_buffer(); - let non_null = field.non_null_sentinel(); - let xor = if field.descending { 0xFF } else { 0x00 }; - match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? 
{ - ValidityKind::AllValid => { - for i in 0..bits.len() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - out[pos] = non_null; - // false=0x01, true=0x02 so false < true; XOR for descending - let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 }; - out[pos + 1] = raw ^ xor; - col_offset[i] += BOOL_ENCODED_SIZE; - } - } - ValidityKind::Mask(mask) => { - let null = field.null_sentinel(); - for i in 0..bits.len() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - if mask.value(i) { - out[pos] = non_null; - let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 }; - out[pos + 1] = raw ^ xor; - } else { - out[pos] = null; - out[pos + 1] = 0; - } - col_offset[i] += BOOL_ENCODED_SIZE; - } - } - } - Ok(()) -} - -fn encode_primitive( - arr: &PrimitiveArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - match_each_native_ptype!(arr.ptype(), |T| { - encode_primitive_typed::(arr, field, row_offsets, col_offset, out, ctx)?; - }); - Ok(()) -} - -fn encode_primitive_typed( - arr: &PrimitiveArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let slice: &[T] = arr.as_slice(); - let non_null = field.non_null_sentinel(); - let value_bytes = size_of::(); - let stride = encoded_size_for_fixed(byte_width_u32(value_bytes)); - match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? 
{ - ValidityKind::AllValid => { - for (i, &v) in slice.iter().enumerate() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - out[pos] = non_null; - v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); - col_offset[i] += stride; - } - } - ValidityKind::Mask(mask) => { - let null = field.null_sentinel(); - for (i, &v) in slice.iter().enumerate() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - if mask.value(i) { - out[pos] = non_null; - v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); - } else { - out[pos] = null; - // Zero-fill the value bytes. - for b in &mut out[pos + 1..pos + 1 + value_bytes] { - *b = 0; - } - } - col_offset[i] += stride; - } - } - } - Ok(()) -} - -fn encode_decimal( - arr: &DecimalArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; - match arr.values_type() { - DecimalType::I8 => { - encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) - } - DecimalType::I16 => { - encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) - } - DecimalType::I32 => { - encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) - } - DecimalType::I64 => { - encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) - } - DecimalType::I128 => { - encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) - } - DecimalType::I256 => { - vortex_bail!("row encoding for Decimal256 is not yet implemented") - } - } - Ok(()) -} - -fn encode_decimal_typed( - arr: &DecimalArray, - mask: &vortex_mask::Mask, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], -) where - T: vortex_array::dtype::NativeDecimalType + RowEncode, -{ - let non_null = field.non_null_sentinel(); - let null = field.null_sentinel(); - let value_bytes = size_of::(); - let 
total = encoded_size_for_fixed(byte_width_u32(value_bytes)); - let slice = arr.buffer::(); - for i in 0..slice.len() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - if mask.value(i) { - out[pos] = non_null; - slice[i].encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); - } else { - out[pos] = null; - for b in &mut out[pos + 1..pos + 1 + value_bytes] { - *b = 0; - } - } - col_offset[i] += total; - } -} - -fn encode_varbinview( - arr: &VarBinViewArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let null_byte = varlen_null_sentinel(field); - let empty_byte = varlen_empty_sentinel(field); - let non_empty_byte = varlen_non_empty_sentinel(field); - let descending = field.descending; - - let views = arr.views(); - // Cache the data-buffer slices once. Inlined views (len <= 12) carry their bytes inline, - // so they never touch `buffers`; referenced views index into the pre-validated buffer at - // `offset..offset + len`. Walking views directly avoids the per-row bounds and branch work - // of `with_iterator`. - let buffers: smallvec::SmallVec<[&[u8]; 4]> = (0..arr.data_buffers().len()) - .map(|i| arr.buffer(i).as_slice()) - .collect(); - - match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? 
{ - ValidityKind::AllValid => { - for (i, view) in views.iter().enumerate() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - let len = view.len() as usize; - if len == 0 { - out[pos] = empty_byte; - col_offset[i] += VARLEN_EMPTY_SIZE; - continue; - } - let bytes: &[u8] = if view.is_inlined() { - view.as_inlined().value() - } else { - let r = view.as_view(); - let off = r.offset as usize; - &buffers[r.buffer_index as usize][off..off + len] - }; - out[pos] = non_empty_byte; - let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending); - col_offset[i] += 1 + written; - } - } - ValidityKind::Mask(mask) => { - for (i, view) in views.iter().enumerate() { - let pos = (row_offsets[i] + col_offset[i]) as usize; - if !mask.value(i) { - out[pos] = null_byte; - col_offset[i] += VARLEN_NULL_SIZE; - continue; - } - let len = view.len() as usize; - if len == 0 { - out[pos] = empty_byte; - col_offset[i] += VARLEN_EMPTY_SIZE; - continue; - } - let bytes: &[u8] = if view.is_inlined() { - view.as_inlined().value() - } else { - let r = view.as_view(); - let off = r.offset as usize; - &buffers[r.buffer_index as usize][off..off + len] - }; - out[pos] = non_empty_byte; - let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending); - col_offset[i] += 1 + written; - } - } - } - Ok(()) -} - -fn encode_struct( - arr: &StructArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let n = arr.len(); - let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; - let non_null = field.non_null_sentinel(); - let null = field.null_sentinel(); - - // Write the outer sentinel for each row. - for i in 0..n { - let pos = (row_offsets[i] + col_offset[i]) as usize; - out[pos] = if mask.value(i) { non_null } else { null }; - col_offset[i] += 1; - } - - // Encode each child. 
For non-null parent rows the child contributes its actual encoding; - // for null parent rows the child contributes its canonical null encoding so that two null - // parent rows produce byte-equal output regardless of underlying child values. - for child in arr.iter_unmasked_fields() { - match row_width_for_dtype(child.dtype())? { - RowWidth::Fixed(w) => { - let canonical = child.clone().execute::(ctx)?; - field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?; - // Replace null parent rows with the canonical null encoding (the same as a - // child-level null: null sentinel followed by zero-padded value bytes). - let null_byte = child_canonical_null_byte(child.dtype(), field); - for i in 0..n { - if !mask.value(i) { - let end = (row_offsets[i] + col_offset[i]) as usize; - let start = end - w as usize; - out[start] = null_byte; - for b in &mut out[start + 1..end] { - *b = 0; - } - } - } - } - RowWidth::Variable => { - encode_variable_child(child, field, &mask, row_offsets, col_offset, out, ctx)?; - } - } - } - - Ok(()) -} - -fn encode_fsl( - arr: &FixedSizeListArray, - field: RowSortField, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let nrows = arr.len(); - let list_size = arr.list_size() as usize; - let mask = arr.as_ref().validity()?.execute_mask(nrows, ctx)?; - let non_null = field.non_null_sentinel(); - let null = field.null_sentinel(); - let elem_dtype = arr.elements().dtype().clone(); - - // Outer sentinel. - for i in 0..nrows { - let pos = (row_offsets[i] + col_offset[i]) as usize; - out[pos] = if mask.value(i) { non_null } else { null }; - col_offset[i] += 1; - } - - match row_width_for_dtype(&elem_dtype)? { - RowWidth::Fixed(w) => { - // Fixed-width elements: encode the elements array directly (its length is - // nrows * list_size) using a derived (offsets, cursors) pair. Then overwrite - // the body of null parent rows with the canonical null encoding per element. 
- let elements = arr.elements().clone().execute::(ctx)?; - debug_assert_eq!(elements.len(), nrows * list_size); - let list_size_u32 = arr.list_size(); - let row_body_bytes = w - .checked_mul(list_size_u32) - .vortex_expect("FSL body width overflow"); - let mut elem_offsets = vec![0u32; nrows * list_size]; - for i in 0..nrows { - let base = row_offsets[i] + col_offset[i]; - for j in 0u32..list_size_u32 { - elem_offsets[i * list_size + j as usize] = base + j * w; - } - } - let mut elem_cursors = vec![0u32; nrows * list_size]; - field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?; - for i in 0..nrows { - col_offset[i] = col_offset[i] - .checked_add(row_body_bytes) - .vortex_expect("FSL row body overflow"); - } - // Canonical null body for null parent rows: one null encoding per element. - let null_byte = child_canonical_null_byte(&elem_dtype, field); - let elem_width = w as usize; - for i in 0..nrows { - if !mask.value(i) { - let end = (row_offsets[i] + col_offset[i]) as usize; - let start = end - row_body_bytes as usize; - let mut pos = start; - for _ in 0..list_size { - out[pos] = null_byte; - for b in &mut out[pos + 1..pos + elem_width] { - *b = 0; - } - pos += elem_width; - } - } - } - } - RowWidth::Variable => { - // Variable-width elements: for null parent rows the canonical body is exactly - // `list_size` null sentinel bytes (one per element). For non-null parent rows, - // encode each element via a scratch buffer and copy into out. 
- let elements = arr.elements().clone().execute::(ctx)?; - debug_assert_eq!(elements.len(), nrows * list_size); - let mut elem_sizes = vec![0u32; nrows * list_size]; - field_size(&elements, field, &mut elem_sizes, ctx)?; - let total: u64 = elem_sizes.iter().map(|&s| u64::from(s)).sum(); - let total_usize = - usize::try_from(total).vortex_expect("FSL scratch buffer size fits usize"); - let mut scratch = vec![0u8; total_usize]; - let mut scratch_offsets = Vec::with_capacity(nrows * list_size); - let mut acc: u32 = 0; - for &s in &elem_sizes { - scratch_offsets.push(acc); - acc = acc - .checked_add(s) - .vortex_expect("FSL scratch offset overflow"); - } - let mut scratch_cursors = vec![0u32; nrows * list_size]; - field_encode( - &elements, - field, - &scratch_offsets, - &mut scratch_cursors, - &mut scratch, - ctx, - )?; - let null_byte = child_canonical_null_byte(&elem_dtype, field); - for i in 0..nrows { - let dst = (row_offsets[i] + col_offset[i]) as usize; - if mask.value(i) { - let mut body_bytes: u32 = 0; - for j in 0..list_size { - let k = i * list_size + j; - let src = scratch_offsets[k] as usize; - let sz = elem_sizes[k] as usize; - out[dst + body_bytes as usize..dst + body_bytes as usize + sz] - .copy_from_slice(&scratch[src..src + sz]); - body_bytes = body_bytes - .checked_add(elem_sizes[k]) - .vortex_expect("FSL body bytes overflow"); - } - col_offset[i] = col_offset[i] - .checked_add(body_bytes) - .vortex_expect("FSL row offset overflow"); - } else { - for offset in 0..list_size { - out[dst + offset] = null_byte; - } - col_offset[i] = col_offset[i] - .checked_add(u32::try_from(list_size).vortex_expect("list_size fits u32")) - .vortex_expect("FSL row offset overflow"); - } - } - } - } - - Ok(()) -} - -/// Encode one variable-width child of a struct: for non-null parent rows, copy the child's -/// natural encoding from a scratch buffer; for null parent rows, write a single -/// `child_canonical_null_byte`. 
-fn encode_variable_child( - child: &vortex_array::ArrayRef, - field: RowSortField, - parent_mask: &vortex_mask::Mask, - row_offsets: &[u32], - col_offset: &mut [u32], - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let n = child.len(); - let canonical = child.clone().execute::(ctx)?; - - // Size and encode the child into a sequential scratch buffer. - let mut child_sizes = vec![0u32; n]; - field_size(&canonical, field, &mut child_sizes, ctx)?; - let total: u64 = child_sizes.iter().map(|&s| u64::from(s)).sum(); - let total_usize = usize::try_from(total).vortex_expect("child scratch buffer size fits usize"); - let mut scratch = vec![0u8; total_usize]; - let mut scratch_offsets = Vec::with_capacity(n); - let mut acc: u32 = 0; - for &s in &child_sizes { - scratch_offsets.push(acc); - acc = acc - .checked_add(s) - .vortex_expect("child scratch offset overflow"); - } - let mut scratch_cursors = vec![0u32; n]; - field_encode( - &canonical, - field, - &scratch_offsets, - &mut scratch_cursors, - &mut scratch, - ctx, - )?; - - let null_byte = child_canonical_null_byte(child.dtype(), field); - for i in 0..n { - let dst = (row_offsets[i] + col_offset[i]) as usize; - if parent_mask.value(i) { - let src = scratch_offsets[i] as usize; - let sz = child_sizes[i] as usize; - out[dst..dst + sz].copy_from_slice(&scratch[src..src + sz]); - col_offset[i] = col_offset[i] - .checked_add(child_sizes[i]) - .vortex_expect("col_offset overflow"); - } else { - out[dst] = null_byte; - col_offset[i] = col_offset[i] - .checked_add(1) - .vortex_expect("col_offset overflow"); - } - } - Ok(()) -} - -/// Arithmetic-write primitive encoder: writes each row's `sentinel + value` slot at a -/// constant within-row offset, iterating the output in `row_stride`-sized chunks so the -/// compiler can drop the per-row offset/cursor indirection. 
-fn encode_primitive_arith( - arr: &PrimitiveArray, - field: RowSortField, - col_prefix: u32, - row_stride: u32, - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - match_each_native_ptype!(arr.ptype(), |T| { - encode_primitive_arith_typed::(arr, field, col_prefix, row_stride, out, ctx)?; - }); - Ok(()) -} - -fn encode_primitive_arith_typed( - arr: &PrimitiveArray, - field: RowSortField, - col_prefix: u32, - row_stride: u32, - out: &mut [u8], - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - let slice: &[T] = arr.as_slice(); - let non_null = field.non_null_sentinel(); - let value_bytes = size_of::(); - let slot_size = 1 + value_bytes; - let stride = row_stride as usize; - let prefix = col_prefix as usize; - let descending = field.descending; - - match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? { - ValidityKind::AllValid => { - // Hot path: each row's slot is a fixed window inside its `stride`-sized chunk, - // so the inner write vectorizes the same way as `arrow-row`'s not-null path. - for (chunk, &v) in out.chunks_exact_mut(stride).zip(slice.iter()) { - let slot = &mut chunk[prefix..prefix + slot_size]; - slot[0] = non_null; - v.encode_to(&mut slot[1..], descending); - } - } - ValidityKind::Mask(mask) => { - let null = field.null_sentinel(); - for (i, (chunk, &v)) in out.chunks_exact_mut(stride).zip(slice.iter()).enumerate() { - let slot = &mut chunk[prefix..prefix + slot_size]; - if mask.value(i) { - slot[0] = non_null; - v.encode_to(&mut slot[1..], descending); - } else { - slot[0] = null; - for b in &mut slot[1..] { - *b = 0; - } - } - } - } - } - Ok(()) -} - -/// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with -/// continuation/length markers. Returns the number of bytes written. Empty values are -/// encoded by the caller as a single sentinel byte and never reach this function. 
-/// -/// For the ascending path the hot loop is a `copy_nonoverlapping` of 32 bytes per block -/// plus one stamped continuation byte. For the descending path it reads a u64 at a time and -/// XORs with `0xFF`, giving LLVM a vectorizable inner loop. -fn encode_non_empty_varlen_body(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 { - debug_assert!(!bytes.is_empty()); - let len = bytes.len(); - let full_blocks = len / VARLEN_BLOCK_SIZE; - let partial = len % VARLEN_BLOCK_SIZE; - let (full_to_write, partial_block_len) = if partial == 0 { - // Length is an exact multiple of 32: emit (full_blocks - 1) full blocks with the - // 0xFF continuation marker, then a final block whose continuation byte is 32. - (full_blocks - 1, VARLEN_BLOCK_SIZE) - } else { - (full_blocks, partial) - }; - let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL; - debug_assert!(out.len() >= total); - // The final block's continuation byte encodes its content length (1..=32). - let len_byte = - u8::try_from(partial_block_len).vortex_expect("varlen final block length (1..=32) fits u8"); - - // SAFETY: `out` has at least `total` bytes — the caller sizes every varlen slot via - // `encoded_size_for_non_empty_varlen` (which equals `1 + total`, the extra byte being the - // leading sentinel that the caller wrote and that is not part of `out`). `bytes` is valid - // for `len` reads, and every pointer advance below stays within `[0, total)` for `dst` - // and `[0, len)` for `src`. - unsafe { - let mut src = bytes.as_ptr(); - let mut dst = out.as_mut_ptr(); - - if !descending { - // Ascending fast path: each full block is a 32-byte memcpy + a single 0xFF stamp. - for _ in 0..full_to_write { - std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE); - *dst.add(VARLEN_BLOCK_SIZE) = 0xFF; - src = src.add(VARLEN_BLOCK_SIZE); - dst = dst.add(VARLEN_BLOCK_TOTAL); - } - // Final block: copy the partial data, zero-pad the tail, write the length byte. 
- std::ptr::copy_nonoverlapping(src, dst, partial_block_len); - std::ptr::write_bytes( - dst.add(partial_block_len), - 0, - VARLEN_BLOCK_SIZE - partial_block_len, - ); - *dst.add(VARLEN_BLOCK_SIZE) = len_byte; - } else { - // Descending: invert every value byte. A u64-stride XOR gives LLVM a vectorizable - // inner loop; the tail handles the partial block byte-wise. - for _ in 0..full_to_write { - xor_copy_block(src, dst); - *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF - src = src.add(VARLEN_BLOCK_SIZE); - dst = dst.add(VARLEN_BLOCK_TOTAL); - } - for i in 0..partial_block_len { - *dst.add(i) = *src.add(i) ^ 0xFF; - } - std::ptr::write_bytes( - dst.add(partial_block_len), - 0xFF, // 0x00 XOR 0xFF - VARLEN_BLOCK_SIZE - partial_block_len, - ); - *dst.add(VARLEN_BLOCK_SIZE) = len_byte ^ 0xFF; - } - } - u32::try_from(total).vortex_expect("encoded varlen byte length fits u32") -} - -/// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the -/// four u64-wide iterations into SIMD on x86. -/// -/// # Safety -/// `src` must be valid for 32 reads, `dst` valid for 32 writes, and the regions must not -/// overlap. -#[inline(always)] -unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) { - // Four u64 lanes of 8 bytes each = 32 bytes total. - for i in 0..4 { - let off = i * 8; - // SAFETY: the caller guarantees src/dst are valid for the full 32-byte block. - let v = unsafe { std::ptr::read_unaligned(src.add(off) as *const u64) }; - unsafe { std::ptr::write_unaligned(dst.add(off) as *mut u64, v ^ u64::MAX) }; - } -} - -/// Internal trait for encoding a fixed-width native value into byte slots. -/// -/// Implementations must produce a sequence of `size_of::()` bytes that is -/// lexicographically byte-comparable according to the natural ordering of the type. -pub(crate) trait RowEncode: Copy { - /// Encode this value into `out`, inverting the bytes for descending order. 
- fn encode_to(self, out: &mut [u8], descending: bool); -} - -macro_rules! impl_row_encode_unsigned { - ($t:ty) => { - impl RowEncode for $t { - #[inline] - fn encode_to(self, out: &mut [u8], descending: bool) { - let bytes = self.to_be_bytes(); - if descending { - for (i, b) in bytes.iter().enumerate() { - out[i] = b ^ 0xFF; - } - } else { - out.copy_from_slice(&bytes); - } - } - } - }; -} - -macro_rules! impl_row_encode_signed { - ($t:ty) => { - impl RowEncode for $t { - #[inline] - fn encode_to(self, out: &mut [u8], descending: bool) { - let mut bytes = self.to_be_bytes(); - // Flip sign bit so negatives < non-negatives lexicographically. - bytes[0] ^= 0x80; - if descending { - for (i, b) in bytes.iter().enumerate() { - out[i] = b ^ 0xFF; - } - } else { - out.copy_from_slice(&bytes); - } - } - } - }; -} - -impl_row_encode_unsigned!(u8); -impl_row_encode_unsigned!(u16); -impl_row_encode_unsigned!(u32); -impl_row_encode_unsigned!(u64); -impl_row_encode_signed!(i8); -impl_row_encode_signed!(i16); -impl_row_encode_signed!(i32); -impl_row_encode_signed!(i64); -impl_row_encode_signed!(i128); - -impl RowEncode for f32 { - fn encode_to(self, out: &mut [u8], descending: bool) { - let bits = self.to_bits(); - let mask: u32 = if (bits >> 31) == 0 { - 0x8000_0000 - } else { - 0xFFFF_FFFF - }; - let mut bytes = (bits ^ mask).to_be_bytes(); - if descending { - for b in bytes.iter_mut() { - *b ^= 0xFF; - } - } - out.copy_from_slice(&bytes); - } -} - -impl RowEncode for f64 { - fn encode_to(self, out: &mut [u8], descending: bool) { - let bits = self.to_bits(); - let mask: u64 = if (bits >> 63) == 0 { - 0x8000_0000_0000_0000 - } else { - 0xFFFF_FFFF_FFFF_FFFF - }; - let mut bytes = (bits ^ mask).to_be_bytes(); - if descending { - for b in bytes.iter_mut() { - *b ^= 0xFF; - } - } - out.copy_from_slice(&bytes); - } -} - -impl RowEncode for f16 { - fn encode_to(self, out: &mut [u8], descending: bool) { - let bits = self.to_bits(); - let mask: u16 = if (bits >> 15) == 0 { 0x8000 } 
else { 0xFFFF }; - let mut bytes = (bits ^ mask).to_be_bytes(); - if descending { - for b in bytes.iter_mut() { - *b ^= 0xFF; - } - } - out.copy_from_slice(&bytes); - } -} diff --git a/vortex-row/src/codec/encoding.rs b/vortex-row/src/codec/encoding.rs new file mode 100644 index 00000000000..c3e90641b2c --- /dev/null +++ b/vortex-row/src/codec/encoding.rs @@ -0,0 +1,593 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Encode pass leaf kernels: per-row byte writers for each canonical variant, plus the +//! variable-length block body encoder. + +use super::*; + +pub(super) fn encode_null( + arr: &NullArray, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], +) { + let sentinel = fixed_null_sentinel(field); + for i in 0..arr.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = sentinel; + col_offset[i] += 1; + } +} + +pub(super) fn encode_bool( + arr: &BoolArray, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let bits = arr.clone().into_bit_buffer(); + let non_null = FIXED_NON_NULL_SENTINEL; + let xor = if field.descending { 0xFF } else { 0x00 }; + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? 
{ + ValidityKind::AllValid => { + for i in 0..bits.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = non_null; + // false=0x01, true=0x02 so false < true; XOR for descending + let raw = u8::from(bits.value(i)) + 1; + out[pos + 1] = raw ^ xor; + col_offset[i] += BOOL_ENCODED_SIZE; + } + } + ValidityKind::Mask(mask) => { + let null = fixed_null_sentinel(field); + for i in 0..bits.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + let raw = u8::from(bits.value(i)) + 1; + out[pos + 1] = raw ^ xor; + } else { + out[pos] = null; + out[pos + 1] = 0; + } + col_offset[i] += BOOL_ENCODED_SIZE; + } + } + } + Ok(()) +} + +pub(super) fn encode_primitive( + arr: &PrimitiveArray, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match_each_native_ptype!(arr.ptype(), |T| { + encode_primitive_typed::(arr, field, row_offsets, col_offset, out, ctx)?; + }); + Ok(()) +} + +fn encode_primitive_typed( + arr: &PrimitiveArray, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let slice: &[T] = arr.as_slice(); + let non_null = FIXED_NON_NULL_SENTINEL; + let value_bytes = size_of::(); + let stride = encoded_size_for_fixed(byte_width_u32(value_bytes)); + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? 
{ + ValidityKind::AllValid => { + for (i, &v) in slice.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = non_null; + v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + col_offset[i] += stride; + } + } + ValidityKind::Mask(mask) => { + let null = fixed_null_sentinel(field); + for (i, &v) in slice.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + } else { + out[pos] = null; + // Zero-fill the value bytes. + for b in &mut out[pos + 1..pos + 1 + value_bytes] { + *b = 0; + } + } + col_offset[i] += stride; + } + } + } + Ok(()) +} + +/// Narrow a decimal array whose physical `values_type` is wider than its precision-minimal +/// type down to that minimal type, returning `None` when it already uses the minimal width. +/// +/// Row-encoded widths are a pure function of the logical dtype: [`row_width_for_dtype`] sizes a +/// decimal column from [`DecimalType::smallest_decimal_value_type`] (the smallest physical type +/// that can hold the declared precision), independent of how the producer happened to store the +/// values. A `DecimalArray` may legally carry a wider `values_type` than its precision requires, +/// so without this normalization the encode pass would write more bytes than the size pass +/// reserved. The narrowing is always lossless because a decimal's precision bounds the magnitude +/// of every valid value, so the precision-minimal type can represent it. +fn narrow_decimal_to_smallest(arr: &DecimalArray) -> VortexResult> { + let decimal_dtype = arr.decimal_dtype(); + let target = DecimalType::smallest_decimal_value_type(&decimal_dtype); + if arr.values_type() == target { + return Ok(None); + } + let validity = arr.as_ref().validity()?; + let narrowed = match_each_decimal_value_type!(arr.values_type(), |P| { + let from = arr.buffer::<P>(); + match_each_decimal_value_type!(target, |Q| { + DecimalArray::new::(narrow_decimal_buffer::(from), decimal_dtype, validity) + }) + }); + Ok(Some(narrowed)) +} + +/// Narrow a buffer of decimal values from type `F` to a smaller type `T`. Lossless because the +/// caller only narrows to the precision-minimal type, which can represent every valid value. +fn narrow_decimal_buffer(from: Buffer) -> Buffer { + from.iter() + .map(|&v| T::from(v).vortex_expect("decimal value must fit its precision-minimal type")) + .collect() +} + +pub(super) fn encode_decimal( + arr: &DecimalArray, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + // Normalize to the precision-minimal physical type so the bytes we write match the width the + // size pass reserved (see `narrow_decimal_to_smallest`). + let narrowed = narrow_decimal_to_smallest(arr)?; + let arr = narrowed.as_ref().unwrap_or(arr); + let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + match arr.values_type() { + DecimalType::I8 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I16 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I32 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I64 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I128 => { + encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) + } + DecimalType::I256 => { + vortex_bail!("row encoding for Decimal256 is not yet implemented") + } + } + Ok(()) +} + +fn encode_decimal_typed( + arr: &DecimalArray, + mask: &vortex_mask::Mask, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], +) where + T: NativeDecimalType + RowEncode, +{ + let non_null = FIXED_NON_NULL_SENTINEL; + let null =
fixed_null_sentinel(field); + let value_bytes = size_of::(); + let total = encoded_size_for_fixed(byte_width_u32(value_bytes)); + let slice = arr.buffer::(); + for i in 0..slice.len() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + out[pos] = non_null; + slice[i].encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending); + } else { + out[pos] = null; + for b in &mut out[pos + 1..pos + 1 + value_bytes] { + *b = 0; + } + } + col_offset[i] += total; + } +} + +pub(super) fn encode_varbinview( + arr: &VarBinViewArray, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let null_byte = varlen_null_sentinel(field); + let empty_byte = varlen_empty_sentinel(field); + let non_empty_byte = varlen_non_empty_sentinel(field); + let descending = field.descending; + + let views = arr.views(); + // Cache the data-buffer slices once. Inlined views (len <= 12) carry their bytes inline, + // so they never touch `buffers`; referenced views index into the pre-validated buffer at + // `offset..offset + len`. Walking views directly avoids the per-row bounds and branch work + // of `with_iterator`. + let buffers: smallvec::SmallVec<[&[u8]; 4]> = (0..arr.data_buffers().len()) + .map(|i| arr.buffer(i).as_slice()) + .collect(); + + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? 
{ + ValidityKind::AllValid => { + for (i, view) in views.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + let len = view.len() as usize; + if len == 0 { + out[pos] = empty_byte; + col_offset[i] += VARLEN_EMPTY_SIZE; + continue; + } + let bytes: &[u8] = if view.is_inlined() { + view.as_inlined().value() + } else { + let r = view.as_view(); + let off = r.offset as usize; + &buffers[r.buffer_index as usize][off..off + len] + }; + out[pos] = non_empty_byte; + let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending)?; + col_offset[i] += 1 + written; + } + } + ValidityKind::Mask(mask) => { + for (i, view) in views.iter().enumerate() { + let pos = (row_offsets[i] + col_offset[i]) as usize; + if !mask.value(i) { + out[pos] = null_byte; + col_offset[i] += VARLEN_NULL_SIZE; + continue; + } + let len = view.len() as usize; + if len == 0 { + out[pos] = empty_byte; + col_offset[i] += VARLEN_EMPTY_SIZE; + continue; + } + let bytes: &[u8] = if view.is_inlined() { + view.as_inlined().value() + } else { + let r = view.as_view(); + let off = r.offset as usize; + &buffers[r.buffer_index as usize][off..off + len] + }; + out[pos] = non_empty_byte; + let written = encode_non_empty_varlen_body(bytes, &mut out[pos + 1..], descending)?; + col_offset[i] += 1 + written; + } + } + } + Ok(()) +} + +pub(super) fn encode_struct( + arr: &StructArray, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + let non_null = FIXED_NON_NULL_SENTINEL; + let null = fixed_null_sentinel(field); + + // Write the outer sentinel for each row. + for i in 0..n { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = if mask.value(i) { non_null } else { null }; + col_offset[i] += 1; + } + + // Encode each child. 
For non-null parent rows the child contributes its actual encoding; + // for null parent rows the child contributes its canonical null encoding so that two null + // parent rows produce byte-equal output regardless of underlying child values. + for child in arr.iter_unmasked_fields() { + match row_width_for_dtype(child.dtype())? { + RowWidth::Fixed(w) => { + let canonical = child.clone().execute::(ctx)?; + field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?; + // Replace null parent rows with the canonical null encoding (the same as a + // child-level null: null sentinel followed by zero-padded value bytes). + let null_byte = child_canonical_null_byte(child.dtype(), field); + for i in 0..n { + if !mask.value(i) { + let end = (row_offsets[i] + col_offset[i]) as usize; + let start = end - w as usize; + out[start] = null_byte; + for b in &mut out[start + 1..end] { + *b = 0; + } + } + } + } + RowWidth::Variable => { + encode_variable_child(child, field, &mask, row_offsets, col_offset, out, ctx)?; + } + } + } + + Ok(()) +} + +pub(super) fn encode_fsl( + arr: &FixedSizeListArray, + field: RowSortFieldOptions, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let nrows = arr.len(); + // `list_size` is natively a `u32`; keep both forms (see `add_size_fsl`). + let list_size_u32 = arr.list_size(); + let list_size = list_size_u32 as usize; + let mask = arr.as_ref().validity()?.execute_mask(nrows, ctx)?; + let non_null = FIXED_NON_NULL_SENTINEL; + let null = fixed_null_sentinel(field); + let elem_dtype = arr.elements().dtype().clone(); + + // Outer sentinel. + for i in 0..nrows { + let pos = (row_offsets[i] + col_offset[i]) as usize; + out[pos] = if mask.value(i) { non_null } else { null }; + col_offset[i] += 1; + } + + match row_width_for_dtype(&elem_dtype)? 
{ + RowWidth::Fixed(w) => { + // Fixed-width elements: encode the elements array directly (its length is + // nrows * list_size) using a derived (offsets, cursors) pair. Then overwrite + // the body of null parent rows with the canonical null encoding per element. + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), nrows * list_size); + let row_body_bytes = w + .checked_mul(list_size_u32) + .ok_or_else(|| vortex_err!("FSL body width overflow"))?; + let mut elem_offsets = vec![0u32; nrows * list_size]; + for i in 0..nrows { + let base = row_offsets[i] + col_offset[i]; + for j in 0u32..list_size_u32 { + elem_offsets[i * list_size + j as usize] = base + j * w; + } + } + let mut elem_cursors = vec![0u32; nrows * list_size]; + field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?; + for i in 0..nrows { + col_offset[i] = col_offset[i] + .checked_add(row_body_bytes) + .ok_or_else(|| vortex_err!("FSL row body overflow"))?; + } + // Canonical null body for null parent rows: one null encoding per element. + let null_byte = child_canonical_null_byte(&elem_dtype, field); + let elem_width = w as usize; + for i in 0..nrows { + if !mask.value(i) { + let end = (row_offsets[i] + col_offset[i]) as usize; + let start = end - row_body_bytes as usize; + let mut pos = start; + for _ in 0..list_size { + out[pos] = null_byte; + for b in &mut out[pos + 1..pos + elem_width] { + *b = 0; + } + pos += elem_width; + } + } + } + } + RowWidth::Variable => { + // Variable-width elements: for null parent rows the canonical body is exactly + // `list_size` null sentinel bytes (one per element). For non-null parent rows, + // encode each element via a scratch buffer and copy into out. 
+ let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), nrows * list_size); + let mut elem_sizes = vec![0u32; nrows * list_size]; + field_size(&elements, field, &mut elem_sizes, ctx)?; + let total: u64 = elem_sizes.iter().map(|&s| u64::from(s)).sum(); + let total_usize = + usize::try_from(total).vortex_expect("FSL scratch buffer size fits usize"); + let mut scratch = vec![0u8; total_usize]; + let mut scratch_offsets = Vec::with_capacity(nrows * list_size); + let mut acc: u32 = 0; + for &s in &elem_sizes { + scratch_offsets.push(acc); + acc = acc + .checked_add(s) + .ok_or_else(|| vortex_err!("FSL scratch offset overflow"))?; + } + let mut scratch_cursors = vec![0u32; nrows * list_size]; + field_encode( + &elements, + field, + &scratch_offsets, + &mut scratch_cursors, + &mut scratch, + ctx, + )?; + let null_byte = child_canonical_null_byte(&elem_dtype, field); + for i in 0..nrows { + let dst = (row_offsets[i] + col_offset[i]) as usize; + if mask.value(i) { + let mut body_bytes: u32 = 0; + for j in 0..list_size { + let k = i * list_size + j; + let src = scratch_offsets[k] as usize; + let sz = elem_sizes[k] as usize; + out[dst + body_bytes as usize..dst + body_bytes as usize + sz] + .copy_from_slice(&scratch[src..src + sz]); + body_bytes = body_bytes + .checked_add(elem_sizes[k]) + .ok_or_else(|| vortex_err!("FSL body bytes overflow"))?; + } + col_offset[i] = col_offset[i] + .checked_add(body_bytes) + .ok_or_else(|| vortex_err!("FSL row offset overflow"))?; + } else { + for offset in 0..list_size { + out[dst + offset] = null_byte; + } + col_offset[i] = col_offset[i] + .checked_add(list_size_u32) + .ok_or_else(|| vortex_err!("FSL row offset overflow"))?; + } + } + } + } + + Ok(()) +} + +/// Encode one variable-width child of a struct: for non-null parent rows, copy the child's +/// natural encoding from a scratch buffer; for null parent rows, write a single +/// `child_canonical_null_byte`. 
+fn encode_variable_child( + child: &vortex_array::ArrayRef, + field: RowSortFieldOptions, + parent_mask: &vortex_mask::Mask, + row_offsets: &[u32], + col_offset: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = child.len(); + let canonical = child.clone().execute::(ctx)?; + + // Size and encode the child into a sequential scratch buffer. + let mut child_sizes = vec![0u32; n]; + field_size(&canonical, field, &mut child_sizes, ctx)?; + let total: u64 = child_sizes.iter().map(|&s| u64::from(s)).sum(); + let total_usize = usize::try_from(total).vortex_expect("child scratch buffer size fits usize"); + let mut scratch = vec![0u8; total_usize]; + let mut scratch_offsets = Vec::with_capacity(n); + let mut acc: u32 = 0; + for &s in &child_sizes { + scratch_offsets.push(acc); + acc = acc + .checked_add(s) + .ok_or_else(|| vortex_err!("child scratch offset overflow"))?; + } + let mut scratch_cursors = vec![0u32; n]; + field_encode( + &canonical, + field, + &scratch_offsets, + &mut scratch_cursors, + &mut scratch, + ctx, + )?; + + let null_byte = child_canonical_null_byte(child.dtype(), field); + for i in 0..n { + let dst = (row_offsets[i] + col_offset[i]) as usize; + if parent_mask.value(i) { + let src = scratch_offsets[i] as usize; + let sz = child_sizes[i] as usize; + out[dst..dst + sz].copy_from_slice(&scratch[src..src + sz]); + col_offset[i] = col_offset[i] + .checked_add(child_sizes[i]) + .ok_or_else(|| vortex_err!("col_offset overflow"))?; + } else { + out[dst] = null_byte; + col_offset[i] = col_offset[i] + .checked_add(1) + .ok_or_else(|| vortex_err!("col_offset overflow"))?; + } + } + Ok(()) +} + +/// Arithmetic-write primitive encoder: writes each row's `sentinel + value` slot at a +/// constant within-row offset, iterating the output in `row_stride`-sized chunks so the +/// compiler can drop the per-row offset/cursor indirection. 
+pub(super) fn encode_primitive_arith( + arr: &PrimitiveArray, + field: RowSortFieldOptions, + col_prefix: u32, + row_stride: u32, + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match_each_native_ptype!(arr.ptype(), |T| { + encode_primitive_arith_typed::(arr, field, col_prefix, row_stride, out, ctx)?; + }); + Ok(()) +} + +fn encode_primitive_arith_typed( + arr: &PrimitiveArray, + field: RowSortFieldOptions, + col_prefix: u32, + row_stride: u32, + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let slice: &[T] = arr.as_slice(); + let non_null = FIXED_NON_NULL_SENTINEL; + let value_bytes = size_of::(); + let slot_size = 1 + value_bytes; + let stride = row_stride as usize; + let prefix = col_prefix as usize; + let descending = field.descending; + + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? { + ValidityKind::AllValid => { + // Hot path: each row's slot is a fixed window inside its `stride`-sized chunk, + // so the inner write vectorizes the same way as `arrow-row`'s not-null path. + for (chunk, &v) in out.chunks_exact_mut(stride).zip(slice.iter()) { + let slot = &mut chunk[prefix..prefix + slot_size]; + slot[0] = non_null; + v.encode_to(&mut slot[1..], descending); + } + } + ValidityKind::Mask(mask) => { + let null = fixed_null_sentinel(field); + for (i, (chunk, &v)) in out.chunks_exact_mut(stride).zip(slice.iter()).enumerate() { + let slot = &mut chunk[prefix..prefix + slot_size]; + if mask.value(i) { + slot[0] = non_null; + v.encode_to(&mut slot[1..], descending); + } else { + slot[0] = null; + for b in &mut slot[1..] { + *b = 0; + } + } + } + } + } + Ok(()) +} diff --git a/vortex-row/src/codec/mod.rs b/vortex-row/src/codec/mod.rs new file mode 100644 index 00000000000..d5c8afbcfb7 --- /dev/null +++ b/vortex-row/src/codec/mod.rs @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants. +//! +//! The encoded byte format produces a lexicographically byte-comparable representation: +//! comparing the byte slices of two encoded rows yields the same ordering as the +//! original logical (tuple) comparison of their values, modulo nulls placement and +//! descending-ness as configured by [`RowSortFieldOptions`]. +//! +//! Conventions: +//! - Every fixed-width value is preceded by a 1-byte sentinel that orders nulls relative to +//! non-nulls. For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF), +//! not the sentinel. +//! - Variable-length (Utf8, Binary) values use **three** distinct leading sentinels — one each +//! for null, empty, and non-empty — so byte comparison at position 0 fully categorizes the +//! value and column-byte boundaries stay aligned across rows. See +//! [`varlen_null_sentinel`], [`varlen_empty_sentinel`], [`varlen_non_empty_sentinel`]. +//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types. +//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top +//! bit; negative flips all bits. +//! - Nullable structs and fixed-size lists encode null parent rows with a **canonical null +//! body** so two null parent rows produce byte-equal encodings: fixed-width children +//! contribute their fixed null encoding, and variable-width children collapse to a single +//! null sentinel byte. 
+ +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::FixedSizeListArray; +use vortex_array::arrays::NullArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::DecimalType; +use vortex_array::dtype::NativeDecimalType; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::half::f16; +use vortex_array::match_each_decimal_value_type; +use vortex_array::match_each_native_ptype; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; + +use crate::options::RowSortFieldOptions; + +/// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte). +pub(crate) const BOOL_ENCODED_SIZE: u32 = 2; + +/// Block size used in the variable-length encoding. +pub(crate) const VARLEN_BLOCK_SIZE: usize = 32; +/// Total bytes per varlen block including the trailing continuation marker. +pub(crate) const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1; +const VARLEN_BLOCK_TOTAL_U32: u32 = 33; + +/// Size in bytes of an encoded null varlen value (just the sentinel byte). +pub(crate) const VARLEN_NULL_SIZE: u32 = 1; +/// Size in bytes of an encoded empty varlen value (just the sentinel byte). +pub(crate) const VARLEN_EMPTY_SIZE: u32 = 1; + +/// Returns the size in bytes of the encoded form of a non-empty variable-length value. +/// +/// Includes the leading sentinel byte plus `ceil(len/32) * 33` block bytes (32 content + 1 +/// continuation/length byte). 
Callers must use [`VARLEN_NULL_SIZE`] for null values and +/// [`VARLEN_EMPTY_SIZE`] for empty values. +/// +/// # Errors +/// +/// Returns an error if the encoded length overflows `u32`. The block count itself always fits +/// (a `BinaryView` length is a `u32`, so `blocks <= ceil(u32::MAX / 32) < 2^27`), but the +/// `blocks * 33 + 1` byte total can exceed `u32::MAX` for multi-gigabyte values. +#[inline] +fn encoded_size_for_non_empty_varlen(len: usize) -> VortexResult { + debug_assert!(len > 0); + let blocks = u32::try_from(len.div_ceil(VARLEN_BLOCK_SIZE)) + .vortex_expect("varlen block count must fit in u32"); + blocks + .checked_mul(VARLEN_BLOCK_TOTAL_U32) + .and_then(|b| b.checked_add(1)) + .ok_or_else(|| vortex_err!("varlen encoded size overflows u32")) +} + +/// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel). +#[inline] +const fn encoded_size_for_fixed(value_bytes: u32) -> u32 { + 1 + value_bytes +} + +/// A native byte width (at most 32 for `i256`) always fits in a `u32`. +#[inline] +fn byte_width_u32(width: usize) -> u32 { + u32::try_from(width).vortex_expect("native byte width must fit in u32") +} + +/// Pre-resolved per-row validity for the row encoders. +/// +/// Encoders pattern-match on this once before their inner loop so the no-nulls fast path +/// avoids per-row `mask.value(i)` branches entirely, and the nullable path materializes the +/// mask exactly once. +pub(crate) enum ValidityKind { + /// Column statically has no nulls (`Validity::NonNullable` or `AllValid`); no mask needed. + AllValid, + /// Column may have nulls; carries the materialized per-row mask. + Mask(vortex_mask::Mask), +} + +/// Resolve a [`Validity`] into a [`ValidityKind`], materializing the mask only when the column +/// may actually have nulls. 
+#[inline] +pub(crate) fn resolve_validity( + validity: Validity, + len: usize, + ctx: &mut ExecutionCtx, +) -> VortexResult { + Ok(match validity { + Validity::NonNullable | Validity::AllValid => ValidityKind::AllValid, + other => ValidityKind::Mask(other.execute_mask(len, ctx)?), + }) +} + +/// Returns the sentinel byte for a null varlen value. +/// +/// The choice is positional (0x00 when nulls sort first, 0xFF when nulls sort last) and +/// independent of `descending`, matching the convention used by `arrow-row`. +#[inline] +fn varlen_null_sentinel(field: RowSortFieldOptions) -> u8 { + if field.nulls_first { 0x00 } else { 0xFF } +} + +/// Returns the sentinel byte for an empty varlen value. +/// +/// Equal to `0x01` in ascending mode and `!0x01 = 0xFE` in descending mode. +#[inline] +fn varlen_empty_sentinel(field: RowSortFieldOptions) -> u8 { + if field.descending { !0x01u8 } else { 0x01u8 } +} + +/// Returns the sentinel byte for a non-empty varlen value. +/// +/// Equal to `0x02` in ascending mode and `!0x02 = 0xFD` in descending mode. +#[inline] +fn varlen_non_empty_sentinel(field: RowSortFieldOptions) -> u8 { + if field.descending { !0x02u8 } else { 0x02u8 } +} + +/// The sentinel byte that precedes a non-null fixed-width value. +/// +/// Fixed-width values always lead with `0x01`. Null values use a sentinel that sorts either +/// below (`0x00`) or above (`0x02`) it (see [`fixed_null_sentinel`]), so a single leading-byte +/// comparison orders nulls relative to non-nulls. Unlike the value bytes, this sentinel is never +/// inverted for `descending`: null placement is positional and independent of sort direction. +const FIXED_NON_NULL_SENTINEL: u8 = 0x01; + +/// Returns the sentinel byte that precedes a null fixed-width value. +/// +/// `nulls_first` writes `0x00` (sorts before the [`FIXED_NON_NULL_SENTINEL`] `0x01`); otherwise +/// `0x02` (sorts after). 
Like the non-null sentinel, the choice is positional and independent of +/// `descending`, matching the convention used by `arrow-row`. +#[inline] +fn fixed_null_sentinel(field: RowSortFieldOptions) -> u8 { + if field.nulls_first { 0x00 } else { 0x02 } +} + +/// Returns the single-byte null sentinel used when a child contributes its canonical null +/// encoding inside a null parent struct/FSL row. +/// +/// For varlen children that is the varlen null sentinel; for everything else (including +/// nested struct/FSL when used as a variable-width child) it is the fixed-width null sentinel. +fn child_canonical_null_byte(child_dtype: &DType, field: RowSortFieldOptions) -> u8 { + match child_dtype { + DType::Utf8(_) | DType::Binary(_) => varlen_null_sentinel(field), + _ => fixed_null_sentinel(field), + } +} + +/// Per-row width classification for a column. +/// +/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless +/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary, +/// List, or any composite that recurses through a variable-width field). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum RowWidth { + /// Per-row width is the same constant for every row in the column. + Fixed(u32), + /// Per-row width is data-dependent. + Variable, +} + +/// Classify a column's per-row encoded width by inspecting only its [`DType`]. +/// +/// Returns `Fixed(w)` when every row encodes to exactly `w` bytes (sentinel + value), +/// regardless of null-ness or value. Returns `Variable` when per-row sizes depend on the +/// data. +/// +/// Classification does not depend on the [`RowSortFieldOptions`]: null-vs-non-null encoding width is +/// the same for fixed-width types (the sentinel byte plus zero-fill for nulls). +/// +/// # Errors +/// +/// Returns an error for dtypes that the row encoder does not support. 
Width arithmetic that +/// would overflow `u32` is also reported as an error rather than silently saturating. +pub(crate) fn row_width_for_dtype(dtype: &DType) -> VortexResult { + match dtype { + DType::Null => Ok(RowWidth::Fixed(1)), + DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)), + DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( + ptype.byte_width(), + )))), + DType::Decimal(dt, _) => { + let vt = DecimalType::smallest_decimal_value_type(dt); + if matches!(vt, DecimalType::I256) { + vortex_bail!("row encoding for Decimal256 is not yet implemented"); + } + Ok(RowWidth::Fixed(encoded_size_for_fixed(byte_width_u32( + vt.byte_width(), + )))) + } + DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable), + DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? { + // FSL is fixed iff its element type is fixed. Add a sentinel byte for the FSL + // itself, then `n` copies of the element width. + RowWidth::Fixed(w) => { + let body = w + .checked_mul(*n) + .ok_or_else(|| vortex_err!("FSL row width overflows u32"))?; + let total = body + .checked_add(1) + .ok_or_else(|| vortex_err!("FSL row width overflows u32"))?; + Ok(RowWidth::Fixed(total)) + } + RowWidth::Variable => Ok(RowWidth::Variable), + }, + DType::Struct(fields, _) => { + // Struct is fixed iff all its fields are fixed; sum their widths plus a sentinel. + let mut total: u32 = 1; // outer sentinel + for field_dtype in fields.fields() { + match row_width_for_dtype(&field_dtype)? { + RowWidth::Fixed(w) => { + total = total + .checked_add(w) + .ok_or_else(|| vortex_err!("Struct row width overflows u32"))?; + } + RowWidth::Variable => return Ok(RowWidth::Variable), + } + } + Ok(RowWidth::Fixed(total)) + } + DType::List(..) 
=> { + vortex_bail!( + "row encoding does not support variable-size List arrays (no well-defined ordering)" + ) + } + DType::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"), + dtype => vortex_bail!("row encoding does not support dtype: {dtype:?}"), + } +} + +/// Compute the per-row size in bytes for the given canonical view, adding into `sizes`. +/// +/// `sizes` is expected to be initialized (typically zeroed). This function *adds* the +/// per-row size to each entry so multiple columns can accumulate into the same buffer. +/// +/// # Errors +/// +/// Returns an error for unsupported canonical variants. +pub(crate) fn field_size( + canonical: &Canonical, + field: RowSortFieldOptions, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match canonical { + Canonical::Null(arr) => add_size_null(arr, sizes)?, + Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1))?, + Canonical::Primitive(arr) => add_size_primitive(arr, sizes)?, + Canonical::Decimal(arr) => add_size_decimal(arr, sizes)?, + Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?, + Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?, + Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?, + Canonical::List(_) => vortex_bail!( + "row encoding does not support canonical List arrays: {:?}", + canonical.dtype() + ), + Canonical::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + unsupported => { + vortex_bail!( + "row encoding does not support canonical array: {:?}", + unsupported.dtype() + ) + } + } + Ok(()) +} + +/// Encode a fixed-width column at arithmetic offsets, without reading or writing any per-row +/// cursor. 
+/// +/// For row `i`, the column's bytes are written starting at `i * row_stride + col_prefix +/// (+ var_prefix[i])`, where `var_prefix` is the exclusive prefix sum of the varlen +/// contributions (`None` when the row layout has no variable-length columns). This is the +/// fast path for fixed-width columns that appear before any varlen column, so their +/// within-row position is a constant offset rather than a running cursor. +/// +/// For primitive columns in the pure-fixed case it uses a `chunks_exact_mut` hot loop that +/// removes the per-row offset/cursor indirection (matching `arrow-row`'s `encode_not_null`). +/// All other types reuse [`field_encode`] at the materialized offsets, so the bytes written +/// are byte-identical to the cursor path. +#[allow(clippy::too_many_arguments)] +pub(crate) fn field_encode_fixed_arithmetic( + canonical: &Canonical, + field: RowSortFieldOptions, + col_prefix: u32, + row_stride: u32, + var_prefix: Option<&[u32]>, + nrows: usize, + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + if var_prefix.is_none() + && let Canonical::Primitive(arr) = canonical + { + return encode_primitive_arith(arr, field, col_prefix, row_stride, out, ctx); + } + + // General path: materialize this column's per-row start offsets and reuse the cursor + // encoder with zero-initialized cursors, so every row is written at its arithmetic + // offset with the exact same bytes the cursor path would produce. 
+ let mut offsets: Vec = Vec::with_capacity(nrows); + let mut base = col_prefix; + match var_prefix { + None => { + for _ in 0..nrows { + offsets.push(base); + base = base.wrapping_add(row_stride); + } + } + Some(vp) => { + for &p in vp.iter().take(nrows) { + offsets.push(base.wrapping_add(p)); + base = base.wrapping_add(row_stride); + } + } + } + let mut cursors = vec![0u32; nrows]; + field_encode(canonical, field, &offsets, &mut cursors, out, ctx) +} + +/// Encode each row's bytes for the given canonical view into `out`, writing starting at +/// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of +/// bytes written. +/// +/// After this call returns successfully, `cursors[i]` will have advanced by exactly the +/// per-row contribution previously computed by [`field_size`] for the same column. +pub(crate) fn field_encode( + canonical: &Canonical, + field: RowSortFieldOptions, + offsets: &[u32], + cursors: &mut [u32], + out: &mut [u8], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + match canonical { + Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out), + Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?, + Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?, + Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?, + Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?, + Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?, + Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?, + Canonical::List(_) => vortex_bail!( + "row encoding does not support canonical List arrays: {:?}", + canonical.dtype() + ), + Canonical::Variant(_) => { + vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)") + } + unsupported => { + vortex_bail!( + "row encoding does not support canonical array: {:?}", + 
unsupported.dtype() + ) + } + } + Ok(()) +} + +mod encoding; +mod native; +mod sizing; +mod varlen; + +use encoding::*; +use native::*; +use sizing::*; +use varlen::*; diff --git a/vortex-row/src/codec/native.rs b/vortex-row/src/codec/native.rs new file mode 100644 index 00000000000..b09df81adcf --- /dev/null +++ b/vortex-row/src/codec/native.rs @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! The `RowEncode` trait and its implementations for fixed-width native value types. + +use super::*; + +/// Internal trait for encoding a fixed-width native value into byte slots. +/// +/// Implementations must produce a sequence of `size_of::()` bytes that is +/// lexicographically byte-comparable according to the natural ordering of the type. +pub(crate) trait RowEncode: Copy { + /// Encode this value into `out`, inverting the bytes for descending order. + fn encode_to(self, out: &mut [u8], descending: bool); +} + +macro_rules! impl_row_encode_unsigned { + ($t:ty) => { + impl RowEncode for $t { + #[inline] + fn encode_to(self, out: &mut [u8], descending: bool) { + let bytes = self.to_be_bytes(); + if descending { + for (i, b) in bytes.iter().enumerate() { + out[i] = b ^ 0xFF; + } + } else { + out.copy_from_slice(&bytes); + } + } + } + }; +} + +macro_rules! impl_row_encode_signed { + ($t:ty) => { + impl RowEncode for $t { + #[inline] + fn encode_to(self, out: &mut [u8], descending: bool) { + let mut bytes = self.to_be_bytes(); + // Flip sign bit so negatives < non-negatives lexicographically. 
+ bytes[0] ^= 0x80; + if descending { + for (i, b) in bytes.iter().enumerate() { + out[i] = b ^ 0xFF; + } + } else { + out.copy_from_slice(&bytes); + } + } + } + }; +} + +impl_row_encode_unsigned!(u8); +impl_row_encode_unsigned!(u16); +impl_row_encode_unsigned!(u32); +impl_row_encode_unsigned!(u64); +impl_row_encode_signed!(i8); +impl_row_encode_signed!(i16); +impl_row_encode_signed!(i32); +impl_row_encode_signed!(i64); +impl_row_encode_signed!(i128); + +impl RowEncode for f32 { + fn encode_to(self, out: &mut [u8], descending: bool) { + let bits = self.to_bits(); + let mask: u32 = if (bits >> 31) == 0 { + 0x8000_0000 + } else { + 0xFFFF_FFFF + }; + let mut bytes = (bits ^ mask).to_be_bytes(); + if descending { + for b in bytes.iter_mut() { + *b ^= 0xFF; + } + } + out.copy_from_slice(&bytes); + } +} + +impl RowEncode for f64 { + fn encode_to(self, out: &mut [u8], descending: bool) { + let bits = self.to_bits(); + let mask: u64 = if (bits >> 63) == 0 { + 0x8000_0000_0000_0000 + } else { + 0xFFFF_FFFF_FFFF_FFFF + }; + let mut bytes = (bits ^ mask).to_be_bytes(); + if descending { + for b in bytes.iter_mut() { + *b ^= 0xFF; + } + } + out.copy_from_slice(&bytes); + } +} + +impl RowEncode for f16 { + fn encode_to(self, out: &mut [u8], descending: bool) { + let bits = self.to_bits(); + let mask: u16 = if (bits >> 15) == 0 { 0x8000 } else { 0xFFFF }; + let mut bytes = (bits ^ mask).to_be_bytes(); + if descending { + for b in bytes.iter_mut() { + *b ^= 0xFF; + } + } + out.copy_from_slice(&bytes); + } +} diff --git a/vortex-row/src/codec/sizing.rs b/vortex-row/src/codec/sizing.rs new file mode 100644 index 00000000000..8621d966f2b --- /dev/null +++ b/vortex-row/src/codec/sizing.rs @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Size pass leaf kernels: per-row byte-size accumulation for each canonical variant. +//! +//! 
Every accumulator returns [`VortexResult`] and uses checked arithmetic, so an input whose +//! per-row encoding would exceed `u32::MAX` bytes surfaces a [`VortexError`](vortex_error::VortexError) +//! instead of overflowing or panicking. + +use super::*; + +pub(super) fn add_size_const(sizes: &mut [u32], add: u32) -> VortexResult<()> { + for s in sizes.iter_mut() { + *s = s + .checked_add(add) + .ok_or_else(|| vortex_err!("per-row size overflow"))?; + } + Ok(()) +} + +pub(super) fn add_size_null(arr: &NullArray, sizes: &mut [u32]) -> VortexResult<()> { + debug_assert_eq!(arr.len(), sizes.len()); + // Just a sentinel byte per row. + add_size_const(sizes, 1) +} + +pub(super) fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) -> VortexResult<()> { + let width = byte_width_u32(arr.ptype().byte_width()); + add_size_const(sizes, encoded_size_for_fixed(width)) +} + +pub(super) fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) -> VortexResult<()> { + // Size from the precision-minimal type, not the physical `values_type`, so the size pass + // agrees with `row_width_for_dtype` (and the encode pass) regardless of how the producer + // stored the values. See `narrow_decimal_to_smallest`. + let vt = DecimalType::smallest_decimal_value_type(&arr.decimal_dtype()); + let width = byte_width_u32(vt.byte_width()); + add_size_const(sizes, encoded_size_for_fixed(width)) +} + +pub(super) fn add_size_varbinview( + arr: &VarBinViewArray, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let views = arr.views(); + match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? { + ValidityKind::AllValid => { + for (i, view) in views.iter().enumerate() { + let contribution = if view.is_empty() { + VARLEN_EMPTY_SIZE + } else { + encoded_size_for_non_empty_varlen(view.len() as usize)? 
+ }; + sizes[i] = sizes[i] + .checked_add(contribution) + .ok_or_else(|| vortex_err!("per-row size overflow"))?; + } + } + ValidityKind::Mask(mask) => { + for (i, view) in views.iter().enumerate() { + let contribution = if !mask.value(i) { + VARLEN_NULL_SIZE + } else if view.is_empty() { + VARLEN_EMPTY_SIZE + } else { + encoded_size_for_non_empty_varlen(view.len() as usize)? + }; + sizes[i] = sizes[i] + .checked_add(contribution) + .ok_or_else(|| vortex_err!("per-row size overflow"))?; + } + } + } + Ok(()) +} + +pub(super) fn add_size_struct( + arr: &StructArray, + field: RowSortFieldOptions, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + // Outer sentinel: 1 byte per row. + add_size_const(sizes, 1)?; + // Each child contributes its per-row size when the parent is non-null, and a canonical + // null contribution when the parent is null. For fixed-width children both are equal, + // so we can simply add the fixed width to every row. For variable-width children the + // null contribution collapses to 1 byte, ensuring null parent rows have a constant body. + for child in arr.iter_unmasked_fields() { + match row_width_for_dtype(child.dtype())? 
{ + RowWidth::Fixed(w) => add_size_const(sizes, w)?, + RowWidth::Variable => { + let canonical = child.clone().execute::(ctx)?; + let mut child_sizes = vec![0u32; n]; + field_size(&canonical, field, &mut child_sizes, ctx)?; + for i in 0..n { + let contribution = if mask.value(i) { child_sizes[i] } else { 1u32 }; + sizes[i] = sizes[i] + .checked_add(contribution) + .ok_or_else(|| vortex_err!("per-row size overflow"))?; + } + } + } + } + Ok(()) +} + +pub(super) fn add_size_fsl( + arr: &FixedSizeListArray, + field: RowSortFieldOptions, + sizes: &mut [u32], + ctx: &mut ExecutionCtx, +) -> VortexResult<()> { + let n = arr.len(); + debug_assert_eq!(n, sizes.len()); + // `list_size` is natively a `u32`; keep both forms so element indexing stays `usize` while + // width arithmetic avoids a fallible `usize -> u32` conversion. + let list_size_u32 = arr.list_size(); + let list_size = list_size_u32 as usize; + let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?; + let elem_dtype = arr.elements().dtype(); + // Outer sentinel: 1 byte per row. + add_size_const(sizes, 1)?; + match row_width_for_dtype(elem_dtype)? { + RowWidth::Fixed(w) => { + // Each row has `list_size` fixed-width elements regardless of null parent mask. + let body = w + .checked_mul(list_size_u32) + .ok_or_else(|| vortex_err!("FSL body width overflow"))?; + add_size_const(sizes, body)?; + } + RowWidth::Variable => { + let elements = arr.elements().clone().execute::(ctx)?; + debug_assert_eq!(elements.len(), n * list_size); + let mut elem_sizes = vec![0u32; n * list_size]; + field_size(&elements, field, &mut elem_sizes, ctx)?; + for i in 0..n { + let body: u32 = if mask.value(i) { + let base = i * list_size; + let mut sum: u32 = 0; + for j in 0..list_size { + sum = sum + .checked_add(elem_sizes[base + j]) + .ok_or_else(|| vortex_err!("FSL row body overflow"))?; + } + sum + } else { + // Canonical null body for FSL with variable element: one null sentinel + // per element. 
(Each element contributes `child_null_width = 1`.) + list_size_u32 + }; + sizes[i] = sizes[i] + .checked_add(body) + .ok_or_else(|| vortex_err!("FSL per-row size overflow"))?; + } + } + } + Ok(()) +} diff --git a/vortex-row/src/codec/varlen.rs b/vortex-row/src/codec/varlen.rs new file mode 100644 index 00000000000..fdc077d608a --- /dev/null +++ b/vortex-row/src/codec/varlen.rs @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Variable-length value body encoder: 32-byte blocks with continuation/length markers. + +use super::*; + +/// Encode a non-empty variable-length byte slice into `out` in 32-byte blocks with +/// continuation/length markers. Returns the number of bytes written. Empty values are +/// encoded by the caller as a single sentinel byte and never reach this function. +/// +/// For the ascending path the hot loop is a `copy_nonoverlapping` of 32 bytes per block +/// plus one stamped continuation byte. For the descending path it reads a u64 at a time and +/// XORs with `0xFF`, giving LLVM a vectorizable inner loop. +pub(super) fn encode_non_empty_varlen_body( + bytes: &[u8], + out: &mut [u8], + descending: bool, +) -> VortexResult { + debug_assert!(!bytes.is_empty()); + let len = bytes.len(); + let full_blocks = len / VARLEN_BLOCK_SIZE; + let partial = len % VARLEN_BLOCK_SIZE; + let (full_to_write, partial_block_len) = if partial == 0 { + // Length is an exact multiple of 32: emit (full_blocks - 1) full blocks with the + // 0xFF continuation marker, then a final block whose continuation byte is 32. + (full_blocks - 1, VARLEN_BLOCK_SIZE) + } else { + (full_blocks, partial) + }; + let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL; + // The caller reserved this slot from `encoded_size_for_non_empty_varlen`, which already + // verified the byte total fits `u32`; re-check here so the conversion never panics. 
+ let total_u32 = + u32::try_from(total).map_err(|_| vortex_err!("encoded varlen size overflows u32"))?; + debug_assert!(out.len() >= total); + // The final block's continuation byte encodes its content length (1..=32). + let len_byte = + u8::try_from(partial_block_len).vortex_expect("varlen final block length (1..=32) fits u8"); + + // SAFETY: `out` has at least `total` bytes — the caller sizes every varlen slot via + // `encoded_size_for_non_empty_varlen` (which equals `1 + total`, the extra byte being the + // leading sentinel that the caller wrote and that is not part of `out`). `bytes` is valid + // for `len` reads, and every pointer advance below stays within `[0, total)` for `dst` + // and `[0, len)` for `src`. + unsafe { + let mut src = bytes.as_ptr(); + let mut dst = out.as_mut_ptr(); + + if !descending { + // Ascending fast path: each full block is a 32-byte memcpy + a single 0xFF stamp. + for _ in 0..full_to_write { + std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE); + *dst.add(VARLEN_BLOCK_SIZE) = 0xFF; + src = src.add(VARLEN_BLOCK_SIZE); + dst = dst.add(VARLEN_BLOCK_TOTAL); + } + // Final block: copy the partial data, zero-pad the tail, write the length byte. + std::ptr::copy_nonoverlapping(src, dst, partial_block_len); + std::ptr::write_bytes( + dst.add(partial_block_len), + 0, + VARLEN_BLOCK_SIZE - partial_block_len, + ); + *dst.add(VARLEN_BLOCK_SIZE) = len_byte; + } else { + // Descending: invert every value byte. A u64-stride XOR gives LLVM a vectorizable + // inner loop; the tail handles the partial block byte-wise. 
+ for _ in 0..full_to_write { + xor_copy_block(src, dst); + *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF + src = src.add(VARLEN_BLOCK_SIZE); + dst = dst.add(VARLEN_BLOCK_TOTAL); + } + for i in 0..partial_block_len { + *dst.add(i) = *src.add(i) ^ 0xFF; + } + std::ptr::write_bytes( + dst.add(partial_block_len), + 0xFF, // 0x00 XOR 0xFF + VARLEN_BLOCK_SIZE - partial_block_len, + ); + *dst.add(VARLEN_BLOCK_SIZE) = len_byte ^ 0xFF; + } + } + Ok(total_u32) +} + +/// Copy 32 bytes from `src` to `dst`, XORing each with `0xFF`. LLVM auto-vectorizes the +/// four u64-wide iterations into SIMD on x86. +/// +/// # Safety +/// `src` must be valid for 32 reads, `dst` valid for 32 writes, and the regions must not +/// overlap. +#[inline(always)] +unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) { + // Four u64 lanes of 8 bytes each = 32 bytes total. + for i in 0..4 { + let off = i * 8; + // SAFETY: the caller guarantees src/dst are valid for the full 32-byte block. + let v = unsafe { std::ptr::read_unaligned(src.add(off) as *const u64) }; + unsafe { std::ptr::write_unaligned(dst.add(off) as *mut u64, v ^ u64::MAX) }; + } +} diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs index 04feec89415..77afec25330 100644 --- a/vortex-row/src/encode.rs +++ b/vortex-row/src/encode.rs @@ -34,7 +34,7 @@ use crate::codec; use crate::options::RowEncodingOptions; use crate::options::deserialize_row_encoding_options; use crate::options::serialize_row_encoding_options; -use crate::size::ColKind; +use crate::size::ColumnKind; use crate::size::compute_sizes; /// Variadic scalar function that encodes N input columns into a single `List` @@ -135,15 +135,7 @@ fn execute_row_encode( usize::try_from(total).vortex_expect("validated row-encoded output size must fit usize"); let mut out_buf: BufferMut = BufferMut::with_capacity(total_len); - // Every encoder writes every byte in its row range: fixed-width values write - // sentinel + value (null rows write 
sentinel + explicit zero-fill); varlen blocks - // zero-pad their final partial block; struct/FSL fixed children are written for all - // rows then null parent rows are overwritten with the canonical null body. So the - // size-pass + encoder contract guarantees `[0, total_len)` is fully written before - // the buffer is read out, making the pre-zero-init redundant. Skipping it saves a - // `total_len`-byte memset per call (significant for varlen-heavy inputs, where - // `total_len` reaches multiple MB). - // + // SAFETY: `total_len` bytes of capacity were just reserved, and by the contract above // every byte in that range is written before `out_buf` is frozen and read. unsafe { out_buf.set_len(total_len) }; @@ -160,7 +152,7 @@ fn execute_row_encode( && col_kinds.iter().any(|k| { matches!( k, - ColKind::Fixed { + ColumnKind::Fixed { before_varlen: true, .. } @@ -203,17 +195,10 @@ fn execute_row_encode( // Per-row write cursor (also doubles as the ListView `sizes` slot when done). We build // it as a BufferMut so we can hand it directly to the output PrimitiveArray. - // - // The cursor path begins at the first cursor-path column. Fixed-before-varlen columns - // are written by the arithmetic path and do not touch the cursor, so the cursor is - // pre-seeded with the within-row offset of the first varlen column (its `fixed_prefix`). - // When there are no varlen columns at all, every column takes the arithmetic path and - // the cursor loop runs zero iterations; seeding with `fixed_per_row` then leaves the - // cursors already correct as per-row sizes. let initial_cursor: u32 = match first_varlen_idx { Some(idx) => match col_kinds[idx] { - ColKind::Variable { fixed_prefix } => fixed_prefix, - ColKind::Fixed { .. } => unreachable!("first_varlen_idx points at a varlen column"), + ColumnKind::Variable { fixed_prefix } => fixed_prefix, + ColumnKind::Fixed { .. 
} => unreachable!("first_varlen_idx points at a varlen column"), }, None => fixed_per_row, }; @@ -226,7 +211,7 @@ fn execute_row_encode( // path. Each column was canonicalized once during the size pass; reuse that form. for (i, canonical) in columns.iter().enumerate() { match col_kinds[i] { - ColKind::Fixed { + ColumnKind::Fixed { prefix, before_varlen: true, .. @@ -242,7 +227,7 @@ fn execute_row_encode( ctx, )?; } - ColKind::Fixed { .. } | ColKind::Variable { .. } => { + ColumnKind::Fixed { .. } | ColumnKind::Variable { .. } => { codec::field_encode( canonical, options.fields[i], @@ -260,14 +245,7 @@ fn execute_row_encode( let offsets_arr = PrimitiveArray::new(listview_offsets.freeze(), Validity::NonNullable).into_array(); let sizes_arr = PrimitiveArray::new(row_cursors.freeze(), Validity::NonNullable).into_array(); - // SAFETY: this encoder constructs `elements`, `offsets_arr`, and `sizes_arr` itself: - // - `elements` is a `PrimitiveArray` of length `total_len`. - // - `offsets_arr[i]` is `i * fixed_per_row + var_prefix[i]`, monotonically increasing and - // in `0..=total_len`. - // - `offsets_arr[i] + sizes_arr[i] <= total_len` by construction, and each row's slice is - // disjoint from every other row's. - // `try_new`'s validation re-walks every row to check exactly these invariants, which we - // already guarantee by construction, so we skip it. + // SAFETY: this encoder constructs `elements`, `offsets_arr`, and `sizes_arr` itself. 
Ok(unsafe { ListViewArray::new_unchecked(elements, offsets_arr, sizes_arr, Validity::NonNullable) } diff --git a/vortex-row/src/encoder.rs b/vortex-row/src/encoder.rs index 7bcd3e05627..47c067d8107 100644 --- a/vortex-row/src/encoder.rs +++ b/vortex-row/src/encoder.rs @@ -14,12 +14,12 @@ use vortex_error::vortex_bail; use crate::encode::RowEncode; use crate::options::RowEncodingOptions; -use crate::options::RowSortField; +use crate::options::RowSortFieldOptions; use crate::size::RowSize; /// Encodes N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose row /// byte slices compare lexicographically in the same order as a tuple comparison of the input -/// values under the configured [`RowSortField`]s. +/// values under the configured [`RowSortFieldOptions`]s. /// /// Construct with [`RowEncoder::new`] or [`RowEncoder::with_options`] to pin the per-column /// sort options, or use [`RowEncoder::default`] to apply ascending, nulls-first ordering to @@ -30,8 +30,8 @@ pub struct RowEncoder { } impl RowEncoder { - /// Construct a `RowEncoder` from one [`RowSortField`] per input column. - pub fn new(fields: impl IntoIterator) -> Self { + /// Construct a `RowEncoder` from one [`RowSortFieldOptions`] per input column. + pub fn new(fields: impl IntoIterator) -> Self { Self { options: Some(RowEncodingOptions::new(fields)), } @@ -120,42 +120,3 @@ fn reject_extension_dtype(dtype: &DType) -> VortexResult<()> { } Ok(()) } - -/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose bytes -/// are lexicographically comparable in the same order as a tuple comparison of the input -/// values according to `fields`. Convenience wrapper over [`RowEncoder::encode`]. -pub fn convert_columns( - cols: &[ArrayRef], - fields: &[RowSortField], - ctx: &mut ExecutionCtx, -) -> VortexResult { - RowEncoder::new(fields.iter().copied()).encode(cols, ctx) -} - -/// Like [`convert_columns`] but takes a prebuilt [`RowEncodingOptions`]. 
-pub fn convert_columns_with_options( - cols: &[ArrayRef], - options: &RowEncodingOptions, - ctx: &mut ExecutionCtx, -) -> VortexResult { - RowEncoder::with_options(options.clone()).encode(cols, ctx) -} - -/// Compute only the per-row sizes (in bytes) of the row-encoded form for N columns. -/// Convenience wrapper over [`RowEncoder::row_sizes`]. -pub fn compute_row_sizes( - cols: &[ArrayRef], - fields: &[RowSortField], - ctx: &mut ExecutionCtx, -) -> VortexResult { - RowEncoder::new(fields.iter().copied()).row_sizes(cols, ctx) -} - -/// Like [`compute_row_sizes`] but takes a prebuilt [`RowEncodingOptions`]. -pub fn compute_row_sizes_with_options( - cols: &[ArrayRef], - options: &RowEncodingOptions, - ctx: &mut ExecutionCtx, -) -> VortexResult { - RowEncoder::with_options(options.clone()).row_sizes(cols, ctx) -} diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index b36121f0da2..60c06a916d9 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -5,8 +5,9 @@ //! //! This crate converts one or more columnar arrays into a single `ListView` array whose //! row byte slices can be compared lexicographically. The byte ordering matches tuple -//! ordering of the input values under the requested [`RowSortField`] settings, making the -//! representation useful for sort keys and other row-key operations. +//! ordering of the input values under the requested [`RowSortFieldOptions`] settings, making the +//! representation useful for sort keys and other row-key operations. It is the Vortex analogue +//! of `arrow-row`. //! //! The public entry points are: //! - [`RowEncoder`], the primary API for encoding columns into row bytes. @@ -22,10 +23,99 @@ //! those sizes to allocate one contiguous elements buffer, then writes each column's bytes //! into the per-row slots from left to right. //! +//!

+//! +//! The row encoding format is **experimental**. Its byte layout, supported type set, and +//! edge-case semantics may change between Vortex releases. Do not persist these bytes or +//! depend on them as a stable interchange format. +//! +//!
+//! +//! # Byte-layout reference +//! +//! This is a schema-aware row-key format: the bytes carry no type tags, field names, or sort +//! options, so two encoded rows are comparable only when produced from the same schema and the +//! same per-column [`RowSortFieldOptions`]. +//! +//! ## Order property +//! +//! For a fixed schema with columns `c0..cn` and per-column sort fields `f0..fn`: +//! +//! ```text +//! encode(row_a) < encode(row_b) +//! <=> (row_a.c0, .., row_a.cn) < (row_b.c0, .., row_b.cn) +//! ``` +//! +//! under the requested direction and null placement of each column. This holds because (1) +//! every supported value is encoded so its bytes sort in the same order as the value, and (2) +//! fields are concatenated left to right, so lexicographic byte comparison performs tuple +//! comparison. `||` below means byte concatenation, `BE(x)` the fixed-width big-endian bytes of +//! `x`, and `!bytes` the bitwise complement of every byte. +//! +//! ## Field options +//! +//! Each input column carries a [`RowSortFieldOptions`] `{ descending, nulls_first }`. +//! `descending` reverses the order of non-null values; `nulls_first` is independent of +//! `descending`, so nulls can sort before or after non-nulls in either direction. +//! +//! ## Sentinels +//! +//! A leading sentinel byte classifies nullness (and, for variable-width values, empty vs +//! non-empty) before any value bytes are compared. The sentinel itself is never inverted for +//! `descending`, which keeps null placement independent of sort direction. +//! +//! | Family | Case | Asc, nulls first | Desc, nulls first | Asc, nulls last | Desc, nulls last | +//! | --- | --- | --- | --- | --- | --- | +//! | Fixed-width | Null | `0x00` | `0x00` | `0x02` | `0x02` | +//! | Fixed-width | Non-null | `0x01` | `0x01` | `0x01` | `0x01` | +//! | Variable-width | Null | `0x00` | `0x00` | `0xFF` | `0xFF` | +//! | Variable-width | Empty | `0x01` | `0xFE` | `0x01` | `0xFE` | +//! 
| Variable-width | Non-empty | `0x02` | `0xFD` | `0x02` | `0xFD` | +//! +//! Fixed-width sentinels are used by null, boolean, primitive, decimal, struct, and fixed-size +//! list values; variable-width sentinels by UTF-8 and binary values. +//! +//! ## Per-type encoding +//! +//! - **Null**: just the fixed-width sentinel, no body. +//! - **Boolean**: `sentinel || value_byte`, where `false = 0x01`, `true = 0x02` (inverted for +//! descending). Null bodies are a single zero byte. +//! - **Unsigned integer** (`u8`–`u64`): `0x01 || BE(value)` (`!BE(value)` descending). Null +//! bodies are `width(T)` zero bytes. +//! - **Signed integer** (`i8`–`i64`, and `i128` decimal storage): flip the sign bit of +//! `BE(value)` so negatives sort before non-negatives, then apply the descending complement. +//! - **Floating point** (`f16`/`f32`/`f64`): treat the IEEE bits as unsigned; flip the top bit +//! for non-negative values and all bits for negative, then big-endian. Yields total-ordering +//! semantics (`-0.0 < +0.0`, NaNs ordered by bit pattern). +//! - **Decimal**: encoded as its scaled signed-integer storage value at the *precision-minimal* +//! width (`1..=2 -> i8`, `3..=4 -> i16`, `5..=9 -> i32`, `10..=18 -> i64`, `19..=38 -> i128`), +//! using the signed-integer encoding. `Decimal256` is unsupported. The width is a pure +//! function of the precision, so storage physically wider than the precision requires is +//! narrowed losslessly before encoding (precision bounds the magnitude of every valid value). +//! - **UTF-8 / Binary**: a variable-width sentinel, and for non-empty values a block-structured +//! body. Each block is 32 data bytes plus a marker: non-final full blocks use marker `0xFF`, +//! the final block is zero-padded to 32 bytes with a marker giving its real length (`1..=32`). +//! Descending inverts the data bytes, padding, and markers. This preserves prefix order. +//! 
- **Struct / Fixed-size list**: an outer fixed-width sentinel followed by the children +//! encoded recursively in order with the parent's options. A null parent emits a *canonical +//! null body* (fixed-width children contribute their fixed null encoding; variable-width +//! children collapse to one null sentinel byte) so two null parents are byte-equal regardless +//! of underlying child data. A composite is fixed-width only when all of its children are. +//! +//! ## Output layout +//! +//! The result is a `ListView`: a single contiguous `elements` buffer holding every row's +//! bytes, with per-row `offsets` and `sizes`. Rows are not self-describing without `sizes`, +//! since a variable-width field can make one row longer than another. The sizing pass computes +//! `sizes` before writing, and the same array doubles as the per-row write cursor. +//! //! Supported logical types are nulls, booleans, primitive integers and floats, decimals up to //! 128 bits, UTF-8 and binary values, structs, and fixed-size lists. Extension, variant, //! union, and variable-size list arrays are rejected because this crate does not define an //! ordering for them. +//! +//! See `docs/specs/row-encoding.md` for the formal specification and a fully worked example +//! row. 
mod codec; mod encode; @@ -38,14 +128,14 @@ mod tests; pub use encode::RowEncode; pub use encoder::RowEncoder; -pub use encoder::compute_row_sizes; -pub use encoder::compute_row_sizes_with_options; -pub use encoder::convert_columns; -pub use encoder::convert_columns_with_options; pub use options::RowEncodingOptions; -pub use options::RowSortField; +pub use options::RowSortFieldOptions; pub use size::RowSize; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::ListViewArray; use vortex_array::scalar_fn::session::ScalarFnSessionExt; +use vortex_error::VortexResult; use vortex_session::VortexSession; /// Register the row-encoding scalar functions ([`RowSize`] and [`RowEncode`]) on the given @@ -58,3 +148,23 @@ pub fn initialize(session: &VortexSession) { session.scalar_fns().register(RowSize); session.scalar_fns().register(RowEncode); } + +/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose bytes +/// are lexicographically comparable in the same order as a tuple comparison of the input +/// values according to `fields`. Convenience wrapper over [`RowEncoder::encode`]. +pub fn convert_columns( + cols: &[ArrayRef], + fields: &[RowSortFieldOptions], + ctx: &mut ExecutionCtx, +) -> VortexResult { + RowEncoder::new(fields.iter().copied()).encode(cols, ctx) +} + +/// Like [`convert_columns`] but takes a prebuilt [`RowEncodingOptions`]. +pub fn convert_columns_with_options( + cols: &[ArrayRef], + options: &RowEncodingOptions, + ctx: &mut ExecutionCtx, +) -> VortexResult { + RowEncoder::with_options(options.clone()).encode(cols, ctx) +} diff --git a/vortex-row/src/options.rs b/vortex-row/src/options.rs index 380c9a3827f..1997512941d 100644 --- a/vortex-row/src/options.rs +++ b/vortex-row/src/options.rs @@ -8,24 +8,24 @@ use smallvec::SmallVec; /// Per-column ordering options for row-oriented encoding. /// -/// A `RowSortField` describes how one input column contributes to a row key. 
Descending order +/// A `RowSortFieldOptions` describes how one input column contributes to a row key. Descending order /// reverses the encoded value bytes for that column. Null placement is controlled separately, /// so nulls keep the requested position relative to non-null values in either direction. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct RowSortField { +pub struct RowSortFieldOptions { /// If true, this column sorts in descending order. pub descending: bool, /// If true, nulls sort before non-null values. pub nulls_first: bool, } -impl Default for RowSortField { +impl Default for RowSortFieldOptions { fn default() -> Self { Self::ascending() } } -impl Display for RowSortField { +impl Display for RowSortFieldOptions { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, @@ -35,8 +35,8 @@ impl Display for RowSortField { } } -impl RowSortField { - /// Construct a new `RowSortField` with explicit options. +impl RowSortFieldOptions { + /// Construct a new `RowSortFieldOptions` with explicit options. pub const fn new(descending: bool, nulls_first: bool) -> Self { Self { descending, @@ -65,43 +65,24 @@ impl RowSortField { self.nulls_first = false; self } - - /// Returns the sentinel byte to write for a non-null value. - #[inline] - pub(crate) fn non_null_sentinel(&self) -> u8 { - // Non-null is always 0x01. Null choices are < or > 0x01. - 0x01 - } - - /// Returns the sentinel byte to write for a null value. - #[inline] - pub(crate) fn null_sentinel(&self) -> u8 { - if self.nulls_first { - // Nulls before non-nulls (smaller byte sorts first). - 0x00 - } else { - // Nulls after non-nulls (larger byte sorts later). - 0x02 - } - } } const FIELDS_INLINE: usize = 4; /// Ordering options for row-oriented encoding. 
/// -/// The options contain one [`RowSortField`] per input column, in the same order as the columns +/// The options contain one [`RowSortFieldOptions`] per input column, in the same order as the columns /// passed to [`convert_columns`](crate::convert_columns), /// [`compute_row_sizes`](crate::compute_row_sizes), [`RowSize`](crate::RowSize), or /// [`RowEncode`](crate::RowEncode). #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct RowEncodingOptions { - pub(crate) fields: SmallVec<[RowSortField; FIELDS_INLINE]>, + pub(crate) fields: SmallVec<[RowSortFieldOptions; FIELDS_INLINE]>, } impl RowEncodingOptions { - /// Construct a new `RowEncodingOptions` from any iterator of [`RowSortField`]s. - pub fn new(fields: impl IntoIterator) -> Self { + /// Construct a new `RowEncodingOptions` from any iterator of [`RowSortFieldOptions`]s. + pub fn new(fields: impl IntoIterator) -> Self { Self { fields: fields.into_iter().collect(), } @@ -109,11 +90,14 @@ impl RowEncodingOptions { /// Construct default ascending, nulls-first options for `column_count` input columns. pub fn default_for_columns(column_count: usize) -> Self { - Self::new(std::iter::repeat_n(RowSortField::default(), column_count)) + Self::new(std::iter::repeat_n( + RowSortFieldOptions::default(), + column_count, + )) } /// Borrow the per-column sort fields. 
- pub fn fields(&self) -> &[RowSortField] { + pub fn fields(&self) -> &[RowSortFieldOptions] { &self.fields } @@ -128,8 +112,8 @@ impl RowEncodingOptions { } } -impl FromIterator for RowEncodingOptions { - fn from_iter>(iter: T) -> Self { +impl FromIterator for RowEncodingOptions { + fn from_iter>(iter: T) -> Self { Self::new(iter) } } @@ -180,10 +164,10 @@ pub(crate) fn deserialize_row_encoding_options( expected ); } - let mut fields: SmallVec<[RowSortField; FIELDS_INLINE]> = SmallVec::with_capacity(n); + let mut fields: SmallVec<[RowSortFieldOptions; FIELDS_INLINE]> = SmallVec::with_capacity(n); let mut i = 4; for _ in 0..n { - fields.push(RowSortField { + fields.push(RowSortFieldOptions { descending: bytes[i] != 0, nulls_first: bytes[i + 1] != 0, }); diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs index 9112379a6f4..a9b3255227d 100644 --- a/vortex-row/src/size.rs +++ b/vortex-row/src/size.rs @@ -4,6 +4,7 @@ //! `RowSize` variadic scalar function: aggregate per-row byte sizes for N input columns. use std::sync::Arc; +use std::sync::LazyLock; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -13,6 +14,7 @@ use vortex_array::arrays::ConstantArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; use vortex_array::dtype::DType; +use vortex_array::dtype::FieldDType; use vortex_array::dtype::FieldName; use vortex_array::dtype::FieldNames; use vortex_array::dtype::Nullability; @@ -44,7 +46,7 @@ use crate::options::serialize_row_encoding_options; /// path (no varlen before this column, so the within-row position is constant per row) and /// the cursor-write path. #[derive(Clone, Copy, Debug)] -pub(crate) enum ColKind { +pub(crate) enum ColumnKind { /// Fixed-width column. `prefix` is the within-row byte offset of this column's first /// byte. When `before_varlen` is true no variable-length column precedes this one, so the /// within-row offset is constant for every row. 
@@ -63,7 +65,7 @@ pub(crate) enum ColKind { pub(crate) struct SizePassResult { pub fixed_per_row: u32, pub var_lengths: Option>, - pub col_kinds: Vec, + pub col_kinds: Vec, pub first_varlen_idx: Option, pub columns: Vec, } @@ -97,7 +99,7 @@ pub(crate) fn compute_sizes( let nrows = args.row_count(); let mut columns: Vec = Vec::with_capacity(n_inputs); - let mut col_kinds: Vec = Vec::with_capacity(n_inputs); + let mut col_kinds: Vec = Vec::with_capacity(n_inputs); let mut fixed_per_row: u32 = 0; let mut var_lengths: Option> = None; let mut first_varlen_idx: Option = None; @@ -118,7 +120,7 @@ pub(crate) fn compute_sizes( let canonical = col.execute::(ctx)?; match width { RowWidth::Fixed(w) => { - col_kinds.push(ColKind::Fixed { + col_kinds.push(ColumnKind::Fixed { prefix: running_fixed_prefix, before_varlen: first_varlen_idx.is_none(), }); @@ -133,7 +135,7 @@ pub(crate) fn compute_sizes( } let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]); codec::field_size(&canonical, options.fields[i], v, ctx)?; - col_kinds.push(ColKind::Variable { + col_kinds.push(ColumnKind::Variable { fixed_prefix: running_fixed_prefix, }); } @@ -151,7 +153,7 @@ pub(crate) fn compute_sizes( } /// Variadic scalar function that, given N input columns and per-column -/// [`RowSortField`](crate::RowSortField)s, +/// [`RowSortFieldOptions`](crate::RowSortFieldOptions)s, /// returns a `Struct { fixed: U32, var: U32 }` array of per-row byte sizes for the /// row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode). /// @@ -169,22 +171,17 @@ pub(crate) fn compute_sizes( pub struct RowSize; /// Returns the [`FieldNames`] used by the [`RowSize`] output struct. -pub(crate) fn row_size_field_names() -> FieldNames { - FieldNames::from([FieldName::from("fixed"), FieldName::from("var")]) -} - -/// Returns the output [`DType`] of [`RowSize`]. 
-pub(crate) fn row_size_struct_dtype() -> DType { - DType::Struct( - StructFields::new( - row_size_field_names(), +pub(crate) fn row_size_struct_fields() -> StructFields { + static FIELDS: LazyLock = LazyLock::new(|| { + StructFields::from_fields( + FieldNames::from([FieldName::from("fixed"), FieldName::from("var")]), vec![ - DType::Primitive(PType::U32, Nullability::NonNullable), - DType::Primitive(PType::U32, Nullability::NonNullable), + FieldDType::from(DType::Primitive(PType::U32, Nullability::NonNullable)), + FieldDType::from(DType::Primitive(PType::U32, Nullability::NonNullable)), ], - ), - Nullability::NonNullable, - ) + ) + }); + *FIELDS } impl ScalarFnVTable for RowSize { @@ -215,7 +212,10 @@ impl ScalarFnVTable for RowSize { } fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult { - Ok(row_size_struct_dtype()) + Ok(DType::Struct( + row_size_struct_fields(), + Nullability::NonNullable, + )) } fn execute( @@ -233,9 +233,9 @@ impl ScalarFnVTable for RowSize { .into_array(), None => ConstantArray::new(Scalar::from(0u32), nrows).into_array(), }; - Ok(StructArray::try_new( - row_size_field_names(), + Ok(StructArray::try_new_with_dtype( vec![fixed_array, var_array], + row_size_struct_fields(), nrows, Validity::NonNullable, )? 
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs index 5c85c911154..8c36c66a95c 100644 --- a/vortex-row/src/tests.rs +++ b/vortex-row/src/tests.rs @@ -6,6 +6,8 @@ use std::f64::consts::PI; use rstest::rstest; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; @@ -22,11 +24,16 @@ use vortex_array::extension::datetime::TimeUnit; use vortex_error::VortexResult; use crate::RowEncoder; -use crate::RowEncodingOptions; -use crate::RowSortField; -use crate::compute_row_sizes_with_options; +use crate::RowSortFieldOptions; use crate::convert_columns; -use crate::convert_columns_with_options; + +fn compute_row_sizes( + cols: &[ArrayRef], + fields: &[RowSortFieldOptions], + ctx: &mut ExecutionCtx, +) -> VortexResult { + RowEncoder::new(fields.iter().copied()).row_sizes(cols, ctx) +} fn collect_row_bytes(array: &ListViewArray) -> Vec> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); @@ -45,7 +52,7 @@ fn collect_row_bytes(array: &ListViewArray) -> Vec> { fn assert_sort_order_i64(values: Vec, descending: bool) -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let col = PrimitiveArray::from_iter(values.clone()).into_array(); - let field = RowSortField::new(descending, true); + let field = RowSortFieldOptions::new(descending, true); let encoded = convert_columns(&[col], &[field], &mut ctx)?; let rows = collect_row_bytes(&encoded); @@ -80,7 +87,7 @@ fn primitive_u32_sort_order() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let values: Vec = vec![0, 1, 100, u32::MAX, 42, 17]; let col = PrimitiveArray::from_iter(values.clone()).into_array(); - let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted_rows = rows.clone(); @@ -100,7 +107,7 @@ 
fn reject_temporal_extension_dtype_early() -> VortexResult<()> { let ext_dtype = Date::new(TimeUnit::Days, Nullability::NonNullable).erased(); let col = ExtensionArray::new(ext_dtype, storage).into_array(); - let err = convert_columns(&[col], &[RowSortField::ascending()], &mut ctx) + let err = convert_columns(&[col], &[RowSortFieldOptions::ascending()], &mut ctx) .expect_err("temporal extensions should be rejected"); assert!( err.to_string().contains("Extension arrays yet"), @@ -119,7 +126,7 @@ fn reject_nested_temporal_extension_dtype_early() -> VortexResult<()> { let struct_col = StructArray::from_fields(&[("date", date_col), ("tag", tag_col)])?.into_array(); - let err = convert_columns(&[struct_col], &[RowSortField::ascending()], &mut ctx) + let err = convert_columns(&[struct_col], &[RowSortFieldOptions::ascending()], &mut ctx) .expect_err("nested temporal extensions should be rejected"); assert!( err.to_string().contains("Extension arrays yet"), @@ -136,7 +143,7 @@ fn primitive_f64_sort_order() -> VortexResult<()> { // -0.0 == 0.0. 
let values: Vec = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, PI]; let col = PrimitiveArray::from_iter(values.clone()).into_array(); - let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted_rows = rows.clone(); @@ -153,7 +160,7 @@ fn primitive_f64_sort_order() -> VortexResult<()> { fn bool_sort_order() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let col = BoolArray::from_iter([true, false, true, false]).into_array(); - let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted = rows.clone(); @@ -178,7 +185,7 @@ fn utf8_sort_order() -> VortexResult<()> { "banana_loaf_for_test", ]; let col = VarBinViewArray::from_iter_str(values.clone()).into_array(); - let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted = rows.clone(); @@ -200,7 +207,10 @@ fn multi_column_sort() -> VortexResult<()> { let col1 = VarBinViewArray::from_iter_str(strs.clone()).into_array(); let encoded = convert_columns( &[col0, col1], - &[RowSortField::default(), RowSortField::default()], + &[ + RowSortFieldOptions::default(), + RowSortFieldOptions::default(), + ], &mut ctx, )?; let rows = collect_row_bytes(&encoded); @@ -223,7 +233,7 @@ fn nulls_first_and_last() -> VortexResult<()> { // nulls_first=true let encoded = convert_columns( std::slice::from_ref(&col), - &[RowSortField::ascending()], + &[RowSortFieldOptions::ascending()], &mut ctx, )?; let rows = collect_row_bytes(&encoded); @@ -236,7 +246,11 @@ fn nulls_first_and_last() -> VortexResult<()> { assert_eq!(sorted[i][0], 
0x00); } // nulls_first=false - let encoded = convert_columns(&[col], &[RowSortField::ascending().nulls_last()], &mut ctx)?; + let encoded = convert_columns( + &[col], + &[RowSortFieldOptions::ascending().nulls_last()], + &mut ctx, + )?; let rows = collect_row_bytes(&encoded); let mut sorted = rows; sorted.sort(); @@ -248,42 +262,10 @@ fn nulls_first_and_last() -> VortexResult<()> { Ok(()) } -#[test] -fn reusable_options_helpers() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let options = RowEncodingOptions::new([RowSortField::descending().nulls_last()]); - assert_eq!(options.len(), 1); - assert!(!options.is_empty()); - assert_eq!( - options.fields(), - &[RowSortField { - descending: true, - nulls_first: false - }] - ); - - let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); - let encoder = RowEncoder::with_options(options.clone()); - assert_eq!(encoder.options(), Some(&options)); - - let encoded = encoder.encode(std::slice::from_ref(&col), &mut ctx)?; - assert_eq!(encoded.len(), 3); - - let sizes = encoder.row_sizes(std::slice::from_ref(&col), &mut ctx)?; - assert_eq!(sizes.len(), 3); - - let encoded = convert_columns_with_options(std::slice::from_ref(&col), &options, &mut ctx)?; - assert_eq!(encoded.len(), 3); - - let sizes = compute_row_sizes_with_options(std::slice::from_ref(&col), &options, &mut ctx)?; - assert_eq!(sizes.len(), 3); - Ok(()) -} - #[test] fn row_encoder_new_accepts_sort_fields() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let encoder = RowEncoder::new([RowSortField::ascending()]); + let encoder = RowEncoder::new([RowSortFieldOptions::ascending()]); let col = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); let encoded = encoder.encode(std::slice::from_ref(&col), &mut ctx)?; @@ -312,7 +294,7 @@ fn struct_sort_order() -> VortexResult<()> { let name_arr = VarBinViewArray::from_iter_str(names.clone()).into_array(); let struct_arr = StructArray::from_fields(&[("id", 
id_arr), ("name", name_arr)])?.into_array(); - let encoded = convert_columns(&[struct_arr], &[RowSortField::default()], &mut ctx)?; + let encoded = convert_columns(&[struct_arr], &[RowSortFieldOptions::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted = rows.clone(); @@ -330,8 +312,6 @@ fn row_size_struct_shape() -> VortexResult<()> { use vortex_array::arrays::StructArray; use vortex_array::arrays::struct_::StructArrayExt; - use crate::compute_row_sizes; - let mut ctx = LEGACY_SESSION.create_execution_ctx(); let ints: Vec = vec![1, 2, 3, 4, 5]; let strs = vec!["a", "bb", "ccc", "", "eeeee"]; @@ -340,7 +320,10 @@ fn row_size_struct_shape() -> VortexResult<()> { let sizes = compute_row_sizes( &[col0, col1], - &[RowSortField::default(), RowSortField::default()], + &[ + RowSortFieldOptions::default(), + RowSortFieldOptions::default(), + ], &mut ctx, )?; // Shape must be Struct { fixed, var } @@ -384,7 +367,10 @@ fn single_buffer_invariant() -> VortexResult<()> { let col1 = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array(); let encoded = convert_columns( &[col0, col1], - &[RowSortField::default(), RowSortField::default()], + &[ + RowSortFieldOptions::default(), + RowSortFieldOptions::default(), + ], &mut ctx, )?; @@ -417,7 +403,10 @@ fn multi_column_varlen_empty_vs_nul_byte_string() -> VortexResult<()> { let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1]).into_array(); let encoded = convert_columns( &[col1, col2], - &[RowSortField::default(), RowSortField::default()], + &[ + RowSortFieldOptions::default(), + RowSortFieldOptions::default(), + ], &mut ctx, )?; let rows = collect_row_bytes(&encoded); @@ -454,7 +443,10 @@ fn multi_column_varlen_null_vs_empty() -> VortexResult<()> { let col2 = PrimitiveArray::from_iter([1i32, 1, 1, 1, 1]).into_array(); let encoded = convert_columns( &[col1, col2], - &[RowSortField::ascending(), RowSortField::ascending()], + &[ + RowSortFieldOptions::ascending(), + 
RowSortFieldOptions::ascending(), + ], &mut ctx, )?; let rows = collect_row_bytes(&encoded); @@ -507,7 +499,7 @@ fn multi_column_varlen_null_vs_empty() -> VortexResult<()> { fn varlen_descending_empty_vs_non_empty() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let col = VarBinViewArray::from_iter_str(["a", "", "abc"]).into_array(); - let encoded = convert_columns(&[col], &[RowSortField::descending()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortFieldOptions::descending()], &mut ctx)?; let rows = collect_row_bytes(&encoded); // Natural order: "" < "a" < "abc"; descending byte sort: "abc" first, "" last. @@ -541,7 +533,7 @@ fn null_struct_rows_with_varying_child_lengths_are_byte_equal() -> VortexResult< let validity = Validity::from(bits); let struct_arr = StructArray::try_new(field_names, vec![names], 3, validity)?.into_array(); - let encoded = convert_columns(&[struct_arr], &[RowSortField::ascending()], &mut ctx)?; + let encoded = convert_columns(&[struct_arr], &[RowSortFieldOptions::ascending()], &mut ctx)?; let rows = collect_row_bytes(&encoded); assert_eq!(rows.len(), 3); // Both null parent rows must produce identical bytes despite the divergent children. 
@@ -559,7 +551,7 @@ fn primitive_f32_sort_order() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let values: Vec = vec![-1.5, 0.0, 1.5, f32::INFINITY, f32::NEG_INFINITY]; let col = PrimitiveArray::from_iter(values.clone()).into_array(); - let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted_rows = rows.clone(); sorted_rows.sort(); @@ -582,7 +574,7 @@ fn primitive_f16_sort_order() -> VortexResult<()> { f16::NEG_INFINITY, ]; let col = PrimitiveArray::from_iter(values.clone()).into_array(); - let encoded = convert_columns(&[col], &[RowSortField::default()], &mut ctx)?; + let encoded = convert_columns(&[col], &[RowSortFieldOptions::default()], &mut ctx)?; let rows = collect_row_bytes(&encoded); let mut sorted_rows = rows.clone(); sorted_rows.sort(); @@ -593,6 +585,165 @@ fn primitive_f16_sort_order() -> VortexResult<()> { Ok(()) } +#[test] +fn decimal_nullable_sort_order() -> VortexResult<()> { + use vortex_array::arrays::DecimalArray; + use vortex_array::dtype::DecimalDType; + use vortex_array::validity::Validity; + use vortex_buffer::BitBuffer; + use vortex_buffer::Buffer; + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // precision=9 -> minimal physical type I32; row 1 is null. + let dt = DecimalDType::new(9, 3); + let values: Vec = vec![5, 0, -7, 0, 123]; + let validity = Validity::from(BitBuffer::from_iter([true, false, true, false, true])); + let col = + DecimalArray::new::(Buffer::::copy_from(&values), dt, validity).into_array(); + + let encoded = convert_columns(&[col], &[RowSortFieldOptions::ascending()], &mut ctx)?; + let mut sorted = collect_row_bytes(&encoded); + sorted.sort(); + // nulls_first: the two null rows sort to the front and are byte-equal. 
+ assert_eq!(sorted[0][0], 0x00, "null sentinel sorts first"); + assert_eq!(sorted[0], sorted[1], "null decimal rows are byte-equal"); + assert_eq!(sorted[1][0], 0x00); + assert_eq!(sorted[2][0], 0x01, "non-null sentinel"); + Ok(()) +} + +/// Regression: a decimal column whose physical `values_type` is wider than its precision +/// requires (precision 5 fits in `I32` but is stored as `i64`) must still encode correctly. +/// The size pass reserves the precision-minimal width, so the encode pass must narrow the +/// physical values to that same width rather than writing the wider physical bytes (which +/// previously overran the per-row slot). Byte order must still match the natural value order. +#[rstest] +#[case::ascending(false)] +#[case::descending(true)] +fn decimal_wide_physical_storage_sort_order(#[case] descending: bool) -> VortexResult<()> { + use vortex_array::arrays::DecimalArray; + use vortex_array::dtype::DecimalDType; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // precision=5 fits in I32 (4 bytes), but store physically as i64 (8 bytes). + let dt = DecimalDType::new(5, 2); + let values: Vec = vec![1, -4, 0, 99_999, -99_999, 42, -42]; + let col = + DecimalArray::new::(Buffer::::copy_from(&values), dt, Validity::NonNullable) + .into_array(); + let field = RowSortFieldOptions::new(descending, true); + let encoded = convert_columns(&[col], &[field], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + + // Each encoded row is the precision-minimal width: sentinel(1) + I32(4) = 5 bytes. 
+ assert!(rows.iter().all(|r| r.len() == 5), "row lens: {:?}", rows); + + let mut idx: Vec = (0..values.len()).collect(); + if descending { + idx.sort_by(|a, b| values[*b].cmp(&values[*a])); + } else { + idx.sort_by(|a, b| values[*a].cmp(&values[*b])); + } + let expected: Vec> = idx.iter().map(|&i| rows[i].clone()).collect(); + let mut sorted = rows; + sorted.sort(); + assert_eq!( + sorted, expected, + "decimal byte order must match value order" + ); + Ok(()) +} + +/// Lock-in reference test: encode the worked-example row from `docs/specs/row-encoding.md` +/// (one row with every supported encoding family, all columns ascending nulls-first) and +/// assert the exact encoded bytes. This pins the byte layout so any accidental change to the +/// format is caught, and keeps the spec document honest. +#[test] +fn reference_row_bytes_match_spec() -> VortexResult<()> { + use vortex_array::arrays::DecimalArray; + use vortex_array::arrays::FixedSizeListArray; + use vortex_array::arrays::NullArray; + use vortex_array::dtype::DecimalDType; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + + let null_col = NullArray::new(1).into_array(); + let bool_col = BoolArray::from_iter([true]).into_array(); + let uint_col = PrimitiveArray::from_iter([258u16]).into_array(); + let int_col = PrimitiveArray::from_iter([-5i16]).into_array(); + let float_col = PrimitiveArray::from_iter([1.5f32]).into_array(); + let decimal_col = DecimalArray::new::( + Buffer::::copy_from([12345i32]), + DecimalDType::new(9, 2), + Validity::NonNullable, + ) + .into_array(); + let utf8_col = VarBinViewArray::from_iter_str(["a"]).into_array(); + let binary_col = VarBinViewArray::from_iter_bin([[0xDEu8, 0xAD, 0xBE, 0xEF]]).into_array(); + let struct_col = StructArray::from_fields(&[ + ("x", PrimitiveArray::from_iter([1i8]).into_array()), + ("y", VarBinViewArray::from_iter_str([""]).into_array()), + ])? 
+ .into_array(); + let fsl_col = FixedSizeListArray::try_new( + PrimitiveArray::from_iter([1u8, 2, 3]).into_array(), + 3, + Validity::NonNullable, + 1, + )? + .into_array(); + + let cols = [ + null_col, + bool_col, + uint_col, + int_col, + float_col, + decimal_col, + utf8_col, + binary_col, + struct_col, + fsl_col, + ]; + let fields = vec![RowSortFieldOptions::default(); cols.len()]; + let encoded = convert_columns(&cols, &fields, &mut ctx)?; + let rows = collect_row_bytes(&encoded); + assert_eq!(rows.len(), 1); + + // Per-column encodings from the spec's worked example. + let mut expected: Vec = Vec::new(); + expected.extend_from_slice(&[0x00]); // null_col + expected.extend_from_slice(&[0x01, 0x02]); // bool_col: true + expected.extend_from_slice(&[0x01, 0x01, 0x02]); // uint_col: 258 u16 + expected.extend_from_slice(&[0x01, 0x7F, 0xFB]); // int_col: -5 i16 (sign-bit flipped) + expected.extend_from_slice(&[0x01, 0xBF, 0xC0, 0x00, 0x00]); // float_col: 1.5 f32 + expected.extend_from_slice(&[0x01, 0x80, 0x00, 0x30, 0x39]); // decimal_col: 12345 i32 + // utf8 "a": non-empty sentinel, 'a', zero pad to 32, length marker 1. + expected.push(0x02); + expected.push(b'a'); + expected.extend(std::iter::repeat_n(0u8, 31)); + expected.push(0x01); + // binary DE AD BE EF: non-empty sentinel, data, zero pad to 32, length marker 4. + expected.push(0x02); + expected.extend_from_slice(&[0xDE, 0xAD, 0xBE, 0xEF]); + expected.extend(std::iter::repeat_n(0u8, 28)); + expected.push(0x04); + // struct { x: 1 i8, y: "" }: outer sentinel, x = 0x01 || 0x81, y = empty sentinel 0x01. + expected.extend_from_slice(&[0x01, 0x01, 0x81, 0x01]); + // fsl [1, 2, 3] u8: outer sentinel, then per element 0x01 || BE(value). 
+ expected.extend_from_slice(&[0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x03]); + + assert_eq!( + rows[0], expected, + "encoded reference row does not match the documented byte layout" + ); + Ok(()) +} + #[test] fn reject_list_dtype_early() { use vortex_array::ArrayRef; @@ -606,7 +757,7 @@ fn reject_list_dtype_early() { let list: ArrayRef = ListArray::try_new(elements, offsets, Validity::NonNullable) .unwrap() .into_array(); - let err = convert_columns(&[list], &[RowSortField::default()], &mut ctx) + let err = convert_columns(&[list], &[RowSortFieldOptions::default()], &mut ctx) .expect_err("List should not be accepted"); assert!( err.to_string().contains("List"), From 9f134b4b6568bf0e487e8f82b1ed69e1f34df7c4 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 8 Jun 2026 13:23:48 +0100 Subject: [PATCH 2/3] fix Signed-off-by: Joe Isaacs --- vortex-row/src/size.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs index a9b3255227d..5465ce34357 100644 --- a/vortex-row/src/size.rs +++ b/vortex-row/src/size.rs @@ -181,7 +181,7 @@ pub(crate) fn row_size_struct_fields() -> StructFields { ], ) }); - *FIELDS + FIELDS.clone() } impl ScalarFnVTable for RowSize { From d0913ffab48257f086bee41f320fd60e2996a131 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 11 Jun 2026 18:34:30 +0100 Subject: [PATCH 3/3] u Signed-off-by: Joe Isaacs --- vortex-array/src/arrays/decimal/utils.rs | 86 ++++++++++++++++++++++++ vortex-row/src/codec/encoding.rs | 35 ++++------ vortex-row/src/codec/mod.rs | 2 - vortex-row/src/tests.rs | 37 ++++++++++ 4 files changed, 136 insertions(+), 24 deletions(-) diff --git a/vortex-array/src/arrays/decimal/utils.rs b/vortex-array/src/arrays/decimal/utils.rs index a1eafde0fe3..93660f943cf 100644 --- a/vortex-array/src/arrays/decimal/utils.rs +++ b/vortex-array/src/arrays/decimal/utils.rs @@ -3,11 +3,17 @@ use itertools::Itertools; use itertools::MinMaxResult; +use vortex_buffer::Buffer; use 
vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_mask::Mask; use crate::arrays::DecimalArray; +use crate::dtype::BigCast; use crate::dtype::DecimalType; use crate::dtype::i256; +use crate::match_each_decimal_value_type; macro_rules! try_downcast { ($array:expr, from: $src:ty, to: $($dst:ty),*) => {{ @@ -41,6 +47,44 @@ macro_rules! try_downcast { }}; } +/// Cast the array's physical values to `target`, preserving the logical decimal dtype and +/// validity. +/// +/// `mask` must be the materialized validity of `array`. Null slots are unconstrained by the +/// [`DecimalArray`] invariants (only *non-null* values must fit the precision) and may hold +/// bytes that do not fit `target`, so they are replaced with zero rather than cast. +/// +/// # Errors +/// +/// Returns an error if a non-null value cannot be represented in `target`. +pub fn cast_decimal_values( + array: &DecimalArray, + target: DecimalType, + mask: &Mask, +) -> VortexResult { + let decimal_dtype = array.decimal_dtype(); + let validity = array.validity()?; + match_each_decimal_value_type!(array.values_type(), |F| { + let from = array.buffer::(); + match_each_decimal_value_type!(target, |T| { + let values = from + .iter() + .enumerate() + .map(|(i, &v)| { + if mask.value(i) { + ::from(v).ok_or_else(|| { + vortex_err!("decimal value {v} does not fit values type {target}") + }) + } else { + Ok(T::default()) + } + }) + .collect::>>()?; + Ok(DecimalArray::new::(values, decimal_dtype, validity)) + }) + }) +} + /// Attempt to narrow the decimal array to any smaller supported type. 
pub fn narrowed_decimal(decimal_array: DecimalArray) -> DecimalArray { match decimal_array.values_type() { @@ -63,3 +107,45 @@ pub fn narrowed_decimal(decimal_array: DecimalArray) -> DecimalArray { } } } + +#[cfg(test)] +mod tests { + use vortex_buffer::BitBuffer; + use vortex_buffer::Buffer; + use vortex_error::VortexResult; + use vortex_mask::Mask; + + use super::cast_decimal_values; + use crate::arrays::DecimalArray; + use crate::dtype::DecimalDType; + use crate::dtype::DecimalType; + use crate::validity::Validity; + + #[test] + fn cast_zeroes_garbage_null_slots() -> VortexResult<()> { + let dt = DecimalDType::new(5, 2); + let validity = Validity::from(BitBuffer::from_iter([true, false, true])); + let arr = DecimalArray::new::( + Buffer::::copy_from([7i64, i64::MAX, -99_999]), + dt, + validity, + ); + let mask = Mask::from_iter([true, false, true]); + let narrowed = cast_decimal_values(&arr, DecimalType::I32, &mask)?; + assert_eq!(narrowed.values_type(), DecimalType::I32); + assert_eq!(narrowed.buffer::().as_slice(), &[7, 0, -99_999]); + Ok(()) + } + + #[test] + fn cast_rejects_non_null_value_that_does_not_fit() { + let dt = DecimalDType::new(5, 2); + let arr = DecimalArray::new::( + Buffer::::copy_from([i64::MAX]), + dt, + Validity::NonNullable, + ); + let mask = Mask::new_true(1); + assert!(cast_decimal_values(&arr, DecimalType::I32, &mask).is_err()); + } +} diff --git a/vortex-row/src/codec/encoding.rs b/vortex-row/src/codec/encoding.rs index c3e90641b2c..153f14c6015 100644 --- a/vortex-row/src/codec/encoding.rs +++ b/vortex-row/src/codec/encoding.rs @@ -4,6 +4,8 @@ //! Encode pass leaf kernels: per-row byte writers for each canonical variant, plus the //! variable-length block body encoder. +use vortex_array::arrays::decimal::cast_decimal_values; + use super::*; pub(super) fn encode_null( @@ -127,29 +129,18 @@ fn encode_primitive_typed( /// values. 
A `DecimalArray` may legally carry a wider `values_type` than its precision requires, /// so without this normalization the encode pass would write more bytes than the size pass /// reserved. The narrowing is always lossless because a decimal's precision bounds the magnitude -/// of every valid value, so the precision-minimal type can represent it. -fn narrow_decimal_to_smallest(arr: &DecimalArray) -> VortexResult> { - let decimal_dtype = arr.decimal_dtype(); - let target = DecimalType::smallest_decimal_value_type(&decimal_dtype); +/// of every valid *non-null* value, so the precision-minimal type can represent it. Null slots +/// are unconstrained and may hold values that do not fit; [`cast_decimal_values`] narrows them +/// to zero instead of casting (the encoder zero-fills null bodies anyway). +fn narrow_decimal_to_smallest( + arr: &DecimalArray, + mask: &vortex_mask::Mask, +) -> VortexResult> { + let target = DecimalType::smallest_decimal_value_type(&arr.decimal_dtype()); if arr.values_type() == target { return Ok(None); } - let validity = arr.as_ref().validity()?; - let narrowed = match_each_decimal_value_type!(arr.values_type(), |P| { - let from = arr.buffer::

(); - match_each_decimal_value_type!(target, |Q| { - DecimalArray::new::(narrow_decimal_buffer::(from), decimal_dtype, validity) - }) - }); - Ok(Some(narrowed)) -} - -/// Narrow a buffer of decimal values from type `F` to a smaller type `T`. Lossless because the -/// caller only narrows to the precision-minimal type, which can represent every valid value. -fn narrow_decimal_buffer(from: Buffer) -> Buffer { - from.iter() - .map(|&v| T::from(v).vortex_expect("decimal value must fit its precision-minimal type")) - .collect() + cast_decimal_values(arr, target, mask).map(Some) } pub(super) fn encode_decimal( @@ -162,9 +153,9 @@ pub(super) fn encode_decimal( ) -> VortexResult<()> { // Normalize to the precision-minimal physical type so the bytes we write match the width the // size pass reserved (see `narrow_decimal_to_smallest`). - let narrowed = narrow_decimal_to_smallest(arr)?; - let arr = narrowed.as_ref().unwrap_or(arr); let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?; + let narrowed = narrow_decimal_to_smallest(arr, &mask)?; + let arr = narrowed.as_ref().unwrap_or(arr); match arr.values_type() { DecimalType::I8 => { encode_decimal_typed::(arr, &mask, field, row_offsets, col_offset, out) diff --git a/vortex-row/src/codec/mod.rs b/vortex-row/src/codec/mod.rs index d5c8afbcfb7..e1f63da2852 100644 --- a/vortex-row/src/codec/mod.rs +++ b/vortex-row/src/codec/mod.rs @@ -40,10 +40,8 @@ use vortex_array::dtype::DecimalType; use vortex_array::dtype::NativeDecimalType; use vortex_array::dtype::NativePType; use vortex_array::dtype::half::f16; -use vortex_array::match_each_decimal_value_type; use vortex_array::match_each_native_ptype; use vortex_array::validity::Validity; -use vortex_buffer::Buffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs index 8c36c66a95c..90720b818fb 100644 --- a/vortex-row/src/tests.rs +++ b/vortex-row/src/tests.rs @@ 
-656,6 +656,43 @@ fn decimal_wide_physical_storage_sort_order(#[case] descending: bool) -> VortexR Ok(()) } +/// Regression: a nullable decimal stored wider than its precision requires may hold arbitrary +/// garbage in null slots (`DecimalArray` only constrains *non-null* values to the precision). +/// The narrowing pass must skip null slots instead of panicking when the garbage does not fit +/// the precision-minimal type. +#[test] +fn decimal_wide_storage_with_garbage_null_slot() -> VortexResult<()> { + use vortex_array::arrays::DecimalArray; + use vortex_array::dtype::DecimalDType; + use vortex_array::validity::Validity; + use vortex_buffer::BitBuffer; + use vortex_buffer::Buffer; + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // precision=5 -> minimal physical type I32, but stored as i64. Row 1 is null and its slot + // holds a value that fits neither i32 nor precision 5. + let dt = DecimalDType::new(5, 2); + let values: Vec = vec![7, i64::MAX, -99_999]; + let validity = Validity::from(BitBuffer::from_iter([true, false, true])); + let col = + DecimalArray::new::(Buffer::::copy_from(&values), dt, validity).into_array(); + + let encoded = convert_columns(&[col], &[RowSortFieldOptions::ascending()], &mut ctx)?; + let rows = collect_row_bytes(&encoded); + // sentinel(1) + i32(4) per row. + assert!(rows.iter().all(|r| r.len() == 5), "row lens: {rows:?}"); + // The null row encodes as the canonical null (sentinel 0x00, zero body), so it sorts first + // and carries no trace of the garbage slot value. + assert_eq!(rows[1], vec![0x00, 0, 0, 0, 0]); + let mut sorted = rows.clone(); + sorted.sort(); + assert_eq!( + sorted, + vec![rows[1].clone(), rows[2].clone(), rows[0].clone()] + ); + Ok(()) +} + /// Lock-in reference test: encode the worked-example row from `docs/specs/row-encoding.md` /// (one row with every supported encoding family, all columns ascending nulls-first) and /// assert the exact encoded bytes. 
This pins the byte layout so any accidental change to the