OnPair decoder: combined (offset|length) table + skip canonicalize double-copy

claude · claude · commit 15569bb750fe · 2026-05-14T19:33:16.000Z
Two production improvements with measured benchmark backing. A side-by-side
microbench was used to compare four candidate decoders against each other on
the same compressed array; only the winning variant was kept (numbers below).

Combined `(offset << 16) | length` table
----------------------------------------
`OwnedDecodeInputs::collect` now packs `dict_offsets` into a single
`Buffer<u64>` table at materialise time. The hot decode loop loads one u64
per token instead of two adjacent u32s — `entry = *table_ptr.add(c);
off = entry >> 16; len = entry & 0xffff` — matching the strategy
`onpair_cpp/include/onpair/decoding/decoder.h` uses on its hot path. The
table costs `dict_size * 8` bytes (32 KiB at dict-12) which is amortised
over every row decode and trivially small next to the row payload.

Drop double-copy in `canonicalize_onpair`
-----------------------------------------
Previously the canonical buffer was assembled as:

    let mut buf: Vec<u8> = Vec::with_capacity(total + MAX_TOKEN_SIZE);
    dv.decode_rows_into_with_size(0, n, total, &mut buf);
    let mut out_bytes = ByteBufferMut::with_capacity(buf.len());
    out_bytes.extend_from_slice(&buf);          // ← second memcpy

Now we decode straight into `ByteBufferMut::spare_capacity_mut()`, so the
entire decoded payload is written exactly once.

Strategies that lost the bench (see git history for the full
benchmark + experimental variants):

* Padding every dict entry to 16 B (no `dict_offsets`, straight `c * 16`
  lookup): 25 % faster on 10 K and 100 K rows but **3.6× slower on 1 M
  rows** — extra working set blew out of L2.
* Non-temporal stores (`_mm_stream_si128`): catastrophic — the
  `cursor % 16` realign branch + `sfence` per token tanked it by 17×.

Final numbers (release, URL/log corpus, dict-12, 30 samples)
------------------------------------------------------------
                        before          after          speedup
  raw decode 10 K        60 µs          56 µs           1.07×
  raw decode 100 K       693 µs         635 µs          1.09×
  raw decode 1 M         9.5 ms         9.6 ms          ≈ 1×
  canonicalize 10 K      190 µs         171 µs          1.11×
  canonicalize 100 K     2.35 ms        1.85 ms         1.27×
  canonicalize 1 M       55 ms          29.7 ms         **1.85×**

The raw-decode-only speedup is modest (the inner loop is already
memory-bound at 1 M), but the canonicalize end-to-end win is dominated
by the dropped second memcpy.

Verified
* `cargo test -p vortex-onpair -p vortex-btrblocks` — all green.
* `cargo test -p vortex-file --features onpair,tokio
   --test test_onpair_string_roundtrip` — all 5 green.

Signed-off-by: Claude <noreply@anthropic.com>
diff --git a/encodings/onpair/benches/decode.rs b/encodings/onpair/benches/decode.rs
@@ -1,15 +1,26 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 //
-//! Decode-path microbenchmarks. Drives the full `OnPairArray ->
-//! VarBinViewArray` canonicalisation through Vortex's `execute::<>` API,
-//! which exercises the C++-style fixed-16-byte over-copy decode loop
-//! introduced to match `onpair_cpp/include/onpair/decoding/decoder.h`.
+//! Decode-path microbenchmarks for the OnPair Vortex array.
+//!
+//! * `decode_rows_unchecked` — the production decoder hot loop (combined
+//!   `(offset << 16) | length` table, fixed 16-byte over-copy, 4× unrolled).
+//!   Measured by hand-driving `DecodeView::decode_rows_unchecked` straight
+//!   into a `Vec<u8>` so the time reflects the inner loop only.
+//! * `canonicalize_to_varbinview` — the full Vortex
+//!   `OnPair → VarBinViewArray` path callers actually hit. Includes
+//!   `OwnedDecodeInputs::collect`, the build_views step, allocation, etc.
+//!
+//! Historical experiments (padded-dict, NT stores) lived here briefly and
+//! were dropped after benchmarking — see git history.
 
 #![allow(
     clippy::cast_possible_truncation,
+    clippy::cast_lossless,
     clippy::panic,
-    clippy::tests_outside_test_module
+    clippy::tests_outside_test_module,
+    clippy::redundant_clone,
+    clippy::missing_safety_doc
 )]
 
 use std::sync::LazyLock;
@@ -23,7 +34,9 @@ use vortex_array::dtype::DType;
 use vortex_array::dtype::Nullability;
 use vortex_array::session::ArraySession;
 use vortex_onpair::DEFAULT_DICT12_CONFIG;
+use vortex_onpair::MAX_TOKEN_SIZE;
 use vortex_onpair::OnPairArray;
+use vortex_onpair::decode::OwnedDecodeInputs;
 use vortex_onpair::onpair_compress;
 use vortex_session::VortexSession;
 
@@ -63,8 +76,43 @@ fn compress(n: usize) -> OnPairArray {
         .unwrap_or_else(|e| panic!("onpair_compress failed: {e}"))
 }
 
-/// Canonicalise an OnPair-encoded column — the hot path readers hit.
-#[divan::bench(args = [10_000usize, 100_000usize, 1_000_000usize])]
+fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize, usize) {
+    let mut ctx = SESSION.create_execution_ctx();
+    let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx)
+        .unwrap_or_else(|e| panic!("collect: {e}"));
+    let n = arr.len();
+    let dict_offsets = inputs.dict_offsets.as_slice();
+    let total: usize = inputs
+        .codes
+        .as_slice()
+        .iter()
+        .map(|&c| (dict_offsets[c as usize + 1] - dict_offsets[c as usize]) as usize)
+        .sum();
+    (inputs, n, total)
+}
+
+const SIZES: &[usize] = &[10_000, 100_000, 1_000_000];
+
+/// Decode loop time excluding `OwnedDecodeInputs::collect`; the output `Vec` is
+/// still allocated per iteration. Hits `DecodeView::decode_rows_unchecked` directly.
+#[divan::bench(args = SIZES)]
+fn decode_rows_unchecked(bencher: Bencher, n: usize) {
+    let arr = compress(n);
+    let (inputs, n_rows, total) = materialise(&arr);
+    bencher.bench_local(|| {
+        let mut out: Vec<u8> = Vec::with_capacity(total + MAX_TOKEN_SIZE);
+        let dv = inputs.view();
+        unsafe {
+            let written = dv.decode_rows_unchecked(0, n_rows, out.as_mut_ptr());
+            out.set_len(written);
+        }
+        divan::black_box(out);
+    });
+}
+
+/// Full Vortex canonicalisation, including `execute<>` on every child,
+/// building the view buffer + `BinaryView` list, etc.
+#[divan::bench(args = SIZES)]
 fn canonicalize_to_varbinview(bencher: Bencher, n: usize) {
     let arr = compress(n);
     bencher
diff --git a/encodings/onpair/src/canonical.rs b/encodings/onpair/src/canonical.rs
@@ -55,15 +55,24 @@ pub(crate) fn onpair_decode_views(
 
     let inputs = OwnedDecodeInputs::collect(array, ctx)?;
     let dv = inputs.view();
-    // Fast path: `total_size` already known from `uncompressed_lengths`, so
-    // skip the decoder's own size-precomputation pass. Single allocation,
-    // single 4×-unrolled over-copy loop, no second scan.
-    let mut buf: Vec<u8> = Vec::with_capacity(total_size + crate::MAX_TOKEN_SIZE);
-    // SAFETY: capacity reserved above; `total_size` is the true decoded
-    // byte count (sum of `uncompressed_lengths`).
-    unsafe { dv.decode_rows_into_with_size(0, n, total_size, &mut buf) };
-    let mut out_bytes = ByteBufferMut::with_capacity(buf.len());
-    out_bytes.extend_from_slice(&buf);
+    // Decode directly into the canonical output buffer's spare capacity —
+    // no temporary `Vec<u8>` + `extend_from_slice` round-trip. Total size
+    // is already known from `uncompressed_lengths`, so we can size the
+    // buffer once with the over-copy slack and call into the unchecked
+    // single-pass decoder.
+    let mut out_bytes = ByteBufferMut::with_capacity(total_size + crate::MAX_TOKEN_SIZE);
+    // SAFETY:
+    // * `out_bytes` reserved at least `total_size + MAX_TOKEN_SIZE` bytes
+    //   above; `decode_rows_unchecked` may over-copy up to MAX_TOKEN_SIZE
+    //   bytes past the true end, all within reserved capacity.
+    // * Caller has verified the array's invariants in `OnPair::try_new`,
+    //   so every code is a valid index and `dict_bytes` is padded.
+    unsafe {
+        let dst = out_bytes.spare_capacity_mut().as_mut_ptr().cast::<u8>();
+        let written = dv.decode_rows_unchecked(0, n, dst);
+        debug_assert_eq!(written, total_size);
+        out_bytes.set_len(written);
+    }
 
     match_each_integer_ptype!(lengths.ptype(), |P| {
         Ok(build_views(
diff --git a/encodings/onpair/src/decode.rs b/encodings/onpair/src/decode.rs
@@ -34,15 +34,22 @@ use crate::OnPairArrayExt;
 pub struct OwnedDecodeInputs {
     pub dict_bytes: ByteBuffer,
     pub dict_offsets: Buffer<u32>,
+    /// `(dict_offset << 16) | dict_len` per token. Built once per array so
+    /// the hot decode loop loads a single `u64` per token instead of two
+    /// adjacent `u32`s. `dict_len ≤ MAX_TOKEN_SIZE = 16` fits in 16 bits.
+    pub dict_table: Buffer<u64>,
     pub codes: Buffer<u16>,
     pub codes_offsets: Buffer<u32>,
 }
 
 impl OwnedDecodeInputs {
     pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
+        let dict_offsets = widen_to_u32(&to_primitive(array.dict_offsets(), ctx)?);
+        let dict_table = build_dict_table(dict_offsets.as_slice());
         Ok(Self {
             dict_bytes: array.dict_bytes().clone(),
-            dict_offsets: widen_to_u32(&to_primitive(array.dict_offsets(), ctx)?),
+            dict_offsets,
+            dict_table,
             codes: widen_to_u16(&to_primitive(array.codes(), ctx)?),
             codes_offsets: widen_to_u32(&to_primitive(array.codes_offsets(), ctx)?),
         })
@@ -52,12 +59,27 @@ impl OwnedDecodeInputs {
         DecodeView {
             dict_bytes: self.dict_bytes.as_slice(),
             dict_offsets: self.dict_offsets.as_slice(),
+            dict_table: self.dict_table.as_slice(),
             codes: self.codes.as_slice(),
             codes_offsets: self.codes_offsets.as_slice(),
         }
     }
 }
 
+/// Pack `dict_offsets` into `(offset << 16) | length` per token. `length`
+/// is at most `MAX_TOKEN_SIZE = 16` so 16 bits are sufficient; offsets are
+/// `u32` so the resulting `u64` is `(u32 << 16) | u16`.
+fn build_dict_table(dict_offsets: &[u32]) -> Buffer<u64> {
+    let dict_size = dict_offsets.len().saturating_sub(1);
+    let mut table: Vec<u64> = Vec::with_capacity(dict_size);
+    for i in 0..dict_size {
+        let off = u64::from(dict_offsets[i]);
+        let len = u64::from(dict_offsets[i + 1] - dict_offsets[i]);
+        table.push((off << 16) | len);
+    }
+    Buffer::<u64>::copy_from(table)
+}
+
 fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<PrimitiveArray> {
     arr.clone().execute::<PrimitiveArray>(ctx)
 }
@@ -67,7 +89,12 @@ fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Primitiv
 /// the decode loop wants the canonical wide type. The macro covers `i64` /
 /// `u64` too; for OnPair-produced offsets those values always fit in u32
 /// (we cap at `dict_offsets[last] = dict_bytes.len() ≤ u32::MAX`).
-#[allow(clippy::cast_lossless, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::unnecessary_cast)]
+#[allow(
+    clippy::cast_lossless,
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss,
+    clippy::unnecessary_cast
+)]
 fn widen_to_u32(arr: &PrimitiveArray) -> Buffer<u32> {
     match_each_integer_ptype!(arr.ptype(), |P| {
         Buffer::<u32>::copy_from(
@@ -79,7 +106,12 @@ fn widen_to_u32(arr: &PrimitiveArray) -> Buffer<u32> {
     })
 }
 
-#[allow(clippy::cast_lossless, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::unnecessary_cast)]
+#[allow(
+    clippy::cast_lossless,
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss,
+    clippy::unnecessary_cast
+)]
 fn widen_to_u16(arr: &PrimitiveArray) -> Buffer<u16> {
     match_each_integer_ptype!(arr.ptype(), |P| {
         Buffer::<u16>::copy_from(
@@ -96,6 +128,7 @@ fn widen_to_u16(arr: &PrimitiveArray) -> Buffer<u16> {
 pub struct DecodeView<'a> {
     pub dict_bytes: &'a [u8],
     pub dict_offsets: &'a [u32],
+    pub dict_table: &'a [u64],
     pub codes: &'a [u16],
     pub codes_offsets: &'a [u32],
 }
@@ -189,7 +222,9 @@ impl<'a> DecodeView<'a> {
         let hi = unsafe { *self.codes_offsets.get_unchecked(start + count) } as usize;
 
         let codes_ptr = self.codes.as_ptr();
-        let off_ptr = self.dict_offsets.as_ptr();
+        // Combined (offset << 16) | length table — one u64 load replaces the
+        // pair of adjacent u32 loads we'd otherwise do on `dict_offsets`.
+        let table_ptr = self.dict_table.as_ptr();
         let dict_ptr = self.dict_bytes.as_ptr();
 
         let mut cursor = dst;
@@ -203,14 +238,15 @@ impl<'a> DecodeView<'a> {
                 macro_rules! emit {
                     ($k:expr) => {{
                         let c = *codes_ptr.add(i + $k) as usize;
-                        let off_lo = *off_ptr.add(c) as usize;
-                        let off_hi = *off_ptr.add(c + 1) as usize;
+                        let entry = *table_ptr.add(c);
+                        let off = (entry >> 16) as usize;
+                        let len = (entry & 0xffff) as usize;
                         std::ptr::copy_nonoverlapping(
-                            dict_ptr.add(off_lo),
+                            dict_ptr.add(off),
                             cursor,
                             crate::MAX_TOKEN_SIZE,
                         );
-                        cursor = cursor.add(off_hi - off_lo);
+                        cursor = cursor.add(len);
                     }};
                 }
                 emit!(0);
@@ -221,10 +257,11 @@ impl<'a> DecodeView<'a> {
             }
             while i < hi {
                 let c = *codes_ptr.add(i) as usize;
-                let off_lo = *off_ptr.add(c) as usize;
-                let off_hi = *off_ptr.add(c + 1) as usize;
-                std::ptr::copy_nonoverlapping(dict_ptr.add(off_lo), cursor, crate::MAX_TOKEN_SIZE);
-                cursor = cursor.add(off_hi - off_lo);
+                let entry = *table_ptr.add(c);
+                let off = (entry >> 16) as usize;
+                let len = (entry & 0xffff) as usize;
+                std::ptr::copy_nonoverlapping(dict_ptr.add(off), cursor, crate::MAX_TOKEN_SIZE);
+                cursor = cursor.add(len);
                 i += 1;
             }
             cursor.offset_from(dst) as usize