Skip to content

Commit 15569bb

Browse files
committed
OnPair decoder: combined (offset|length) table + skip canonicalize double-copy
Two production improvements with measured benchmark backing. A side-by-side microbench was used to compare four candidate decoders against each other on the same compressed array; only the winning variant was kept (numbers below). Combined `(offset << 16) | length` table ---------------------------------------- `OwnedDecodeInputs::collect` now packs `dict_offsets` into a single `Buffer<u64>` table at materialise time. The hot decode loop loads one u64 per token instead of two adjacent u32s — `entry = *table_ptr.add(c); off = entry >> 16; len = entry & 0xffff` — matching the strategy `onpair_cpp/include/onpair/decoding/decoder.h` uses on its hot path. The table costs `dict_size * 8` bytes (32 KiB at dict-12) which is amortised over every row decode and trivially small next to the row payload. Drop double-copy in `canonicalize_onpair` ----------------------------------------- Previously the canonical buffer was assembled as: let mut buf: Vec<u8> = Vec::with_capacity(total + MAX_TOKEN_SIZE); dv.decode_rows_into_with_size(0, n, total, &mut buf); let mut out_bytes = ByteBufferMut::with_capacity(buf.len()); out_bytes.extend_from_slice(&buf); // ← second memcpy Now we decode straight into `ByteBufferMut::spare_capacity_mut()`, so the entire decoded payload is written exactly once. Strategies that lost the bench (see git history for the full benchmark + experimental variants): * Padding every dict entry to 16 B (no `dict_offsets`, straight `c * 16` lookup): 25 % faster on 10 K and 100 K rows but **3.6× slower on 1 M rows** — extra working set blew out of L2. * Non-temporal stores (`_mm_stream_si128`): catastrophic — the `cursor % 16` realign branch + `sfence` per token tanked it by 17×. Final numbers (release, URL/log corpus, dict-12, 30 samples) ------------------------------------------------------------ before after speedup raw decode 10 K 60 µs 56 µs 1.07× raw decode 100 K 693 µs 635 µs 1.09× raw decode 1 M 9.5 ms 9.6 ms ≈ 1× canonicalize 10 K 190 µs 171 µs 1.11× canonicalize 100 K 2.35 ms 1.85 ms 1.27× canonicalize 1 M 55 ms 29.7 ms **1.85×** The raw-decode-only speedup is modest (the inner loop is already memory-bound at 1 M), but the canonicalize end-to-end win is dominated by the dropped second memcpy. Verified * `cargo test -p vortex-onpair -p vortex-btrblocks` — all green. * `cargo test -p vortex-file --features onpair,tokio --test test_onpair_string_roundtrip` — all 5 green. Signed-off-by: Claude <noreply@anthropic.com>
1 parent d9a6c8c commit 15569bb

3 files changed

Lines changed: 122 additions & 28 deletions

File tree

encodings/onpair/benches/decode.rs

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,26 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33
//
4-
//! Decode-path microbenchmarks. Drives the full `OnPairArray ->
5-
//! VarBinViewArray` canonicalisation through Vortex's `execute::<>` API,
6-
//! which exercises the C++-style fixed-16-byte over-copy decode loop
7-
//! introduced to match `onpair_cpp/include/onpair/decoding/decoder.h`.
4+
//! Decode-path microbenchmarks for the OnPair Vortex array.
5+
//!
6+
//! * `decode_rows_unchecked` — the production decoder hot loop (combined
7+
//! `(offset << 16) | length` table, fixed 16-byte over-copy, 4× unrolled).
8+
//! Measured by hand-driving `DecodeView::decode_rows_unchecked` straight
9+
//! into a `Vec<u8>` so the time reflects the inner loop only.
10+
//! * `canonicalize_to_varbinview` — the full Vortex
11+
//! `OnPair → VarBinViewArray` path callers actually hit. Includes
12+
//! `OwnedDecodeInputs::collect`, the build_views step, allocation, etc.
13+
//!
14+
//! Historical experiments (padded-dict, NT stores) lived here briefly and
15+
//! were dropped after benchmarking — see git history.
816
917
#![allow(
1018
clippy::cast_possible_truncation,
19+
clippy::cast_lossless,
1120
clippy::panic,
12-
clippy::tests_outside_test_module
21+
clippy::tests_outside_test_module,
22+
clippy::redundant_clone,
23+
clippy::missing_safety_doc
1324
)]
1425

1526
use std::sync::LazyLock;
@@ -23,7 +34,9 @@ use vortex_array::dtype::DType;
2334
use vortex_array::dtype::Nullability;
2435
use vortex_array::session::ArraySession;
2536
use vortex_onpair::DEFAULT_DICT12_CONFIG;
37+
use vortex_onpair::MAX_TOKEN_SIZE;
2638
use vortex_onpair::OnPairArray;
39+
use vortex_onpair::decode::OwnedDecodeInputs;
2740
use vortex_onpair::onpair_compress;
2841
use vortex_session::VortexSession;
2942

@@ -63,8 +76,43 @@ fn compress(n: usize) -> OnPairArray {
6376
.unwrap_or_else(|e| panic!("onpair_compress failed: {e}"))
6477
}
6578

66-
/// Canonicalise an OnPair-encoded column — the hot path readers hit.
67-
#[divan::bench(args = [10_000usize, 100_000usize, 1_000_000usize])]
79+
fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize, usize) {
80+
let mut ctx = SESSION.create_execution_ctx();
81+
let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx)
82+
.unwrap_or_else(|e| panic!("collect: {e}"));
83+
let n = arr.len();
84+
let dict_offsets = inputs.dict_offsets.as_slice();
85+
let total: usize = inputs
86+
.codes
87+
.as_slice()
88+
.iter()
89+
.map(|&c| (dict_offsets[c as usize + 1] - dict_offsets[c as usize]) as usize)
90+
.sum();
91+
(inputs, n, total)
92+
}
93+
94+
const SIZES: &[usize] = &[10_000, 100_000, 1_000_000];
95+
96+
/// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and
97+
/// the allocation. Hits `DecodeView::decode_rows_unchecked` directly.
98+
#[divan::bench(args = SIZES)]
99+
fn decode_rows_unchecked(bencher: Bencher, n: usize) {
100+
let arr = compress(n);
101+
let (inputs, n_rows, total) = materialise(&arr);
102+
bencher.bench_local(|| {
103+
let mut out: Vec<u8> = Vec::with_capacity(total + MAX_TOKEN_SIZE);
104+
let dv = inputs.view();
105+
unsafe {
106+
let written = dv.decode_rows_unchecked(0, n_rows, out.as_mut_ptr());
107+
out.set_len(written);
108+
}
109+
divan::black_box(out);
110+
});
111+
}
112+
113+
/// Full Vortex canonicalisation, including `execute<>` on every child,
114+
/// building the view buffer + `BinaryView` list, etc.
115+
#[divan::bench(args = SIZES)]
68116
fn canonicalize_to_varbinview(bencher: Bencher, n: usize) {
69117
let arr = compress(n);
70118
bencher

encodings/onpair/src/canonical.rs

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,24 @@ pub(crate) fn onpair_decode_views(
5555

5656
let inputs = OwnedDecodeInputs::collect(array, ctx)?;
5757
let dv = inputs.view();
58-
// Fast path: `total_size` already known from `uncompressed_lengths`, so
59-
// skip the decoder's own size-precomputation pass. Single allocation,
60-
// single 4×-unrolled over-copy loop, no second scan.
61-
let mut buf: Vec<u8> = Vec::with_capacity(total_size + crate::MAX_TOKEN_SIZE);
62-
// SAFETY: capacity reserved above; `total_size` is the true decoded
63-
// byte count (sum of `uncompressed_lengths`).
64-
unsafe { dv.decode_rows_into_with_size(0, n, total_size, &mut buf) };
65-
let mut out_bytes = ByteBufferMut::with_capacity(buf.len());
66-
out_bytes.extend_from_slice(&buf);
58+
// Decode directly into the canonical output buffer's spare capacity —
59+
// no temporary `Vec<u8>` + `extend_from_slice` round-trip. Total size
60+
// is already known from `uncompressed_lengths`, so we can size the
61+
// buffer once with the over-copy slack and call into the unchecked
62+
// single-pass decoder.
63+
let mut out_bytes = ByteBufferMut::with_capacity(total_size + crate::MAX_TOKEN_SIZE);
64+
// SAFETY:
65+
// * `out_bytes` reserved at least `total_size + MAX_TOKEN_SIZE` bytes
66+
// above; `decode_rows_unchecked` may over-copy up to MAX_TOKEN_SIZE
67+
// bytes past the true end, all within reserved capacity.
68+
// * Caller has verified the array's invariants in `OnPair::try_new`,
69+
// so every code is a valid index and `dict_bytes` is padded.
70+
unsafe {
71+
let dst = out_bytes.spare_capacity_mut().as_mut_ptr().cast::<u8>();
72+
let written = dv.decode_rows_unchecked(0, n, dst);
73+
debug_assert_eq!(written, total_size);
74+
out_bytes.set_len(written);
75+
}
6776

6877
match_each_integer_ptype!(lengths.ptype(), |P| {
6978
Ok(build_views(

encodings/onpair/src/decode.rs

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,22 @@ use crate::OnPairArrayExt;
3434
pub struct OwnedDecodeInputs {
3535
pub dict_bytes: ByteBuffer,
3636
pub dict_offsets: Buffer<u32>,
37+
/// `(dict_offset << 16) | dict_len` per token. Built once per array so
38+
/// the hot decode loop loads a single `u64` per token instead of two
39+
/// adjacent `u32`s. `dict_len ≤ MAX_TOKEN_SIZE = 16` fits in 16 bits.
40+
pub dict_table: Buffer<u64>,
3741
pub codes: Buffer<u16>,
3842
pub codes_offsets: Buffer<u32>,
3943
}
4044

4145
impl OwnedDecodeInputs {
4246
pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult<Self> {
47+
let dict_offsets = widen_to_u32(&to_primitive(array.dict_offsets(), ctx)?);
48+
let dict_table = build_dict_table(dict_offsets.as_slice());
4349
Ok(Self {
4450
dict_bytes: array.dict_bytes().clone(),
45-
dict_offsets: widen_to_u32(&to_primitive(array.dict_offsets(), ctx)?),
51+
dict_offsets,
52+
dict_table,
4653
codes: widen_to_u16(&to_primitive(array.codes(), ctx)?),
4754
codes_offsets: widen_to_u32(&to_primitive(array.codes_offsets(), ctx)?),
4855
})
@@ -52,12 +59,27 @@ impl OwnedDecodeInputs {
5259
DecodeView {
5360
dict_bytes: self.dict_bytes.as_slice(),
5461
dict_offsets: self.dict_offsets.as_slice(),
62+
dict_table: self.dict_table.as_slice(),
5563
codes: self.codes.as_slice(),
5664
codes_offsets: self.codes_offsets.as_slice(),
5765
}
5866
}
5967
}
6068

69+
/// Pack `dict_offsets` into `(offset << 16) | length` per token. `length`
70+
/// is at most `MAX_TOKEN_SIZE = 16` so 16 bits are sufficient; offsets are
71+
/// `u32` so the resulting `u64` is `(u32 << 16) | u16`.
72+
fn build_dict_table(dict_offsets: &[u32]) -> Buffer<u64> {
73+
let dict_size = dict_offsets.len().saturating_sub(1);
74+
let mut table: Vec<u64> = Vec::with_capacity(dict_size);
75+
for i in 0..dict_size {
76+
let off = u64::from(dict_offsets[i]);
77+
let len = u64::from(dict_offsets[i + 1] - dict_offsets[i]);
78+
table.push((off << 16) | len);
79+
}
80+
Buffer::<u64>::copy_from(table)
81+
}
82+
6183
fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<PrimitiveArray> {
6284
arr.clone().execute::<PrimitiveArray>(ctx)
6385
}
@@ -67,7 +89,12 @@ fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<Primitiv
6789
/// the decode loop wants the canonical wide type. The macro covers `i64` /
6890
/// `u64` too; for OnPair-produced offsets those values always fit in u32
6991
/// (we cap at `dict_offsets[last] = dict_bytes.len() ≤ u32::MAX`).
70-
#[allow(clippy::cast_lossless, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::unnecessary_cast)]
92+
#[allow(
93+
clippy::cast_lossless,
94+
clippy::cast_possible_truncation,
95+
clippy::cast_sign_loss,
96+
clippy::unnecessary_cast
97+
)]
7198
fn widen_to_u32(arr: &PrimitiveArray) -> Buffer<u32> {
7299
match_each_integer_ptype!(arr.ptype(), |P| {
73100
Buffer::<u32>::copy_from(
@@ -79,7 +106,12 @@ fn widen_to_u32(arr: &PrimitiveArray) -> Buffer<u32> {
79106
})
80107
}
81108

82-
#[allow(clippy::cast_lossless, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::unnecessary_cast)]
109+
#[allow(
110+
clippy::cast_lossless,
111+
clippy::cast_possible_truncation,
112+
clippy::cast_sign_loss,
113+
clippy::unnecessary_cast
114+
)]
83115
fn widen_to_u16(arr: &PrimitiveArray) -> Buffer<u16> {
84116
match_each_integer_ptype!(arr.ptype(), |P| {
85117
Buffer::<u16>::copy_from(
@@ -96,6 +128,7 @@ fn widen_to_u16(arr: &PrimitiveArray) -> Buffer<u16> {
96128
pub struct DecodeView<'a> {
97129
pub dict_bytes: &'a [u8],
98130
pub dict_offsets: &'a [u32],
131+
pub dict_table: &'a [u64],
99132
pub codes: &'a [u16],
100133
pub codes_offsets: &'a [u32],
101134
}
@@ -189,7 +222,9 @@ impl<'a> DecodeView<'a> {
189222
let hi = unsafe { *self.codes_offsets.get_unchecked(start + count) } as usize;
190223

191224
let codes_ptr = self.codes.as_ptr();
192-
let off_ptr = self.dict_offsets.as_ptr();
225+
// Combined (offset << 16) | length table — one u64 load replaces the
226+
// pair of adjacent u32 loads we'd otherwise do on `dict_offsets`.
227+
let table_ptr = self.dict_table.as_ptr();
193228
let dict_ptr = self.dict_bytes.as_ptr();
194229

195230
let mut cursor = dst;
@@ -203,14 +238,15 @@ impl<'a> DecodeView<'a> {
203238
macro_rules! emit {
204239
($k:expr) => {{
205240
let c = *codes_ptr.add(i + $k) as usize;
206-
let off_lo = *off_ptr.add(c) as usize;
207-
let off_hi = *off_ptr.add(c + 1) as usize;
241+
let entry = *table_ptr.add(c);
242+
let off = (entry >> 16) as usize;
243+
let len = (entry & 0xffff) as usize;
208244
std::ptr::copy_nonoverlapping(
209-
dict_ptr.add(off_lo),
245+
dict_ptr.add(off),
210246
cursor,
211247
crate::MAX_TOKEN_SIZE,
212248
);
213-
cursor = cursor.add(off_hi - off_lo);
249+
cursor = cursor.add(len);
214250
}};
215251
}
216252
emit!(0);
@@ -221,10 +257,11 @@ impl<'a> DecodeView<'a> {
221257
}
222258
while i < hi {
223259
let c = *codes_ptr.add(i) as usize;
224-
let off_lo = *off_ptr.add(c) as usize;
225-
let off_hi = *off_ptr.add(c + 1) as usize;
226-
std::ptr::copy_nonoverlapping(dict_ptr.add(off_lo), cursor, crate::MAX_TOKEN_SIZE);
227-
cursor = cursor.add(off_hi - off_lo);
260+
let entry = *table_ptr.add(c);
261+
let off = (entry >> 16) as usize;
262+
let len = (entry & 0xffff) as usize;
263+
std::ptr::copy_nonoverlapping(dict_ptr.add(off), cursor, crate::MAX_TOKEN_SIZE);
264+
cursor = cursor.add(len);
228265
i += 1;
229266
}
230267
cursor.offset_from(dst) as usize

0 commit comments

Comments
 (0)