Skip to content

Commit adeda19

Browse files
committed
OnPair decoder: drop redundant dict_offsets widen + tighter hot path
Local-only follow-up to the combined-table decoder (15569bb). Four correctness-preserving micro-optimisations and some test/bench hygiene. Not pushed; user requested local-only review. 1. Drop `OwnedDecodeInputs::dict_offsets` — the decoder only needs the combined `(offset << 16) | length` `dict_table`, so `collect` no longer materialises a `Buffer<u32>` for the offsets at all. The table is built directly from whatever ptype the cascading compressor handed back via `match_each_integer_ptype!`. Saves one `dict_size`-element allocation per decode. 2. Single-allocation widen. `widen_to_{u16,u32}` now go through `BufferMut::with_capacity` + `push_unchecked` + `freeze` rather than `Vec → Buffer::copy_from`, halving allocator traffic. 3. Zero-copy widen fast path. When the cascading compressor did *not* narrow (the common case for small dicts / wide value ranges), the widen function refcount-bumps the underlying Arc via `PrimitiveArray::into_buffer::<u_N>()` instead of copying. 4. `for_each_dict_slice` + `decoded_len_rows` use `dict_table`. One `u64` load per token instead of two adjacent `u32` loads. 5. Tighter predicate kernels. `row_equals` / `row_starts_with` use raw slice pointer math on the needle/prefix after a single length check, instead of re-running bounds-checked subslicing on every iteration. Tests + bench * New `rstest`-parameterised `test_onpair_unroll_tail_boundaries` for `n ∈ {1, 2, 3, 4, 5, 7, 8, 9}` to stress the 4×-unrolled decode loop's scalar tail. Plus `test_onpair_empty`. * Bench sweeps four corpus shapes (URL/log, short, long, high-card) across two row counts, so a regression on any shape surfaces clearly. Benchmark (release, 30 samples, vs prior tip 15569bb) canonicalize UrlLog 100 K 1.85 ms → 1.42 ms (-23 %) canonicalize UrlLog 1 M 29.7 ms → 15.1 ms (-49 %) decode_rows UrlLog 1 M 9.6 ms → 4.6 ms (-52 %) Verified * `cargo test -p vortex-onpair` — 16/16 (was 7/7). * `cargo test -p vortex-btrblocks` — 35/35. * `cargo test -p vortex-file --features onpair,tokio --test test_onpair_string_roundtrip` — 5/5. * `cargo clippy -p vortex-onpair -p vortex-onpair-sys -p vortex-btrblocks --all-targets` — clean. Signed-off-by: Claude <noreply@anthropic.com>
1 parent 15569bb commit adeda19

5 files changed

Lines changed: 302 additions & 146 deletions

File tree

encodings/onpair/benches/decode.rs

Lines changed: 83 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
//! `OnPair → VarBinViewArray` path callers actually hit. Includes
1212
//! `OwnedDecodeInputs::collect`, the build_views step, allocation, etc.
1313
//!
14-
//! Historical experiments (padded-dict, NT stores) lived here briefly and
15-
//! were dropped after benchmarking — see git history.
14+
//! Each bench sweeps four corpus shapes against two row counts to surface
15+
//! cache-pressure cliffs and per-row decode cost.
1616
1717
#![allow(
1818
clippy::cast_possible_truncation,
@@ -43,31 +43,77 @@ use vortex_session::VortexSession;
4343
static SESSION: LazyLock<VortexSession> =
4444
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
4545

46-
fn corpus(n: usize) -> Vec<String> {
47-
let templates: &[&str] = &[
48-
"https://www.example.com/products/{id}",
49-
"https://cdn.example.com/img/{id}.webp",
50-
"https://api.example.com/v2/orders/{id}",
51-
"https://www.example.com/users/{id}/profile",
52-
"INFO request_id={id} status=200 method=GET",
53-
"WARN request_id={id} status=429 method=POST",
54-
"ERROR request_id={id} status=500 method=PUT",
55-
];
56-
let mut out = Vec::with_capacity(n);
46+
#[derive(Copy, Clone, Debug)]
47+
enum Shape {
48+
/// URL / HTTP-log shaped — high lexical overlap, ~35–45 bytes per row.
49+
UrlLog,
50+
/// Short uniform strings — 4–8 bytes per row, very low cardinality.
51+
Short,
52+
/// Long log-line shaped — ~120 bytes per row, more tokens per row.
53+
Long,
54+
/// High cardinality — every row unique.
55+
HighCard,
56+
}
57+
58+
fn corpus(n: usize, shape: Shape) -> Vec<String> {
5759
let mut state = 0x9e37_79b9_7f4a_7c15_u64;
58-
for _ in 0..n {
60+
let mut next = || {
5961
state = state
6062
.wrapping_mul(6364136223846793005)
6163
.wrapping_add(1442695040888963407);
62-
let pick = (state as usize) % templates.len();
63-
let id = state as u32;
64-
out.push(templates[pick].replace("{id}", &format!("{id:08x}")));
64+
state
65+
};
66+
let mut out = Vec::with_capacity(n);
67+
match shape {
68+
Shape::UrlLog => {
69+
let templates: &[&str] = &[
70+
"https://www.example.com/products/{id}",
71+
"https://cdn.example.com/img/{id}.webp",
72+
"https://api.example.com/v2/orders/{id}",
73+
"https://www.example.com/users/{id}/profile",
74+
"INFO request_id={id} status=200 method=GET",
75+
"WARN request_id={id} status=429 method=POST",
76+
"ERROR request_id={id} status=500 method=PUT",
77+
];
78+
for _ in 0..n {
79+
let s = next();
80+
let pick = (s as usize) % templates.len();
81+
let id = s as u32;
82+
out.push(templates[pick].replace("{id}", &format!("{id:08x}")));
83+
}
84+
}
85+
Shape::Short => {
86+
let templates: &[&str] =
87+
&["alpha", "beta", "gamma", "delta", "eps", "zeta", "eta"];
88+
for _ in 0..n {
89+
let s = next();
90+
out.push(templates[(s as usize) % templates.len()].to_string());
91+
}
92+
}
93+
Shape::Long => {
94+
let templates: &[&str] = &[
95+
"2026-05-14T12:34:56.789012Z INFO request_id={id} method=GET path=/api/v1/users/{id}/profile status=200",
96+
"2026-05-14T12:34:56.789012Z WARN request_id={id} method=POST path=/api/v1/users/{id}/sessions status=429",
97+
"2026-05-14T12:34:56.789012Z ERROR request_id={id} method=PUT path=/api/v1/users/{id}/settings status=500",
98+
];
99+
for _ in 0..n {
100+
let s = next();
101+
let pick = (s as usize) % templates.len();
102+
let id = s as u32;
103+
out.push(templates[pick].replace("{id}", &format!("{id:08x}")));
104+
}
105+
}
106+
Shape::HighCard => {
107+
for i in 0..n {
108+
out.push(format!("row-{i:010x}-{rand:016x}", rand = next()));
109+
}
110+
}
65111
}
66112
out
67113
}
68114

69-
fn compress(n: usize) -> OnPairArray {
70-
let strings = corpus(n);
115+
fn compress(n: usize, shape: Shape) -> OnPairArray {
116+
let strings = corpus(n, shape);
71117
let varbin = VarBinArray::from_iter(
72118
strings.iter().map(|s| Some(s.as_bytes())),
73119
DType::Utf8(Nullability::NonNullable),
@@ -81,23 +127,29 @@ fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize, usize) {
81127
let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx)
82128
.unwrap_or_else(|e| panic!("collect: {e}"));
83129
let n = arr.len();
84-
let dict_offsets = inputs.dict_offsets.as_slice();
85130
let total: usize = inputs
86131
.codes
87132
.as_slice()
88133
.iter()
89-
.map(|&c| (dict_offsets[c as usize + 1] - dict_offsets[c as usize]) as usize)
134+
.map(|&c| (inputs.dict_table.as_slice()[c as usize] & 0xffff) as usize)
90135
.sum();
91136
(inputs, n, total)
92137
}
93138

94-
const SIZES: &[usize] = &[10_000, 100_000, 1_000_000];
139+
const CASES: &[(Shape, usize)] = &[
140+
(Shape::UrlLog, 100_000),
141+
(Shape::UrlLog, 1_000_000),
142+
(Shape::Short, 100_000),
143+
(Shape::Long, 100_000),
144+
(Shape::HighCard, 100_000),
145+
];
95146

96-
/// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and
97-
/// the allocation. Hits `DecodeView::decode_rows_unchecked` directly.
98-
#[divan::bench(args = SIZES)]
99-
fn decode_rows_unchecked(bencher: Bencher, n: usize) {
100-
let arr = compress(n);
147+
/// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and the
148+
/// output allocation. Hits `DecodeView::decode_rows_unchecked` directly.
149+
#[divan::bench(args = CASES)]
150+
fn decode_rows_unchecked(bencher: Bencher, case: (Shape, usize)) {
151+
let (shape, n) = case;
152+
let arr = compress(n, shape);
101153
let (inputs, n_rows, total) = materialise(&arr);
102154
bencher.bench_local(|| {
103155
let mut out: Vec<u8> = Vec::with_capacity(total + MAX_TOKEN_SIZE);
@@ -112,9 +164,10 @@ fn decode_rows_unchecked(bencher: Bencher, n: usize) {
112164

113165
/// Full Vortex canonicalisation, including `execute<>` on every child,
114166
/// building the view buffer + `BinaryView` list, etc.
115-
#[divan::bench(args = SIZES)]
116-
fn canonicalize_to_varbinview(bencher: Bencher, n: usize) {
117-
let arr = compress(n);
167+
#[divan::bench(args = CASES)]
168+
fn canonicalize_to_varbinview(bencher: Bencher, case: (Shape, usize)) {
169+
let (shape, n) = case;
170+
let arr = compress(n, shape);
118171
bencher
119172
.with_inputs(|| arr.clone().into_array())
120173
.bench_local_values(|arr| {

encodings/onpair/src/compute/compare.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,27 @@ fn needle_bytes(scalar: &Scalar) -> Option<Vec<u8>> {
7171
/// True iff row `r` decodes to exactly `needle`.
7272
fn row_equals_needle(dv: &DecodeView<'_>, r: usize, needle: &[u8]) -> bool {
7373
let mut pos = 0usize;
74+
let n = needle.len();
75+
let needle_ptr = needle.as_ptr();
7476
let ok = dv.for_each_dict_slice(r, |slice| {
7577
let take = slice.len();
76-
if pos + take > needle.len() || &needle[pos..pos + take] != slice {
78+
// Fast-path: bail on length overflow first so we never compare a
79+
// partial slice that would walk past `needle`.
80+
if pos + take > n {
81+
return false;
82+
}
83+
// SAFETY: `pos + take <= n`, `take == slice.len()`. Compares
84+
// `needle[pos..pos+take]` with `slice` via raw `memcmp`-style
85+
// pointer math. The branch on length above is the only check.
86+
let eq = unsafe {
87+
let lhs = needle_ptr.add(pos);
88+
std::slice::from_raw_parts(lhs, take) == slice
89+
};
90+
if !eq {
7791
return false;
7892
}
7993
pos += take;
8094
true
8195
});
82-
ok && pos == needle.len()
96+
ok && pos == n
8397
}

encodings/onpair/src/compute/like.rs

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,15 +106,22 @@ impl LikeKernel for OnPair {
106106

107107
fn row_equals(dv: &DecodeView<'_>, r: usize, needle: &[u8]) -> bool {
108108
let mut pos = 0usize;
109+
let n = needle.len();
110+
let needle_ptr = needle.as_ptr();
109111
let ok = dv.for_each_dict_slice(r, |slice| {
110112
let take = slice.len();
111-
if pos + take > needle.len() || &needle[pos..pos + take] != slice {
113+
if pos + take > n {
114+
return false;
115+
}
116+
// SAFETY: `pos + take <= n`.
117+
let eq = unsafe { std::slice::from_raw_parts(needle_ptr.add(pos), take) == slice };
118+
if !eq {
112119
return false;
113120
}
114121
pos += take;
115122
true
116123
});
117-
ok && pos == needle.len()
124+
ok && pos == n
118125
}
119126

120127
fn row_starts_with(dv: &DecodeView<'_>, r: usize, prefix: &[u8]) -> bool {
@@ -123,14 +130,24 @@ fn row_starts_with(dv: &DecodeView<'_>, r: usize, prefix: &[u8]) -> bool {
123130
}
124131
let mut pos = 0usize;
125132
let mut matched = false;
133+
let plen = prefix.len();
134+
let prefix_ptr = prefix.as_ptr();
126135
dv.for_each_dict_slice(r, |slice| {
127-
let remaining = prefix.len() - pos;
136+
let remaining = plen - pos;
128137
let take = slice.len().min(remaining);
129-
if prefix[pos..pos + take] != slice[..take] {
138+
// SAFETY:
139+
// * `pos + take <= plen` because `take <= remaining`.
140+
// * `take <= slice.len()` by construction.
141+
let eq = unsafe {
142+
let lhs = std::slice::from_raw_parts(prefix_ptr.add(pos), take);
143+
let rhs = slice.get_unchecked(..take);
144+
lhs == rhs
145+
};
146+
if !eq {
130147
return false;
131148
}
132149
pos += take;
133-
if pos == prefix.len() {
150+
if pos == plen {
134151
matched = true;
135152
return false; // short-circuit, prefix satisfied
136153
}

0 commit comments

Comments
 (0)