Skip to content

Commit b3b75e2

Browse files
Dandandan and claude committed
perf(parquet): vectorise dict-index bounds check in RleDecoder::get_batch_with_dict
Replace `idx_chunk.iter().all(|&i| (i as usize) < dict_len)` with a u32 max-reduction (`fold(0u32, |acc, &i| acc.max(i as u32))`). `.all` short-circuits and so blocks autovectorisation; on aarch64 the old form compiled to eight serialised `ldrsw` + `cmp` + `b.ls` pairs per 8-index chunk, followed by eight separate scalar gather loads. The max-reduction has no early exit, so LLVM now lowers the check to a single `ldp q1, q0` + `umax.4s` + `umaxv.4s` + one `cmp` + `b.ls`, then reuses the loaded NEON registers for the gather that follows. Negative `i32` values cast to `u32` become large, so the bounds check still rejects them. Also adds a small targeted bench (`parquet/benches/rle_dict.rs`) that exercises `get_batch_with_dict` directly. Measured on aarch64 (Apple Silicon) with both i32 and String dictionaries of size 16/256/1024: ~2–4% faster on most cases, within noise on a couple. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 89b1497 commit b3b75e2

3 files changed

Lines changed: 100 additions & 1 deletion

File tree

parquet/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,5 +289,10 @@ required-features = ["arrow"]
289289
name = "bloom_filter"
290290
harness = false
291291

292+
[[bench]]
293+
name = "rle_dict"
294+
required-features = ["experimental"]
295+
harness = false
296+
292297
[lib]
293298
bench = false

parquet/benches/rle_dict.rs

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Targeted benchmark for `RleDecoder::get_batch_with_dict`, exercising the
19+
//! bit-packed gather loop (the one with the dictionary bounds check).
20+
21+
use bytes::Bytes;
22+
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
23+
use parquet::encodings::rle::{RleDecoder, RleEncoder};
24+
use rand::{Rng, SeedableRng, rngs::StdRng};
25+
26+
fn encode_bit_packed(values: &[u32], bit_width: u8) -> Bytes {
27+
// Produce a pure bit-packed RLE stream (no RLE runs).
28+
let mut encoder = RleEncoder::new(bit_width, values.len() * (bit_width as usize).max(1));
29+
for &v in values {
30+
encoder.put(v as u64);
31+
}
32+
Bytes::from(encoder.consume())
33+
}
34+
35+
fn bench_get_batch_with_dict(c: &mut Criterion) {
36+
let mut group = c.benchmark_group("rle_dict/get_batch_with_dict");
37+
38+
// A handful of (dict_size, total_values) pairs. bit_width is
39+
// ceil(log2(dict_size)). The bench makes sure each input fits in one
40+
// bit-packed run so we stay in the hot loop.
41+
for (dict_size, bit_width) in [(16usize, 4u8), (256, 8), (1024, 10)] {
42+
let total: usize = 8192;
43+
44+
let mut rng = StdRng::seed_from_u64(42);
45+
let indices: Vec<u32> = (0..total)
46+
.map(|_| rng.random_range(0..dict_size as u32))
47+
.collect();
48+
let encoded = encode_bit_packed(&indices, bit_width);
49+
50+
// String dictionary of the given size.
51+
let dict: Vec<String> = (0..dict_size).map(|i| format!("value-{i}")).collect();
52+
53+
// Int32 dictionary as well (covers the primitive path).
54+
let int_dict: Vec<i32> = (0..dict_size as i32).collect();
55+
56+
group.throughput(Throughput::Elements(total as u64));
57+
58+
group.bench_function(format!("str/dict={dict_size}"), |b| {
59+
let mut out: Vec<String> = vec![String::new(); total];
60+
b.iter(|| {
61+
let mut decoder = RleDecoder::new(bit_width);
62+
decoder.set_data(encoded.clone()).unwrap();
63+
let n = decoder
64+
.get_batch_with_dict::<String>(&dict, &mut out, total)
65+
.unwrap();
66+
assert_eq!(n, total);
67+
});
68+
});
69+
70+
group.bench_function(format!("i32/dict={dict_size}"), |b| {
71+
let mut out: Vec<i32> = vec![0; total];
72+
b.iter(|| {
73+
let mut decoder = RleDecoder::new(bit_width);
74+
decoder.set_data(encoded.clone()).unwrap();
75+
let n = decoder
76+
.get_batch_with_dict::<i32>(&int_dict, &mut out, total)
77+
.unwrap();
78+
assert_eq!(n, total);
79+
});
80+
});
81+
}
82+
83+
group.finish();
84+
}
85+
86+
// Criterion entry points: register the benchmark group and generate `main`.
criterion_group!(benches, bench_get_batch_with_dict);
criterion_main!(benches);

parquet/src/encodings/rle.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -520,8 +520,15 @@ impl RleDecoder {
520520
let idx_chunks = idx.chunks_exact(8);
521521
for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) {
522522
let dict_len = dict.len();
523+
// u32 max-reduction instead of `.all(|&i| ..)`: `.all`
524+
// short-circuits and blocks autovectorisation. Negative
525+
// i32 cast to u32 becomes a large value so the bounds
526+
// check still rejects it.
527+
let max_idx = idx_chunk
528+
.iter()
529+
.fold(0u32, |acc, &i| acc.max(i as u32));
523530
assert!(
524-
idx_chunk.iter().all(|&i| (i as usize) < dict_len),
531+
(max_idx as usize) < dict_len,
525532
"dictionary index out of bounds"
526533
);
527534
for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) {

0 commit comments

Comments
 (0)