Skip to content

Commit b3b75e2

Browse files
Dandandan and claude committed
perf(parquet): vectorise dict-index bounds check in RleDecoder::get_batch_with_dict
Replace `idx_chunk.iter().all(|&i| (i as usize) < dict_len)` with a u32 max-reduction (`fold(0u32, |acc, &i| acc.max(i as u32))`). `.all` short-circuits and so blocks autovectorisation; on aarch64 the old form compiled to eight serialised `ldrsw` + `cmp` + `b.ls` pairs per 8-index chunk, followed by eight separate scalar gather loads. The max-reduction has no early exit, so LLVM now lowers the check to a single `ldp q1, q0` + `umax.4s` + `umaxv.4s` + one `cmp` + `b.ls`, then reuses the loaded NEON registers for the gather that follows. Negative `i32` values cast to `u32` become large, so the bounds check still rejects them. Also adds a small targeted bench (`parquet/benches/rle_dict.rs`) that exercises `get_batch_with_dict` directly. Measured on aarch64 (Apple Silicon) with both i32 and String dictionaries of size 16/256/1024: ~2–4% faster on most cases, within noise on a couple. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 89b1497 commit b3b75e2

3 files changed

Lines changed: 100 additions & 1 deletion

File tree

parquet/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,5 +289,10 @@ required-features = ["arrow"]
289289
name = "bloom_filter"
290290
harness = false
291291

292+
[[bench]]
293+
name = "rle_dict"
294+
required-features = ["experimental"]
295+
harness = false
296+
292297
[lib]
293298
bench = false

parquet/benches/rle_dict.rs

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Targeted benchmark for `RleDecoder::get_batch_with_dict`, exercising the
19+
//! bit-packed gather loop (the one with the dictionary bounds check).
20+
21+
use bytes::Bytes;
22+
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
23+
use parquet::encodings::rle::{RleDecoder, RleEncoder};
24+
use rand::{Rng, SeedableRng, rngs::StdRng};
25+
26+
fn encode_bit_packed(values: &[u32], bit_width: u8) -> Bytes {
27+
// Produce a pure bit-packed RLE stream (no RLE runs).
28+
let mut encoder = RleEncoder::new(bit_width, values.len() * (bit_width as usize).max(1));
29+
for &v in values {
30+
encoder.put(v as u64);
31+
}
32+
Bytes::from(encoder.consume())
33+
}
34+
35+
fn bench_get_batch_with_dict(c: &mut Criterion) {
36+
let mut group = c.benchmark_group("rle_dict/get_batch_with_dict");
37+
38+
// A handful of (dict_size, total_values) pairs. bit_width is
39+
// ceil(log2(dict_size)). The bench makes sure each input fits in one
40+
// bit-packed run so we stay in the hot loop.
41+
for (dict_size, bit_width) in [(16usize, 4u8), (256, 8), (1024, 10)] {
42+
let total: usize = 8192;
43+
44+
let mut rng = StdRng::seed_from_u64(42);
45+
let indices: Vec<u32> = (0..total)
46+
.map(|_| rng.random_range(0..dict_size as u32))
47+
.collect();
48+
let encoded = encode_bit_packed(&indices, bit_width);
49+
50+
// String dictionary of the given size.
51+
let dict: Vec<String> = (0..dict_size).map(|i| format!("value-{i}")).collect();
52+
53+
// Int32 dictionary as well (covers the primitive path).
54+
let int_dict: Vec<i32> = (0..dict_size as i32).collect();
55+
56+
group.throughput(Throughput::Elements(total as u64));
57+
58+
group.bench_function(format!("str/dict={dict_size}"), |b| {
59+
let mut out: Vec<String> = vec![String::new(); total];
60+
b.iter(|| {
61+
let mut decoder = RleDecoder::new(bit_width);
62+
decoder.set_data(encoded.clone()).unwrap();
63+
let n = decoder
64+
.get_batch_with_dict::<String>(&dict, &mut out, total)
65+
.unwrap();
66+
assert_eq!(n, total);
67+
});
68+
});
69+
70+
group.bench_function(format!("i32/dict={dict_size}"), |b| {
71+
let mut out: Vec<i32> = vec![0; total];
72+
b.iter(|| {
73+
let mut decoder = RleDecoder::new(bit_width);
74+
decoder.set_data(encoded.clone()).unwrap();
75+
let n = decoder
76+
.get_batch_with_dict::<i32>(&int_dict, &mut out, total)
77+
.unwrap();
78+
assert_eq!(n, total);
79+
});
80+
});
81+
}
82+
83+
group.finish();
84+
}
85+
86+
// Criterion entry points: register the benchmark group and generate `main`.
criterion_group!(benches, bench_get_batch_with_dict);
criterion_main!(benches);

parquet/src/encodings/rle.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -520,8 +520,15 @@ impl RleDecoder {
520520
let idx_chunks = idx.chunks_exact(8);
521521
for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) {
522522
let dict_len = dict.len();
523+
// u32 max-reduction instead of `.all(|&i| ..)`: `.all`
524+
// short-circuits and blocks autovectorisation. Negative
525+
// i32 cast to u32 becomes a large value so the bounds
526+
// check still rejects it.
527+
let max_idx = idx_chunk
528+
.iter()
529+
.fold(0u32, |acc, &i| acc.max(i as u32));
523530
assert!(
524-
idx_chunk.iter().all(|&i| (i as usize) < dict_len),
531+
(max_idx as usize) < dict_len,
525532
"dictionary index out of bounds"
526533
);
527534
for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) {

0 commit comments

Comments
 (0)