perf(parquet): skip per-chunk vals_in_chunk computation when all values are non-null

adriangb · claude · adriangb · commit bb19d3eb5e52 · 2026-05-14T14:57:56.000-07:00
The chunker's per-chunk `partition_point` (arrow path) or
`LevelDataRef::value_count` (non-arrow path) returns `chunk_size` by
construction whenever the column has no nulls. The GKE bench showed
~+12–27% regressions on `list_primitive_non_null/*` and
`string_non_null/*` consistent with that walk dominating: ~50 K chunks
× a binary search through a 50 M-entry `non_null_indices` buffer means
cold cache reads on every chunk.

Compute a `ValueCountStrategy` once at `write_batch_internal` entry:

- `AllPresent` — set when the arrow caller passed
  `non_null_indices.len() == num_levels`, or when the column has
  `max_def_level == 0`. The chunker uses `chunk_size` directly with no
  per-chunk work.
- `Sorted(&[usize])` — arrow nullable path; binary-search the indices.
- `DefLevelScan(max_def)` — non-arrow nullable path; def-level scan.

For the bench's `list_primitive_non_null` (all-non-null lists with a
50 M-entry leaf), this drops the per-chunk binary search entirely;
expected to bring those rows back near noise.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/parquet/src/column/writer/byte_budget_chunker.rs b/parquet/src/column/writer/byte_budget_chunker.rs
@@ -38,6 +38,25 @@ use crate::column::writer::encoder::ColumnValueEncoder;
 use crate::file::properties::WriterProperties;
 use crate::schema::types::ColumnDescriptor;
 
+/// Strategy for counting how many values fall in a chunk's level range.
+/// Chosen once per `write_batch_internal` call (not once per chunk) so that
+/// `partition_point` and `LevelDataRef::value_count` are skipped whenever
+/// the count is already known to equal `chunk_size`.
+#[derive(Clone, Copy)]
+pub(crate) enum ValueCountStrategy<'a> {
+    /// Every level carries a value, so the count is always `chunk_size`.
+    /// Selected when `value_indices.len() == num_levels` (arrow path) or
+    /// when no indices were supplied and `max_def_level == 0`.
+    AllPresent,
+    /// Arrow path when the index count is not `num_levels`: one
+    /// `partition_point` per chunk over `value_indices` (assumed sorted).
+    Sorted(&'a [usize]),
+    /// Non-arrow path with `max_def_level != 0`: carries that max level,
+    /// which is handed to `LevelDataRef::value_count` for every chunk
+    /// because no `value_indices` were supplied.
+    DefLevelScan(i16),
+}
+
 /// Per-column-open chunker that picks byte-budget-aware mini-batch sizes.
 pub(crate) struct ByteBudgetChunker {
     /// Configured data page byte limit for the column.
@@ -48,9 +67,6 @@ pub(crate) struct ByteBudgetChunker {
     /// decision short-circuit with no work for every numeric, bool, or
     /// narrow `FIXED_LEN_BYTE_ARRAY` column.
     static_always_fits: bool,
-    /// Column's `max_def_level`, needed by `LevelDataRef::value_count` for
-    /// the non-arrow path where we don't have a sorted `non_null_indices`.
-    max_def_level: i16,
 }
 
 impl ByteBudgetChunker {
@@ -75,7 +91,29 @@ impl ByteBudgetChunker {
         Self {
             page_byte_limit,
             static_always_fits,
-            max_def_level: descr.max_def_level(),
+        }
+    }
+
+    /// Choose how `vals_in_chunk` is obtained for one `write_batch_internal`
+    /// call, from `descr.max_def_level()`, whether `value_indices` is
+    /// present, and how its length compares with `num_levels`.
+    #[inline]
+    pub(crate) fn value_count_strategy<'a>(
+        descr: &ColumnDescriptor,
+        value_indices: Option<&'a [usize]>,
+        num_levels: usize,
+    ) -> ValueCountStrategy<'a> {
+        match value_indices {
+            // Arrow path. As many indices as levels means every level has
+            // a value, so each chunk holds exactly `chunk_size` values and
+            // the index slice never needs to be searched.
+            // Any other length falls through to the binary-search variant.
+            Some(idx) if idx.len() == num_levels => ValueCountStrategy::AllPresent,
+            Some(idx) => ValueCountStrategy::Sorted(idx),
+            // Non-arrow path. `max_def_level == 0` means the column has
+            // no nullability, so `vals_in_chunk == chunk_size` here too.
+            None if descr.max_def_level() == 0 => ValueCountStrategy::AllPresent,
+            None => ValueCountStrategy::DefLevelScan(descr.max_def_level()),
         }
     }
 
@@ -108,6 +146,7 @@ impl ByteBudgetChunker {
         values: &E::Values,
         value_indices: Option<&[usize]>,
         chunk_def: LevelDataRef<'_>,
+        strategy: ValueCountStrategy<'_>,
         values_offset: usize,
         chunk_size: usize,
         end_offset: usize,
@@ -116,11 +155,15 @@ impl ByteBudgetChunker {
             return chunk_size;
         }
         // Count how many values fall in this chunk's level range. The
-        // arrow path passes a sorted `non_null_indices`, so this is one
-        // binary search; otherwise we walk the def-level slice.
-        let vals_in_chunk = match value_indices {
-            Some(idx) => idx[values_offset..].partition_point(|&i| i < end_offset),
-            None => chunk_def.value_count(chunk_size, self.max_def_level),
+        // strategy was picked once per `write_batch_internal` call so
+        // the common all-non-null case (every level has a value) skips
+        // the per-chunk binary search and def-level scan entirely.
+        let vals_in_chunk = match strategy {
+            ValueCountStrategy::AllPresent => chunk_size,
+            ValueCountStrategy::Sorted(idx) => {
+                idx[values_offset..].partition_point(|&i| i < end_offset)
+            }
+            ValueCountStrategy::DefLevelScan(max_def) => chunk_def.value_count(chunk_size, max_def),
         };
         if vals_in_chunk == 0 {
             return chunk_size;
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
@@ -566,6 +566,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
             self.props.write_batch_size()
         };
         let chunker = ByteBudgetChunker::new(&self.descr, &self.props, base_batch_size);
+        let value_count_strategy =
+            ByteBudgetChunker::value_count_strategy(&self.descr, value_indices, num_levels);
         while levels_offset < num_levels {
             let mut end_offset = num_levels.min(levels_offset + base_batch_size);
 
@@ -585,6 +587,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
                 values,
                 value_indices,
                 chunk_def,
+                value_count_strategy,
                 values_offset,
                 chunk_size,
                 end_offset,