perf(parquet): O(1) estimated_value_bytes for byte arrays with contiguous indices

adriangb · claude · adriangb · commit 393ead071aa9 · 2026-05-13T21:09:32.000-07:00
The previous patch made `ColumnValueEncoder::estimated_value_bytes` walk
every value to sum byte lengths, which added a measurable ~5 % to the
short-string write bench (1M × 8 B strings) because every chunk did a
~1024-entry loop calling `ArrayAccessor::value(idx).as_ref().len()`.

For the simple offset-buffer byte array types (Utf8 / LargeUtf8 / Binary /
LargeBinary), detect contiguous-and-sorted indices — true for every
non-null column written via `non_null_indices` — and compute the total
payload size as one subtraction on `value_offsets()`. For sparse
indices in the same family of types, look lengths up via the offsets
buffer directly rather than going through `ArrayAccessor::value`.

View / fixed-size / dictionary arrays keep the existing per-value walk
via `ArrayAccessor`. Dictionary-encoded data isn't on the hot path in
practice because the writer's `has_dictionary()` short-circuits
`estimated_value_bytes` while parquet dictionary encoding is active.

Bench delta after this change (5-run medians, `arrow_writer` bench):
- short_string_non_null/default (1M × 8 B): ±0 % (was +5–8 %)
- large_string_non_null/default (1024 × 256 KiB): +1 % (was +3 %)
- string_non_null/default (1M random Utf8/LargeUtf8): −2 % (was +2 %)
- string_dictionary/default: ±0 % (was +1 %)
All other benches within ±1 % of main.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -30,10 +30,12 @@ use crate::geospatial::statistics::GeospatialStatistics;
 use crate::schema::types::ColumnDescPtr;
 use crate::util::bit_util::num_required_bits;
 use crate::util::interner::{Interner, Storage};
+use arrow_array::types::ByteArrayType;
 use arrow_array::{
     Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray,
-    LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
+    GenericByteArray, LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
 };
+use arrow_buffer::ArrowNativeType;
 use arrow_schema::DataType;
 
 macro_rules! downcast_dict_impl {
@@ -490,7 +492,31 @@ impl ColumnValueEncoder for ByteArrayEncoder {
         let end = (offset + len).min(indices.len());
         let start = offset.min(end);
         let idx_slice = &indices[start..end];
-        downcast_op!(values.data_type(), values, estimate_byte_size, idx_slice)
+        // Fast path for the simple offset-buffer byte array types — the
+        // overwhelmingly common case from `write_primitive`. Reduces a
+        // 1024-iteration loop to one subtraction when the indices are
+        // contiguous (e.g. the non-null indices of a non-null column).
+        match values.data_type() {
+            DataType::Utf8 => estimate_byte_size_offsets(
+                values.as_any().downcast_ref::<StringArray>().unwrap(),
+                idx_slice,
+            ),
+            DataType::LargeUtf8 => estimate_byte_size_offsets(
+                values.as_any().downcast_ref::<LargeStringArray>().unwrap(),
+                idx_slice,
+            ),
+            DataType::Binary => estimate_byte_size_offsets(
+                values.as_any().downcast_ref::<BinaryArray>().unwrap(),
+                idx_slice,
+            ),
+            DataType::LargeBinary => estimate_byte_size_offsets(
+                values.as_any().downcast_ref::<LargeBinaryArray>().unwrap(),
+                idx_slice,
+            ),
+            // Utf8View/BinaryView/FixedSizeBinary/Dictionary fall through
+            // to the per-value walk via ArrayAccessor::value.
+            _ => downcast_op!(values.data_type(), values, estimate_byte_size, idx_slice),
+        }
     }
 
     fn num_values(&self) -> usize {
@@ -607,13 +633,11 @@ where
 
 /// Sum of plain-encoded byte sizes for the values picked out by `indices`.
 ///
-/// Used by `ColumnValueEncoder::estimated_value_bytes` to decide whether a
-/// chunk of arrow values is large enough that the column writer should
-/// switch to per-value mini-batches. The estimate is over the *plain*
-/// encoding (4-byte length prefix + payload, or just payload for fixed-size
-/// binary). It deliberately ignores dict-encoded sizes — when dictionary
-/// encoding is active, the dict_encoder accumulates indexes so a chunk
-/// can't blow the data page byte limit the way raw values can.
+/// Fallback used by `ColumnValueEncoder::estimated_value_bytes` for array
+/// types that don't expose a single contiguous offsets buffer — view arrays,
+/// dictionary arrays, fixed-size binary. The simple offset-buffer types
+/// (Utf8/LargeUtf8/Binary/LargeBinary) take the much faster
+/// `estimate_byte_size_offsets` path.
 ///
 /// Free function so it can be used with `downcast_op!`.
 fn estimate_byte_size<T>(values: T, indices: &[usize]) -> usize
@@ -628,6 +652,39 @@ where
     total
 }
 
+/// Fast path for `GenericByteArray<T>` (Utf8/LargeUtf8/Binary/LargeBinary).
+///
+/// Returns the payload bytes of the selected values plus one `u32`-sized
+/// length prefix per index. A contiguous run of indices costs a single
+/// subtraction on the offsets buffer (`offsets[last + 1] - offsets[first]`);
+/// otherwise each length is read from the offsets buffer per index. The
+/// contiguity test only compares the first and last index with the slice
+/// length, so it is exact only when `indices` is strictly increasing.
+fn estimate_byte_size_offsets<T: ByteArrayType>(
+    values: &GenericByteArray<T>,
+    indices: &[usize],
+) -> usize {
+    if indices.is_empty() {
+        return 0;
+    }
+    let n = indices.len();
+    let first = indices[0];
+    let last = indices[n - 1];
+    let offsets = values.value_offsets();
+    let data_bytes = if last >= first && last - first + 1 == n {
+        // Span equals count: treat as one contiguous run, one subtraction.
+        (offsets[last + 1] - offsets[first]).as_usize()
+    } else {
+        // Sparse: sum each value's length from adjacent offsets, without
+        // building a slice/str per value through ArrayAccessor::value.
+        indices
+            .iter()
+            .map(|i| (offsets[*i + 1] - offsets[*i]).as_usize())
+            .sum()
+    };
+    data_bytes + n * std::mem::size_of::<u32>()
+}
+
 /// Computes the min and max for the provided array and indices
 ///
 /// This is a free function so it can be used with `downcast_op!`