Skip to content

Commit 393ead0

Browse files
adriangbclaude
andcommitted
perf(parquet): O(1) estimated_value_bytes for byte arrays with contiguous indices
The previous patch made `ColumnValueEncoder::estimated_value_bytes` walk every value to sum byte lengths, which added a measurable ~5 % to the short-string write bench (1M × 8 B strings) because every chunk did a ~1024-entry loop calling `ArrayAccessor::value(idx).as_ref().len()`. For the simple offset-buffer byte array types (Utf8 / LargeUtf8 / Binary / LargeBinary), detect contiguous-and-sorted indices — true for every non-null column written via `non_null_indices` — and compute the total payload size as one subtraction on `value_offsets()`. For sparse indices in the same family of types, look lengths up via the offsets buffer directly rather than going through `ArrayAccessor::value`. View / fixed-size / dictionary arrays keep the existing per-value walk via `ArrayAccessor`. Dictionary-encoded data isn't on the hot path in practice because the writer's `has_dictionary()` short-circuits `estimated_value_bytes` while parquet dictionary encoding is active. Bench delta after this change (5-run medians, `arrow_writer` bench): - short_string_non_null/default (1M × 8 B): ±0 % (was +5–8 %) - large_string_non_null/default (1024 × 256 KiB): +1 % (was +3 %) - string_non_null/default (1M random Utf8/LargeUtf8): −2 % (was +2 %) - string_dictionary/default: ±0 % (was +1 %) All other benches within ±1 % of main. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent f679309 commit 393ead0

1 file changed

Lines changed: 66 additions & 9 deletions

File tree

parquet/src/arrow/arrow_writer/byte_array.rs

Lines changed: 66 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@ use crate::geospatial::statistics::GeospatialStatistics;
3030
use crate::schema::types::ColumnDescPtr;
3131
use crate::util::bit_util::num_required_bits;
3232
use crate::util::interner::{Interner, Storage};
33+
use arrow_array::types::ByteArrayType;
3334
use arrow_array::{
3435
Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray,
35-
LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
36+
GenericByteArray, LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
3637
};
38+
use arrow_buffer::ArrowNativeType;
3739
use arrow_schema::DataType;
3840

3941
macro_rules! downcast_dict_impl {
@@ -490,7 +492,31 @@ impl ColumnValueEncoder for ByteArrayEncoder {
490492
let end = (offset + len).min(indices.len());
491493
let start = offset.min(end);
492494
let idx_slice = &indices[start..end];
493-
downcast_op!(values.data_type(), values, estimate_byte_size, idx_slice)
495+
// Fast path for the simple offset-buffer byte array types — the
496+
// overwhelmingly common case from `write_primitive`. Reduces a
497+
// 1024-iteration loop to one subtraction when the indices are
498+
// contiguous (e.g. the non-null indices of a non-null column).
499+
match values.data_type() {
500+
DataType::Utf8 => estimate_byte_size_offsets(
501+
values.as_any().downcast_ref::<StringArray>().unwrap(),
502+
idx_slice,
503+
),
504+
DataType::LargeUtf8 => estimate_byte_size_offsets(
505+
values.as_any().downcast_ref::<LargeStringArray>().unwrap(),
506+
idx_slice,
507+
),
508+
DataType::Binary => estimate_byte_size_offsets(
509+
values.as_any().downcast_ref::<BinaryArray>().unwrap(),
510+
idx_slice,
511+
),
512+
DataType::LargeBinary => estimate_byte_size_offsets(
513+
values.as_any().downcast_ref::<LargeBinaryArray>().unwrap(),
514+
idx_slice,
515+
),
516+
// Utf8View/BinaryView/FixedSizeBinary/Dictionary fall through
517+
// to the per-value walk via ArrayAccessor::value.
518+
_ => downcast_op!(values.data_type(), values, estimate_byte_size, idx_slice),
519+
}
494520
}
495521

496522
fn num_values(&self) -> usize {
@@ -607,13 +633,11 @@ where
607633

608634
/// Sum of plain-encoded byte sizes for the values picked out by `indices`.
609635
///
610-
/// Used by `ColumnValueEncoder::estimated_value_bytes` to decide whether a
611-
/// chunk of arrow values is large enough that the column writer should
612-
/// switch to per-value mini-batches. The estimate is over the *plain*
613-
/// encoding (4-byte length prefix + payload, or just payload for fixed-size
614-
/// binary). It deliberately ignores dict-encoded sizes — when dictionary
615-
/// encoding is active, the dict_encoder accumulates indexes so a chunk
616-
/// can't blow the data page byte limit the way raw values can.
636+
/// Fallback used by `ColumnValueEncoder::estimated_value_bytes` for array
637+
/// types that don't expose a single contiguous offsets buffer — view arrays,
638+
/// dictionary arrays, fixed-size binary. The simple offset-buffer types
639+
/// (Utf8/LargeUtf8/Binary/LargeBinary) take the much faster
640+
/// `estimate_byte_size_offsets` path.
617641
///
618642
/// Free function so it can be used with `downcast_op!`.
619643
fn estimate_byte_size<T>(values: T, indices: &[usize]) -> usize
@@ -628,6 +652,39 @@ where
628652
total
629653
}
630654

655+
/// Fast path for `GenericByteArray<O>` (Utf8/LargeUtf8/Binary/LargeBinary).
656+
///
657+
/// When `indices` are contiguous and sorted — true for the non-null indices
658+
/// of any non-null column, and a frequent case otherwise — the total
659+
/// payload byte size is one subtraction on the offsets buffer
660+
/// (`offsets[last+1] - offsets[first]`). For sparse indices, a per-index
661+
/// length lookup via the offsets buffer is still cheaper than going through
662+
/// `ArrayAccessor::value` and constructing a slice/string for each value.
663+
fn estimate_byte_size_offsets<T: ByteArrayType>(
664+
values: &GenericByteArray<T>,
665+
indices: &[usize],
666+
) -> usize {
667+
if indices.is_empty() {
668+
return 0;
669+
}
670+
let n = indices.len();
671+
let first = indices[0];
672+
let last = indices[n - 1];
673+
let offsets = values.value_offsets();
674+
let data_bytes = if last >= first && last - first + 1 == n {
675+
// Contiguous: one subtraction on the offsets buffer.
676+
(offsets[last + 1] - offsets[first]).as_usize()
677+
} else {
678+
// Sparse: still cheaper than walking through ArrayAccessor::value
679+
// because we skip the slice/UTF-8 construction.
680+
indices
681+
.iter()
682+
.map(|i| (offsets[*i + 1] - offsets[*i]).as_usize())
683+
.sum()
684+
};
685+
data_bytes + n * std::mem::size_of::<u32>()
686+
}
687+
631688
/// Computes the min and max for the provided array and indices
632689
///
633690
/// This is a free function so it can be used with `downcast_op!`

0 commit comments

Comments
 (0)