From b66d452e994f9868c141757e29ccb842c63e0d78 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Tue, 17 Feb 2026 17:45:11 +0100 Subject: [PATCH 01/21] feat(parquet): add content defined chunking for arrow writer --- parquet/benches/arrow_writer.rs | 17 +- parquet/src/arrow/arrow_writer/levels.rs | 46 + parquet/src/arrow/arrow_writer/mod.rs | 292 +++- parquet/src/arrow/mod.rs | 27 + parquet/src/column/chunker/cdc.rs | 1431 +++++++++++++++++++ parquet/src/column/chunker/cdc_codegen.py | 118 ++ parquet/src/column/chunker/cdc_generated.rs | 558 ++++++++ parquet/src/column/chunker/mod.rs | 38 + parquet/src/column/mod.rs | 1 + parquet/src/column/writer/mod.rs | 17 + parquet/src/file/properties.rs | 81 ++ parquet/src/lib.rs | 22 + 12 files changed, 2629 insertions(+), 19 deletions(-) create mode 100644 parquet/src/column/chunker/cdc.rs create mode 100644 parquet/src/column/chunker/cdc_codegen.py create mode 100644 parquet/src/column/chunker/cdc_generated.rs create mode 100644 parquet/src/column/chunker/mod.rs diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index b92f0788b2fc..140a8780088b 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -35,7 +35,7 @@ use arrow::{record_batch::RecordBatch, util::data_gen::*}; use arrow_array::RecordBatchOptions; use parquet::arrow::ArrowSchemaConverter; use parquet::errors::Result; -use parquet::file::properties::{WriterProperties, WriterVersion}; +use parquet::file::properties::{CdcOptions, WriterProperties, WriterVersion}; use parquet::file::writer::SerializedFileWriter; fn create_primitive_bench_batch( @@ -348,7 +348,7 @@ fn write_batch_with_option( .with_coerce_types(props.coerce_types()) .convert(batch.schema_ref())?; let writer = SerializedFileWriter::new(&mut file, parquet_schema.root_schema_ptr(), props)?; - let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, batch.schema()); + let mut row_group_writer_factory = 
ArrowRowGroupWriterFactory::new(&writer, batch.schema()); bench.iter(|| { let mut row_group = row_group_writer_factory.create_column_writers(0).unwrap(); @@ -440,6 +440,19 @@ fn create_writer_props() -> Vec<(&'static str, WriterProperties)> { .build(); props.push(("zstd_parquet_2", prop)); + // CDC with small chunk sizes so that boundaries actually trigger within the + // benchmark batch size (~16 KiB for a 4096-row i32 batch). Dictionary encoding + // is disabled because CDC materializes dictionary arrays before hashing. + let prop = WriterProperties::builder() + .set_cdc_options(CdcOptions { + min_chunk_size: 4 * 1024, + max_chunk_size: 16 * 1024, + norm_level: 0, + }) + .set_dictionary_enabled(false) + .build(); + props.push(("cdc", prop)); + props } diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 0ff2137d907e..1716c14d1aea 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -801,6 +801,52 @@ impl ArrayLevels { pub fn non_null_indices(&self) -> &[usize] { &self.non_null_indices } + + /// Create a sliced view of this `ArrayLevels` for a CDC chunk. + /// + /// - `level_offset`: start position within `def_levels`/`rep_levels` + /// - `levels_to_write`: number of levels in this chunk + /// - `value_offset`: start position within the values array + /// - `num_values`: number of values in this chunk + pub(crate) fn slice_for_chunk( + &self, + level_offset: usize, + levels_to_write: usize, + value_offset: usize, + num_values: usize, + ) -> Self { + let def_levels = self + .def_levels + .as_ref() + .map(|levels| levels[level_offset..level_offset + levels_to_write].to_vec()); + let rep_levels = self + .rep_levels + .as_ref() + .map(|levels| levels[level_offset..level_offset + levels_to_write].to_vec()); + + // Filter non_null_indices to [value_offset, value_offset + num_values) + // and shift by -value_offset. 
+ let value_end = value_offset + num_values; + let non_null_indices: Vec = self + .non_null_indices + .iter() + .filter(|&&idx| idx >= value_offset && idx < value_end) + .map(|&idx| idx - value_offset) + .collect(); + + let array = self.array.slice(value_offset, num_values); + let logical_nulls = array.logical_nulls(); + + Self { + def_levels, + rep_levels, + non_null_indices, + max_def_level: self.max_def_level, + max_rep_level: self.max_rep_level, + array, + logical_nulls, + } + } } #[cfg(test)] diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 979988eebc05..5d704ec2b8aa 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -17,6 +17,8 @@ //! Contains writer which writes arrow data into parquet data. +use crate::column::chunker; + use bytes::Bytes; use std::io::{Read, Write}; use std::iter::Peekable; @@ -335,10 +337,12 @@ impl ArrowWriter { let in_progress = match &mut self.in_progress { Some(in_progress) => in_progress, - x => x.insert( - self.row_group_writer_factory - .create_row_group_writer(self.writer.flushed_row_groups().len())?, - ), + x => { + let rg = self + .row_group_writer_factory + .create_row_group_writer(self.writer.flushed_row_groups().len())?; + x.insert(rg) + } }; if let Some(max_rows) = self.max_row_group_row_count { @@ -421,8 +425,11 @@ impl ArrowWriter { None => return Ok(()), }; + let (chunks, chunkers) = in_progress.close()?; + self.row_group_writer_factory.cdc_chunkers = chunkers; + let mut row_group_writer = self.writer.next_row_group()?; - for chunk in in_progress.close()? 
{ + for chunk in chunks { chunk.append_to_row_group(&mut row_group_writer)?; } row_group_writer.close()?; @@ -792,7 +799,7 @@ impl ArrowColumnChunk { /// .unwrap(); /// /// // Create a factory for building Arrow column writers -/// let row_group_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); +/// let mut row_group_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); /// // Create column writers for the 0th row group /// let col_writers = row_group_factory.create_column_writers(0).unwrap(); /// @@ -853,6 +860,7 @@ impl ArrowColumnChunk { pub struct ArrowColumnWriter { writer: ArrowColumnWriterImpl, chunk: SharedColumnChunk, + pub(crate) chunker: Option, } impl std::fmt::Debug for ArrowColumnWriter { @@ -869,6 +877,14 @@ enum ArrowColumnWriterImpl { impl ArrowColumnWriter { /// Write an [`ArrowLeafColumn`] pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()> { + if self.chunker.is_some() { + self.write_with_cdc(col) + } else { + self.write_without_cdc(col) + } + } + + fn write_without_cdc(&mut self, col: &ArrowLeafColumn) -> Result<()> { match &mut self.writer { ArrowColumnWriterImpl::Column(c) => { let leaf = col.0.array(); @@ -888,6 +904,67 @@ impl ArrowColumnWriter { Ok(()) } + fn write_with_cdc(&mut self, col: &ArrowLeafColumn) -> Result<()> { + let levels = &col.0; + + // Dictionary-encoded arrays must be materialized before hashing because the + // CDC chunker must see the actual values, not dictionary indices. Two arrays + // with the same values but different dictionary orderings would otherwise + // produce different rolling hash states, breaking cross-file deduplication. + let leaf_array = match levels.array().as_any_dictionary_opt() { + Some(dictionary) => { + arrow_select::take::take(dictionary.values(), dictionary.keys(), None)? 
+ } + None => levels.array().clone(), + }; + + let def_levels = levels.def_levels(); + let rep_levels = levels.rep_levels(); + let num_levels = def_levels + .map(|d| d.len()) + .or_else(|| rep_levels.map(|r| r.len())) + .unwrap_or(leaf_array.len()); + + // Compute CDC chunk boundaries + let chunks = { + let chunker = self.chunker.as_mut().unwrap(); + get_cdc_chunks(chunker, def_levels, rep_levels, num_levels, &leaf_array)? + }; + + let num_chunks = chunks.len(); + for (i, chunk) in chunks.iter().enumerate() { + // Compute the number of values in this chunk + let num_values = if i + 1 < num_chunks { + chunks[i + 1].value_offset - chunk.value_offset + } else { + leaf_array.len() - chunk.value_offset + }; + + let chunk_levels = levels.slice_for_chunk( + chunk.level_offset, + chunk.levels_to_write, + chunk.value_offset, + num_values, + ); + let chunk_col = ArrowLeafColumn(chunk_levels); + + self.write_without_cdc(&chunk_col)?; + + // Flush the page after each chunk except the last + if i + 1 < num_chunks { + self.flush_current_page()?; + } + } + Ok(()) + } + + fn flush_current_page(&mut self) -> Result<()> { + match &mut self.writer { + ArrowColumnWriterImpl::Column(c) => c.flush_current_page(), + ArrowColumnWriterImpl::ByteArray(c) => c.flush_current_page(), + } + } + /// Close this column returning the written [`ArrowColumnChunk`] pub fn close(self) -> Result { let close = match self.writer { @@ -972,11 +1049,26 @@ impl ArrowRowGroupWriter { .sum() } - fn close(self) -> Result> { - self.writers - .into_iter() - .map(|writer| writer.close()) - .collect() + fn close( + self, + ) -> Result<( + Vec, + Option>, + )> { + let mut chunks = Vec::with_capacity(self.writers.len()); + let mut chunkers = Vec::new(); + for mut writer in self.writers { + if let Some(chunker) = writer.chunker.take() { + chunkers.push(chunker); + } + chunks.push(writer.close()?); + } + let chunkers = if chunkers.is_empty() { + None + } else { + Some(chunkers) + }; + Ok((chunks, chunkers)) } } @@ 
-991,6 +1083,9 @@ pub struct ArrowRowGroupWriterFactory { props: WriterPropertiesPtr, #[cfg(feature = "encryption")] file_encryptor: Option>, + /// CDC chunkers persisted across row groups (one per leaf column). + /// `None` when CDC is not enabled. + cdc_chunkers: Option>, } impl ArrowRowGroupWriterFactory { @@ -1007,30 +1102,72 @@ impl ArrowRowGroupWriterFactory { props, #[cfg(feature = "encryption")] file_encryptor: file_writer.file_encryptor(), + cdc_chunkers: None, } } - fn create_row_group_writer(&self, row_group_index: usize) -> Result { + fn create_row_group_writer(&mut self, row_group_index: usize) -> Result { let writers = self.create_column_writers(row_group_index)?; Ok(ArrowRowGroupWriter::new(writers, &self.arrow_schema)) } /// Create column writers for a new row group, with the given row group index - pub fn create_column_writers(&self, row_group_index: usize) -> Result> { + pub fn create_column_writers( + &mut self, + row_group_index: usize, + ) -> Result> { let mut writers = Vec::with_capacity(self.arrow_schema.fields.len()); let mut leaves = self.schema.columns().iter(); let column_factory = self.column_writer_factory(row_group_index); + let schema_root = self.schema.root_schema(); for field in &self.arrow_schema.fields { column_factory.get_arrow_column_writer( field.data_type(), + schema_root, &self.props, &mut leaves, &mut writers, )?; } + let chunkers = match self.cdc_chunkers.take() { + Some(chunkers) => chunkers, + None => match self.create_cdc_chunkers()? { + Some(chunkers) => chunkers, + None => return Ok(writers), + }, + }; + for (writer, chunker) in writers.iter_mut().zip(chunkers) { + writer.chunker = Some(chunker); + } Ok(writers) } + /// Create CDC chunkers for all leaf columns, or `None` if CDC is not enabled. 
+ fn create_cdc_chunkers(&self) -> Result>> { + let opts = match self.props.cdc_options() { + Some(opts) => opts, + None => return Ok(None), + }; + let schema_root = self.schema.root_schema(); + self.schema + .columns() + .iter() + .map(|desc| { + let max_def_level = desc.max_def_level(); + let max_rep_level = desc.max_rep_level(); + let repeated_ancestor_def_level = + compute_repeated_ancestor_def_level(schema_root, desc.path()); + chunker::ContentDefinedChunker::new( + max_def_level, + max_rep_level, + repeated_ancestor_def_level, + opts, + ) + }) + .collect::>>() + .map(Some) + } + #[cfg(feature = "encryption")] fn column_writer_factory(&self, row_group_idx: usize) -> ArrowColumnWriterFactory { ArrowColumnWriterFactory::new() @@ -1053,9 +1190,11 @@ pub fn get_column_writers( let mut writers = Vec::with_capacity(arrow.fields.len()); let mut leaves = parquet.columns().iter(); let column_factory = ArrowColumnWriterFactory::new(); + let schema_root = parquet.root_schema(); for field in &arrow.fields { column_factory.get_arrow_column_writer( field.data_type(), + schema_root, props, &mut leaves, &mut writers, @@ -1125,6 +1264,7 @@ impl ArrowColumnWriterFactory { fn get_arrow_column_writer( &self, data_type: &ArrowDataType, + schema_root: &crate::schema::types::Type, props: &WriterPropertiesPtr, leaves: &mut Iter<'_, ColumnDescPtr>, out: &mut Vec, @@ -1137,6 +1277,7 @@ impl ArrowColumnWriterFactory { Ok(ArrowColumnWriter { chunk, writer: ArrowColumnWriterImpl::Column(writer), + chunker: None, }) }; @@ -1148,6 +1289,7 @@ impl ArrowColumnWriterFactory { Ok(ArrowColumnWriter { chunk, writer: ArrowColumnWriterImpl::ByteArray(writer), + chunker: None, }) }; @@ -1167,17 +1309,29 @@ impl ArrowColumnWriterFactory { | ArrowDataType::FixedSizeList(f, _) | ArrowDataType::ListView(f) | ArrowDataType::LargeListView(f) => { - self.get_arrow_column_writer(f.data_type(), props, leaves, out)? + self.get_arrow_column_writer(f.data_type(), schema_root, props, leaves, out)? 
} ArrowDataType::Struct(fields) => { for field in fields { - self.get_arrow_column_writer(field.data_type(), props, leaves, out)? + self.get_arrow_column_writer( + field.data_type(), + schema_root, + props, + leaves, + out, + )? } } ArrowDataType::Map(f, _) => match f.data_type() { ArrowDataType::Struct(f) => { - self.get_arrow_column_writer(f[0].data_type(), props, leaves, out)?; - self.get_arrow_column_writer(f[1].data_type(), props, leaves, out)? + self.get_arrow_column_writer( + f[0].data_type(), + schema_root, + props, + leaves, + out, + )?; + self.get_arrow_column_writer(f[1].data_type(), schema_root, props, leaves, out)? } _ => unreachable!("invalid map type"), }, @@ -1590,6 +1744,110 @@ fn get_fsb_array_slice( values } +/// Compute the definition level at the nearest REPEATED ancestor by traversing +/// the Parquet schema tree from root to the given leaf column path. +fn compute_repeated_ancestor_def_level( + schema_root: &crate::schema::types::Type, + path: &crate::schema::types::ColumnPath, +) -> i16 { + use crate::basic::Repetition; + let parts = path.parts(); + if parts.is_empty() { + return 0; + } + + let mut current_type = schema_root; + let mut def_level: i16 = 0; + let mut repeated_ancestor_def_level: i16 = 0; + + for part in parts { + // Find the child with matching name + if !current_type.is_group() { + break; + } + let child = current_type.get_fields().iter().find(|f| f.name() == part); + let child = match child { + Some(c) => c, + None => break, + }; + + // Update def/rep levels based on this node's repetition + if child.get_basic_info().has_repetition() { + match child.get_basic_info().repetition() { + Repetition::OPTIONAL => { + def_level += 1; + } + Repetition::REPEATED => { + def_level += 1; + repeated_ancestor_def_level = def_level; + } + Repetition::REQUIRED => {} + } + } + current_type = child.as_ref(); + } + + repeated_ancestor_def_level +} + +/// Compute CDC chunk boundaries by dispatching on the Arrow array's data type +/// to feed 
value bytes into the rolling hash. +fn get_cdc_chunks( + chunker: &mut chunker::ContentDefinedChunker, + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + num_levels: usize, + array: &dyn arrow_array::Array, +) -> Result> { + // Downcasts `array` to a concrete type, binds it to `$a`, then calls + // `get_chunks` with a closure that yields value bytes for index `$i`. + macro_rules! chunk { + ($a:ident = $downcast:expr, |$i:ident| $bytes:expr) => {{ + let $a = $downcast; + chunker.get_chunks(def_levels, rep_levels, num_levels, |$i| $bytes) + }}; + } + + let dtype = array.data_type(); + let chunks = match dtype { + ArrowDataType::Null => { + chunker.get_chunks(def_levels, rep_levels, num_levels, |_| -> &[u8] { &[] }) + } + ArrowDataType::Boolean => chunk!(a = array.as_boolean(), |i| [a.value(i) as u8]), + ArrowDataType::FixedSizeBinary(_) => { + chunk!(a = array.as_fixed_size_binary(), |i| a.value(i)) + } + ArrowDataType::Binary => chunk!(a = array.as_binary::(), |i| a.value(i)), + ArrowDataType::Utf8 => chunk!(a = array.as_string::(), |i| a.value(i).as_bytes()), + ArrowDataType::LargeBinary => chunk!(a = array.as_binary::(), |i| a.value(i)), + ArrowDataType::LargeUtf8 => chunk!(a = array.as_string::(), |i| a.value(i).as_bytes()), + ArrowDataType::BinaryView => chunk!(a = array.as_binary_view(), |i| a.value(i)), + ArrowDataType::Utf8View => chunk!(a = array.as_string_view(), |i| a.value(i).as_bytes()), + // All fixed-width primitive types (ints, floats, dates, times, timestamps, + // durations, intervals, decimals, float16). + // + // Values are read directly from the underlying buffer. `data.offset()` accounts + // for sliced arrays (non-zero logical start), so `base + i * byte_width` always + // resolves to the correct physical byte position for logical index `i`. 
+ _ => { + let byte_width = dtype.primitive_width().ok_or_else(|| { + ParquetError::General(format!( + "content-defined chunking is not supported for data type {:?}", + dtype + )) + })?; + let data = array.to_data(); + let buffer = &data.buffers()[0]; + let base = data.offset() * byte_width; + chunker.get_chunks(def_levels, rep_levels, num_levels, |i| { + let start = base + i * byte_width; + &buffer[start..start + byte_width] + }) + } + }; + Ok(chunks) +} + #[cfg(test)] mod tests { use super::*; diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 52152988166f..57b350a766c4 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -88,6 +88,33 @@ //! writer.close().unwrap(); //! ``` //! +//! ## EXPERIMENTAL: Content-Defined Chunking +//! +//! Enable content-defined chunking (CDC) via [`WriterProperties`] to improve +//! deduplication efficiency in content-addressable storage (CAS) systems such as +//! Hugging Face Hub. CDC creates data page boundaries based on content rather than +//! fixed sizes, so unchanged data across file versions produces identical byte +//! sequences that CAS backends can deduplicate at the page level. +//! +//! ```no_run +//! # use parquet::arrow::arrow_writer::ArrowWriter; +//! # use parquet::file::properties::WriterProperties; +//! # use std::fs::File; +//! # use arrow_array::RecordBatch; +//! # fn write(batch: &RecordBatch) { +//! let file = File::create("data.parquet").unwrap(); +//! let props = WriterProperties::builder() +//! .set_content_defined_chunking(true) +//! .build(); +//! let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props)).unwrap(); +//! writer.write(batch).unwrap(); +//! writer.close().unwrap(); +//! # } +//! ``` +//! +//! See [`CdcOptions`](crate::file::properties::CdcOptions) for chunk size and +//! normalization level configuration. +//! //! # Example: Reading Parquet file into Arrow `RecordBatch` //! //! 
```rust diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs new file mode 100644 index 000000000000..7732ab39da9a --- /dev/null +++ b/parquet/src/column/chunker/cdc.rs @@ -0,0 +1,1431 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::errors::{ParquetError, Result}; +use crate::file::properties::CdcOptions; + +use super::Chunk; +use super::cdc_generated::{GEARHASH_TABLE, NUM_GEARHASH_TABLES}; + +/// Content-defined chunker that uses a rolling gear hash to find chunk boundaries. +/// +/// This implements a [FastCDC]-inspired algorithm using gear hashing. The input data is +/// fed byte-by-byte into a rolling hash; when the hash matches a predefined mask, a new +/// chunk boundary candidate is recorded. To reduce the exponential variance of chunk +/// sizes inherent in a single gear hash, the algorithm requires **8 consecutive mask +/// matches** — each against a different pre-computed gear hash table — before committing +/// to a boundary. This central-limit-theorem normalization makes the chunk size +/// distribution approximately normal between `min_chunk_size` and `max_chunk_size`. 
+/// +/// The chunker's state (rolling hash, run counter, accumulated size) persists across the +/// entire column (across pages and row groups), so boundaries are determined solely by +/// data content and are reproducible given the same input. +/// +/// For nested data (lists, maps, structs) chunk boundaries are restricted to top-level +/// record boundaries (`rep_level == 0`) so that a nested row is never split across +/// chunks. +/// +/// Ported from the C++ implementation in apache/arrow#45360 +/// (`cpp/src/parquet/chunker_internal.cc`). +/// +/// [FastCDC]: https://www.usenix.org/conference/atc16/technical-sessions/presentation/xia +#[derive(Debug)] +pub(crate) struct ContentDefinedChunker { + /// Maximum definition level for this column. + max_def_level: i16, + /// Maximum repetition level for this column. + max_rep_level: i16, + /// Definition level at the nearest REPEATED ancestor. + repeated_ancestor_def_level: i16, + + min_chunk_size: i64, + max_chunk_size: i64, + /// Mask for matching against the rolling hash. + rolling_hash_mask: u64, + + /// Rolling hash state, never reset — initialized once for the entire column. + rolling_hash: u64, + /// Whether the rolling hash has matched the mask since the last chunk boundary. + has_matched: bool, + /// Current run count for the central-limit-theorem normalization. + nth_run: usize, + /// Current chunk size in bytes. 
+ chunk_size: i64, +} + +impl ContentDefinedChunker { + pub fn new( + max_def_level: i16, + max_rep_level: i16, + repeated_ancestor_def_level: i16, + options: &CdcOptions, + ) -> Result { + let rolling_hash_mask = Self::calculate_mask( + options.min_chunk_size as i64, + options.max_chunk_size as i64, + options.norm_level, + )?; + Ok(Self { + max_def_level, + max_rep_level, + repeated_ancestor_def_level, + min_chunk_size: options.min_chunk_size as i64, + max_chunk_size: options.max_chunk_size as i64, + rolling_hash_mask, + rolling_hash: 0, + has_matched: false, + nth_run: 0, + chunk_size: 0, + }) + } + + /// Calculate the mask used to determine chunk boundaries from the rolling hash. + /// + /// The mask is calculated so that the expected chunk size distribution approximates + /// a normal distribution between min and max chunk sizes. + fn calculate_mask(min_chunk_size: i64, max_chunk_size: i64, norm_level: i32) -> Result { + if min_chunk_size < 0 { + return Err(ParquetError::General( + "min_chunk_size must be non-negative".to_string(), + )); + } + if max_chunk_size <= min_chunk_size { + return Err(ParquetError::General( + "max_chunk_size must be greater than min_chunk_size".to_string(), + )); + } + + let avg_chunk_size = (min_chunk_size + max_chunk_size) / 2; + // Target size after subtracting the min-size skip window and dividing by the + // number of hash tables (for central-limit-theorem normalization). 
+ let target_size = (avg_chunk_size - min_chunk_size) / NUM_GEARHASH_TABLES as i64; + + // floor(log2(target_size)) — equivalent to C++ NumRequiredBits(target_size) - 1 + let mask_bits = if target_size > 0 { + 63 - target_size.leading_zeros() as i32 + } else { + 0 + }; + + let effective_bits = mask_bits - norm_level; + + if effective_bits < 1 || effective_bits > 63 { + return Err(ParquetError::General(format!( + "The number of bits in the CDC mask must be between 1 and 63, got {effective_bits}" + ))); + } + + // Create the mask by setting the top `effective_bits` bits. + Ok(u64::MAX << (64 - effective_bits)) + } + + /// Feed raw bytes into the rolling hash. + /// + /// The byte count always accumulates toward `chunk_size`, but the actual hash + /// update is skipped until `min_chunk_size` has been reached. This "skip window" + /// is the FastCDC optimization that prevents boundaries from appearing too early + /// in a chunk. + #[inline] + pub fn roll_value_bytes(&mut self, bytes: &[u8]) { + self.chunk_size += bytes.len() as i64; + if self.chunk_size < self.min_chunk_size { + return; + } + for &b in bytes { + self.rolling_hash = self + .rolling_hash + .wrapping_shl(1) + .wrapping_add(GEARHASH_TABLE[self.nth_run][b as usize]); + self.has_matched = + self.has_matched || ((self.rolling_hash & self.rolling_hash_mask) == 0); + } + } + + /// Feed a definition or repetition level (i16) into the rolling hash. + #[inline] + fn roll_level(&mut self, level: i16) { + self.roll_value_bytes(&level.to_le_bytes()); + } + + /// Check whether a new chunk boundary should be created. + /// + /// A boundary is created when **either** of two conditions holds: + /// + /// 1. **CLT normalization**: The rolling hash has matched the mask (`has_matched`) + /// *and* this is the 8th consecutive such match (`nth_run` reaches + /// `NUM_GEARHASH_TABLES`). Each match advances to the next gear hash table, so + /// 8 independent matches are required. 
A single hash table would yield + /// exponentially distributed chunk sizes; requiring 8 independent matches + /// approximates a normal (Gaussian) distribution by the central limit theorem. + /// + /// 2. **Hard size limit**: `chunk_size` has reached `max_chunk_size`. This caps + /// chunk size even if the CLT normalization sequence has not completed. + /// + /// Note: when `max_chunk_size` forces a boundary, `nth_run` is **not** reset, so + /// the CLT sequence continues from where it left off in the next chunk. This + /// matches the C++ behavior. + #[inline] + fn need_new_chunk(&mut self) -> bool { + if self.has_matched { + self.has_matched = false; + self.nth_run += 1; + if self.nth_run >= NUM_GEARHASH_TABLES { + self.nth_run = 0; + self.chunk_size = 0; + return true; + } + } + if self.chunk_size >= self.max_chunk_size { + self.chunk_size = 0; + return true; + } + false + } + + /// Compute chunk boundaries for the given column data. + /// + /// `value_bytes` returns the byte representation of the value at the given index. + /// The chunker feeds these bytes into the rolling hash to determine boundaries. + pub fn get_chunks( + &mut self, + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + num_levels: usize, + value_bytes: F, + ) -> Vec + where + F: Fn(usize) -> B, + B: AsRef<[u8]>, + { + let has_def_levels = self.max_def_level > 0; + let has_rep_levels = self.max_rep_level > 0; + + let mut chunks = Vec::new(); + + if !has_rep_levels && !has_def_levels { + // Fastest path: non-nested, non-null data. + // level_offset == value_offset for this case. 
+ let mut prev_offset: usize = 0; + for offset in 0..num_levels { + self.roll_value_bytes(value_bytes(offset).as_ref()); + if self.need_new_chunk() { + chunks.push(Chunk { + level_offset: prev_offset, + value_offset: prev_offset, + levels_to_write: offset - prev_offset, + }); + prev_offset = offset; + } + } + // Last chunk + if prev_offset < num_levels { + chunks.push(Chunk { + level_offset: prev_offset, + value_offset: prev_offset, + levels_to_write: num_levels - prev_offset, + }); + } + } else if !has_rep_levels { + // Non-nested data with nulls (def levels only). + // level_offset == value_offset for non-nested data. + let def_levels = def_levels.expect("def_levels required when max_def_level > 0"); + let mut prev_offset: usize = 0; + for offset in 0..num_levels { + let def_level = def_levels[offset]; + self.roll_level(def_level); + if def_level == self.max_def_level { + self.roll_value_bytes(value_bytes(offset).as_ref()); + } + if self.need_new_chunk() { + chunks.push(Chunk { + level_offset: prev_offset, + value_offset: prev_offset, + levels_to_write: offset - prev_offset, + }); + prev_offset = offset; + } + } + // Last chunk + if prev_offset < num_levels { + chunks.push(Chunk { + level_offset: prev_offset, + value_offset: prev_offset, + levels_to_write: num_levels - prev_offset, + }); + } + } else { + // Nested data (def + rep levels). + // value_offset tracks the leaf value index independently. 
+ let def_levels = def_levels.expect("def_levels required for nested data"); + let rep_levels = rep_levels.expect("rep_levels required for nested data"); + let mut prev_offset: usize = 0; + let mut prev_value_offset: usize = 0; + let mut value_offset: usize = 0; + + for offset in 0..num_levels { + let def_level = def_levels[offset]; + let rep_level = rep_levels[offset]; + + self.roll_level(def_level); + self.roll_level(rep_level); + if def_level == self.max_def_level { + self.roll_value_bytes(value_bytes(value_offset).as_ref()); + } + + // Boundaries are only created at top-level record boundaries + // (rep_level == 0). Splitting inside a nested record would require + // writing a partial row, which is not valid in Parquet. + if rep_level == 0 && self.need_new_chunk() { + let levels_to_write = offset - prev_offset; + if levels_to_write > 0 { + chunks.push(Chunk { + level_offset: prev_offset, + value_offset: prev_value_offset, + levels_to_write, + }); + prev_offset = offset; + prev_value_offset = value_offset; + } + } + // Count a value whenever the definition level reaches the nearest + // repeated ancestor. This tracks position in the Arrow array (which + // includes null inner elements), matching how Arrow encodes lists. 
+ if def_level >= self.repeated_ancestor_def_level { + value_offset += 1; + } + } + // Last chunk + if prev_offset < num_levels { + chunks.push(Chunk { + level_offset: prev_offset, + value_offset: prev_value_offset, + levels_to_write: num_levels - prev_offset, + }); + } + } + + #[cfg(debug_assertions)] + self.validate_chunks(&chunks, num_levels); + + chunks + } + + #[cfg(debug_assertions)] + fn validate_chunks(&self, chunks: &[Chunk], num_levels: usize) { + assert!(!chunks.is_empty(), "chunks must be non-empty"); + + let first = &chunks[0]; + assert_eq!(first.level_offset, 0, "first chunk must start at level 0"); + assert_eq!(first.value_offset, 0, "first chunk must start at value 0"); + + let mut sum_levels = first.levels_to_write; + for i in 1..chunks.len() { + let chunk = &chunks[i]; + let prev = &chunks[i - 1]; + assert!(chunk.levels_to_write > 0, "chunk must have levels"); + assert!( + chunk.value_offset >= prev.value_offset, + "value offsets must be monotonically increasing" + ); + assert_eq!( + chunk.level_offset, + prev.level_offset + prev.levels_to_write, + "chunks must be contiguous" + ); + sum_levels += chunk.levels_to_write; + } + assert_eq!(sum_levels, num_levels, "chunks must cover all levels"); + + let last = chunks.last().unwrap(); + assert_eq!( + last.level_offset + last.levels_to_write, + num_levels, + "last chunk must end at num_levels" + ); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_calculate_mask_defaults() { + let mask = ContentDefinedChunker::calculate_mask(256 * 1024, 1024 * 1024, 0).unwrap(); + // avg = 640 KiB, target = (640-256)*1024/8 = 49152, log2(49152) = 15 + // mask = u64::MAX << (64 - 15) = top 15 bits set + let expected = u64::MAX << (64 - 15); + assert_eq!(mask, expected); + } + + #[test] + fn test_calculate_mask_with_norm_level() { + let mask = ContentDefinedChunker::calculate_mask(256 * 1024, 1024 * 1024, 1).unwrap(); + let expected = u64::MAX << (64 - 14); + assert_eq!(mask, expected); + } + + 
#[test] + fn test_calculate_mask_invalid() { + assert!(ContentDefinedChunker::calculate_mask(-1, 100, 0).is_err()); + assert!(ContentDefinedChunker::calculate_mask(100, 50, 0).is_err()); + assert!(ContentDefinedChunker::calculate_mask(100, 100, 0).is_err()); + } + + #[test] + fn test_non_nested_non_null_single_chunk() { + let options = CdcOptions { + min_chunk_size: 8, + max_chunk_size: 1024, + norm_level: 0, + }; + let mut chunker = ContentDefinedChunker::new(0, 0, 0, &options).unwrap(); + + // Write a small amount of data — should produce exactly 1 chunk. + let num_values = 4; + let chunks = chunker.get_chunks(None, None, num_values, |i| (i as i32).to_le_bytes()); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].level_offset, 0); + assert_eq!(chunks[0].value_offset, 0); + assert_eq!(chunks[0].levels_to_write, 4); + } + + #[test] + fn test_max_chunk_size_forces_boundary() { + let options = CdcOptions { + min_chunk_size: 256, + max_chunk_size: 1024, + norm_level: 0, + }; + let mut chunker = ContentDefinedChunker::new(0, 0, 0, &options).unwrap(); + + // Write enough data to exceed max_chunk_size multiple times. + // Each i32 = 4 bytes, max_chunk_size=1024, so ~256 values per chunk max. 
+        let num_values = 2000;
+        let chunks = chunker.get_chunks(None, None, num_values, |i| (i as i32).to_le_bytes());
+
+        // Should have multiple chunks
+        assert!(chunks.len() > 1);
+
+        // Verify contiguity
+        let mut total_levels = 0;
+        for (i, chunk) in chunks.iter().enumerate() {
+            assert_eq!(chunk.level_offset, total_levels);
+            if i < chunks.len() - 1 {
+                assert!(chunk.levels_to_write > 0);
+            }
+            total_levels += chunk.levels_to_write;
+        }
+        assert_eq!(total_levels, num_values);
+    }
+
+    #[test]
+    fn test_deterministic_chunks() {
+        let options = CdcOptions {
+            min_chunk_size: 4,
+            max_chunk_size: 64,
+            norm_level: 0,
+        };
+
+        let roll = |i: usize| (i as i64).to_le_bytes();
+
+        let mut chunker1 = ContentDefinedChunker::new(0, 0, 0, &options).unwrap();
+        let chunks1 = chunker1.get_chunks(None, None, 200, roll);
+
+        let mut chunker2 = ContentDefinedChunker::new(0, 0, 0, &options).unwrap();
+        let chunks2 = chunker2.get_chunks(None, None, 200, roll);
+
+        assert_eq!(chunks1.len(), chunks2.len());
+        for (a, b) in chunks1.iter().zip(chunks2.iter()) {
+            assert_eq!(a.level_offset, b.level_offset);
+            assert_eq!(a.value_offset, b.value_offset);
+            assert_eq!(a.levels_to_write, b.levels_to_write);
+        }
+    }
+
+    #[test]
+    fn test_nullable_non_nested() {
+        let options = CdcOptions {
+            min_chunk_size: 4,
+            max_chunk_size: 64,
+            norm_level: 0,
+        };
+        let mut chunker = ContentDefinedChunker::new(1, 0, 0, &options).unwrap();
+
+        let num_levels = 20;
+        // def_level=1 means non-null, def_level=0 means null
+        let def_levels: Vec<i16> = (0..num_levels)
+            .map(|i| if i % 3 == 0 { 0 } else { 1 })
+            .collect();
+
+        let chunks = chunker.get_chunks(Some(&def_levels), None, num_levels, |i| {
+            (i as i32).to_le_bytes()
+        });
+
+        assert!(!chunks.is_empty());
+        let total: usize = chunks.iter().map(|c| c.levels_to_write).sum();
+        assert_eq!(total, num_levels);
+    }
+}
+
+/// Integration tests that exercise CDC through the Arrow writer/reader roundtrip.
+#[cfg(all(test, feature = "arrow"))]
+mod arrow_tests {
+    use std::borrow::Borrow;
+    use std::sync::Arc;
+
+    use arrow_array::builder::ListBuilder;
+    use arrow_array::{ArrayRef, Float64Array, Int32Array, RecordBatch, StringArray};
+    use arrow_schema::{DataType, Field, Schema};
+
+    use crate::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+    use crate::arrow::arrow_writer::ArrowWriter;
+    use crate::file::properties::{CdcOptions, WriterProperties};
+    use crate::file::reader::{FileReader, SerializedFileReader};
+
+    // --- Constants ---
+
+    const CDC_MIN_CHUNK_SIZE: usize = 4 * 1024;
+    const CDC_MAX_CHUNK_SIZE: usize = 16 * 1024;
+    const CDC_PART_SIZE: usize = 128 * 1024;
+    const CDC_EDIT_SIZE: usize = 128;
+
+    // --- Helpers ---
+
+    /// Deterministic hash function matching the C++ test generator.
+    fn test_hash(seed: u64, index: u64) -> u64 {
+        let mut h = (index.wrapping_add(seed)).wrapping_mul(0xc4ceb9fe1a85ec53u64);
+        h ^= h >> 33;
+        h = h.wrapping_mul(0xff51afd7ed558ccdu64);
+        h ^= h >> 33;
+        h = h.wrapping_mul(0xc4ceb9fe1a85ec53u64);
+        h ^= h >> 33;
+        h
+    }
+
+    fn generate_i32_array(length: usize, seed: u64) -> Int32Array {
+        (0..length)
+            .map(|i| test_hash(seed, i as u64) as i32)
+            .collect()
+    }
+
+    fn generate_nullable_i32_array(length: usize, seed: u64) -> Int32Array {
+        (0..length)
+            .map(|i| {
+                let val = test_hash(seed, i as u64);
+                if val % 10 == 0 {
+                    None
+                } else {
+                    Some(val as i32)
+                }
+            })
+            .collect()
+    }
+
+    fn generate_string_array(length: usize, seed: u64) -> StringArray {
+        (0..length)
+            .map(|i| {
+                let val = test_hash(seed, i as u64);
+                Some(format!("str_{val}"))
+            })
+            .collect()
+    }
+
+    fn write_batch_with_cdc(batch: &RecordBatch) -> Vec<u8> {
+        let props = WriterProperties::builder()
+            .set_content_defined_chunking(true)
+            .build();
+        let mut buf = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), Some(props)).unwrap();
+        writer.write(batch).unwrap();
+        writer.close().unwrap();
+        buf
+    }
+
+    fn write_batch_without_cdc(batch: &RecordBatch) -> Vec<u8> {
+        let mut buf = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None).unwrap();
+        writer.write(batch).unwrap();
+        writer.close().unwrap();
+        buf
+    }
+
+    fn read_batches(data: &[u8]) -> Vec<RecordBatch> {
+        let reader = ParquetRecordBatchReaderBuilder::try_new(bytes::Bytes::from(data.to_vec()))
+            .unwrap()
+            .build()
+            .unwrap();
+        reader.collect::<Result<Vec<_>, _>>().unwrap()
+    }
+
+    fn get_data_page_bytes(data: &[u8]) -> Vec<Vec<u8>> {
+        let reader = SerializedFileReader::new(bytes::Bytes::from(data.to_vec())).unwrap();
+        let metadata = reader.metadata();
+        let mut pages = Vec::new();
+        for rg in 0..metadata.num_row_groups() {
+            let rg_reader = reader.get_row_group(rg).unwrap();
+            for col in 0..metadata.row_group(rg).num_columns() {
+                let col_reader = rg_reader.get_column_page_reader(col).unwrap();
+                for page in col_reader {
+                    let page = page.unwrap();
+                    pages.push(page.buffer().to_vec());
+                }
+            }
+        }
+        pages
+    }
+
+    fn write_with_cdc_options(
+        batches: &[&RecordBatch],
+        min_chunk_size: usize,
+        max_chunk_size: usize,
+        max_row_group_rows: Option<usize>,
+    ) -> Vec<u8> {
+        assert!(!batches.is_empty());
+        let schema = batches[0].schema();
+        let mut builder = WriterProperties::builder()
+            .set_dictionary_enabled(false)
+            .set_cdc_options(CdcOptions {
+                min_chunk_size,
+                max_chunk_size,
+                norm_level: 0,
+            });
+        if let Some(max_rows) = max_row_group_rows {
+            builder = builder.set_max_row_group_row_count(Some(max_rows));
+        }
+        let props = builder.build();
+        let mut buf = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut buf, schema, Some(props)).unwrap();
+        for batch in batches {
+            writer.write(batch).unwrap();
+        }
+        writer.close().unwrap();
+        buf
+    }
+
+    fn get_page_lengths(data: &[u8], column_index: usize) -> Vec<Vec<i64>> {
+        let reader = SerializedFileReader::new(bytes::Bytes::from(data.to_vec())).unwrap();
+        let metadata = reader.metadata();
+        let mut result = Vec::new();
+        for rg in 0..metadata.num_row_groups() {
+            let rg_reader = reader.get_row_group(rg).unwrap();
+            let col_reader = rg_reader.get_column_page_reader(column_index).unwrap();
+            let mut lengths = Vec::new();
+            for page in col_reader {
+                let page = page.unwrap();
+                if matches!(
+                    page.page_type(),
+                    crate::basic::PageType::DATA_PAGE | crate::basic::PageType::DATA_PAGE_V2
+                ) {
+                    lengths.push(page.num_values() as i64);
+                }
+            }
+            result.push(lengths);
+        }
+        result
+    }
+
+    /// LCS-based diff between two sequences of page lengths (ported from C++).
+    fn find_differences(first: &[i64], second: &[i64]) -> Vec<(Vec<i64>, Vec<i64>)> {
+        let n = first.len();
+        let m = second.len();
+        let mut dp = vec![vec![0usize; m + 1]; n + 1];
+        for i in 0..n {
+            for j in 0..m {
+                if first[i] == second[j] {
+                    dp[i + 1][j + 1] = dp[i][j] + 1;
+                } else {
+                    dp[i + 1][j + 1] = dp[i + 1][j].max(dp[i][j + 1]);
+                }
+            }
+        }
+        // Backtrack to find common elements
+        let mut common = Vec::new();
+        let (mut i, mut j) = (n, m);
+        while i > 0 && j > 0 {
+            if first[i - 1] == second[j - 1] {
+                common.push((i - 1, j - 1));
+                i -= 1;
+                j -= 1;
+            } else if dp[i - 1][j] >= dp[i][j - 1] {
+                i -= 1;
+            } else {
+                j -= 1;
+            }
+        }
+        common.reverse();
+
+        let mut result = Vec::new();
+        let (mut last_i, mut last_j) = (0usize, 0usize);
+        for (ci, cj) in &common {
+            if *ci > last_i || *cj > last_j {
+                result.push((first[last_i..*ci].to_vec(), second[last_j..*cj].to_vec()));
+            }
+            last_i = ci + 1;
+            last_j = cj + 1;
+        }
+        if last_i < n || last_j < m {
+            result.push((first[last_i..].to_vec(), second[last_j..].to_vec()));
+        }
+        result
+    }
+
+    fn make_i32_batch(length: usize, seed: u64) -> RecordBatch {
+        let col: ArrayRef = Arc::new(generate_i32_array(length, seed));
+        RecordBatch::try_from_iter(vec![("col", col)]).unwrap()
+    }
+
+    fn concat_batches(batches: impl IntoIterator<Item = impl Borrow<RecordBatch>>) -> RecordBatch {
+        let batches: Vec<_> = batches.into_iter().collect();
+        let schema = batches[0].borrow().schema();
+        let batches = batches.iter().map(|b| b.borrow());
+        
arrow_select::concat::concat_batches(&schema, batches).unwrap() + } + + fn i32_part_length() -> usize { + CDC_PART_SIZE / 4 + } + + fn i32_edit_length() -> usize { + CDC_EDIT_SIZE / 4 + } + + // --- Roundtrip tests --- + + #[test] + fn test_cdc_roundtrip_i32() { + let array: ArrayRef = Arc::new(Int32Array::from_iter(0..10_000)); + let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); + + let data = write_batch_with_cdc(&batch); + let batches = read_batches(&data); + let result = concat_batches(&batches); + assert_eq!(batch, result); + } + + #[test] + fn test_cdc_roundtrip_string() { + let values = (0..5_000).map(|i| Some(format!("value_{i}"))); + let array: ArrayRef = Arc::new(StringArray::from_iter(values)); + let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); + + let data = write_batch_with_cdc(&batch); + let batches = read_batches(&data); + let result = concat_batches(&batches); + assert_eq!(batch, result); + } + + #[test] + fn test_cdc_roundtrip_large_binary() { + let mut builder = arrow_array::builder::LargeBinaryBuilder::new(); + for i in 0..5_000u32 { + builder.append_value(format!("value_{i}")); + } + let array: ArrayRef = Arc::new(builder.finish()); + let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); + + let data = write_batch_with_cdc(&batch); + let batches = read_batches(&data); + let result = concat_batches(&batches); + assert_eq!(batch, result); + } + + #[test] + fn test_cdc_roundtrip_nullable() { + let values = (0..10_000).map(|i| if i % 7 == 0 { None } else { Some(i) }); + let array: ArrayRef = Arc::new(Int32Array::from_iter(values)); + let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); + + let data = write_batch_with_cdc(&batch); + let batches = read_batches(&data); + let result = concat_batches(&batches); + assert_eq!(batch, result); + } + + #[test] + fn test_cdc_deterministic() { + let values = 0..10_000; + let array: ArrayRef = Arc::new(Int32Array::from_iter(values)); + 
let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); + + let data1 = write_batch_with_cdc(&batch); + let data2 = write_batch_with_cdc(&batch); + assert_eq!(data1, data2, "CDC output must be deterministic"); + } + + #[test] + fn test_cdc_produces_multiple_pages() { + let values = 0..500_000; + let array: ArrayRef = Arc::new(Int32Array::from_iter(values)); + let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); + + let cdc_data = write_batch_with_cdc(&batch); + let no_cdc_data = write_batch_without_cdc(&batch); + + let cdc_pages = get_data_page_bytes(&cdc_data); + let no_cdc_pages = get_data_page_bytes(&no_cdc_data); + + assert!( + cdc_pages.len() > 1, + "CDC should produce multiple pages, got {}", + cdc_pages.len() + ); + assert!( + cdc_pages.len() >= no_cdc_pages.len(), + "CDC pages {} should be >= non-CDC pages {}", + cdc_pages.len(), + no_cdc_pages.len() + ); + } + + #[test] + fn test_cdc_page_reuse_on_append() { + let n = 500_000; + let original_values = 0..n; + let appended_values = 0..n + 100; + let original: ArrayRef = Arc::new(Int32Array::from_iter(original_values)); + let appended: ArrayRef = Arc::new(Int32Array::from_iter(appended_values)); + + let batch1 = RecordBatch::try_from_iter(vec![("col", original)]).unwrap(); + let batch2 = RecordBatch::try_from_iter(vec![("col", appended)]).unwrap(); + + let pages1 = get_data_page_bytes(&write_batch_with_cdc(&batch1)); + let pages2 = get_data_page_bytes(&write_batch_with_cdc(&batch2)); + + let reused = pages1.iter().filter(|p| pages2.contains(p)).count(); + assert!( + reused > 0, + "At least some pages should be reused after append, pages1={}, pages2={}", + pages1.len(), + pages2.len() + ); + } + + #[test] + fn test_cdc_state_persists_across_row_groups() { + let n = 500_000i32; + let all_data: ArrayRef = Arc::new(Int32Array::from_iter(0..n)); + let batch_all = RecordBatch::try_from_iter(vec![("col", all_data)]).unwrap(); + let schema = batch_all.schema(); + let data_one_rg = 
write_batch_with_cdc(&batch_all); + + let props = WriterProperties::builder() + .set_content_defined_chunking(true) + .set_max_row_group_row_count(Some(n as usize / 2)) + .build(); + let mut buf = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); + writer.write(&batch_all).unwrap(); + writer.close().unwrap(); + let data_two_rg = buf; + + let result1 = read_batches(&data_one_rg); + let result2 = read_batches(&data_two_rg); + let concat1 = concat_batches(&result1); + let concat2 = concat_batches(&result2); + assert_eq!(concat1, concat2); + } + + #[test] + fn test_cdc_roundtrip_list() { + let mut builder = ListBuilder::new(arrow_array::builder::Int32Builder::new()); + for i in 0..5_000 { + for j in 0..(i % 5) { + builder.values().append_value(i * 10 + j); + } + builder.append(true); + } + let list_array: ArrayRef = Arc::new(builder.finish()); + + let batch = RecordBatch::try_from_iter(vec![("col", list_array)]).unwrap(); + + let data = write_batch_with_cdc(&batch); + let batches = read_batches(&data); + let result = concat_batches(&batches); + assert_eq!(batch, result); + } + + #[test] + fn test_cdc_roundtrip_multiple_columns() { + let i32_array: ArrayRef = Arc::new(Int32Array::from_iter(0..10_000)); + let str_array: ArrayRef = Arc::new(StringArray::from_iter( + (0..10_000).map(|i| Some(format!("s{i}"))), + )); + let f64_array: ArrayRef = + Arc::new(Float64Array::from_iter((0..10_000).map(|i| i as f64 * 0.1))); + + let batch = RecordBatch::try_from_iter(vec![ + ("ints", i32_array), + ("strings", str_array), + ("floats", f64_array), + ]) + .unwrap(); + + let data = write_batch_with_cdc(&batch); + let batches = read_batches(&data); + let result = concat_batches(&batches); + assert_eq!(batch, result); + } + + // --- Page-level CDC tests ported from C++ chunker_internal_test.cc --- + + #[test] + fn test_cdc_find_differences() { + let diffs = find_differences(&[1, 2, 3, 4, 5], &[1, 7, 8, 4, 5]); + assert_eq!(diffs.len(), 
1); + assert_eq!(diffs[0].0, vec![2, 3]); + assert_eq!(diffs[0].1, vec![7, 8]); + + let diffs = find_differences(&[1, 2, 3], &[1, 2, 3, 4, 5]); + assert_eq!(diffs.len(), 1); + assert!(diffs[0].0.is_empty()); + assert_eq!(diffs[0].1, vec![4, 5]); + + let diffs = find_differences(&[], &[]); + assert!(diffs.is_empty()); + } + + #[test] + fn test_cdc_delete_once() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + + let part1 = make_i32_batch(part_len, 0); + let edit = make_i32_batch(edit_len, 1); + let part2 = make_i32_batch(part_len, 100); + + let base = concat_batches([&part1, &edit, &part2]); + let modified = concat_batches([&part1, &part2]); + + let base_data = + write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + // Verify roundtrip + let base_result = read_batches(&base_data); + let mod_result = read_batches(&mod_data); + assert_eq!(concat_batches(&base_result), base); + assert_eq!(concat_batches(&mod_result), modified); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + assert_eq!(base_pages.len(), 1); + assert_eq!(mod_pages.len(), 1); + + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert_eq!(diffs.len(), 1, "Expected 1 diff, got {diffs:?}"); + let base_sum: i64 = diffs[0].0.iter().sum(); + let mod_sum: i64 = diffs[0].1.iter().sum(); + assert_eq!( + base_sum - mod_sum, + edit_len as i64, + "Diff should account for deleted rows" + ); + } + + #[test] + fn test_cdc_insert_once() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + + let part1 = make_i32_batch(part_len, 0); + let edit = make_i32_batch(edit_len, 1); + let part2 = make_i32_batch(part_len, 100); + + let base = concat_batches([&part1, &part2]); + let modified = concat_batches([&part1, &edit, &part2]); + + let base_data = + 
write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let mod_result = read_batches(&mod_data); + assert_eq!(concat_batches(&mod_result), modified); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + assert_eq!(base_pages.len(), 1); + assert_eq!(mod_pages.len(), 1); + + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert_eq!(diffs.len(), 1, "Expected 1 diff, got {diffs:?}"); + let base_sum: i64 = diffs[0].0.iter().sum(); + let mod_sum: i64 = diffs[0].1.iter().sum(); + assert_eq!( + mod_sum - base_sum, + edit_len as i64, + "Diff should account for inserted rows" + ); + } + + #[test] + fn test_cdc_update_once() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + + let part1 = make_i32_batch(part_len, 0); + let edit1 = make_i32_batch(edit_len, 1); + let edit2 = make_i32_batch(edit_len, 2); + let part2 = make_i32_batch(part_len, 100); + + let base = concat_batches([&part1, &edit1, &part2]); + let modified = concat_batches([&part1, &edit2, &part2]); + + let base_data = + write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + assert_eq!(base_pages.len(), 1); + assert_eq!(mod_pages.len(), 1); + + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert!(diffs.len() <= 1, "Expected at most 1 diff, got {diffs:?}"); + for (left, right) in &diffs { + let left_sum: i64 = left.iter().sum(); + let right_sum: i64 = right.iter().sum(); + assert_eq!( + left_sum, right_sum, + "Update should not change total row count" + ); + } + } + + #[test] + fn test_cdc_update_twice() { + let part_len = i32_part_length(); + let 
edit_len = i32_edit_length(); + + let part1 = make_i32_batch(part_len, 0); + let edit1_old = make_i32_batch(edit_len, 1); + let edit1_new = make_i32_batch(edit_len, 2); + let part2 = make_i32_batch(part_len, 100); + let edit2_old = make_i32_batch(edit_len, 3); + let edit2_new = make_i32_batch(edit_len, 4); + let part3 = make_i32_batch(part_len, 200); + + let base = concat_batches([&part1, &edit1_old, &part2, &edit2_old, &part3]); + let modified = concat_batches([&part1, &edit1_new, &part2, &edit2_new, &part3]); + + let base_data = + write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + + // A double update may produce 0, 1, or 2 diffs depending on whether the + // edits shift CDC boundaries. What must always hold is that the total row + // count within each diff region is unchanged (updates are row-count-neutral). + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert!(diffs.len() <= 2, "Expected at most 2 diffs, got {diffs:?}"); + for (left, right) in &diffs { + let left_sum: i64 = left.iter().sum(); + let right_sum: i64 = right.iter().sum(); + assert_eq!( + left_sum, right_sum, + "Each update diff should not change total row count" + ); + } + } + + /// Verifies that the `primitive_width` fallback in `get_cdc_chunks` (used for + /// f64 and other fixed-width non-integer types) produces correct CDC boundaries. 
+    #[test]
+    fn test_cdc_f64_column() {
+        let part_len = CDC_PART_SIZE / 8; // 8 bytes per f64
+        let edit_len = CDC_EDIT_SIZE / 8;
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col",
+            DataType::Float64,
+            false,
+        )]));
+
+        let make_batch = |len: usize, seed: u64| {
+            let array: ArrayRef = Arc::new(
+                (0..len)
+                    .map(|i| test_hash(seed, i as u64) as f64)
+                    .collect::<Float64Array>(),
+            );
+            RecordBatch::try_new(schema.clone(), vec![array]).unwrap()
+        };
+
+        let part1 = make_batch(part_len, 0);
+        let edit = make_batch(edit_len, 1);
+        let part2 = make_batch(part_len, 100);
+
+        let base = concat_batches([&part1, &part2]);
+        let modified = concat_batches([&part1, &edit, &part2]);
+
+        let base_data =
+            write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None);
+        let mod_data =
+            write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None);
+
+        let mod_result = read_batches(&mod_data);
+        assert_eq!(concat_batches(&mod_result), modified);
+
+        let base_pages = get_page_lengths(&base_data, 0);
+        let mod_pages = get_page_lengths(&mod_data, 0);
+
+        let diffs = find_differences(&base_pages[0], &mod_pages[0]);
+        assert_eq!(
+            diffs.len(),
+            1,
+            "Expected 1 diff for f64 insert, got {diffs:?}"
+        );
+        let mod_sum: i64 = diffs[0].1.iter().sum();
+        let base_sum: i64 = diffs[0].0.iter().sum();
+        assert_eq!(mod_sum - base_sum, edit_len as i64);
+    }
+
+    #[test]
+    fn test_cdc_append() {
+        let part_len = i32_part_length();
+        let edit_len = i32_edit_length();
+
+        let part1 = make_i32_batch(part_len, 0);
+        let part2 = make_i32_batch(part_len, 100);
+        let edit = make_i32_batch(edit_len, 1);
+
+        let base = concat_batches([&part1, &part2]);
+        let modified = concat_batches([&part1, &part2, &edit]);
+
+        let base_data =
+            write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None);
+        let mod_data =
+            write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None);
+
+        let base_pages = get_page_lengths(&base_data, 0);
+ let mod_pages = get_page_lengths(&mod_data, 0); + assert_eq!(base_pages.len(), 1); + assert_eq!(mod_pages.len(), 1); + + let bp = &base_pages[0]; + let mp = &mod_pages[0]; + + assert!(mp.len() >= bp.len()); + for i in 0..bp.len() - 1 { + assert_eq!(bp[i], mp[i], "Page {i} should be identical"); + } + assert!( + mp[bp.len() - 1] >= bp[bp.len() - 1], + "Last original page should be same or larger in modified" + ); + } + + #[test] + fn test_cdc_prepend() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + + let part1 = make_i32_batch(part_len, 0); + let part2 = make_i32_batch(part_len, 100); + let edit = make_i32_batch(edit_len, 1); + + let base = concat_batches([&part1, &part2]); + let modified = concat_batches([&edit, &part1, &part2]); + + let base_data = + write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + assert_eq!(base_pages.len(), 1); + assert_eq!(mod_pages.len(), 1); + + assert!(mod_pages[0].len() >= base_pages[0].len()); + + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert_eq!(diffs.len(), 1, "Expected 1 diff, got {diffs:?}"); + let base_sum: i64 = diffs[0].0.iter().sum(); + let mod_sum: i64 = diffs[0].1.iter().sum(); + assert_eq!( + mod_sum - base_sum, + edit_len as i64, + "Diff should account for prepended rows" + ); + } + + #[test] + fn test_cdc_empty_table() { + let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Int32, false)])); + let empty = RecordBatch::new_empty(schema.clone()); + let data = write_with_cdc_options(&[&empty], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let pages = get_page_lengths(&data, 0); + assert!(pages.is_empty(), "Empty table should produce no row groups"); + + let result = read_batches(&data); + let total_rows: usize = 
result.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 0); + } + + #[test] + fn test_cdc_multiple_row_groups_insert() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + let rg_rows = part_len / 2; + + let part1 = make_i32_batch(part_len, 0); + let edit1 = make_i32_batch(edit_len, 1); + let edit2 = make_i32_batch(edit_len, 3); + let part2 = make_i32_batch(part_len, 100); + let part3 = make_i32_batch(part_len, 200); + + let base = concat_batches([&part1, &edit1, &part2, &part3]); + let modified = concat_batches([&part1, &edit1, &edit2, &part2, &part3]); + + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(rg_rows), + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(rg_rows), + ); + + let base_result = read_batches(&base_data); + let mod_result = read_batches(&mod_data); + assert_eq!(concat_batches(&base_result), base); + assert_eq!(concat_batches(&mod_result), modified); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + + assert!(base_pages.len() > 1); + assert_eq!(base_pages.len(), mod_pages.len()); + + assert_eq!(base_pages[0], mod_pages[0]); + assert_eq!(base_pages[1], mod_pages[1]); + } + + #[test] + fn test_cdc_multiple_row_groups_append() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + let rg_rows = part_len / 2; + + let part1 = make_i32_batch(part_len, 0); + let edit1 = make_i32_batch(edit_len, 1); + let part2 = make_i32_batch(part_len, 100); + let part3 = make_i32_batch(part_len, 200); + let edit2 = make_i32_batch(edit_len, 3); + + let base = concat_batches([&part1, &edit1, &part2, &part3]); + let modified = concat_batches([&part1, &edit1, &part2, &part3, &edit2]); + + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(rg_rows), + ); + let mod_data = write_with_cdc_options( 
+ &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(rg_rows), + ); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + assert!(base_pages.len() > 1); + assert_eq!(base_pages.len(), mod_pages.len()); + + for i in 0..base_pages.len() - 1 { + assert_eq!( + base_pages[i], mod_pages[i], + "Row group {i} pages should be identical" + ); + } + } + + #[test] + fn test_cdc_nullable_column() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + + let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Int32, true)])); + + let make_batch = |len, seed| { + RecordBatch::try_new( + schema.clone(), + vec![Arc::new(generate_nullable_i32_array(len, seed)) as _], + ) + .unwrap() + }; + + let part1 = make_batch(part_len, 0); + let edit = make_batch(edit_len, 1); + let part2 = make_batch(part_len, 100); + + let base = concat_batches([&part1, &part2]); + let modified = concat_batches([&part1, &edit, &part2]); + + let base_data = + write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let mod_result = read_batches(&mod_data); + assert_eq!(concat_batches(&mod_result), modified); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert_eq!(diffs.len(), 1, "Expected 1 diff, got {diffs:?}"); + let mod_sum: i64 = diffs[0].1.iter().sum(); + let base_sum: i64 = diffs[0].0.iter().sum(); + assert_eq!(mod_sum - base_sum, edit_len as i64); + } + + #[test] + fn test_cdc_string_column() { + let part_len = CDC_PART_SIZE / 16; + let edit_len = CDC_EDIT_SIZE / 16; + + let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)])); + + let make_batch = |len, seed| { + RecordBatch::try_new( + schema.clone(), + 
vec![Arc::new(generate_string_array(len, seed)) as _], + ) + .unwrap() + }; + + let part1 = make_batch(part_len, 0); + let edit = make_batch(edit_len, 1); + let part2 = make_batch(part_len, 100); + + let base = concat_batches([&part1, &part2]); + let modified = concat_batches([&part1, &edit, &part2]); + + let base_data = + write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let mod_result = read_batches(&mod_data); + assert_eq!(concat_batches(&mod_result), modified); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert_eq!( + diffs.len(), + 1, + "Expected 1 diff for string insert, got {diffs:?}" + ); + let mod_sum: i64 = diffs[0].1.iter().sum(); + let base_sum: i64 = diffs[0].0.iter().sum(); + assert_eq!(mod_sum - base_sum, edit_len as i64); + } + + #[test] + fn test_cdc_delete_twice() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + + let part1 = make_i32_batch(part_len, 0); + let edit1 = make_i32_batch(edit_len, 1); + let part2 = make_i32_batch(part_len, 100); + let edit2 = make_i32_batch(edit_len, 2); + let part3 = make_i32_batch(part_len, 200); + + let base = concat_batches([&part1, &edit1, &part2, &edit2, &part3]); + let modified = concat_batches([&part1, &part2, &part3]); + + let base_data = + write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert_eq!( + diffs.len(), + 2, + "Expected 2 diffs for double delete, got {diffs:?}" + ); + for (left, right) in &diffs { + let left_sum: 
i64 = left.iter().sum(); + let right_sum: i64 = right.iter().sum(); + assert_eq!( + left_sum - right_sum, + edit_len as i64, + "Each diff should account for one deletion" + ); + } + } + + #[test] + fn test_cdc_insert_twice() { + let part_len = i32_part_length(); + let edit_len = i32_edit_length(); + + let part1 = make_i32_batch(part_len, 0); + let edit1 = make_i32_batch(edit_len, 1); + let part2 = make_i32_batch(part_len, 100); + let edit2 = make_i32_batch(edit_len, 2); + let part3 = make_i32_batch(part_len, 200); + + let base = concat_batches([&part1, &part2, &part3]); + let modified = concat_batches([&part1, &edit1, &part2, &edit2, &part3]); + + let base_data = + write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let mod_data = + write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + let base_pages = get_page_lengths(&base_data, 0); + let mod_pages = get_page_lengths(&mod_data, 0); + + let diffs = find_differences(&base_pages[0], &mod_pages[0]); + assert_eq!( + diffs.len(), + 2, + "Expected 2 diffs for double insert, got {diffs:?}" + ); + for (left, right) in &diffs { + let left_sum: i64 = left.iter().sum(); + let right_sum: i64 = right.iter().sum(); + assert_eq!( + right_sum - left_sum, + edit_len as i64, + "Each diff should account for one insertion" + ); + } + } +} diff --git a/parquet/src/column/chunker/cdc_codegen.py b/parquet/src/column/chunker/cdc_codegen.py new file mode 100644 index 000000000000..3675c92d0281 --- /dev/null +++ b/parquet/src/column/chunker/cdc_codegen.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Produce the given number gearhash tables for rolling hash calculations. + +Each table consists of 256 64-bit integer values and by default 8 tables are +produced. The tables are written to a Rust source file. + +The generated numbers are deterministic "random" numbers created by MD5 hashing +a fixed seed and the table index. This ensures that the tables are the same +across different runs and platforms. The function of generating the numbers is +less important as long as they have sufficiently uniform distribution. + +Reference implementations: +- https://github.com/Borelset/destor/blob/master/src/chunking/fascdc_chunking.c +- https://github.com/nlfiedler/fastcdc-rs/blob/master/examples/table64.rs + +Usage: + python cdc_codegen.py [ntables] + + ntables: Number of gearhash tables to generate (default 8). + + The generated source file is written to ./cdc_generated.rs +""" + +import hashlib +import pathlib +import sys +from io import StringIO + + +template = """\ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This table should be identical with +// https://github.com/apache/arrow/blob/main/cpp/src/parquet/chunker_internal_generated.h +// Ensure that both tables remain in sync after any changes. + +#[rustfmt::skip] +pub(crate) const NUM_GEARHASH_TABLES: usize = {ntables}; + +#[rustfmt::skip] +pub(crate) const GEARHASH_TABLE: [[u64; 256]; NUM_GEARHASH_TABLES] = [ +{content}]; +""" + + +def generate_hash(n: int, seed: int): + """Produce predictable hash values for a given seed and n using MD5. + + The value can be arbitrary as long as it is deterministic and has a uniform + distribution. The MD5 hash is used to produce a 16 character hexadecimal + string which is then converted to a 64-bit integer. 
+ """ + value = bytes([seed] * 64 + [n] * 64) + hasher = hashlib.md5(value) + return hasher.hexdigest()[:16] + + +def generate_hashtable(seed: int, length=256): + """Generate and render a single gearhash table.""" + table = [generate_hash(n, seed=seed) for n in range(length)] + + out = StringIO() + out.write(f" // seed = {seed}\n") + out.write(" [\n") + for i in range(0, length, 4): + values = [f"0x{value}" for value in table[i : i + 4]] + out.write(f" {', '.join(values)},\n") + out.write(" ]") + + return out.getvalue() + + +def generate_source(ntables=8, relative_path="cdc_generated.rs"): + """Generate a Rust source file with multiple gearhash tables.""" + path = pathlib.Path(__file__).parent / relative_path + tables = [generate_hashtable(seed) for seed in range(ntables)] + content = ",\n".join(tables) + text = template.format(ntables=ntables, content=content) + path.write_text(text) + + +if __name__ == "__main__": + ntables = int(sys.argv[1]) if len(sys.argv) > 1 else 8 + generate_source(ntables) diff --git a/parquet/src/column/chunker/cdc_generated.rs b/parquet/src/column/chunker/cdc_generated.rs new file mode 100644 index 000000000000..4222e3669245 --- /dev/null +++ b/parquet/src/column/chunker/cdc_generated.rs @@ -0,0 +1,558 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#[rustfmt::skip] +pub(crate) const NUM_GEARHASH_TABLES: usize = 8; + +#[rustfmt::skip] +pub(crate) const GEARHASH_TABLE: [[u64; 256]; NUM_GEARHASH_TABLES] = [ + // seed = 0 + [ + 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, + 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, + 0xd2fff45d1989642d, 0x80217c1c600e30a6, 0xb9469ee2e43df7ac, 0x3654b76a61999706, + 0x6ea73dfe5de0c6b6, 0xdfd662e1937a589d, 0x0dbe0cc74b188a68, 0xde45f4e6d73ffc6f, + 0xcdf7a7759e70d87e, 0x5d6a951b8d38c310, 0xdc9423c3813fcf2c, 0x25dc2976e167ffce, + 0xc2555baa1d031c84, 0x115bc3f2230a3ab6, 0xd4b10260f350bede, 0xdfd3501ab447d723, + 0x022e79217edaf167, 0x1635e2255c5a7526, 0xa0a750350cc77102, 0xc027133e05d39f56, + 0xd949459779cf0387, 0xb92f1464f5c688c2, 0xd9ac5f3e8b42f2f3, 0xdf02bb6f5ecaac21, + 0x8156f988fac7bfa4, 0xe4580f97bede2ec8, 0x44fe7d17a76fca32, 0x885f59bd54c2014c, + 0x435e63ec655ffae9, 0x5ebc51930967b1f1, 0x5428c2084ac29e47, 0x9465938fec30e36b, + 0xc7cb3de4977772cd, 0x15692d7c201e8c3a, 0x505ee65cdc4b17f4, 0x7d9839a0a7aead6b, + 0xeef5f5b6a0105291, 0x76c2fb232ce7f5bf, 0x5c13893c1c3ff3a9, 0x65b6b547d4442f98, + 0xb8ad7487c8c96fce, 0x906bcf51c99974f8, 0x2f56e48bb943a48c, 0xbc9ab109f82d3a44, + 0xcd5160cdc8c7e735, 0xbe9acb9df3427732, 0x386b91d477d7fade, 0x36be463621dd5af2, + 0xcbe6a2faffd627a8, 0x9c8fd528463a2f5a, 0xb9b88c6bb802b184, 0xb414b4e665c597c7, + 0xbedb142568209556, 0x5360d81c25429dce, 0x63a69a960a952f37, 0xc900d63899e1b503, + 0x1abc63a8b37c7728, 0xa8b3a8b6409080eb, 0x495e391f662959f6, 0xdf1e136f3e12229b, + 0x33d5fc526b0dd38d, 0x321221ae2abfac63, 0x7fde18351fda7395, 0xed79fe5c3a6aa4c3, + 0x2dd6965a4867d8d4, 0x54813ca20fe8799b, 0x5d59ea6456465c39, 0x0de0c294d1936b81, + 0x4aaf0755002c588c, 0x3530a1857ad04c6d, 0xb8a64f4ce184442b, 0xe0def10bceedfa17, + 0x46e38d0a443757ec, 0x9795a1c645ee16d7, 0x7e531def245eac8a, 
0x683b25c43a0716cf, + 0x884583d372da219d, 0x5b06b62c910416e5, 0x54b6902fbebd3dbe, 0x931198d40a761a75, + 0xead7d8e830013590, 0x80b4d5dc99bfaced, 0xf98272c8108a1ad2, 0x1adce054289a0ec6, + 0x7d53a1143c56b465, 0x497fbe4f00c92b52, 0x525e4cc2e81ebd69, 0xc94478e0d5508ff6, + 0xb8a5da83c196d07c, 0x7667a921b65b0603, 0xf236fabbdefe6cd1, 0x53da978d19a92b98, + 0xc604f6e97087124d, 0x2cbd27221924b094, 0x65cd1102c985b1d2, 0x08c0755dc1a97eb4, + 0x5e0419e921c0fef1, 0x282d2c1196f84a29, 0xe21117fcfc5793f7, 0xcf4e985dc38e6c2e, + 0xd521f4f264d55616, 0xde69b04c485f2a10, 0x59410e245305178a, 0xceab1d477c943601, + 0xa9805732d71ee5e9, 0x054cd443896974f6, 0xf2b517717a423a3e, 0x09517937fa9fac95, + 0x4938233e9ca871e3, 0x9132cbaf56f83ec0, 0x4703421ed1dd027d, 0xfd9933f4e6f1ec4e, + 0xf237c7fded2274a8, 0xdf4616efe68cd7b4, 0x5e46de0f39f0a380, 0x3d41e0c6d8e095b0, + 0xc5272f8a5bb2df09, 0x68aa78e8301fb964, 0xbf5b5b52c8e32ae0, 0xbf28ed3df74bdcf7, + 0xd6198f64c833815a, 0x8cd99d2974267544, 0xd90560ea4465ff2c, 0x571d65ad7ad59261, + 0x309453518baa367a, 0xa60538377bc79fb2, 0xace515da1ab4183c, 0xf56d3c8d891d1c5b, + 0x5b0d8370b59def49, 0x775866ce7c83c762, 0x3d76085695c8e18a, 0xba064d1a9af1b114, + 0xc84ef7cd7b98b521, 0x90b9231681c2bc37, 0x37e2b13e6f585b6b, 0x1d0a34e55e0f369f, + 0x86bb8019cf41447c, 0x4b95c6ef55b3f71f, 0x3b6ed1660732b310, 0x617eee603d137f21, + 0xf4f6278b464f3bbc, 0xdfb763b720da205a, 0x353478899b871cb7, 0xe45fbbff574cc41e, + 0x1a94b60847907d72, 0xb10eef051eff67a5, 0xf0e012ec6a284d40, 0xcc1cd1a11b926d7c, + 0xcf9d9c5453e19cad, 0x270febcc0fc0e86b, 0xd6567568778b781e, 0x7323b98965eeb46b, + 0xccecd374567086ff, 0xef7b44bfc497a704, 0xebc479c051a9f0a5, 0xc9b7410e3e00a235, + 0x1d084f7ecdf83dab, 0xc8a9a97e33ba8ba3, 0x8c75318f5b2350d6, 0xaa3cd5d0c684bdda, + 0xa81125fe0901bedf, 0xf7bcd76020edfc93, 0x834ee4c12e75874f, 0xb2bb8a7beb44fa14, + 0x32cd26f50a4f4e4d, 0x0fc5817ca55d959a, 0xd6e4ae2e3ae10718, 0x074abdcceb8d6e38, + 0xc0cc5f4f9b3a9c43, 0x1115d364363595b2, 0x69861db2eb19f2e8, 0x59b8d804cf92bc67, + 
0x9bac9785e5e4b863, 0x7fa0e17a41869561, 0x10d3c9633f0c709c, 0x534a03deee6bc44a, + 0x73b1f7201257f581, 0x46fd6a11e2e0706b, 0x494abb554946e67a, 0xb5d6da317864dc8e, + 0x402ded9238f39687, 0xd8fa37d2cbd6d290, 0xcc818293fcb06791, 0x6482ab344806cd4d, + 0x0956e6ee9d8eb60b, 0x01fee622d8465ac8, 0xae7ece370cbd9c35, 0x7ff09e937a177279, + 0xa2c29ee7a33ca5f1, 0x990e8dbee083923b, 0x4a819b72f610863a, 0xddecfad79d3f08be, + 0x627372480fac20a7, 0x802154d6eca2db4c, 0x8fcf02e42f805e55, 0x040a911ff8cea977, + 0xbb544485bc64d0d4, 0xaddde1aeb406d0fb, 0xf6b35fae23dce66f, 0xc07a9fb3645d2f9b, + 0xccd113907e9c0fed, 0xd17af369984fd213, 0x9223823c59a083e7, 0xe19d475606b81013, + 0xe181ac116a90e57a, 0x71f7b6258c6def4c, 0x2246f34b45964f7c, 0xd74aedaea2d31751, + 0xb1add86e5dd305d1, 0xeb9ba881f16d6471, 0xef7600e036f5c6ff, 0x1d50bc9735b8fb85, + 0xe63942bd1f3e2969, 0x9241ba9f8b3f4e72, 0xee8bb2bca07d35b6, 0x55cd55dab522654e, + 0x94d0cfa7c1a6845d, 0x02f9845d559884c3, 0x8ce70ea21063b560, 0xd70998028ef08b74, + 0xdfdb5bbee310876b, 0x4e21b2e348256d16, 0xde007a981c13debc, 0xe51950cbbddabfdd, + 0xd223301dbe9957c1, 0x084b8634cc2cce4b, 0x90e551378aa9d70c, 0x833b533ac633e448, + 0x7891e232882da57f, 0xa1bf26f0163ce2b3, 0xf33a0171eb9c68d5, 0x2e7de18ca69b3fa2, + 0x666fd6f175619199, 0x1239d37edb5feb9f, 0xfa9fc9382e61ff5c, 0x3ca4ad427e3c126f, + 0x37c6dd4c2c31ae6e, 0x1f1bacb619d427b2, 0x7dd09f5d10759afe, 0xc8d941432327d733, + 0x2b389ba25e1d43a7, 0xa4e3030c3740ff21, 0xcc56dae13fd37463, 0x2481457c175b560f, + 0x9deb35bde77c5c41, 0x847aa6ea5549a0c3, 0xcde01bb48b6e7f02, 0x15a28844e64cb211, + ], + // seed = 1 + [ + 0xecfcba92fe5691a3, 0x71377799fea34699, 0xb284c9096fa614e5, 0x54534170f40de6c8, + 0xbbd804d45884fba3, 0x44929a896388c8a1, 0x79b712508e0fa3b1, 0xeb53ab280af31054, + 0x351ea23a6319da7a, 0x2fbe55d9819d85a2, 0x34f4b6568dcd28b1, 0x8c94ea5e5d82967a, + 0x09068d333a46d3c5, 0x762ad4f64cb73381, 0xd5c6db5ef0e22640, 0x36d8ab5a36175680, + 0xd41fe333cdc3525a, 0xa1f51dbdf20ce781, 0x1410a95e786c8be6, 0x96b7499a670c2b41, + 
0x3912e1037835d893, 0x272c5bd83e1e9115, 0x2ea7f91cad82a0d6, 0xcd10e85662ce9931, + 0xedad49be8d5e8b74, 0x7ccd8fe0f37d12bc, 0xfac0482005eed593, 0x4513991681f6c8b0, + 0x2804d612eb0ad37d, 0x7cca9e8412b81d34, 0x85ffd6707192b7b8, 0xea0560aeea954411, + 0x0122d28226102bba, 0xf51c47cdbd22fdd1, 0x3707d851183ff17c, 0xaef5a1465f3e902d, + 0xbcb38c2d8736a04f, 0x4025317e864bef15, 0x8d3f66d86e1ea58f, 0xc16759a3d97ed79a, + 0x1c62abdc0659f2f5, 0x23b3eb4e699bd28f, 0x5083c4fceed3ccaf, 0xa65bf34562cc989c, + 0xaa5865932fd79064, 0xf24d08d268c24593, 0x7fbd00a215196999, 0x7812cd366d752964, + 0x62e8dcb27ef3d945, 0xf08b7984e1b946dc, 0x547d23ad9a5c1dcf, 0x496b1fb249b27fb7, + 0xcd692e1db5f3b3ba, 0x41931e39f1e1bc61, 0x286c6a7d7edae82b, 0x17ef6638b6c4ca6e, + 0x609beb5a2576a934, 0xcc5e16fe4a69b83c, 0xbbd14d08b078fc24, 0x2a617680f481cb94, + 0x81dbbd5f86e6d039, 0xeb8205e1fc8ecc3c, 0xe5e3bb576faa8042, 0x5d6f1eb9d9df01b5, + 0x9a47b8739c10fb44, 0x398a7caad7ea7696, 0x9c0fc1d7c46adde6, 0x67cd6de0a51978a6, + 0x68ccc4b77a21cca4, 0x1e067066b82f415c, 0xf7ddade6535e1819, 0xf2185c884291751b, + 0xc322b7381fcbe34f, 0x242f593e88290b9b, 0x8e11ccc0ea5e84a3, 0x40e3a2e3346db8a2, + 0xf18bfc3ad2931a2c, 0x2468397394b00144, 0xeae199cce14e6817, 0x05b462686c75a1ae, + 0xda096cb859c51673, 0xd87aeb967a906bef, 0xaabc74493cb02fe6, 0x74d48fc2e7da143e, + 0x6ec1c8fed3f2c1fd, 0xe01e0704b463f18e, 0xc3d88a4d3a8056e4, 0xd01ae0ffab6c8f3f, + 0x881ba052620ae7c7, 0xcea033aef0a823a5, 0x8d2cad91d83df1e3, 0x18746d205e66dbe9, + 0x3061f8e58d046650, 0xd819c59f0ce2cf8b, 0x144e89e93635e870, 0x3415e88279b21651, + 0xd6f7ab944b86c3fa, 0x45f1dd15d0f67bdc, 0xbf0d97c7f4fa24f4, 0x34a7de520a57fcd2, + 0x4ba86fda03e9e2bc, 0xa7995265a025b552, 0x698f6819d5f51cf7, 0xd07dbe9d8a156981, + 0x2683945373857fc1, 0x116f8a84f96167de, 0x8bc832bd85595ebf, 0xb206519d74fdfafa, + 0xde9519b2e9b5cc5f, 0x16fdd6f2da1d8163, 0x7ba32bd48ef56f11, 0x6f4e4d7ee8b29717, + 0xd31576dde7468aad, 0x023bb08848676045, 0xf6dcc083178160b7, 0x42035f426250e683, + 0x343732993cfed89f, 
0x0640a870a22d3d58, 0x65cff80b53b4ae6a, 0x27996fa17ab05215, + 0xfd5db01401b21a04, 0x894508784bc1673c, 0x5bfcf43a2380e27d, 0x4cd6dcc2715583b7, + 0xa43b3763e7d4c902, 0x6da83e12ef0c1257, 0xfe80a602b0335aff, 0x293a7d8f4ff344de, + 0xb4ae7c2b8956bf5a, 0x6b45432d38254b4d, 0xd086acbdf15d9455, 0xa4d19e43f41ea87b, + 0xf01f13ba4bb87fbf, 0xca582cf301a299ff, 0x0ddad3d45298fa7d, 0x0646a130459c3999, + 0xc08e3af3747e2cee, 0xfc7db8aa9ed67295, 0x783b329e7bd79d5f, 0x732dbc607957af7b, + 0x8e446ac19fb26555, 0xff1dfa4d61dc89a5, 0xb6fbc46bd8d011d8, 0x185147ec5779f0d7, + 0x6eb2cf6149a5380f, 0xb0e773df803a1eae, 0xc07706c5519bfce5, 0xc35abcf54fa95f14, + 0x40a01d99a38608ea, 0x776dcd6f603c277f, 0x6ae12389b1d6d0bb, 0x8bd981448df92bb9, + 0x426a6a7ca21a2c16, 0x87efd5b71c1bad26, 0x71fb7fc4cd41de48, 0xdd9033c45619d463, + 0x40eaab322654cef7, 0xe077fffed6f3e3a2, 0x375a4dbef9384447, 0x2066b009d2c4a100, + 0xeca4a5794a068447, 0x2128f64bddf341a1, 0x738b4bb1be90bd61, 0x433772cf3813d52e, + 0x9540c88add8e4474, 0x0b6d5decd21d3519, 0x654ead966745642d, 0xe1bfb03c3b4bdb4c, + 0x0b977a9937515b1f, 0x0a4587509ef63870, 0xe89f0de1d9cfd44a, 0x23a91390272e7f68, + 0xd92defbc9096b8d8, 0x004db87174612539, 0xc88ecaabdd1a71f1, 0x050de38393073346, + 0x8af1426d7964e038, 0xf352c4fef8ad5c87, 0x6f26bc7408e26548, 0x0d41543fd9bf3084, + 0xfc4e07553a840fc6, 0x5ef117de86a555a9, 0x1f11c42dffb5ae1b, 0x4147648f07490fa5, + 0x09b35fd7671b21aa, 0x1453b14f7ccca481, 0x944f6fcce4c9b2ba, 0x5b08dd2e3583dc06, + 0xe0220df78dc9c22d, 0x1c200b9506cbf666, 0x8a0b7465eadb523b, 0xfbcb43a91a1e2d80, + 0xe697f44be3c36a58, 0x2f8a8e48fb7e350d, 0x7baba71b8920d55f, 0x10edc0216105bc96, + 0x52db07c79d7a7a63, 0x1916e8cef9452ac3, 0x5cbbbf21f867b6cc, 0xadd583365a690a4b, + 0x4e4ca2c8bffc2fdb, 0xf5fe3416d2eebcfe, 0x839af8b85e452476, 0x8496c0c54ad44e16, + 0x6c46f1ecad4482bf, 0xb794cad76ae18715, 0x67b762eec7c62985, 0x52dc9e68df5b3a53, + 0x0cc7e444b422a5f9, 0xadbfe90841c112b0, 0xfe37b136f0ca5c34, 0xcfe9e47948a8d73e, + 0xee90572b86a30d91, 0x549e72d8262830aa, 
0x3361564b469f32c6, 0x1e6eba9e0d2648e2, + 0x5f8e2b2ac5fcb4eb, 0xe4224fa5f71f7cc6, 0x7357a9230c76757b, 0xcad70f74aaf6b702, + 0xeef28ced23894cc2, 0x753fdd3352aefd68, 0x1fed6ba90bbeb9d2, 0x05316f4ab4034b4b, + 0x3396df022b9f63d6, 0x82d7125a7cfd0935, 0x3519a71caf1f87f0, 0xd1dfb7a5cc3974be, + 0xbfae40ecbdbbcc2a, 0x152c11778e08dd54, 0x4a96566a6c848554, 0x3a84d621c340cdd7, + 0xfd47aa1887e2fb03, 0xa63cae94b2f1d099, 0xed61783f3e5b75e0, 0xefd44864106019be, + 0x145ff78b80b081aa, 0x34670e5fcea9230e, 0x876ef976328db371, 0x4221f3a5269942a6, + 0x95315cbd85c648f4, 0x3ca344dc7c3b1600, 0x38421ea39ff28780, 0x31dbeee967c0435c, + 0x27437c3e268402e7, 0xdd0cf8343312a654, 0x965ab9dad1d8aa29, 0xf871706dd3e23509, + 0xce23d06c7a25e699, 0x1b37d59382b27589, 0x3407f004723d6324, 0x56efb69cdb5deaa1, + 0xf46cdd2b9fd604e0, 0xcad3ca79fdac69bd, 0x7252802a574e63cb, 0xc281fb8acc6ec1d3, + ], + // seed = 2 + [ + 0xdd16cb672ba6979c, 0x3954eaa9ec41ae41, 0x52cb802771d2966d, 0xf57ed8eb0d0294f2, + 0x768be23c71da2219, 0x6131e22d95a84ad3, 0xd849e4e49bb15842, 0x18e8e5c4978cf00d, + 0x3af5e5867ce1f9bd, 0x06c75a9fffe83d63, 0xe8de75a00b58a065, 0x0a773251bc0d755a, + 0x629dc21e54548329, 0x2a168f5e5a883e70, 0x33547375f0996c86, 0xdfcb4c7680451322, + 0x55c1ecaaaa57e397, 0x4546c346c24f5a31, 0x6f8f0401dfabc86c, 0x7760d2d36ee340b4, + 0xf6448e48bdeb229d, 0xba70e1633b4dba65, 0x069cda561e273054, 0xa010b6a84aebf340, + 0x5c23b8229eee34b6, 0xea63c926d90153af, 0x7d7de27b3e43ec1b, 0xea119541eddc3491, + 0xf1259daeddfc724c, 0x2873ca9a67730647, 0xa1e7710dade32607, 0x758de030b61d43fd, + 0xd2c9bcbfa475edb4, 0x18ade47bb8a0aa29, 0xf7a74af0ff1aea88, 0x6f8873274a987162, + 0x6963e8d876f4d282, 0xd435d4fe448c6c5b, 0x93ec80ba404cafff, 0xcf90d24c509e41e7, + 0x5f0fc8a62923e36e, 0x9224878fe458f3a4, 0xd9a039edf1945bcd, 0x0877d1892c288441, + 0x75205491f4b4740b, 0x30f9d2d523a9085b, 0x4b7f4029fa097c99, 0x170bb013745709d4, + 0x7087af537f11ef2e, 0x28c62b88e08fc464, 0x84bbcb3e0bb56271, 0x485a4b099165c681, + 0x357c63357caa9292, 0x819eb7d1aee2d27e, 
0xdaa759eb9c0f8c9d, 0x42cdc36729cc3db5, + 0x9489aa852eddbb06, 0x8161e4f85a84e6d4, 0xa964863fdad3eb29, 0xcc095ddbce1a6702, + 0x3ecfadbb8dc2ce58, 0x971316509b95a231, 0xc8f484d1dbc38427, 0xae9c510c463574c0, + 0xdf2b31179600c21a, 0x440de87bada4dfa3, 0xbd8d30f3f6fb7522, 0x84e6d7f678a0e2d0, + 0x0ec4d74323e15975, 0xf6947610dad6d9ab, 0x73a55a95d73fe3a5, 0x3e5f623024d37eda, + 0x8d99a728d95d9344, 0x8b82a7956c4acdc4, 0x7faeaea4385b27f6, 0x540625ff4aa2ff21, + 0x4aa43b3ebd92ce2b, 0x899646a6df2da807, 0x49225115780942d7, 0xe16606636af89525, + 0xb980bcf893888e33, 0xf9ed57695291b0d8, 0x5c6dd14464619afa, 0x50606d69b733d4f3, + 0x7fb1af465b990f97, 0x3fab2634c8bbd936, 0x556da6168838b902, 0x0f15975902a30e1f, + 0xb29d782ae9e1991f, 0xae00e26ff8f7e739, 0xd3da86458bb292d5, 0x4528ee0afb27e4ce, + 0x49882d5ba49fabad, 0x7e873b6a7cf875ee, 0x777edd535113c912, 0x94ed05e7ff149594, + 0x0b8f95fc4211df43, 0x9135c2b42426fef2, 0x411e6c2b47307073, 0x503207d1af0c8cf8, + 0xd76f8619059f9a79, 0x64d24617855dee45, 0xf7bc7a877923196a, 0xd6cc42ed6a65be79, + 0xe3912ff09d4fc574, 0x4192d03b2bc2460a, 0xa0dcc37dad98af85, 0xfc59049b2a5818a4, + 0x2128bae90a5b975f, 0xbe7067ca05ea3294, 0x5bab7e7753064c4f, 0x42cbf0949ef88443, + 0x564df4bbd017492c, 0xf2c2eb500cf80564, 0x5b92e67eb00e92af, 0x8c4103eef59c0341, + 0x83412122b8284998, 0x888daf2da0636b6d, 0x4d54b10303dd07d6, 0x201190e7c1e7b5ed, + 0x3797510bb53a5771, 0x03f7bc598b570b79, 0xdc1e15d67d94f73e, 0x721e8b499ebe02c1, + 0x71f954f606d13fa0, 0x0c7a2e408c168bf0, 0x07df2ef14f69c89d, 0xe295096f46b4baaf, + 0x7a2037916438737e, 0xd1e861aeaf8676ea, 0xb36ebdce368b8108, 0xb7e53b090ddb5d25, + 0x5a606607b390b1aa, 0x475e52994f4a2471, 0xbcc2038ba55b2078, 0x28b8a6b6c80df694, + 0xb5f0130ec972c9a2, 0x7a87cd2a93276b54, 0x4d0eec7ecf92d625, 0xac1a8ce16269a42e, + 0xa4ca0237ca9637b8, 0xd8dc8ff91202b6ff, 0x75b29846799d7678, 0x761b11a5edd9c757, + 0xf2581db294ef3307, 0xe3173c2b6a48e20f, 0xe46fd7d486d65b3c, 0x1352024303580d1f, + 0x2d665dae485c1d6d, 0x4e0905c825d74d3b, 0x14ff470c331c229e, 
0xbdc656b8613d8805, + 0x36de38e396345721, 0xaae682c1aa8ff13b, 0x57eb28d7b85a1052, 0xf3145290231d443a, + 0xd0f68095e23cbe39, 0x67f99b3c2570b33d, 0x54575285f3017a83, 0x9b2f7bb03d836a79, + 0xa57b209d303367a9, 0x7ccb545dd0939c79, 0x1392b79a37f4716d, 0x6e81bb91a3c79bcd, + 0x2c2cd80307dddf81, 0xb949e119e2a16cbb, 0x69625382c4c7596f, 0xf19c6d97204fb95c, + 0x1b2ea42a24b6b05e, 0x8976f83cd43d20ac, 0x7149dd3de44c9872, 0xc79f1ae2d2623059, + 0xca17a4f143a414e1, 0x66d7a1a21b6f0185, 0xed2c6198fe73f113, 0x16a5f0295cbe06af, + 0x5f27162e38d98013, 0xf54d9f295bdc0f76, 0x9ba7d562073ef77b, 0xa4a24daaa2cfc571, + 0x49884cf486da43cd, 0x74c641c0e2148a24, 0xbff9dcbff504c482, 0xf8fc2d9403c837ab, + 0x6ccc44828af0bb1e, 0xbcf0d69b4c19dfdb, 0x8fe0d962d47abf8f, 0xa65f1d9d5514271d, + 0x26ff393e62ef6a03, 0xc7153500f283e8fc, 0xea5ed99cdd9d15cd, 0xfc16ac2ba8b48bb7, + 0xf49694b70041c67a, 0xbd35dd30f5d15f72, 0xcf10ad7385f83f98, 0x709e52e27339cdc2, + 0xe9505cb3ec893b71, 0x2ffa610e4a229af7, 0x12e1bc774d1f0e52, 0xe301a3bb7eacccc8, + 0x1fdd3b6dcd877ebf, 0x56a7e8bda59c05aa, 0x99acd421035d6ab4, 0xfd21e401cecd2808, + 0x9a89d23df8b8d46f, 0x4e26b1f1eb297b9c, 0x9df24d973e1eae07, 0xe6cdc74da62a6318, + 0xfc360d74df992db0, 0xf4eca0a739514c98, 0x481c515ba9bf5215, 0xce89cce80f5f3022, + 0xf487a10fc80e4777, 0x235b379a87e41832, 0x76f72e028371f194, 0xd044d4a201325a7d, + 0x47d8e855e0ffbdde, 0x268ae196fe7334b0, 0x123f2b26db46faa8, 0x11741175b86eb083, + 0x72ee185a423e6e31, 0x8da113dfe6f6df89, 0x286b72e338bbd548, 0xa922246204973592, + 0x7237b4f939a6b629, 0x31babda9bedf039a, 0xb2e8f18c6aeec258, 0x0f5f6ce6dd65a45e, + 0x8f9071a0f23e57d3, 0x71307115ba598423, 0xcbe70264c0e1768c, 0x1c23729f955681a8, + 0xfbc829099bc2fc24, 0x9619355cbc37d5d6, 0xea694d4e59b59a74, 0xb41cf8d3a7c4f638, + 0xae1e792df721cd0b, 0x7cd855d28aac11f6, 0xca11ba0efec11238, 0x7c433e554ce261d8, + 0xe3140366f042b6ba, 0x8a59d68642b3b18c, 0x094fcdd5d7bccac2, 0x9517d80356362c37, + 0x4a20a9949c6c74e8, 0xc25bcf1699d3b326, 0xa8893f1d1ed2f340, 0x9b58986e0e8a886e, + 
0x29d78c647587ce41, 0x3b210181df471767, 0xd45e8e807627849d, 0x1ec56bc3f2b653e3, + 0x974ff23068558b00, 0xdb72bdac5d34262c, 0x23225143bb206b57, 0xd0a34cfe027cbb7e, + ], + // seed = 3 + [ + 0x39209fb3eb541043, 0xee0cd3754563088f, 0x36c05fc545bf8abe, 0x842cb6381a9d396b, + 0xd5059dcb443ce3bf, 0xe92545a8dfa7097e, 0xb9d47558d8049174, 0xc6389e426f4c2fc0, + 0xd8e0a6e4c0b850d3, 0x7730e54360bd0d0d, 0x6ecb4d4c50d050d5, 0x07a16584d4eb229f, + 0x13305d05f4a92267, 0xb278ddd75db4baec, 0x32381b774138608f, 0x61fe7a7163948057, + 0x460c58a9092efee6, 0x553bf895d9b5ff62, 0x899daf2dabfd0189, 0xf388ab9c1c4b6f70, + 0xd600fe47027ea4cd, 0x16d527ec2b5ef355, 0x5ac1f58ff6908c81, 0xa08d79ff8ee9ffe8, + 0xc1060a80b7a5e117, 0x14b2c23118c60bda, 0x8cc0defbb890df8f, 0xe29540fd94c6d28b, + 0xa604f003f82d5b71, 0xa67583d4eb066d18, 0xd62cbd796322b3fc, 0x070cfe244cdcccf3, + 0x73557c30b3af47e5, 0x2e544e31153a2163, 0x996eef7464d5bead, 0xbc71cb5ab0586cdc, + 0x0bfcb6c1b517ed69, 0x62b4f1fcc82e8ca0, 0x0edbc68f544965c5, 0x40fa39baa24af412, + 0xf39aeb2413dab165, 0x17e6013e7afee738, 0x8109bff1c8d42a9d, 0x3cd99863390989b5, + 0x02021a4cc9c336c8, 0xa06060778cb60aa4, 0xd96591db60bc1e06, 0xd2727175183f4022, + 0xcdc1f1c5bce3e7ce, 0xb393ccc447872a37, 0xdf6efe63257ead3a, 0x20729d0340dbceb6, + 0x9f3d2d26fc0ea0d7, 0xf392e0885189bd79, 0xdf2ee01eb212b8b6, 0x6e103a0c0f97e2c3, + 0x96c604a763bd841b, 0x9fc590c43bba0169, 0xf92dcd5ddc248c40, 0x113a8b54446941dc, + 0x5943eda146b46bb8, 0xbf657901a36a39a7, 0x5a4e0e7ea6568971, 0xb94c635bae9f9117, + 0x2626fb65b3a4ef81, 0xa59bfd5478ce97de, 0x79112ba9cc1a1c63, 0xf41f102f002cf39c, + 0x0a589bcbfb7ff1c8, 0xa1478c53540c4fa1, 0x60d55e72c86dfaca, 0x312e7b6840ea7a39, + 0x8aae72dcccfe1f75, 0xff2f51f55bf0247a, 0x3c2e4b109edb4a90, 0x5c6d73f6525c7637, + 0xe49acb04a199f61c, 0x27860642d966df7f, 0x541ce75fb1e21c30, 0xd9fcd6f90806c7cc, + 0xb87c27bc93a7969b, 0x92f77a1179b8f8dc, 0xb1f29379deb89ed4, 0x7e63ead35808efe7, + 0x13545183d7fa5420, 0x575f593e34cf029d, 0x27f1199fb07344ae, 0xe67f95f7dc741455, + 
0x49b478b761ab850b, 0xd7bedf794adfc21e, 0xdc788dcd2dda40ae, 0x14673eb9f4d8ad35, + 0x0cced3c71ecf5eb1, 0xe62d4e6c84471180, 0xdfe1b9e2cb4ada7d, 0x70185a8fce980426, + 0x0ce2db5e8f9553d6, 0x1fedc57bb37b7264, 0xb9310a2e970b3760, 0x989ff8ab9805e87d, + 0x0b912d7eb712d9ee, 0x1fe272830379e67c, 0x16e6a73aff4738fb, 0xeed196d98ba43866, + 0x7088ca12d356cbe2, 0x23539aa43a71eee0, 0xed52f0311fa0f7ad, 0xa12b16233f302eea, + 0xc477786f0870ecb4, 0xd603674717a93920, 0x4abe0ae17fa62a4c, 0xa18f1ad79e4edc8d, + 0xc49fe6db967c6981, 0xcc154d7e3c1271e9, 0xdd075d640013c0c0, 0xc026cd797d10922a, + 0xead7339703f95572, 0x4342f6f11739eb4b, 0x9862f4657d15c197, 0x4f3cb1d4d392f9ff, + 0xe35bffa018b97d03, 0x600c755031939ad3, 0xb8c6557ffea83abf, 0x14c9e7f2f8a122ea, + 0x0a2eb9285ee95a7c, 0x8823fec19840c46f, 0x2c4c445c736ed1d0, 0x83181dff233449f1, + 0x15ed3fca3107bef5, 0x305e9adb688a4c71, 0x7dbef196f68a3e2e, 0x93e47ece3e249187, + 0x8353c5e890ead93c, 0xea8a7ae66abafdf7, 0xf956dbb6becf7f74, 0x9f37c494fbfdb6e4, + 0x11c6cbaa2485dd32, 0x206f336fcca11320, 0x9befe9a59135d8fe, 0x5f3ef8b8db92c7db, + 0xbb305e556ce0ce9a, 0xf26bdafb1305887f, 0xcbf28abe23f08c61, 0x0bc64173b914e00b, + 0x9168da52e983f54a, 0x6ea41d09c3574a3e, 0x78aa44d4a74459ae, 0x2931422878387bf5, + 0x018f64a3a92c2d9c, 0x9be43f6752e66b34, 0xae378890decd1152, 0x07325329a1cb7623, + 0x3b96f4ee3dd9c525, 0x2d6ebcdbe77d61a3, 0x10e32b0e975f510c, 0xffc007b9da959bf9, + 0x38bf66c6559e5d90, 0xbe22bdf0bf8899fe, 0x87807d7a991632a8, 0x149a0d702816766a, + 0x026f723db057e9ab, 0xeeecb83625ec6798, 0xcec2ed5984208148, 0xd985a78e97f03c84, + 0xf96c279e7927b116, 0x99d5027b3204f6e2, 0x13a84878c3d34c55, 0x5cf5ec96229e9676, + 0x0bc36b07e4f8e289, 0xbed33b80a069914d, 0x2fbfbdd1ff4b9396, 0xab352bb6982da90f, + 0x154d219e4fa3f62b, 0x4d087512bb6b9be7, 0xc582e31775ee400e, 0x7dadb002ae8c4a4e, + 0xaae2957375c1aee2, 0x5f36ca643356625b, 0xf87cf8eb76e07fb7, 0x46f432a755e02cc3, + 0x36087e07aba09642, 0xe5642c1e4ebb9939, 0xb9152d22338eefad, 0xf7ba44278a22cf7f, + 0xd3b8013502acd838, 
0x7761511da6482659, 0xb0857621638e8e50, 0x552eddb4a8b1d5f5, + 0xc43d9861e812c3ea, 0xd765c2aada47910c, 0x21c935b68f552b19, 0x6256d5641a2b47dc, + 0xab711d8e6c94bc79, 0xa8d0b91a2a01ab81, 0x5e6d66141e8d632a, 0x7638285124d5d602, + 0x794876dbca3e471f, 0x951937d8682670ce, 0x0f99cb1f52ed466a, 0x8c7cd205543b804c, + 0x2fd24d74a9c33783, 0xe5dcb7b7762e5af1, 0x45e6749cca4af77c, 0x540ac7ee61f2259f, + 0x89c505c72802ce86, 0xeab83b9d2d8000d1, 0x9f01d5e76748d005, 0xc740aaef3035b6d0, + 0x49afcd31d582d054, 0xcba5dc4c1efb5ddc, 0xc0a4c07434350ca1, 0xfc8dfaddcc65ee80, + 0x157c9780f6e4b2d9, 0x9762a872e1797617, 0xc4afae2cf3c7e1bd, 0x71cde14591b595d4, + 0x8843c3e0e641f3b9, 0xd92ecd91dce28750, 0x1474e7a1742cb19f, 0xec198e22764fa06b, + 0x39394edb47330c7d, 0x00ba1d925242533d, 0xaed8702536c6fb30, 0x6d3618e531c2967a, + 0x77f7cedcd7cc0411, 0xbc1e2ab82be5b752, 0x07b0cf9223676977, 0x596c693b099edd53, + 0xbb7f570f5b9b2811, 0x96bfdad3c4a6840c, 0x668015e79b60c534, 0x3ad38d72123f1366, + 0x6b994d81d2fcbb09, 0x70885f022c5052d8, 0xc891ee79d9306a7b, 0x2c4df05c0ed02497, + 0x19ebc13816898be2, 0xea7c64df11c392a2, 0xb7663e88dd12e1bd, 0x79f768cb8e154c21, + 0x1fb21b12e945933b, 0xe6a9045643f6906e, 0x544c47acd7e15371, 0xb7709b14f727e3d1, + 0x326ee36a46942971, 0x477f1cf7b0e2d847, 0x88b8f6b82b3b0c24, 0x18bc357b80e3cd5c, + 0x3333de70e4d66e0b, 0x4fd4c5e148583cf6, 0xae1b62f3008c0af3, 0xc49f419b6ab29cf5, + 0x2c29fa65afc3fa28, 0x4b19d93734d03009, 0x7dd6c09e589276ad, 0x1cece97f30de48ad, + ], + // seed = 4 + [ + 0x58bdf4338602e4fb, 0x71a5620b02c926d5, 0x3811c960129c2d9f, 0x29c2fb11fccac567, + 0x0d6b1ea7780f1352, 0xcc4d3ddfae3f87b3, 0xfdd30257362a586b, 0xabc948fde69f25f1, + 0x51b3523469d30f7b, 0xe0f0322724405ace, 0xd3729266d896da1e, 0xb10c37e5147915bf, + 0x8b577039f9fa32a3, 0xe677c6a9cbfb44b3, 0x7317a756ebb51a03, 0xf8e988ef37359485, + 0x600fc1ef3f469ff3, 0xbf0b8f8520444e01, 0x3711168b08b63d73, 0x34146f2944a6cb36, + 0x717feb263862cdde, 0x7185f8347db00412, 0x900798d82127e693, 0x84089e976a473268, + 0x10f8308c0d293719, 
0xf62a618d4e5719b8, 0x8bdbd257a1a9516f, 0xf49f666fd7a75110, + 0xbaf45e2db7864339, 0xe4efa1ea0c627697, 0x3e71d4c82a09fe10, 0x54a2a51cf12127bb, + 0xa0592c9f54ba14cd, 0x27dd627a101c7a42, 0x3d2ceb44b3d20d72, 0x7ee1f94a68ca8f5d, + 0x7e8cb8651b006c36, 0xbd9fa7ca3a475259, 0x856de173586a7b34, 0xcedb291b594cb1b5, + 0xa3d6e462fd21cddc, 0x74561d10af9118e4, 0x13a3d389fc2d4b36, 0xeea8594a4a054856, + 0xf56d7474d9ba4b13, 0x25ddce2f6490b2fd, 0x920653ff3a8d830b, 0xcd8c0c9cdac740d1, + 0x2c348a738db9c4a0, 0x2967ccbe8ea44c22, 0x47963f69adb049f8, 0xf9d01eb5b4cf7eb6, + 0x7a5c26eb63a86bd2, 0x62ad8b7a71fa0566, 0xb373213179f250ae, 0x589d4e9a88245a4d, + 0x433dafebe2d558a8, 0x521fbef2c8fe4399, 0x62a31f9ff9ccd46b, 0x51602203eba7c1a6, + 0x9afc8c451b06c99f, 0xb529085bdbaffcea, 0xac251825cc75892b, 0x94976a5bce23d58e, + 0xdd17925b6c71b515, 0x568fd07a57bce92e, 0xefac31200d8bd340, 0x716c3e466b540ef9, + 0x3d2c9e380063c69b, 0x14168f9a3662dd83, 0xd298c7504dbc412f, 0x74490a94f016719f, + 0x0e0da431e1ab80c8, 0xe321f63dc6b169ae, 0xf08671544febc95a, 0x39324450cc394b3b, + 0xea6e3d35f1aa3a70, 0x8ef8a886508ce486, 0xdc1a631ef0a17f06, 0xfda2b3fbcd79e87b, + 0xd75bcae936403b10, 0xf88b5bd9f035f875, 0xc43efec2e3792dd4, 0xe9fac21a9d47cd94, + 0xc2876f0c4b7d47c3, 0xaba156cf49f368b4, 0x5ccda2170fa58bf9, 0xadc92c879ed18df7, + 0x110c1b227354e6c8, 0x298ee7a603249200, 0xde92142ede0e8ee7, 0x88e4a4610644ba9e, + 0xbb62d277e7641d3a, 0xb9be1985b7bf8073, 0x29024e5426cdb0d1, 0xf6aefd01f3092ab8, + 0x2a07087b313133aa, 0x6d71f445d6dfc839, 0x1e2412ff12e5526b, 0xed5cdeba6617b9e1, + 0x20b1d0d5e5f8760e, 0x12ff15705c368260, 0x7bf4338b7c387203, 0x34ff25f00cd06185, + 0x1148c706c518cf28, 0x5c04f0623388f025, 0xcb9d649275d87d79, 0x9b5f0c24fabc42ec, + 0x1a7b5e7964e33858, 0x2a81bbd8efdc6793, 0x8d05431ffe42752e, 0x83915cd511002677, + 0x580ed4d791837b31, 0x5982e041d19ff306, 0xcad0d08fa5d864ca, 0x867bee6efe1afa63, + 0x26467b0320f23009, 0xd842414dfda4ec36, 0x047fcdcbc0a76725, 0xbddb340a3768aeca, + 0xef4ce6fa6e99ab45, 0x88c5b66c7762bf9b, 
0x5679f1c51ffb225d, 0xdab79048317d77ee, + 0xf14e9b8a8ba03803, 0xe77f07f7731184c1, 0x4c2aab9a108c1ef5, 0xa137795718e6ad97, + 0x8d6c7cc73350b88b, 0x5c34e2ae74131a49, 0xd4828f579570a056, 0xb7862594da5336fc, + 0x6fd590a4a2bed7a5, 0x138d327de35e0ec1, 0xe8290eb33d585b0b, 0xcee01d52cdf88833, + 0x165c7c76484f160e, 0x7232653da72fc7f6, 0x66600f13445ca481, 0x6bbdf0a01f7b127d, + 0xd7b71d6a1992c73b, 0xcf259d37ae3fda4a, 0xf570c70d05895acf, 0x1e01e6a3e8f60155, + 0x2dacbb83c2bd3671, 0x9c291f5a5bca81af, 0xd976826c68b4ee90, 0x95112eec1f6310a2, + 0x11ebc7f623bc4c9a, 0x18471781b1122b30, 0x48f7c65414b00187, 0x6834b03efa2f5c30, + 0x0875ef5c2c56b164, 0x45248d4f2a60ba71, 0x5a7d466e7f7ba830, 0x2bebe6a5e42c4a1d, + 0xd871d8483db51d10, 0x6ee37decd2fd392f, 0x7d724392010cede3, 0x8e96ef11e1c9bcc8, + 0x804a61d86b89d178, 0xbb1b83ce956055ec, 0xcb44e107410ff64f, 0xc426bb09ee0ba955, + 0x057c08f42c3dd7f1, 0x40ea1ec148602bdf, 0xc24688deeb65d7f1, 0xd8bcc53c768ba4e4, + 0x16e0e3af65c1106c, 0xfc12f7e7d647218b, 0x70d6e1d3ee93cef4, 0x01d2a505c4541ef9, + 0x1ef79e16e764d5c3, 0x0363d14d13870b98, 0xb56ef64345d06b11, 0xe653d557ebb7c346, + 0x8304a8597c2b2706, 0x1536e1322ce7e7bb, 0x525aec08a65af822, 0x91f66d6e98d28e43, + 0xe65af12c0b5c0274, 0xdf6ae56b7d5ea4c2, 0x5cef621cedf3c81c, 0x41e8b1ffd4889944, + 0xb5c0f452c213c3e5, 0x77af86f3e67e499b, 0xe20e76ea5b010704, 0xbdc205ab0c889ec0, + 0xc76d93eb0469cd83, 0x17ac27f65cab0034, 0xd49ec4531fd62133, 0x07a873ea2f1b9984, + 0xbff270dfef0032ee, 0x1764dbe91592f255, 0xe40363126f79e859, 0xa06cad3ab46971f6, + 0x0be596e90dedd875, 0x3387cce5c1658461, 0x44246acf88a9585e, 0xe0ad82b92d5ecb2c, + 0x2177491c9a1600a6, 0x16e7c4aac0f02422, 0x75792eeeec15c4e1, 0x2309cd359d08ee30, + 0x7cd9831dd1b83b0a, 0x374914a7c4ee8cf0, 0x0dd17765c9ac2e54, 0xb7847470ba9a7688, + 0xfba4f4bbe2991173, 0x422b203fc3de040e, 0x63bfcaf2ecf2ab0e, 0x0c5559f3a192946e, + 0xfdf80675c1847695, 0xf5f570accab842c9, 0x65cc5a448767afea, 0x1efeb0a7ee234f2f, + 0x9b05f03d81e7b5d2, 0xe7c31317a8626cf4, 0x620f2a53081d0398, 
0x1b6de96cdd9943ae, + 0x8c226a436777d303, 0xa08fbbd50fafb10d, 0x6a64c5ec20104883, 0x9c9c653502c0f671, + 0x678a02b2174f52a0, 0x68e008ba16bbad4b, 0xa317c16d2efb860f, 0xeab2075d17ed714c, + 0x565eeeddf0c4ea15, 0x8ec8e94d242a6c19, 0x139e8e27d9000fae, 0xc977a7ff1b33d2f5, + 0x1d0accca84420346, 0xc9e82602cd436e03, 0x6a2231da53d2ccd3, 0xb44b12d917826e2a, + 0x4f4567c6a74cf0b9, 0xd8e115a42fc6da8f, 0xb6bbe79d95742a74, 0x5686c647f1707dab, + 0xa70d58eb6c008fc5, 0xaaedc2dbe4418026, 0x6661e2267bdcfd3d, 0x4882a6eda7706f9e, + 0xf6c2d2c912dafdd0, 0x2f2298c142fd61f9, 0x31d75afeb17143a8, 0x1f9b96580a2a982f, + 0xa6cd3e5604a8ad49, 0x0dae2a80aad17419, 0xdb9a9d12868124ac, 0x66b6109f80877fac, + 0x9a81d9c703a94029, 0xbd3b381b1e03c647, 0xe88bc07b70f31083, 0x4e17878356a55822, + ], + // seed = 5 + [ + 0xb3c58c2483ad5ead, 0x6570847428cdcf6c, 0x2b38adbf813ac866, 0x8cb9945d37eb9ad3, + 0xf5b409ec3d1aed1c, 0xa35f4bffc9bb5a93, 0x5db89cde3c9e9340, 0xff1225231b2afb2b, + 0x157b0b212b9cc47d, 0xf03faf97a2b2e04d, 0x86fdab8544a20f87, 0xfcb8732744ae5c1c, + 0xd91744c0787986d5, 0x5f8db2a76d65ad05, 0xcff605cbed17a90d, 0xf80284980a3164e7, + 0x59cc24e713fccc7d, 0x268982cada117ce4, 0xcd020e63896e730e, 0xe760dc46e9fe9885, + 0x6aaece8ab49c6b5d, 0x7451194d597aae3e, 0x35d4385900332457, 0xa40fb563a096583d, + 0xa797b612f7f11b76, 0x2fed6eb68e6a2b9b, 0x2f06ee64aeffd943, 0x9dd0e49d9ca45330, + 0x97d48f08bd7f1d8f, 0x1cfa7fe3ebe4d8ee, 0x2a2ba076bd397d42, 0x68c4344f7472f333, + 0xce21ec31987d74b5, 0xb73dabdc91d84088, 0x801aadee592222fe, 0xaf41345398ebc3f5, + 0x8a8f653d7f15ee46, 0xce2d065ff2ba2965, 0x4e05da515da2adb7, 0xa6dbdb8aa25f0fd4, + 0xca9f9666bbd2d5a9, 0x6b917ce50bd46408, 0x1550cc564ba6c84d, 0xb3063ae043506504, + 0x84e5f96bb796653d, 0xe2364798096cf6e3, 0x3b0dfedf6d3a53d0, 0xb7e4c7c77bde8d93, + 0xe99545bac9ab418a, 0xa0e31f96889507bb, 0x883c74f80c346885, 0xf674ae0b039fd341, + 0x8bb6ce2d5e8d1c75, 0x0c48737966a7ed7c, 0x04fcdf897b34c61c, 0xe96ac181bacbd4d6, + 0x5a9c55a6106a9c01, 0x2520f020de4f45d3, 0x935730955e94d208, 
0xce5ad4d7f3f67d3b, + 0xa4b6d107fe2d81ca, 0x4f0033f50ae7944e, 0x32c5d28dd8a645a7, 0x57ce018223ef1039, + 0x2cbab15a661ab68e, 0x6de08798c0b5bec2, 0xee197fb2c5c007c6, 0x31b630ac63e7bda2, + 0xab98785aefe9efe3, 0xa36006158a606bf7, 0x7b20376b9f4af635, 0xa40762fdc3c08680, + 0x943b5faffd0ebee2, 0x7f39f41d0b81f06e, 0x7c4b399b116a90f8, 0x24e1662ac92bc9f3, + 0xcf586fc4e8e6c7db, 0xe46e0d047eeb12d7, 0xe8021076e4ea9958, 0x11fc13492e3ca22a, + 0xd61eae01410397e3, 0x7e8c4a58036a8e9f, 0x068a6de267970745, 0x64faab129bef1a41, + 0xb4a6f720943dad01, 0x631491058d73a9d5, 0xdad4fe95eab3ec02, 0x0a8b141c5c3a44f6, + 0x9fc69d4c2b335b98, 0x94d5f84a07d6e4cd, 0x1b73965de143c608, 0x443932c2dda54bcc, + 0x7397818fb0b04cd2, 0xef4ab03a1202b277, 0xf3d2ee459c0c2b92, 0x182d4daf8b058a87, + 0x90e63035d7b51368, 0xba4cd8b9a95d45fd, 0x12a7392c76731090, 0x890d264ec5d082d2, + 0xeeaf5c363da4994e, 0xd6aad756902123fb, 0xb531ebebdb28f191, 0xe71ce659fc59babd, + 0x37c1b94f63f2dcb5, 0xe4e3abeb311f9b96, 0x4a31b72ccb8695d3, 0x52cae1f0629fdce4, + 0xe5b0475e2ed71369, 0x2724e8c3506414fb, 0xbab0367920672deb, 0x0161a781c305449f, + 0x37b70f40f5bb60be, 0xddd1094c50251a01, 0x3b28283afd17224e, 0x06dec0cfe889fc6b, + 0x47608ea95bb4902d, 0xad883ebc12c00e82, 0x9e8d7ae0f7a8df29, 0xa79443e9f7c013a1, + 0xcfa26f68b7c68b71, 0x33ae6cc19bda1f23, 0xd9741e22b407887f, 0xf2bff78066d46b1c, + 0x794123191c9d32d4, 0x56cb6b903764ec76, 0x98775d0ef91e1a5a, 0xae7b713bc15c1db9, + 0x3b4c1a7870ed7a0d, 0x46666965f305cc34, 0x0ea0c3b2e9c6b3cd, 0x4dc387039a143bff, + 0x5f38bb9229ef9477, 0xea5d39ba72af7850, 0x69a5ed0174ce2b6d, 0x06969a36bfe7594d, + 0x0adee8e4065ccaa3, 0x908a581d57113718, 0x64822d6c5a8190ed, 0x8c5068b56ace4e4c, + 0x88ba3b4fb4e30bef, 0xa6ec0b8bb5896cfe, 0x4e23fcc6b47996fd, 0xe18e75b0dd549c7a, + 0xcd90f17e106cf939, 0x1666fdfb2ef7c52f, 0x4fae325f206dd88c, 0xe7bc1160e25b062d, + 0x3cc999cb246db950, 0xc5930a7326cd5c37, 0xb008a48a211367bd, 0xc5559da145a88fd4, + 0x1e3ad46655fac69c, 0x7834266b4841bfd7, 0xa764450fbffc58cc, 0x54d8cf93a939c667, + 
0x93c51f11b21b2d9d, 0x0964112082ed65cc, 0x4c2df21213e7fb03, 0xf0405bc877468615, + 0x17b4fc835d116ab4, 0xa6b112ae5f3cb4ef, 0x23cfc8a7fd38a46e, 0x8e0a360dc2774808, + 0x24ca9c8092105ad5, 0xafd3f75524f2e0d5, 0x4f39ed7dbaddc24c, 0xe5e362c7679a7875, + 0x00914a916b07b389, 0xdfe1119b7d5ab5da, 0xabd6ed9940e46161, 0x630ed2044171e22c, + 0xdecc244157dd1601, 0x777e6d5b4b4868d5, 0x9b3530bee67017d8, 0xd2faf08b291fdcb9, + 0x006e99455d6523de, 0xd559b5817f6955b5, 0xefcc1063b0088c61, 0xed73145ae0f00ae7, + 0xab2af402cf5b7421, 0x897767f537644926, 0x26c9c0473ca83695, 0x192e34e1881b2962, + 0xf7cf666ec3b3d020, 0x27f9b79c7404afb7, 0xe533e8bed3010767, 0xe5817838e11d05d3, + 0x65659c531bd36517, 0xd427c5e0a23836fd, 0xf3eab7ea58fa3528, 0x07683adae1289f35, + 0x201d6af7e896dd32, 0xd5da938b9a21ad88, 0x843fb73ad67bc316, 0x1782ec7d5feef21b, + 0x943f66f6ec772877, 0x7e9112e7b26da097, 0xeac8161f8663c2c7, 0xe8600db480a9ebf4, + 0x07807fc90f6eaf5f, 0xe0e4c9deb41abf83, 0xbdf533db271f9c15, 0xb398411b0497afe2, + 0xdebb45ef25448940, 0xe7a5decefcd376c4, 0xaf1ef3c728c83735, 0xb8b83a99355cb15a, + 0x6444a0344f1611e4, 0xe8bb7f5cf3c60179, 0x77ab5c5177e75ff7, 0xc38fd6fa849d585d, + 0x390d57d53029060a, 0xa66327eb7b8b593c, 0x6350a14f6fcd5ac9, 0x2c08125bcd7008b4, + 0x2d00c299a6a6bf8e, 0x6b0039c1f68d1445, 0x0035150c5d06f143, 0xa34d01628cc927e1, + 0xdf5b3164d7b2ede1, 0x8167db1d0583d72e, 0x4e13b341cd2ae8bc, 0xa693d9b1f416e306, + 0xc15ed7ca0bc67609, 0xdc344313c1c4f0af, 0x88b6887ccf772bb4, 0x6326d8f93ca0b20e, + 0x6964fad667dc2f11, 0xe9783dd38fc6d515, 0x359ed258fa022718, 0x27ac934d1f7fd60a, + 0xd68130437294dbcc, 0xaf5f869921f8f416, 0x2b8f149b4ab4bf9f, 0xc41caca607e421cb, + 0x7746976904238ef9, 0x604cb5529b1532f0, 0x1c94cd17c4c4e4ab, 0xe833274b734d6bbe, + 0xe9f1d3ef674539ce, 0x64f56ed68d193c6a, 0xe34192343d8ecfc1, 0xcb162f6c3aa71fe8, + 0x99eaf25f4c0f8fa4, 0x92f11e7361cb8d02, 0xb89170cddff37197, 0x4f86e68a51e071e3, + 0x31abf6afd911a75b, 0x6d20cf259c269333, 0x4150b9f88fcb6513, 0x705063989ebf7451, + 0x559231d927c84410, 
0x1ca8ec4b098bc687, 0xebed22405c9180e0, 0xaa815b37d052af59, + ], + // seed = 6 + [ + 0x946ac62246e04460, 0x9cebee264fcbc1ae, 0x8af54943a415652b, 0x2b327ed3b17b8682, + 0x983fde47b3c3847e, 0x10a3013f99a2ad33, 0x6e230bb92d2721ef, 0x1cf8b8369e5c5c50, + 0x7f64017f2b7b3738, 0xd393248a62417fa1, 0x9ff01c0b20a372c5, 0xb0e44abce7e7c220, + 0xcebb9f88d48a815f, 0xdb7df6bd09033886, 0x7844fc82b6fa9091, 0x72d095449863b8ec, + 0xc13e678c89da2c7e, 0x6caf4d5ad231d12f, 0x2e0ab7b5fcf35c49, 0xf410720cb932a70f, + 0xd66ea581f16fce06, 0x175c9f002f57dc98, 0xccbcfd0d32988775, 0xfde4c407d3b0a232, + 0x5db2931ae7e97223, 0x6e07e2173085809f, 0x6e1d1ec0f9cad73c, 0xb2fc251a7f802619, + 0xbc1fc17f04f342de, 0x8de8f21ec658e078, 0x72c0f40cbee53fd6, 0x0678244411fc17a1, + 0x1d5837ca166b9bbd, 0xc8cada003c554345, 0x6a2fe2bfb2e58652, 0xfca9d797a6f7988b, + 0x6699e24ac737948b, 0x69623ffcb05789ba, 0x946429c529d95b75, 0x0d14df0b2a13970f, + 0x593d8592c440dfec, 0x2ee176f3d7e74b94, 0xae003f1da3be9e26, 0x0c7b02c4c0f6764a, + 0x3117e2fa1f632462, 0xf0f23265b6f1eaeb, 0x3111255d9b10c137, 0xc82745e509a00397, + 0xbd1d04037005fea7, 0xe104ab0dd22a9036, 0x51b27ce50851ac7a, 0xb2cb9fb21b471b15, + 0x29d298074c5a3e26, 0x6ebdf2058b737418, 0xc4a974041431b96f, 0x1ec5a30ccb6bdaac, + 0xe818beede9bf4425, 0x4b69b1bce67a5555, 0xf5c35f1eb0d62698, 0xf4509bbd8e99867c, + 0xb17206debd52e1bc, 0x35785668c770b3be, 0xe9343987ff5863bc, 0x2ee768499ac73114, + 0x5132bb3426eeaaf4, 0x471bce2c6833c5ff, 0xbb9a2d5428e6f6f9, 0xd5678943c595792d, + 0xab2a65e7f81e479c, 0xa82407bb23990b31, 0xdae321383984923c, 0x01823bb22648e6f1, + 0xda6e8df4214a8b04, 0x0e172bb88e03d94f, 0x552da6c22e362777, 0x7ce67329fb0e90cb, + 0x7b2d7f287ede7ebf, 0xd44f8222500651bd, 0x4acca1ef58fbb8ab, 0x428ecf058df9656b, + 0xd7e1ec6a8987c185, 0x365be6a54b253246, 0x168849be1e271ee8, 0x6a00f3c4151a8db2, + 0x37602727ca94b33d, 0xf6b50f18504fa9ce, 0x1c10817f6bc872de, 0x4bfe1fe42b0f3638, + 0x135fad4b8ef6143b, 0x1b25ad2bafc25f58, 0x41e37f85cf321f92, 0xfc73f75d9d5b9bea, + 0x9eb3694d1e9cb7e1, 
0x601d51f08fa83b90, 0x234a2a9b88366f41, 0x63fe903e16f2c3bf, + 0x1cdbd34fa751c0b0, 0x0ce4fc6747c0558c, 0x51ed72afb8bb49aa, 0x20313ba13ca12c96, + 0x271fa38f9ebd54c1, 0x3696a5ac03a8edde, 0x05602be7df625702, 0x11f1ac73790f7a9f, + 0xa2836c099f0810bd, 0xe5ac2e47caa532fa, 0xd9c000a66d39f681, 0xd93d900e6f3d9d5f, + 0x792c81c65b7900f2, 0x5c5dce790ee20da1, 0x74ff1950edec1aee, 0x71fc85fa1e277d8f, + 0x0e77df17d6546cbc, 0x07debad44816c3b4, 0xbafa721581e92a70, 0x8ab6fbe2ed27bba8, + 0xe83243a20dea304a, 0xaa85a63a84c00a07, 0xde0e79917fc4153a, 0x21bb445e83537896, + 0xeedcac49fc0b433a, 0xffb2926a810ae57a, 0xf724be1f41d28702, 0x79cb95746039bb3b, + 0x5a54fe3742a00900, 0xda4768d64922c04f, 0x420396a84a339dae, 0xa171e26ee5e8724e, + 0x4c8da7c5d289c20a, 0x9ebd79a1a8e94742, 0x39235232b97e9782, 0xb75df0be9bba7d80, + 0x0c1d204dd87d48fc, 0x8f81f3e7177266e8, 0xe4a460b39e78d72b, 0x50b98fa151e65351, + 0xb7cb585c3ee1eddc, 0x11cdad9a76ee1dc4, 0xa38054a78595dc1c, 0x92f09e2ec4978edc, + 0xa8f0061b5efdabaa, 0x04bcc4abc224d230, 0xc58606738e692d46, 0xdd2b27b565952433, + 0x19e6ed1b740beec0, 0xceadd49b2ef9891f, 0x328178c28fe95cad, 0xe5ad4c43afe02848, + 0x03c0cb538cd967c0, 0xec4352526d19a630, 0x4c7e99389d39b031, 0xf65dd05362c2deb6, + 0xd1e70daf6879d28d, 0xbe9f57db6309b265, 0xa4b66f370b872bb7, 0xe26896fbc6ee1fd5, + 0xac705e661bfcf7c5, 0xab4d0d07d7f09940, 0x976417c06aeb6267, 0x8161c684a6bd468c, + 0xf77b6b9976dc4601, 0xc6489b779a39c12c, 0xb2aa58d5681cea1a, 0x043b1b40f8c3e04c, + 0x681fcbfadc845430, 0xab8896c921ba8def, 0x57aaf172606f37b2, 0xc3735048cd5eb8d7, + 0xa7078b96955631bd, 0xdd6b3543aa187f33, 0xc7103ea4a2a697fd, 0x8d7b95f6ff1f7407, + 0xe44f419e84709530, 0xf340caa9132cbb0a, 0x2ba407283143c66c, 0xe1be240ca636c844, + 0x90d32f2877ac08bc, 0x5d26e6294b2c8673, 0x4a6b2f5b27c87a44, 0x961fb9043f76d34f, + 0x0afee02d8d3c55d2, 0x6228e3f48c42e5dc, 0xc338e69ee6593675, 0x853f74b16efb7bdd, + 0xd062f40bdd22e687, 0x647164b9ab4c4190, 0xf94689f67d598369, 0x8e4b29d87a5012d7, + 0xaf02b8b925656fbd, 0x7a722a767179a630, 
0xb5c8afe937a75ace, 0xfdb8e8d02d279372, + 0x887ef700cb25fae1, 0xcfe9bd912f72cabe, 0xb1d4dedc24f978de, 0x517522d38319cc2a, + 0x7dd87b2b36aab798, 0x579c4ff3046b5a04, 0xf5c5975c5028b7a7, 0x7094579d1000ec84, + 0xbc8d5b1ea70a5291, 0x161b2d783be8855c, 0xd26d0b0d6d18279f, 0x0be1945f02a78bd5, + 0xb822a5a9e045415b, 0x2fe9d68b1ccc3562, 0xb2e375960033d14f, 0x26aca04e49b4ff22, + 0x732a81c862112aea, 0x8bd901ed6e4260b8, 0xe839532c561ad5b0, 0x8fb6e4d517a79b12, + 0x0dd37f8c0be9b429, 0xc8ad87ad12f1b1b0, 0xc51f3aa62b90318b, 0x031a7e8b86c1cefc, + 0xa95547af2b70fc76, 0x9cb3615c5a98801e, 0xa387e3c3341d7032, 0xa087ea52a1debaef, + 0x16325ec9a2e6e835, 0x587944a484c585eb, 0xc8879033bde22ecc, 0xa39dbfce709c464a, + 0x7acc010f99208774, 0x98dd2973a096c5ad, 0x26458b51139f198c, 0x2f5d19575e8c4f02, + 0x726643f0d38af352, 0x44d879b6d73e6e94, 0xa68a03885c980abe, 0x06048acd161c40c0, + 0xa4dab8f89d405d28, 0x7120c880cb04be18, 0xa062ace22a1cf0cf, 0x3901a9daf29704f4, + 0xff08f3ed989db30a, 0x6d22b13e874c67e9, 0x80c6f35518d73f4d, 0xc23c2a521aac6f29, + 0x2e708fd83aaa42e0, 0x7fc3780f55f1b0fd, 0xabb3075c98cf87f2, 0xb4df3f40f7c61143, + 0x2a04418098a76d75, 0x0d9eeee9509b2d37, 0x6be8ae51f4b59cdc, 0xe746cc7c00e4a2ab, + 0x785bc6df9cac597c, 0x33cb6620ce8adc48, 0xc1ba30739bffcef7, 0x6d95771f18e503f7, + 0xf7be3ae2e62652ff, 0xc8d82ffd2a73c62b, 0x8725a3ba5b110973, 0x67ed6b9c724757ec, + ], + // seed = 7 + [ + 0xc0272d42c19ff3ae, 0x4694228b43ea043b, 0x5709a6ef8a462841, 0xc9210a1e538805c9, + 0x279b171196113ec2, 0x859b769fc2d9e815, 0x0d5d3125a2bf14d3, 0x22bca1cfefa878ba, + 0x481b6bf58037bd83, 0x4933ba8647728d22, 0xf08c7b6b56f6e1b6, 0x374e8af5a15407c7, + 0xa95c4dc3d2487a5c, 0x9b832808ff11e751, 0xf2048507e9da01d5, 0xa9c576189f544a4a, + 0xf6c2a45b2e9d2b41, 0x9b9874c9f10ecc2f, 0x37d9b5f51f8c149e, 0x93aead54c9de9467, + 0x59cf0b4af262da23, 0xe7e9929af18194b2, 0x9df2644e33eb0178, 0xde4122d6f0671938, + 0xf005786c07f4800b, 0xb1fc9d254b5d1039, 0x0bf1088631f6dd7b, 0x665623f0a4b8f0c7, + 0x60f0113a9187db7c, 0xfd7cceda4f0d23a6, 
0x26c01e9d89955940, 0x33afa1dfc0f5a6a0, + 0xeb77daf215e9283c, 0xc7575214bf85edb4, 0xeb0d804bf297e616, 0x84bff4ffd564f747, + 0xc4ac33189246f620, 0x43ef61213ecc1005, 0xcbbb0dea6cd96acd, 0x8ed27abfa8cfcb05, + 0x543b61529cb996b6, 0xa5f987ca41ea5e59, 0x3c50e0ac5254cb7a, 0x4192b0446c06d1e6, + 0x3e86592e21b45388, 0xdb766f06fcc6e51e, 0x0448ee36efe632db, 0x663c9db689253e35, + 0x72e0bd4985331dd4, 0xff501b5bf7d94e74, 0xe911ce758e2113a8, 0xec3a8d03a75a6ba4, + 0xaf6b4b72f56edc83, 0xf284857936c0a391, 0x5ba6feff407d46f4, 0x9d689c26de9d6702, + 0x28c04a9083726b5d, 0x2ccf4a627a029730, 0x7b4719500d4f0c71, 0x76470a9a7da250a8, + 0xcc48409404a1c890, 0xccefbdc7ec9a8055, 0xe0db91bff3cc42d3, 0x0532436426141254, + 0xf2ee9325e6f0ff0b, 0x149c20a5fbb28d9d, 0xe71624cd8d2d14d4, 0x8f01d4dc8cc2dd77, + 0x29cf409b333015b7, 0xba8bebd211884dd1, 0xc3396635e8c8db1d, 0x8ed0f6208d0528b8, + 0x0d90b43fdd0ee334, 0xd73c9a3333a044c7, 0xa2595cd208dbdc38, 0xae93cb264f940c09, + 0x8e0538d8afb07a97, 0x19115ec881385ba2, 0xa886f9e6a8039c6a, 0xcd5d62147ce3ecac, + 0xaecdf9e0bb4969f7, 0x2ddd631c53dcad10, 0x73ad1c97b3412054, 0xb08915fa2722efc6, + 0x97966047e5067eb0, 0x337f1675ed91445c, 0xb3a833d150b96a0d, 0x5940a98fe35e5e2e, + 0xfd03cc354ed0d8ff, 0x4e65b98291a8644a, 0x14a259f2852a60b2, 0x7648e3478c1e8e5f, + 0xbc0fbef6d9a919b4, 0xbec4302081346cf1, 0x57d2ce7aa1c7c511, 0x234c209d8f4e1ac3, + 0x87cf80cc933ce443, 0x7c262c616931e94e, 0xc5e33b049cf9eddf, 0x1a80790ed03ae51b, + 0xf2e8b9494f7220cf, 0x124cb59c14fff3ff, 0xa8a06cbfdb86ce18, 0x9068ef1f80b37653, + 0x0c55417b8d90338f, 0xcd579a523f6bcd30, 0xa31bfe2476a8d2a9, 0x1f8d142208094223, + 0x332dc40a5203cfad, 0xf8792fe5b2d33b4c, 0x443bd9668bf9461e, 0xc9019db0ace1409e, + 0x781bea919a113e8b, 0xb0f11d866abfbeec, 0xcfe139a60db0c26a, 0x869ab8721e6aa39e, + 0xdb48a4977717837a, 0x588a5ff151065b18, 0xe4a251ea0028864d, 0x7f0e43ba408a77c3, + 0x65f66dd50a536135, 0x6f49e934d9331c3e, 0xb8d742e0f0fa6b09, 0xe4e9b272deca2348, + 0xaee132ff902f773c, 0x43f658f7c2a0c90a, 0x28cb4dbc76cc53ea, 
0x7d92253aa99ac39b, + 0x4fea3d832370baab, 0xb29e36936e51d78e, 0xea10778712321064, 0xff4f21f8ef274be2, + 0x84eff18ddfa0933f, 0xd0ec6a9f86c758a0, 0xaf82e5973c431ae0, 0x352023c00c045425, + 0xad34d7bc4a2f8961, 0xbdb4a02a24d4dee0, 0x354a4846d97447cf, 0x331a8b944d5bc19f, + 0x5ce04f8e17909035, 0x6497581bad8f4aab, 0x07c503bba647111e, 0x85f412ba78e1f7ff, + 0x7f3b920fd20f4cff, 0x424e1a9a4ce34e2f, 0x3035e2d62e1b9f0a, 0xef63114bff7b729a, + 0xe86a05889ab6bb60, 0xee0830cf095585a1, 0x4a54f7fa47d9c94b, 0x17daeece9fcb556a, + 0xc506d3f391834c6f, 0xb3f24be362e1af64, 0xc435e4e23608efdd, 0xeeba9caaa4cc1768, + 0x5a71f306daddc22d, 0x18e5205f41eba1a0, 0x7b29b4d1f6610925, 0x065cb65a0258d9a9, + 0x3e5ac8faa9fd1f95, 0x3b362362c1ea0470, 0xce0e4f6434db7a2e, 0xf327341098de52f2, + 0xcfca3b9e2a1992c3, 0x7483bf9401233e41, 0xbafbac531c6f9281, 0x4b52dd71b2c106f8, + 0xdf73b66e50b5a1f7, 0x237aec0202a20283, 0x23dd5be23dffdf2b, 0xea9730731ee122ef, + 0x5cb3f846014fbcd3, 0xc3b21c8ffdce9201, 0x06a99a02f91a8760, 0x721a81fa8fd7b7a3, + 0x6aafcdddc53cbcd8, 0xd03b464005a93bcc, 0x8212edc1b1669dcb, 0x71f4c31364c31bc7, + 0xfeeec0eba8772307, 0x1948d00a13d88cf1, 0x19064fd6d943ada8, 0x4ec8d31722697bfd, + 0x596d9a953a516609, 0xc4cb4bff53507da2, 0x1d59f3c5be36e4ca, 0xe5b4fc5bf6044c9b, + 0x1bb74e052232f735, 0x04e8a0db611ddd5d, 0x8d04eaa009b421bf, 0xa7878ae0ac0e6d58, + 0x28c1030217cab2b3, 0x827943767e56a883, 0x28fce5fa02d22809, 0xb30c322fffc8c58e, + 0x1ca5a6a9f8066c5b, 0xb24db5f1462b2513, 0x02f653b89b7e5f6c, 0xe31f8fb5d5f78eee, + 0x266acc514ed93501, 0x936879d1c6fddcc4, 0xcd51be3636af1952, 0x3fdbb6fc332c78c8, + 0x9eb656379fa73094, 0x056146cc92fa0f96, 0xed6c4f1836c027c3, 0x021e0bb5d2113f2a, + 0x8983e42ec1c626b3, 0x73ea9bc6513ad9c9, 0x0c904903b24f4247, 0xacbac1e6243e2525, + 0x0b1069a0c230fb06, 0x77d709fca3fc1ce5, 0x87ad0f65020947e6, 0x555302641c53f4e6, + 0x65ea87871fa9aaee, 0x58aaf4ecc1067bb4, 0x1a66c48cc4c65b3f, 0xca96aca48b2ea969, + 0xa68eb70bad14de2b, 0x5ccdb3d7e00a6f6e, 0xe178fbfec73fe72f, 0x2b63d6a16b83e890, + 
0x32fdb7a5330fbae0, 0x2ab5803c8d1bf32c, 0xda838388c1527c94, 0x16a50bdc4de24acb, + 0xe561301f134c074a, 0xd7ae63d2816b4db1, 0x036aabd4df0dd741, 0xc5e0db8783435b9d, + 0x9c4386cf0a07f3b2, 0x6a72ac1aa56a13a1, 0x299bbdb04bb20a23, 0x138c1018fda16b81, + 0x0e354f0b3bda49df, 0x9f4c295b23127437, 0xd133ceb2bd561341, 0xd8b4bfd5a526ac29, + 0xcdd0a70ddc1c7bbd, 0x81dce595bf572225, 0x1c6f925c05f6efd7, 0x8ae5097553856ea0, + 0x3aabeaeef248f60d, 0xd9005809d19a69e2, 0x2a3a1a314311cc27, 0x89bb2dc76b2b624a, + 0x50a2a95d0412e289, 0x9def8df564e68581, 0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, + 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e, + ]]; diff --git a/parquet/src/column/chunker/mod.rs b/parquet/src/column/chunker/mod.rs new file mode 100644 index 000000000000..d5ccec101e46 --- /dev/null +++ b/parquet/src/column/chunker/mod.rs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Content-defined chunking (CDC) for Parquet data pages. +//! +//! CDC creates data page boundaries based on content rather than fixed sizes, +//! enabling efficient deduplication in content-addressable storage (CAS) systems. +//! See [`CdcOptions`](crate::file::properties::CdcOptions) for configuration. 
+ +mod cdc; +mod cdc_generated; + +pub(crate) use cdc::ContentDefinedChunker; + +/// A chunk of data with level and value offsets for record-shredded nested data. +#[derive(Debug, Clone, Copy)] +pub(crate) struct Chunk { + /// The start offset of this chunk inside the given levels. + pub level_offset: usize, + /// The start offset of this chunk inside the given values array. + pub value_offset: usize, + /// The number of levels in this chunk. + pub levels_to_write: usize, +} diff --git a/parquet/src/column/mod.rs b/parquet/src/column/mod.rs index 1e534bdd6b77..e2db4fe69159 100644 --- a/parquet/src/column/mod.rs +++ b/parquet/src/column/mod.rs @@ -117,6 +117,7 @@ //! assert_eq!(rep_levels, vec![0, 1, 0, 1, 1]); //! ``` +pub(crate) mod chunker; pub mod page; #[cfg(feature = "encryption")] pub(crate) mod page_encryption; diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index c014397f132e..5a2f46628d68 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -100,6 +100,12 @@ impl ColumnWriter<'_> { downcast_writer!(self, typed, typed.get_estimated_total_bytes()) } + /// Flush the currently buffered values as a data page. + #[cfg(feature = "arrow")] + pub(crate) fn flush_current_page(&mut self) -> Result<()> { + downcast_writer!(self, typed, typed.flush_current_page()) + } + /// Close this [`ColumnWriter`], returning the metadata for the column chunk. pub fn close(self) -> Result { downcast_writer!(self, typed, typed.close()) @@ -596,6 +602,17 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { &self.descr } + /// Flush the currently buffered values as a data page. + /// + /// This is used by content-defined chunking to force a page boundary at + /// content-determined positions. + pub(crate) fn flush_current_page(&mut self) -> Result<()> { + if self.page_metrics.num_buffered_values > 0 { + self.add_data_page()?; + } + Ok(()) + } + /// Finalizes writes and closes the column writer. 
/// Returns total bytes written, total rows written and column chunk metadata. pub fn close(mut self) -> Result { diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index ae21de304404..02a07dd4049d 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -62,6 +62,48 @@ pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false; /// Default values for [`WriterProperties::coerce_types`] pub const DEFAULT_COERCE_TYPES: bool = false; +/// EXPERIMENTAL: Options for content-defined chunking (CDC). +/// +/// CDC creates data page boundaries based on content rather than fixed sizes, +/// enabling efficient deduplication in content-addressable storage (CAS) systems. +/// When enabled, unchanged data across file versions will produce identical byte +/// sequences, allowing storage-level deduplication. +/// +/// Each content-defined chunk is written as a separate parquet data page. These +/// options control the chunk size and the chunking process. Note that the chunk +/// size is calculated based on the logical value of the data, before any encoding +/// or compression is applied. +#[derive(Debug, Clone, Copy)] +pub struct CdcOptions { + /// Minimum chunk size in bytes, default is 256 KiB. + /// The rolling hash will not be evaluated until this many bytes have been + /// accumulated in the current chunk. All data fed through the hash function + /// counts towards the chunk size, including definition and repetition levels. + pub min_chunk_size: usize, + /// Maximum chunk size in bytes, default is 1024 KiB. + /// A chunk boundary is forced when the chunk size reaches this value, + /// regardless of hash state. The parquet writer's `data_page_size_limit` + /// still applies independently. + pub max_chunk_size: usize, + /// Normalization level to center the chunk size distribution around the + /// average size more aggressively, default is 0. 
+ /// Increasing the normalization level increases the probability of finding + /// a chunk boundary, improving the deduplication ratio, but also increases + /// the number of small chunks. Use 1 or 2 for higher deduplication at the + /// expense of fragmentation. + pub norm_level: i32, +} + +impl Default for CdcOptions { + fn default() -> Self { + Self { + min_chunk_size: 256 * 1024, + max_chunk_size: 1024 * 1024, + norm_level: 0, + } + } +} + /// Parquet writer version. /// /// Basic constant, which is not part of the Thrift definition. @@ -168,6 +210,7 @@ pub struct WriterProperties { column_index_truncate_length: Option, statistics_truncate_length: Option, coerce_types: bool, + cdc_options: Option, #[cfg(feature = "encryption")] pub(crate) file_encryption_properties: Option>, } @@ -364,6 +407,13 @@ impl WriterProperties { self.coerce_types } + /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled. + /// + /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`] + pub fn cdc_options(&self) -> Option<&CdcOptions> { + self.cdc_options.as_ref() + } + /// Returns encoding for a data page, when dictionary encoding is enabled. /// /// This is not configurable. 
@@ -487,6 +537,7 @@ pub struct WriterPropertiesBuilder { column_index_truncate_length: Option, statistics_truncate_length: Option, coerce_types: bool, + cdc_options: Option, #[cfg(feature = "encryption")] file_encryption_properties: Option>, } @@ -510,6 +561,7 @@ impl Default for WriterPropertiesBuilder { column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH, coerce_types: DEFAULT_COERCE_TYPES, + cdc_options: None, #[cfg(feature = "encryption")] file_encryption_properties: None, } @@ -535,6 +587,7 @@ impl WriterPropertiesBuilder { column_index_truncate_length: self.column_index_truncate_length, statistics_truncate_length: self.statistics_truncate_length, coerce_types: self.coerce_types, + cdc_options: self.cdc_options, #[cfg(feature = "encryption")] file_encryption_properties: self.file_encryption_properties, } @@ -750,6 +803,33 @@ impl WriterPropertiesBuilder { self } + /// EXPERIMENTAL: Enables or disables content-defined chunking with default options. + /// + /// When enabled, data page boundaries are determined by a rolling hash of the + /// column values, so unchanged data produces identical byte sequences across + /// file versions. This enables efficient deduplication on content-addressable + /// storage systems. + /// + /// Only supported through the Arrow writer interface ([`ArrowWriter`]). + /// + /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter + pub fn set_content_defined_chunking(mut self, enabled: bool) -> Self { + self.cdc_options = if enabled { + Some(CdcOptions::default()) + } else { + None + }; + self + } + + /// EXPERIMENTAL: Sets content-defined chunking options, implicitly enabling CDC. + /// + /// See [`CdcOptions`] for details on each parameter. 
+ pub fn set_cdc_options(mut self, options: CdcOptions) -> Self { + self.cdc_options = Some(options); + self + } + /// Sets FileEncryptionProperties (defaults to `None`) #[cfg(feature = "encryption")] pub fn with_file_encryption_properties( @@ -1033,6 +1113,7 @@ impl From for WriterPropertiesBuilder { column_index_truncate_length: props.column_index_truncate_length, statistics_truncate_length: props.statistics_truncate_length, coerce_types: props.coerce_types, + cdc_options: props.cdc_options, #[cfg(feature = "encryption")] file_encryption_properties: props.file_encryption_properties, } diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 98106a2c1059..b76f6d6670af 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -67,6 +67,28 @@ //! * [`ArrowColumnWriter`] for writing using multiple threads, //! * [`RowFilter`] to apply filters during decode //! +//! ### EXPERIMENTAL: Content-Defined Chunking +//! +//! [`ArrowWriter`] supports content-defined chunking (CDC), which creates data page +//! boundaries based on content rather than fixed sizes. CDC enables efficient +//! deduplication in content-addressable storage (CAS) systems: when the same data +//! appears in successive file versions, it will produce identical byte sequences that +//! CAS backends can deduplicate. +//! +//! Enable CDC via [`WriterProperties`]: +//! +//! ```no_run +//! # use parquet::file::properties::WriterProperties; +//! let props = WriterProperties::builder() +//! .set_content_defined_chunking(true) +//! .build(); +//! ``` +//! +//! See [`CdcOptions`] for chunk size and normalization parameters. +//! +//! [`WriterProperties`]: file::properties::WriterProperties +//! [`CdcOptions`]: file::properties::CdcOptions +//! //! [`ArrowWriter`]: arrow::arrow_writer::ArrowWriter //! [`ParquetRecordBatchReaderBuilder`]: arrow::arrow_reader::ParquetRecordBatchReaderBuilder //! 
[`ParquetPushDecoder`]: arrow::push_decoder::ParquetPushDecoder From 26364c53c2f7fe1b93a446a528f2a29a05254903 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 18 Feb 2026 18:34:59 +0100 Subject: [PATCH 02/21] feat(parquet): add `repeated_ancestor_def_level` to `ColumnDescriptor` and use it in content defined chunking --- parquet/src/arrow/arrow_writer/mod.rs | 60 +---------- parquet/src/column/chunker/cdc.rs | 34 ++++--- parquet/src/schema/types.rs | 137 +++++++++++++++++++++++++- 3 files changed, 155 insertions(+), 76 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 5d704ec2b8aa..55bf572d4646 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1148,22 +1148,10 @@ impl ArrowRowGroupWriterFactory { Some(opts) => opts, None => return Ok(None), }; - let schema_root = self.schema.root_schema(); self.schema .columns() .iter() - .map(|desc| { - let max_def_level = desc.max_def_level(); - let max_rep_level = desc.max_rep_level(); - let repeated_ancestor_def_level = - compute_repeated_ancestor_def_level(schema_root, desc.path()); - chunker::ContentDefinedChunker::new( - max_def_level, - max_rep_level, - repeated_ancestor_def_level, - opts, - ) - }) + .map(|desc| chunker::ContentDefinedChunker::new(desc, opts)) .collect::>>() .map(Some) } @@ -1744,52 +1732,6 @@ fn get_fsb_array_slice( values } -/// Compute the definition level at the nearest REPEATED ancestor by traversing -/// the Parquet schema tree from root to the given leaf column path. 
-fn compute_repeated_ancestor_def_level( - schema_root: &crate::schema::types::Type, - path: &crate::schema::types::ColumnPath, -) -> i16 { - use crate::basic::Repetition; - let parts = path.parts(); - if parts.is_empty() { - return 0; - } - - let mut current_type = schema_root; - let mut def_level: i16 = 0; - let mut repeated_ancestor_def_level: i16 = 0; - - for part in parts { - // Find the child with matching name - if !current_type.is_group() { - break; - } - let child = current_type.get_fields().iter().find(|f| f.name() == part); - let child = match child { - Some(c) => c, - None => break, - }; - - // Update def/rep levels based on this node's repetition - if child.get_basic_info().has_repetition() { - match child.get_basic_info().repetition() { - Repetition::OPTIONAL => { - def_level += 1; - } - Repetition::REPEATED => { - def_level += 1; - repeated_ancestor_def_level = def_level; - } - Repetition::REQUIRED => {} - } - } - current_type = child.as_ref(); - } - - repeated_ancestor_def_level -} - /// Compute CDC chunk boundaries by dispatching on the Arrow array's data type /// to feed value bytes into the rolling hash. 
fn get_cdc_chunks( diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index 7732ab39da9a..c7ef235cfc51 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -17,6 +17,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::properties::CdcOptions; +use crate::schema::types::ColumnDescriptor; use super::Chunk; use super::cdc_generated::{GEARHASH_TABLE, NUM_GEARHASH_TABLES}; @@ -68,21 +69,16 @@ pub(crate) struct ContentDefinedChunker { } impl ContentDefinedChunker { - pub fn new( - max_def_level: i16, - max_rep_level: i16, - repeated_ancestor_def_level: i16, - options: &CdcOptions, - ) -> Result { + pub fn new(desc: &ColumnDescriptor, options: &CdcOptions) -> Result { let rolling_hash_mask = Self::calculate_mask( options.min_chunk_size as i64, options.max_chunk_size as i64, options.norm_level, )?; Ok(Self { - max_def_level, - max_rep_level, - repeated_ancestor_def_level, + max_def_level: desc.max_def_level(), + max_rep_level: desc.max_rep_level(), + repeated_ancestor_def_level: desc.repeated_ancestor_def_level(), min_chunk_size: options.min_chunk_size as i64, max_chunk_size: options.max_chunk_size as i64, rolling_hash_mask, @@ -362,6 +358,16 @@ impl ContentDefinedChunker { #[cfg(test)] mod tests { use super::*; + use crate::basic::Type as PhysicalType; + use crate::schema::types::{ColumnPath, Type}; + use std::sync::Arc; + + fn make_desc(max_def_level: i16, max_rep_level: i16) -> ColumnDescriptor { + let tp = Type::primitive_type_builder("col", PhysicalType::INT32) + .build() + .unwrap(); + ColumnDescriptor::new(Arc::new(tp), max_def_level, max_rep_level, ColumnPath::new(vec![])) + } #[test] fn test_calculate_mask_defaults() { @@ -393,7 +399,7 @@ mod tests { max_chunk_size: 1024, norm_level: 0, }; - let mut chunker = ContentDefinedChunker::new(0, 0, 0, &options).unwrap(); + let mut chunker = ContentDefinedChunker::new(&make_desc(0, 0), &options).unwrap(); // Write a small amount of data — 
should produce exactly 1 chunk. let num_values = 4; @@ -411,7 +417,7 @@ mod tests { max_chunk_size: 1024, norm_level: 0, }; - let mut chunker = ContentDefinedChunker::new(0, 0, 0, &options).unwrap(); + let mut chunker = ContentDefinedChunker::new(&make_desc(0, 0), &options).unwrap(); // Write enough data to exceed max_chunk_size multiple times. // Each i32 = 4 bytes, max_chunk_size=1024, so ~256 values per chunk max. @@ -443,10 +449,10 @@ mod tests { let roll = |i: usize| (i as i64).to_le_bytes(); - let mut chunker1 = ContentDefinedChunker::new(0, 0, 0, &options).unwrap(); + let mut chunker1 = ContentDefinedChunker::new(&make_desc(0, 0), &options).unwrap(); let chunks1 = chunker1.get_chunks(None, None, 200, roll); - let mut chunker2 = ContentDefinedChunker::new(0, 0, 0, &options).unwrap(); + let mut chunker2 = ContentDefinedChunker::new(&make_desc(0, 0), &options).unwrap(); let chunks2 = chunker2.get_chunks(None, None, 200, roll); assert_eq!(chunks1.len(), chunks2.len()); @@ -464,7 +470,7 @@ mod tests { max_chunk_size: 64, norm_level: 0, }; - let mut chunker = ContentDefinedChunker::new(1, 0, 0, &options).unwrap(); + let mut chunker = ContentDefinedChunker::new(&make_desc(1, 0), &options).unwrap(); let num_levels = 20; // def_level=1 means non-null, def_level=0 means null diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 85f3ed48972c..436b61fbbf4a 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -853,6 +853,9 @@ pub struct ColumnDescriptor { /// The maximum repetition level for this column max_rep_level: i16, + /// The definition level at the nearest REPEATED ancestor, or 0 if none. + repeated_ancestor_def_level: i16, + /// The path of this column. For instance, "a.b.c.d". 
path: ColumnPath, } @@ -877,6 +880,7 @@ impl ColumnDescriptor { primitive_type, max_def_level, max_rep_level, + repeated_ancestor_def_level: 0, path, } } @@ -893,6 +897,12 @@ impl ColumnDescriptor { self.max_rep_level } + /// Returns the definition level at the nearest REPEATED ancestor, or 0 if none. + #[inline] + pub fn repeated_ancestor_def_level(&self) -> i16 { + self.repeated_ancestor_def_level + } + /// Returns [`ColumnPath`] for this column. pub fn path(&self) -> &ColumnPath { &self.path @@ -1069,7 +1079,7 @@ impl SchemaDescriptor { let mut path = Vec::with_capacity(INIT_SCHEMA_DEPTH); for (root_idx, f) in tp.get_fields().iter().enumerate() { path.clear(); - build_tree(f, root_idx, 0, 0, &mut leaves, &mut leaf_to_base, &mut path); + build_tree(f, root_idx, 0, 0, 0, &mut leaves, &mut leaf_to_base, &mut path); } Self { @@ -1196,6 +1206,7 @@ fn build_tree<'a>( root_idx: usize, mut max_rep_level: i16, mut max_def_level: i16, + mut repeated_ancestor_def_level: i16, leaves: &mut Vec, leaf_to_base: &mut Vec, path_so_far: &mut Vec<&'a str>, @@ -1210,6 +1221,7 @@ fn build_tree<'a>( Repetition::REPEATED => { max_def_level += 1; max_rep_level += 1; + repeated_ancestor_def_level = max_def_level; } _ => {} } @@ -1218,12 +1230,14 @@ fn build_tree<'a>( Type::PrimitiveType { .. } => { let mut path: Vec = vec![]; path.extend(path_so_far.iter().copied().map(String::from)); - leaves.push(Arc::new(ColumnDescriptor::new( + let mut desc = ColumnDescriptor::new( tp.clone(), max_def_level, max_rep_level, ColumnPath::new(path), - ))); + ); + desc.repeated_ancestor_def_level = repeated_ancestor_def_level; + leaves.push(Arc::new(desc)); leaf_to_base.push(root_idx); } Type::GroupType { fields, .. 
} => { @@ -1233,6 +1247,7 @@ fn build_tree<'a>( root_idx, max_rep_level, max_def_level, + repeated_ancestor_def_level, leaves, leaf_to_base, path_so_far, @@ -1941,6 +1956,122 @@ mod tests { assert_eq!(descr.column(3).max_rep_level(), 1); } + #[test] + fn test_schema_build_tree_repeated_ancestor_def_level() { + // Flat columns: no REPEATED ancestor → repeated_ancestor_def_level = 0 + let message_type = " + message m { + REQUIRED INT32 a; + OPTIONAL INT32 b; + OPTIONAL group s { + OPTIONAL INT32 x; + } + } + "; + let schema = parse_message_type(message_type).expect("should parse schema"); + let descr = SchemaDescriptor::new(Arc::new(schema)); + assert_eq!(descr.column(0).repeated_ancestor_def_level(), 0); // a + assert_eq!(descr.column(1).repeated_ancestor_def_level(), 0); // b + assert_eq!(descr.column(2).repeated_ancestor_def_level(), 0); // s.x + + // Standard list: OPTIONAL outer, REPEATED group, OPTIONAL element + // repeated_ancestor_def_level is the def_level at the REPEATED group (= 2) + let message_type = " + message m { + OPTIONAL group c (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + "; + let schema = parse_message_type(message_type).expect("should parse schema"); + let descr = SchemaDescriptor::new(Arc::new(schema)); + // c(optional)=1, list(repeated)=2, element(optional)=3 + assert_eq!(descr.column(0).max_def_level(), 3); + assert_eq!(descr.column(0).max_rep_level(), 1); + assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2); + + // Required list: REQUIRED outer, REPEATED group, REQUIRED element + // No OPTIONAL nodes between REPEATED and leaf, so repeated_ancestor_def_level == max_def_level + let message_type = " + message m { + REQUIRED group c (LIST) { + REPEATED group list { + REQUIRED INT32 element; + } + } + } + "; + let schema = parse_message_type(message_type).expect("should parse schema"); + let descr = SchemaDescriptor::new(Arc::new(schema)); + // list(repeated)=1, element(required)=1 + 
assert_eq!(descr.column(0).max_def_level(), 1); + assert_eq!(descr.column(0).max_rep_level(), 1); + assert_eq!(descr.column(0).repeated_ancestor_def_level(), 1); + + // Nested lists: innermost REPEATED wins + let message_type = " + message m { + OPTIONAL group outer (LIST) { + REPEATED group list { + OPTIONAL group inner (LIST) { + REPEATED group list2 { + OPTIONAL INT32 element; + } + } + } + } + } + "; + let schema = parse_message_type(message_type).expect("should parse schema"); + let descr = SchemaDescriptor::new(Arc::new(schema)); + // outer(opt)=1, list(rep)=2, inner(opt)=3, list2(rep)=4, element(opt)=5 + assert_eq!(descr.column(0).max_def_level(), 5); + assert_eq!(descr.column(0).max_rep_level(), 2); + assert_eq!(descr.column(0).repeated_ancestor_def_level(), 4); + + // Struct inside list: all sibling leaves share the same repeated_ancestor_def_level + let message_type = " + message m { + OPTIONAL group bag (LIST) { + REPEATED group list { + REQUIRED group item { + OPTIONAL INT32 x; + REQUIRED INT32 y; + } + } + } + } + "; + let schema = parse_message_type(message_type).expect("should parse schema"); + let descr = SchemaDescriptor::new(Arc::new(schema)); + // bag(opt)=1, list(rep)=2, item(req)=2, x(opt)=3 + assert_eq!(descr.column(0).repeated_ancestor_def_level(), 2); // bag.list.item.x + // bag(opt)=1, list(rep)=2, item(req)=2, y(req)=2 + assert_eq!(descr.column(1).repeated_ancestor_def_level(), 2); // bag.list.item.y + + // Map type: key (required) and value (optional) under the same REPEATED group + let message_type = " + message m { + OPTIONAL group my_map (MAP) { + REPEATED group key_value { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL INT32 value; + } + } + } + "; + let schema = parse_message_type(message_type).expect("should parse schema"); + let descr = SchemaDescriptor::new(Arc::new(schema)); + // my_map(opt)=1, key_value(rep)=2, key(req)=2 + assert_eq!(descr.column(0).max_def_level(), 2); + assert_eq!(descr.column(0).repeated_ancestor_def_level(), 
2); // key: max_def == repeated_ancestor + // my_map(opt)=1, key_value(rep)=2, value(opt)=3 + assert_eq!(descr.column(1).max_def_level(), 3); + assert_eq!(descr.column(1).repeated_ancestor_def_level(), 2); // value: max_def > repeated_ancestor + } + #[test] #[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")] fn test_get_physical_type_panic() { From cf48df751160966f1e323462d66aefb45de33fc0 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 20 Feb 2026 16:28:44 +0100 Subject: [PATCH 03/21] chore: cargo format --- parquet/src/column/chunker/cdc.rs | 7 ++++++- parquet/src/schema/types.rs | 11 ++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index c7ef235cfc51..da94aef3bc92 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -366,7 +366,12 @@ mod tests { let tp = Type::primitive_type_builder("col", PhysicalType::INT32) .build() .unwrap(); - ColumnDescriptor::new(Arc::new(tp), max_def_level, max_rep_level, ColumnPath::new(vec![])) + ColumnDescriptor::new( + Arc::new(tp), + max_def_level, + max_rep_level, + ColumnPath::new(vec![]), + ) } #[test] diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 436b61fbbf4a..cb47fe18a1b5 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1079,7 +1079,16 @@ impl SchemaDescriptor { let mut path = Vec::with_capacity(INIT_SCHEMA_DEPTH); for (root_idx, f) in tp.get_fields().iter().enumerate() { path.clear(); - build_tree(f, root_idx, 0, 0, 0, &mut leaves, &mut leaf_to_base, &mut path); + build_tree( + f, + root_idx, + 0, + 0, + 0, + &mut leaves, + &mut leaf_to_base, + &mut path, + ); } Self { From b05da4d460ef5ec457afbe0a0ec9cb12dc5318d0 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 20 Feb 2026 16:52:23 +0100 Subject: [PATCH 04/21] chore: fix clippy errors --- parquet/src/arrow/arrow_writer/mod.rs | 25 
++++---------------- parquet/src/column/chunker/cdc.rs | 3 ++- parquet/src/schema/types.rs | 1 + parquet/tests/encryption/encryption_async.rs | 5 ++-- 4 files changed, 10 insertions(+), 24 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 55bf572d4646..b5d17e42c83a 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1119,11 +1119,9 @@ impl ArrowRowGroupWriterFactory { let mut writers = Vec::with_capacity(self.arrow_schema.fields.len()); let mut leaves = self.schema.columns().iter(); let column_factory = self.column_writer_factory(row_group_index); - let schema_root = self.schema.root_schema(); for field in &self.arrow_schema.fields { column_factory.get_arrow_column_writer( field.data_type(), - schema_root, &self.props, &mut leaves, &mut writers, @@ -1178,11 +1176,9 @@ pub fn get_column_writers( let mut writers = Vec::with_capacity(arrow.fields.len()); let mut leaves = parquet.columns().iter(); let column_factory = ArrowColumnWriterFactory::new(); - let schema_root = parquet.root_schema(); for field in &arrow.fields { column_factory.get_arrow_column_writer( field.data_type(), - schema_root, props, &mut leaves, &mut writers, @@ -1252,7 +1248,6 @@ impl ArrowColumnWriterFactory { fn get_arrow_column_writer( &self, data_type: &ArrowDataType, - schema_root: &crate::schema::types::Type, props: &WriterPropertiesPtr, leaves: &mut Iter<'_, ColumnDescPtr>, out: &mut Vec, @@ -1297,29 +1292,17 @@ impl ArrowColumnWriterFactory { | ArrowDataType::FixedSizeList(f, _) | ArrowDataType::ListView(f) | ArrowDataType::LargeListView(f) => { - self.get_arrow_column_writer(f.data_type(), schema_root, props, leaves, out)? + self.get_arrow_column_writer(f.data_type(), props, leaves, out)? } ArrowDataType::Struct(fields) => { for field in fields { - self.get_arrow_column_writer( - field.data_type(), - schema_root, - props, - leaves, - out, - )? 
+ self.get_arrow_column_writer(field.data_type(), props, leaves, out)? } } ArrowDataType::Map(f, _) => match f.data_type() { ArrowDataType::Struct(f) => { - self.get_arrow_column_writer( - f[0].data_type(), - schema_root, - props, - leaves, - out, - )?; - self.get_arrow_column_writer(f[1].data_type(), schema_root, props, leaves, out)? + self.get_arrow_column_writer(f[0].data_type(), props, leaves, out)?; + self.get_arrow_column_writer(f[1].data_type(), props, leaves, out)? } _ => unreachable!("invalid map type"), }, diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index da94aef3bc92..cffa5d793901 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -119,7 +119,7 @@ impl ContentDefinedChunker { let effective_bits = mask_bits - norm_level; - if effective_bits < 1 || effective_bits > 63 { + if !(1..=63).contains(&effective_bits) { return Err(ParquetError::General(format!( "The number of bits in the CDC mask must be between 1 and 63, got {effective_bits}" ))); @@ -240,6 +240,7 @@ impl ContentDefinedChunker { // level_offset == value_offset for non-nested data. 
let def_levels = def_levels.expect("def_levels required when max_def_level > 0"); let mut prev_offset: usize = 0; + #[allow(clippy::needless_range_loop)] for offset in 0..num_levels { let def_level = def_levels[offset]; self.roll_level(def_level); diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index cb47fe18a1b5..8e8398151f35 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1210,6 +1210,7 @@ fn count_leaves(tp: &TypePtr, n_leaves: &mut usize) { } } +#[allow(clippy::too_many_arguments)] fn build_tree<'a>( tp: &'a TypePtr, root_idx: usize, diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index 48c844afb99e..e79279913966 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -548,7 +548,7 @@ fn spawn_rg_join_and_finalize_task( } fn spawn_parquet_parallel_serialization_task( - writer_factory: ArrowRowGroupWriterFactory, + mut writer_factory: ArrowRowGroupWriterFactory, mut data: Receiver, serialize_tx: Sender>, schema: Arc, @@ -778,7 +778,8 @@ async fn test_multi_threaded_encrypted_writing() { let temp_file = tempfile::tempfile().unwrap(); let mut writer = SerializedFileWriter::new(&temp_file, parquet_schema.root_schema_ptr(), props).unwrap(); - let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); + let mut row_group_writer_factory = + ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); let (serialize_tx, mut serialize_rx) = tokio::sync::mpsc::channel::>(1); From ea0e344bb26309e9d1245407ae1df696a71081c7 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 20 Feb 2026 17:10:10 +0100 Subject: [PATCH 05/21] refactor(parquet): maintain field for better encapsulation --- parquet/src/arrow/arrow_writer/levels.rs | 159 +++++++++++++++++++++-- parquet/src/arrow/arrow_writer/mod.rs | 14 +- parquet/src/column/chunker/cdc.rs | 42 +++--- 
parquet/src/column/chunker/mod.rs | 4 +- 4 files changed, 175 insertions(+), 44 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 1716c14d1aea..471fca4c58eb 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -40,6 +40,7 @@ //! //! \[1\] [parquet-format#nested-encoding](https://github.com/apache/parquet-format#nested-encoding) +use crate::column::chunker::Chunk; use crate::errors::{ParquetError, Result}; use arrow_array::cast::AsArray; use arrow_array::{Array, ArrayRef, OffsetSizeTrait}; @@ -803,26 +804,19 @@ impl ArrayLevels { } /// Create a sliced view of this `ArrayLevels` for a CDC chunk. - /// - /// - `level_offset`: start position within `def_levels`/`rep_levels` - /// - `levels_to_write`: number of levels in this chunk - /// - `value_offset`: start position within the values array - /// - `num_values`: number of values in this chunk - pub(crate) fn slice_for_chunk( - &self, - level_offset: usize, - levels_to_write: usize, - value_offset: usize, - num_values: usize, - ) -> Self { + pub(crate) fn slice_for_chunk(&self, chunk: &Chunk) -> Self { + let level_offset = chunk.level_offset; + let num_levels = chunk.num_levels; + let value_offset = chunk.value_offset; + let num_values = chunk.num_values; let def_levels = self .def_levels .as_ref() - .map(|levels| levels[level_offset..level_offset + levels_to_write].to_vec()); + .map(|levels| levels[level_offset..level_offset + num_levels].to_vec()); let rep_levels = self .rep_levels .as_ref() - .map(|levels| levels[level_offset..level_offset + levels_to_write].to_vec()); + .map(|levels| levels[level_offset..level_offset + num_levels].to_vec()); // Filter non_null_indices to [value_offset, value_offset + num_values) // and shift by -value_offset. 
@@ -852,6 +846,7 @@ impl ArrayLevels { #[cfg(test)] mod tests { use super::*; + use crate::column::chunker::Chunk; use arrow_array::builder::*; use arrow_array::types::Int32Type; @@ -2142,4 +2137,140 @@ mod tests { let v = Arc::new(array) as ArrayRef; LevelInfoBuilder::try_new(field, Default::default(), &v).unwrap() } + + #[test] + fn test_slice_for_chunk_flat() { + // Required field (no levels): array [1..=6], slice values 2..5 + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6])); + let logical_nulls = array.logical_nulls(); + let levels = ArrayLevels { + def_levels: None, + rep_levels: None, + non_null_indices: vec![0, 1, 2, 3, 4, 5], + max_def_level: 0, + max_rep_level: 0, + array, + logical_nulls, + }; + let sliced = levels.slice_for_chunk(&Chunk { + level_offset: 0, + num_levels: 0, + value_offset: 2, + num_values: 3, + }); + assert!(sliced.def_levels.is_none()); + assert!(sliced.rep_levels.is_none()); + assert_eq!(sliced.non_null_indices, vec![0, 1, 2]); + assert_eq!(sliced.array.len(), 3); + + // Optional field (def levels only): [1, null, 3, null, 5, 6] + // Slice levels 1..4 (def=[0,1,0]), values 1..4 → non_null_indices [2]→[1] + let array: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + None, + Some(3), + None, + Some(5), + Some(6), + ])); + let logical_nulls = array.logical_nulls(); + let levels = ArrayLevels { + def_levels: Some(vec![1, 0, 1, 0, 1, 1]), + rep_levels: None, + non_null_indices: vec![0, 2, 4, 5], + max_def_level: 1, + max_rep_level: 0, + array, + logical_nulls, + }; + let sliced = levels.slice_for_chunk(&Chunk { + level_offset: 1, + num_levels: 3, + value_offset: 1, + num_values: 3, + }); + assert_eq!(sliced.def_levels, Some(vec![0, 1, 0])); + assert!(sliced.rep_levels.is_none()); + assert_eq!(sliced.non_null_indices, vec![1]); + assert_eq!(sliced.array.len(), 3); + } + + #[test] + fn test_slice_for_chunk_nested() { + // [[1,2],[3],[4,5]]: def=[2,2,2,2,2], rep=[0,1,0,0,1] + // Slice levels 2..5 (def=[2,2,2], 
rep=[0,0,1]), values 2..5 + let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let logical_nulls = array.logical_nulls(); + let levels = ArrayLevels { + def_levels: Some(vec![2, 2, 2, 2, 2]), + rep_levels: Some(vec![0, 1, 0, 0, 1]), + non_null_indices: vec![0, 1, 2, 3, 4], + max_def_level: 2, + max_rep_level: 1, + array, + logical_nulls, + }; + let sliced = levels.slice_for_chunk(&Chunk { + level_offset: 2, + num_levels: 3, + value_offset: 2, + num_values: 3, + }); + assert_eq!(sliced.def_levels, Some(vec![2, 2, 2])); + assert_eq!(sliced.rep_levels, Some(vec![0, 0, 1])); + // [0,1,2,3,4] filtered to [2,5) → [2,3,4] → shifted -2 → [0,1,2] + assert_eq!(sliced.non_null_indices, vec![0, 1, 2]); + assert_eq!(sliced.array.len(), 3); + } + + #[test] + fn test_slice_for_chunk_non_null_indices_boundary() { + // [1, null, 3]: non_null_indices=[0, 2]; test inclusive lower / exclusive upper bounds + let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])); + let logical_nulls = array.logical_nulls(); + let levels = ArrayLevels { + def_levels: Some(vec![1, 0, 1]), + rep_levels: None, + non_null_indices: vec![0, 2], + max_def_level: 1, + max_rep_level: 0, + array, + logical_nulls, + }; + assert_eq!( + levels + .slice_for_chunk(&Chunk { + level_offset: 0, + num_levels: 1, + value_offset: 0, + num_values: 1 + }) + .non_null_indices, + vec![0] + ); + // idx 2 in range [1,3), shifted -1 → 1 + assert_eq!( + levels + .slice_for_chunk(&Chunk { + level_offset: 1, + num_levels: 2, + value_offset: 1, + num_values: 2 + }) + .non_null_indices, + vec![1] + ); + // idx 2 excluded from [1,2) + assert_eq!( + levels + .slice_for_chunk(&Chunk { + level_offset: 1, + num_levels: 1, + value_offset: 1, + num_values: 1 + }) + .non_null_indices, + Vec::::new() + ); + } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index b5d17e42c83a..b3c748d88e59 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ 
b/parquet/src/arrow/arrow_writer/mod.rs @@ -933,19 +933,7 @@ impl ArrowColumnWriter { let num_chunks = chunks.len(); for (i, chunk) in chunks.iter().enumerate() { - // Compute the number of values in this chunk - let num_values = if i + 1 < num_chunks { - chunks[i + 1].value_offset - chunk.value_offset - } else { - leaf_array.len() - chunk.value_offset - }; - - let chunk_levels = levels.slice_for_chunk( - chunk.level_offset, - chunk.levels_to_write, - chunk.value_offset, - num_values, - ); + let chunk_levels = levels.slice_for_chunk(chunk); let chunk_col = ArrowLeafColumn(chunk_levels); self.write_without_cdc(&chunk_col)?; diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index cffa5d793901..41ca4c1f74a5 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -219,20 +219,24 @@ impl ContentDefinedChunker { for offset in 0..num_levels { self.roll_value_bytes(value_bytes(offset).as_ref()); if self.need_new_chunk() { + let levels_to_write = offset - prev_offset; chunks.push(Chunk { level_offset: prev_offset, value_offset: prev_offset, - levels_to_write: offset - prev_offset, + num_levels: levels_to_write, + num_values: levels_to_write, }); prev_offset = offset; } } // Last chunk if prev_offset < num_levels { + let levels_to_write = num_levels - prev_offset; chunks.push(Chunk { level_offset: prev_offset, value_offset: prev_offset, - levels_to_write: num_levels - prev_offset, + num_levels: levels_to_write, + num_values: levels_to_write, }); } } else if !has_rep_levels { @@ -248,20 +252,24 @@ impl ContentDefinedChunker { self.roll_value_bytes(value_bytes(offset).as_ref()); } if self.need_new_chunk() { + let levels_to_write = offset - prev_offset; chunks.push(Chunk { level_offset: prev_offset, value_offset: prev_offset, - levels_to_write: offset - prev_offset, + num_levels: levels_to_write, + num_values: levels_to_write, }); prev_offset = offset; } } // Last chunk if prev_offset < num_levels { + let 
levels_to_write = num_levels - prev_offset; chunks.push(Chunk { level_offset: prev_offset, value_offset: prev_offset, - levels_to_write: num_levels - prev_offset, + num_levels: levels_to_write, + num_values: levels_to_write, }); } } else { @@ -292,7 +300,8 @@ impl ContentDefinedChunker { chunks.push(Chunk { level_offset: prev_offset, value_offset: prev_value_offset, - levels_to_write, + num_levels: levels_to_write, + num_values: value_offset - prev_value_offset, }); prev_offset = offset; prev_value_offset = value_offset; @@ -310,7 +319,8 @@ impl ContentDefinedChunker { chunks.push(Chunk { level_offset: prev_offset, value_offset: prev_value_offset, - levels_to_write: num_levels - prev_offset, + num_levels: num_levels - prev_offset, + num_values: value_offset - prev_value_offset, }); } } @@ -329,27 +339,27 @@ impl ContentDefinedChunker { assert_eq!(first.level_offset, 0, "first chunk must start at level 0"); assert_eq!(first.value_offset, 0, "first chunk must start at value 0"); - let mut sum_levels = first.levels_to_write; + let mut sum_levels = first.num_levels; for i in 1..chunks.len() { let chunk = &chunks[i]; let prev = &chunks[i - 1]; - assert!(chunk.levels_to_write > 0, "chunk must have levels"); + assert!(chunk.num_levels > 0, "chunk must have levels"); assert!( chunk.value_offset >= prev.value_offset, "value offsets must be monotonically increasing" ); assert_eq!( chunk.level_offset, - prev.level_offset + prev.levels_to_write, + prev.level_offset + prev.num_levels, "chunks must be contiguous" ); - sum_levels += chunk.levels_to_write; + sum_levels += chunk.num_levels; } assert_eq!(sum_levels, num_levels, "chunks must cover all levels"); let last = chunks.last().unwrap(); assert_eq!( - last.level_offset + last.levels_to_write, + last.level_offset + last.num_levels, num_levels, "last chunk must end at num_levels" ); @@ -413,7 +423,7 @@ mod tests { assert_eq!(chunks.len(), 1); assert_eq!(chunks[0].level_offset, 0); assert_eq!(chunks[0].value_offset, 0); - 
assert_eq!(chunks[0].levels_to_write, 4); + assert_eq!(chunks[0].num_levels, 4); } #[test] @@ -438,9 +448,9 @@ mod tests { for (i, chunk) in chunks.iter().enumerate() { assert_eq!(chunk.level_offset, total_levels); if i < chunks.len() - 1 { - assert!(chunk.levels_to_write > 0); + assert!(chunk.num_levels > 0); } - total_levels += chunk.levels_to_write; + total_levels += chunk.num_levels; } assert_eq!(total_levels, num_values); } @@ -465,7 +475,7 @@ mod tests { for (a, b) in chunks1.iter().zip(chunks2.iter()) { assert_eq!(a.level_offset, b.level_offset); assert_eq!(a.value_offset, b.value_offset); - assert_eq!(a.levels_to_write, b.levels_to_write); + assert_eq!(a.num_levels, b.num_levels); } } @@ -489,7 +499,7 @@ mod tests { }); assert!(!chunks.is_empty()); - let total: usize = chunks.iter().map(|c| c.levels_to_write).sum(); + let total: usize = chunks.iter().map(|c| c.num_levels).sum(); assert_eq!(total, num_levels); } } diff --git a/parquet/src/column/chunker/mod.rs b/parquet/src/column/chunker/mod.rs index d5ccec101e46..70613500f556 100644 --- a/parquet/src/column/chunker/mod.rs +++ b/parquet/src/column/chunker/mod.rs @@ -34,5 +34,7 @@ pub(crate) struct Chunk { /// The start offset of this chunk inside the given values array. pub value_offset: usize, /// The number of levels in this chunk. - pub levels_to_write: usize, + pub num_levels: usize, + /// The number of values (Arrow array elements) in this chunk. 
+ pub num_values: usize, } From 04711e21b39c61b3592fe7292117b7ad9f67e449 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sun, 22 Feb 2026 20:25:35 +0100 Subject: [PATCH 06/21] refactor(parquet): simplify the CDC implementation --- parquet/src/arrow/arrow_writer/mod.rs | 143 +++--------- parquet/src/column/chunker/cdc.rs | 320 ++++++++++++++++++++------ 2 files changed, 273 insertions(+), 190 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index b3c748d88e59..40feb891632b 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -17,7 +17,7 @@ //! Contains writer which writes arrow data into parquet data. -use crate::column::chunker; +use crate::column::ContentDefinedChunker; use bytes::Bytes; use std::io::{Read, Write}; @@ -860,7 +860,7 @@ impl ArrowColumnChunk { pub struct ArrowColumnWriter { writer: ArrowColumnWriterImpl, chunk: SharedColumnChunk, - pub(crate) chunker: Option, + chunker: Option, } impl std::fmt::Debug for ArrowColumnWriter { @@ -877,70 +877,43 @@ enum ArrowColumnWriterImpl { impl ArrowColumnWriter { /// Write an [`ArrowLeafColumn`] pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()> { - if self.chunker.is_some() { - self.write_with_cdc(col) + let levels = &col.0; + + if let Some(chunker) = self.chunker.as_mut() { + let chunks = + chunker.get_arrow_chunks(levels.def_levels(), levels.rep_levels(), levels.array())?; + + let num_chunks = chunks.len(); + for (i, chunk) in chunks.iter().enumerate() { + let chunk_levels = levels.slice_for_chunk(chunk); + self.write_internal(&chunk_levels)?; + + // Flush the page after each chunk except the last + if i + 1 < num_chunks { + self.flush_current_page()?; + } + } } else { - self.write_without_cdc(col) + self.write_internal(levels)?; } + Ok(()) } - fn write_without_cdc(&mut self, col: &ArrowLeafColumn) -> Result<()> { + fn write_internal(&mut self, levels: &ArrayLevels) -> Result<()> { match &mut 
self.writer { ArrowColumnWriterImpl::Column(c) => { - let leaf = col.0.array(); + let leaf = levels.array(); match leaf.as_any_dictionary_opt() { Some(dictionary) => { let materialized = arrow_select::take::take(dictionary.values(), dictionary.keys(), None)?; - write_leaf(c, &materialized, &col.0)? + write_leaf(c, &materialized, levels)?; } - None => write_leaf(c, leaf, &col.0)?, + None => write_leaf(c, leaf, levels)?, }; } ArrowColumnWriterImpl::ByteArray(c) => { - write_primitive(c, col.0.array().as_ref(), &col.0)?; - } - } - Ok(()) - } - - fn write_with_cdc(&mut self, col: &ArrowLeafColumn) -> Result<()> { - let levels = &col.0; - - // Dictionary-encoded arrays must be materialized before hashing because the - // CDC chunker must see the actual values, not dictionary indices. Two arrays - // with the same values but different dictionary orderings would otherwise - // produce different rolling hash states, breaking cross-file deduplication. - let leaf_array = match levels.array().as_any_dictionary_opt() { - Some(dictionary) => { - arrow_select::take::take(dictionary.values(), dictionary.keys(), None)? - } - None => levels.array().clone(), - }; - - let def_levels = levels.def_levels(); - let rep_levels = levels.rep_levels(); - let num_levels = def_levels - .map(|d| d.len()) - .or_else(|| rep_levels.map(|r| r.len())) - .unwrap_or(leaf_array.len()); - - // Compute CDC chunk boundaries - let chunks = { - let chunker = self.chunker.as_mut().unwrap(); - get_cdc_chunks(chunker, def_levels, rep_levels, num_levels, &leaf_array)? 
- }; - - let num_chunks = chunks.len(); - for (i, chunk) in chunks.iter().enumerate() { - let chunk_levels = levels.slice_for_chunk(chunk); - let chunk_col = ArrowLeafColumn(chunk_levels); - - self.write_without_cdc(&chunk_col)?; - - // Flush the page after each chunk except the last - if i + 1 < num_chunks { - self.flush_current_page()?; + write_primitive(c, levels.array().as_ref(), levels)?; } } Ok(()) @@ -1041,7 +1014,7 @@ impl ArrowRowGroupWriter { self, ) -> Result<( Vec, - Option>, + Option>, )> { let mut chunks = Vec::with_capacity(self.writers.len()); let mut chunkers = Vec::new(); @@ -1073,7 +1046,7 @@ pub struct ArrowRowGroupWriterFactory { file_encryptor: Option>, /// CDC chunkers persisted across row groups (one per leaf column). /// `None` when CDC is not enabled. - cdc_chunkers: Option>, + cdc_chunkers: Option>, } impl ArrowRowGroupWriterFactory { @@ -1129,7 +1102,7 @@ impl ArrowRowGroupWriterFactory { } /// Create CDC chunkers for all leaf columns, or `None` if CDC is not enabled. - fn create_cdc_chunkers(&self) -> Result>> { + fn create_cdc_chunkers(&self) -> Result>> { let opts = match self.props.cdc_options() { Some(opts) => opts, None => return Ok(None), @@ -1137,7 +1110,7 @@ impl ArrowRowGroupWriterFactory { self.schema .columns() .iter() - .map(|desc| chunker::ContentDefinedChunker::new(desc, opts)) + .map(|desc| ContentDefinedChunker::new(desc, opts)) .collect::>>() .map(Some) } @@ -1703,64 +1676,6 @@ fn get_fsb_array_slice( values } -/// Compute CDC chunk boundaries by dispatching on the Arrow array's data type -/// to feed value bytes into the rolling hash. -fn get_cdc_chunks( - chunker: &mut chunker::ContentDefinedChunker, - def_levels: Option<&[i16]>, - rep_levels: Option<&[i16]>, - num_levels: usize, - array: &dyn arrow_array::Array, -) -> Result> { - // Downcasts `array` to a concrete type, binds it to `$a`, then calls - // `get_chunks` with a closure that yields value bytes for index `$i`. - macro_rules! 
chunk { - ($a:ident = $downcast:expr, |$i:ident| $bytes:expr) => {{ - let $a = $downcast; - chunker.get_chunks(def_levels, rep_levels, num_levels, |$i| $bytes) - }}; - } - - let dtype = array.data_type(); - let chunks = match dtype { - ArrowDataType::Null => { - chunker.get_chunks(def_levels, rep_levels, num_levels, |_| -> &[u8] { &[] }) - } - ArrowDataType::Boolean => chunk!(a = array.as_boolean(), |i| [a.value(i) as u8]), - ArrowDataType::FixedSizeBinary(_) => { - chunk!(a = array.as_fixed_size_binary(), |i| a.value(i)) - } - ArrowDataType::Binary => chunk!(a = array.as_binary::(), |i| a.value(i)), - ArrowDataType::Utf8 => chunk!(a = array.as_string::(), |i| a.value(i).as_bytes()), - ArrowDataType::LargeBinary => chunk!(a = array.as_binary::(), |i| a.value(i)), - ArrowDataType::LargeUtf8 => chunk!(a = array.as_string::(), |i| a.value(i).as_bytes()), - ArrowDataType::BinaryView => chunk!(a = array.as_binary_view(), |i| a.value(i)), - ArrowDataType::Utf8View => chunk!(a = array.as_string_view(), |i| a.value(i).as_bytes()), - // All fixed-width primitive types (ints, floats, dates, times, timestamps, - // durations, intervals, decimals, float16). - // - // Values are read directly from the underlying buffer. `data.offset()` accounts - // for sliced arrays (non-zero logical start), so `base + i * byte_width` always - // resolves to the correct physical byte position for logical index `i`. 
- _ => { - let byte_width = dtype.primitive_width().ok_or_else(|| { - ParquetError::General(format!( - "content-defined chunking is not supported for data type {:?}", - dtype - )) - })?; - let data = array.to_data(); - let buffer = &data.buffers()[0]; - let base = data.offset() * byte_width; - chunker.get_chunks(def_levels, rep_levels, num_levels, |i| { - let start = base + i * byte_width; - &buffer[start..start + byte_width] - }) - } - }; - Ok(chunks) -} - #[cfg(test)] mod tests { use super::*; diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index 41ca4c1f74a5..b6e2eec6bc43 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -22,28 +22,66 @@ use crate::schema::types::ColumnDescriptor; use super::Chunk; use super::cdc_generated::{GEARHASH_TABLE, NUM_GEARHASH_TABLES}; -/// Content-defined chunker that uses a rolling gear hash to find chunk boundaries. +/// CDC (Content-Defined Chunking) divides data into variable-sized chunks based on +/// content rather than fixed-size boundaries. +/// +/// For example, given this sequence of values in a column: +/// +/// ```text +/// File1: [1,2,3, 4,5,6, 7,8,9] +/// chunk1 chunk2 chunk3 +/// ``` +/// +/// If a value is inserted between 3 and 4: +/// +/// ```text +/// File2: [1,2,3,0, 4,5,6, 7,8,9] +/// new-chunk chunk2 chunk3 +/// ``` +/// +/// The chunking process adjusts to maintain stable boundaries across data modifications. +/// Each chunk defines a new parquet data page which is contiguously written to the file. +/// Since each page is compressed independently, the files' contents look like: +/// +/// ```text +/// File1: [Page1][Page2][Page3]... +/// File2: [Page4][Page2][Page3]... +/// ``` +/// +/// When uploaded to a content-addressable storage (CAS) system, the CAS splits the byte +/// stream into content-defined blobs with unique identifiers. Identical blobs are stored +/// only once, so Page2 and Page3 are deduplicated across File1 and File2. 
+///
+/// ## Implementation
+///
+/// Only the parquet writer needs to be aware of content-defined chunking; the reader is
+/// unaffected. Each parquet column writer may hold a `ContentDefinedChunker` instance,
+/// depending on the writer's properties. The chunker's state is maintained across the
+/// entire column without being reset between pages and row groups.
 ///
 /// This implements a [FastCDC]-inspired algorithm using gear hashing. The input data is
 /// fed byte-by-byte into a rolling hash; when the hash matches a predefined mask, a new
 /// chunk boundary candidate is recorded. To reduce the exponential variance of chunk
 /// sizes inherent in a single gear hash, the algorithm requires **8 consecutive mask
 /// matches** — each against a different pre-computed gear hash table — before committing
-/// to a boundary. This central-limit-theorem normalization makes the chunk size
+/// to a boundary. This [central-limit-theorem normalization] makes the chunk size
 /// distribution approximately normal between `min_chunk_size` and `max_chunk_size`.
 ///
-/// The chunker's state (rolling hash, run counter, accumulated size) persists across the
-/// entire column (across pages and row groups), so boundaries are determined solely by
-/// data content and are reproducible given the same input.
+/// The chunker receives the record-shredded column data (def_levels, rep_levels, values)
+/// and iterates over the (def_level, rep_level, value) triplets while adjusting the
+/// column-global rolling hash. Whenever the rolling hash matches, the chunker creates a
+/// new chunk. For nested data (lists, maps, structs) chunk boundaries are restricted to
+/// top-level record boundaries (`rep_level == 0`) so that a nested row is never split
+/// across chunks.
 ///
-/// For nested data (lists, maps, structs) chunk boundaries are restricted to top-level
-/// record boundaries (`rep_level == 0`) so that a nested row is never split across
-/// chunks.
+/// Note that boundaries are deterministically calculated exclusively based on the data
+/// itself, so the same data always produces the same chunks given the same configuration.
 ///
 /// Ported from the C++ implementation in apache/arrow#45360
 /// (`cpp/src/parquet/chunker_internal.cc`).
 ///
 /// [FastCDC]: https://www.usenix.org/conference/atc16/technical-sessions/presentation/xia
+/// [central-limit-theorem normalization]: https://www.cidrdb.org/cidr2023/papers/p43-low.pdf
 #[derive(Debug)]
 pub(crate) struct ContentDefinedChunker {
     /// Maximum definition level for this column.
@@ -53,7 +91,18 @@ pub(crate) struct ContentDefinedChunker {
     /// Definition level at the nearest REPEATED ancestor.
     repeated_ancestor_def_level: i16,
 
+    /// Minimum chunk size in bytes.
+    /// The rolling hash will not be updated until this size is reached for each chunk.
+    /// All data sent through the hash function counts towards the chunk size, including
+    /// definition and repetition levels if present.
     min_chunk_size: i64,
+    /// Maximum chunk size in bytes.
+    /// A new chunk is created whenever the chunk size exceeds this value. The chunk size
+    /// distribution approximates a normal distribution between `min_chunk_size` and
+    /// `max_chunk_size`. Note that the parquet writer has a related `data_pagesize`
+    /// property that controls the maximum size of a parquet data page after encoding.
+    /// While setting `data_pagesize` smaller than `max_chunk_size` doesn't affect
+    /// chunking effectiveness, it results in a larger number of small parquet data pages.
     max_chunk_size: i64,
     /// Mask for matching against the rolling hash.
     rolling_hash_mask: u64,
@@ -136,7 +185,7 @@ impl ContentDefinedChunker {
     /// is the FastCDC optimization that prevents boundaries from appearing too early
     /// in a chunk.
#[inline] - pub fn roll_value_bytes(&mut self, bytes: &[u8]) { + fn roll(&mut self, bytes: &[u8]) { self.chunk_size += bytes.len() as i64; if self.chunk_size < self.min_chunk_size { return; @@ -151,10 +200,30 @@ impl ContentDefinedChunker { } } + /// Feed exactly `N` bytes into the rolling hash (compile-time width). + /// + /// Like [`roll`](Self::roll), but the byte count is known at compile time, + /// allowing the compiler to unroll the inner loop. + #[inline(always)] + fn roll_fixed(&mut self, bytes: &[u8; N]) { + self.chunk_size += N as i64; + if self.chunk_size < self.min_chunk_size { + return; + } + for j in 0..N { + self.rolling_hash = self + .rolling_hash + .wrapping_shl(1) + .wrapping_add(GEARHASH_TABLE[self.nth_run][bytes[j] as usize]); + self.has_matched = + self.has_matched || ((self.rolling_hash & self.rolling_hash_mask) == 0); + } + } + /// Feed a definition or repetition level (i16) into the rolling hash. #[inline] fn roll_level(&mut self, level: i16) { - self.roll_value_bytes(&level.to_le_bytes()); + self.roll_fixed(&level.to_le_bytes()); } /// Check whether a new chunk boundary should be created. @@ -194,91 +263,77 @@ impl ContentDefinedChunker { /// Compute chunk boundaries for the given column data. /// - /// `value_bytes` returns the byte representation of the value at the given index. - /// The chunker feeds these bytes into the rolling hash to determine boundaries. - pub fn get_chunks( + /// The chunking state is maintained across the entire column without being + /// reset between pages and row groups. This enables the chunking process to + /// be continued between different write calls. + /// + /// We go over the (def_level, rep_level, value) triplets one by one while + /// adjusting the column-global rolling hash based on the triplet. Whenever + /// the rolling hash matches a predefined mask it sets `has_matched` to true. 
+ /// + /// After each triplet [`need_new_chunk`](Self::need_new_chunk) is called to + /// evaluate if we need to create a new chunk. + fn calculate( &mut self, def_levels: Option<&[i16]>, rep_levels: Option<&[i16]>, num_levels: usize, - value_bytes: F, + mut roll_value: F, ) -> Vec where - F: Fn(usize) -> B, - B: AsRef<[u8]>, + F: FnMut(&mut Self, usize), { let has_def_levels = self.max_def_level > 0; let has_rep_levels = self.max_rep_level > 0; let mut chunks = Vec::new(); + let mut prev_offset: usize = 0; + let mut prev_value_offset: usize = 0; + // Total number of values seen; for non-nested data this equals num_levels. + let mut total_values: usize = num_levels; if !has_rep_levels && !has_def_levels { // Fastest path: non-nested, non-null data. - // level_offset == value_offset for this case. - let mut prev_offset: usize = 0; for offset in 0..num_levels { - self.roll_value_bytes(value_bytes(offset).as_ref()); + roll_value(self, offset); if self.need_new_chunk() { - let levels_to_write = offset - prev_offset; chunks.push(Chunk { level_offset: prev_offset, value_offset: prev_offset, - num_levels: levels_to_write, - num_values: levels_to_write, + num_levels: offset - prev_offset, + num_values: offset - prev_offset, }); prev_offset = offset; } } - // Last chunk - if prev_offset < num_levels { - let levels_to_write = num_levels - prev_offset; - chunks.push(Chunk { - level_offset: prev_offset, - value_offset: prev_offset, - num_levels: levels_to_write, - num_values: levels_to_write, - }); - } + // Set the previous value offset to add the last chunk. + prev_value_offset = prev_offset; } else if !has_rep_levels { - // Non-nested data with nulls (def levels only). - // level_offset == value_offset for non-nested data. + // Non-nested data with nulls. 
let def_levels = def_levels.expect("def_levels required when max_def_level > 0"); - let mut prev_offset: usize = 0; #[allow(clippy::needless_range_loop)] for offset in 0..num_levels { let def_level = def_levels[offset]; self.roll_level(def_level); if def_level == self.max_def_level { - self.roll_value_bytes(value_bytes(offset).as_ref()); + roll_value(self, offset); } if self.need_new_chunk() { - let levels_to_write = offset - prev_offset; chunks.push(Chunk { level_offset: prev_offset, value_offset: prev_offset, - num_levels: levels_to_write, - num_values: levels_to_write, + num_levels: offset - prev_offset, + num_values: offset - prev_offset, }); prev_offset = offset; } } - // Last chunk - if prev_offset < num_levels { - let levels_to_write = num_levels - prev_offset; - chunks.push(Chunk { - level_offset: prev_offset, - value_offset: prev_offset, - num_levels: levels_to_write, - num_values: levels_to_write, - }); - } + // Set the previous value offset to add the last chunk. + prev_value_offset = prev_offset; } else { - // Nested data (def + rep levels). - // value_offset tracks the leaf value index independently. + // Nested data with nulls. let def_levels = def_levels.expect("def_levels required for nested data"); let rep_levels = rep_levels.expect("rep_levels required for nested data"); - let mut prev_offset: usize = 0; - let mut prev_value_offset: usize = 0; let mut value_offset: usize = 0; for offset in 0..num_levels { @@ -288,13 +343,11 @@ impl ContentDefinedChunker { self.roll_level(def_level); self.roll_level(rep_level); if def_level == self.max_def_level { - self.roll_value_bytes(value_bytes(value_offset).as_ref()); + roll_value(self, value_offset); } - // Boundaries are only created at top-level record boundaries - // (rep_level == 0). Splitting inside a nested record would require - // writing a partial row, which is not valid in Parquet. if rep_level == 0 && self.need_new_chunk() { + // If we are at a record boundary and need a new chunk, create one. 
let levels_to_write = offset - prev_offset; if levels_to_write > 0 { chunks.push(Chunk { @@ -307,22 +360,22 @@ impl ContentDefinedChunker { prev_value_offset = value_offset; } } - // Count a value whenever the definition level reaches the nearest - // repeated ancestor. This tracks position in the Arrow array (which - // includes null inner elements), matching how Arrow encodes lists. if def_level >= self.repeated_ancestor_def_level { + // We only increment the value offset if we have a leaf value. value_offset += 1; } } - // Last chunk - if prev_offset < num_levels { - chunks.push(Chunk { - level_offset: prev_offset, - value_offset: prev_value_offset, - num_levels: num_levels - prev_offset, - num_values: value_offset - prev_value_offset, - }); - } + total_values = value_offset; + } + + // Add the last chunk if we have any levels left. + if prev_offset < num_levels { + chunks.push(Chunk { + level_offset: prev_offset, + value_offset: prev_value_offset, + num_levels: num_levels - prev_offset, + num_values: total_values - prev_value_offset, + }); } #[cfg(debug_assertions)] @@ -331,6 +384,92 @@ impl ContentDefinedChunker { chunks } + /// Compute CDC chunk boundaries by dispatching on the Arrow array's data type + /// to feed value bytes into the rolling hash. + #[cfg(feature = "arrow")] + pub(crate) fn get_arrow_chunks( + &mut self, + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + array: &dyn arrow_array::Array, + ) -> Result> { + use arrow_array::cast::AsArray; + use arrow_schema::DataType; + + let num_levels = match def_levels { + Some(def_levels) => def_levels.len(), + None => array.len(), + }; + + macro_rules! fixed_width { + ($N:literal) => {{ + let data = array.to_data(); + let raw = data.buffers()[0].as_slice(); + self.calculate(def_levels, rep_levels, num_levels, |c, i| { + c.roll_fixed::<$N>(raw[i * $N..(i + 1) * $N].try_into().unwrap()); + }) + }}; + } + + macro_rules! 
binary_like { + ($a:expr) => {{ + let a = $a; + self.calculate(def_levels, rep_levels, num_levels, |c, i| { + c.roll(a.value(i).as_ref()); + }) + }}; + } + + let dtype = array.data_type(); + let chunks = match dtype { + DataType::Null => self.calculate(def_levels, rep_levels, num_levels, |_, _| {}), + DataType::Boolean => { + let a = array.as_boolean(); + self.calculate(def_levels, rep_levels, num_levels, |c, i| { + c.roll_fixed(&[a.value(i) as u8]); + }) + } + DataType::Int8 | DataType::UInt8 => fixed_width!(1), + DataType::Int16 | DataType::UInt16 | DataType::Float16 => fixed_width!(2), + DataType::Int32 + | DataType::UInt32 + | DataType::Float32 + | DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(arrow_schema::IntervalUnit::YearMonth) + | DataType::Decimal32(_, _) => fixed_width!(4), + DataType::Int64 + | DataType::UInt64 + | DataType::Float64 + | DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) + | DataType::Interval(arrow_schema::IntervalUnit::DayTime) + | DataType::Decimal64(_, _) => fixed_width!(8), + DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) + | DataType::Decimal128(_, _) => fixed_width!(16), + DataType::Decimal256(_, _) => fixed_width!(32), + DataType::FixedSizeBinary(_) => binary_like!(array.as_fixed_size_binary()), + DataType::Binary => binary_like!(array.as_binary::()), + DataType::LargeBinary => binary_like!(array.as_binary::()), + DataType::Utf8 => binary_like!(array.as_string::()), + DataType::LargeUtf8 => binary_like!(array.as_string::()), + DataType::BinaryView => binary_like!(array.as_binary_view()), + DataType::Utf8View => binary_like!(array.as_string_view()), + DataType::Dictionary(_, _) => { + let dict = array.as_any_dictionary(); + self.get_arrow_chunks(def_levels, rep_levels, dict.keys())? 
+ } + _ => { + return Err(ParquetError::General(format!( + "content-defined chunking is not supported for data type {dtype:?}", + ))); + } + }; + Ok(chunks) + } + #[cfg(debug_assertions)] fn validate_chunks(&self, chunks: &[Chunk], num_levels: usize) { assert!(!chunks.is_empty(), "chunks must be non-empty"); @@ -419,7 +558,9 @@ mod tests { // Write a small amount of data — should produce exactly 1 chunk. let num_values = 4; - let chunks = chunker.get_chunks(None, None, num_values, |i| (i as i32).to_le_bytes()); + let chunks = chunker.calculate(None, None, num_values, |c, i| { + c.roll_fixed::<4>(&(i as i32).to_le_bytes()); + }); assert_eq!(chunks.len(), 1); assert_eq!(chunks[0].level_offset, 0); assert_eq!(chunks[0].value_offset, 0); @@ -438,7 +579,9 @@ mod tests { // Write enough data to exceed max_chunk_size multiple times. // Each i32 = 4 bytes, max_chunk_size=1024, so ~256 values per chunk max. let num_values = 2000; - let chunks = chunker.get_chunks(None, None, num_values, |i| (i as i32).to_le_bytes()); + let chunks = chunker.calculate(None, None, num_values, |c, i| { + c.roll_fixed::<4>(&(i as i32).to_le_bytes()); + }); // Should have multiple chunks assert!(chunks.len() > 1); @@ -463,13 +606,15 @@ mod tests { norm_level: 0, }; - let roll = |i: usize| (i as i64).to_le_bytes(); + let roll = |c: &mut ContentDefinedChunker, i: usize| { + c.roll_fixed::<8>(&(i as i64).to_le_bytes()); + }; let mut chunker1 = ContentDefinedChunker::new(&make_desc(0, 0), &options).unwrap(); - let chunks1 = chunker1.get_chunks(None, None, 200, roll); + let chunks1 = chunker1.calculate(None, None, 200, roll); let mut chunker2 = ContentDefinedChunker::new(&make_desc(0, 0), &options).unwrap(); - let chunks2 = chunker2.get_chunks(None, None, 200, roll); + let chunks2 = chunker2.calculate(None, None, 200, roll); assert_eq!(chunks1.len(), chunks2.len()); for (a, b) in chunks1.iter().zip(chunks2.iter()) { @@ -494,8 +639,8 @@ mod tests { .map(|i| if i % 3 == 0 { 0 } else { 1 }) .collect(); - 
let chunks = chunker.get_chunks(Some(&def_levels), None, num_levels, |i| { - (i as i32).to_le_bytes() + let chunks = chunker.calculate(Some(&def_levels), None, num_levels, |c, i| { + c.roll_fixed::<4>(&(i as i32).to_le_bytes()); }); assert!(!chunks.is_empty()); @@ -862,6 +1007,29 @@ mod arrow_tests { assert_eq!(concat1, concat2); } + #[test] + fn test_cdc_roundtrip_dictionary() { + let values = StringArray::from_iter_values((0..10_000).map(|i| format!("val_{}", i % 100))); + let array: ArrayRef = Arc::new( + arrow_cast::cast::cast( + &values, + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + ) + .unwrap(), + ); + let schema = Arc::new(Schema::new(vec![Field::new( + "col", + array.data_type().clone(), + false, + )])); + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); + + let data = write_batch_with_cdc(&batch); + let batches = read_batches(&data); + let result = concat_batches(&batches); + assert_eq!(batch.num_rows(), result.num_rows()); + } + #[test] fn test_cdc_roundtrip_list() { let mut builder = ListBuilder::new(arrow_array::builder::Int32Builder::new()); From c2b31ff2a07d404531a605ebec340051e8807862 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sun, 22 Feb 2026 21:15:42 +0100 Subject: [PATCH 07/21] refactor(parquet): hold the cdc chunkers in ArrowWriter --- parquet/src/arrow/arrow_writer/mod.rs | 155 +++++++++++++------------- 1 file changed, 77 insertions(+), 78 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 40feb891632b..2f6cdc1ce6d1 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -17,7 +17,7 @@ //! Contains writer which writes arrow data into parquet data. 
-use crate::column::ContentDefinedChunker; +use crate::column::chunker::ContentDefinedChunker; use bytes::Bytes; use std::io::{Read, Write}; @@ -194,6 +194,10 @@ pub struct ArrowWriter { /// The maximum size in bytes for a row group, or None for unlimited max_row_group_bytes: Option, + + /// CDC chunkers persisted across row groups (one per leaf column). + /// Moved into `ArrowRowGroupWriter` for each row group, then returned on close. + cdc_chunkers: Option>, } impl std::fmt::Debug for ArrowWriter { @@ -263,6 +267,19 @@ impl ArrowWriter { let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&file_writer, arrow_schema.clone()); + let cdc_chunkers = match props_ptr.cdc_options() { + Some(opts) => { + let chunkers = file_writer + .schema_descr() + .columns() + .iter() + .map(|desc| ContentDefinedChunker::new(desc, opts)) + .collect::>>()?; + Some(chunkers) + } + None => None, + }; + Ok(Self { writer: file_writer, in_progress: None, @@ -270,6 +287,7 @@ impl ArrowWriter { row_group_writer_factory, max_row_group_row_count, max_row_group_bytes, + cdc_chunkers, }) } @@ -338,9 +356,10 @@ impl ArrowWriter { let in_progress = match &mut self.in_progress { Some(in_progress) => in_progress, x => { - let rg = self - .row_group_writer_factory - .create_row_group_writer(self.writer.flushed_row_groups().len())?; + let rg = self.row_group_writer_factory.create_row_group_writer( + self.writer.flushed_row_groups().len(), + self.cdc_chunkers.take(), + )?; x.insert(rg) } }; @@ -426,7 +445,7 @@ impl ArrowWriter { }; let (chunks, chunkers) = in_progress.close()?; - self.row_group_writer_factory.cdc_chunkers = chunkers; + self.cdc_chunkers = chunkers; let mut row_group_writer = self.writer.next_row_group()?; for chunk in chunks { @@ -488,9 +507,10 @@ impl ArrowWriter { )] pub fn get_column_writers(&mut self) -> Result> { self.flush()?; - let in_progress = self - .row_group_writer_factory - .create_row_group_writer(self.writer.flushed_row_groups().len())?; + let in_progress = 
self.row_group_writer_factory.create_row_group_writer( + self.writer.flushed_row_groups().len(), + self.cdc_chunkers.take(), + )?; Ok(in_progress.writers) } @@ -860,7 +880,6 @@ impl ArrowColumnChunk { pub struct ArrowColumnWriter { writer: ArrowColumnWriterImpl, chunk: SharedColumnChunk, - chunker: Option, } impl std::fmt::Debug for ArrowColumnWriter { @@ -877,24 +896,28 @@ enum ArrowColumnWriterImpl { impl ArrowColumnWriter { /// Write an [`ArrowLeafColumn`] pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()> { - let levels = &col.0; + self.write_internal(&col.0) + } - if let Some(chunker) = self.chunker.as_mut() { - let chunks = - chunker.get_arrow_chunks(levels.def_levels(), levels.rep_levels(), levels.array())?; + /// Write with content-defined chunking, inserting page flushes at chunk boundaries. + fn write_with_chunker( + &mut self, + col: &ArrowLeafColumn, + chunker: &mut ContentDefinedChunker, + ) -> Result<()> { + let levels = &col.0; + let chunks = + chunker.get_arrow_chunks(levels.def_levels(), levels.rep_levels(), levels.array())?; - let num_chunks = chunks.len(); - for (i, chunk) in chunks.iter().enumerate() { - let chunk_levels = levels.slice_for_chunk(chunk); - self.write_internal(&chunk_levels)?; + let num_chunks = chunks.len(); + for (i, chunk) in chunks.iter().enumerate() { + let chunk_levels = levels.slice_for_chunk(chunk); + self.write_internal(&chunk_levels)?; - // Flush the page after each chunk except the last - if i + 1 < num_chunks { - self.flush_current_page()?; - } + // Flush the page after each chunk except the last + if i + 1 < num_chunks { + self.flush_current_page()?; } - } else { - self.write_internal(levels)?; } Ok(()) } @@ -907,7 +930,7 @@ impl ArrowColumnWriter { Some(dictionary) => { let materialized = arrow_select::take::take(dictionary.values(), dictionary.keys(), None)?; - write_leaf(c, &materialized, levels)?; + write_leaf(c, &materialized, levels)? 
} None => write_leaf(c, leaf, levels)?, }; @@ -980,23 +1003,34 @@ struct ArrowRowGroupWriter { writers: Vec, schema: SchemaRef, buffered_rows: usize, + chunkers: Option>, } impl ArrowRowGroupWriter { - fn new(writers: Vec, arrow: &SchemaRef) -> Self { + fn new( + writers: Vec, + arrow: &SchemaRef, + chunkers: Option>, + ) -> Self { Self { writers, schema: arrow.clone(), buffered_rows: 0, + chunkers, } } fn write(&mut self, batch: &RecordBatch) -> Result<()> { self.buffered_rows += batch.num_rows(); let mut writers = self.writers.iter_mut(); + let mut chunkers = self.chunkers.as_mut().map(|c| c.iter_mut()); for (field, column) in self.schema.fields().iter().zip(batch.columns()) { for leaf in compute_leaves(field.as_ref(), column)? { - writers.next().unwrap().write(&leaf)? + let writer = writers.next().unwrap(); + match chunkers.as_mut().and_then(|c| c.next()) { + Some(chunker) => writer.write_with_chunker(&leaf, chunker)?, + None => writer.write(&leaf)?, + } } } Ok(()) @@ -1010,26 +1044,13 @@ impl ArrowRowGroupWriter { .sum() } - fn close( - self, - ) -> Result<( - Vec, - Option>, - )> { - let mut chunks = Vec::with_capacity(self.writers.len()); - let mut chunkers = Vec::new(); - for mut writer in self.writers { - if let Some(chunker) = writer.chunker.take() { - chunkers.push(chunker); - } - chunks.push(writer.close()?); - } - let chunkers = if chunkers.is_empty() { - None - } else { - Some(chunkers) - }; - Ok((chunks, chunkers)) + fn close(self) -> Result<(Vec, Option>)> { + let chunks = self + .writers + .into_iter() + .map(|writer| writer.close()) + .collect::>>()?; + Ok((chunks, self.chunkers)) } } @@ -1044,9 +1065,6 @@ pub struct ArrowRowGroupWriterFactory { props: WriterPropertiesPtr, #[cfg(feature = "encryption")] file_encryptor: Option>, - /// CDC chunkers persisted across row groups (one per leaf column). - /// `None` when CDC is not enabled. 
- cdc_chunkers: Option>, } impl ArrowRowGroupWriterFactory { @@ -1063,13 +1081,20 @@ impl ArrowRowGroupWriterFactory { props, #[cfg(feature = "encryption")] file_encryptor: file_writer.file_encryptor(), - cdc_chunkers: None, } } - fn create_row_group_writer(&mut self, row_group_index: usize) -> Result { + fn create_row_group_writer( + &mut self, + row_group_index: usize, + chunkers: Option>, + ) -> Result { let writers = self.create_column_writers(row_group_index)?; - Ok(ArrowRowGroupWriter::new(writers, &self.arrow_schema)) + Ok(ArrowRowGroupWriter::new( + writers, + &self.arrow_schema, + chunkers, + )) } /// Create column writers for a new row group, with the given row group index @@ -1088,33 +1113,9 @@ impl ArrowRowGroupWriterFactory { &mut writers, )?; } - let chunkers = match self.cdc_chunkers.take() { - Some(chunkers) => chunkers, - None => match self.create_cdc_chunkers()? { - Some(chunkers) => chunkers, - None => return Ok(writers), - }, - }; - for (writer, chunker) in writers.iter_mut().zip(chunkers) { - writer.chunker = Some(chunker); - } Ok(writers) } - /// Create CDC chunkers for all leaf columns, or `None` if CDC is not enabled. 
- fn create_cdc_chunkers(&self) -> Result>> { - let opts = match self.props.cdc_options() { - Some(opts) => opts, - None => return Ok(None), - }; - self.schema - .columns() - .iter() - .map(|desc| ContentDefinedChunker::new(desc, opts)) - .collect::>>() - .map(Some) - } - #[cfg(feature = "encryption")] fn column_writer_factory(&self, row_group_idx: usize) -> ArrowColumnWriterFactory { ArrowColumnWriterFactory::new() @@ -1221,7 +1222,6 @@ impl ArrowColumnWriterFactory { Ok(ArrowColumnWriter { chunk, writer: ArrowColumnWriterImpl::Column(writer), - chunker: None, }) }; @@ -1233,7 +1233,6 @@ impl ArrowColumnWriterFactory { Ok(ArrowColumnWriter { chunk, writer: ArrowColumnWriterImpl::ByteArray(writer), - chunker: None, }) }; From ad4d2c60f138f7a9dbc7f4e415e5abe2536f6f8a Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 23 Feb 2026 15:25:06 +0100 Subject: [PATCH 08/21] chore(parquet): remove redundant flush_current_page() method --- parquet/src/arrow/arrow_writer/mod.rs | 14 +++++--------- parquet/src/column/mod.rs | 1 + parquet/src/column/writer/mod.rs | 22 +++++++--------------- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 2f6cdc1ce6d1..25da060d9efd 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -914,9 +914,12 @@ impl ArrowColumnWriter { let chunk_levels = levels.slice_for_chunk(chunk); self.write_internal(&chunk_levels)?; - // Flush the page after each chunk except the last + // Add a page break after each chunk except the last if i + 1 < num_chunks { - self.flush_current_page()?; + match &mut self.writer { + ArrowColumnWriterImpl::Column(c) => c.add_data_page()?, + ArrowColumnWriterImpl::ByteArray(c) => c.add_data_page()?, + } } } Ok(()) @@ -942,13 +945,6 @@ impl ArrowColumnWriter { Ok(()) } - fn flush_current_page(&mut self) -> Result<()> { - match &mut self.writer { - ArrowColumnWriterImpl::Column(c) => 
c.flush_current_page(), - ArrowColumnWriterImpl::ByteArray(c) => c.flush_current_page(), - } - } - /// Close this column returning the written [`ArrowColumnChunk`] pub fn close(self) -> Result { let close = match self.writer { diff --git a/parquet/src/column/mod.rs b/parquet/src/column/mod.rs index e2db4fe69159..115c8dd01b80 100644 --- a/parquet/src/column/mod.rs +++ b/parquet/src/column/mod.rs @@ -117,6 +117,7 @@ //! assert_eq!(rep_levels, vec![0, 1, 0, 1, 1]); //! ``` +#[cfg(feature = "arrow")] pub(crate) mod chunker; pub mod page; #[cfg(feature = "encryption")] diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 5a2f46628d68..4c3dbabc2132 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -100,10 +100,13 @@ impl ColumnWriter<'_> { downcast_writer!(self, typed, typed.get_estimated_total_bytes()) } - /// Flush the currently buffered values as a data page. + /// Finalize the currently buffered values as a data page. + /// + /// This is used by content-defined chunking to force a page boundary at + /// content-determined positions. #[cfg(feature = "arrow")] - pub(crate) fn flush_current_page(&mut self) -> Result<()> { - downcast_writer!(self, typed, typed.flush_current_page()) + pub(crate) fn add_data_page(&mut self) -> Result<()> { + downcast_writer!(self, typed, typed.add_data_page()) } /// Close this [`ColumnWriter`], returning the metadata for the column chunk. @@ -602,17 +605,6 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { &self.descr } - /// Flush the currently buffered values as a data page. - /// - /// This is used by content-defined chunking to force a page boundary at - /// content-determined positions. - pub(crate) fn flush_current_page(&mut self) -> Result<()> { - if self.page_metrics.num_buffered_values > 0 { - self.add_data_page()?; - } - Ok(()) - } - /// Finalizes writes and closes the column writer. 
/// Returns total bytes written, total rows written and column chunk metadata.
    pub fn close(mut self) -> Result {
@@ -1018,7 +1010,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
 
     /// Adds data page.
     /// Data page is either buffered in case of dictionary encoding or written directly.
-    fn add_data_page(&mut self) -> Result<()> {
+    pub(crate) fn add_data_page(&mut self) -> Result<()> {
         // Extract encoded values
         let values_data = self.encoder.flush_data_page()?;
 

From 25535756daf0d8a12c729d27234213be7503e5a6 Mon Sep 17 00:00:00 2001
From: Krisztian Szucs
Date: Mon, 23 Feb 2026 16:54:34 +0100
Subject: [PATCH 09/21] doc(parquet): remove content defined chunking example
 from docstrings

---
 parquet/src/arrow/mod.rs | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index 57b350a766c4..52152988166f 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -88,33 +88,6 @@
 //! writer.close().unwrap();
 //! ```
 //!
-//! ## EXPERIMENTAL: Content-Defined Chunking
-//!
-//! Enable content-defined chunking (CDC) via [`WriterProperties`] to improve
-//! deduplication efficiency in content-addressable storage (CAS) systems such as
-//! Hugging Face Hub. CDC creates data page boundaries based on content rather than
-//! fixed sizes, so unchanged data across file versions produces identical byte
-//! sequences that CAS backends can deduplicate at the page level.
-//!
-//! ```no_run
-//! # use parquet::arrow::arrow_writer::ArrowWriter;
-//! # use parquet::file::properties::WriterProperties;
-//! # use std::fs::File;
-//! # use arrow_array::RecordBatch;
-//! # fn write(batch: &RecordBatch) {
-//! let file = File::create("data.parquet").unwrap();
-//! let props = WriterProperties::builder()
-//!     .set_content_defined_chunking(true)
-//!     .build();
-//! let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props)).unwrap();
-//! writer.write(batch).unwrap();
-//! 
writer.close().unwrap(); -//! # } -//! ``` -//! -//! See [`CdcOptions`](crate::file::properties::CdcOptions) for chunk size and -//! normalization level configuration. -//! //! # Example: Reading Parquet file into Arrow `RecordBatch` //! //! ```rust From 5facebbc361725d34c5e2b6565bb48963a999f29 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 25 Feb 2026 09:19:39 +0100 Subject: [PATCH 10/21] chore(parquet): remove unnecessary mut row_group_writer_factory assignments --- parquet/benches/arrow_writer.rs | 2 +- parquet/tests/encryption/encryption_async.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index 140a8780088b..6bcdfc0769d1 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -348,7 +348,7 @@ fn write_batch_with_option( .with_coerce_types(props.coerce_types()) .convert(batch.schema_ref())?; let writer = SerializedFileWriter::new(&mut file, parquet_schema.root_schema_ptr(), props)?; - let mut row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, batch.schema()); + let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, batch.schema()); bench.iter(|| { let mut row_group = row_group_writer_factory.create_column_writers(0).unwrap(); diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index e79279913966..ef0d7056aa05 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -548,7 +548,7 @@ fn spawn_rg_join_and_finalize_task( } fn spawn_parquet_parallel_serialization_task( - mut writer_factory: ArrowRowGroupWriterFactory, + writer_factory: ArrowRowGroupWriterFactory, mut data: Receiver, serialize_tx: Sender>, schema: Arc, @@ -778,7 +778,7 @@ async fn test_multi_threaded_encrypted_writing() { let temp_file = tempfile::tempfile().unwrap(); let mut writer = SerializedFileWriter::new(&temp_file, 
parquet_schema.root_schema_ptr(), props).unwrap(); - let mut row_group_writer_factory = + let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); let (serialize_tx, mut serialize_rx) = From b25f206346687d19e8642b0a0051a23a30ca0158 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 25 Feb 2026 11:06:50 +0100 Subject: [PATCH 11/21] fix(parquet): incorporate primitive array offset when calculating cdc chunks --- parquet/src/arrow/arrow_writer/levels.rs | 10 +- parquet/src/column/chunker/cdc.rs | 134 +++++++++++++++++-- parquet/src/file/properties.rs | 62 ++++++--- parquet/tests/encryption/encryption_async.rs | 3 +- 4 files changed, 175 insertions(+), 34 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 471fca4c58eb..c5c1dcd5864f 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -819,12 +819,16 @@ impl ArrayLevels { .map(|levels| levels[level_offset..level_offset + num_levels].to_vec()); // Filter non_null_indices to [value_offset, value_offset + num_values) - // and shift by -value_offset. + // and shift by -value_offset. Use binary search since the slice is sorted. 
let value_end = value_offset + num_values; - let non_null_indices: Vec = self + let start = self .non_null_indices + .partition_point(|&idx| idx < value_offset); + let end = self + .non_null_indices + .partition_point(|&idx| idx < value_end); + let non_null_indices: Vec = self.non_null_indices[start..end] .iter() - .filter(|&&idx| idx >= value_offset && idx < value_end) .map(|&idx| idx - value_offset) .collect(); diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index b6e2eec6bc43..5696233625d8 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -379,7 +379,7 @@ impl ContentDefinedChunker { } #[cfg(debug_assertions)] - self.validate_chunks(&chunks, num_levels); + self.validate_chunks(&chunks, num_levels, total_values); chunks } @@ -404,9 +404,12 @@ impl ContentDefinedChunker { macro_rules! fixed_width { ($N:literal) => {{ let data = array.to_data(); - let raw = data.buffers()[0].as_slice(); + let buffer = data.buffers()[0].as_slice(); + let values = &buffer[data.offset() * $N..]; self.calculate(def_levels, rep_levels, num_levels, |c, i| { - c.roll_fixed::<$N>(raw[i * $N..(i + 1) * $N].try_into().unwrap()); + let offset = i * $N; + let slice = &values[offset..offset + $N]; + c.roll_fixed::<$N>(slice.try_into().unwrap()); }) }}; } @@ -471,7 +474,7 @@ impl ContentDefinedChunker { } #[cfg(debug_assertions)] - fn validate_chunks(&self, chunks: &[Chunk], num_levels: usize) { + fn validate_chunks(&self, chunks: &[Chunk], num_levels: usize, total_values: usize) { assert!(!chunks.is_empty(), "chunks must be non-empty"); let first = &chunks[0]; @@ -479,22 +482,26 @@ impl ContentDefinedChunker { assert_eq!(first.value_offset, 0, "first chunk must start at value 0"); let mut sum_levels = first.num_levels; + let mut sum_values = first.num_values; for i in 1..chunks.len() { let chunk = &chunks[i]; let prev = &chunks[i - 1]; assert!(chunk.num_levels > 0, "chunk must have levels"); - assert!( - 
chunk.value_offset >= prev.value_offset, - "value offsets must be monotonically increasing" - ); assert_eq!( chunk.level_offset, prev.level_offset + prev.num_levels, - "chunks must be contiguous" + "level offsets must be contiguous" + ); + assert_eq!( + chunk.value_offset, + prev.value_offset + prev.num_values, + "value offsets must be contiguous" ); sum_levels += chunk.num_levels; + sum_values += chunk.num_values; } assert_eq!(sum_levels, num_levels, "chunks must cover all levels"); + assert_eq!(sum_values, total_values, "chunks must cover all values"); let last = chunks.last().unwrap(); assert_eq!( @@ -1618,4 +1625,113 @@ mod arrow_tests { ); } } + + #[test] + fn test_cdc_array_offsets() { + // CDC boundaries are content-defined: once the gear hash converges (within + // a few dozen bytes), both the full and the sliced stream find boundaries + // at the same absolute content positions. Slicing at offset=10 therefore + // produces page lengths of the form: + // + // non-offsetted: [n, a, b, c, ...] + // offsetted: [n-10, a, b, c, ...] + // + // Only the first page is shorter by `offset`; every subsequent page, + // including the last, is identical. + let n = i32_part_length(); // large enough to span many CDC pages + let offset = 10usize; + let full = make_i32_batch(n, 0); + let sliced = full.slice(offset, n - offset); + + let full_data = + write_with_cdc_options(&[&full], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + let sliced_data = + write_with_cdc_options(&[&sliced], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + + // Roundtrip correctness. 
+ let read = read_batches(&sliced_data); + assert_eq!(sliced, concat_batches(&read)); + + let full_pages = get_page_lengths(&full_data, 0); + let sliced_pages = get_page_lengths(&sliced_data, 0); + + assert_eq!(full_pages.len(), 1, "expected single row group"); + assert_eq!(sliced_pages.len(), 1, "expected single row group"); + + let fp = &full_pages[0]; + let sp = &sliced_pages[0]; + + assert!(fp.len() > 1, "expected multiple CDC pages, got {fp:?}"); + assert_eq!(fp.len(), sp.len(), "page count must match"); + + // First page is shorter by exactly `offset`. + assert_eq!( + fp[0] - sp[0], + offset as i64, + "sliced first page should be {offset} values shorter: full={fp:?} sliced={sp:?}" + ); + + // All remaining pages — including the last — are identical. + assert_eq!( + &fp[1..], + &sp[1..], + "pages after the first must be identical: full={fp:?} sliced={sp:?}" + ); + } + + #[test] + fn test_cdc_array_offsets_direct() { + // Call get_arrow_chunks directly on the low-level chunker, bypassing the + // Arrow writer pipeline. The same self-synchronisation property holds: + // + // non-offsetted chunks: [n, a, b, c, ...] + // offsetted chunks: [n-10, a, b, c, ...] + // + // Only the first chunk is shorter by `offset`; all subsequent chunks have + // identical num_values. + use crate::basic::Type as PhysicalType; + use crate::schema::types::{ColumnDescriptor, ColumnPath, Type}; + + let options = CdcOptions { + min_chunk_size: CDC_MIN_CHUNK_SIZE, + max_chunk_size: CDC_MAX_CHUNK_SIZE, + norm_level: 0, + }; + let desc = { + let tp = Type::primitive_type_builder("col", PhysicalType::INT32) + .build() + .unwrap(); + ColumnDescriptor::new(Arc::new(tp), 0, 0, ColumnPath::new(vec![])) + }; + + let n = i32_part_length(); // large enough for multiple CDC chunks + let offset = 10usize; + + // Non-offsetted: plain fresh array of n values. 
+ let array = generate_i32_array(n, 0); + let mut chunker = super::ContentDefinedChunker::new(&desc, &options).unwrap(); + let chunks = chunker.get_arrow_chunks(None, None, &array).unwrap(); + + // Offsetted: same backing buffer sliced by `offset` elements. + let sliced = array.slice(offset, n - offset); + let mut chunker2 = super::ContentDefinedChunker::new(&desc, &options).unwrap(); + let chunks2 = chunker2.get_arrow_chunks(None, None, &sliced).unwrap(); + + let values: Vec = chunks.iter().map(|c| c.num_values).collect(); + let values2: Vec = chunks2.iter().map(|c| c.num_values).collect(); + + assert!(values.len() > 1, "expected multiple chunks, got {values:?}"); + assert_eq!(values.len(), values2.len(), "chunk count must match"); + + assert_eq!( + values[0] - values2[0], + offset, + "offsetted first chunk should be {offset} values shorter: {values:?} vs {values2:?}" + ); + assert_eq!( + &values[1..], + &values2[1..], + "all chunks after the first must be identical: {values:?} vs {values2:?}" + ); + } } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 02a07dd4049d..c3e2b9462fc0 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -64,33 +64,41 @@ pub const DEFAULT_COERCE_TYPES: bool = false; /// EXPERIMENTAL: Options for content-defined chunking (CDC). /// -/// CDC creates data page boundaries based on content rather than fixed sizes, -/// enabling efficient deduplication in content-addressable storage (CAS) systems. -/// When enabled, unchanged data across file versions will produce identical byte -/// sequences, allowing storage-level deduplication. +/// Content-defined chunking is an experimental feature that optimizes parquet +/// files for content addressable storage (CAS) systems by writing data pages +/// according to content-defined chunk boundaries. This allows for more +/// efficient deduplication of data across files, hence more efficient network +/// transfers and storage. 
/// -/// Each content-defined chunk is written as a separate parquet data page. These -/// options control the chunk size and the chunking process. Note that the chunk -/// size is calculated based on the logical value of the data, before any encoding -/// or compression is applied. +/// Each content-defined chunk is written as a separate parquet data page. The +/// following options control the chunks' size and the chunking process. Note +/// that the chunk size is calculated based on the logical value of the data, +/// before any encoding or compression is applied. #[derive(Debug, Clone, Copy)] pub struct CdcOptions { /// Minimum chunk size in bytes, default is 256 KiB. - /// The rolling hash will not be evaluated until this many bytes have been - /// accumulated in the current chunk. All data fed through the hash function - /// counts towards the chunk size, including definition and repetition levels. + /// The rolling hash will not be updated until this size is reached for each chunk. + /// Note that all data sent through the hash function is counted towards the chunk + /// size, including definition and repetition levels if present. pub min_chunk_size: usize, /// Maximum chunk size in bytes, default is 1024 KiB. - /// A chunk boundary is forced when the chunk size reaches this value, - /// regardless of hash state. The parquet writer's `data_page_size_limit` - /// still applies independently. + /// The chunker will create a new chunk whenever the chunk size exceeds this value. + /// Note that the parquet writer has a related `data_page_size_limit` property that + /// controls the maximum size of a parquet data page after encoding. While setting + /// `data_page_size_limit` to a smaller value than `max_chunk_size` doesn't affect + /// the chunking effectiveness, it results in more small parquet data pages. pub max_chunk_size: usize, - /// Normalization level to center the chunk size distribution around the - /// average size more aggressively, default is 0. 
- /// Increasing the normalization level increases the probability of finding - /// a chunk boundary, improving the deduplication ratio, but also increases - /// the number of small chunks. Use 1 or 2 for higher deduplication at the - /// expense of fragmentation. + /// Number of bit adjustment to the gearhash mask in order to center the chunk size + /// around the average size more aggressively, default is 0. + /// Increasing the normalization level increases the probability of finding a chunk, + /// improving the deduplication ratio, but also increasing the number of small chunks + /// resulting in many small parquet data pages. The default value provides a good + /// balance between deduplication ratio and fragmentation. + /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the + /// expense of fragmentation. Negative values can also be used to reduce the + /// probability of finding a chunk, resulting in larger chunks and fewer data pages. + /// Note that values outside [-3, 3] are not recommended, prefer using the default + /// value of 0 for most use cases. pub norm_level: i32, } @@ -825,7 +833,21 @@ impl WriterPropertiesBuilder { /// EXPERIMENTAL: Sets content-defined chunking options, implicitly enabling CDC. /// /// See [`CdcOptions`] for details on each parameter. + /// + /// # Panics + /// + /// Panics if `min_chunk_size == 0` or `max_chunk_size <= min_chunk_size`. 
pub fn set_cdc_options(mut self, options: CdcOptions) -> Self { + assert!( + options.min_chunk_size > 0, + "min_chunk_size must be positive" + ); + assert!( + options.max_chunk_size > options.min_chunk_size, + "max_chunk_size ({}) must be greater than min_chunk_size ({})", + options.max_chunk_size, + options.min_chunk_size + ); self.cdc_options = Some(options); self } diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index ef0d7056aa05..48c844afb99e 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -778,8 +778,7 @@ async fn test_multi_threaded_encrypted_writing() { let temp_file = tempfile::tempfile().unwrap(); let mut writer = SerializedFileWriter::new(&temp_file, parquet_schema.root_schema_ptr(), props).unwrap(); - let row_group_writer_factory = - ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); + let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); let (serialize_tx, mut serialize_rx) = tokio::sync::mpsc::channel::>(1); From a699aefae66f152116d2a6c34addd60cfa73836e Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 25 Feb 2026 11:42:23 +0100 Subject: [PATCH 12/21] chore(parquet): add benchmark for cdc chunking --- parquet/benches/arrow_writer.rs | 12 ++------- parquet/src/column/chunker/cdc.rs | 8 +++--- parquet/src/file/properties.rs | 44 ++++++++++++------------------- parquet/src/lib.rs | 4 +-- 4 files changed, 25 insertions(+), 43 deletions(-) diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index 6bcdfc0769d1..37163d2ef666 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -348,7 +348,7 @@ fn write_batch_with_option( .with_coerce_types(props.coerce_types()) .convert(batch.schema_ref())?; let writer = SerializedFileWriter::new(&mut file, parquet_schema.root_schema_ptr(), props)?; - let row_group_writer_factory = 
ArrowRowGroupWriterFactory::new(&writer, batch.schema()); + let mut row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, batch.schema()); bench.iter(|| { let mut row_group = row_group_writer_factory.create_column_writers(0).unwrap(); @@ -440,16 +440,8 @@ fn create_writer_props() -> Vec<(&'static str, WriterProperties)> { .build(); props.push(("zstd_parquet_2", prop)); - // CDC with small chunk sizes so that boundaries actually trigger within the - // benchmark batch size (~16 KiB for a 4096-row i32 batch). Dictionary encoding - // is disabled because CDC materializes dictionary arrays before hashing. let prop = WriterProperties::builder() - .set_cdc_options(CdcOptions { - min_chunk_size: 4 * 1024, - max_chunk_size: 16 * 1024, - norm_level: 0, - }) - .set_dictionary_enabled(false) + .set_content_defined_chunking(Some(CdcOptions::default())) .build(); props.push(("cdc", prop)); diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index 5696233625d8..cb4eba2c7043 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -721,7 +721,7 @@ mod arrow_tests { fn write_batch_with_cdc(batch: &RecordBatch) -> Vec { let props = WriterProperties::builder() - .set_content_defined_chunking(true) + .set_content_defined_chunking(Some(CdcOptions::default())) .build(); let mut buf = Vec::new(); let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), Some(props)).unwrap(); @@ -773,11 +773,11 @@ mod arrow_tests { let schema = batches[0].schema(); let mut builder = WriterProperties::builder() .set_dictionary_enabled(false) - .set_cdc_options(CdcOptions { + .set_content_defined_chunking(Some(CdcOptions { min_chunk_size, max_chunk_size, norm_level: 0, - }); + })); if let Some(max_rows) = max_row_group_rows { builder = builder.set_max_row_group_row_count(Some(max_rows)); } @@ -998,7 +998,7 @@ mod arrow_tests { let data_one_rg = write_batch_with_cdc(&batch_all); let props = WriterProperties::builder() - 
.set_content_defined_chunking(true) + .set_content_defined_chunking(Some(CdcOptions::default())) .set_max_row_group_row_count(Some(n as usize / 2)) .build(); let mut buf = Vec::new(); diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index c3e2b9462fc0..022105a82030 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -811,7 +811,7 @@ impl WriterPropertiesBuilder { self } - /// EXPERIMENTAL: Enables or disables content-defined chunking with default options. + /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`. /// /// When enabled, data page boundaries are determined by a rolling hash of the /// column values, so unchanged data produces identical byte sequences across @@ -820,35 +820,25 @@ impl WriterPropertiesBuilder { /// /// Only supported through the Arrow writer interface ([`ArrowWriter`]). /// - /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter - pub fn set_content_defined_chunking(mut self, enabled: bool) -> Self { - self.cdc_options = if enabled { - Some(CdcOptions::default()) - } else { - None - }; - self - } - - /// EXPERIMENTAL: Sets content-defined chunking options, implicitly enabling CDC. - /// - /// See [`CdcOptions`] for details on each parameter. - /// /// # Panics /// /// Panics if `min_chunk_size == 0` or `max_chunk_size <= min_chunk_size`. 
- pub fn set_cdc_options(mut self, options: CdcOptions) -> Self { - assert!( - options.min_chunk_size > 0, - "min_chunk_size must be positive" - ); - assert!( - options.max_chunk_size > options.min_chunk_size, - "max_chunk_size ({}) must be greater than min_chunk_size ({})", - options.max_chunk_size, - options.min_chunk_size - ); - self.cdc_options = Some(options); + /// + /// [`ArrowWriter`]: crate::arrow::arrow_writer::ArrowWriter + pub fn set_content_defined_chunking(mut self, options: Option) -> Self { + if let Some(ref options) = options { + assert!( + options.min_chunk_size > 0, + "min_chunk_size must be positive" + ); + assert!( + options.max_chunk_size > options.min_chunk_size, + "max_chunk_size ({}) must be greater than min_chunk_size ({})", + options.max_chunk_size, + options.min_chunk_size + ); + } + self.cdc_options = options; self } diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index b76f6d6670af..b8dec7a3728c 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -78,9 +78,9 @@ //! Enable CDC via [`WriterProperties`]: //! //! ```no_run -//! # use parquet::file::properties::WriterProperties; +//! # use parquet::file::properties::{WriterProperties, CdcOptions}; //! let props = WriterProperties::builder() -//! .set_content_defined_chunking(true) +//! .set_content_defined_chunking(Some(CdcOptions::default())) //! .build(); //! ``` //! 
From 94d2efc95057a0987000523940aca651687d9a23 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 25 Feb 2026 11:53:26 +0100 Subject: [PATCH 13/21] chore(parquet): fix clippy errors --- parquet/tests/encryption/encryption_async.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index 48c844afb99e..c8bed6b5d207 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -548,7 +548,7 @@ fn spawn_rg_join_and_finalize_task( } fn spawn_parquet_parallel_serialization_task( - writer_factory: ArrowRowGroupWriterFactory, + mut writer_factory: ArrowRowGroupWriterFactory, mut data: Receiver, serialize_tx: Sender>, schema: Arc, @@ -713,7 +713,7 @@ async fn test_concurrent_encrypted_writing_over_multiple_row_groups() { let writer = SerializedFileWriter::new(&temp_file, parquet_schema.root_schema_ptr(), props).unwrap(); - let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(schema)); + let mut row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(schema)); let max_row_groups = 1; let (serialize_tx, serialize_rx) = @@ -778,7 +778,7 @@ async fn test_multi_threaded_encrypted_writing() { let temp_file = tempfile::tempfile().unwrap(); let mut writer = SerializedFileWriter::new(&temp_file, parquet_schema.root_schema_ptr(), props).unwrap(); - let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); + let mut row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); let (serialize_tx, mut serialize_rx) = tokio::sync::mpsc::channel::>(1); From caef92e1bb146cf6c71633e7700ff4dcc7819e75 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 25 Feb 2026 12:01:16 +0100 Subject: [PATCH 14/21] refactor(parquet): do not store the chunker in the row group writer --- parquet/benches/arrow_writer.rs | 
2 +- parquet/src/arrow/arrow_writer/mod.rs | 82 +++++++++----------- parquet/tests/encryption/encryption_async.rs | 4 +- 3 files changed, 41 insertions(+), 47 deletions(-) diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index 37163d2ef666..625dfc139898 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -348,7 +348,7 @@ fn write_batch_with_option( .with_coerce_types(props.coerce_types()) .convert(batch.schema_ref())?; let writer = SerializedFileWriter::new(&mut file, parquet_schema.root_schema_ptr(), props)?; - let mut row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, batch.schema()); + let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, batch.schema()); bench.iter(|| { let mut row_group = row_group_writer_factory.create_column_writers(0).unwrap(); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 25da060d9efd..0aca4f2f6edf 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -196,7 +196,6 @@ pub struct ArrowWriter { max_row_group_bytes: Option, /// CDC chunkers persisted across row groups (one per leaf column). - /// Moved into `ArrowRowGroupWriter` for each row group, then returned on close. 
cdc_chunkers: Option>, } @@ -356,10 +355,9 @@ impl ArrowWriter { let in_progress = match &mut self.in_progress { Some(in_progress) => in_progress, x => { - let rg = self.row_group_writer_factory.create_row_group_writer( - self.writer.flushed_row_groups().len(), - self.cdc_chunkers.take(), - )?; + let rg = self + .row_group_writer_factory + .create_row_group_writer(self.writer.flushed_row_groups().len())?; x.insert(rg) } }; @@ -406,7 +404,10 @@ impl ArrowWriter { } } - in_progress.write(batch)?; + match self.cdc_chunkers.as_mut() { + Some(chunkers) => in_progress.write_with_chunkers(batch, chunkers)?, + None => in_progress.write(batch)?, + } let should_flush = self .max_row_group_row_count @@ -444,8 +445,7 @@ impl ArrowWriter { None => return Ok(()), }; - let (chunks, chunkers) = in_progress.close()?; - self.cdc_chunkers = chunkers; + let chunks = in_progress.close()?; let mut row_group_writer = self.writer.next_row_group()?; for chunk in chunks { @@ -507,10 +507,9 @@ impl ArrowWriter { )] pub fn get_column_writers(&mut self) -> Result> { self.flush()?; - let in_progress = self.row_group_writer_factory.create_row_group_writer( - self.writer.flushed_row_groups().len(), - self.cdc_chunkers.take(), - )?; + let in_progress = self + .row_group_writer_factory + .create_row_group_writer(self.writer.flushed_row_groups().len())?; Ok(in_progress.writers) } @@ -999,34 +998,42 @@ struct ArrowRowGroupWriter { writers: Vec, schema: SchemaRef, buffered_rows: usize, - chunkers: Option>, } impl ArrowRowGroupWriter { - fn new( - writers: Vec, - arrow: &SchemaRef, - chunkers: Option>, - ) -> Self { + fn new(writers: Vec, arrow: &SchemaRef) -> Self { Self { writers, schema: arrow.clone(), buffered_rows: 0, - chunkers, } } fn write(&mut self, batch: &RecordBatch) -> Result<()> { self.buffered_rows += batch.num_rows(); let mut writers = self.writers.iter_mut(); - let mut chunkers = self.chunkers.as_mut().map(|c| c.iter_mut()); for (field, column) in 
self.schema.fields().iter().zip(batch.columns()) { for leaf in compute_leaves(field.as_ref(), column)? { - let writer = writers.next().unwrap(); - match chunkers.as_mut().and_then(|c| c.next()) { - Some(chunker) => writer.write_with_chunker(&leaf, chunker)?, - None => writer.write(&leaf)?, - } + writers.next().unwrap().write(&leaf)?; + } + } + Ok(()) + } + + fn write_with_chunkers( + &mut self, + batch: &RecordBatch, + chunkers: &mut [ContentDefinedChunker], + ) -> Result<()> { + self.buffered_rows += batch.num_rows(); + let mut writers = self.writers.iter_mut(); + let mut chunkers = chunkers.iter_mut(); + for (field, column) in self.schema.fields().iter().zip(batch.columns()) { + for leaf in compute_leaves(field.as_ref(), column)? { + writers + .next() + .unwrap() + .write_with_chunker(&leaf, chunkers.next().unwrap())?; } } Ok(()) @@ -1040,13 +1047,11 @@ impl ArrowRowGroupWriter { .sum() } - fn close(self) -> Result<(Vec, Option>)> { - let chunks = self - .writers + fn close(self) -> Result> { + self.writers .into_iter() .map(|writer| writer.close()) - .collect::>>()?; - Ok((chunks, self.chunkers)) + .collect() } } @@ -1080,24 +1085,13 @@ impl ArrowRowGroupWriterFactory { } } - fn create_row_group_writer( - &mut self, - row_group_index: usize, - chunkers: Option>, - ) -> Result { + fn create_row_group_writer(&self, row_group_index: usize) -> Result { let writers = self.create_column_writers(row_group_index)?; - Ok(ArrowRowGroupWriter::new( - writers, - &self.arrow_schema, - chunkers, - )) + Ok(ArrowRowGroupWriter::new(writers, &self.arrow_schema)) } /// Create column writers for a new row group, with the given row group index - pub fn create_column_writers( - &mut self, - row_group_index: usize, - ) -> Result> { + pub fn create_column_writers(&self, row_group_index: usize) -> Result> { let mut writers = Vec::with_capacity(self.arrow_schema.fields.len()); let mut leaves = self.schema.columns().iter(); let column_factory = 
self.column_writer_factory(row_group_index); diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index c8bed6b5d207..3a205de7e87f 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -713,7 +713,7 @@ async fn test_concurrent_encrypted_writing_over_multiple_row_groups() { let writer = SerializedFileWriter::new(&temp_file, parquet_schema.root_schema_ptr(), props).unwrap(); - let mut row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(schema)); + let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(schema)); let max_row_groups = 1; let (serialize_tx, serialize_rx) = @@ -778,7 +778,7 @@ async fn test_multi_threaded_encrypted_writing() { let temp_file = tempfile::tempfile().unwrap(); let mut writer = SerializedFileWriter::new(&temp_file, parquet_schema.root_schema_ptr(), props).unwrap(); - let mut row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); + let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); let (serialize_tx, mut serialize_rx) = tokio::sync::mpsc::channel::>(1); From 947bfdf59af0d6042ef4cfc887c66c85666d82dc Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 25 Feb 2026 12:06:13 +0100 Subject: [PATCH 15/21] chore(parquet): spell out cdc as content_defined_chunking in properties --- parquet/src/arrow/arrow_writer/mod.rs | 4 ++-- parquet/src/file/properties.rs | 16 ++++++++-------- parquet/tests/encryption/encryption_async.rs | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 0aca4f2f6edf..3d7136624118 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -266,7 +266,7 @@ impl ArrowWriter { let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&file_writer, 
arrow_schema.clone()); - let cdc_chunkers = match props_ptr.cdc_options() { + let cdc_chunkers = match props_ptr.content_defined_chunking() { Some(opts) => { let chunkers = file_writer .schema_descr() @@ -818,7 +818,7 @@ impl ArrowColumnChunk { /// .unwrap(); /// /// // Create a factory for building Arrow column writers -/// let mut row_group_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); +/// let row_group_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema)); /// // Create column writers for the 0th row group /// let col_writers = row_group_factory.create_column_writers(0).unwrap(); /// diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 022105a82030..06512f4c2c37 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -218,7 +218,7 @@ pub struct WriterProperties { column_index_truncate_length: Option, statistics_truncate_length: Option, coerce_types: bool, - cdc_options: Option, + content_defined_chunking: Option, #[cfg(feature = "encryption")] pub(crate) file_encryption_properties: Option>, } @@ -418,8 +418,8 @@ impl WriterProperties { /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled. /// /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`] - pub fn cdc_options(&self) -> Option<&CdcOptions> { - self.cdc_options.as_ref() + pub fn content_defined_chunking(&self) -> Option<&CdcOptions> { + self.content_defined_chunking.as_ref() } /// Returns encoding for a data page, when dictionary encoding is enabled. 
@@ -545,7 +545,7 @@ pub struct WriterPropertiesBuilder { column_index_truncate_length: Option, statistics_truncate_length: Option, coerce_types: bool, - cdc_options: Option, + content_defined_chunking: Option, #[cfg(feature = "encryption")] file_encryption_properties: Option>, } @@ -569,7 +569,7 @@ impl Default for WriterPropertiesBuilder { column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH, coerce_types: DEFAULT_COERCE_TYPES, - cdc_options: None, + content_defined_chunking: None, #[cfg(feature = "encryption")] file_encryption_properties: None, } @@ -595,7 +595,7 @@ impl WriterPropertiesBuilder { column_index_truncate_length: self.column_index_truncate_length, statistics_truncate_length: self.statistics_truncate_length, coerce_types: self.coerce_types, - cdc_options: self.cdc_options, + content_defined_chunking: self.content_defined_chunking, #[cfg(feature = "encryption")] file_encryption_properties: self.file_encryption_properties, } @@ -838,7 +838,7 @@ impl WriterPropertiesBuilder { options.min_chunk_size ); } - self.cdc_options = options; + self.content_defined_chunking = options; self } @@ -1125,7 +1125,7 @@ impl From for WriterPropertiesBuilder { column_index_truncate_length: props.column_index_truncate_length, statistics_truncate_length: props.statistics_truncate_length, coerce_types: props.coerce_types, - cdc_options: props.cdc_options, + content_defined_chunking: props.content_defined_chunking, #[cfg(feature = "encryption")] file_encryption_properties: props.file_encryption_properties, } diff --git a/parquet/tests/encryption/encryption_async.rs b/parquet/tests/encryption/encryption_async.rs index 3a205de7e87f..48c844afb99e 100644 --- a/parquet/tests/encryption/encryption_async.rs +++ b/parquet/tests/encryption/encryption_async.rs @@ -548,7 +548,7 @@ fn spawn_rg_join_and_finalize_task( } fn spawn_parquet_parallel_serialization_task( - mut writer_factory: 
ArrowRowGroupWriterFactory, + writer_factory: ArrowRowGroupWriterFactory, mut data: Receiver, serialize_tx: Sender>, schema: Arc, From 7622f22c92ab1a8b586f2f686a6bad67eca71cdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 14 Mar 2026 12:05:39 +0100 Subject: [PATCH 16/21] chore(parquet): apply suggestions from code review Co-authored-by: Ed Seidl --- parquet/src/arrow/arrow_writer/mod.rs | 12 ++++-------- parquet/src/file/properties.rs | 4 +++- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 3d7136624118..d3ee4183bc01 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -266,18 +266,14 @@ impl ArrowWriter { let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&file_writer, arrow_schema.clone()); - let cdc_chunkers = match props_ptr.content_defined_chunking() { - Some(opts) => { - let chunkers = file_writer + let cdc_chunkers = props_ptr.content_defined_chunking().map(|opts| { + file_writer .schema_descr() .columns() .iter() .map(|desc| ContentDefinedChunker::new(desc, opts)) - .collect::>>()?; - Some(chunkers) - } - None => None, - }; + .collect::>>() + }).transpose()?; Ok(Self { writer: file_writer, diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 06512f4c2c37..b2cfc55e41aa 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -83,10 +83,12 @@ pub struct CdcOptions { pub min_chunk_size: usize, /// Maximum chunk size in bytes, default is 1024 KiB. /// The chunker will create a new chunk whenever the chunk size exceeds this value. - /// Note that the parquet writer has a related `data_page_size_limit` property that + /// Note that the parquet writer has a related [`data_page_size_limit`] property that /// controls the maximum size of a parquet data page after encoding. 
While setting /// `data_page_size_limit` to a smaller value than `max_chunk_size` doesn't affect /// the chunking effectiveness, it results in more small parquet data pages. + /// + /// [`data_page_size_limit`]: WriterPropertiesBuilder::set_data_page_size_limit pub max_chunk_size: usize, /// Number of bit adjustment to the gearhash mask in order to center the chunk size /// around the average size more aggressively, default is 0. From 3f087aadf9426edfd5d6db349b14b310ce101214 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 14 Mar 2026 12:40:32 +0100 Subject: [PATCH 17/21] chore: address review comments --- parquet/src/arrow/arrow_writer/mod.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index d3ee4183bc01..99a173a21c96 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -350,12 +350,10 @@ impl ArrowWriter { let in_progress = match &mut self.in_progress { Some(in_progress) => in_progress, - x => { - let rg = self - .row_group_writer_factory - .create_row_group_writer(self.writer.flushed_row_groups().len())?; - x.insert(rg) - } + x => x.insert( + self.row_group_writer_factory + .create_row_group_writer(self.writer.flushed_row_groups().len())?, + ), }; if let Some(max_rows) = self.max_row_group_row_count { @@ -441,10 +439,8 @@ impl ArrowWriter { None => return Ok(()), }; - let chunks = in_progress.close()?; - let mut row_group_writer = self.writer.next_row_group()?; - for chunk in chunks { + for chunk in in_progress.close()? 
{ chunk.append_to_row_group(&mut row_group_writer)?; } row_group_writer.close()?; From 255bec85921f88f57e556c6fc8eca80e7c5138c0 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 14 Mar 2026 12:52:40 +0100 Subject: [PATCH 18/21] chore: address review comments --- parquet/src/arrow/arrow_writer/levels.rs | 3 +++ parquet/src/arrow/arrow_writer/mod.rs | 9 ++++++--- parquet/src/schema/types.rs | 16 +++++++++++++--- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index c5c1dcd5864f..b0872de9b878 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -804,6 +804,9 @@ impl ArrayLevels { } /// Create a sliced view of this `ArrayLevels` for a CDC chunk. + /// + /// Note: `def_levels`, `rep_levels`, and `non_null_indices` are copied (not zero-copy), + /// while `array` is sliced without copying. pub(crate) fn slice_for_chunk(&self, chunk: &Chunk) -> Self { let level_offset = chunk.level_offset; let num_levels = chunk.num_levels; diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 99a173a21c96..2ef71d5745a2 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -266,14 +266,17 @@ impl ArrowWriter { let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&file_writer, arrow_schema.clone()); - let cdc_chunkers = props_ptr.content_defined_chunking().map(|opts| { - file_writer + let cdc_chunkers = props_ptr + .content_defined_chunking() + .map(|opts| { + file_writer .schema_descr() .columns() .iter() .map(|desc| ContentDefinedChunker::new(desc, opts)) .collect::>>() - }).transpose()?; + }) + .transpose()?; Ok(Self { writer: file_writer, diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 8e8398151f35..2925557e7b86 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -875,12 +875,22 @@ 
impl ColumnDescriptor { max_def_level: i16, max_rep_level: i16, path: ColumnPath, + ) -> Self { + Self::new_with_repeated_ancestor(primitive_type, max_def_level, max_rep_level, path, 0) + } + + pub(crate) fn new_with_repeated_ancestor( + primitive_type: TypePtr, + max_def_level: i16, + max_rep_level: i16, + path: ColumnPath, + repeated_ancestor_def_level: i16, ) -> Self { Self { primitive_type, max_def_level, max_rep_level, - repeated_ancestor_def_level: 0, + repeated_ancestor_def_level, path, } } @@ -1240,13 +1250,13 @@ fn build_tree<'a>( Type::PrimitiveType { .. } => { let mut path: Vec = vec![]; path.extend(path_so_far.iter().copied().map(String::from)); - let mut desc = ColumnDescriptor::new( + let desc = ColumnDescriptor::new_with_repeated_ancestor( tp.clone(), max_def_level, max_rep_level, ColumnPath::new(path), + repeated_ancestor_def_level, ); - desc.repeated_ancestor_def_level = repeated_ancestor_def_level; leaves.push(Arc::new(desc)); leaf_to_base.push(root_idx); } From 70940081b3fc6754d7913e561c42c84cdad2ee45 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 14 Mar 2026 13:06:51 +0100 Subject: [PATCH 19/21] chore: add constants for default cdc parameters --- parquet/src/file/properties.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index b2cfc55e41aa..ae15cc6b8263 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -61,6 +61,12 @@ pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option = Some(64); pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false; /// Default values for [`WriterProperties::coerce_types`] pub const DEFAULT_COERCE_TYPES: bool = false; +/// Default minimum chunk size for content-defined chunking: 256 KiB. +pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024; +/// Default maximum chunk size for content-defined chunking: 1024 KiB. 
+pub const DEFAULT_CDC_MAX_CHUNK_SIZE: usize = 1024 * 1024; +/// Default normalization level for content-defined chunking. +pub const DEFAULT_CDC_NORM_LEVEL: i32 = 0; /// EXPERIMENTAL: Options for content-defined chunking (CDC). /// @@ -107,9 +113,9 @@ pub struct CdcOptions { impl Default for CdcOptions { fn default() -> Self { Self { - min_chunk_size: 256 * 1024, - max_chunk_size: 1024 * 1024, - norm_level: 0, + min_chunk_size: DEFAULT_CDC_MIN_CHUNK_SIZE, + max_chunk_size: DEFAULT_CDC_MAX_CHUNK_SIZE, + norm_level: DEFAULT_CDC_NORM_LEVEL, } } } From 3b45dc806c7b571970d63578fd45db5f45cd22b6 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 16 Mar 2026 11:47:04 +0100 Subject: [PATCH 20/21] refactor(parquet): rename Chunk to CdcChunk to avoid confusion with column chunk --- parquet/src/arrow/arrow_writer/levels.rs | 36 ++++++++++++++++-------- parquet/src/column/chunker/cdc.rs | 16 +++++------ parquet/src/column/chunker/mod.rs | 2 +- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index b0872de9b878..d1da24872c49 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -40,7 +40,7 @@ //! //! \[1\] [parquet-format#nested-encoding](https://github.com/apache/parquet-format#nested-encoding) -use crate::column::chunker::Chunk; +use crate::column::chunker::CdcChunk; use crate::errors::{ParquetError, Result}; use arrow_array::cast::AsArray; use arrow_array::{Array, ArrayRef, OffsetSizeTrait}; @@ -807,7 +807,7 @@ impl ArrayLevels { /// /// Note: `def_levels`, `rep_levels`, and `non_null_indices` are copied (not zero-copy), /// while `array` is sliced without copying. 
- pub(crate) fn slice_for_chunk(&self, chunk: &Chunk) -> Self { + pub(crate) fn slice_for_chunk(&self, chunk: &CdcChunk) -> Self { let level_offset = chunk.level_offset; let num_levels = chunk.num_levels; let value_offset = chunk.value_offset; @@ -853,7 +853,7 @@ impl ArrayLevels { #[cfg(test)] mod tests { use super::*; - use crate::column::chunker::Chunk; + use crate::column::chunker::CdcChunk; use arrow_array::builder::*; use arrow_array::types::Int32Type; @@ -2147,7 +2147,11 @@ mod tests { #[test] fn test_slice_for_chunk_flat() { - // Required field (no levels): array [1..=6], slice values 2..5 + // Case 1: required field (max_def_level=0, no def/rep levels stored). + // Array has 6 values; all are non-null so non_null_indices covers every position. + // The chunk selects value_offset=2, num_values=3 → the sub-array [3, 4, 5]. + // Since there are no levels, num_levels=0 and level_offset are irrelevant. + // non_null_indices [0,1,2,3,4,5] filtered to [2,4) and shifted by -2 → [0,1,2]. let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6])); let logical_nulls = array.logical_nulls(); let levels = ArrayLevels { @@ -2159,7 +2163,7 @@ mod tests { array, logical_nulls, }; - let sliced = levels.slice_for_chunk(&Chunk { + let sliced = levels.slice_for_chunk(&CdcChunk { level_offset: 0, num_levels: 0, value_offset: 2, @@ -2170,8 +2174,16 @@ mod tests { assert_eq!(sliced.non_null_indices, vec![0, 1, 2]); assert_eq!(sliced.array.len(), 3); - // Optional field (def levels only): [1, null, 3, null, 5, 6] - // Slice levels 1..4 (def=[0,1,0]), values 1..4 → non_null_indices [2]→[1] + // Case 2: optional field (max_def_level=1, def levels present, no rep levels). 
+ // Array: [Some(1), None, Some(3), None, Some(5), Some(6)] + // def_levels: [1, 0, 1, 0, 1, 1] (1=non-null, 0=null) + // non_null_indices: [0, 2, 4, 5] (array positions of the four non-null values) + // + // The chunk selects level_offset=1, num_levels=3, value_offset=1, num_values=3: + // - def_levels[1..4] = [0, 1, 0] → null, non-null, null + // - sub-array slice(1, 3) = [None, Some(3), None] + // - non_null_indices filtered to [value_offset=1, value_end=4): only index 2 qualifies, + // shifted by -1 → [1] (position of Some(3) within the sliced sub-array) let array: ArrayRef = Arc::new(Int32Array::from(vec![ Some(1), None, @@ -2190,7 +2202,7 @@ mod tests { array, logical_nulls, }; - let sliced = levels.slice_for_chunk(&Chunk { + let sliced = levels.slice_for_chunk(&CdcChunk { level_offset: 1, num_levels: 3, value_offset: 1, @@ -2217,7 +2229,7 @@ mod tests { array, logical_nulls, }; - let sliced = levels.slice_for_chunk(&Chunk { + let sliced = levels.slice_for_chunk(&CdcChunk { level_offset: 2, num_levels: 3, value_offset: 2, @@ -2246,7 +2258,7 @@ mod tests { }; assert_eq!( levels - .slice_for_chunk(&Chunk { + .slice_for_chunk(&CdcChunk { level_offset: 0, num_levels: 1, value_offset: 0, @@ -2258,7 +2270,7 @@ mod tests { // idx 2 in range [1,3), shifted -1 → 1 assert_eq!( levels - .slice_for_chunk(&Chunk { + .slice_for_chunk(&CdcChunk { level_offset: 1, num_levels: 2, value_offset: 1, @@ -2270,7 +2282,7 @@ mod tests { // idx 2 excluded from [1,2) assert_eq!( levels - .slice_for_chunk(&Chunk { + .slice_for_chunk(&CdcChunk { level_offset: 1, num_levels: 1, value_offset: 1, diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index cb4eba2c7043..a1fef9e31995 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -19,7 +19,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::properties::CdcOptions; use crate::schema::types::ColumnDescriptor; -use super::Chunk; +use super::CdcChunk; use 
super::cdc_generated::{GEARHASH_TABLE, NUM_GEARHASH_TABLES}; /// CDC (Content-Defined Chunking) divides data into variable-sized chunks based on @@ -279,7 +279,7 @@ impl ContentDefinedChunker { rep_levels: Option<&[i16]>, num_levels: usize, mut roll_value: F, - ) -> Vec + ) -> Vec where F: FnMut(&mut Self, usize), { @@ -297,7 +297,7 @@ impl ContentDefinedChunker { for offset in 0..num_levels { roll_value(self, offset); if self.need_new_chunk() { - chunks.push(Chunk { + chunks.push(CdcChunk { level_offset: prev_offset, value_offset: prev_offset, num_levels: offset - prev_offset, @@ -319,7 +319,7 @@ impl ContentDefinedChunker { roll_value(self, offset); } if self.need_new_chunk() { - chunks.push(Chunk { + chunks.push(CdcChunk { level_offset: prev_offset, value_offset: prev_offset, num_levels: offset - prev_offset, @@ -350,7 +350,7 @@ impl ContentDefinedChunker { // If we are at a record boundary and need a new chunk, create one. let levels_to_write = offset - prev_offset; if levels_to_write > 0 { - chunks.push(Chunk { + chunks.push(CdcChunk { level_offset: prev_offset, value_offset: prev_value_offset, num_levels: levels_to_write, @@ -370,7 +370,7 @@ impl ContentDefinedChunker { // Add the last chunk if we have any levels left. 
if prev_offset < num_levels { - chunks.push(Chunk { + chunks.push(CdcChunk { level_offset: prev_offset, value_offset: prev_value_offset, num_levels: num_levels - prev_offset, @@ -392,7 +392,7 @@ impl ContentDefinedChunker { def_levels: Option<&[i16]>, rep_levels: Option<&[i16]>, array: &dyn arrow_array::Array, - ) -> Result> { + ) -> Result> { use arrow_array::cast::AsArray; use arrow_schema::DataType; @@ -474,7 +474,7 @@ impl ContentDefinedChunker { } #[cfg(debug_assertions)] - fn validate_chunks(&self, chunks: &[Chunk], num_levels: usize, total_values: usize) { + fn validate_chunks(&self, chunks: &[CdcChunk], num_levels: usize, total_values: usize) { assert!(!chunks.is_empty(), "chunks must be non-empty"); let first = &chunks[0]; diff --git a/parquet/src/column/chunker/mod.rs b/parquet/src/column/chunker/mod.rs index 70613500f556..c4caf18af66b 100644 --- a/parquet/src/column/chunker/mod.rs +++ b/parquet/src/column/chunker/mod.rs @@ -28,7 +28,7 @@ pub(crate) use cdc::ContentDefinedChunker; /// A chunk of data with level and value offsets for record-shredded nested data. #[derive(Debug, Clone, Copy)] -pub(crate) struct Chunk { +pub(crate) struct CdcChunk { /// The start offset of this chunk inside the given levels. pub level_offset: usize, /// The start offset of this chunk inside the given values array. 
From 8dc0e5be1edbd68d5f73d7f324d9b4247f41f5a6 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 18 Mar 2026 22:08:24 +0100 Subject: [PATCH 21/21] test(parquet): closely port the CDC tests from the C++ implementation --- parquet/benches/arrow_writer.rs | 36 +- parquet/src/column/chunker/cdc.rs | 2101 +++++++++++++++++------------ parquet/src/lib.rs | 2 +- 3 files changed, 1269 insertions(+), 870 deletions(-) diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index 625dfc139898..2381941897c7 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -19,7 +19,7 @@ extern crate criterion; use criterion::{Bencher, Criterion, Throughput}; -use parquet::arrow::arrow_writer::{ArrowRowGroupWriterFactory, compute_leaves}; +use parquet::arrow::ArrowWriter; use parquet::basic::{Compression, ZstdLevel}; extern crate arrow; @@ -33,10 +33,8 @@ use arrow::datatypes::*; use arrow::util::bench_util::{create_f16_array, create_f32_array, create_f64_array}; use arrow::{record_batch::RecordBatch, util::data_gen::*}; use arrow_array::RecordBatchOptions; -use parquet::arrow::ArrowSchemaConverter; use parquet::errors::Result; use parquet::file::properties::{CdcOptions, WriterProperties, WriterVersion}; -use parquet::file::writer::SerializedFileWriter; fn create_primitive_bench_batch( size: usize, @@ -342,39 +340,21 @@ fn write_batch_with_option( batch: &RecordBatch, props: Option, ) -> Result<()> { - let mut file = Empty::default(); - let props = Arc::new(props.unwrap_or_default()); - let parquet_schema = ArrowSchemaConverter::new() - .with_coerce_types(props.coerce_types()) - .convert(batch.schema_ref())?; - let writer = SerializedFileWriter::new(&mut file, parquet_schema.root_schema_ptr(), props)?; - let row_group_writer_factory = ArrowRowGroupWriterFactory::new(&writer, batch.schema()); + let props = props.unwrap_or_default(); bench.iter(|| { - let mut row_group = row_group_writer_factory.create_column_writers(0).unwrap(); 
- - let mut writers = row_group.iter_mut(); - for (field, column) in batch - .schema() - .fields() - .iter() - .zip(black_box(batch).columns()) - { - for leaf in compute_leaves(field.as_ref(), column).unwrap() { - writers.next().unwrap().write(&leaf).unwrap() - } - } - - for writer in row_group.into_iter() { - black_box(writer.close()).unwrap(); - } + let mut file = Empty::default(); + let mut writer = + ArrowWriter::try_new(&mut file, batch.schema(), Some(props.clone())).unwrap(); + writer.write(black_box(batch)).unwrap(); + black_box(writer.close()).unwrap(); }); Ok(()) } fn create_batches() -> Vec<(&'static str, RecordBatch)> { - const BATCH_SIZE: usize = 4096; + const BATCH_SIZE: usize = 1024 * 1024; let mut batches = vec![]; diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index a1fef9e31995..f21f58780a6a 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -657,13 +657,14 @@ mod tests { } /// Integration tests that exercise CDC through the Arrow writer/reader roundtrip. +/// Ported from the C++ test suite in `chunker_internal_test.cc`. 
#[cfg(all(test, feature = "arrow"))] mod arrow_tests { use std::borrow::Borrow; use std::sync::Arc; - use arrow_array::builder::ListBuilder; - use arrow_array::{ArrayRef, Float64Array, Int32Array, RecordBatch, StringArray}; + use arrow_array::cast::AsArray; + use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; use crate::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; @@ -671,12 +672,13 @@ mod arrow_tests { use crate::file::properties::{CdcOptions, WriterProperties}; use crate::file::reader::{FileReader, SerializedFileReader}; - // --- Constants --- + // --- Constants matching C++ TestCDCSingleRowGroup --- const CDC_MIN_CHUNK_SIZE: usize = 4 * 1024; const CDC_MAX_CHUNK_SIZE: usize = 16 * 1024; const CDC_PART_SIZE: usize = 128 * 1024; const CDC_EDIT_SIZE: usize = 128; + const CDC_ROW_GROUP_LENGTH: usize = 1024 * 1024; // --- Helpers --- @@ -691,88 +693,332 @@ mod arrow_tests { h } - fn generate_i32_array(length: usize, seed: u64) -> Int32Array { - (0..length) - .map(|i| test_hash(seed, i as u64) as i32) - .collect() - } + /// Generate a deterministic array for any supported data type, matching C++ `GenerateArray`. + fn generate_array(dtype: &DataType, nullable: bool, length: usize, seed: u64) -> ArrayRef { + macro_rules! 
gen_primitive { + ($array_type:ty, $cast:expr) => {{ + if nullable { + let arr: $array_type = (0..length) + .map(|i| { + let val = test_hash(seed, i as u64); + if val % 10 == 0 { + None + } else { + Some($cast(val)) + } + }) + .collect(); + Arc::new(arr) as ArrayRef + } else { + let arr: $array_type = (0..length) + .map(|i| Some($cast(test_hash(seed, i as u64)))) + .collect(); + Arc::new(arr) as ArrayRef + } + }}; + } - fn generate_nullable_i32_array(length: usize, seed: u64) -> Int32Array { - (0..length) - .map(|i| { - let val = test_hash(seed, i as u64); - if val % 10 == 0 { - None + match dtype { + DataType::Boolean => { + if nullable { + let arr: BooleanArray = (0..length) + .map(|i| { + let val = test_hash(seed, i as u64); + if val % 10 == 0 { + None + } else { + Some(val % 2 == 0) + } + }) + .collect(); + Arc::new(arr) } else { - Some(val as i32) + let arr: BooleanArray = (0..length) + .map(|i| Some(test_hash(seed, i as u64) % 2 == 0)) + .collect(); + Arc::new(arr) } - }) - .collect() + } + DataType::Int32 => gen_primitive!(Int32Array, |v: u64| v as i32), + DataType::Int64 => { + gen_primitive!(arrow_array::Int64Array, |v: u64| v as i64) + } + DataType::Float64 => { + gen_primitive!(arrow_array::Float64Array, |v: u64| (v % 100000) as f64 + / 1000.0) + } + DataType::Utf8 => { + let arr: arrow_array::StringArray = if nullable { + (0..length) + .map(|i| { + let val = test_hash(seed, i as u64); + if val % 10 == 0 { + None + } else { + Some(format!("str_{val}")) + } + }) + .collect() + } else { + (0..length) + .map(|i| Some(format!("str_{}", test_hash(seed, i as u64)))) + .collect() + }; + Arc::new(arr) + } + DataType::Binary => { + let arr: arrow_array::BinaryArray = if nullable { + (0..length) + .map(|i| { + let val = test_hash(seed, i as u64); + if val % 10 == 0 { + None + } else { + Some(format!("bin_{val}").into_bytes()) + } + }) + .collect() + } else { + (0..length) + .map(|i| Some(format!("bin_{}", test_hash(seed, i as u64)).into_bytes())) + .collect() + }; 
+ Arc::new(arr) + } + DataType::FixedSizeBinary(size) => { + let size = *size; + let mut builder = arrow_array::builder::FixedSizeBinaryBuilder::new(size); + for i in 0..length { + let val = test_hash(seed, i as u64); + if nullable && val % 10 == 0 { + builder.append_null(); + } else { + let s = format!("bin_{val}"); + let bytes = s.as_bytes(); + let mut buf = vec![0u8; size as usize]; + let copy_len = bytes.len().min(size as usize); + buf[..copy_len].copy_from_slice(&bytes[..copy_len]); + builder.append_value(&buf).unwrap(); + } + } + Arc::new(builder.finish()) + } + DataType::Date32 => { + gen_primitive!(arrow_array::Date32Array, |v: u64| v as i32) + } + DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => { + gen_primitive!(arrow_array::TimestampNanosecondArray, |v: u64| v as i64) + } + _ => panic!("Unsupported test data type: {dtype:?}"), + } } - fn generate_string_array(length: usize, seed: u64) -> StringArray { - (0..length) - .map(|i| { - let val = test_hash(seed, i as u64); - Some(format!("str_{val}")) + /// Generate a RecordBatch with the given schema, matching C++ `GenerateTable`. + fn generate_table(schema: &Arc, length: usize, seed: u64) -> RecordBatch { + let arrays: Vec = schema + .fields() + .iter() + .enumerate() + .map(|(i, field)| { + generate_array( + field.data_type(), + field.is_nullable(), + length, + seed + i as u64 * 10, + ) }) - .collect() + .collect(); + RecordBatch::try_new(schema.clone(), arrays).unwrap() } - fn write_batch_with_cdc(batch: &RecordBatch) -> Vec { - let props = WriterProperties::builder() - .set_content_defined_chunking(Some(CdcOptions::default())) - .build(); - let mut buf = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), Some(props)).unwrap(); - writer.write(batch).unwrap(); - writer.close().unwrap(); - buf + /// Compute the CDC byte width for a data type, matching C++ `bytes_per_record`. + /// Returns 0 for variable-length types. 
+ fn cdc_byte_width(dtype: &DataType) -> usize { + match dtype { + DataType::Boolean => 1, + DataType::Int8 | DataType::UInt8 => 1, + DataType::Int16 | DataType::UInt16 | DataType::Float16 => 2, + DataType::Int32 + | DataType::UInt32 + | DataType::Float32 + | DataType::Date32 + | DataType::Time32(_) => 4, + DataType::Int64 + | DataType::UInt64 + | DataType::Float64 + | DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) => 8, + DataType::Decimal128(_, _) => 16, + DataType::Decimal256(_, _) => 32, + DataType::FixedSizeBinary(n) => *n as usize, + _ => 0, // variable-length + } } - fn write_batch_without_cdc(batch: &RecordBatch) -> Vec { - let mut buf = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None).unwrap(); - writer.write(batch).unwrap(); - writer.close().unwrap(); - buf + /// Compute bytes_per_record for determining part/edit lengths, matching C++. + fn bytes_per_record(dtype: &DataType, nullable: bool) -> usize { + let bw = cdc_byte_width(dtype); + if bw > 0 { + if nullable { bw + 2 } else { bw } + } else { + 16 // variable-length fallback, matching C++ + } } - fn read_batches(data: &[u8]) -> Vec { - let reader = ParquetRecordBatchReaderBuilder::try_new(bytes::Bytes::from(data.to_vec())) - .unwrap() - .build() - .unwrap(); - reader.collect::, _>>().unwrap() + /// Compute the CDC chunk size for an array slice, matching C++ `CalculateCdcSize`. 
+ fn calculate_cdc_size(array: &dyn Array, nullable: bool) -> i64 { + let dtype = array.data_type(); + let bw = cdc_byte_width(dtype); + let result = if bw > 0 { + // Fixed-width: count only non-null values + let valid_count = array.len() - array.null_count(); + (valid_count * bw) as i64 + } else { + // Variable-length: sum of actual byte lengths + match dtype { + DataType::Utf8 => { + let a = array.as_string::(); + (0..a.len()) + .filter(|&i| a.is_valid(i)) + .map(|i| a.value(i).len() as i64) + .sum() + } + DataType::Binary => { + let a = array.as_binary::(); + (0..a.len()) + .filter(|&i| a.is_valid(i)) + .map(|i| a.value(i).len() as i64) + .sum() + } + DataType::LargeBinary => { + let a = array.as_binary::(); + (0..a.len()) + .filter(|&i| a.is_valid(i)) + .map(|i| a.value(i).len() as i64) + .sum() + } + _ => panic!("CDC size calculation not implemented for {dtype:?}"), + } + }; + + if nullable { + // Add 2 bytes per element for definition levels + result + array.len() as i64 * 2 + } else { + result + } + } + + /// Page-level metadata for a single column within a row group. + struct ColumnInfo { + page_lengths: Vec, + has_dictionary_page: bool, } - fn get_data_page_bytes(data: &[u8]) -> Vec> { + /// Extract per-row-group column info from Parquet data. 
+ fn get_column_info(data: &[u8], column_index: usize) -> Vec { let reader = SerializedFileReader::new(bytes::Bytes::from(data.to_vec())).unwrap(); let metadata = reader.metadata(); - let mut pages = Vec::new(); + let mut result = Vec::new(); for rg in 0..metadata.num_row_groups() { let rg_reader = reader.get_row_group(rg).unwrap(); - for col in 0..metadata.row_group(rg).num_columns() { - let col_reader = rg_reader.get_column_page_reader(col).unwrap(); - for page in col_reader { - let page = page.unwrap(); - pages.push(page.buffer().to_vec()); + let col_reader = rg_reader.get_column_page_reader(column_index).unwrap(); + let mut info = ColumnInfo { + page_lengths: Vec::new(), + has_dictionary_page: false, + }; + for page in col_reader { + let page = page.unwrap(); + match page.page_type() { + crate::basic::PageType::DATA_PAGE | crate::basic::PageType::DATA_PAGE_V2 => { + info.page_lengths.push(page.num_values() as i64); + } + crate::basic::PageType::DICTIONARY_PAGE => { + info.has_dictionary_page = true; + } + _ => {} + } + } + result.push(info); + } + result + } + + /// Assert that CDC chunk sizes are within the expected range. + /// Equivalent to C++ `AssertContentDefinedChunkSizes`. 
+ fn assert_cdc_chunk_sizes( + array: &ArrayRef, + info: &ColumnInfo, + nullable: bool, + min_chunk_size: usize, + max_chunk_size: usize, + expect_dictionary_page: bool, + ) { + // Boolean and FixedSizeBinary never produce dictionary pages (matching C++) + let expect_dict = match array.data_type() { + DataType::Boolean | DataType::FixedSizeBinary(_) => false, + _ => expect_dictionary_page, + }; + assert_eq!( + info.has_dictionary_page, + expect_dict, + "dictionary page mismatch for {:?}", + array.data_type() + ); + + let page_lengths = &info.page_lengths; + assert!( + page_lengths.len() > 1, + "CDC should produce multiple pages, got {page_lengths:?}" + ); + + let bw = cdc_byte_width(array.data_type()); + // Only do exact CDC size validation for fixed-width and base binary-like types + if bw > 0 + || matches!( + array.data_type(), + DataType::Utf8 | DataType::Binary | DataType::LargeBinary + ) + { + let mut offset = 0i64; + for (i, &page_len) in page_lengths.iter().enumerate() { + let slice = array.slice(offset as usize, page_len as usize); + let cdc_size = calculate_cdc_size(slice.as_ref(), nullable); + if i < page_lengths.len() - 1 { + assert!( + cdc_size >= min_chunk_size as i64, + "Page {i}: CDC size {cdc_size} < min {min_chunk_size}, pages={page_lengths:?}" + ); } + assert!( + cdc_size <= max_chunk_size as i64, + "Page {i}: CDC size {cdc_size} > max {max_chunk_size}, pages={page_lengths:?}" + ); + offset += page_len; } + assert_eq!( + offset, + array.len() as i64, + "page lengths must sum to array length" + ); } - pages } + /// Write batches with CDC options and validate roundtrip. + /// Matches C++ `WriteTableToBuffer`. 
fn write_with_cdc_options( batches: &[&RecordBatch], min_chunk_size: usize, max_chunk_size: usize, max_row_group_rows: Option, + enable_dictionary: bool, ) -> Vec { assert!(!batches.is_empty()); let schema = batches[0].schema(); let mut builder = WriterProperties::builder() - .set_dictionary_enabled(false) + .set_dictionary_enabled(enable_dictionary) .set_content_defined_chunking(Some(CdcOptions { min_chunk_size, max_chunk_size, @@ -783,37 +1029,43 @@ mod arrow_tests { } let props = builder.build(); let mut buf = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut buf, schema, Some(props)).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); for batch in batches { writer.write(batch).unwrap(); } writer.close().unwrap(); + + // Roundtrip validation (matching C++ WriteTableToBuffer) + let readback = read_batches(&buf); + let original_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + let readback_rows: usize = readback.iter().map(|b| b.num_rows()).sum(); + assert_eq!(original_rows, readback_rows, "Roundtrip row count mismatch"); + if original_rows > 0 { + let original = concat_batches(batches.iter().copied()); + let roundtrip = concat_batches(&readback); + assert_eq!(original, roundtrip, "Roundtrip validation failed"); + } + buf } - fn get_page_lengths(data: &[u8], column_index: usize) -> Vec> { - let reader = SerializedFileReader::new(bytes::Bytes::from(data.to_vec())).unwrap(); - let metadata = reader.metadata(); - let mut result = Vec::new(); - for rg in 0..metadata.num_row_groups() { - let rg_reader = reader.get_row_group(rg).unwrap(); - let col_reader = rg_reader.get_column_page_reader(column_index).unwrap(); - let mut lengths = Vec::new(); - for page in col_reader { - let page = page.unwrap(); - if matches!( - page.page_type(), - crate::basic::PageType::DATA_PAGE | crate::basic::PageType::DATA_PAGE_V2 - ) { - lengths.push(page.num_values() as i64); - } - } - result.push(lengths); - } - result + fn 
read_batches(data: &[u8]) -> Vec { + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes::Bytes::from(data.to_vec())) + .unwrap() + .build() + .unwrap(); + reader.collect::, _>>().unwrap() + } + + fn concat_batches(batches: impl IntoIterator>) -> RecordBatch { + let batches: Vec<_> = batches.into_iter().collect(); + let schema = batches[0].borrow().schema(); + let batches = batches.iter().map(|b| b.borrow()); + arrow_select::concat::concat_batches(&schema, batches).unwrap() } /// LCS-based diff between two sequences of page lengths (ported from C++). + /// Includes the merge-adjacent-diffs post-processing from C++. fn find_differences(first: &[i64], second: &[i64]) -> Vec<(Vec, Vec)> { let n = first.len(); let m = second.len(); @@ -827,7 +1079,6 @@ mod arrow_tests { } } } - // Backtrack to find common elements let mut common = Vec::new(); let (mut i, mut j) = (n, m); while i > 0 && j > 0 { @@ -855,840 +1106,1009 @@ mod arrow_tests { if last_i < n || last_j < m { result.push((first[last_i..].to_vec(), second[last_j..].to_vec())); } - result - } - - fn make_i32_batch(length: usize, seed: u64) -> RecordBatch { - let col: ArrayRef = Arc::new(generate_i32_array(length, seed)); - RecordBatch::try_from_iter(vec![("col", col)]).unwrap() - } - fn concat_batches(batches: impl IntoIterator>) -> RecordBatch { - let batches: Vec<_> = batches.into_iter().collect(); - let schema = batches[0].borrow().schema(); - let batches = batches.iter().map(|b| b.borrow()); - arrow_select::concat::concat_batches(&schema, batches).unwrap() - } - - fn i32_part_length() -> usize { - CDC_PART_SIZE / 4 - } - - fn i32_edit_length() -> usize { - CDC_EDIT_SIZE / 4 - } - - // --- Roundtrip tests --- - - #[test] - fn test_cdc_roundtrip_i32() { - let array: ArrayRef = Arc::new(Int32Array::from_iter(0..10_000)); - let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); - - let data = write_batch_with_cdc(&batch); - let batches = read_batches(&data); - let result = 
concat_batches(&batches); - assert_eq!(batch, result); - } - - #[test] - fn test_cdc_roundtrip_string() { - let values = (0..5_000).map(|i| Some(format!("value_{i}"))); - let array: ArrayRef = Arc::new(StringArray::from_iter(values)); - let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); - - let data = write_batch_with_cdc(&batch); - let batches = read_batches(&data); - let result = concat_batches(&batches); - assert_eq!(batch, result); - } - - #[test] - fn test_cdc_roundtrip_large_binary() { - let mut builder = arrow_array::builder::LargeBinaryBuilder::new(); - for i in 0..5_000u32 { - builder.append_value(format!("value_{i}")); + // Merge adjacent diffs (matching C++ post-processing) + let mut merged: Vec<(Vec, Vec)> = Vec::new(); + for diff in result { + if let Some(prev) = merged.last_mut() { + if prev.0.is_empty() && diff.1.is_empty() { + prev.0 = diff.0; + continue; + } else if prev.1.is_empty() && diff.0.is_empty() { + prev.1 = diff.1; + continue; + } + } + merged.push(diff); } - let array: ArrayRef = Arc::new(builder.finish()); - let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); - - let data = write_batch_with_cdc(&batch); - let batches = read_batches(&data); - let result = concat_batches(&batches); - assert_eq!(batch, result); + merged } - #[test] - fn test_cdc_roundtrip_nullable() { - let values = (0..10_000).map(|i| if i % 7 == 0 { None } else { Some(i) }); - let array: ArrayRef = Arc::new(Int32Array::from_iter(values)); - let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); - - let data = write_batch_with_cdc(&batch); - let batches = read_batches(&data); - let result = concat_batches(&batches); - assert_eq!(batch, result); - } - - #[test] - fn test_cdc_deterministic() { - let values = 0..10_000; - let array: ArrayRef = Arc::new(Int32Array::from_iter(values)); - let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); - - let data1 = write_batch_with_cdc(&batch); - let data2 = 
write_batch_with_cdc(&batch); - assert_eq!(data1, data2, "CDC output must be deterministic"); - } - - #[test] - fn test_cdc_produces_multiple_pages() { - let values = 0..500_000; - let array: ArrayRef = Arc::new(Int32Array::from_iter(values)); - let batch = RecordBatch::try_from_iter(vec![("col", array)]).unwrap(); - - let cdc_data = write_batch_with_cdc(&batch); - let no_cdc_data = write_batch_without_cdc(&batch); - - let cdc_pages = get_data_page_bytes(&cdc_data); - let no_cdc_pages = get_data_page_bytes(&no_cdc_data); - - assert!( - cdc_pages.len() > 1, - "CDC should produce multiple pages, got {}", - cdc_pages.len() - ); - assert!( - cdc_pages.len() >= no_cdc_pages.len(), - "CDC pages {} should be >= non-CDC pages {}", - cdc_pages.len(), - no_cdc_pages.len() - ); - } - - #[test] - fn test_cdc_page_reuse_on_append() { - let n = 500_000; - let original_values = 0..n; - let appended_values = 0..n + 100; - let original: ArrayRef = Arc::new(Int32Array::from_iter(original_values)); - let appended: ArrayRef = Arc::new(Int32Array::from_iter(appended_values)); - - let batch1 = RecordBatch::try_from_iter(vec![("col", original)]).unwrap(); - let batch2 = RecordBatch::try_from_iter(vec![("col", appended)]).unwrap(); - - let pages1 = get_data_page_bytes(&write_batch_with_cdc(&batch1)); - let pages2 = get_data_page_bytes(&write_batch_with_cdc(&batch2)); - - let reused = pages1.iter().filter(|p| pages2.contains(p)).count(); - assert!( - reused > 0, - "At least some pages should be reused after append, pages1={}, pages2={}", - pages1.len(), - pages2.len() - ); - } - - #[test] - fn test_cdc_state_persists_across_row_groups() { - let n = 500_000i32; - let all_data: ArrayRef = Arc::new(Int32Array::from_iter(0..n)); - let batch_all = RecordBatch::try_from_iter(vec![("col", all_data)]).unwrap(); - let schema = batch_all.schema(); - let data_one_rg = write_batch_with_cdc(&batch_all); - - let props = WriterProperties::builder() - 
.set_content_defined_chunking(Some(CdcOptions::default())) - .set_max_row_group_row_count(Some(n as usize / 2)) - .build(); - let mut buf = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); - writer.write(&batch_all).unwrap(); - writer.close().unwrap(); - let data_two_rg = buf; - - let result1 = read_batches(&data_one_rg); - let result2 = read_batches(&data_two_rg); - let concat1 = concat_batches(&result1); - let concat2 = concat_batches(&result2); - assert_eq!(concat1, concat2); - } - - #[test] - fn test_cdc_roundtrip_dictionary() { - let values = StringArray::from_iter_values((0..10_000).map(|i| format!("val_{}", i % 100))); - let array: ArrayRef = Arc::new( - arrow_cast::cast::cast( - &values, - &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - ) - .unwrap(), - ); - let schema = Arc::new(Schema::new(vec![Field::new( - "col", - array.data_type().clone(), - false, - )])); - let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); - - let data = write_batch_with_cdc(&batch); - let batches = read_batches(&data); - let result = concat_batches(&batches); - assert_eq!(batch.num_rows(), result.num_rows()); - } - - #[test] - fn test_cdc_roundtrip_list() { - let mut builder = ListBuilder::new(arrow_array::builder::Int32Builder::new()); - for i in 0..5_000 { - for j in 0..(i % 5) { - builder.values().append_value(i * 10 + j); + /// Assert exact page length differences between original and modified files. + /// Matches C++ `AssertPageLengthDifferences` (full version). 
+ fn assert_page_length_differences( + original: &ColumnInfo, + modified: &ColumnInfo, + exact_equal_diffs: usize, + exact_larger_diffs: usize, + exact_smaller_diffs: usize, + edit_length: i64, + ) { + let diffs = find_differences(&original.page_lengths, &modified.page_lengths); + let expected = exact_equal_diffs + exact_larger_diffs + exact_smaller_diffs; + + if diffs.len() != expected { + eprintln!("Original: {:?}", original.page_lengths); + eprintln!("Modified: {:?}", modified.page_lengths); + for d in &diffs { + eprintln!(" Diff: {:?} vs {:?}", d.0, d.1); } - builder.append(true); } - let list_array: ArrayRef = Arc::new(builder.finish()); - - let batch = RecordBatch::try_from_iter(vec![("col", list_array)]).unwrap(); - - let data = write_batch_with_cdc(&batch); - let batches = read_batches(&data); - let result = concat_batches(&batches); - assert_eq!(batch, result); - } - - #[test] - fn test_cdc_roundtrip_multiple_columns() { - let i32_array: ArrayRef = Arc::new(Int32Array::from_iter(0..10_000)); - let str_array: ArrayRef = Arc::new(StringArray::from_iter( - (0..10_000).map(|i| Some(format!("s{i}"))), - )); - let f64_array: ArrayRef = - Arc::new(Float64Array::from_iter((0..10_000).map(|i| i as f64 * 0.1))); - - let batch = RecordBatch::try_from_iter(vec![ - ("ints", i32_array), - ("strings", str_array), - ("floats", f64_array), - ]) - .unwrap(); - - let data = write_batch_with_cdc(&batch); - let batches = read_batches(&data); - let result = concat_batches(&batches); - assert_eq!(batch, result); - } - - // --- Page-level CDC tests ported from C++ chunker_internal_test.cc --- - - #[test] - fn test_cdc_find_differences() { - let diffs = find_differences(&[1, 2, 3, 4, 5], &[1, 7, 8, 4, 5]); - assert_eq!(diffs.len(), 1); - assert_eq!(diffs[0].0, vec![2, 3]); - assert_eq!(diffs[0].1, vec![7, 8]); - - let diffs = find_differences(&[1, 2, 3], &[1, 2, 3, 4, 5]); - assert_eq!(diffs.len(), 1); - assert!(diffs[0].0.is_empty()); - assert_eq!(diffs[0].1, vec![4, 5]); - - let 
diffs = find_differences(&[], &[]); - assert!(diffs.is_empty()); - } - - #[test] - fn test_cdc_delete_once() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - - let part1 = make_i32_batch(part_len, 0); - let edit = make_i32_batch(edit_len, 1); - let part2 = make_i32_batch(part_len, 100); - - let base = concat_batches([&part1, &edit, &part2]); - let modified = concat_batches([&part1, &part2]); - - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - // Verify roundtrip - let base_result = read_batches(&base_data); - let mod_result = read_batches(&mod_data); - assert_eq!(concat_batches(&base_result), base); - assert_eq!(concat_batches(&mod_result), modified); - - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - assert_eq!(base_pages.len(), 1); - assert_eq!(mod_pages.len(), 1); - - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert_eq!(diffs.len(), 1, "Expected 1 diff, got {diffs:?}"); - let base_sum: i64 = diffs[0].0.iter().sum(); - let mod_sum: i64 = diffs[0].1.iter().sum(); assert_eq!( - base_sum - mod_sum, - edit_len as i64, - "Diff should account for deleted rows" - ); - } - - #[test] - fn test_cdc_insert_once() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - - let part1 = make_i32_batch(part_len, 0); - let edit = make_i32_batch(edit_len, 1); - let part2 = make_i32_batch(part_len, 100); - - let base = concat_batches([&part1, &part2]); - let modified = concat_batches([&part1, &edit, &part2]); - - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - let mod_result = read_batches(&mod_data); - assert_eq!(concat_batches(&mod_result), modified); 
- - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - assert_eq!(base_pages.len(), 1); - assert_eq!(mod_pages.len(), 1); - - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert_eq!(diffs.len(), 1, "Expected 1 diff, got {diffs:?}"); - let base_sum: i64 = diffs[0].0.iter().sum(); - let mod_sum: i64 = diffs[0].1.iter().sum(); - assert_eq!( - mod_sum - base_sum, - edit_len as i64, - "Diff should account for inserted rows" + diffs.len(), + expected, + "Expected {expected} diffs, got {}", + diffs.len() ); - } - #[test] - fn test_cdc_update_once() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - - let part1 = make_i32_batch(part_len, 0); - let edit1 = make_i32_batch(edit_len, 1); - let edit2 = make_i32_batch(edit_len, 2); - let part2 = make_i32_batch(part_len, 100); - - let base = concat_batches([&part1, &edit1, &part2]); - let modified = concat_batches([&part1, &edit2, &part2]); - - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - assert_eq!(base_pages.len(), 1); - assert_eq!(mod_pages.len(), 1); - - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert!(diffs.len() <= 1, "Expected at most 1 diff, got {diffs:?}"); + let (mut eq, mut larger, mut smaller) = (0usize, 0usize, 0usize); for (left, right) in &diffs { let left_sum: i64 = left.iter().sum(); let right_sum: i64 = right.iter().sum(); - assert_eq!( - left_sum, right_sum, - "Update should not change total row count" - ); + if left_sum == right_sum { + eq += 1; + } else if left_sum < right_sum { + larger += 1; + assert_eq!( + left_sum + edit_length, + right_sum, + "Larger diff mismatch: {left_sum} + {edit_length} != {right_sum}" + ); + } else { + smaller 
+= 1; + assert_eq!( + left_sum, + right_sum + edit_length, + "Smaller diff mismatch: {left_sum} != {right_sum} + {edit_length}" + ); + } } + + assert_eq!(eq, exact_equal_diffs, "equal diffs count"); + assert_eq!(larger, exact_larger_diffs, "larger diffs count"); + assert_eq!(smaller, exact_smaller_diffs, "smaller diffs count"); } - #[test] - fn test_cdc_update_twice() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - - let part1 = make_i32_batch(part_len, 0); - let edit1_old = make_i32_batch(edit_len, 1); - let edit1_new = make_i32_batch(edit_len, 2); - let part2 = make_i32_batch(part_len, 100); - let edit2_old = make_i32_batch(edit_len, 3); - let edit2_new = make_i32_batch(edit_len, 4); - let part3 = make_i32_batch(part_len, 200); - - let base = concat_batches([&part1, &edit1_old, &part2, &edit2_old, &part3]); - let modified = concat_batches([&part1, &edit1_new, &part2, &edit2_new, &part3]); - - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - - // A double update may produce 0, 1, or 2 diffs depending on whether the - // edits shift CDC boundaries. What must always hold is that the total row - // count within each diff region is unchanged (updates are row-count-neutral). - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert!(diffs.len() <= 2, "Expected at most 2 diffs, got {diffs:?}"); + /// Assert page length differences for update cases (simplified version). + /// Matches C++ `AssertPageLengthDifferences` (max_equal_diffs overload). 
+ fn assert_page_length_differences_update( + original: &ColumnInfo, + modified: &ColumnInfo, + max_equal_diffs: usize, + ) { + let diffs = find_differences(&original.page_lengths, &modified.page_lengths); + assert!( + diffs.len() <= max_equal_diffs, + "Expected at most {max_equal_diffs} diffs, got {}", + diffs.len() + ); for (left, right) in &diffs { let left_sum: i64 = left.iter().sum(); let right_sum: i64 = right.iter().sum(); assert_eq!( left_sum, right_sum, - "Each update diff should not change total row count" + "Update diff should not change total row count" ); } } - /// Verifies that the `primitive_width` fallback in `get_cdc_chunks` (used for - /// f64 and other fixed-width non-integer types) produces correct CDC boundaries. - #[test] - fn test_cdc_f64_column() { - let part_len = CDC_PART_SIZE / 8; // 8 bytes per f64 - let edit_len = CDC_EDIT_SIZE / 8; - - let schema = Arc::new(Schema::new(vec![Field::new( - "col", - DataType::Float64, - false, - )])); - - let make_batch = |len: usize, seed: u64| { - let array: ArrayRef = Arc::new( - (0..len) - .map(|i| test_hash(seed, i as u64) as f64) - .collect::(), - ); - RecordBatch::try_new(schema.clone(), vec![array]).unwrap() - }; + // --- FindDifferences tests (ported from C++) --- - let part1 = make_batch(part_len, 0); - let edit = make_batch(edit_len, 1); - let part2 = make_batch(part_len, 100); - - let base = concat_batches([&part1, &part2]); - let modified = concat_batches([&part1, &edit, &part2]); - - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - let mod_result = read_batches(&mod_data); - assert_eq!(concat_batches(&mod_result), modified); - - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert_eq!( - diffs.len(), - 1, - "Expected 1 
diff for f64 insert, got {diffs:?}" - ); - let mod_sum: i64 = diffs[0].1.iter().sum(); - let base_sum: i64 = diffs[0].0.iter().sum(); - assert_eq!(mod_sum - base_sum, edit_len as i64); + #[test] + fn test_find_differences_basic() { + let diffs = find_differences(&[1, 2, 3, 4, 5], &[1, 7, 8, 4, 5]); + assert_eq!(diffs.len(), 1); + assert_eq!(diffs[0].0, vec![2, 3]); + assert_eq!(diffs[0].1, vec![7, 8]); } #[test] - fn test_cdc_append() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - - let part1 = make_i32_batch(part_len, 0); - let part2 = make_i32_batch(part_len, 100); - let edit = make_i32_batch(edit_len, 1); - - let base = concat_batches([&part1, &part2]); - let modified = concat_batches([&part1, &part2, &edit]); - - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - assert_eq!(base_pages.len(), 1); - assert_eq!(mod_pages.len(), 1); - - let bp = &base_pages[0]; - let mp = &mod_pages[0]; - - assert!(mp.len() >= bp.len()); - for i in 0..bp.len() - 1 { - assert_eq!(bp[i], mp[i], "Page {i} should be identical"); - } - assert!( - mp[bp.len() - 1] >= bp[bp.len() - 1], - "Last original page should be same or larger in modified" - ); + fn test_find_differences_multiple() { + let diffs = find_differences(&[1, 2, 3, 4, 5, 6, 7], &[1, 8, 9, 4, 10, 6, 11]); + assert_eq!(diffs.len(), 3); + assert_eq!(diffs[0].0, vec![2, 3]); + assert_eq!(diffs[0].1, vec![8, 9]); + assert_eq!(diffs[1].0, vec![5]); + assert_eq!(diffs[1].1, vec![10]); + assert_eq!(diffs[2].0, vec![7]); + assert_eq!(diffs[2].1, vec![11]); } #[test] - fn test_cdc_prepend() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - - let part1 = make_i32_batch(part_len, 0); - let part2 = make_i32_batch(part_len, 100); - 
let edit = make_i32_batch(edit_len, 1); - - let base = concat_batches([&part1, &part2]); - let modified = concat_batches([&edit, &part1, &part2]); - - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - assert_eq!(base_pages.len(), 1); - assert_eq!(mod_pages.len(), 1); - - assert!(mod_pages[0].len() >= base_pages[0].len()); - - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert_eq!(diffs.len(), 1, "Expected 1 diff, got {diffs:?}"); - let base_sum: i64 = diffs[0].0.iter().sum(); - let mod_sum: i64 = diffs[0].1.iter().sum(); - assert_eq!( - mod_sum - base_sum, - edit_len as i64, - "Diff should account for prepended rows" - ); + fn test_find_differences_different_lengths() { + let diffs = find_differences(&[1, 2, 3], &[1, 2, 3, 4, 5]); + assert_eq!(diffs.len(), 1); + assert!(diffs[0].0.is_empty()); + assert_eq!(diffs[0].1, vec![4, 5]); } #[test] - fn test_cdc_empty_table() { - let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Int32, false)])); - let empty = RecordBatch::new_empty(schema.clone()); - let data = write_with_cdc_options(&[&empty], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - let pages = get_page_lengths(&data, 0); - assert!(pages.is_empty(), "Empty table should produce no row groups"); - - let result = read_batches(&data); - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total_rows, 0); + fn test_find_differences_empty() { + let diffs = find_differences(&[], &[]); + assert!(diffs.is_empty()); } #[test] - fn test_cdc_multiple_row_groups_insert() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - let rg_rows = part_len / 2; - - let part1 = make_i32_batch(part_len, 0); - let edit1 = make_i32_batch(edit_len, 1); 
- let edit2 = make_i32_batch(edit_len, 3); - let part2 = make_i32_batch(part_len, 100); - let part3 = make_i32_batch(part_len, 200); - - let base = concat_batches([&part1, &edit1, &part2, &part3]); - let modified = concat_batches([&part1, &edit1, &edit2, &part2, &part3]); - - let base_data = write_with_cdc_options( - &[&base], - CDC_MIN_CHUNK_SIZE, - CDC_MAX_CHUNK_SIZE, - Some(rg_rows), - ); - let mod_data = write_with_cdc_options( - &[&modified], - CDC_MIN_CHUNK_SIZE, - CDC_MAX_CHUNK_SIZE, - Some(rg_rows), - ); - - let base_result = read_batches(&base_data); - let mod_result = read_batches(&mod_data); - assert_eq!(concat_batches(&base_result), base); - assert_eq!(concat_batches(&mod_result), modified); - - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - - assert!(base_pages.len() > 1); - assert_eq!(base_pages.len(), mod_pages.len()); - - assert_eq!(base_pages[0], mod_pages[0]); - assert_eq!(base_pages[1], mod_pages[1]); + fn test_find_differences_changes_at_both_ends() { + let diffs = find_differences(&[1, 2, 3, 4, 5, 6, 7, 8, 9], &[0, 0, 2, 3, 4, 5, 7, 7, 8]); + assert_eq!(diffs.len(), 3); + assert_eq!(diffs[0].0, vec![1]); + assert_eq!(diffs[0].1, vec![0, 0]); + assert_eq!(diffs[1].0, vec![6]); + assert_eq!(diffs[1].1, vec![7]); + assert_eq!(diffs[2].0, vec![9]); + assert!(diffs[2].1.is_empty()); } #[test] - fn test_cdc_multiple_row_groups_append() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - let rg_rows = part_len / 2; - - let part1 = make_i32_batch(part_len, 0); - let edit1 = make_i32_batch(edit_len, 1); - let part2 = make_i32_batch(part_len, 100); - let part3 = make_i32_batch(part_len, 200); - let edit2 = make_i32_batch(edit_len, 3); - - let base = concat_batches([&part1, &edit1, &part2, &part3]); - let modified = concat_batches([&part1, &edit1, &part2, &part3, &edit2]); - - let base_data = write_with_cdc_options( - &[&base], - CDC_MIN_CHUNK_SIZE, - CDC_MAX_CHUNK_SIZE, - 
Some(rg_rows), - ); - let mod_data = write_with_cdc_options( - &[&modified], - CDC_MIN_CHUNK_SIZE, - CDC_MAX_CHUNK_SIZE, - Some(rg_rows), + fn test_find_differences_additional() { + let diffs = find_differences( + &[445, 312, 393, 401, 410, 138, 558, 457], + &[445, 312, 393, 393, 410, 138, 558, 457], ); - - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); - assert!(base_pages.len() > 1); - assert_eq!(base_pages.len(), mod_pages.len()); - - for i in 0..base_pages.len() - 1 { - assert_eq!( - base_pages[i], mod_pages[i], - "Row group {i} pages should be identical" - ); - } + assert_eq!(diffs.len(), 1); + assert_eq!(diffs[0].0, vec![401]); + assert_eq!(diffs[0].1, vec![393]); } - #[test] - fn test_cdc_nullable_column() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - - let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Int32, true)])); + // --- Parameterized single-row-group tests via macro --- - let make_batch = |len, seed| { - RecordBatch::try_new( - schema.clone(), - vec![Arc::new(generate_nullable_i32_array(len, seed)) as _], - ) - .unwrap() - }; + macro_rules! 
cdc_single_rg_tests { + ($mod_name:ident, $dtype:expr, $nullable:expr) => { + mod $mod_name { + use super::*; - let part1 = make_batch(part_len, 0); - let edit = make_batch(edit_len, 1); - let part2 = make_batch(part_len, 100); - - let base = concat_batches([&part1, &part2]); - let modified = concat_batches([&part1, &edit, &part2]); - - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - let mod_result = read_batches(&mod_data); - assert_eq!(concat_batches(&mod_result), modified); + fn config() -> (DataType, bool, usize, usize) { + let dtype: DataType = $dtype; + let nullable: bool = $nullable; + let bpr = bytes_per_record(&dtype, nullable); + let part_length = CDC_PART_SIZE / bpr; + let edit_length = CDC_EDIT_SIZE / bpr; + (dtype, nullable, part_length, edit_length) + } - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); + fn make_schema(dtype: &DataType, nullable: bool) -> Arc { + Arc::new(Schema::new(vec![Field::new("f0", dtype.clone(), nullable)])) + } - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert_eq!(diffs.len(), 1, "Expected 1 diff, got {diffs:?}"); - let mod_sum: i64 = diffs[0].1.iter().sum(); - let base_sum: i64 = diffs[0].0.iter().sum(); - assert_eq!(mod_sum - base_sum, edit_len as i64); - } + #[test] + fn delete_once() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let part1 = generate_table(&schema, part_length, 0); + let part2 = generate_table(&schema, edit_length, 1); + let part3 = generate_table(&schema, part_length, part_length as u64); + + let base = concat_batches([&part1, &part2, &part3]); + let modified = concat_batches([&part1, &part3]); + + for enable_dictionary in [false, true] { + let base_data = write_with_cdc_options( + &[&base], + 
CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + + let base_info = get_column_info(&base_data, 0); + let mod_info = get_column_info(&mod_data, 0); + assert_eq!(base_info.len(), 1); + assert_eq!(mod_info.len(), 1); + + assert_cdc_chunk_sizes( + &base.column(0).clone(), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + assert_cdc_chunk_sizes( + &modified.column(0).clone(), + &mod_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + + assert_page_length_differences( + &base_info[0], + &mod_info[0], + 0, + 0, + 1, + edit_length as i64, + ); + } + } - #[test] - fn test_cdc_string_column() { - let part_len = CDC_PART_SIZE / 16; - let edit_len = CDC_EDIT_SIZE / 16; + #[test] + fn delete_twice() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let part1 = generate_table(&schema, part_length, 0); + let part2 = generate_table(&schema, edit_length, 1); + let part3 = generate_table(&schema, part_length, part_length as u64); + let part4 = generate_table(&schema, edit_length, 2); + let part5 = generate_table(&schema, part_length, 2 * part_length as u64); + + let base = concat_batches([&part1, &part2, &part3, &part4, &part5]); + let modified = concat_batches([&part1, &part3, &part5]); + + for enable_dictionary in [false, true] { + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + + let base_info = get_column_info(&base_data, 0); + let mod_info = 
get_column_info(&mod_data, 0); + assert_eq!(base_info.len(), 1); + assert_eq!(mod_info.len(), 1); + + assert_cdc_chunk_sizes( + &base.column(0).clone(), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + assert_cdc_chunk_sizes( + &modified.column(0).clone(), + &mod_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + + assert_page_length_differences( + &base_info[0], + &mod_info[0], + 0, + 0, + 2, + edit_length as i64, + ); + } + } - let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)])); + #[test] + fn insert_once() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let part1 = generate_table(&schema, part_length, 0); + let part2 = generate_table(&schema, edit_length, 1); + let part3 = generate_table(&schema, part_length, part_length as u64); + + let base = concat_batches([&part1, &part3]); + let modified = concat_batches([&part1, &part2, &part3]); + + for enable_dictionary in [false, true] { + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + + let base_info = get_column_info(&base_data, 0); + let mod_info = get_column_info(&mod_data, 0); + assert_eq!(base_info.len(), 1); + assert_eq!(mod_info.len(), 1); + + assert_cdc_chunk_sizes( + &base.column(0).clone(), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + assert_cdc_chunk_sizes( + &modified.column(0).clone(), + &mod_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + + assert_page_length_differences( + &base_info[0], + &mod_info[0], + 0, + 1, + 0, + edit_length as i64, + ); + } + } - let 
make_batch = |len, seed| { - RecordBatch::try_new( - schema.clone(), - vec![Arc::new(generate_string_array(len, seed)) as _], - ) - .unwrap() - }; + #[test] + fn insert_twice() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let part1 = generate_table(&schema, part_length, 0); + let part2 = generate_table(&schema, edit_length, 1); + let part3 = generate_table(&schema, part_length, part_length as u64); + let part4 = generate_table(&schema, edit_length, 2); + let part5 = generate_table(&schema, part_length, 2 * part_length as u64); + + let base = concat_batches([&part1, &part3, &part5]); + let modified = concat_batches([&part1, &part2, &part3, &part4, &part5]); + + for enable_dictionary in [false, true] { + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + + let base_info = get_column_info(&base_data, 0); + let mod_info = get_column_info(&mod_data, 0); + assert_eq!(base_info.len(), 1); + assert_eq!(mod_info.len(), 1); + + assert_cdc_chunk_sizes( + &base.column(0).clone(), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + assert_cdc_chunk_sizes( + &modified.column(0).clone(), + &mod_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + + assert_page_length_differences( + &base_info[0], + &mod_info[0], + 0, + 2, + 0, + edit_length as i64, + ); + } + } - let part1 = make_batch(part_len, 0); - let edit = make_batch(edit_len, 1); - let part2 = make_batch(part_len, 100); + #[test] + fn update_once() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let part1 = generate_table(&schema, part_length, 0); 
+ let part2 = generate_table(&schema, edit_length, 1); + let part3 = generate_table(&schema, part_length, part_length as u64); + let part4 = generate_table(&schema, edit_length, 2); + + let base = concat_batches([&part1, &part2, &part3]); + let modified = concat_batches([&part1, &part4, &part3]); + + for enable_dictionary in [false, true] { + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + + let base_info = get_column_info(&base_data, 0); + let mod_info = get_column_info(&mod_data, 0); + assert_eq!(base_info.len(), 1); + assert_eq!(mod_info.len(), 1); + + assert_cdc_chunk_sizes( + &base.column(0).clone(), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + assert_cdc_chunk_sizes( + &modified.column(0).clone(), + &mod_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + + assert_page_length_differences_update(&base_info[0], &mod_info[0], 1); + } + } - let base = concat_batches([&part1, &part2]); - let modified = concat_batches([&part1, &edit, &part2]); + #[test] + fn update_twice() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let part1 = generate_table(&schema, part_length, 0); + let part2 = generate_table(&schema, edit_length, 1); + let part3 = generate_table(&schema, part_length, part_length as u64); + let part4 = generate_table(&schema, edit_length, 2); + let part5 = generate_table(&schema, part_length, 2 * part_length as u64); + let part6 = generate_table(&schema, edit_length, 3); + let part7 = generate_table(&schema, edit_length, 4); + + let base = concat_batches([&part1, &part2, &part3, &part4, &part5]); + let modified = concat_batches([&part1, 
&part6, &part3, &part7, &part5]); + + for enable_dictionary in [false, true] { + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + + let base_info = get_column_info(&base_data, 0); + let mod_info = get_column_info(&mod_data, 0); + assert_eq!(base_info.len(), 1); + assert_eq!(mod_info.len(), 1); + + assert_cdc_chunk_sizes( + &base.column(0).clone(), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + assert_cdc_chunk_sizes( + &modified.column(0).clone(), + &mod_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + + assert_page_length_differences_update(&base_info[0], &mod_info[0], 2); + } + } - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + #[test] + fn prepend() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let part1 = generate_table(&schema, part_length, 0); + let part2 = generate_table(&schema, edit_length, 1); + let part3 = generate_table(&schema, part_length, part_length as u64); + let part4 = generate_table(&schema, edit_length, 2); + + let base = concat_batches([&part1, &part2, &part3]); + let modified = concat_batches([&part4, &part1, &part2, &part3]); + + for enable_dictionary in [false, true] { + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + 
enable_dictionary, + ); + + let base_info = get_column_info(&base_data, 0); + let mod_info = get_column_info(&mod_data, 0); + assert_eq!(base_info.len(), 1); + assert_eq!(mod_info.len(), 1); + + assert_cdc_chunk_sizes( + &base.column(0).clone(), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + assert_cdc_chunk_sizes( + &modified.column(0).clone(), + &mod_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + + assert!( + mod_info[0].page_lengths.len() >= base_info[0].page_lengths.len(), + "Modified should have same or more pages" + ); + + assert_page_length_differences( + &base_info[0], + &mod_info[0], + 0, + 1, + 0, + edit_length as i64, + ); + } + } - let mod_result = read_batches(&mod_data); - assert_eq!(concat_batches(&mod_result), modified); + #[test] + fn append() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let part1 = generate_table(&schema, part_length, 0); + let part2 = generate_table(&schema, edit_length, 1); + let part3 = generate_table(&schema, part_length, part_length as u64); + let part4 = generate_table(&schema, edit_length, 2); + + let base = concat_batches([&part1, &part2, &part3]); + let modified = concat_batches([&part1, &part2, &part3, &part4]); + + for enable_dictionary in [false, true] { + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + + let base_info = get_column_info(&base_data, 0); + let mod_info = get_column_info(&mod_data, 0); + assert_eq!(base_info.len(), 1); + assert_eq!(mod_info.len(), 1); + + assert_cdc_chunk_sizes( + &base.column(0).clone(), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + 
enable_dictionary, + ); + assert_cdc_chunk_sizes( + &modified.column(0).clone(), + &mod_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + enable_dictionary, + ); + + let bp = &base_info[0].page_lengths; + let mp = &mod_info[0].page_lengths; + assert!(mp.len() >= bp.len()); + for i in 0..bp.len() - 1 { + assert_eq!(bp[i], mp[i], "Page {i} should be identical"); + } + assert!(mp[bp.len() - 1] >= bp[bp.len() - 1]); + } + } - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); + #[test] + fn empty_table() { + let (dtype, nullable, _, _) = config(); + let schema = make_schema(&dtype, nullable); + + let empty = RecordBatch::new_empty(schema); + for enable_dictionary in [false, true] { + let data = write_with_cdc_options( + &[&empty], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + enable_dictionary, + ); + let info = get_column_info(&data, 0); + // Empty table: either no row groups or one with no data pages + if !info.is_empty() { + assert!(info[0].page_lengths.is_empty()); + } + } + } - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert_eq!( - diffs.len(), - 1, - "Expected 1 diff for string insert, got {diffs:?}" - ); - let mod_sum: i64 = diffs[0].1.iter().sum(); - let base_sum: i64 = diffs[0].0.iter().sum(); - assert_eq!(mod_sum - base_sum, edit_len as i64); + #[test] + fn array_offsets() { + let (dtype, nullable, part_length, edit_length) = config(); + let schema = make_schema(&dtype, nullable); + + let table = concat_batches([ + &generate_table(&schema, part_length, 0), + &generate_table(&schema, edit_length, 1), + &generate_table(&schema, part_length, part_length as u64), + ]); + + for offset in [0usize, 512, 1024] { + if offset >= table.num_rows() { + continue; + } + let sliced = table.slice(offset, table.num_rows() - offset); + let data = write_with_cdc_options( + &[&sliced], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(CDC_ROW_GROUP_LENGTH), + 
true, + ); + let info = get_column_info(&data, 0); + assert_eq!(info.len(), 1); + + // Verify CDC actually produced content-defined chunks + assert_cdc_chunk_sizes( + &sliced.column(0).clone(), + &info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + true, + ); + } + } + } + }; } - #[test] - fn test_cdc_delete_twice() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); - - let part1 = make_i32_batch(part_len, 0); - let edit1 = make_i32_batch(edit_len, 1); - let part2 = make_i32_batch(part_len, 100); - let edit2 = make_i32_batch(edit_len, 2); - let part3 = make_i32_batch(part_len, 200); - - let base = concat_batches([&part1, &edit1, &part2, &edit2, &part3]); - let modified = concat_batches([&part1, &part2, &part3]); + // Instantiate for representative types matching C++ categories + cdc_single_rg_tests!(cdc_bool_non_null, DataType::Boolean, false); + cdc_single_rg_tests!(cdc_i32_non_null, DataType::Int32, false); + cdc_single_rg_tests!(cdc_i64_nullable, DataType::Int64, true); + cdc_single_rg_tests!(cdc_f64_nullable, DataType::Float64, true); + cdc_single_rg_tests!(cdc_utf8_non_null, DataType::Utf8, false); + cdc_single_rg_tests!(cdc_binary_nullable, DataType::Binary, true); + cdc_single_rg_tests!(cdc_fsb16_nullable, DataType::FixedSizeBinary(16), true); + cdc_single_rg_tests!(cdc_date32_non_null, DataType::Date32, false); + cdc_single_rg_tests!( + cdc_timestamp_nullable, + DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None), + true + ); + + // --- Multiple row group tests matching C++ TestCDCMultipleRowGroups --- + + mod cdc_multiple_row_groups { + use super::*; + + const PART_LENGTH: usize = 128 * 1024; + const EDIT_LENGTH: usize = 128; + const ROW_GROUP_LENGTH: usize = 64 * 1024; + + fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("int32", DataType::Int32, true), + Field::new("float64", DataType::Float64, true), + Field::new("bool", DataType::Boolean, false), + ])) + } - let base_data = - 
write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + #[test] + fn insert_once() { + let s = schema(); + let part1 = generate_table(&s, PART_LENGTH, 0); + let part2 = generate_table(&s, PART_LENGTH, 2); + let part3 = generate_table(&s, PART_LENGTH, 4); + let edit1 = generate_table(&s, EDIT_LENGTH, 1); + let edit2 = generate_table(&s, EDIT_LENGTH, 3); + + let base = concat_batches([&part1, &edit1, &part2, &part3]); + let modified = concat_batches([&part1, &edit1, &edit2, &part2, &part3]); + assert_eq!(modified.num_rows(), base.num_rows() + EDIT_LENGTH); + + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(ROW_GROUP_LENGTH), + false, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(ROW_GROUP_LENGTH), + false, + ); - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); + for col in 0..s.fields().len() { + let base_info = get_column_info(&base_data, col); + let mod_info = get_column_info(&mod_data, col); + + assert_eq!(base_info.len(), 7, "expected 7 row groups for col {col}"); + assert_eq!(mod_info.len(), 7); + + // First two row groups should be identical + assert_eq!(base_info[0].page_lengths, mod_info[0].page_lengths); + assert_eq!(base_info[1].page_lengths, mod_info[1].page_lengths); + + // Middle row groups: 1 larger + 1 smaller diff + for i in 2..mod_info.len() - 1 { + assert_page_length_differences( + &base_info[i], + &mod_info[i], + 0, + 1, + 1, + EDIT_LENGTH as i64, + ); + } + // Last row group: just larger + assert_page_length_differences( + base_info.last().unwrap(), + mod_info.last().unwrap(), + 0, + 1, + 0, + EDIT_LENGTH as i64, + ); + } + } - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert_eq!( - diffs.len(), - 2, - "Expected 2 diffs for 
double delete, got {diffs:?}" - ); - for (left, right) in &diffs { - let left_sum: i64 = left.iter().sum(); - let right_sum: i64 = right.iter().sum(); - assert_eq!( - left_sum - right_sum, - edit_len as i64, - "Each diff should account for one deletion" + #[test] + fn delete_once() { + let s = schema(); + let part1 = generate_table(&s, PART_LENGTH, 0); + let part2 = generate_table(&s, PART_LENGTH, 2); + let part3 = generate_table(&s, PART_LENGTH, 4); + let edit1 = generate_table(&s, EDIT_LENGTH, 1); + let edit2 = generate_table(&s, EDIT_LENGTH, 3); + + let base = concat_batches([&part1, &edit1, &part2, &part3, &edit2]); + let modified = concat_batches([&part1, &part2, &part3, &edit2]); + + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(ROW_GROUP_LENGTH), + false, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(ROW_GROUP_LENGTH), + false, ); - } - } - #[test] - fn test_cdc_insert_twice() { - let part_len = i32_part_length(); - let edit_len = i32_edit_length(); + for col in 0..s.fields().len() { + let base_info = get_column_info(&base_data, col); + let mod_info = get_column_info(&mod_data, col); + + assert_eq!(base_info.len(), 7); + assert_eq!(mod_info.len(), 7); + + assert_eq!(base_info[0].page_lengths, mod_info[0].page_lengths); + assert_eq!(base_info[1].page_lengths, mod_info[1].page_lengths); + + for i in 2..mod_info.len() - 1 { + assert_page_length_differences( + &base_info[i], + &mod_info[i], + 0, + 1, + 1, + EDIT_LENGTH as i64, + ); + } + assert_page_length_differences( + base_info.last().unwrap(), + mod_info.last().unwrap(), + 0, + 0, + 1, + EDIT_LENGTH as i64, + ); + } + } - let part1 = make_i32_batch(part_len, 0); - let edit1 = make_i32_batch(edit_len, 1); - let part2 = make_i32_batch(part_len, 100); - let edit2 = make_i32_batch(edit_len, 2); - let part3 = make_i32_batch(part_len, 200); + #[test] + fn update_once() { + let s = schema(); + 
let part1 = generate_table(&s, PART_LENGTH, 0); + let part2 = generate_table(&s, PART_LENGTH, 2); + let part3 = generate_table(&s, PART_LENGTH, 4); + let edit1 = generate_table(&s, EDIT_LENGTH, 1); + let edit2 = generate_table(&s, EDIT_LENGTH, 3); + let edit3 = generate_table(&s, EDIT_LENGTH, 5); + + let base = concat_batches([&part1, &edit1, &part2, &part3, &edit2]); + let modified = concat_batches([&part1, &edit3, &part2, &part3, &edit2]); + + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(ROW_GROUP_LENGTH), + false, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(ROW_GROUP_LENGTH), + false, + ); - let base = concat_batches([&part1, &part2, &part3]); - let modified = concat_batches([&part1, &edit1, &part2, &edit2, &part3]); + for col in 0..s.fields().len() { + let nullable = s.field(col).is_nullable(); + let base_info = get_column_info(&base_data, col); + let mod_info = get_column_info(&mod_data, col); + + assert_eq!(base_info.len(), 7); + assert_eq!(mod_info.len(), 7); + + // Validate CDC chunk sizes on at least the first row group + assert_cdc_chunk_sizes( + &base.column(col).slice(0, ROW_GROUP_LENGTH), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + false, + ); + + assert_eq!(base_info[0].page_lengths, mod_info[0].page_lengths); + assert_eq!(base_info[1].page_lengths, mod_info[1].page_lengths); + + // Row group containing the edit + assert_page_length_differences_update(&base_info[2], &mod_info[2], 1); + + // Remaining row groups should be identical + for i in 3..mod_info.len() { + assert_eq!(base_info[i].page_lengths, mod_info[i].page_lengths); + } + } + } - let base_data = - write_with_cdc_options(&[&base], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let mod_data = - write_with_cdc_options(&[&modified], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); + #[test] + fn append() { + let s = schema(); + let part1 = 
generate_table(&s, PART_LENGTH, 0); + let part2 = generate_table(&s, PART_LENGTH, 2); + let part3 = generate_table(&s, PART_LENGTH, 4); + let edit1 = generate_table(&s, EDIT_LENGTH, 1); + let edit2 = generate_table(&s, EDIT_LENGTH, 3); + + let base = concat_batches([&part1, &edit1, &part2, &part3]); + let modified = concat_batches([&part1, &edit1, &part2, &part3, &edit2]); + + let base_data = write_with_cdc_options( + &[&base], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(ROW_GROUP_LENGTH), + false, + ); + let mod_data = write_with_cdc_options( + &[&modified], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + Some(ROW_GROUP_LENGTH), + false, + ); - let base_pages = get_page_lengths(&base_data, 0); - let mod_pages = get_page_lengths(&mod_data, 0); + for col in 0..s.fields().len() { + let nullable = s.field(col).is_nullable(); + let base_info = get_column_info(&base_data, col); + let mod_info = get_column_info(&mod_data, col); + + assert_eq!(base_info.len(), 7); + assert_eq!(mod_info.len(), 7); + + // Validate CDC chunk sizes on the first row group + assert_cdc_chunk_sizes( + &base.column(col).slice(0, ROW_GROUP_LENGTH), + &base_info[0], + nullable, + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + false, + ); + + // All row groups except last should be identical + for i in 0..base_info.len() - 1 { + assert_eq!(base_info[i].page_lengths, mod_info[i].page_lengths); + } - let diffs = find_differences(&base_pages[0], &mod_pages[0]); - assert_eq!( - diffs.len(), - 2, - "Expected 2 diffs for double insert, got {diffs:?}" - ); - for (left, right) in &diffs { - let left_sum: i64 = left.iter().sum(); - let right_sum: i64 = right.iter().sum(); - assert_eq!( - right_sum - left_sum, - edit_len as i64, - "Each diff should account for one insertion" - ); + // Last row group: pages should be identical except last + let bp = &base_info.last().unwrap().page_lengths; + let mp = &mod_info.last().unwrap().page_lengths; + assert!(mp.len() >= bp.len()); + for i in 0..bp.len() - 1 { + 
assert_eq!(bp[i], mp[i]); + } + } } } - #[test] - fn test_cdc_array_offsets() { - // CDC boundaries are content-defined: once the gear hash converges (within - // a few dozen bytes), both the full and the sliced stream find boundaries - // at the same absolute content positions. Slicing at offset=10 therefore - // produces page lengths of the form: - // - // non-offsetted: [n, a, b, c, ...] - // offsetted: [n-10, a, b, c, ...] - // - // Only the first page is shorter by `offset`; every subsequent page, - // including the last, is identical. - let n = i32_part_length(); // large enough to span many CDC pages - let offset = 10usize; - let full = make_i32_batch(n, 0); - let sliced = full.slice(offset, n - offset); - - let full_data = - write_with_cdc_options(&[&full], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - let sliced_data = - write_with_cdc_options(&[&sliced], CDC_MIN_CHUNK_SIZE, CDC_MAX_CHUNK_SIZE, None); - - // Roundtrip correctness. - let read = read_batches(&sliced_data); - assert_eq!(sliced, concat_batches(&read)); - - let full_pages = get_page_lengths(&full_data, 0); - let sliced_pages = get_page_lengths(&sliced_data, 0); - - assert_eq!(full_pages.len(), 1, "expected single row group"); - assert_eq!(sliced_pages.len(), 1, "expected single row group"); - - let fp = &full_pages[0]; - let sp = &sliced_pages[0]; - - assert!(fp.len() > 1, "expected multiple CDC pages, got {fp:?}"); - assert_eq!(fp.len(), sp.len(), "page count must match"); - - // First page is shorter by exactly `offset`. - assert_eq!( - fp[0] - sp[0], - offset as i64, - "sliced first page should be {offset} values shorter: full={fp:?} sliced={sp:?}" - ); - - // All remaining pages — including the last — are identical. 
- assert_eq!( - &fp[1..], - &sp[1..], - "pages after the first must be identical: full={fp:?} sliced={sp:?}" - ); - } + // --- Direct chunker test (kept from original) --- #[test] fn test_cdc_array_offsets_direct() { - // Call get_arrow_chunks directly on the low-level chunker, bypassing the - // Arrow writer pipeline. The same self-synchronisation property holds: - // - // non-offsetted chunks: [n, a, b, c, ...] - // offsetted chunks: [n-10, a, b, c, ...] - // - // Only the first chunk is shorter by `offset`; all subsequent chunks have - // identical num_values. use crate::basic::Type as PhysicalType; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type}; @@ -1704,15 +2124,14 @@ mod arrow_tests { ColumnDescriptor::new(Arc::new(tp), 0, 0, ColumnPath::new(vec![])) }; - let n = i32_part_length(); // large enough for multiple CDC chunks + let bpr = bytes_per_record(&DataType::Int32, false); + let n = CDC_PART_SIZE / bpr; let offset = 10usize; - // Non-offsetted: plain fresh array of n values. - let array = generate_i32_array(n, 0); + let array: Int32Array = (0..n).map(|i| test_hash(0, i as u64) as i32).collect(); let mut chunker = super::ContentDefinedChunker::new(&desc, &options).unwrap(); let chunks = chunker.get_arrow_chunks(None, None, &array).unwrap(); - // Offsetted: same backing buffer sliced by `offset` elements. 
let sliced = array.slice(offset, n - offset); let mut chunker2 = super::ContentDefinedChunker::new(&desc, &options).unwrap(); let chunks2 = chunker2.get_arrow_chunks(None, None, &sliced).unwrap(); @@ -1726,12 +2145,12 @@ mod arrow_tests { assert_eq!( values[0] - values2[0], offset, - "offsetted first chunk should be {offset} values shorter: {values:?} vs {values2:?}" + "offsetted first chunk should be {offset} values shorter" ); assert_eq!( &values[1..], &values2[1..], - "all chunks after the first must be identical: {values:?} vs {values2:?}" + "all chunks after the first must be identical" ); } } diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index b8dec7a3728c..916892fafeae 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -77,7 +77,7 @@ //! //! Enable CDC via [`WriterProperties`]: //! -//! ```no_run +//! ```rust //! # use parquet::file::properties::{WriterProperties, CdcOptions}; //! let props = WriterProperties::builder() //! .set_content_defined_chunking(Some(CdcOptions::default()))