Skip to content

Commit 85bcfa1

Browse files
committed
refactor: rename config CdcOptions to ParquetCdcOptions
Rename the DataFusion config struct `CdcOptions` to `ParquetCdcOptions` for explicitness and consistency with the other parquet sub-option structs (`ParquetColumnOptions`, `ParquetEncryptionOptions`), and to disambiguate from the unrelated "change data capture" meaning of CDC. Only the config type is renamed; parquet-rs's `CdcOptions` and the proto `parquet_options::CdcOptions` message keep their names (the latter already carries the parquet context via the module path).
1 parent 4246c25 commit 85bcfa1

6 files changed

Lines changed: 42 additions & 39 deletions

File tree

datafusion/common/src/config.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -713,11 +713,11 @@ config_namespace! {
713713
/// Options for content-defined chunking (CDC) when writing parquet files.
714714
/// Mirrors `parquet::file::properties::CdcOptions`.
715715
///
716-
/// Carried as a [`CdcOptions`] in [`ParquetOptions::content_defined_chunking`]
716+
/// Carried as a [`ParquetCdcOptions`] in [`ParquetOptions::content_defined_chunking`]
717717
/// with an explicit `enabled` flag, so it can be toggled with dotted config
718718
/// keys (`content_defined_chunking.enabled = true|false`) and the result is
719719
/// independent of the order in which the keys are set.
720-
pub struct CdcOptions {
720+
pub struct ParquetCdcOptions {
721721
/// (writing) EXPERIMENTAL: Enable content-defined chunking (CDC) when writing
722722
/// parquet files. When enabled, parallel writing is automatically disabled
723723
/// since the chunker state must persist across row groups.
@@ -737,20 +737,20 @@ config_namespace! {
737737
}
738738
}
739739

740-
impl CdcOptions {
740+
impl ParquetCdcOptions {
741741
/// Returns enabled CDC options with the default chunking parameters.
742742
///
743-
/// Shorthand for `CdcOptions { enabled: true, ..Default::default() }`; combine
744-
/// with struct-update syntax to override parameters, e.g.
745-
/// `CdcOptions { min_chunk_size: 4096, ..CdcOptions::enabled() }`.
743+
/// Shorthand for `ParquetCdcOptions { enabled: true, ..Default::default() }`;
744+
/// combine with struct-update syntax to override parameters, e.g.
745+
/// `ParquetCdcOptions { min_chunk_size: 4096, ..ParquetCdcOptions::enabled() }`.
746746
pub fn enabled() -> Self {
747747
Self {
748748
enabled: true,
749749
..Default::default()
750750
}
751751
}
752752

753-
/// Returns disabled CDC options (equivalent to [`CdcOptions::default`]).
753+
/// Returns disabled CDC options (equivalent to [`ParquetCdcOptions::default`]).
754754
pub fn disabled() -> Self {
755755
Self::default()
756756
}
@@ -959,7 +959,7 @@ config_namespace! {
959959
/// enabled, parallel writing is automatically disabled since the chunker state
960960
/// must persist across row groups. Mirrors
961961
/// `parquet::file::properties::WriterProperties::content_defined_chunking`.
962-
pub content_defined_chunking: CdcOptions, default = Default::default()
962+
pub content_defined_chunking: ParquetCdcOptions, default = Default::default()
963963
}
964964
}
965965

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::sync::Arc;
2121

2222
use crate::{
2323
_internal_datafusion_err, DataFusionError, Result,
24-
config::{CdcOptions, ParquetOptions, TableParquetOptions},
24+
config::{ParquetCdcOptions, ParquetOptions, TableParquetOptions},
2525
};
2626

2727
use arrow::datatypes::Schema;
@@ -166,13 +166,13 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
166166
}
167167
}
168168

169-
/// Convert DataFusion's [`CdcOptions`] into parquet-rs's `Option<CdcOptions>`.
169+
/// Convert DataFusion's [`ParquetCdcOptions`] into parquet-rs's `Option<CdcOptions>`.
170170
///
171171
/// parquet-rs has no `enabled` flag; CDC is on when the option is `Some`. So a
172-
/// disabled [`CdcOptions`] maps to `None`, and an enabled one to `Some` with the
173-
/// chunking parameters.
174-
impl From<&CdcOptions> for Option<parquet::file::properties::CdcOptions> {
175-
fn from(value: &CdcOptions) -> Self {
172+
/// disabled [`ParquetCdcOptions`] maps to `None`, and an enabled one to `Some`
173+
/// with the chunking parameters.
174+
impl From<&ParquetCdcOptions> for Option<parquet::file::properties::CdcOptions> {
175+
fn from(value: &ParquetCdcOptions) -> Self {
176176
value
177177
.enabled
178178
.then_some(parquet::file::properties::CdcOptions {
@@ -183,20 +183,21 @@ impl From<&CdcOptions> for Option<parquet::file::properties::CdcOptions> {
183183
}
184184
}
185185

186-
/// Convert parquet-rs's `Option<&CdcOptions>` back into DataFusion's [`CdcOptions`].
186+
/// Convert parquet-rs's `Option<&CdcOptions>` back into DataFusion's
187+
/// [`ParquetCdcOptions`].
187188
///
188189
/// The presence of parquet-rs options means CDC was enabled, so `Some` maps to
189190
/// `enabled: true`; `None` yields the disabled default.
190-
impl From<Option<&parquet::file::properties::CdcOptions>> for CdcOptions {
191+
impl From<Option<&parquet::file::properties::CdcOptions>> for ParquetCdcOptions {
191192
fn from(value: Option<&parquet::file::properties::CdcOptions>) -> Self {
192193
match value {
193-
Some(cdc) => CdcOptions {
194+
Some(cdc) => ParquetCdcOptions {
194195
enabled: true,
195196
min_chunk_size: cdc.min_chunk_size,
196197
max_chunk_size: cdc.max_chunk_size,
197198
norm_level: cdc.norm_level,
198199
},
199-
None => CdcOptions::default(),
200+
None => ParquetCdcOptions::default(),
200201
}
201202
}
202203
}
@@ -427,7 +428,7 @@ mod tests {
427428
#[cfg(feature = "parquet_encryption")]
428429
use crate::config::ConfigFileEncryptionProperties;
429430
use crate::config::{
430-
CdcOptions, ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions,
431+
ParquetCdcOptions, ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions,
431432
};
432433
use crate::parquet_config::DFParquetWriterVersion;
433434
use parquet::basic::Compression;
@@ -833,7 +834,7 @@ mod tests {
833834
#[test]
834835
fn test_cdc_enabled_with_custom_options() {
835836
let mut opts = TableParquetOptions::default();
836-
opts.global.content_defined_chunking = CdcOptions {
837+
opts.global.content_defined_chunking = ParquetCdcOptions {
837838
enabled: true,
838839
min_chunk_size: 128 * 1024,
839840
max_chunk_size: 512 * 1024,
@@ -861,7 +862,7 @@ mod tests {
861862
fn test_cdc_params_ignored_when_disabled() {
862863
// Parameters are customized but `enabled` is false, so CDC stays off.
863864
let mut opts = TableParquetOptions::default();
864-
opts.global.content_defined_chunking = CdcOptions {
865+
opts.global.content_defined_chunking = ParquetCdcOptions {
865866
enabled: false,
866867
min_chunk_size: 128 * 1024,
867868
max_chunk_size: 512 * 1024,
@@ -876,7 +877,7 @@ mod tests {
876877
#[test]
877878
fn test_cdc_round_trip_through_writer_props() {
878879
let mut opts = TableParquetOptions::default();
879-
opts.global.content_defined_chunking = CdcOptions {
880+
opts.global.content_defined_chunking = ParquetCdcOptions {
880881
enabled: true,
881882
min_chunk_size: 64 * 1024,
882883
max_chunk_size: 2 * 1024 * 1024,

datafusion/core/tests/parquet/content_defined_chunking.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow::array::{AsArray, Int32Array, StringArray};
2525
use arrow::datatypes::{DataType, Field, Int32Type, Int64Type, Schema};
2626
use arrow::record_batch::RecordBatch;
2727
use datafusion::prelude::{ParquetReadOptions, SessionContext};
28-
use datafusion_common::config::{CdcOptions, TableParquetOptions};
28+
use datafusion_common::config::{ParquetCdcOptions, TableParquetOptions};
2929
use parquet::arrow::ArrowWriter;
3030
use parquet::arrow::arrow_reader::ArrowReaderMetadata;
3131
use parquet::file::properties::WriterProperties;
@@ -97,7 +97,7 @@ async fn cdc_data_round_trip() {
9797
let batch = make_test_batch(5000);
9898

9999
let mut opts = TableParquetOptions::default();
100-
opts.global.content_defined_chunking = CdcOptions::enabled();
100+
opts.global.content_defined_chunking = ParquetCdcOptions::enabled();
101101
let props = writer_props(&mut opts, &batch.schema());
102102

103103
let tmp = write_parquet_file(&batch, props);
@@ -145,7 +145,7 @@ async fn cdc_affects_page_boundaries() {
145145

146146
// Write WITH CDC using small chunk sizes to maximize effect
147147
let mut cdc_opts = TableParquetOptions::default();
148-
cdc_opts.global.content_defined_chunking = CdcOptions {
148+
cdc_opts.global.content_defined_chunking = ParquetCdcOptions {
149149
enabled: true,
150150
min_chunk_size: 512,
151151
max_chunk_size: 2048,

datafusion/proto-common/src/from_proto/mod.rs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ use datafusion_common::{
3939
DataFusionError, JoinSide, ScalarValue, Statistics, TableReference,
4040
arrow_datafusion_err,
4141
config::{
42-
CdcOptions, CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions,
42+
CsvOptions, JsonOptions, ParquetCdcOptions, ParquetColumnOptions, ParquetOptions,
4343
TableParquetOptions,
4444
},
4545
file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions},
@@ -1130,14 +1130,14 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
11301130
max_predicate_cache_size: value.max_predicate_cache_size_opt.map(|opt| match opt {
11311131
protobuf::parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v) => Some(v as usize),
11321132
}).unwrap_or(None),
1133-
content_defined_chunking: value.content_defined_chunking.map(CdcOptions::from).unwrap_or_default(),
1133+
content_defined_chunking: value.content_defined_chunking.map(ParquetCdcOptions::from).unwrap_or_default(),
11341134
})
11351135
}
11361136
}
11371137

1138-
impl From<protobuf::parquet_options::CdcOptions> for CdcOptions {
1138+
impl From<protobuf::parquet_options::CdcOptions> for ParquetCdcOptions {
11391139
fn from(value: protobuf::parquet_options::CdcOptions) -> Self {
1140-
CdcOptions {
1140+
ParquetCdcOptions {
11411141
enabled: value.enabled,
11421142
min_chunk_size: value.min_chunk_size as usize,
11431143
max_chunk_size: value.max_chunk_size as usize,
@@ -1330,7 +1330,9 @@ pub(crate) fn csv_writer_options_from_proto(
13301330

13311331
#[cfg(test)]
13321332
mod tests {
1333-
use datafusion_common::config::{CdcOptions, ParquetOptions, TableParquetOptions};
1333+
use datafusion_common::config::{
1334+
ParquetCdcOptions, ParquetOptions, TableParquetOptions,
1335+
};
13341336

13351337
fn parquet_options_proto_round_trip(opts: ParquetOptions) -> ParquetOptions {
13361338
let proto: crate::protobuf_common::ParquetOptions =
@@ -1390,7 +1392,7 @@ mod tests {
13901392
#[test]
13911393
fn test_parquet_options_cdc_enabled_round_trip() {
13921394
let opts = ParquetOptions {
1393-
content_defined_chunking: CdcOptions {
1395+
content_defined_chunking: ParquetCdcOptions {
13941396
enabled: true,
13951397
min_chunk_size: 128 * 1024,
13961398
max_chunk_size: 512 * 1024,
@@ -1409,10 +1411,10 @@ mod tests {
14091411
#[test]
14101412
fn test_parquet_options_cdc_negative_norm_level_round_trip() {
14111413
let opts = ParquetOptions {
1412-
content_defined_chunking: CdcOptions {
1414+
content_defined_chunking: ParquetCdcOptions {
14131415
enabled: true,
14141416
norm_level: -3,
1415-
..CdcOptions::default()
1417+
..ParquetCdcOptions::default()
14161418
},
14171419
..ParquetOptions::default()
14181420
};
@@ -1423,7 +1425,7 @@ mod tests {
14231425
#[test]
14241426
fn test_table_parquet_options_cdc_round_trip() {
14251427
let mut opts = TableParquetOptions::default();
1426-
opts.global.content_defined_chunking = CdcOptions {
1428+
opts.global.content_defined_chunking = ParquetCdcOptions {
14271429
enabled: true,
14281430
min_chunk_size: 64 * 1024,
14291431
max_chunk_size: 2 * 1024 * 1024,

datafusion/proto-common/src/to_proto/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ use datafusion_common::{
3636
Column, ColumnStatistics, Constraint, Constraints, DFSchema, DFSchemaRef,
3737
DataFusionError, JoinSide, ScalarValue, Statistics,
3838
config::{
39-
CdcOptions, CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions,
39+
CsvOptions, JsonOptions, ParquetCdcOptions, ParquetColumnOptions, ParquetOptions,
4040
TableParquetOptions,
4141
},
4242
file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions},
@@ -943,8 +943,8 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
943943
}
944944
}
945945

946-
impl From<&CdcOptions> for protobuf::parquet_options::CdcOptions {
947-
fn from(value: &CdcOptions) -> Self {
946+
impl From<&ParquetCdcOptions> for protobuf::parquet_options::CdcOptions {
947+
fn from(value: &ParquetCdcOptions) -> Self {
948948
protobuf::parquet_options::CdcOptions {
949949
enabled: value.enabled,
950950
min_chunk_size: value.min_chunk_size as u64,

datafusion/proto/src/logical_plan/file_formats.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ mod parquet {
382382
parquet_options, parquet_options::CdcOptions as CdcOptionsProto,
383383
};
384384
use datafusion_common::config::{
385-
CdcOptions, ParquetColumnOptions, ParquetOptions, TableParquetOptions,
385+
ParquetCdcOptions, ParquetColumnOptions, ParquetOptions, TableParquetOptions,
386386
};
387387
use datafusion_datasource_parquet::file_format::ParquetFormatFactory;
388388

@@ -562,7 +562,7 @@ mod parquet {
562562
max_predicate_cache_size: proto.max_predicate_cache_size_opt.as_ref().map(|opt| match opt {
563563
parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(size) => *size as usize,
564564
}),
565-
content_defined_chunking: proto.content_defined_chunking.map(|cdc| CdcOptions {
565+
content_defined_chunking: proto.content_defined_chunking.map(|cdc| ParquetCdcOptions {
566566
enabled: cdc.enabled,
567567
min_chunk_size: cdc.min_chunk_size as usize,
568568
max_chunk_size: cdc.max_chunk_size as usize,

0 commit comments

Comments
 (0)