@@ -69,6 +69,8 @@ pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
6969pub const DEFAULT_COERCE_TYPES : bool = false ;
7070/// Default value for [`WriterProperties::data_page_v2_compression_ratio_threshold`]
7171pub const DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD : f64 = 1.0 ;
72+ /// Default value for [`WriterProperties::write_path_in_schema`]: `path_in_schema` is written.
73+ pub const DEFAULT_WRITE_PATH_IN_SCHEMA : bool = true ;
7274/// Default minimum chunk size for content-defined chunking: 256 KiB.
7375pub const DEFAULT_CDC_MIN_CHUNK_SIZE : usize = 256 * 1024 ;
7476/// Default maximum chunk size for content-defined chunking: 1024 KiB.
@@ -252,6 +254,7 @@ pub struct WriterProperties {
252254 statistics_truncate_length : Option < usize > ,
253255 coerce_types : bool ,
254256 content_defined_chunking : Option < CdcOptions > ,
257+ write_path_in_schema : bool ,
255258 #[ cfg( feature = "encryption" ) ]
256259 pub ( crate ) file_encryption_properties : Option < Arc < FileEncryptionProperties > > ,
257260}
@@ -437,6 +440,14 @@ impl WriterProperties {
437440 self . coerce_types
438441 }
439442
443+ /// Returns `true` if the writer emits the `path_in_schema` field of the `ColumnMetaData`
444+ /// Thrift struct. The builder default is [`DEFAULT_WRITE_PATH_IN_SCHEMA`].
445+ ///
446+ /// For more details see [`WriterPropertiesBuilder::set_write_path_in_schema`]
447+ pub fn write_path_in_schema ( & self ) -> bool {
448+ self . write_path_in_schema
449+ }
450+
440451 /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled.
441452 ///
442453 /// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`]
@@ -592,6 +603,7 @@ pub struct WriterPropertiesBuilder {
592603 statistics_truncate_length : Option < usize > ,
593604 coerce_types : bool ,
594605 content_defined_chunking : Option < CdcOptions > ,
606+ write_path_in_schema : bool ,
595607 #[ cfg( feature = "encryption" ) ]
596608 file_encryption_properties : Option < Arc < FileEncryptionProperties > > ,
597609}
@@ -616,6 +628,7 @@ impl Default for WriterPropertiesBuilder {
616628 statistics_truncate_length : DEFAULT_STATISTICS_TRUNCATE_LENGTH ,
617629 coerce_types : DEFAULT_COERCE_TYPES ,
618630 content_defined_chunking : None ,
631+ write_path_in_schema : DEFAULT_WRITE_PATH_IN_SCHEMA ,
619632 #[ cfg( feature = "encryption" ) ]
620633 file_encryption_properties : None ,
621634 }
@@ -670,6 +683,7 @@ impl WriterPropertiesBuilder {
670683 statistics_truncate_length : self . statistics_truncate_length ,
671684 coerce_types : self . coerce_types ,
672685 content_defined_chunking : self . content_defined_chunking ,
686+ write_path_in_schema : self . write_path_in_schema ,
673687 #[ cfg( feature = "encryption" ) ]
674688 file_encryption_properties : self . file_encryption_properties ,
675689 }
@@ -885,6 +899,43 @@ impl WriterPropertiesBuilder {
885899 self
886900 }
887901
902+ /// EXPERIMENTAL: Sets whether the writer emits the `path_in_schema` element of the
903+ /// `ColumnMetaData` Thrift struct. Defaults to `true` via [`DEFAULT_WRITE_PATH_IN_SCHEMA`].
904+ ///
905+ /// Because `path_in_schema` is a field on the `ColumnMetaData`, it is repeated
906+ /// `num_columns * num_rowgroups` times. Compounding this is any level of nesting or
907+ /// repetition in the schema. For instance, a top-level list column named `foo` will have
908+ /// a `path_in_schema` of `["foo", "list", "element"]`. A list-of-struct is even worse,
909+ /// because the necessary list wrapping is repeated for each element of the struct. A
910+ /// file with a deeply nested schema and many row groups can have a large percentage of the
911+ /// footer taken up by this field. For example, a file of 38 row groups with a schema containing
912+ /// several lists of structs containing lists had 36% of the footer taken up by `path_in_schema`.
913+ /// Removing this redundant information can greatly speed up footer parsing, which is particularly
914+ /// important in scenarios where one does not wish to read the entire file (e.g. point
915+ /// lookups).
916+ ///
917+ /// <div class="warning">
918+ ///
919+ /// **WARNING:**
920+ /// Setting this to `false` will break compatibility with Parquet readers that
921+ /// still expect this field to be present. Virtually all Parquet readers (parquet-java,
922+ /// Spark, arrow-cpp, pyarrow, pandas to name a few), with the exception
923+ /// of the one in this crate, expect this field to be present, and will terminate execution
924+ /// if it is not. This will continue to be the case unless/until the Parquet format
925+ /// specification is explicitly changed to allow this field to be missing. As a consequence,
926+ /// users should only set this to `false` if they have verified that any reader(s) they plan
927+ /// to use can tolerate the absence of this field.
928+ ///
929+ /// For more context, see [GH-563].
930+ ///
931+ /// </div>
932+ ///
933+ /// [GH-563]: https://github.com/apache/parquet-format/issues/563
934+ pub fn set_write_path_in_schema ( mut self , write_path_in_schema : bool ) -> Self {
935+ self . write_path_in_schema = write_path_in_schema;
936+ self
937+ }
938+
888939 /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`.
889940 ///
890941 /// When enabled, data page boundaries are determined by a rolling hash of the
@@ -1253,6 +1304,7 @@ impl From<WriterProperties> for WriterPropertiesBuilder {
12531304 statistics_truncate_length : props. statistics_truncate_length ,
12541305 coerce_types : props. coerce_types ,
12551306 content_defined_chunking : props. content_defined_chunking ,
1307+ write_path_in_schema : props. write_path_in_schema ,
12561308 #[ cfg( feature = "encryption" ) ]
12571309 file_encryption_properties : props. file_encryption_properties ,
12581310 }
0 commit comments