@@ -54,17 +54,27 @@ use crate::schema::types::{
5454 Type as SchemaType ,
5555} ;
5656
57- /// [`Index`] for each row group of each column.
57+ /// Page level statistics for each column chunk of each row group.
58+ ///
59+ /// This structure is an in-memory representation of multiple [`ColumnIndex`]
60+ /// structures in a parquet file footer, as described in the Parquet [PageIndex
61+ /// documentation]. Each [`Index`] holds statistics about all the pages in a
62+ /// particular column chunk.
5863///
5964/// `column_index[row_group_number][column_number]` holds the
6065/// [`Index`] corresponding to column `column_number` of row group
6166/// `row_group_number`.
6267///
6368/// For example `column_index[2][3]` holds the [`Index`] for the fourth
6469/// column in the third row group of the parquet file.
70+ ///
71+ /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
6572pub type ParquetColumnIndex = Vec < Vec < Index > > ;
6673
67- /// [`PageLocation`] for each data page of each row group of each column.
74+ /// [`PageLocation`] for each data page of each row group of each column.
75+ ///
76+ /// This structure is the parsed representation of the [`OffsetIndex`] from the
77+ /// Parquet file footer, as described in the Parquet [PageIndex documentation].
6878///
6979/// `offset_index[row_group_number][column_number][page_number]` holds
7080/// the [`PageLocation`] corresponding to page `page_number` of column
@@ -73,6 +83,8 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
7383/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
7484/// the fifth page of the fourth column in the third row group of the
7585/// parquet file.
86+ ///
87+ /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
7688pub type ParquetOffsetIndex = Vec < Vec < Vec < PageLocation > > > ;
7789
7890/// Parsed metadata for a single Parquet file
@@ -946,14 +958,22 @@ impl ColumnChunkMetaDataBuilder {
946958 }
947959}
948960
949- /// Builder for column index
961+ /// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
962+ ///
963+ /// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
950964pub struct ColumnIndexBuilder {
951965 null_pages : Vec < bool > ,
952966 min_values : Vec < Vec < u8 > > ,
953967 max_values : Vec < Vec < u8 > > ,
954968 null_counts : Vec < i64 > ,
955969 boundary_order : BoundaryOrder ,
956- // If one page can't get build index, need to ignore all index in this column
970+ /// Is the information in the builder valid?
971+ ///
972+ /// Set to `false` if any page doesn't have statistics for some
973+ /// reason, in which case the index for the whole column is ignored
974+ /// and not written to the file. This might happen if a page is entirely
975+ /// null, or is in a floating point column without any non-NaN values
976+ /// e.g. <https://github.com/apache/parquet-format/pull/196>
957977 valid : bool ,
958978}
959979
@@ -975,6 +995,7 @@ impl ColumnIndexBuilder {
975995 }
976996 }
977997
998+ /// Append statistics for the next page
978999 pub fn append (
9791000 & mut self ,
9801001 null_page : bool ,
@@ -992,15 +1013,19 @@ impl ColumnIndexBuilder {
9921013 self . boundary_order = boundary_order;
9931014 }
9941015
1016+ /// Mark this column index as invalid
9951017 pub fn to_invalid ( & mut self ) {
9961018 self . valid = false ;
9971019 }
9981020
1021+ /// Is the information in the builder valid?
9991022 pub fn valid ( & self ) -> bool {
10001023 self . valid
10011024 }
10021025
10031026 /// Build and get the thrift metadata of column index
1027+ ///
1028+ /// Note: callers should check [`Self::valid`] before calling this method
10041029 pub fn build_to_thrift ( self ) -> ColumnIndex {
10051030 ColumnIndex :: new (
10061031 self . null_pages ,
@@ -1012,7 +1037,9 @@ impl ColumnIndexBuilder {
10121037 }
10131038}
10141039
1015- /// Builder for offset index
1040+ /// Builder for offset index, part of the Parquet [PageIndex].
1041+ ///
1042+ /// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
10161043pub struct OffsetIndexBuilder {
10171044 offset_array : Vec < i64 > ,
10181045 compressed_page_size_array : Vec < i32 > ,
0 commit comments