@@ -35,6 +35,7 @@ use datafusion_pruning::PruningPredicate;
3535
3636use log:: { debug, trace} ;
3737use parquet:: arrow:: arrow_reader:: statistics:: StatisticsConverter ;
38+ use parquet:: arrow:: parquet_column;
3839use parquet:: file:: metadata:: { ParquetColumnIndex , ParquetOffsetIndex } ;
3940use parquet:: file:: page_index:: offset_index:: PageLocation ;
4041use parquet:: schema:: types:: SchemaDescriptor ;
@@ -193,21 +194,24 @@ impl PagePruningAccessPlanFilter {
193194 let mut total_pages_skip = 0 ;
194195 // track the total number of pages that should not be skipped
195196 let mut total_pages_select = 0 ;
196- // track rows that were already proven fully matched at row group
197- // level and therefore did not need page-index predicate evaluation
198- let mut total_rows_fully_matched = 0 ;
197+ // track pages for which page-index pruning was skipped because the
198+ // containing row group was already proven fully matched by statistics
199+ let mut total_pages_skipped_by_fully_matched = 0 ;
199200
200201 // for each row group specified in the access plan
201202 let row_group_indexes = access_plan. row_group_indexes ( ) ;
202203 for row_group_index in row_group_indexes {
203204 // Skip page pruning for fully matched row groups: all rows are
204205 // known to satisfy the predicate, so page-level pruning is wasted work.
205206 if access_plan. is_fully_matched ( row_group_index) {
206- // Page metrics count evaluated page-index pruning work; this
207- // branch only records rows already proven fully matched.
208- let row_count = groups[ row_group_index] . num_rows ( ) as usize ;
209- total_select += row_count;
210- total_rows_fully_matched += row_count;
207+ let page_count = fully_matched_page_count (
208+ row_group_index,
209+ page_index_predicates,
210+ arrow_schema,
211+ parquet_schema,
212+ parquet_metadata,
213+ ) ;
214+ total_pages_skipped_by_fully_matched += page_count;
211215
212216 continue ;
213217 }
@@ -218,10 +222,13 @@ impl PagePruningAccessPlanFilter {
218222 let mut matched_pages_in_group: Option < HashSet < usize > > = None ;
219223
220224 for predicate in page_index_predicates {
221- let column = predicate
222- . required_columns ( )
223- . single_column ( )
224- . expect ( "Page pruning requires single column predicates" ) ;
225+ let Some ( column) = predicate. required_columns ( ) . single_column ( ) else {
226+ debug ! (
227+ "Ignoring multi-column page pruning predicate: {:?}" ,
228+ predicate. predicate_expr( )
229+ ) ;
230+ continue ;
231+ } ;
225232
226233 let converter = StatisticsConverter :: try_new (
227234 column. name ( ) ,
@@ -318,15 +325,15 @@ impl PagePruningAccessPlanFilter {
318325 file_metrics
319326 . page_index_rows_pruned
320327 . add_matched ( total_select) ;
321- file_metrics
322- . page_index_rows_pruned
323- . add_fully_matched ( total_rows_fully_matched) ;
324328 file_metrics
325329 . page_index_pages_pruned
326330 . add_pruned ( total_pages_skip) ;
327331 file_metrics
328332 . page_index_pages_pruned
329333 . add_matched ( total_pages_select) ;
334+ file_metrics. add_page_index_pages_skipped_by_fully_matched (
335+ total_pages_skipped_by_fully_matched,
336+ ) ;
330337 access_plan
331338 }
332339
@@ -346,6 +353,45 @@ fn update_selection(
346353 }
347354}
348355
356+ /// Returns the number of pages for which page-index pruning is skipped because
357+ /// the containing row group is fully matched by row-group statistics.
358+ fn fully_matched_page_count (
359+ row_group_index : usize ,
360+ page_index_predicates : & [ PruningPredicate ] ,
361+ arrow_schema : & Schema ,
362+ parquet_schema : & SchemaDescriptor ,
363+ parquet_metadata : & ParquetMetaData ,
364+ ) -> usize {
365+ let Some ( offset_index) = parquet_metadata. offset_index ( ) else {
366+ return 0 ;
367+ } ;
368+
369+ let Some ( row_group_offsets) = offset_index. get ( row_group_index) else {
370+ return 0 ;
371+ } ;
372+
373+ for predicate in page_index_predicates {
374+ let Some ( column) = predicate. required_columns ( ) . single_column ( ) else {
375+ continue ;
376+ } ;
377+
378+ let Some ( ( parquet_column_index, _) ) =
379+ parquet_column ( parquet_schema, arrow_schema, column. name ( ) )
380+ else {
381+ continue ;
382+ } ;
383+
384+ let Some ( offset_index_metadata) = row_group_offsets. get ( parquet_column_index)
385+ else {
386+ continue ;
387+ } ;
388+
389+ return offset_index_metadata. page_locations ( ) . len ( ) ;
390+ }
391+
392+ 0
393+ }
394+
349395/// Returns a [`RowSelection`] for the rows in this row group to scan, in addition to a vec of
350396/// booleans that state if each page was matched (true) or not (false).
351397///
0 commit comments