|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +//! Decoder-projection construction for the parquet scan. |
| 19 | +//! |
| 20 | +//! Owns the projection-mask + projector + schema-replacement triple the |
| 21 | +//! opener installs on every parquet decoder run, behind a single |
| 22 | +//! [`DecoderProjection::build`] entry point. Keeping it here lets the |
| 23 | +//! opener orchestrate scans with one call instead of an inline block of |
| 24 | +//! `build_projection_read_plan` / `reassign_expr_columns` / `make_projector`, |
| 25 | +//! and gives a clean seam for the in-scan post-scan filter that follows in |
| 26 | +//! a later change (`PostScanFilter`, when conjuncts the parquet `RowFilter` |
| 27 | +//! cannot evaluate fall through to a decoded-batch predicate). |
| 28 | +
|
| 29 | +use arrow::datatypes::SchemaRef; |
| 30 | + |
| 31 | +use datafusion_common::Result; |
| 32 | +use datafusion_physical_expr::projection::{ProjectionExprs, Projector}; |
| 33 | +use datafusion_physical_expr::utils::reassign_expr_columns; |
| 34 | + |
| 35 | +use parquet::arrow::ProjectionMask; |
| 36 | +use parquet::schema::types::SchemaDescriptor; |
| 37 | + |
| 38 | +use crate::row_filter::build_projection_read_plan; |
| 39 | + |
| 40 | +/// The parquet decoder projection: the [`ProjectionMask`] installed on every |
| 41 | +/// decoder run in the scan, the [`Projector`] that maps decoder output |
| 42 | +/// batches to the user-visible output, and the `replace_schema` flag that |
| 43 | +/// tells [`PushDecoderStreamState`](crate::push_decoder::PushDecoderStreamState) |
| 44 | +/// whether the projector's output schema must be rebuilt with the requested |
| 45 | +/// `output_schema` (e.g. for metadata / nullability mismatches). |
| 46 | +/// |
| 47 | +/// Built once per file by the opener via [`Self::build`]. |
| 48 | +pub(crate) struct DecoderProjection { |
| 49 | + /// Projection mask passed to the parquet decoder. |
| 50 | + pub(crate) projection_mask: ProjectionMask, |
| 51 | + /// Maps decoder output (stream) batches to the user-visible output. |
| 52 | + pub(crate) projector: Projector, |
| 53 | + /// `true` when the projector's output schema differs from `output_schema` |
| 54 | + /// in metadata / nullability and the caller must rebuild the batch with |
| 55 | + /// `output_schema` before yielding it. |
| 56 | + pub(crate) replace_schema: bool, |
| 57 | +} |
| 58 | + |
| 59 | +impl DecoderProjection { |
| 60 | + /// Build the decoder projection state for a file. |
| 61 | + /// |
| 62 | + /// `projection` references columns in `physical_file_schema` (i.e. already |
| 63 | + /// adapted by the per-file expr adapter); `parquet_schema` is the |
| 64 | + /// corresponding parquet `SchemaDescriptor`. `output_schema` is what |
| 65 | + /// consumers of the scan stream expect. |
| 66 | + pub(crate) fn build( |
| 67 | + projection: &ProjectionExprs, |
| 68 | + physical_file_schema: &SchemaRef, |
| 69 | + parquet_schema: &SchemaDescriptor, |
| 70 | + output_schema: &SchemaRef, |
| 71 | + ) -> Result<Self> { |
| 72 | + let read_plan = build_projection_read_plan( |
| 73 | + projection.expr_iter(), |
| 74 | + physical_file_schema, |
| 75 | + parquet_schema, |
| 76 | + ); |
| 77 | + |
| 78 | + let stream_schema = read_plan.projected_schema; |
| 79 | + |
| 80 | + // Rebase the projection onto the decoder's stream schema (column |
| 81 | + // indices change because the decoder yields only the masked columns). |
| 82 | + let rebased_projection = projection |
| 83 | + .clone() |
| 84 | + .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; |
| 85 | + let projector = rebased_projection.make_projector(&stream_schema)?; |
| 86 | + |
| 87 | + // Compare against the projector's *output* schema rather than the |
| 88 | + // stream schema, so future widening of the mask (for post-scan filter |
| 89 | + // columns) does not flip this flag. |
| 90 | + let replace_schema = projector.output_schema() != output_schema; |
| 91 | + |
| 92 | + Ok(Self { |
| 93 | + projection_mask: read_plan.projection_mask, |
| 94 | + projector, |
| 95 | + replace_schema, |
| 96 | + }) |
| 97 | + } |
| 98 | +} |
0 commit comments