@@ -90,7 +90,9 @@ class FileScanner : public Scanner {
9090 : Scanner(state, profile),
9191 _params (params),
9292 _col_name_to_slot_id(colname_to_slot_id),
93- _real_tuple_desc(tuple_desc) {};
93+ _real_tuple_desc(tuple_desc) {
94+ _configure_file_scan_handlers ();
95+ };
9496
9597 Status read_lines_from_range (const TFileRangeDesc& range, const std::list<int64_t >& row_ids,
9698 Block* result_block, const ExternalFileMappingInfo& external_info,
@@ -107,6 +109,9 @@ class FileScanner : public Scanner {
107109
108110 Status _get_next_reader ();
109111
112+ // Fill the shared (base) fields of the given ReaderInitContext from FileScanner members.
113+ void _fill_base_init_context (ReaderInitContext* ctx);
114+
110115 // TODO: cast input block column types to string. Currently a no-op that always returns Status::OK().
111116 Status _cast_src_block (Block* block) { return Status::OK (); }
112117
@@ -128,10 +133,10 @@ class FileScanner : public Scanner {
128133 std::vector<SlotDescriptor*> _file_slot_descs;
129134 // col names from _file_slot_descs
130135 std::vector<std::string> _file_col_names;
136+ // Unified column descriptors for init_reader (includes file, partition, missing, synthesized cols)
137+ std::vector<ColumnDescriptor> _column_descs;
131138
132- // Partition source slot descriptors
133- std::vector<SlotDescriptor*> _partition_slot_descs;
134- // Partition slot id to index in _partition_slot_descs
139+ // Partition slot id to partition key index (for matching columns_from_path)
135140 std::unordered_map<SlotId, int > _partition_slot_index_map;
136141 // created from param.expr_of_dest_slot
137142 // For query, it saves default value expr of all dest columns, or nullptr for NULL.
@@ -152,8 +157,6 @@ class FileScanner : public Scanner {
152157 // Obtained from GenericReader: maps each column that exists in the file to its type.
153158 std::unordered_map<std::string, DataTypePtr> _slot_lower_name_to_col_type;
154159 // Obtained from GenericReader: columns that are required by the scan but do not exist in the file.
155- // These columns will be filled by default value or null.
156- std::unordered_set<std::string> _missing_cols;
157160
158161 // Maps the lowercase column name in the source file to that column's type in the source file.
159162 std::map<std::string, DataTypePtr> _source_file_col_name_types;
@@ -192,7 +195,6 @@ class FileScanner : public Scanner {
192195 std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>
193196 _partition_col_descs;
194197 std::unordered_map<std::string, bool > _partition_value_is_null;
195- std::unordered_map<std::string, VExprContextSPtr> _missing_col_descs;
196198
197199 // idx of skip_bitmap_col in _input_tuple_desc
198200 int32_t _skip_bitmap_col_idx {-1 };
@@ -230,34 +232,41 @@ class FileScanner : public Scanner {
230232 // otherwise, point to _output_tuple_desc
231233 const TupleDescriptor* _real_tuple_desc = nullptr ;
232234
233- std::pair<std::shared_ptr<RowIdColumnIteratorV2>, int > _row_id_column_iterator_pair = {nullptr ,
234- -1 };
235- bool _need_iceberg_rowid_column = false ;
236- int _iceberg_rowid_column_pos = -1 ;
237- // for iceberg row lineage
238- RowLineageColumns _row_lineage_columns;
239235 int64_t _last_bytes_read_from_local = 0 ;
240236 int64_t _last_bytes_read_from_remote = 0 ;
241237
238+ Status (FileScanner::*_init_src_block_handler)(Block* block) = nullptr ;
239+ Status (FileScanner::*_process_src_block_after_read_handler)(Block* block) = nullptr ;
240+ bool (FileScanner::*_should_push_down_predicates_handler)(
241+ TFileFormatType::type format_type) const = nullptr ;
242+ bool (FileScanner::*_should_enable_condition_cache_handler)() const = nullptr ;
243+
242244 // Condition cache for external tables
243245 uint64_t _condition_cache_digest = 0 ;
244246 segment_v2::ConditionCache::ExternalCacheKey _condition_cache_key;
245247 std::shared_ptr<std::vector<bool >> _condition_cache;
246248 std::shared_ptr<ConditionCacheContext> _condition_cache_ctx;
247249 int64_t _condition_cache_hit_count = 0 ;
248250
251+ void _configure_file_scan_handlers ();
252+
249253 Status _init_expr_ctxes ();
250254 Status _init_src_block (Block* block);
251- Status _check_output_block_types ();
252- Status _cast_to_input_block (Block* block);
255+ Status _init_src_block_for_load (Block* block);
256+ Status _init_src_block_for_query (Block* block);
257+ Status _process_src_block_after_read (Block* block);
258+ Status _process_src_block_after_read_for_load (Block* block);
259+ Status _process_src_block_after_read_for_query (Block* block);
253260 Status _fill_columns_from_path (size_t rows);
254261 Status _fill_missing_columns (size_t rows);
262+ Status _check_output_block_types ();
263+ Status _cast_to_input_block (Block* block);
255264 Status _pre_filter_src_block ();
256265 Status _convert_to_output_block (Block* block);
257266 Status _truncate_char_or_varchar_columns (Block* block);
258267 void _truncate_char_or_varchar_column (Block* block, int idx, int len);
259268 Status _generate_partition_columns ();
260- Status _generate_missing_columns ();
269+
261270 bool _check_partition_prune_expr (const VExprSPtr& expr);
262271 void _init_runtime_filter_partition_prune_ctxs ();
263272 void _init_runtime_filter_partition_prune_block ();
@@ -267,11 +276,11 @@ class FileScanner : public Scanner {
267276 void _get_slot_ids (VExpr* expr, std::vector<int >* slot_ids);
268277 Status _generate_truncate_columns (bool need_to_get_parsed_schema);
269278 Status _set_fill_or_truncate_columns (bool need_to_get_parsed_schema);
270- Status _init_orc_reader (std::unique_ptr<OrcReader>&& orc_reader ,
271- FileMetaCache* file_meta_cache_ptr );
272- Status _init_parquet_reader (std::unique_ptr<ParquetReader>&& parquet_reader ,
273- FileMetaCache* file_meta_cache_ptr );
274- Status _create_row_id_column_iterator ();
279+ Status _init_orc_reader (FileMetaCache* file_meta_cache_ptr ,
280+ std::unique_ptr<OrcReader> orc_reader = nullptr );
281+ Status _init_parquet_reader (FileMetaCache* file_meta_cache_ptr ,
282+ std::unique_ptr<ParquetReader> parquet_reader = nullptr );
283+ std::shared_ptr<segment_v2::RowIdColumnIteratorV2> _create_row_id_column_iterator ();
275284
276285 TFileFormatType::type _get_current_format_type () {
277286 // for compatibility, if format_type is not set in range, use the format type of params
@@ -291,6 +300,11 @@ class FileScanner : public Scanner {
291300 }
292301
293302 bool _should_enable_condition_cache ();
303+ bool _should_enable_condition_cache_for_load () const ;
304+ bool _should_enable_condition_cache_for_query () const ;
305+ bool _should_push_down_predicates (TFileFormatType::type format_type) const ;
306+ bool _should_push_down_predicates_for_load (TFileFormatType::type format_type) const ;
307+ bool _should_push_down_predicates_for_query (TFileFormatType::type format_type) const ;
294308 void _init_reader_condition_cache ();
295309 void _finalize_reader_condition_cache ();
296310
0 commit comments