2626from pypaimon .manifest .schema .data_file_meta import DataFileMeta
2727from pypaimon .read .interval_partition import IntervalPartition , SortedRun
2828from pypaimon .read .partition_info import PartitionInfo
29- from pypaimon .read .reader .concat_batch_reader import ConcatBatchReader , ShardBatchReader
29+ from pypaimon .read .reader .concat_batch_reader import ConcatBatchReader , ShardBatchReader , MergeAllBatchReader
3030from pypaimon .read .reader .concat_record_reader import ConcatRecordReader
3131from pypaimon .read .reader .data_file_batch_reader import DataFileBatchReader
3232from pypaimon .read .reader .data_evolution_merge_reader import DataEvolutionMergeReader
@@ -73,21 +73,21 @@ def __init__(self, table, predicate: Optional[Predicate], push_down_predicate,
7373 def create_reader (self ) -> RecordReader :
7474 """Create a record reader for the given split."""
7575
76- def file_reader_supplier (self , file_path : str , for_merge_read : bool ):
76+ def file_reader_supplier (self , file_path : str , for_merge_read : bool , read_fields : List [ str ] ):
7777 _ , extension = os .path .splitext (file_path )
7878 file_format = extension [1 :]
7979
8080 format_reader : RecordBatchReader
8181 if file_format == CoreOptions .FILE_FORMAT_AVRO :
82- format_reader = FormatAvroReader (self .table .file_io , file_path , self . _get_final_read_data_fields () ,
82+ format_reader = FormatAvroReader (self .table .file_io , file_path , read_fields ,
8383 self .read_fields , self .push_down_predicate )
8484 elif file_format == CoreOptions .FILE_FORMAT_BLOB :
8585 blob_as_descriptor = self .table .options .get (CoreOptions .FILE_BLOB_AS_DESCRIPTOR , False )
86- format_reader = FormatBlobReader (self .table .file_io , file_path , self . _get_final_read_data_fields () ,
86+ format_reader = FormatBlobReader (self .table .file_io , file_path , read_fields ,
8787 self .read_fields , self .push_down_predicate , blob_as_descriptor )
8888 elif file_format == CoreOptions .FILE_FORMAT_PARQUET or file_format == CoreOptions .FILE_FORMAT_ORC :
8989 format_reader = FormatPyArrowReader (self .table .file_io , file_format , file_path ,
90- self . _get_final_read_data_fields () , self .push_down_predicate )
90+ read_fields , self .push_down_predicate )
9191 else :
9292 raise ValueError (f"Unexpected file format: { file_format } " )
9393
@@ -253,7 +253,12 @@ class RawFileSplitRead(SplitRead):
253253 def create_reader (self ) -> RecordReader :
254254 data_readers = []
255255 for file_path in self .split .file_paths :
256- supplier = partial (self .file_reader_supplier , file_path = file_path , for_merge_read = False )
256+ supplier = partial (
257+ self .file_reader_supplier ,
258+ file_path = file_path ,
259+ for_merge_read = False ,
260+ read_fields = self ._get_final_read_data_fields (),
261+ )
257262 data_readers .append (supplier )
258263
259264 if not data_readers :
@@ -274,7 +279,12 @@ def _get_all_data_fields(self):
274279
275280class MergeFileSplitRead (SplitRead ):
def kv_reader_supplier(self, file_path):
    """Build a key-value reader over a single data file.

    The file is opened through ``file_reader_supplier`` with
    ``for_merge_read=True`` and the fields returned by
    ``_get_final_read_data_fields``; the resulting reader is then wrapped
    in a ``KeyValueWrapReader``.

    Args:
        file_path: Path of the data file to read.

    Returns:
        A ``KeyValueWrapReader`` built from the file reader, the number of
        entries in ``self.trimmed_primary_key`` and ``self.value_arity``.
    """
    final_fields = self._get_final_read_data_fields()
    file_reader = self.file_reader_supplier(
        file_path=file_path,
        for_merge_read=True,
        read_fields=final_fields,
    )
    key_arity = len(self.trimmed_primary_key)
    return KeyValueWrapReader(file_reader, key_arity, self.value_arity)
279289
280290 def section_reader_supplier (self , section : List [SortedRun ]):
@@ -317,7 +327,7 @@ def create_reader(self) -> RecordReader:
317327 if len (need_merge_files ) == 1 or not self .read_fields :
318328 # No need to merge fields, just create a single file reader
319329 suppliers .append (
320- lambda f = need_merge_files [0 ]: self ._create_file_reader (f )
330+ lambda f = need_merge_files [0 ]: self ._create_file_reader (f , self . _get_final_read_data_fields () )
321331 )
322332 else :
323333 suppliers .append (
@@ -424,26 +434,30 @@ def _create_union_reader(self, need_merge_files: List[DataFileMeta]) -> RecordRe
424434 self .read_fields = read_fields # create reader based on read_fields
425435 # Create reader for this bunch
426436 if len (bunch .files ()) == 1 :
427- file_record_readers [i ] = self ._create_file_reader (bunch .files ()[0 ])
437+ file_record_readers [i ] = self ._create_file_reader (
438+ bunch .files ()[0 ], [field .name for field in read_fields ]
439+ )
428440 else :
429441 # Create concatenated reader for multiple files
430442 suppliers = [
431- lambda f = file : self ._create_file_reader (f ) for file in bunch .files ()
443+ lambda f = file : self ._create_file_reader (
444+ f , [field .name for field in read_fields ]
445+ ) for file in bunch .files ()
432446 ]
433- file_record_readers [i ] = ConcatRecordReader (suppliers )
447+ file_record_readers [i ] = MergeAllBatchReader (suppliers )
434448 self .read_fields = table_fields
435449
436450 # Validate that all required fields are found
437451 for i , field in enumerate (all_read_fields ):
438452 if row_offsets [i ] == - 1 :
439- if not field .type .is_nullable () :
453+ if not field .type .nullable :
440454 raise ValueError (f"Field { field } is not null but can't find any file contains it." )
441455
442456 return DataEvolutionMergeReader (row_offsets , field_offsets , file_record_readers )
443457
def _create_file_reader(self, file: DataFileMeta, read_fields: List[str]) -> RecordReader:
    """Create a file reader for a single file.

    Args:
        file: Metadata of the data file; only ``file.file_path`` is used here.
        read_fields: Names of the fields to read, forwarded unchanged to
            ``file_reader_supplier``.

    Returns:
        The reader produced by ``file_reader_supplier`` for this file.
    """
    # for_merge_read is always passed as False from this helper.
    return self.file_reader_supplier(
        file_path=file.file_path,
        for_merge_read=False,
        read_fields=read_fields,
    )
447461
448462 def _split_field_bunches (self , need_merge_files : List [DataFileMeta ]) -> List [FieldBunch ]:
449463 """Split files into field bunches."""
0 commit comments