@@ -2163,24 +2163,40 @@ def _min_sequence_number(manifests: list[ManifestFile]) -> int:
21632163 return INITIAL_SEQUENCE_NUMBER
21642164
21652165
def _to_arrow_via_file_scan_tasks(
    scan: BaseScan, tasks: Iterable[FileScanTask], dictionary_columns: tuple[str, ...] = ()
) -> pa.Table:
    """Build an Arrow table from a scan and its already-planned ``FileScanTask``s.

    Args:
        scan: Scan supplying the table metadata, IO, projection, row filter,
            case sensitivity and limit handed to ``ArrowScan``.
        tasks: Planned file scan tasks to read.
        dictionary_columns: Column names passed through unchanged as the
            ``dictionary_columns`` keyword of ``ArrowScan``. Defaults to an
            empty tuple.

    Returns:
        pa.Table: The result of ``ArrowScan.to_table`` for ``tasks``.
    """
    # Imported lazily inside the function, as in the original code.
    from pyiceberg.io.pyarrow import ArrowScan

    arrow_scan = ArrowScan(
        scan.table_metadata,
        scan.io,
        scan.projection(),
        scan.row_filter,
        scan.case_sensitive,
        scan.limit,
        dictionary_columns=dictionary_columns,
    )
    return arrow_scan.to_table(tasks)
21732181
21742182
def _to_arrow_batch_reader_via_file_scan_tasks(
    scan: BaseScan, tasks: Iterable[FileScanTask], dictionary_columns: tuple[str, ...] = ()
) -> pa.RecordBatchReader:
    """Expose a scan as an Arrow ``RecordBatchReader`` over its planned ``FileScanTask``s.

    Args:
        scan: Scan supplying the table metadata, IO, projection, row filter,
            case sensitivity and limit handed to ``ArrowScan``.
        tasks: Planned file scan tasks to read.
        dictionary_columns: Column names passed through unchanged as the
            ``dictionary_columns`` keyword of ``ArrowScan``. Defaults to an
            empty tuple.

    Returns:
        pa.RecordBatchReader: Reader over the batches produced by
        ``ArrowScan.to_record_batches``, cast to the Arrow schema derived
        from the scan's projection.
    """
    # Imported lazily inside the function, as in the original code.
    import pyarrow as pa

    from pyiceberg.io.pyarrow import ArrowScan, schema_to_pyarrow

    target_schema = schema_to_pyarrow(scan.projection())

    arrow_scan = ArrowScan(
        scan.table_metadata,
        scan.io,
        scan.projection(),
        scan.row_filter,
        scan.case_sensitive,
        scan.limit,
        dictionary_columns=dictionary_columns,
    )
    record_batches = arrow_scan.to_record_batches(tasks)

    # NOTE(review): target_schema is built from the projection alone and does
    # not take dictionary_columns into account. Confirm that batches produced
    # with dictionary_columns set are accepted by from_batches() with this
    # schema, and whether the cast() below converts dictionary-encoded columns
    # back to their plain types.
    reader = pa.RecordBatchReader.from_batches(target_schema, record_batches)
    return reader.cast(target_schema)
@@ -2259,28 +2275,44 @@ def plan_files(self) -> Iterable[FileScanTask]:
22592275 return self ._plan_files_server_side ()
22602276 return self ._plan_files_local ()
22612277
def to_arrow(self, dictionary_columns: tuple[str, ...] = ()) -> pa.Table:
    """Eagerly materialize this DataScan as an Arrow table.

    Every row is loaded into memory at once.

    Args:
        dictionary_columns:
            Names of the columns that PyArrow should read as
            dictionary-encoded (``pa.DictionaryArray``). For columns
            holding low-cardinality, repeated string values this can
            substantially reduce memory usage.
            Only applies to Parquet files; silently ignored for ORC.

    Returns:
        pa.Table: Materialized Arrow Table from the Iceberg table's DataScan
    """
    planned_tasks = self.plan_files()
    return _to_arrow_via_file_scan_tasks(self, planned_tasks, dictionary_columns=dictionary_columns)
22712295
def to_arrow_batch_reader(self, dictionary_columns: tuple[str, ...] = ()) -> pa.RecordBatchReader:
    """Stream this DataScan as an Arrow RecordBatchReader.

    For large results a RecordBatchReader needs less memory than loading
    an Arrow Table for the same DataScan, because a RecordBatch is read
    one at a time.

    Args:
        dictionary_columns:
            Names of the columns that PyArrow should read as
            dictionary-encoded (``pa.DictionaryArray``). For columns
            holding low-cardinality, repeated string values this can
            substantially reduce memory usage.
            Only applies to Parquet files; silently ignored for ORC.

    Returns:
        pa.RecordBatchReader: Arrow RecordBatchReader from the Iceberg table's DataScan
            which can be used to read a stream of record batches one by one.
    """
    planned_tasks = self.plan_files()
    return _to_arrow_batch_reader_via_file_scan_tasks(self, planned_tasks, dictionary_columns=dictionary_columns)
22842316
22852317 def count (self ) -> int :
22862318 from pyiceberg .io .pyarrow import ArrowScan
@@ -2637,13 +2669,11 @@ def _build_residual_evaluator(self, spec_id: int) -> Callable[[DataFile], Residu
26372669 # The lambda created here is run in multiple threads.
26382670 # So we avoid creating _EvaluatorExpression methods bound to a single
26392671 # shared instance across multiple threads.
2640- return lambda datafile : (
2641- residual_evaluator_of (
2642- spec = spec ,
2643- expr = self .row_filter ,
2644- case_sensitive = self .case_sensitive ,
2645- schema = self .table_metadata .schema (),
2646- )
2672+ return lambda datafile : residual_evaluator_of (
2673+ spec = spec ,
2674+ expr = self .row_filter ,
2675+ case_sensitive = self .case_sensitive ,
2676+ schema = self .table_metadata .schema (),
26472677 )
26482678
26492679 @staticmethod
0 commit comments