@@ -5103,3 +5103,73 @@ def test_partition_column_projection_with_schema_evolution(catalog: InMemoryCata
51035103 result_sorted = result .sort_by ("name" )
51045104 assert result_sorted ["name" ].to_pylist () == ["Alice" , "Bob" , "Charlie" , "David" ]
51055105 assert result_sorted ["new_column" ].to_pylist () == [None , None , "new1" , "new2" ]
5106+
5107+
def test_dictionary_columns_produces_dict_encoded_output(tmpdir: str) -> None:
    """Check that ``dictionary_columns`` on ArrowScan yields dictionary-encoded output.

    Writes a two-column Parquet file (``id`` int32, ``label`` string), scans it
    once without and once with ``dictionary_columns=("label",)``, and verifies:

    1. ``label`` has a dictionary type in the dict scan and is backed by
       ``pa.DictionaryArray`` chunks.
    2. Both scans return exactly the values that were written, for every column.
    3. ``id``, which is NOT listed in ``dictionary_columns``, stays plain int32.

    Args:
        tmpdir: Temporary directory in which the Parquet data file is written
            and which is used as the table location.
    """
    from pyiceberg.expressions import AlwaysTrue
    from pyiceberg.io.pyarrow import ArrowScan, PyArrowFileIO
    from pyiceberg.partitioning import PartitionSpec
    from pyiceberg.table import FileScanTask
    from pyiceberg.table.metadata import TableMetadataV2

    # Known input values; the scans are compared against these rather than only
    # against each other, so two identically-wrong scans cannot pass.
    expected_ids = [1, 2, 3, 4]
    expected_labels = ["a", "b", "a", "b"]

    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int32(), nullable=True, metadata={PYARROW_PARQUET_FIELD_ID_KEY: "1"}),
            pa.field("label", pa.string(), nullable=True, metadata={PYARROW_PARQUET_FIELD_ID_KEY: "2"}),
        ]
    )
    arrow_table = pa.table(
        [pa.array(expected_ids, type=pa.int32()), pa.array(expected_labels, type=pa.string())],
        schema=arrow_schema,
    )
    data_file = _write_table_to_data_file(f"{tmpdir}/test_dict_cols.parquet", arrow_schema, arrow_table)
    # Matches the single (unpartitioned) spec registered in the metadata below.
    data_file.spec_id = 0

    iceberg_schema = Schema(
        NestedField(1, "id", IntegerType(), required=False),
        NestedField(2, "label", StringType(), required=False),
    )
    table_metadata = TableMetadataV2(
        location=f"file://{tmpdir}",
        last_column_id=2,
        format_version=2,
        schemas=[iceberg_schema],
        partition_specs=[PartitionSpec()],
    )
    io = PyArrowFileIO()
    task = FileScanTask(data_file)

    # Two scans that differ only in the dictionary_columns argument.
    scan_plain = ArrowScan(
        table_metadata=table_metadata,
        io=io,
        projected_schema=iceberg_schema,
        row_filter=AlwaysTrue(),
    )
    scan_dict = ArrowScan(
        table_metadata=table_metadata,
        io=io,
        projected_schema=iceberg_schema,
        row_filter=AlwaysTrue(),
        dictionary_columns=("label",),
    )

    result_plain = scan_plain.to_table([task])
    result_dict = scan_dict.to_table([task])

    # id column is not in dictionary_columns — both scans should return int32
    assert result_plain.schema.field("id").type == pa.int32()
    assert result_dict.schema.field("id").type == pa.int32()

    # label column: plain scan → string, dict scan → some dictionary type
    # (the exact index/value types are not pinned by this test).
    assert result_plain.schema.field("label").type == pa.string()
    assert pa.types.is_dictionary(result_dict.schema.field("label").type)
    assert all(isinstance(chunk, pa.DictionaryArray) for chunk in result_dict.column("label").chunks)

    # Values must match what was written, in both scans and for both columns.
    assert result_plain.column("id").to_pylist() == expected_ids
    assert result_dict.column("id").to_pylist() == expected_ids
    assert result_plain.column("label").to_pylist() == expected_labels
    assert result_dict.column("label").to_pylist() == expected_labels
0 commit comments