@@ -4884,3 +4884,212 @@ def test_partition_column_projection_with_schema_evolution(catalog: InMemoryCata
48844884 result_sorted = result .sort_by ("name" )
48854885 assert result_sorted ["name" ].to_pylist () == ["Alice" , "Bob" , "Charlie" , "David" ]
48864886 assert result_sorted ["new_column" ].to_pylist () == [None , None , "new1" , "new2" ]
4887+
4888+
def test_task_to_record_batches_with_batch_size(tmpdir: str) -> None:
    """_task_to_record_batches must honor the requested batch_size cap while returning every row."""
    schema = Schema(NestedField(1, "id", IntegerType(), required=False))
    pyarrow_schema = schema_to_pyarrow(schema, metadata={ICEBERG_SCHEMA: bytes(schema.model_dump_json(), UTF8)})

    # Write a 1000-row parquet file to read back in capped batches.
    arrow_table = pa.Table.from_arrays([pa.array(list(range(1000)))], schema=pyarrow_schema)
    data_file = _write_table_to_data_file(f"{tmpdir}/batch_size_test.parquet", pyarrow_schema, arrow_table)
    data_file.spec_id = 0

    record_batches = list(
        _task_to_record_batches(
            PyArrowFileIO(),
            FileScanTask(data_file=data_file),
            bound_row_filter=AlwaysTrue(),
            projected_schema=schema,
            table_schema=schema,
            projected_field_ids={1},
            positional_deletes=None,
            case_sensitive=True,
            batch_size=100,
        )
    )

    # All rows must come back, and no single batch may exceed the cap.
    assert sum(len(rb) for rb in record_batches) == 1000
    assert all(len(rb) <= 100 for rb in record_batches)
4918+
4919+
def test_to_record_batches_streaming_basic(tmpdir: str) -> None:
    """to_record_batches_streaming should hand back a lazy generator that yields all rows."""
    import types

    schema = Schema(NestedField(1, "id", IntegerType(), required=False))
    pyarrow_schema = schema_to_pyarrow(schema, metadata={ICEBERG_SCHEMA: bytes(schema.model_dump_json(), UTF8)})

    arrow_table = pa.Table.from_arrays([pa.array(list(range(100)))], schema=pyarrow_schema)
    data_file = _write_table_to_data_file(f"{tmpdir}/streaming_basic.parquet", pyarrow_schema, arrow_table)
    data_file.spec_id = 0

    metadata = TableMetadataV2(
        location="file://a/b/",
        last_column_id=1,
        format_version=2,
        schemas=[schema],
        partition_specs=[PartitionSpec()],
    )
    scan = ArrowScan(
        table_metadata=metadata,
        io=PyArrowFileIO(),
        projected_schema=schema,
        row_filter=AlwaysTrue(),
        case_sensitive=True,
    )

    stream = scan.to_record_batches_streaming([FileScanTask(data_file=data_file)])
    # The streaming API must return a generator (lazy), not a pre-materialized list.
    assert isinstance(stream, types.GeneratorType)

    # Draining the generator yields the file's full row count.
    assert sum(len(rb) for rb in stream) == 100
4953+
4954+
def test_to_record_batches_streaming_with_batch_size(tmpdir: str) -> None:
    """A batch_size passed to to_record_batches_streaming caps the size of every yielded batch."""
    schema = Schema(NestedField(1, "id", IntegerType(), required=False))
    pyarrow_schema = schema_to_pyarrow(schema, metadata={ICEBERG_SCHEMA: bytes(schema.model_dump_json(), UTF8)})

    arrow_table = pa.Table.from_arrays([pa.array(list(range(500)))], schema=pyarrow_schema)
    data_file = _write_table_to_data_file(f"{tmpdir}/streaming_batch_size.parquet", pyarrow_schema, arrow_table)
    data_file.spec_id = 0

    scan = ArrowScan(
        table_metadata=TableMetadataV2(
            location="file://a/b/",
            last_column_id=1,
            format_version=2,
            schemas=[schema],
            partition_specs=[PartitionSpec()],
        ),
        io=PyArrowFileIO(),
        projected_schema=schema,
        row_filter=AlwaysTrue(),
        case_sensitive=True,
    )

    record_batches = list(scan.to_record_batches_streaming([FileScanTask(data_file=data_file)], batch_size=50))

    # Every row comes through, and no batch exceeds the requested size.
    assert sum(len(rb) for rb in record_batches) == 500
    assert all(len(rb) <= 50 for rb in record_batches)
4985+
4986+
def test_to_record_batches_streaming_with_limit(tmpdir: str) -> None:
    """A scan-level limit truncates the streamed output to at most `limit` rows."""
    schema = Schema(NestedField(1, "id", IntegerType(), required=False))
    pyarrow_schema = schema_to_pyarrow(schema, metadata={ICEBERG_SCHEMA: bytes(schema.model_dump_json(), UTF8)})

    # The file holds 500 rows, but the scan below is limited to 100.
    arrow_table = pa.Table.from_arrays([pa.array(list(range(500)))], schema=pyarrow_schema)
    data_file = _write_table_to_data_file(f"{tmpdir}/streaming_limit.parquet", pyarrow_schema, arrow_table)
    data_file.spec_id = 0

    scan = ArrowScan(
        table_metadata=TableMetadataV2(
            location="file://a/b/",
            last_column_id=1,
            format_version=2,
            schemas=[schema],
            partition_specs=[PartitionSpec()],
        ),
        io=PyArrowFileIO(),
        projected_schema=schema,
        row_filter=AlwaysTrue(),
        case_sensitive=True,
        limit=100,
    )

    record_batches = list(scan.to_record_batches_streaming([FileScanTask(data_file=data_file)]))
    assert sum(len(rb) for rb in record_batches) == 100
5016+
5017+
def test_to_record_batches_streaming_with_deletes(
    deletes_file: str, request: pytest.FixtureRequest, table_schema_simple: Schema
) -> None:
    """Streaming reads must apply positional deletes identically to the eager to_table path."""
    # Pick the data-file fixture matching the delete file's format.
    is_parquet = deletes_file.endswith(".parquet")
    file_format = FileFormat.PARQUET if is_parquet else FileFormat.ORC
    example_task = request.getfixturevalue("example_task" if is_parquet else "example_task_orc")

    task_with_delete = FileScanTask(
        data_file=example_task.file,
        delete_files={
            DataFile.from_args(
                content=DataFileContent.POSITION_DELETES,
                file_path=deletes_file,
                file_format=file_format,
            )
        },
    )

    scan = ArrowScan(
        table_metadata=TableMetadataV2(
            location="file://a/b/c.json",
            last_column_id=1,
            format_version=2,
            current_schema_id=1,
            schemas=[table_schema_simple],
            partition_specs=[PartitionSpec()],
        ),
        io=load_file_io(),
        projected_schema=table_schema_simple,
        row_filter=AlwaysTrue(),
    )

    # Stitch the streamed batches back into a single table and compare against the eager path.
    streamed_tables = [pa.Table.from_batches([rb]) for rb in scan.to_record_batches_streaming([task_with_delete])]
    streaming_table = pa.concat_tables(streamed_tables, promote_options="permissive")
    eager_table = scan.to_table(tasks=[task_with_delete])

    assert streaming_table.num_rows == eager_table.num_rows
    assert streaming_table.column_names == eager_table.column_names
5063+
5064+
def test_to_record_batches_streaming_multiple_files(tmpdir: str) -> None:
    """Streaming over several FileScanTasks yields the combined row count of all files."""
    schema = Schema(NestedField(1, "id", IntegerType(), required=False))
    pyarrow_schema = schema_to_pyarrow(schema, metadata={ICEBERG_SCHEMA: bytes(schema.model_dump_json(), UTF8)})

    # Three parquet files of 100, 200 and 300 rows respectively.
    row_counts = [(file_index + 1) * 100 for file_index in range(3)]
    tasks = []
    for file_index, num_rows in enumerate(row_counts):
        arrow_table = pa.Table.from_arrays([pa.array(list(range(num_rows)))], schema=pyarrow_schema)
        data_file = _write_table_to_data_file(f"{tmpdir}/multi_{file_index}.parquet", pyarrow_schema, arrow_table)
        data_file.spec_id = 0
        tasks.append(FileScanTask(data_file=data_file))

    scan = ArrowScan(
        table_metadata=TableMetadataV2(
            location="file://a/b/",
            last_column_id=1,
            format_version=2,
            schemas=[schema],
            partition_specs=[PartitionSpec()],
        ),
        io=PyArrowFileIO(),
        projected_schema=schema,
        row_filter=AlwaysTrue(),
        case_sensitive=True,
    )

    # 100 + 200 + 300 = 600 rows in total across all tasks.
    assert sum(len(rb) for rb in scan.to_record_batches_streaming(tasks)) == sum(row_counts)
0 commit comments