Skip to content

Commit b72b7ba

Browse files
sumedhsakdeo authored and claude committed
feat: add streaming flag to ArrowScan.to_record_batches
When streaming=True, batches are yielded as they are produced by PyArrow without materializing entire files into memory. Files are still processed sequentially, preserving file ordering. The inner method handles the global limit correctly when called with all tasks, avoiding double-counting. This addresses the OOM issue in apache#3036 for single-file-at-a-time streaming.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8f8a2d2 commit b72b7ba

File tree

4 files changed

+136
-26
lines changed

4 files changed

+136
-26
lines changed

mkdocs/docs/api.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,13 @@ for buf in tbl.scan().to_arrow_batch_reader(batch_size=1000):
362362
print(f"Buffer contains {len(buf)} rows")
363363
```
364364

365+
By default, each file's batches are materialized in memory before being yielded. For large files that may exceed available memory, use `streaming=True` to yield batches as they are produced without materializing entire files:
366+
367+
```python
368+
for buf in tbl.scan().to_arrow_batch_reader(streaming=True, batch_size=1000):
369+
print(f"Buffer contains {len(buf)} rows")
370+
```
371+
365372
To avoid any type inconsistencies during writing, you can convert the Iceberg table schema to Arrow:
366373

367374
```python
@@ -1635,6 +1642,15 @@ table.scan(
16351642
).to_arrow_batch_reader(batch_size=1000)
16361643
```
16371644

1645+
Use `streaming=True` to avoid materializing entire files in memory. This yields batches as they are produced by PyArrow, one file at a time:
1646+
1647+
```python
1648+
table.scan(
1649+
row_filter=GreaterThanOrEqual("trip_distance", 10.0),
1650+
selected_fields=("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"),
1651+
).to_arrow_batch_reader(streaming=True)
1652+
```
1653+
16381654
### Pandas
16391655

16401656
<!-- prettier-ignore-start -->

pyiceberg/io/pyarrow.py

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1761,7 +1761,9 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
17611761

17621762
return result
17631763

1764-
def to_record_batches(
    self, tasks: Iterable[FileScanTask], batch_size: int | None = None, streaming: bool = False
) -> Iterator[pa.RecordBatch]:
    """Scan the Iceberg table and return an Iterator[pa.RecordBatch].

    Returns an Iterator of pa.RecordBatch with data from the Iceberg table
    by resolving the right columns that match the current table schema.

    Args:
        tasks: FileScanTasks representing the data files and delete files to read from.
        batch_size: The number of rows per batch. If None, PyArrow's default is used.
        streaming: If True, yield batches as they are produced without materializing
            entire files into memory. Files are still processed sequentially.

    Returns:
        An Iterator of PyArrow RecordBatches.
    """
    # `tasks` is typed Iterable and may be a one-shot iterator (e.g. a generator from
    # plan_files()), but it is consumed twice: once by _read_all_delete_files and once
    # by the scan below. Materialize it so both passes see every task.
    tasks = list(tasks)
    deletes_per_file = _read_all_delete_files(self._io, tasks)

    if streaming:
        # Streaming path: process all tasks sequentially, yielding batches as produced.
        # _record_batches_from_scan_tasks_and_deletes handles the limit internally
        # when called with all tasks, so no outer limit check is needed.
        yield from self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file, batch_size)
    else:
        # Non-streaming path: existing behavior with executor.map + list().
        total_row_count = 0
        executor = ExecutorFactory.get_or_create()

        def batches_for_task(task: FileScanTask) -> list[pa.RecordBatch]:
            # Materialize the iterator here to ensure execution happens within the executor.
            # Otherwise, the iterator would be lazily consumed later (in the main thread),
            # defeating the purpose of using executor.map.
            return list(self._record_batches_from_scan_tasks_and_deletes([task], deletes_per_file, batch_size))

        limit_reached = False
        for batches in executor.map(batches_for_task, tasks):
            for batch in batches:
                current_batch_size = len(batch)
                if self._limit is not None and total_row_count + current_batch_size >= self._limit:
                    # Trim the final batch so exactly self._limit rows are emitted overall.
                    yield batch.slice(0, self._limit - total_row_count)

                    limit_reached = True
                    break
                else:
                    yield batch
                    total_row_count += current_batch_size

            if limit_reached:
                # This break will also cancel all running tasks in the executor
                break

18101822
def _record_batches_from_scan_tasks_and_deletes(
18111823
self, tasks: Iterable[FileScanTask], deletes_per_file: dict[str, list[ChunkedArray]], batch_size: int | None = None

pyiceberg/table/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2157,7 +2157,7 @@ def to_arrow(self) -> pa.Table:
21572157
self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
21582158
).to_table(self.plan_files())
21592159

2160-
def to_arrow_batch_reader(self, batch_size: int | None = None) -> pa.RecordBatchReader:
2160+
def to_arrow_batch_reader(self, batch_size: int | None = None, streaming: bool = False) -> pa.RecordBatchReader:
21612161
"""Return an Arrow RecordBatchReader from this DataScan.
21622162
21632163
For large results, using a RecordBatchReader requires less memory than
@@ -2166,6 +2166,8 @@ def to_arrow_batch_reader(self, batch_size: int | None = None) -> pa.RecordBatch
21662166
21672167
Args:
21682168
batch_size: The number of rows per batch. If None, PyArrow's default is used.
2169+
streaming: If True, yield batches as they are produced without materializing
2170+
entire files into memory. Files are still processed sequentially.
21692171
21702172
Returns:
21712173
pa.RecordBatchReader: Arrow RecordBatchReader from the Iceberg table's DataScan
@@ -2178,7 +2180,7 @@ def to_arrow_batch_reader(self, batch_size: int | None = None) -> pa.RecordBatch
21782180
target_schema = schema_to_pyarrow(self.projection())
21792181
batches = ArrowScan(
21802182
self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
2181-
).to_record_batches(self.plan_files(), batch_size=batch_size)
2183+
).to_record_batches(self.plan_files(), batch_size=batch_size, streaming=streaming)
21822184

21832185
return pa.RecordBatchReader.from_batches(
21842186
target_schema,

tests/io/test_pyarrow.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3106,6 +3106,86 @@ def test_task_to_record_batches_default_batch_size(tmpdir: str) -> None:
31063106
assert len(batches[0]) == num_rows
31073107

31083108

3109+
def _create_scan_and_tasks(
    tmpdir: str, num_files: int = 1, rows_per_file: int = 100, limit: int | None = None
) -> tuple[ArrowScan, list[FileScanTask]]:
    """Build an ArrowScan plus one FileScanTask per generated parquet file.

    Each file i holds the consecutive values [i * rows_per_file, (i + 1) * rows_per_file)
    in a single required long column named "col".
    """
    iceberg_schema = Schema(NestedField(1, "col", LongType(), required=True))
    arrow_schema = pa.schema(
        [pa.field("col", pa.int64(), nullable=False, metadata={PYARROW_PARQUET_FIELD_ID_KEY: "1"})]
    )

    scan_tasks: list[FileScanTask] = []
    for file_index in range(num_files):
        first_value = file_index * rows_per_file
        values = pa.array(range(first_value, first_value + rows_per_file))
        contents = pa.table({"col": values}, schema=arrow_schema)
        written = _write_table_to_data_file(f"{tmpdir}/file_{file_index}.parquet", arrow_schema, contents)
        written.spec_id = 0
        scan_tasks.append(FileScanTask(written))

    metadata = TableMetadataV2(
        location="file://a/b/",
        last_column_id=1,
        format_version=2,
        schemas=[iceberg_schema],
        partition_specs=[PartitionSpec()],
    )
    scan = ArrowScan(
        table_metadata=metadata,
        io=PyArrowFileIO(),
        projected_schema=iceberg_schema,
        row_filter=AlwaysTrue(),
        case_sensitive=True,
        limit=limit,
    )
    return scan, scan_tasks
3138+
3139+
3140+
def test_streaming_false_produces_same_results(tmpdir: str) -> None:
    """Test that streaming=True produces the same results as the default (streaming=False)."""
    scan, tasks = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)

    batches_default = list(scan.to_record_batches(tasks, streaming=False))
    # Re-create tasks since iterators are consumed
    _, tasks2 = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)
    # Fix: the second read must use streaming=True — previously this passed
    # streaming=False and compared the default path against itself.
    batches_streaming = list(scan.to_record_batches(tasks2, streaming=True))

    total_default = sum(len(b) for b in batches_default)
    total_streaming = sum(len(b) for b in batches_streaming)
    assert total_default == 300
    assert total_streaming == 300

    # Both modes must yield identical row values in identical order, not just counts.
    values_default = [v for b in batches_default for v in b.column("col").to_pylist()]
    values_streaming = [v for b in batches_streaming for v in b.column("col").to_pylist()]
    assert values_default == values_streaming
3153+
3154+
3155+
def test_streaming_true_yields_all_batches(tmpdir: str) -> None:
    """Test that streaming=True yields all batches correctly."""
    scan, tasks = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)

    produced = list(scan.to_record_batches(tasks, streaming=True))

    # Every row from all three files must come through.
    assert sum(len(rb) for rb in produced) == 300
    observed = sorted(value for rb in produced for value in rb.column("col").to_pylist())
    assert observed == list(range(300))
3166+
3167+
3168+
def test_streaming_true_with_limit(tmpdir: str) -> None:
    """Test that streaming=True respects the row limit."""
    scan, tasks = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100, limit=150)

    # Consume the stream batch by batch, counting rows as they arrive.
    row_total = 0
    for rb in scan.to_record_batches(tasks, streaming=True):
        row_total += len(rb)

    assert row_total == 150
3176+
3177+
3178+
def test_streaming_file_ordering_preserved(tmpdir: str) -> None:
    """Test that file ordering is preserved in both streaming modes."""
    scan, tasks = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)

    batches = list(scan.to_record_batches(tasks, streaming=True))
    all_values = [v for b in batches for v in b.column("col").to_pylist()]

    # Values should be in file order: 0-99 from file 0, 100-199 from file 1, 200-299 from file 2
    assert all_values == list(range(300))

    # The docstring promises both modes: also verify the non-streaming path,
    # whose executor.map yields results in task submission order.
    _, tasks2 = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)
    batches_default = list(scan.to_record_batches(tasks2, streaming=False))
    values_default = [v for b in batches_default for v in b.column("col").to_pylist()]
    assert values_default == list(range(300))
3187+
3188+
31093189
def test_parse_location_defaults() -> None:
31103190
"""Test that parse_location uses defaults."""
31113191

0 commit comments

Comments
 (0)