Skip to content

Commit 444549f

Browse files
sumedhsakdeo authored and claude committed
feat: add streaming flag to ArrowScan.to_record_batches
When streaming=True, batches are yielded as they are produced by PyArrow without materializing entire files into memory. Files are still processed sequentially, preserving file ordering. The inner method handles the global limit correctly when called with all tasks, avoiding double-counting. This addresses the OOM issue in #3036 for single-file-at-a-time streaming. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8f8a2d2 commit 444549f

File tree

4 files changed

+202
-3
lines changed

4 files changed

+202
-3
lines changed

mkdocs/docs/api.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,13 @@ for buf in tbl.scan().to_arrow_batch_reader(batch_size=1000):
362362
print(f"Buffer contains {len(buf)} rows")
363363
```
364364

365+
By default, each file's batches are materialized in memory before being yielded. For large files that may exceed available memory, use `streaming=True` to yield batches as they are produced without materializing entire files:
366+
367+
```python
368+
for buf in tbl.scan().to_arrow_batch_reader(streaming=True, batch_size=1000):
369+
print(f"Buffer contains {len(buf)} rows")
370+
```
371+
365372
To avoid any type inconsistencies during writing, you can convert the Iceberg table schema to Arrow:
366373

367374
```python
@@ -1635,6 +1642,15 @@ table.scan(
16351642
).to_arrow_batch_reader(batch_size=1000)
16361643
```
16371644

1645+
Use `streaming=True` to avoid materializing entire files in memory. This yields batches as they are produced by PyArrow, one file at a time:
1646+
1647+
```python
1648+
table.scan(
1649+
row_filter=GreaterThanOrEqual("trip_distance", 10.0),
1650+
selected_fields=("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"),
1651+
).to_arrow_batch_reader(streaming=True)
1652+
```
1653+
16381654
### Pandas
16391655

16401656
<!-- prettier-ignore-start -->

pyiceberg/io/pyarrow.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1761,7 +1761,9 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
17611761

17621762
return result
17631763

1764-
def to_record_batches(self, tasks: Iterable[FileScanTask], batch_size: int | None = None) -> Iterator[pa.RecordBatch]:
1764+
def to_record_batches(
1765+
self, tasks: Iterable[FileScanTask], batch_size: int | None = None, streaming: bool = False
1766+
) -> Iterator[pa.RecordBatch]:
17651767
"""Scan the Iceberg table and return an Iterator[pa.RecordBatch].
17661768
17671769
Returns an Iterator of pa.RecordBatch with data from the Iceberg table
@@ -1770,6 +1772,9 @@ def to_record_batches(self, tasks: Iterable[FileScanTask], batch_size: int | Non
17701772
17711773
Args:
17721774
tasks: FileScanTasks representing the data files and delete files to read from.
1775+
batch_size: The number of rows per batch. If None, PyArrow's default is used.
1776+
streaming: If True, yield batches as they are produced without materializing
1777+
entire files into memory. Files are still processed sequentially.
17731778
17741779
Returns:
17751780
An Iterator of PyArrow RecordBatches.
@@ -1781,6 +1786,14 @@ def to_record_batches(self, tasks: Iterable[FileScanTask], batch_size: int | Non
17811786
"""
17821787
deletes_per_file = _read_all_delete_files(self._io, tasks)
17831788

1789+
if streaming:
1790+
# Streaming path: process all tasks sequentially, yielding batches as produced.
1791+
# _record_batches_from_scan_tasks_and_deletes handles the limit internally
1792+
# when called with all tasks, so no outer limit check is needed.
1793+
yield from self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file, batch_size)
1794+
return
1795+
1796+
# Non-streaming path: existing behavior with executor.map + list()
17841797
total_row_count = 0
17851798
executor = ExecutorFactory.get_or_create()
17861799

pyiceberg/table/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2157,7 +2157,7 @@ def to_arrow(self) -> pa.Table:
21572157
self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
21582158
).to_table(self.plan_files())
21592159

2160-
def to_arrow_batch_reader(self, batch_size: int | None = None) -> pa.RecordBatchReader:
2160+
def to_arrow_batch_reader(self, batch_size: int | None = None, streaming: bool = False) -> pa.RecordBatchReader:
21612161
"""Return an Arrow RecordBatchReader from this DataScan.
21622162
21632163
For large results, using a RecordBatchReader requires less memory than
@@ -2166,6 +2166,8 @@ def to_arrow_batch_reader(self, batch_size: int | None = None) -> pa.RecordBatch
21662166
21672167
Args:
21682168
batch_size: The number of rows per batch. If None, PyArrow's default is used.
2169+
streaming: If True, yield batches as they are produced without materializing
2170+
entire files into memory. Files are still processed sequentially.
21692171
21702172
Returns:
21712173
pa.RecordBatchReader: Arrow RecordBatchReader from the Iceberg table's DataScan
@@ -2178,7 +2180,7 @@ def to_arrow_batch_reader(self, batch_size: int | None = None) -> pa.RecordBatch
21782180
target_schema = schema_to_pyarrow(self.projection())
21792181
batches = ArrowScan(
21802182
self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
2181-
).to_record_batches(self.plan_files(), batch_size=batch_size)
2183+
).to_record_batches(self.plan_files(), batch_size=batch_size, streaming=streaming)
21822184

21832185
return pa.RecordBatchReader.from_batches(
21842186
target_schema,

tests/io/test_pyarrow.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3106,6 +3106,174 @@ def test_task_to_record_batches_default_batch_size(tmpdir: str) -> None:
31063106
assert len(batches[0]) == num_rows
31073107

31083108

3109+
def _create_scan_and_tasks(
    tmpdir: str,
    num_files: int = 1,
    rows_per_file: int = 100,
    limit: int | None = None,
    delete_rows_per_file: list[list[int]] | None = None,
) -> tuple[ArrowScan, list[FileScanTask]]:
    """Build an ArrowScan plus one FileScanTask per generated parquet file.

    File ``i`` holds the consecutive values ``[i * rows_per_file, (i + 1) * rows_per_file)``
    in a single required long column named "col".

    Args:
        delete_rows_per_file: If provided, a list of lists of row positions to delete
            per file. Length must match num_files. Each inner list contains 0-based
            row positions within that file to mark as positionally deleted.
    """
    table_schema = Schema(NestedField(1, "col", LongType(), required=True))
    pa_schema = pa.schema([pa.field("col", pa.int64(), nullable=False, metadata={PYARROW_PARQUET_FIELD_ID_KEY: "1"})])

    tasks: list[FileScanTask] = []
    for file_index in range(num_files):
        first_value = file_index * rows_per_file
        file_table = pa.table({"col": pa.array(range(first_value, first_value + rows_per_file))}, schema=pa_schema)
        data_file = _write_table_to_data_file(f"{tmpdir}/file_{file_index}.parquet", pa_schema, file_table)
        data_file.spec_id = 0

        # Optionally attach a positional-delete file covering this data file.
        positions = delete_rows_per_file[file_index] if delete_rows_per_file else None
        delete_files = set()
        if positions:
            delete_table = pa.table({
                "file_path": [data_file.file_path] * len(positions),
                "pos": positions,
            })
            delete_path = f"{tmpdir}/deletes_{file_index}.parquet"
            pq.write_table(delete_table, delete_path)
            delete_files.add(
                DataFile.from_args(
                    content=DataFileContent.POSITION_DELETES,
                    file_path=delete_path,
                    file_format=FileFormat.PARQUET,
                    partition={},
                    record_count=len(positions),
                    file_size_in_bytes=22,
                )
            )

        tasks.append(FileScanTask(data_file=data_file, delete_files=delete_files))

    scan = ArrowScan(
        table_metadata=TableMetadataV2(
            location="file://a/b/",
            last_column_id=1,
            format_version=2,
            schemas=[table_schema],
            partition_specs=[PartitionSpec()],
        ),
        io=PyArrowFileIO(),
        projected_schema=table_schema,
        row_filter=AlwaysTrue(),
        case_sensitive=True,
        limit=limit,
    )
    return scan, tasks
3168+
3169+
3170+
def test_streaming_false_produces_same_results(tmpdir: str) -> None:
    """Test that streaming=True produces the same results as the default (streaming=False).

    Bug fix: the second call previously also passed ``streaming=False``, so the
    comparison never exercised the streaming path despite the variable name
    ``batches_streaming``. It now uses ``streaming=True`` and compares the
    actual values, not just the row counts.
    """
    scan, tasks = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)

    batches_default = list(scan.to_record_batches(tasks, streaming=False))
    # Re-create tasks so the second scan reads fresh, independent inputs.
    _, tasks2 = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)
    batches_streaming = list(scan.to_record_batches(tasks2, streaming=True))

    total_default = sum(len(b) for b in batches_default)
    total_streaming = sum(len(b) for b in batches_streaming)
    assert total_default == 300
    assert total_streaming == 300
    # Both modes must yield the same multiset of values, not merely the same count.
    values_default = sorted(v for b in batches_default for v in b.column("col").to_pylist())
    values_streaming = sorted(v for b in batches_streaming for v in b.column("col").to_pylist())
    assert values_default == values_streaming
3183+
3184+
3185+
def test_streaming_true_yields_all_batches(tmpdir: str) -> None:
    """Verify streaming=True emits every row from every file."""
    scan, file_tasks = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)

    produced = list(scan.to_record_batches(file_tasks, streaming=True))

    assert sum(len(batch) for batch in produced) == 300
    # Every value from every file must be present.
    observed = sorted(value for batch in produced for value in batch.column("col").to_pylist())
    assert observed == list(range(300))
3196+
3197+
3198+
def test_streaming_true_with_limit(tmpdir: str) -> None:
    """Verify the scan-level row limit is honored when streaming=True."""
    scan, file_tasks = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100, limit=150)

    # 3 files x 100 rows = 300 candidates, but the limit caps the output at 150.
    row_count = 0
    for batch in scan.to_record_batches(file_tasks, streaming=True):
        row_count += len(batch)

    assert row_count == 150
3206+
3207+
3208+
def test_streaming_file_ordering_preserved(tmpdir: str) -> None:
    """Verify streaming=True yields rows in file order."""
    scan, file_tasks = _create_scan_and_tasks(tmpdir, num_files=3, rows_per_file=100)

    ordered_values: list[int] = []
    for batch in scan.to_record_batches(file_tasks, streaming=True):
        ordered_values.extend(batch.column("col").to_pylist())

    # File 0 holds 0-99, file 1 holds 100-199, file 2 holds 200-299;
    # sequential per-file processing must keep that ordering intact.
    assert ordered_values == list(range(300))
3217+
3218+
3219+
def test_streaming_with_positional_deletes(tmpdir: str) -> None:
    """Verify streaming=True correctly applies positional deletes."""
    # 3 files, 10 rows each; delete rows 0 and 5 from file 0, row 3 from file 1,
    # and nothing from file 2.
    scan, file_tasks = _create_scan_and_tasks(
        tmpdir,
        num_files=3,
        rows_per_file=10,
        delete_rows_per_file=[[0, 5], [3], []],
    )

    produced = list(scan.to_record_batches(file_tasks, streaming=True))

    assert sum(len(batch) for batch in produced) == 27  # 30 rows minus 3 deletes
    observed = sorted(value for batch in produced for value in batch.column("col").to_pylist())
    # File 0 (values 0-9) loses positions 0 and 5; file 1 (10-19) loses position 3;
    # file 2 (20-29) is untouched.
    expected = [1, 2, 3, 4, 6, 7, 8, 9] + [10, 11, 12, 14, 15, 16, 17, 18, 19] + list(range(20, 30))
    assert observed == sorted(expected)
3239+
3240+
3241+
def test_streaming_with_positional_deletes_and_limit(tmpdir: str) -> None:
    """Verify streaming=True honors the row limit after positional deletes."""
    # 3 files, 10 rows each, with the first row of every file deleted;
    # the limit applies to the rows that survive the deletes.
    scan, file_tasks = _create_scan_and_tasks(
        tmpdir,
        num_files=3,
        rows_per_file=10,
        limit=15,
        delete_rows_per_file=[[0], [0], [0]],
    )

    row_count = sum(len(batch) for batch in scan.to_record_batches(file_tasks, streaming=True))

    assert row_count == 15
3256+
3257+
3258+
def test_default_mode_with_positional_deletes(tmpdir: str) -> None:
    """Verify the default (non-streaming) mode correctly applies positional deletes."""
    # Mirror of the streaming deletes test: delete rows 0 and 5 from file 0,
    # row 3 from file 1, and nothing from file 2.
    scan, file_tasks = _create_scan_and_tasks(
        tmpdir,
        num_files=3,
        rows_per_file=10,
        delete_rows_per_file=[[0, 5], [3], []],
    )

    produced = list(scan.to_record_batches(file_tasks, streaming=False))

    assert sum(len(batch) for batch in produced) == 27  # 30 rows minus 3 deletes
    observed = sorted(value for batch in produced for value in batch.column("col").to_pylist())
    expected = [1, 2, 3, 4, 6, 7, 8, 9] + [10, 11, 12, 14, 15, 16, 17, 18, 19] + list(range(20, 30))
    assert observed == sorted(expected)
3275+
3276+
31093277
def test_parse_location_defaults() -> None:
31103278
"""Test that parse_location uses defaults."""
31113279

0 commit comments

Comments (0)