Skip to content

Commit 7ee5924

Browse files
committed
feat: add to_record_batch_stream function and update DataFrame iteration methods
1 parent 6e85080 commit 7ee5924

File tree

5 files changed

+52
-11
lines changed

5 files changed

+52
-11
lines changed

python/datafusion/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
)
5454
from .io import read_avro, read_csv, read_json, read_parquet
5555
from .plan import ExecutionPlan, LogicalPlan
56-
from .record_batch import RecordBatch, RecordBatchStream
56+
from .record_batch import RecordBatch, RecordBatchStream, to_record_batch_stream
5757
from .user_defined import (
5858
Accumulator,
5959
AggregateUDF,
@@ -107,6 +107,7 @@
107107
"read_json",
108108
"read_parquet",
109109
"substrait",
110+
"to_record_batch_stream",
110111
"udaf",
111112
"udf",
112113
"udtf",

python/datafusion/dataframe.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from typing import (
2626
TYPE_CHECKING,
2727
Any,
28+
AsyncIterator,
2829
Iterable,
2930
Iterator,
3031
Literal,
@@ -43,7 +44,11 @@
4344
from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
4445
from datafusion.expr import Expr, SortExpr, sort_or_default
4546
from datafusion.plan import ExecutionPlan, LogicalPlan
46-
from datafusion.record_batch import RecordBatchStream
47+
from datafusion.record_batch import (
48+
RecordBatch,
49+
RecordBatchStream,
50+
to_record_batch_stream,
51+
)
4752

4853
if TYPE_CHECKING:
4954
import pathlib
@@ -1123,15 +1128,20 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
11231128
return self.df.__arrow_c_stream__(requested_schema)
11241129

11251130
def __iter__(self) -> Iterator[RecordBatch]:
    """Yield record batches from the DataFrame without materializing results.

    The DataFrame is executed via :func:`to_record_batch_stream`, which
    returns a :class:`RecordBatchStream`; the stream's own iterator is
    handed straight back to the caller.
    """
    return iter(to_record_batch_stream(self))

def __aiter__(self) -> AsyncIterator[RecordBatch]:
    """Asynchronously yield record batches from the DataFrame.

    Obtains a :class:`RecordBatchStream` via
    :func:`to_record_batch_stream` and returns that stream's
    asynchronous iterator.
    """
    stream = to_record_batch_stream(self)
    return stream.__aiter__()
11351145

11361146
def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
11371147
"""Apply a function to the current DataFrame which returns another DataFrame.

python/datafusion/record_batch.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,13 @@
2525

2626
from typing import TYPE_CHECKING
2727

28+
import datafusion._internal as df_internal
29+
2830
if TYPE_CHECKING:
2931
import pyarrow as pa
3032
import typing_extensions
3133

32-
import datafusion._internal as df_internal
34+
from datafusion.dataframe import DataFrame
3335

3436

3537
class RecordBatch:
@@ -79,3 +81,15 @@ def __aiter__(self) -> typing_extensions.Self:
7981
def __iter__(self) -> typing_extensions.Self:
    """Return the stream itself as a synchronous iterator over record batches."""
    return self
84+
85+
86+
def to_record_batch_stream(df: DataFrame) -> RecordBatchStream:
    """Convert a DataFrame into a RecordBatchStream.

    Executes the DataFrame and exposes its results as a stream of record
    batches rather than a fully materialized collection.

    Args:
        df: DataFrame to convert.

    Returns:
        A RecordBatchStream representing the DataFrame.
    """
    stream = df.execute_stream()
    return stream

python/tests/test_dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1314,7 +1314,7 @@ def test_execution_plan(aggregate_df):
13141314
@pytest.mark.asyncio
13151315
async def test_async_iteration_of_df(aggregate_df):
13161316
rows_returned = 0
1317-
async for batch in aggregate_df.execute_stream():
1317+
async for batch in aggregate_df:
13181318
assert batch is not None
13191319
rows_returned += len(batch.to_pyarrow()[0])
13201320

python/tests/test_dataframe_iter_stream.py renamed to python/tests/test_dataframe_iter.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,26 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18+
import pytest
1819
import datafusion
1920

2021

22+
def test_iter_dataframe(ctx):
    """Plain iteration over a DataFrame yields its record batches."""
    df = ctx.from_pydict({"a": [1, 2]})
    collected = [b.to_pyarrow() for b in df]
    assert len(collected) == 1
    assert collected[0].column(0).to_pylist() == [1, 2]
27+
28+
2129
def test_iter_returns_record_batch(ctx):
    """The first element produced by iterating a DataFrame is a RecordBatch."""
    frame = ctx.from_pydict({"a": [1, 2]})
    first = next(iter(frame))
    assert isinstance(first, datafusion.RecordBatch)
33+
34+
35+
@pytest.mark.asyncio
async def test_async_iter_dataframe(ctx):
    """Async iteration yields the same single batch as synchronous iteration."""
    frame = ctx.from_pydict({"a": [1, 2]})
    collected = []
    async for batch in frame:
        collected.append(batch)
    assert len(collected) == 1
    assert collected[0].to_pyarrow().column(0).to_pylist() == [1, 2]

0 commit comments

Comments
 (0)