Commit f2e6335

feat: add to_stream method for lazy DataFrame processing and update iteration behavior

1 parent df4c042

3 files changed: 51 additions & 43 deletions

docs/source/user-guide/dataframe/index.rst

Lines changed: 12 additions & 2 deletions
@@ -168,8 +168,18 @@ out-of-memory errors.
     for batch in reader:
         ... # process each batch as it is produced
 
-DataFrames are also iterable, yielding :class:`pyarrow.RecordBatch` objects
-lazily so you can loop over results directly:
+DataFrames expose :py:meth:`~datafusion.DataFrame.to_stream`, which returns a
+``RecordBatchStream`` for lazily processing results without materializing them
+all at once:
+
+.. code-block:: python
+
+    stream = df.to_stream()
+    for batch in stream:
+        ... # process each batch as it is produced
+
+DataFrames themselves are also iterable and delegate to ``to_stream()`` under
+the hood:
 
 .. code-block:: python
 
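Put together, the documented pattern looks like the following; a minimal
runnable sketch, assuming a SessionContext holding a small in-memory table
(the data and the print calls are illustrative only):

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1, 2, 3]})

    # Explicit form: to_stream() returns a RecordBatchStream whose
    # batches are produced on demand rather than materialized up front.
    for batch in df.to_stream():
        print(batch.to_pyarrow().to_pydict())

    # Implicit form: iterating the DataFrame delegates to to_stream()
    # and yields pyarrow.RecordBatch objects directly.
    for batch in df:
        print(batch.to_pydict())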

python/datafusion/dataframe.py

Lines changed: 29 additions & 7 deletions
@@ -25,6 +25,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    AsyncIterator,
     Iterable,
     Iterator,
     Literal,
@@ -291,7 +292,9 @@ class DataFrame:
     """Two dimensional table representation of data.
 
     DataFrame objects are iterable; iterating over a DataFrame yields
-    :class:`datafusion.record_batch.RecordBatch` instances lazily.
+    :class:`pyarrow.RecordBatch` instances lazily. Use
+    :py:meth:`to_stream` to obtain a :class:`~datafusion.record_batch.RecordBatchStream`
+    for explicit iteration over the results.
 
     See :ref:`user_guide_concepts` in the online documentation for more information.
     """
@@ -1022,6 +1025,14 @@ def to_arrow_table(self) -> pa.Table:
         """
         return self.df.to_arrow_table()
 
+    def to_stream(self) -> RecordBatchStream:
+        """Execute this :py:class:`DataFrame` and return a record batch stream.
+
+        This is a convenience wrapper around :py:meth:`execute_stream` and can be
+        used to iterate over results without materializing them.
+        """
+        return self.execute_stream()
+
     def execute_stream(self) -> RecordBatchStream:
         """Executes this DataFrame and returns a stream over a single partition.
 
@@ -1121,14 +1132,25 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
         # preserving the original partition order.
         return self.df.__arrow_c_stream__(requested_schema)
 
-    def __iter__(self) -> Iterator[RecordBatch]:
-        """Yield record batches from the DataFrame without materializing results.
+    def __iter__(self) -> Iterator[pa.RecordBatch]:
+        """Yield record batches from this DataFrame lazily.
 
-        This executes the DataFrame using DataFusion's partitioned streaming
-        APIs and yields :class:`datafusion.record_batch.RecordBatch` objects.
+        This delegates to :py:meth:`to_stream` and converts each batch to a
+        :class:`pyarrow.RecordBatch` without eagerly materializing the entire
+        result set.
         """
-        for stream in self.execute_stream_partitioned():
-            yield from stream
+        for batch in self.to_stream():
+            yield batch.to_pyarrow()
+
+    def __aiter__(self) -> AsyncIterator[pa.RecordBatch]:
+        """Asynchronously yield record batches from this DataFrame lazily."""
+        stream = self.to_stream()
+
+        async def iterator() -> AsyncIterator[pa.RecordBatch]:
+            async for batch in stream:
+                yield batch.to_pyarrow()
+
+        return iterator()
 
     def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
         """Apply a function to the current DataFrame which returns another DataFrame.

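Because __aiter__ wraps the stream in an async generator, a DataFrame can
also be consumed with async for. A sketch of that usage, assuming the same
kind of in-memory setup as above:

    import asyncio

    from datafusion import SessionContext


    async def main() -> None:
        ctx = SessionContext()
        df = ctx.from_pydict({"a": [1, 2, 3]})

        # Each batch is awaited from the underlying RecordBatchStream
        # and converted to a pyarrow.RecordBatch, mirroring the sync path.
        async for batch in df:
            print(batch.to_pydict())


    asyncio.run(main())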
python/tests/test_dataframe_iter_stream.py

Lines changed: 10 additions & 34 deletions
@@ -15,41 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import pyarrow as pa
-
-
-def test_iter_releases_reader(monkeypatch, ctx):
-    batches = [
-        pa.RecordBatch.from_pydict({"a": [1]}),
-        pa.RecordBatch.from_pydict({"a": [2]}),
-    ]
-
-    class DummyReader:
-        def __init__(self, batches):
-            self._iter = iter(batches)
-            self.closed = False
-
-        def __iter__(self):
-            return self
-
-        def __next__(self):
-            return next(self._iter)
-
-        def close(self):
-            self.closed = True
-
-    dummy_reader = DummyReader(batches)
-
-    class FakeRecordBatchReader:
-        @staticmethod
-        def _import_from_c_capsule(*_args, **_kwargs):
-            return dummy_reader
-
-    monkeypatch.setattr(pa, "RecordBatchReader", FakeRecordBatchReader)
 
+def test_to_stream(ctx):
     df = ctx.from_pydict({"a": [1, 2]})
+    stream = df.to_stream()
+    batches = [rb.to_pyarrow() for rb in stream]
+    assert len(batches) == 1
+    assert batches[0].to_pydict() == {"a": [1, 2]}
 
-    for _ in df:
-        break
 
-    assert dummy_reader.closed
+def test_dataframe_iter(ctx):
+    df = ctx.from_pydict({"a": [1, 2]})
+    batches = list(df)
+    assert len(batches) == 1
+    assert batches[0].to_pydict() == {"a": [1, 2]}
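Both tests rely on a ctx fixture defined elsewhere in the test suite rather
than in this commit; a minimal sketch of such a fixture, assuming the stock
datafusion SessionContext:

    # conftest.py (hypothetical sketch; the real fixture lives elsewhere)
    import pytest

    from datafusion import SessionContext


    @pytest.fixture
    def ctx() -> SessionContext:
        # A fresh context per test keeps any registered tables isolated.
        return SessionContext()

Each test expects exactly one batch because from_pydict registers the
dictionary as a single in-memory record batch, so the stream yields it in
one piece.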
