fix: update DataFrame iteration to yield DataFusion RecordBatch objects

kosiew · kosiew · commit b54edc4de63b · 2025-09-06T21:27:07.000+08:00
diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst
@@ -168,12 +168,14 @@ out-of-memory errors.
     for batch in reader:
         ...  # process each batch as it is produced
 
-DataFrames are also iterable, yielding :class:`pyarrow.RecordBatch` objects
-lazily so you can loop over results directly:
+DataFrames are also iterable, lazily yielding :class:`datafusion.RecordBatch`
+objects that implement the Arrow C data interface. These batches can be
+consumed by libraries like PyArrow without copying:
 
 .. code-block:: python
 
     for batch in df:
+        pa_batch = batch.to_pyarrow()  # optional conversion
         ...  # process each batch as it is produced
 
 See :doc:`../io/arrow` for additional details on the Arrow interface.
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -54,6 +54,7 @@
     import pyarrow as pa
 
     from datafusion._internal import expr as expr_internal
+    from datafusion.record_batch import RecordBatch
 
 from enum import Enum
 
@@ -1121,22 +1122,16 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
         # preserving the original partition order.
         return self.df.__arrow_c_stream__(requested_schema)
 
-    def __iter__(self) -> Iterator[pa.RecordBatch]:
-        """Yield record batches from the DataFrame without materializing results.
+    def __iter__(self) -> Iterator[RecordBatch]:
+        """Yield DataFusion record batches without materializing results.
 
-        This implementation streams record batches via the Arrow C Stream
-        interface, allowing callers such as :func:`pyarrow.Table.from_batches` to
-        consume results lazily. The DataFrame is executed using DataFusion's
-        partitioned streaming APIs so ``collect`` is never invoked and batch
-        order across partitions is preserved.
+        Batches are produced lazily using DataFusion's partitioned streaming
+        APIs so ``collect`` is never invoked. Each returned batch exposes the
+        Arrow C data interface and can be consumed by downstream libraries that
+        support ``__arrow_c_array__``.
         """
-        from contextlib import closing
-
-        import pyarrow as pa
-
-        reader = pa.RecordBatchReader._import_from_c_capsule(self.__arrow_c_stream__())
-        with closing(reader):
-            yield from reader
+        for stream in self.execute_stream_partitioned():
+            yield from stream
 
     def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
         """Apply a function to the current DataFrame which returns another DataFrame.
diff --git a/python/tests/test_dataframe_iter_stream.py b/python/tests/test_dataframe_iter_stream.py
@@ -15,41 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import pyarrow as pa
+import datafusion
 
 
-def test_iter_releases_reader(monkeypatch, ctx):
-    batches = [
-        pa.RecordBatch.from_pydict({"a": [1]}),
-        pa.RecordBatch.from_pydict({"a": [2]}),
-    ]
-
-    class DummyReader:
-        def __init__(self, batches):
-            self._iter = iter(batches)
-            self.closed = False
-
-        def __iter__(self):
-            return self
-
-        def __next__(self):
-            return next(self._iter)
-
-        def close(self):
-            self.closed = True
-
-    dummy_reader = DummyReader(batches)
-
-    class FakeRecordBatchReader:
-        @staticmethod
-        def _import_from_c_capsule(*_args, **_kwargs):
-            return dummy_reader
-
-    monkeypatch.setattr(pa, "RecordBatchReader", FakeRecordBatchReader)
-
+def test_iter_returns_record_batch(ctx):
     df = ctx.from_pydict({"a": [1, 2]})
-
-    for _ in df:
-        break
-
-    assert dummy_reader.closed
+    batch = next(iter(df))
+    assert isinstance(batch, datafusion.RecordBatch)