fix: update DataFrame iterator to yield pyarrow.RecordBatch objects directly

kosiew · kosiew · commit b5ccc4f2b343 · 2025-09-07T14:03:29.000+08:00
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -1028,22 +1028,6 @@ def to_arrow_table(self) -> pa.Table:
         """
         return self.df.to_arrow_table()
 
-    def __iter__(self) -> Iterator[pa.RecordBatch]:
-        """Iterate over :py:class:`pyarrow.RecordBatch` objects.
-
-        This executes the DataFrame and yields each partition as a native
-        :py:class:`pyarrow.RecordBatch`.
-
-        Yields:
-            pyarrow.RecordBatch: the next batch in the result stream.
-        """
-        for batch in self.execute_stream():
-            # ``execute_stream`` yields batches that may be ``RecordBatch``
-            # wrappers or ``pyarrow.RecordBatch`` objects directly. Convert
-            # to native PyArrow batches when necessary to provide a consistent
-            # iterator interface.
-            yield batch.to_pyarrow() if hasattr(batch, "to_pyarrow") else batch
-
     def execute_stream(self) -> RecordBatchStream:
         """Executes this DataFrame and returns a stream over a single partition.
 
@@ -1143,11 +1127,12 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
         # preserving the original partition order.
         return self.df.__arrow_c_stream__(requested_schema)
 
-    def __iter__(self) -> Iterator[RecordBatch]:
-        """Yield record batches from the DataFrame without materializing results.
+    def __iter__(self) -> Iterator[pa.RecordBatch]:
+        """Iterate over :class:`pyarrow.RecordBatch` objects.
 
-        This implementation delegates to :func:`to_record_batch_stream`, which
-        executes the DataFrame and returns a :class:`RecordBatchStream`.
+        Results are streamed without materializing the full DataFrame. This
+        implementation delegates to :func:`to_record_batch_stream`, which executes
+        the :class:`DataFrame` and returns a :class:`RecordBatchStream`.
         """
         return to_record_batch_stream(self).__iter__()