Skip to content

Commit b5ccc4f

Browse files
committed
fix: update DataFrame iterator to yield pyarrow.RecordBatch objects directly
1 parent ce2f9f5 commit b5ccc4f

File tree

1 file changed

+5
-20
lines changed

1 file changed

+5
-20
lines changed

python/datafusion/dataframe.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,22 +1028,6 @@ def to_arrow_table(self) -> pa.Table:
10281028
"""
10291029
return self.df.to_arrow_table()
10301030

1031-
def __iter__(self) -> Iterator[pa.RecordBatch]:
1032-
"""Iterate over :py:class:`pyarrow.RecordBatch` objects.
1033-
1034-
This executes the DataFrame and yields each partition as a native
1035-
:py:class:`pyarrow.RecordBatch`.
1036-
1037-
Yields:
1038-
pyarrow.RecordBatch: the next batch in the result stream.
1039-
"""
1040-
for batch in self.execute_stream():
1041-
# ``execute_stream`` yields batches that may be ``RecordBatch``
1042-
# wrappers or ``pyarrow.RecordBatch`` objects directly. Convert
1043-
# to native PyArrow batches when necessary to provide a consistent
1044-
# iterator interface.
1045-
yield batch.to_pyarrow() if hasattr(batch, "to_pyarrow") else batch
1046-
10471031
def execute_stream(self) -> RecordBatchStream:
10481032
"""Executes this DataFrame and returns a stream over a single partition.
10491033
@@ -1143,11 +1127,12 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
11431127
# preserving the original partition order.
11441128
return self.df.__arrow_c_stream__(requested_schema)
11451129

1146-
def __iter__(self) -> Iterator[RecordBatch]:
1147-
"""Yield record batches from the DataFrame without materializing results.
1130+
def __iter__(self) -> Iterator[pa.RecordBatch]:
1131+
"""Iterate over :class:`pyarrow.RecordBatch` objects.
11481132
1149-
This implementation delegates to :func:`to_record_batch_stream`, which
1150-
executes the DataFrame and returns a :class:`RecordBatchStream`.
1133+
Results are streamed without materializing the full DataFrame. This
1134+
implementation delegates to :func:`to_record_batch_stream`, which executes
1135+
the :class:`DataFrame` and returns a :class:`RecordBatchStream`.
11511136
"""
11521137
return to_record_batch_stream(self).__iter__()
11531138

0 commit comments

Comments
 (0)