|
25 | 25 | from typing import ( |
26 | 26 | TYPE_CHECKING, |
27 | 27 | Any, |
| 28 | + AsyncIterator, |
28 | 29 | Iterable, |
29 | 30 | Iterator, |
30 | 31 | Literal, |
@@ -291,7 +292,9 @@ class DataFrame: |
291 | 292 | """Two dimensional table representation of data. |
292 | 293 |
|
293 | 294 | DataFrame objects are iterable; iterating over a DataFrame yields |
294 | | - :class:`datafusion.record_batch.RecordBatch` instances lazily. |
| 295 | + :class:`pyarrow.RecordBatch` instances lazily. Use |
| 296 | + :py:meth:`to_stream` to obtain a :class:`~datafusion.record_batch.RecordBatchStream` |
| 297 | + for explicit iteration over the results. |
295 | 298 |
|
296 | 299 | See :ref:`user_guide_concepts` in the online documentation for more information. |
297 | 300 | """ |
@@ -1022,6 +1025,14 @@ def to_arrow_table(self) -> pa.Table: |
1022 | 1025 | """ |
1023 | 1026 | return self.df.to_arrow_table() |
1024 | 1027 |
|
| 1028 | + def to_stream(self) -> RecordBatchStream: |
| 1029 | + """Execute this :py:class:`DataFrame` and return a record batch stream. |
| 1030 | +
|
| 1031 | + This is a convenience wrapper around :py:meth:`execute_stream` and can be |
| 1032 | + used to iterate over results without materializing them. |
| 1033 | + """ |
| 1034 | + return self.execute_stream() |
| 1035 | + |
1025 | 1036 | def execute_stream(self) -> RecordBatchStream: |
1026 | 1037 | """Executes this DataFrame and returns a stream over a single partition. |
1027 | 1038 |
|
@@ -1121,14 +1132,25 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: |
1121 | 1132 | # preserving the original partition order. |
1122 | 1133 | return self.df.__arrow_c_stream__(requested_schema) |
1123 | 1134 |
|
1124 | | - def __iter__(self) -> Iterator[RecordBatch]: |
1125 | | - """Yield record batches from the DataFrame without materializing results. |
| 1135 | + def __iter__(self) -> Iterator[pa.RecordBatch]: |
| 1136 | + """Yield record batches from this DataFrame lazily. |
1126 | 1137 |
|
1127 | | - This executes the DataFrame using DataFusion's partitioned streaming |
1128 | | - APIs and yields :class:`datafusion.record_batch.RecordBatch` objects. |
| 1138 | + This delegates to :py:meth:`to_stream` and converts each batch to a |
| 1139 | + :class:`pyarrow.RecordBatch` without eagerly materializing the entire |
| 1140 | + result set. |
1129 | 1141 | """ |
1130 | | - for stream in self.execute_stream_partitioned(): |
1131 | | - yield from stream |
| 1142 | + for batch in self.to_stream(): |
| 1143 | + yield batch.to_pyarrow() |
| 1144 | + |
| 1145 | + def __aiter__(self) -> AsyncIterator[pa.RecordBatch]: |
| 1146 | + """Asynchronously yield record batches from this DataFrame lazily.""" |
| 1147 | + stream = self.to_stream() |
| 1148 | + |
| 1149 | + async def iterator() -> AsyncIterator[pa.RecordBatch]: |
| 1150 | + async for batch in stream: |
| 1151 | + yield batch.to_pyarrow() |
| 1152 | + |
| 1153 | + return iterator() |
1132 | 1154 |
|
1133 | 1155 | def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame: |
1134 | 1156 | """Apply a function to the current DataFrame which returns another DataFrame. |
|
0 commit comments