Skip to content

Commit 9a4709d

Browse files
committed
feat(tests): add test for __arrow_c_stream__ with large dataset handling
1 parent 77f632a commit 9a4709d

1 file changed

Lines changed: 26 additions & 0 deletions

File tree

python/tests/test_io.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# KIND, either express or implied. See the License for the
1515
# specific language governing permissions and limitations
1616
# under the License.
17+
import resource
1718
from pathlib import Path
1819

1920
import pyarrow as pa
@@ -92,3 +93,28 @@ def test_read_avro():
9293
path = Path.cwd() / "testing/data/avro/alltypes_plain.avro"
9394
avro_df = read_avro(path=path)
9495
assert avro_df is not None
96+
97+
98+
def test_arrow_c_stream_large_dataset(ctx):
    """DataFrame.__arrow_c_stream__ yields batches incrementally.

    This test constructs a DataFrame that would be far larger than available
    memory if materialized. The ``__arrow_c_stream__`` method should expose a
    stream of record batches without collecting the full dataset, so reading a
    handful of batches should not exhaust process memory.
    """
    # A range of ~2^40 rows would be terabytes if collected eagerly, so simply
    # being able to pull a few batches implies the stream is lazy.
    df = ctx.range(0, 1 << 40)

    # FIX: ``__arrow_c_stream__()`` returns a PyCapsule (Arrow PyCapsule
    # interface), but ``RecordBatchReader._import_from_c`` expects a raw
    # integer pointer address and rejects a capsule. ``from_stream`` consumes
    # any object implementing ``__arrow_c_stream__`` directly and is the
    # public, supported entry point.
    reader = pa.RecordBatchReader.from_stream(df)

    # Record peak RSS before consuming batches, so growth can be bounded.
    start_max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    for _ in range(5):
        batch = reader.read_next_batch()
        assert batch is not None
        assert len(batch) > 0

    current_max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    # FIX: ``ru_maxrss`` is reported in kilobytes on Linux but in *bytes* on
    # macOS, so a flat ``50 * 1024`` threshold means 50 MB on Linux yet only
    # 50 KB on Darwin. Compute the 50 MB bound per platform.
    import sys  # local import: the file's top-level import block is outside this hunk

    rss_limit = 50 * 1024 * 1024 if sys.platform == "darwin" else 50 * 1024
    assert current_max_rss - start_max_rss < rss_limit

0 commit comments

Comments
 (0)