Skip to content

Commit 9a4709d

Browse files
committed
feat(tests): add test for __arrow_c_stream__ with large dataset handling
1 parent 77f632a commit 9a4709d

1 file changed

Lines changed: 26 additions & 0 deletions

File tree

python/tests/test_io.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# KIND, either express or implied. See the License for the
1515
# specific language governing permissions and limitations
1616
# under the License.
17+
import resource
1718
from pathlib import Path
1819

1920
import pyarrow as pa
@@ -92,3 +93,28 @@ def test_read_avro():
9293
path = Path.cwd() / "testing/data/avro/alltypes_plain.avro"
9394
avro_df = read_avro(path=path)
9495
assert avro_df is not None
96+
97+
98+
def test_arrow_c_stream_large_dataset(ctx):
    """DataFrame.__arrow_c_stream__ yields batches incrementally.

    This test constructs a DataFrame that would be far larger than available
    memory if materialized. The ``__arrow_c_stream__`` method should expose a
    stream of record batches without collecting the full dataset, so reading a
    handful of batches should not exhaust process memory.
    """
    # A range of ~2^40 rows would be terabytes if collected eagerly, so simply
    # being able to pull a few batches implies the stream is lazy.
    df = ctx.range(0, 1 << 40)

    # FIX: ``__arrow_c_stream__()`` returns a PyCapsule (Arrow PyCapsule
    # interface), but ``RecordBatchReader._import_from_c`` expects a raw
    # integer pointer address and rejects a capsule. ``from_stream`` consumes
    # any object implementing ``__arrow_c_stream__`` directly and is the
    # public, supported entry point.
    reader = pa.RecordBatchReader.from_stream(df)

    # Record peak RSS before consuming batches, so growth can be bounded.
    start_max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    for _ in range(5):
        batch = reader.read_next_batch()
        assert batch is not None
        assert len(batch) > 0

    current_max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    # FIX: ``ru_maxrss`` is reported in kilobytes on Linux but in *bytes* on
    # macOS, so a flat ``50 * 1024`` threshold means 50 MB on Linux yet only
    # 50 KB on Darwin. Compute the 50 MB bound per platform.
    import sys  # local import: the file's top-level import block is outside this hunk

    rss_limit = 50 * 1024 * 1024 if sys.platform == "darwin" else 50 * 1024
    assert current_max_rss - start_max_rss < rss_limit

0 commit comments

Comments
 (0)