
Commit 504bd6e

Fix batch_size resolution in to_parquet and improve to_parquet documentation
1 parent 4c3231d

1 file changed: 20 additions & 8 deletions

src/datasets/iterable_dataset.py
@@ -4360,8 +4360,20 @@ def to_parquet(
                 Either a path to a file (e.g. `file.parquet`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.parquet`),
                 or a BinaryIO, where the dataset will be saved to in the specified format.
             batch_size (`int`, *optional*):
-                Size of the batch to load in memory and write at once.
-                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
+                Size of the Arrow table batches loaded into memory and written at once.
+                Each batch is written as a single Parquet row group (unless overridden
+                by `row_group_size` in `parquet_writer_kwargs`).
+                Defaults to a value inferred from the dataset features, falling back
+                to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
+
+                <Tip warning={true}>
+
+                If the dataset has no features set, the entire dataset is materialized
+                into memory as a single Arrow table before writing, regardless of
+                `batch_size`. Call [`IterableDataset.cast`] or set features beforehand
+                to enable streaming writes.
+
+                </Tip>
             storage_options (`dict`, *optional*):
                 Key/value pairs to be passed on to the file-system backend, if any.
@@ -4386,21 +4398,21 @@ def to_parquet(
             ```
         """
-        from .io.parquet import ParquetDatasetWriter
+        from .arrow_writer import get_arrow_writer_batch_size_from_features
 
+        batch_size = (
+            batch_size or get_arrow_writer_batch_size_from_features(self.features) or config.DEFAULT_MAX_BATCH_SIZE
+        )
         dataset: Union[Dataset, IterableDataset, None] = None
         if self.features is None:
             # Without features we can't construct a schema upfront — fall back to materializing
-            from .arrow_writer import get_arrow_writer_batch_size_from_features
-
-            batch_size = (
-                batch_size or get_arrow_writer_batch_size_from_features(self.features) or config.DEFAULT_MAX_BATCH_SIZE
-            )
             table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=batch_size)))
             dataset = Dataset(table, fingerprint="unset")
         else:
             dataset = self
 
+        from .io.parquet import ParquetDatasetWriter
+
         return ParquetDatasetWriter(
             dataset, path_or_buf, batch_size=batch_size, storage_options=storage_options, **parquet_writer_kwargs
         ).write()
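The substance of the second hunk is that the `batch_size` resolution is hoisted out of the `self.features is None` branch, so the same fallback chain (explicit argument, then a value inferred from the features, then the global default) now also applies to the `batch_size` forwarded to `ParquetDatasetWriter` on the streaming path, not only on the materializing one. A minimal standalone sketch of that chain; the helper body and the default value below are illustrative stand-ins, not the library's actual implementation:

```python
from typing import Optional

DEFAULT_MAX_BATCH_SIZE = 1000  # stand-in for datasets.config.DEFAULT_MAX_BATCH_SIZE


def infer_batch_size_from_features(features: Optional[dict]) -> Optional[int]:
    # Hypothetical stand-in for get_arrow_writer_batch_size_from_features:
    # returns None when nothing can be inferred (e.g. when features is None).
    if features is None:
        return None
    # Illustrative heuristic only: shrink batches when a binary column is present.
    return 100 if "binary" in features.values() else None


def resolve_batch_size(batch_size: Optional[int], features: Optional[dict]) -> int:
    # Mirrors the committed fallback chain: `or` falls through on None.
    return batch_size or infer_batch_size_from_features(features) or DEFAULT_MAX_BATCH_SIZE


assert resolve_batch_size(256, None) == 256                      # explicit argument wins
assert resolve_batch_size(None, {"img": "binary"}) == 100        # inferred from features
assert resolve_batch_size(None, None) == DEFAULT_MAX_BATCH_SIZE  # global fallback
```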
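The new `<Tip>` in the first hunk is actionable: when the `IterableDataset` has features set, `to_parquet` can write batch by batch instead of concatenating the whole dataset into a single Arrow table first. A hedged usage sketch, assuming `IterableDataset.from_generator` and this commit's `to_parquet`; the generator, file name, and sizes are arbitrary:

```python
from datasets import Features, IterableDataset, Value


def gen():
    for i in range(10_000):
        yield {"id": i, "text": f"row {i}"}


features = Features({"id": Value("int64"), "text": Value("string")})

# Declaring features up front gives to_parquet a schema, so it can stream:
# each batch becomes one Parquet row group instead of the whole dataset
# being materialized in memory before the write starts.
ds = IterableDataset.from_generator(gen, features=features)
ds.to_parquet("data.parquet", batch_size=1000)
```

Without the `features=` argument, the same call would fall into the `self.features is None` branch above and build the full table with `pa.concat_tables` before writing.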
