@@ -4360,8 +4360,20 @@ def to_parquet(
43604360 Either a path to a file (e.g. `file.parquet`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.parquet`),
43614361 or a BinaryIO, where the dataset will be saved to in the specified format.
43624362 batch_size (`int`, *optional*):
4363- Size of the batch to load in memory and write at once.
4364- Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
4363+ Size of the Arrow table batches loaded into memory and written at once.
4364+ Each batch is written as a single Parquet row group (unless overridden
4365+ by `row_group_size` in `parquet_writer_kwargs`).
4366+ Defaults to a value inferred from the dataset features, falling back
4367+ to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
4368+
4369+ <Tip warning={true}>
4370+
4371+ If the dataset has no features set, the entire dataset is materialized
4372+ into memory as a single Arrow table before writing, regardless of
4373+ `batch_size`. Call [`IterableDataset.cast`] or set features beforehand
4374+ to enable streaming writes.
4375+
4376+ </Tip>
43654377 storage_options (`dict`, *optional*):
43664378 Key/value pairs to be passed on to the file-system backend, if any.
43674379
@@ -4386,21 +4398,21 @@ def to_parquet(
43864398 ```
43874399
43884400 """
4389-        from .io.parquet import ParquetDatasetWriter
4401+ from .arrow_writer import get_arrow_writer_batch_size_from_features
43904402
4403+        batch_size = (
4404+            batch_size or get_arrow_writer_batch_size_from_features(self.features) or config.DEFAULT_MAX_BATCH_SIZE
4405+        )
43914406        dataset: Union[Dataset, IterableDataset, None] = None
43924407        if self.features is None:
43934408 # Without features we can't construct a schema upfront — fall back to materializing
4394- from .arrow_writer import get_arrow_writer_batch_size_from_features
4395-
4396-            batch_size = (
4397-                batch_size or get_arrow_writer_batch_size_from_features(self.features) or config.DEFAULT_MAX_BATCH_SIZE
4398-            )
43994409            table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=batch_size)))
44004410            dataset = Dataset(table, fingerprint="unset")
44014411        else:
44024412            dataset = self
44034413
4414+        from .io.parquet import ParquetDatasetWriter
4415+
44044416        return ParquetDatasetWriter(
44054417            dataset, path_or_buf, batch_size=batch_size, storage_options=storage_options, **parquet_writer_kwargs
44064418        ).write()
0 commit comments