diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index cceecf35d63..4683551cf61 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -106,6 +106,10 @@ def _from_exported_parquet_files_and_dataset_infos( exported_parquet_files: list[dict[str, Any]], dataset_infos: DatasetInfosDict, ) -> "MetadataConfigs": + # groupby only merges consecutive keys; exported rows are not guaranteed sorted. + sorted_exported_parquet_files = sorted( + exported_parquet_files, key=lambda row: (row["config"], row["split"]) + ) metadata_configs = { config_name: { "data_files": [ @@ -116,11 +120,15 @@ def _from_exported_parquet_files_and_dataset_infos( for parquet_file in parquet_files_for_split ], } - for split_name, parquet_files_for_split in groupby(parquet_files_for_config, itemgetter("split")) + for split_name, parquet_files_for_split in groupby( + parquet_files_for_config, itemgetter("split") + ) ], "version": str(dataset_infos.get(config_name, DatasetInfo()).version or "0.0.0"), } - for config_name, parquet_files_for_config in groupby(exported_parquet_files, itemgetter("config")) + for config_name, parquet_files_for_config in groupby( + sorted_exported_parquet_files, itemgetter("config") + ) } if dataset_infos: # Preserve order of configs and splits diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py index cf9111fa6d9..a0834eff5b4 100644 --- a/tests/test_metadata_util.py +++ b/tests/test_metadata_util.py @@ -276,6 +276,34 @@ def test_metadata_configs_incorrect_yaml(): _ = MetadataConfigs.from_dataset_card_data(dataset_card_data) +def test_non_consecutive_config_rows_are_merged_in_metadata_configs_from_exported_parquet_files(): + exported_parquet_files = [ + { + "config": "default", + "split": "train", + "url": "https://huggingface.co/datasets/org/name/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet", + }, + { + "config": "other", + "split": "train", + "url": "https://huggingface.co/datasets/org/name/resolve/refs%2Fconvert%2Fparquet/other/train/0000.parquet", + }, + { + "config": "default", + "split": "train", + "url": "https://huggingface.co/datasets/org/name/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet", + }, + ] + metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos( + "abc123", exported_parquet_files, {} + ) + default_paths = metadata_configs["default"]["data_files"][0]["path"] + assert default_paths == [ + "https://huggingface.co/datasets/org/name/resolve/abc123/default/train/0000.parquet", + "https://huggingface.co/datasets/org/name/resolve/abc123/default/train/0001.parquet", + ] + + def test_split_order_in_metadata_configs_from_exported_parquet_files_and_dataset_infos(): exported_parquet_files = [ {