diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py
index f8fb2209db..e3a42bb966 100644
--- a/services/worker/src/worker/job_runners/config/parquet_and_info.py
+++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -26,6 +26,7 @@
 from datasets.builder import DatasetBuilder
 from datasets.data_files import EmptyDatasetError as _EmptyDatasetError
 from datasets.download import StreamingDownloadManager
+from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder as AudioFolderBuilder
 from datasets.packaged_modules.imagefolder.imagefolder import ImageFolder as ImageFolderBuilder
 from datasets.packaged_modules.parquet.parquet import Parquet as ParquetBuilder
 from datasets.packaged_modules.videofolder.videofolder import VideoFolder as VideoFolderBuilder
@@ -218,6 +219,15 @@ def is_video_builder(builder: DatasetBuilder) -> bool:
     return isinstance(builder, VideoFolderBuilder) or "Video(" in str(builder.info.features)
 
 
+def is_folder_based_builder(builder: DatasetBuilder) -> bool:
+    """Check if the builder is a folder-based builder (audiofolder/imagefolder).
+
+    These builders may incorrectly infer a 'label' column from directory structure
+    when directories are used for splits (train/test) rather than class labels.
+    """
+    return isinstance(builder, (AudioFolderBuilder, ImageFolderBuilder))
+
+
 def _is_too_big_from_hub(
     dataset_info: DatasetInfo,
     max_dataset_size_bytes: int,
@@ -1292,6 +1302,19 @@ def compute_config_parquet_and_info_response(
             token=hf_token,
             download_config=download_config,
         )
+        # For folder-based builders (audiofolder/imagefolder) that leave `drop_labels` unset, reload with
+        # drop_labels=True to prevent a spurious 'label' column when directories are used for splits
+        # (train/test) rather than class labels. See: https://github.com/huggingface/dataset-viewer/issues/3014
+        if is_folder_based_builder(builder) and getattr(builder.config, "drop_labels", None) is None:
+            logging.info(f"{dataset=} {config=} is a folder-based dataset, reloading with drop_labels=True")
+            builder = retry_load_dataset_builder(
+                path=dataset,
+                name=config,
+                revision=source_revision,
+                token=hf_token,
+                download_config=download_config,
+                drop_labels=True,
+            )
     except _EmptyDatasetError as err:
         raise EmptyDatasetError(f"{dataset=} is empty.", cause=err) from err
     except ValueError as err:
diff --git a/services/worker/tests/job_runners/config/test_parquet_and_info.py b/services/worker/tests/job_runners/config/test_parquet_and_info.py
index eed1eb84a3..fd6d7372af 100644
--- a/services/worker/tests/job_runners/config/test_parquet_and_info.py
+++ b/services/worker/tests/job_runners/config/test_parquet_and_info.py
@@ -42,6 +42,7 @@
     get_delete_operations,
     get_urlpaths_in_gen_kwargs,
     get_writer_batch_size_from_info,
+    is_folder_based_builder,
     limit_parquet_writes,
     list_generated_parquet_files,
     parse_repo_filename,
@@ -199,6 +200,31 @@ def test__is_too_big_from_datasets(
     )
 
 
+def test_is_folder_based_builder() -> None:
+    """Test is_folder_based_builder returns True for audiofolder/imagefolder builders."""
+    from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder as AudioFolderBuilder
+    from datasets.packaged_modules.imagefolder.imagefolder import ImageFolder as ImageFolderBuilder
+    from datasets.packaged_modules.csv.csv import Csv as CsvBuilder
+    from unittest.mock import MagicMock
+
+    # Create mock builders
+    audio_builder = MagicMock(spec=AudioFolderBuilder)
+    audio_builder.__class__ = AudioFolderBuilder
+
+    image_builder = MagicMock(spec=ImageFolderBuilder)
+    image_builder.__class__ = ImageFolderBuilder
+
+    csv_builder = MagicMock(spec=CsvBuilder)
+    csv_builder.__class__ = CsvBuilder
+
+    # Test folder-based builders return True
+    assert is_folder_based_builder(audio_builder) is True
+    assert is_folder_based_builder(image_builder) is True
+
+    # Test non-folder-based builder returns False
+    assert is_folder_based_builder(csv_builder) is False
+
+
 def test_supported_if_big_parquet(
     app_config: AppConfig,
     get_job_runner: GetJobRunner,