Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions services/worker/src/worker/job_runners/config/parquet_and_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from datasets.builder import DatasetBuilder
from datasets.data_files import EmptyDatasetError as _EmptyDatasetError
from datasets.download import StreamingDownloadManager
from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder as AudioFolderBuilder
from datasets.packaged_modules.imagefolder.imagefolder import ImageFolder as ImageFolderBuilder
from datasets.packaged_modules.parquet.parquet import Parquet as ParquetBuilder
from datasets.packaged_modules.videofolder.videofolder import VideoFolder as VideoFolderBuilder
Expand Down Expand Up @@ -218,6 +219,15 @@ def is_video_builder(builder: DatasetBuilder) -> bool:
return isinstance(builder, VideoFolderBuilder) or "Video(" in str(builder.info.features)


def is_folder_based_builder(builder: DatasetBuilder) -> bool:
    """Tell whether `builder` is an audiofolder or imagefolder builder.

    Such builders can wrongly derive a 'label' column from the directory layout
    when the directories denote splits (train/test) instead of class labels.
    """
    # Only these two packaged builder classes count as folder-based here.
    folder_builder_classes = (AudioFolderBuilder, ImageFolderBuilder)
    return isinstance(builder, folder_builder_classes)


def _is_too_big_from_hub(
dataset_info: DatasetInfo,
max_dataset_size_bytes: int,
Expand Down Expand Up @@ -1292,6 +1302,19 @@ def compute_config_parquet_and_info_response(
token=hf_token,
download_config=download_config,
)
# For folder-based builders (audiofolder/imagefolder), reload with drop_labels=True
# to prevent spurious 'label' column when directories are used for splits (train/test)
# rather than class labels. See: https://github.com/huggingface/dataset-viewer/issues/3014
if is_folder_based_builder(builder):
logging.info(f"{dataset=} {config=} is a folder-based dataset, reloading with drop_labels=True")
builder = retry_load_dataset_builder(
path=dataset,
name=config,
revision=source_revision,
token=hf_token,
download_config=download_config,
drop_labels=True,
)
except _EmptyDatasetError as err:
raise EmptyDatasetError(f"{dataset=} is empty.", cause=err) from err
except ValueError as err:
Expand Down
26 changes: 26 additions & 0 deletions services/worker/tests/job_runners/config/test_parquet_and_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
get_delete_operations,
get_urlpaths_in_gen_kwargs,
get_writer_batch_size_from_info,
is_folder_based_builder,
limit_parquet_writes,
list_generated_parquet_files,
parse_repo_filename,
Expand Down Expand Up @@ -199,6 +200,31 @@ def test__is_too_big_from_datasets(
)


def test_is_folder_based_builder() -> None:
    """Check that only audiofolder/imagefolder builders are reported as folder-based."""
    from unittest.mock import MagicMock

    from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder as AudioFolderBuilder
    from datasets.packaged_modules.csv.csv import Csv as CsvBuilder
    from datasets.packaged_modules.imagefolder.imagefolder import ImageFolder as ImageFolderBuilder

    def make_mock_builder(builder_cls: type) -> MagicMock:
        # Build a stand-in whose reported class is `builder_cls`, so the
        # isinstance() check in the function under test sees it as that builder.
        mock_builder = MagicMock(spec=builder_cls)
        mock_builder.__class__ = builder_cls
        return mock_builder

    # (builder class, expected result): folder-based builders first, then a
    # non-folder-based builder (csv) as the negative case.
    expectations = (
        (AudioFolderBuilder, True),
        (ImageFolderBuilder, True),
        (CsvBuilder, False),
    )
    for builder_cls, expected in expectations:
        assert is_folder_based_builder(make_mock_builder(builder_cls)) is expected


def test_supported_if_big_parquet(
app_config: AppConfig,
get_job_runner: GetJobRunner,
Expand Down
Loading