From 04163a9ef48d85fd2d1189c6a9df9c29c7f5743f Mon Sep 17 00:00:00 2001 From: pdTetteh Date: Sat, 4 Apr 2026 12:42:00 +0000 Subject: [PATCH 1/3] Handle large_string storage in Audio.cast_storage --- src/datasets/features/audio.py | 4 +++- tests/features/test_audio.py | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index a398b110da7..8ae70c6d2e0 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -249,7 +249,9 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.Str `pa.StructArray`: Array in the Audio arrow storage type, that is `pa.struct({"bytes": pa.binary(), "path": pa.string()})` """ - if pa.types.is_string(storage.type): + if pa.types.is_string(storage.type) or pa.types.is_large_string(storage.type): + if pa.types.is_large_string(storage.type): + storage = array_cast(storage, pa.string()) bytes_array = pa.array([None] * len(storage), type=pa.binary()) storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null()) elif pa.types.is_large_binary(storage.type): diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index a6dbca799fe..1972ea816e1 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -501,6 +501,41 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) assert samples.sample_rate == 16000 assert samples.data.shape == (2, 40124) +def test_cast_column_audio_from_csv_large_string(tmp_path): + import wave + import struct + import math + + from datasets import Audio, load_dataset + + audio_path = tmp_path / "example.wav" + csv_path = tmp_path / "audio.csv" + + sr = 16000 + duration = 0.25 + freq = 440.0 + samples = int(sr * duration) + + with wave.open(str(audio_path), "w") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sr) + frames = bytearray() + for i in range(samples): + x = int(16000 * math.sin(2 * math.pi * freq * i / sr)) + frames.extend(struct.pack(" Date: Tue, 7 Apr 2026 15:40:56 +0000 Subject: [PATCH 2/3] Support large_string paths for audio, video, and pdf features --- src/datasets/features/pdf.py | 10 ++++++++++ src/datasets/features/video.py | 10 ++++++++++ tests/features/test_audio.py | 4 ++-- tests/features/test_pdf.py | 17 ++++++++++++++++- tests/features/test_video.py | 15 +++++++++++++++ 5 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py index 756530554d4..b863e8770bf 100644 --- a/src/datasets/features/pdf.py +++ b/src/datasets/features/pdf.py @@ -186,6 +186,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr The Arrow types that can be converted to the Pdf pyarrow storage type are: - `pa.string()` - it must contain the "path" data + - `pa.large_string()` - it must contain the "path" data (will be cast to string if possible) - `pa.binary()` - it must contain the image bytes - `pa.struct({"bytes": pa.binary()})` - `pa.struct({"path": pa.string()})` @@ -200,6 +201,15 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr `pa.StructArray`: Array in the Pdf arrow storage type, that is `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. """ + if pa.types.is_large_string(storage.type): + try: + storage = storage.cast(pa.string()) + except pa.ArrowInvalid as e: + raise ValueError( + f"Failed to cast large_string to string for Pdf feature. " + f"This can happen if string values exceed 2GB. " + f"Original error: {e}" + ) from e if pa.types.is_string(storage.type): bytes_array = pa.array([None] * len(storage), type=pa.binary()) storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null()) diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index cf1c19551ca..2681a547578 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -241,6 +241,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr The Arrow types that can be converted to the Video pyarrow storage type are: - `pa.string()` - it must contain the "path" data + - `pa.large_string()` - it must contain the "path" data (will be cast to string if possible) - `pa.binary()` - it must contain the video bytes - `pa.struct({"bytes": pa.binary()})` - `pa.struct({"path": pa.string()})` @@ -255,6 +256,15 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr `pa.StructArray`: Array in the Video arrow storage type, that is `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. """ + if pa.types.is_large_string(storage.type): + try: + storage = storage.cast(pa.string()) + except pa.ArrowInvalid as e: + raise ValueError( + f"Failed to cast large_string to string for Video feature. " + f"This can happen if string values exceed 2GB. " + f"Original error: {e}" + ) from e if pa.types.is_string(storage.type): bytes_array = pa.array([None] * len(storage), type=pa.binary()) storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null()) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 1972ea816e1..708e8ecd3f7 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -502,9 +502,9 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) assert samples.data.shape == (2, 40124) def test_cast_column_audio_from_csv_large_string(tmp_path): - import wave - import struct import math + import struct + import wave from datasets import Audio, load_dataset diff --git a/tests/features/test_pdf.py b/tests/features/test_pdf.py index fe0b521c96c..7c319726b73 100644 --- a/tests/features/test_pdf.py +++ b/tests/features/test_pdf.py @@ -2,7 +2,7 @@ import pytest -from datasets import Dataset, Features, Pdf +from datasets import Dataset, Features, Pdf, load_dataset from ..utils import require_pdfplumber @@ -60,3 +60,18 @@ def test_dataset_with_pdf_feature(shared_datadir): item = dset[0] assert item.keys() == {"pdf"} assert isinstance(item["pdf"], pdfplumber.pdf.PDF) + +def test_cast_column_pdf_from_csv_large_string(shared_datadir, tmp_path): + pdf_path = str(shared_datadir / "test_pdf.pdf") + csv_path = tmp_path / "pdf.csv" + + csv_path.write_text(f"pdf\n{pdf_path}\n", encoding="utf-8") + + dset = load_dataset("csv", data_files=str(csv_path), split="train") + assert str(dset.features["pdf"]) == "Value('large_string')" + + dset = dset.cast_column("pdf", Pdf(decode=False)) + + assert isinstance(dset.features["pdf"], Pdf) + item = dset[0]["pdf"] + assert item["path"] == pdf_path diff --git a/tests/features/test_video.py b/tests/features/test_video.py index 131b01be6d2..29de50e9e8b 100644 --- a/tests/features/test_video.py +++ b/tests/features/test_video.py @@ -71,6 +71,21 @@ def test_dataset_with_video_feature(shared_datadir): assert item["video"].get_frame_at(0).data.shape == (3, 50, 66) assert isinstance(item["video"].get_frame_at(0).data, torch.Tensor) +def test_cast_column_video_from_csv_large_string(shared_datadir, tmp_path): + video_path = str(shared_datadir / "test_video_66x50.mov") + csv_path = tmp_path / "video.csv" + + csv_path.write_text(f"video\n{video_path}\n", encoding="utf-8") + + dset = load_dataset("csv", data_files=str(csv_path), split="train") + assert str(dset.features["video"]) == "Value('large_string')" + + dset = dset.cast_column("video", Video(decode=False)) + + assert isinstance(dset.features["video"], Video) + item = dset[0]["video"] + assert item["path"] == video_path + @require_torchcodec def test_dataset_with_video_map_and_formatted(shared_datadir): From 01deb00b7f0b9188d3a267601b4412d26eee88f6 Mon Sep 17 00:00:00 2001 From: pdTetteh Date: Fri, 10 Apr 2026 21:27:48 +0000 Subject: [PATCH 3/3] Use existing audio fixture and document large_string support for Audio --- src/datasets/features/audio.py | 1 + tests/features/test_audio.py | 27 +++------------------------ 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 8ae70c6d2e0..b22dc5aaf32 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -236,6 +236,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.Str The Arrow types that can be converted to the Audio pyarrow storage type are: - `pa.string()` - it must contain the "path" data + - `pa.large_string()` - it must contain the "path" data (will be cast to string if possible) - `pa.binary()` - it must contain the audio bytes - `pa.struct({"bytes": pa.binary()})` - `pa.struct({"path": pa.string()})` diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 708e8ecd3f7..7caa510c7ab 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -501,32 +501,11 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir) assert samples.sample_rate == 16000 assert samples.data.shape == (2, 40124) -def test_cast_column_audio_from_csv_large_string(tmp_path): - import math - import struct - import wave - +def test_cast_column_audio_from_csv_large_string(audio_file, tmp_path): from datasets import Audio, load_dataset - audio_path = tmp_path / "example.wav" csv_path = tmp_path / "audio.csv" - - sr = 16000 - duration = 0.25 - freq = 440.0 - samples = int(sr * duration) - - with wave.open(str(audio_path), "w") as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(sr) - frames = bytearray() - for i in range(samples): - x = int(16000 * math.sin(2 * math.pi * freq * i / sr)) - frames.extend(struct.pack("