Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/datasets/features/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.Str
The Arrow types that can be converted to the Audio pyarrow storage type are:

- `pa.string()` - it must contain the "path" data
- `pa.large_string()` - it must contain the "path" data (will be cast to string if possible)
- `pa.binary()` - it must contain the audio bytes
- `pa.struct({"bytes": pa.binary()})`
- `pa.struct({"path": pa.string()})`
Expand All @@ -249,7 +250,9 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.Str
`pa.StructArray`: Array in the Audio arrow storage type, that is
`pa.struct({"bytes": pa.binary(), "path": pa.string()})`
"""
if pa.types.is_string(storage.type):
if pa.types.is_string(storage.type) or pa.types.is_large_string(storage.type):
if pa.types.is_large_string(storage.type):
storage = array_cast(storage, pa.string())
Comment on lines +253 to +255
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can use the same code as in Image.cast_storage

Suggested change
if pa.types.is_string(storage.type) or pa.types.is_large_string(storage.type):
if pa.types.is_large_string(storage.type):
storage = array_cast(storage, pa.string())
if pa.types.is_large_string(storage.type):
try:
storage = storage.cast(pa.string())
except pa.ArrowInvalid as e:
raise ValueError(
f"Failed to cast large_string to string for Image feature. "
f"This can happen if string values exceed 2GB. "
f"Original error: {e}"
) from e
if pa.types.is_string(storage.type):

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright that's helpful

bytes_array = pa.array([None] * len(storage), type=pa.binary())
storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
elif pa.types.is_large_binary(storage.type):
Expand Down
10 changes: 10 additions & 0 deletions src/datasets/features/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr
The Arrow types that can be converted to the Pdf pyarrow storage type are:

- `pa.string()` - it must contain the "path" data
- `pa.large_string()` - it must contain the "path" data (will be cast to string if possible)
- `pa.binary()` - it must contain the PDF bytes
- `pa.struct({"bytes": pa.binary()})`
- `pa.struct({"path": pa.string()})`
Expand All @@ -200,6 +201,15 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr
`pa.StructArray`: Array in the Pdf arrow storage type, that is
`pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
"""
if pa.types.is_large_string(storage.type):
try:
storage = storage.cast(pa.string())
except pa.ArrowInvalid as e:
raise ValueError(
f"Failed to cast large_string to string for Pdf feature. "
f"This can happen if string values exceed 2GB. "
f"Original error: {e}"
) from e
if pa.types.is_string(storage.type):
bytes_array = pa.array([None] * len(storage), type=pa.binary())
storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
Expand Down
10 changes: 10 additions & 0 deletions src/datasets/features/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr
The Arrow types that can be converted to the Video pyarrow storage type are:

- `pa.string()` - it must contain the "path" data
- `pa.large_string()` - it must contain the "path" data (will be cast to string if possible)
- `pa.binary()` - it must contain the video bytes
- `pa.struct({"bytes": pa.binary()})`
- `pa.struct({"path": pa.string()})`
Expand All @@ -255,6 +256,15 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr
`pa.StructArray`: Array in the Video arrow storage type, that is
`pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
"""
if pa.types.is_large_string(storage.type):
try:
storage = storage.cast(pa.string())
except pa.ArrowInvalid as e:
raise ValueError(
f"Failed to cast large_string to string for Video feature. "
f"This can happen if string values exceed 2GB. "
f"Original error: {e}"
) from e
if pa.types.is_string(storage.type):
bytes_array = pa.array([None] * len(storage), type=pa.binary())
storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
Expand Down
14 changes: 14 additions & 0 deletions tests/features/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,20 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir)
assert samples.sample_rate == 16000
assert samples.data.shape == (2, 40124)

def test_cast_column_audio_from_csv_large_string(audio_file, tmp_path):
    """Check that a CSV-loaded ``large_string`` column can be cast to ``Audio``.

    A one-row CSV whose single ``audio`` column holds ``audio_file`` is written
    under ``tmp_path`` and loaded with the ``csv`` builder. The test asserts the
    column is reported as ``Value('large_string')``, casts it to
    ``Audio(decode=False)``, and verifies the stored path is unchanged.
    """
    from datasets import Audio, load_dataset

    csv_file = tmp_path / "audio.csv"
    csv_file.write_text(f"audio\n{audio_file}\n", encoding="utf-8")

    loaded = load_dataset("csv", data_files=str(csv_file), split="train")
    # Precondition: the column must start out as large_string, otherwise the
    # cast below would not exercise the large_string code path.
    assert str(loaded.features["audio"]) == "Value('large_string')"

    casted = loaded.cast_column("audio", Audio(decode=False))

    assert isinstance(casted.features["audio"], Audio)
    # decode=False keeps the raw storage dict, so the path can be compared directly.
    first_audio = casted[0]["audio"]
    assert first_audio["path"] == audio_file

@require_torchcodec
@pytest.mark.parametrize(
Expand Down
17 changes: 16 additions & 1 deletion tests/features/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from datasets import Dataset, Features, Pdf
from datasets import Dataset, Features, Pdf, load_dataset

from ..utils import require_pdfplumber

Expand Down Expand Up @@ -60,3 +60,18 @@ def test_dataset_with_pdf_feature(shared_datadir):
item = dset[0]
assert item.keys() == {"pdf"}
assert isinstance(item["pdf"], pdfplumber.pdf.PDF)

def test_cast_column_pdf_from_csv_large_string(shared_datadir, tmp_path):
    """Check that a CSV-loaded ``large_string`` column can be cast to ``Pdf``.

    A one-row CSV whose single ``pdf`` column holds the path of the shared
    ``test_pdf.pdf`` fixture is written under ``tmp_path`` and loaded with the
    ``csv`` builder. The test asserts the column is reported as
    ``Value('large_string')``, casts it to ``Pdf(decode=False)``, and verifies
    the stored path is unchanged.
    """
    expected_path = str(shared_datadir / "test_pdf.pdf")
    csv_file = tmp_path / "pdf.csv"
    csv_file.write_text(f"pdf\n{expected_path}\n", encoding="utf-8")

    loaded = load_dataset("csv", data_files=str(csv_file), split="train")
    # Precondition: the column must start out as large_string, otherwise the
    # cast below would not exercise the large_string code path.
    assert str(loaded.features["pdf"]) == "Value('large_string')"

    casted = loaded.cast_column("pdf", Pdf(decode=False))

    assert isinstance(casted.features["pdf"], Pdf)
    # decode=False keeps the raw storage dict, so the path can be compared directly.
    first_pdf = casted[0]["pdf"]
    assert first_pdf["path"] == expected_path
15 changes: 15 additions & 0 deletions tests/features/test_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,21 @@ def test_dataset_with_video_feature(shared_datadir):
assert item["video"].get_frame_at(0).data.shape == (3, 50, 66)
assert isinstance(item["video"].get_frame_at(0).data, torch.Tensor)

def test_cast_column_video_from_csv_large_string(shared_datadir, tmp_path):
    """Check that a CSV-loaded ``large_string`` column can be cast to ``Video``.

    A one-row CSV whose single ``video`` column holds the path of the shared
    ``test_video_66x50.mov`` fixture is written under ``tmp_path`` and loaded
    with the ``csv`` builder. The test asserts the column is reported as
    ``Value('large_string')``, casts it to ``Video(decode=False)``, and verifies
    the stored path is unchanged.
    """
    expected_path = str(shared_datadir / "test_video_66x50.mov")
    csv_file = tmp_path / "video.csv"
    csv_file.write_text(f"video\n{expected_path}\n", encoding="utf-8")

    loaded = load_dataset("csv", data_files=str(csv_file), split="train")
    # Precondition: the column must start out as large_string, otherwise the
    # cast below would not exercise the large_string code path.
    assert str(loaded.features["video"]) == "Value('large_string')"

    casted = loaded.cast_column("video", Video(decode=False))

    assert isinstance(casted.features["video"], Video)
    # decode=False keeps the raw storage dict, so the path can be compared directly.
    first_video = casted[0]["video"]
    assert first_video["path"] == expected_path


@require_torchcodec
def test_dataset_with_video_map_and_formatted(shared_datadir):
Expand Down