Skip to content

Commit c95920b

Browse files
authored
Don't copy image/audio/video/pdf from repo file, and link to file directly in src (#3330)
* read from hf:// paths when needed
* fix source url
1 parent 7c6499b commit c95920b

15 files changed

Lines changed: 131 additions & 18 deletions

File tree

libs/libapi/src/libapi/response.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@ async def create_response(
1717
config: str,
1818
split: str,
1919
storage_client: StorageClient,
20+
hf_endpoint: str,
21+
hf_token: Optional[str],
2022
pa_table: pa.Table,
2123
offset: int,
2224
features: Features,
@@ -39,6 +41,8 @@ async def create_response(
3941
config=config,
4042
split=split,
4143
storage_client=storage_client,
44+
hf_endpoint=hf_endpoint,
45+
hf_token=hf_token,
4246
offset=offset,
4347
features=features,
4448
row_idx_column=ROW_IDX_COLUMN if use_row_idx_column else None,

libs/libapi/src/libapi/rows_utils.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@ def _transform_row(
2525
split: str,
2626
features: Features,
2727
storage_client: StorageClient,
28+
hf_endpoint: str,
29+
hf_token: Optional[str],
2830
offset: int,
2931
row_idx_column: Optional[str],
3032
) -> Row:
@@ -42,6 +44,8 @@ def _transform_row(
4244
featureName=featureName,
4345
fieldType=fieldType,
4446
storage_client=storage_client,
47+
hf_endpoint=hf_endpoint,
48+
hf_token=hf_token,
4549
)
4650
except Exception as err:
4751
suggestion_messages: dict[type[Exception], str] = {
@@ -65,6 +69,8 @@ async def transform_rows(
6569
rows: list[Row],
6670
features: Features,
6771
storage_client: StorageClient,
72+
hf_endpoint: str,
73+
hf_token: Optional[str],
6874
offset: int,
6975
row_idx_column: Optional[str],
7076
) -> list[Row]:
@@ -76,6 +82,8 @@ async def transform_rows(
7682
split=split,
7783
features=features,
7884
storage_client=storage_client,
85+
hf_endpoint=hf_endpoint,
86+
hf_token=hf_token,
7987
offset=offset,
8088
row_idx_column=row_idx_column,
8189
)

libs/libapi/src/libapi/utils.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -205,6 +205,8 @@ async def to_rows_list(
205205
offset: int,
206206
features: Features,
207207
storage_client: StorageClient,
208+
hf_endpoint: str,
209+
hf_token: Optional[str],
208210
row_idx_column: Optional[str] = None,
209211
truncated_columns: Optional[list[str]] = None,
210212
) -> list[RowItem]:
@@ -217,6 +219,8 @@ async def to_rows_list(
217219
rows=pa_table.to_pylist(),
218220
features=features,
219221
storage_client=storage_client,
222+
hf_endpoint=hf_endpoint,
223+
hf_token=hf_token,
220224
offset=offset,
221225
row_idx_column=row_idx_column,
222226
)

libs/libapi/tests/test_response.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,8 @@ async def test_create_response(storage_client: StorageClient) -> None:
3939
config="default",
4040
split="train",
4141
storage_client=storage_client,
42+
hf_endpoint="not-needed",
43+
hf_token=None,
4244
pa_table=ds.data,
4345
offset=0,
4446
features=ds.features,
@@ -63,6 +65,8 @@ async def test_create_response_with_row_idx_column(storage_client: StorageClient
6365
config="default",
6466
split="train",
6567
storage_client=storage_client,
68+
hf_endpoint="not-needed",
69+
hf_token=None,
6670
pa_table=ds.data,
6771
offset=0,
6872
features=ds.features,
@@ -91,6 +95,8 @@ async def test_create_response_with_image(image_path: str, storage_client: Stora
9195
config=config,
9296
split=split,
9397
storage_client=storage_client,
98+
hf_endpoint="not-needed",
99+
hf_token=None,
94100
pa_table=ds_image.data,
95101
offset=0,
96102
features=ds_image.features,
@@ -131,6 +137,8 @@ async def test_create_response_with_document(document_path: str, storage_client:
131137
config=config,
132138
split=split,
133139
storage_client=storage_client,
140+
hf_endpoint="not-needed",
141+
hf_token=None,
134142
pa_table=ds_document.data,
135143
offset=0,
136144
features=ds_document.features,

libs/libcommon/src/libcommon/viewer_utils/asset.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,14 +8,19 @@
88

99
import fitz
1010
from pdfplumber.pdf import PDF
11-
from PIL import Image, ImageOps
11+
from PIL import Image
1212
from pydub import AudioSegment # type:ignore
1313

1414
if TYPE_CHECKING:
1515
from libcommon.storage_client import StorageClient
1616

1717

18-
SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE = {".wav": "audio/wav", ".mp3": "audio/mpeg", ".opus": "audio/ogg"}
18+
SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE = {
19+
".wav": "audio/wav",
20+
".mp3": "audio/mpeg",
21+
".opus": "audio/ogg",
22+
".flac": "audio/x-flac",
23+
}
1924
SUPPORTED_AUDIO_EXTENSIONS = SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE.keys()
2025
DATASET_GIT_REVISION_PLACEHOLDER = "{dataset_git_revision}"
2126
LOCK = threading.Lock()
@@ -71,7 +76,6 @@ def create_image_file(
7176
)
7277
path = replace_dataset_git_revision_placeholder(object_path, revision=revision)
7378
if storage_client.overwrite or not storage_client.exists(path):
74-
image = ImageOps.exif_transpose(image) # type: ignore[assignment]
7579
buffer = BytesIO()
7680
image.save(fp=buffer, format=format)
7781
buffer.seek(0)

libs/libcommon/src/libcommon/viewer_utils/features.py

Lines changed: 54 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -30,12 +30,17 @@
3030
Value,
3131
Video,
3232
)
33+
from huggingface_hub import HfFileSystem
3334
from PIL import Image as PILImage
3435

3536
from libcommon.dtos import FeatureItem
3637
from libcommon.storage_client import StorageClient
3738
from libcommon.viewer_utils.asset import (
39+
SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE,
3840
SUPPORTED_AUDIO_EXTENSIONS,
41+
AudioSource,
42+
ImageSource,
43+
VideoSource,
3944
create_audio_file,
4045
create_image_file,
4146
create_pdf_file,
@@ -89,6 +94,8 @@ def image(
8994
value: Any,
9095
featureName: str,
9196
storage_client: StorageClient,
97+
hf_endpoint: str,
98+
hf_token: Optional[str],
9299
json_path: Optional[list[Union[str, int]]] = None,
93100
) -> Any:
94101
if value is None:
@@ -97,13 +104,15 @@ def image(
97104
value = PILImage.open(BytesIO(value["bytes"]))
98105
elif isinstance(value, bytes):
99106
value = PILImage.open(BytesIO(value))
100-
elif (
101-
isinstance(value, dict)
102-
and "path" in value
103-
and isinstance(value["path"], str)
104-
and os.path.exists(value["path"])
105-
):
106-
value = PILImage.open(value["path"])
107+
elif isinstance(value, dict) and "path" in value and isinstance(value["path"], str):
108+
if os.path.exists(value["path"]):
109+
value = PILImage.open(value["path"])
110+
elif value["path"].startswith(f"hf://datasets/{dataset}@"):
111+
with HfFileSystem(endpoint=hf_endpoint, token=hf_token).open(value["path"], "rb") as f:
112+
src = value["path"].replace("hf://", hf_endpoint + "/", 1).replace("@", "/resolve/", 1)
113+
image = PILImage.open(f)
114+
return ImageSource(src=src, height=image.height, width=image.width)
115+
107116
if not isinstance(value, PILImage.Image):
108117
raise TypeError(
109118
"Image cell must be a PIL image or an encoded dict of an image, "
@@ -141,6 +150,7 @@ def audio(
141150
value: Any,
142151
featureName: str,
143152
storage_client: StorageClient,
153+
hf_endpoint: str,
144154
json_path: Optional[list[Union[str, int]]] = None,
145155
) -> Any:
146156
from datasets.features._torchcodec import AudioDecoder
@@ -161,7 +171,14 @@ def audio(
161171
"Audio cell must be an encoded dict of an audio sample or a torchcodec AudioDecoder, "
162172
f"but got {str(value)[:300]}{'...' if len(str(value)) > 300 else ''}"
163173
)
174+
164175
audio_file_extension = get_audio_file_extension(value)
176+
if "path" in value and isinstance(value["path"], str) and value.get("bytes") is None:
177+
if audio_file_extension in SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE:
178+
if value["path"].startswith(f"hf://datasets/{dataset}@"):
179+
src = value["path"].replace("hf://", hf_endpoint + "/", 1).replace("@", "/resolve/", 1)
180+
return AudioSource(src=src, type=SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE[audio_file_extension])
181+
165182
audio_file_bytes = get_audio_file_bytes(value)
166183
if not audio_file_extension:
167184
audio_file_extension = infer_audio_file_extension(audio_file_bytes)
@@ -262,6 +279,7 @@ def video(
262279
value: Any,
263280
featureName: str,
264281
storage_client: StorageClient,
282+
hf_endpoint: str,
265283
json_path: Optional[list[Union[str, int]]] = None,
266284
) -> Any:
267285
if datasets.config.TORCHCODEC_AVAILABLE:
@@ -292,6 +310,11 @@ def video(
292310
f"but got {str(value)[:300]}{'...' if len(str(value)) > 300 else ''}"
293311
)
294312

313+
if "path" in value and isinstance(value["path"], str) and value.get("bytes") is None:
314+
if value["path"].startswith(f"hf://datasets/{dataset}@"):
315+
src = value["path"].replace("hf://", hf_endpoint + "/", 1).replace("@", "/resolve/", 1)
316+
return VideoSource(src=src)
317+
295318
video_file_extension = get_video_file_extension(value)
296319
video_file_bytes = get_video_file_bytes(value)
297320
if not video_file_extension:
@@ -346,6 +369,8 @@ def pdf(
346369
value: Any,
347370
featureName: str,
348371
storage_client: StorageClient,
372+
hf_endpoint: str,
373+
hf_token: Optional[str],
349374
json_path: Optional[list[Union[str, int]]] = None,
350375
) -> Any:
351376
if value is None:
@@ -354,13 +379,12 @@ def pdf(
354379
value = pdfplumber.open(BytesIO(value["bytes"]))
355380
elif isinstance(value, bytes):
356381
value = pdfplumber.open(BytesIO(value))
357-
elif (
358-
isinstance(value, dict)
359-
and "path" in value
360-
and isinstance(value["path"], str)
361-
and os.path.exists(value["path"])
362-
):
363-
value = pdfplumber.open(value["path"])
382+
elif isinstance(value, dict) and "path" in value and isinstance(value["path"], str):
383+
if os.path.exists(value["path"]):
384+
value = pdfplumber.open(value["path"])
385+
elif value["path"].startswith(f"hf://datasets/{dataset}@"):
386+
f = HfFileSystem(endpoint=hf_endpoint, token=hf_token).open(value["path"], "rb")
387+
value = pdfplumber.open(f)
364388

365389
if not isinstance(value, pdfplumber.pdf.PDF):
366390
raise TypeError(
@@ -392,6 +416,8 @@ def get_cell_value(
392416
featureName: str,
393417
fieldType: Any,
394418
storage_client: StorageClient,
419+
hf_endpoint: str,
420+
hf_token: Optional[str],
395421
json_path: Optional[list[Union[str, int]]] = None,
396422
) -> Any:
397423
# always allow None values in the cells
@@ -407,6 +433,8 @@ def get_cell_value(
407433
value=cell,
408434
featureName=featureName,
409435
storage_client=storage_client,
436+
hf_endpoint=hf_endpoint,
437+
hf_token=hf_token,
410438
json_path=json_path,
411439
)
412440
elif isinstance(fieldType, Audio):
@@ -419,6 +447,7 @@ def get_cell_value(
419447
value=cell,
420448
featureName=featureName,
421449
storage_client=storage_client,
450+
hf_endpoint=hf_endpoint,
422451
json_path=json_path,
423452
)
424453
elif isinstance(fieldType, Video):
@@ -431,6 +460,7 @@ def get_cell_value(
431460
value=cell,
432461
featureName=featureName,
433462
storage_client=storage_client,
463+
hf_endpoint=hf_endpoint,
434464
json_path=json_path,
435465
)
436466
elif isinstance(fieldType, Pdf):
@@ -443,6 +473,8 @@ def get_cell_value(
443473
value=cell,
444474
featureName=featureName,
445475
storage_client=storage_client,
476+
hf_endpoint=hf_endpoint,
477+
hf_token=hf_token,
446478
json_path=json_path,
447479
)
448480
elif isinstance(fieldType, Json):
@@ -467,6 +499,8 @@ def get_cell_value(
467499
featureName=featureName,
468500
fieldType=subFieldType,
469501
storage_client=storage_client,
502+
hf_endpoint=hf_endpoint,
503+
hf_token=hf_token,
470504
json_path=json_path + [idx] if json_path else [idx],
471505
)
472506
for (idx, subCell) in enumerate(cell)
@@ -486,6 +520,8 @@ def get_cell_value(
486520
featureName=featureName,
487521
fieldType=subFieldType,
488522
storage_client=storage_client,
523+
hf_endpoint=hf_endpoint,
524+
hf_token=hf_token,
489525
json_path=json_path + [idx] if json_path else [idx],
490526
)
491527
for (idx, subCell) in enumerate(cell)
@@ -506,6 +542,8 @@ def get_cell_value(
506542
featureName=featureName,
507543
fieldType=fieldType.feature,
508544
storage_client=storage_client,
545+
hf_endpoint=hf_endpoint,
546+
hf_token=hf_token,
509547
json_path=json_path + [idx] if json_path else [idx],
510548
)
511549
for (idx, subCell) in enumerate(cell)
@@ -525,6 +563,8 @@ def get_cell_value(
525563
featureName=featureName,
526564
fieldType=fieldType[key],
527565
storage_client=storage_client,
566+
hf_endpoint=hf_endpoint,
567+
hf_token=hf_token,
528568
json_path=json_path + [key] if json_path else [key],
529569
)
530570
for (key, subCell) in cell.items()

libs/libcommon/src/libcommon/viewer_utils/rows.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
# Copyright 2022 The HuggingFace Authors.
33

44

5-
from typing import Protocol
5+
from typing import Optional, Protocol
66

77
import PIL
88
from datasets import Audio, Features, Image, Pdf, Value, Video
@@ -29,6 +29,8 @@ def transform_rows(
2929
rows: list[Row],
3030
features: Features,
3131
storage_client: StorageClient,
32+
hf_endpoint: str,
33+
hf_token: Optional[str],
3234
) -> list[Row]:
3335
transformed_rows: list[Row] = []
3436
for row_idx, row in enumerate(rows):
@@ -45,6 +47,8 @@ def transform_rows(
4547
featureName=featureName,
4648
fieldType=fieldType,
4749
storage_client=storage_client,
50+
hf_endpoint=hf_endpoint,
51+
hf_token=hf_token,
4852
)
4953
for (featureName, fieldType) in features.items()
5054
}
@@ -71,6 +75,8 @@ def create_first_rows_response(
7175
config: str,
7276
split: str,
7377
storage_client: StorageClient,
78+
hf_endpoint: str,
79+
hf_token: Optional[str],
7480
features: Features,
7581
get_rows_content: GetRowsContent,
7682
min_cell_bytes: int,
@@ -161,6 +167,8 @@ def create_first_rows_response(
161167
rows=rows_content.rows,
162168
features=features,
163169
storage_client=storage_client,
170+
hf_endpoint=hf_endpoint,
171+
hf_token=hf_token,
164172
)
165173

166174
# truncate the rows to fit within the restrictions, and prepare them as RowItems

0 commit comments

Comments
 (0)