3030 Value ,
3131 Video ,
3232)
33+ from huggingface_hub import HfFileSystem
3334from PIL import Image as PILImage
3435
3536from libcommon .dtos import FeatureItem
3637from libcommon .storage_client import StorageClient
3738from libcommon .viewer_utils .asset import (
39+ SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE ,
3840 SUPPORTED_AUDIO_EXTENSIONS ,
41+ AudioSource ,
42+ ImageSource ,
43+ VideoSource ,
3944 create_audio_file ,
4045 create_image_file ,
4146 create_pdf_file ,
@@ -89,6 +94,8 @@ def image(
8994 value : Any ,
9095 featureName : str ,
9196 storage_client : StorageClient ,
97+ hf_endpoint : str ,
98+ hf_token : Optional [str ],
9299 json_path : Optional [list [Union [str , int ]]] = None ,
93100) -> Any :
94101 if value is None :
@@ -97,13 +104,15 @@ def image(
97104 value = PILImage .open (BytesIO (value ["bytes" ]))
98105 elif isinstance (value , bytes ):
99106 value = PILImage .open (BytesIO (value ))
100- elif (
101- isinstance (value , dict )
102- and "path" in value
103- and isinstance (value ["path" ], str )
104- and os .path .exists (value ["path" ])
105- ):
106- value = PILImage .open (value ["path" ])
107+ elif isinstance (value , dict ) and "path" in value and isinstance (value ["path" ], str ):
108+ if os .path .exists (value ["path" ]):
109+ value = PILImage .open (value ["path" ])
110+ elif value ["path" ].startswith (f"hf://datasets/{ dataset } @" ):
111+ with HfFileSystem (endpoint = hf_endpoint , token = hf_token ).open (value ["path" ], "rb" ) as f :
112+ src = value ["path" ].replace ("hf://" , hf_endpoint + "/" , 1 ).replace ("@" , "/resolve/" , 1 )
113+ image = PILImage .open (f )
114+ return ImageSource (src = src , height = image .height , width = image .width )
115+
107116 if not isinstance (value , PILImage .Image ):
108117 raise TypeError (
109118 "Image cell must be a PIL image or an encoded dict of an image, "
@@ -141,6 +150,7 @@ def audio(
141150 value : Any ,
142151 featureName : str ,
143152 storage_client : StorageClient ,
153+ hf_endpoint : str ,
144154 json_path : Optional [list [Union [str , int ]]] = None ,
145155) -> Any :
146156 from datasets .features ._torchcodec import AudioDecoder
@@ -161,7 +171,14 @@ def audio(
161171 "Audio cell must be an encoded dict of an audio sample or a torchcodec AudioDecoder, "
162172 f"but got { str (value )[:300 ]} { '...' if len (str (value )) > 300 else '' } "
163173 )
174+
164175 audio_file_extension = get_audio_file_extension (value )
176+ if "path" in value and isinstance (value ["path" ], str ) and value .get ("bytes" ) is None :
177+ if audio_file_extension in SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE :
178+ if value ["path" ].startswith (f"hf://datasets/{ dataset } @" ):
179+ src = value ["path" ].replace ("hf://" , hf_endpoint + "/" , 1 ).replace ("@" , "/resolve/" , 1 )
180+ return AudioSource (src = src , type = SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE [audio_file_extension ])
181+
165182 audio_file_bytes = get_audio_file_bytes (value )
166183 if not audio_file_extension :
167184 audio_file_extension = infer_audio_file_extension (audio_file_bytes )
@@ -262,6 +279,7 @@ def video(
262279 value : Any ,
263280 featureName : str ,
264281 storage_client : StorageClient ,
282+ hf_endpoint : str ,
265283 json_path : Optional [list [Union [str , int ]]] = None ,
266284) -> Any :
267285 if datasets .config .TORCHCODEC_AVAILABLE :
@@ -292,6 +310,11 @@ def video(
292310 f"but got { str (value )[:300 ]} { '...' if len (str (value )) > 300 else '' } "
293311 )
294312
313+ if "path" in value and isinstance (value ["path" ], str ) and value .get ("bytes" ) is None :
314+ if value ["path" ].startswith (f"hf://datasets/{ dataset } @" ):
315+ src = value ["path" ].replace ("hf://" , hf_endpoint + "/" , 1 ).replace ("@" , "/resolve/" , 1 )
316+ return VideoSource (src = src )
317+
295318 video_file_extension = get_video_file_extension (value )
296319 video_file_bytes = get_video_file_bytes (value )
297320 if not video_file_extension :
@@ -346,6 +369,8 @@ def pdf(
346369 value : Any ,
347370 featureName : str ,
348371 storage_client : StorageClient ,
372+ hf_endpoint : str ,
373+ hf_token : Optional [str ],
349374 json_path : Optional [list [Union [str , int ]]] = None ,
350375) -> Any :
351376 if value is None :
@@ -354,13 +379,12 @@ def pdf(
354379 value = pdfplumber .open (BytesIO (value ["bytes" ]))
355380 elif isinstance (value , bytes ):
356381 value = pdfplumber .open (BytesIO (value ))
357- elif (
358- isinstance (value , dict )
359- and "path" in value
360- and isinstance (value ["path" ], str )
361- and os .path .exists (value ["path" ])
362- ):
363- value = pdfplumber .open (value ["path" ])
382+ elif isinstance (value , dict ) and "path" in value and isinstance (value ["path" ], str ):
383+ if os .path .exists (value ["path" ]):
384+ value = pdfplumber .open (value ["path" ])
385+ elif value ["path" ].startswith (f"hf://datasets/{ dataset } @" ):
386+ f = HfFileSystem (endpoint = hf_endpoint , token = hf_token ).open (value ["path" ], "rb" )
387+ value = pdfplumber .open (f )
364388
365389 if not isinstance (value , pdfplumber .pdf .PDF ):
366390 raise TypeError (
@@ -392,6 +416,8 @@ def get_cell_value(
392416 featureName : str ,
393417 fieldType : Any ,
394418 storage_client : StorageClient ,
419+ hf_endpoint : str ,
420+ hf_token : Optional [str ],
395421 json_path : Optional [list [Union [str , int ]]] = None ,
396422) -> Any :
397423 # always allow None values in the cells
@@ -407,6 +433,8 @@ def get_cell_value(
407433 value = cell ,
408434 featureName = featureName ,
409435 storage_client = storage_client ,
436+ hf_endpoint = hf_endpoint ,
437+ hf_token = hf_token ,
410438 json_path = json_path ,
411439 )
412440 elif isinstance (fieldType , Audio ):
@@ -419,6 +447,7 @@ def get_cell_value(
419447 value = cell ,
420448 featureName = featureName ,
421449 storage_client = storage_client ,
450+ hf_endpoint = hf_endpoint ,
422451 json_path = json_path ,
423452 )
424453 elif isinstance (fieldType , Video ):
@@ -431,6 +460,7 @@ def get_cell_value(
431460 value = cell ,
432461 featureName = featureName ,
433462 storage_client = storage_client ,
463+ hf_endpoint = hf_endpoint ,
434464 json_path = json_path ,
435465 )
436466 elif isinstance (fieldType , Pdf ):
@@ -443,6 +473,8 @@ def get_cell_value(
443473 value = cell ,
444474 featureName = featureName ,
445475 storage_client = storage_client ,
476+ hf_endpoint = hf_endpoint ,
477+ hf_token = hf_token ,
446478 json_path = json_path ,
447479 )
448480 elif isinstance (fieldType , Json ):
@@ -467,6 +499,8 @@ def get_cell_value(
467499 featureName = featureName ,
468500 fieldType = subFieldType ,
469501 storage_client = storage_client ,
502+ hf_endpoint = hf_endpoint ,
503+ hf_token = hf_token ,
470504 json_path = json_path + [idx ] if json_path else [idx ],
471505 )
472506 for (idx , subCell ) in enumerate (cell )
@@ -486,6 +520,8 @@ def get_cell_value(
486520 featureName = featureName ,
487521 fieldType = subFieldType ,
488522 storage_client = storage_client ,
523+ hf_endpoint = hf_endpoint ,
524+ hf_token = hf_token ,
489525 json_path = json_path + [idx ] if json_path else [idx ],
490526 )
491527 for (idx , subCell ) in enumerate (cell )
@@ -506,6 +542,8 @@ def get_cell_value(
506542 featureName = featureName ,
507543 fieldType = fieldType .feature ,
508544 storage_client = storage_client ,
545+ hf_endpoint = hf_endpoint ,
546+ hf_token = hf_token ,
509547 json_path = json_path + [idx ] if json_path else [idx ],
510548 )
511549 for (idx , subCell ) in enumerate (cell )
@@ -525,6 +563,8 @@ def get_cell_value(
525563 featureName = featureName ,
526564 fieldType = fieldType [key ],
527565 storage_client = storage_client ,
566+ hf_endpoint = hf_endpoint ,
567+ hf_token = hf_token ,
528568 json_path = json_path + [key ] if json_path else [key ],
529569 )
530570 for (key , subCell ) in cell .items ()
0 commit comments