3838 from zarr .core .common import JSON , ZarrFormat
3939 from zarr .core .dtype .wrapper import TBaseDType
4040
41- _NUMPY_SUPPORTS_VLEN_STRING = hasattr (np .dtypes , "StringDType" )
42-
4341
4442@runtime_checkable
4543class SupportsStr (Protocol ):
@@ -451,36 +449,40 @@ class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8
451449 """
452450
453451
454- # VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy.
455- # If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length
456- # string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object
457- # dtype as the native dtype.
458- class UTF8Base [DType : TBaseDType ](ZDType [DType , str ], HasObjectCodec ):
452+ @dataclass (frozen = True , kw_only = True )
453+ class VariableLengthUTF8 (ZDType [np .dtypes .StringDType , str ], HasObjectCodec ): # type: ignore[type-var]
459454 """
460- A base class for variable-length UTF-8 string data types.
455+ A Zarr data type for arrays containing variable-length UTF-8 strings.
456+
457+ Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances
458+ of ``str``.
461459
462- Not intended for direct use, but as a base for concrete implementations.
463460
464461 Attributes
465462 ----------
466- object_codec_id : ClassVar[Literal["vlen-utf8"]]
463+ dtype_cls : Type[np.dtypes.StringDType]
464+ The NumPy dtype class for this data type.
465+ _zarr_v3_name : ClassVar[Literal["string"]] = "string"
466+ The name of this data type in Zarr V3.
467+ object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
467468 The object codec ID for this data type.
468469
469470 References
470471 ----------
471- This data type does not have a Zarr V3 specification.
472+ https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/string
472473
473- The Zarr V2 data type specification can be found [here](https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding).
474474 """
475475
476+ dtype_cls = np .dtypes .StringDType # type: ignore[assignment]
476477 _zarr_v3_name : ClassVar [Literal ["string" ]] = "string"
477478 object_codec_id : ClassVar [Literal ["vlen-utf8" ]] = "vlen-utf8"
478479
479480 @classmethod
480481 def from_native_dtype (cls , dtype : TBaseDType ) -> Self :
481482 """
482483 Create an instance of this data type from a compatible NumPy data type.
483-
484+ We reject NumPy StringDType instances that have the `na_object` field set,
485+ because this is not representable by the Zarr `string` data type.
484486
485487 Parameters
486488 ----------
@@ -496,13 +498,33 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self:
496498 ------
497499 DataTypeValidationError
498500 If the input is not compatible with this data type.
501+ ValueError
502+ If the input is `numpy.dtypes.StringDType` and has `na_object` set.
499503 """
500504 if cls ._check_native_dtype (dtype ):
505+ if hasattr (dtype , "na_object" ):
506+ msg = (
507+ f"Zarr data type resolution from { dtype } failed. "
508+ "Attempted to resolve a zarr data type from a `numpy.dtypes.StringDType` "
509+ "with `na_object` set, which is not supported."
510+ )
511+ raise ValueError (msg )
501512 return cls ()
502513 raise DataTypeValidationError (
503514 f"Invalid data type: { dtype } . Expected an instance of { cls .dtype_cls } "
504515 )
505516
517+ def to_native_dtype (self ) -> np .dtypes .StringDType :
518+ """
519+ Create a NumPy string dtype from this VariableLengthUTF8 ZDType.
520+
521+ Returns
522+ -------
523+ np.dtypes.StringDType
524+ The NumPy string dtype.
525+ """
526+ return self .dtype_cls ()
527+
506528 @classmethod
507529 def _check_json_v2 (
508530 cls ,
@@ -719,109 +741,3 @@ def cast_scalar(self, data: object) -> str:
719741 f"data type { self } ."
720742 )
721743 raise TypeError (msg ) # pragma: no cover
722-
723-
724- if _NUMPY_SUPPORTS_VLEN_STRING :
725-
726- @dataclass (frozen = True , kw_only = True )
727- class VariableLengthUTF8 (UTF8Base [np .dtypes .StringDType ]): # type: ignore[type-var]
728- """
729- A Zarr data type for arrays containing variable-length UTF-8 strings.
730-
731- Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances
732- of ``str``.
733-
734-
735- Attributes
736- ----------
737- dtype_cls : Type[np.dtypes.StringDType]
738- The NumPy dtype class for this data type.
739- _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
740- The name of this data type in Zarr V3.
741- object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
742- The object codec ID for this data type.
743- """
744-
745- dtype_cls = np .dtypes .StringDType # type: ignore[assignment]
746-
747- @classmethod
748- def from_native_dtype (cls , dtype : TBaseDType ) -> Self :
749- """
750- Create an instance of this data type from a compatible NumPy data type.
751- We reject NumPy StringDType instances that have the `na_object` field set,
752- because this is not representable by the Zarr `string` data type.
753-
754- Parameters
755- ----------
756- dtype : TBaseDType
757- The native data type.
758-
759- Returns
760- -------
761- Self
762- An instance of this data type.
763-
764- Raises
765- ------
766- DataTypeValidationError
767- If the input is not compatible with this data type.
768- ValueError
769- If the input is `numpy.dtypes.StringDType` and has `na_object` set.
770- """
771- if cls ._check_native_dtype (dtype ):
772- if hasattr (dtype , "na_object" ):
773- msg = (
774- f"Zarr data type resolution from { dtype } failed. "
775- "Attempted to resolve a zarr data type from a `numpy.dtypes.StringDType` "
776- "with `na_object` set, which is not supported."
777- )
778- raise ValueError (msg )
779- return cls ()
780- raise DataTypeValidationError (
781- f"Invalid data type: { dtype } . Expected an instance of { cls .dtype_cls } "
782- )
783-
784- def to_native_dtype (self ) -> np .dtypes .StringDType :
785- """
786- Create a NumPy string dtype from this VariableLengthUTF8 ZDType.
787-
788- Returns
789- -------
790- np.dtypes.StringDType
791- The NumPy string dtype.
792- """
793- return self .dtype_cls ()
794-
795- else :
796- # Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead.
797- @dataclass (frozen = True , kw_only = True )
798- class VariableLengthUTF8 (UTF8Base [np .dtypes .ObjectDType ]): # type: ignore[no-redef]
799- """
800- A Zarr data type for arrays containing variable-length UTF-8 strings.
801-
802- Wraps the ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances
803- of ``str``.
804-
805-
806- Attributes
807- ----------
808- dtype_cls : Type[np.dtypes.ObjectDType]
809- The NumPy dtype class for this data type.
810- _zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
811- The name of this data type in Zarr V3.
812- object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
813- The object codec ID for this data type.
814- """
815-
816- dtype_cls = np .dtypes .ObjectDType
817-
818- def to_native_dtype (self ) -> np .dtypes .ObjectDType :
819- """
820- Create a NumPy object dtype from this VariableLengthUTF8 ZDType.
821-
822- Returns
823- -------
824- np.dtypes.ObjectDType
825- The NumPy object dtype.
826- """
827- return self .dtype_cls ()
0 commit comments