Skip to content

Commit 1907ad6

Browse files
d-v-b and maxrjones
authored
prune old string dtype class (#3973)
* chore: remove old stringdtype specialized for numpy < 2
* chore: narrow test fn signature
* docs: changelog
* chore: rename test class
* Update changes/3973.removal.md

Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com>

---------

Co-authored-by: Max Jones <14077947+maxrjones@users.noreply.github.com>
1 parent 093a153 commit 1907ad6

6 files changed

Lines changed: 84 additions & 214 deletions

File tree

changes/3973.removal.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Removed the NumPy 1.x implementation of the `VariableLengthUTF8` data type because NumPy 1.x is no longer supported under [SPEC0](https://scientific-python.org/specs/spec-0000/).

src/zarr/core/dtype/npy/string.py

Lines changed: 35 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@
3838
from zarr.core.common import JSON, ZarrFormat
3939
from zarr.core.dtype.wrapper import TBaseDType
4040

41-
_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType")
42-
4341

4442
@runtime_checkable
4543
class SupportsStr(Protocol):
@@ -451,36 +449,40 @@ class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8
451449
"""
452450

453451

454-
# VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy.
455-
# If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length
456-
# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object
457-
# dtype as the native dtype.
458-
class UTF8Base[DType: TBaseDType](ZDType[DType, str], HasObjectCodec):
452+
@dataclass(frozen=True, kw_only=True)
453+
class VariableLengthUTF8(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var]
459454
"""
460-
A base class for variable-length UTF-8 string data types.
455+
A Zarr data type for arrays containing variable-length UTF-8 strings.
456+
457+
Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances
458+
of ``str``.
461459
462-
Not intended for direct use, but as a base for concrete implementations.
463460
464461
Attributes
465462
----------
466-
object_codec_id : ClassVar[Literal["vlen-utf8"]]
463+
dtype_cls : Type[np.dtypes.StringDType]
464+
The NumPy dtype class for this data type.
465+
_zarr_v3_name : ClassVar[Literal["string"]] = "string"
466+
The name of this data type in Zarr V3.
467+
object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
467468
The object codec ID for this data type.
468469
469470
References
470471
----------
471-
This data type does not have a Zarr V3 specification.
472+
https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/string
472473
473-
The Zarr V2 data type specification can be found [here](https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding).
474474
"""
475475

476+
dtype_cls = np.dtypes.StringDType # type: ignore[assignment]
476477
_zarr_v3_name: ClassVar[Literal["string"]] = "string"
477478
object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
478479

479480
@classmethod
480481
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
481482
"""
482483
Create an instance of this data type from a compatible NumPy data type.
483-
484+
We reject NumPy StringDType instances that have the `na_object` field set,
485+
because this is not representable by the Zarr `string` data type.
484486
485487
Parameters
486488
----------
@@ -496,13 +498,33 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self:
496498
------
497499
DataTypeValidationError
498500
If the input is not compatible with this data type.
501+
ValueError
502+
If the input is `numpy.dtypes.StringDType` and has `na_object` set.
499503
"""
500504
if cls._check_native_dtype(dtype):
505+
if hasattr(dtype, "na_object"):
506+
msg = (
507+
f"Zarr data type resolution from {dtype} failed. "
508+
"Attempted to resolve a zarr data type from a `numpy.dtypes.StringDType` "
509+
"with `na_object` set, which is not supported."
510+
)
511+
raise ValueError(msg)
501512
return cls()
502513
raise DataTypeValidationError(
503514
f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}"
504515
)
505516

517+
def to_native_dtype(self) -> np.dtypes.StringDType:
518+
"""
519+
Create a NumPy string dtype from this VariableLengthUTF8 ZDType.
520+
521+
Returns
522+
-------
523+
np.dtypes.StringDType
524+
The NumPy string dtype.
525+
"""
526+
return self.dtype_cls()
527+
506528
@classmethod
507529
def _check_json_v2(
508530
cls,
@@ -719,109 +741,3 @@ def cast_scalar(self, data: object) -> str:
719741
f"data type {self}."
720742
)
721743
raise TypeError(msg) # pragma: no cover
722-
723-
724-
if _NUMPY_SUPPORTS_VLEN_STRING:
725-
726-
@dataclass(frozen=True, kw_only=True)
727-
class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var]
728-
"""
729-
A Zarr data type for arrays containing variable-length UTF-8 strings.
730-
731-
Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances
732-
of ``str``.
733-
734-
735-
Attributes
736-
----------
737-
dtype_cls : Type[np.dtypes.StringDType]
738-
The NumPy dtype class for this data type.
739-
_zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
740-
The name of this data type in Zarr V3.
741-
object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
742-
The object codec ID for this data type.
743-
"""
744-
745-
dtype_cls = np.dtypes.StringDType # type: ignore[assignment]
746-
747-
@classmethod
748-
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
749-
"""
750-
Create an instance of this data type from a compatible NumPy data type.
751-
We reject NumPy StringDType instances that have the `na_object` field set,
752-
because this is not representable by the Zarr `string` data type.
753-
754-
Parameters
755-
----------
756-
dtype : TBaseDType
757-
The native data type.
758-
759-
Returns
760-
-------
761-
Self
762-
An instance of this data type.
763-
764-
Raises
765-
------
766-
DataTypeValidationError
767-
If the input is not compatible with this data type.
768-
ValueError
769-
If the input is `numpy.dtypes.StringDType` and has `na_object` set.
770-
"""
771-
if cls._check_native_dtype(dtype):
772-
if hasattr(dtype, "na_object"):
773-
msg = (
774-
f"Zarr data type resolution from {dtype} failed. "
775-
"Attempted to resolve a zarr data type from a `numpy.dtypes.StringDType` "
776-
"with `na_object` set, which is not supported."
777-
)
778-
raise ValueError(msg)
779-
return cls()
780-
raise DataTypeValidationError(
781-
f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}"
782-
)
783-
784-
def to_native_dtype(self) -> np.dtypes.StringDType:
785-
"""
786-
Create a NumPy string dtype from this VariableLengthUTF8 ZDType.
787-
788-
Returns
789-
-------
790-
np.dtypes.StringDType
791-
The NumPy string dtype.
792-
"""
793-
return self.dtype_cls()
794-
795-
else:
796-
# Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead.
797-
@dataclass(frozen=True, kw_only=True)
798-
class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef]
799-
"""
800-
A Zarr data type for arrays containing variable-length UTF-8 strings.
801-
802-
Wraps the ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances
803-
of ``str``.
804-
805-
806-
Attributes
807-
----------
808-
dtype_cls : Type[np.dtypes.ObjectDType]
809-
The NumPy dtype class for this data type.
810-
_zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
811-
The name of this data type in Zarr V3.
812-
object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
813-
The object codec ID for this data type.
814-
"""
815-
816-
dtype_cls = np.dtypes.ObjectDType
817-
818-
def to_native_dtype(self) -> np.dtypes.ObjectDType:
819-
"""
820-
Create a NumPy object dtype from this VariableLengthUTF8 ZDType.
821-
822-
Returns
823-
-------
824-
np.dtypes.ObjectDType
825-
The NumPy object dtype.
826-
"""
827-
return self.dtype_cls()

tests/test_array.py

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@
6969
)
7070
from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
7171
from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
72-
from zarr.core.dtype.npy.string import UTF8Base
7372
from zarr.core.group import AsyncGroup
7473
from zarr.core.indexing import BasicIndexer, _iter_grid, _iter_regions
7574
from zarr.core.metadata.v2 import ArrayV2Metadata
@@ -1981,23 +1980,14 @@ def test_array_repr(store: Store) -> None:
19811980
assert str(arr) == f"<Array {store} shape={shape} dtype={dtype}>"
19821981

19831982

1984-
class UnknownObjectDtype(UTF8Base[np.dtypes.ObjectDType]):
1985-
object_codec_id = "unknown" # type: ignore[assignment]
1986-
1987-
def to_native_dtype(self) -> np.dtypes.ObjectDType:
1988-
"""
1989-
Create a NumPy object dtype from this VariableLengthUTF8 ZDType.
1983+
class UnknownObjectCodecDtype(VariableLengthUTF8):
1984+
"""A data type that requires an object codec with an unknown id, used for error-path tests."""
19901985

1991-
Returns
1992-
-------
1993-
np.dtypes.ObjectDType
1994-
The NumPy object dtype.
1995-
"""
1996-
return np.dtype("o") # type: ignore[return-value]
1986+
object_codec_id = "unknown" # type: ignore[assignment]
19971987

19981988

19991989
@pytest.mark.parametrize(
2000-
"dtype", [VariableLengthUTF8(), VariableLengthBytes(), UnknownObjectDtype()]
1990+
"dtype", [VariableLengthUTF8(), VariableLengthBytes(), UnknownObjectCodecDtype()]
20011991
)
20021992
def test_chunk_encoding_no_object_codec_errors(dtype: ZDType[Any, Any]) -> None:
20031993
"""
@@ -2024,7 +2014,7 @@ def test_unknown_object_codec_default_serializer_v3() -> None:
20242014
Test that we get a ValueError when trying to create the default serializer for a data type
20252015
that requires an unknown object codec
20262016
"""
2027-
dtype = UnknownObjectDtype()
2017+
dtype = UnknownObjectCodecDtype()
20282018
msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
20292019
with pytest.raises(ValueError, match=re.escape(msg)):
20302020
default_serializer_v3(dtype)
@@ -2035,7 +2025,7 @@ def test_unknown_object_codec_default_filters_v2() -> None:
20352025
Test that we get a ValueError when trying to create the default filters for a data type
20362026
that requires an unknown object codec
20372027
"""
2038-
dtype = UnknownObjectDtype()
2028+
dtype = UnknownObjectCodecDtype()
20392029
msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
20402030
with pytest.raises(ValueError, match=re.escape(msg)):
20412031
default_filters_v2(dtype)

tests/test_codecs/test_vlen.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,19 @@
1010
from zarr.codecs import ZstdCodec
1111
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
1212
from zarr.core.dtype import get_data_type_from_native_dtype
13-
from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING
1413
from zarr.core.metadata.v3 import ArrayV3Metadata
1514
from zarr.storage import StorePath
1615

17-
numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"]
18-
expected_array_string_dtype: np.dtype[Any]
19-
if _NUMPY_SUPPORTS_VLEN_STRING:
20-
numpy_str_dtypes.append(np.dtypes.StringDType)
21-
expected_array_string_dtype = np.dtypes.StringDType()
22-
else:
23-
expected_array_string_dtype = np.dtype("O")
16+
numpy_str_dtypes: list[type | str | None] = [
17+
None,
18+
str,
19+
"str",
20+
np.dtypes.StrDType,
21+
"S",
22+
"U",
23+
np.dtypes.StringDType,
24+
]
25+
expected_array_string_dtype: np.dtype[Any] = np.dtypes.StringDType()
2426

2527

2628
@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")

0 commit comments

Comments
 (0)