Skip to content

Commit 7b68dab

Browse files
committed
wip
1 parent 824cef3 commit 7b68dab

15 files changed

Lines changed: 298 additions & 576 deletions

File tree

src/zarr/abc/bikeshed.py

Lines changed: 0 additions & 29 deletions
This file was deleted.

src/zarr/abc/codec.py

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,21 @@
11
from __future__ import annotations
22

33
from abc import abstractmethod
4-
from typing import TYPE_CHECKING, Generic, TypeVar
4+
from collections.abc import Mapping
5+
from typing import (
6+
TYPE_CHECKING,
7+
Generic,
8+
Literal,
9+
TypedDict,
10+
TypeVar,
11+
overload,
12+
)
13+
14+
from typing_extensions import ReadOnly
515

616
from zarr.abc.metadata import Metadata
717
from zarr.core.buffer import Buffer, NDBuffer
8-
from zarr.core.common import ChunkCoords, concurrent_map
18+
from zarr.core.common import ChunkCoords, NamedConfig, ZarrFormat, concurrent_map
919
from zarr.core.config import config
1020

1121
if TYPE_CHECKING:
@@ -33,6 +43,15 @@
3343
CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer)
3444
CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer)
3545

46+
TName = TypeVar("TName", bound=str, covariant=True)
47+
48+
49+
class CodecConfig_V2(TypedDict, Generic[TName]):
    """Typed-dict shape for the JSON form of a codec under Zarr format 2.

    Only the ``id`` key is declared here. It is the return type of the
    ``zarr_format=2`` overload of ``BaseCodec.to_json``.
    """

    # Codec identifier; read-only, and typed by the ``TName`` parameter so
    # that subclasses can pin it to a specific literal string.
    id: ReadOnly[TName]
51+
52+
53+
CodecConfig_V3 = NamedConfig[str, Mapping[str, object]]
54+
3655

3756
class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]):
3857
"""Generic base class for codecs.
@@ -156,6 +175,34 @@ async def encode(
156175
"""
157176
return await _batching_helper(self._encode_single, chunks_and_specs)
158177

178+
@overload
def to_json(self, zarr_format: Literal[2]) -> CodecConfig_V2[str]: ...
@overload
def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ...

def to_json(
    self, zarr_format: ZarrFormat
) -> CodecConfig_V2[str] | NamedConfig[str, Mapping[str, object]]:
    """Serialize this codec to its JSON form for the given Zarr format.

    The overloads declare the return shape per format: an ``id``-keyed
    mapping for format 2 and a ``name``/``configuration`` mapping for
    format 3.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError
187+
188+
@classmethod
def _from_json_v2(cls, data: Mapping[str, object]) -> Self:
    """Create an instance from Zarr format 2 JSON data.

    ``from_json`` dispatches here when ``zarr_format == 2``.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError
191+
192+
@classmethod
def _from_json_v3(cls, data: Mapping[str, object]) -> Self:
    """Create an instance from Zarr format 3 JSON data.

    ``from_json`` dispatches here when ``zarr_format == 3``.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError
195+
196+
@classmethod
197+
def from_json(cls, data: Mapping[str, object], zarr_format: ZarrFormat) -> Self:
198+
if zarr_format == 2:
199+
return cls._from_json_v2(data)
200+
elif zarr_format == 3:
201+
return cls._from_json_v3(data)
202+
raise ValueError(
203+
f"Unsupported Zarr format {zarr_format}. Expected 2 or 3."
204+
) # pragma: no cover
205+
159206

160207
class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]):
161208
"""Base class for array-to-array codecs."""

src/zarr/codecs/bytes.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import sys
44
from dataclasses import dataclass, replace
55
from enum import Enum
6-
from typing import TYPE_CHECKING, cast
6+
from typing import TYPE_CHECKING
77

88
import numpy as np
99

@@ -17,7 +17,6 @@
1717
from typing import Self
1818

1919
from zarr.core.array_spec import ArraySpec
20-
from zarr.core.dtype.common import Endianness
2120

2221

2322
class Endian(Enum):

src/zarr/codecs/gzip.py

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from __future__ import annotations
22

33
import asyncio
4+
from collections.abc import Mapping
45
from dataclasses import dataclass
5-
from typing import TYPE_CHECKING
6+
from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload
67

78
from numcodecs.gzip import GZip
89

9-
from zarr.abc.codec import BytesBytesCodec
10+
from zarr.abc.codec import BytesBytesCodec, CodecConfig_V2
1011
from zarr.core.buffer.cpu import as_numpy_array_wrapper
11-
from zarr.core.common import JSON, parse_named_configuration
12+
from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration
1213
from zarr.registry import register_codec
1314

1415
if TYPE_CHECKING:
@@ -28,6 +29,16 @@ def parse_gzip_level(data: JSON) -> int:
2829
return data
2930

3031

32+
class GZipSettings(TypedDict):
    """Typed-dict shape for the configurable settings of the gzip codec."""

    # Compression level; the only gzip setting declared here.
    level: int
34+
35+
36+
# Zarr format 2 JSON form of the gzip codec: the ``id`` key from
# ``CodecConfig_V2`` (pinned to the literal "gzip") merged with the
# ``GZipSettings`` keys at the same level.
class GZipConfig_V2(CodecConfig_V2[Literal["gzip"]], GZipSettings): ...


# Zarr format 3 JSON form of the gzip codec: a ``name`` pinned to the literal
# "gzip" and the settings nested under ``configuration``.
GZipConfig_V3 = NamedConfig[Literal["gzip"], GZipSettings]
40+
41+
3142
@dataclass(frozen=True)
3243
class GzipCodec(BytesBytesCodec):
3344
is_fixed_size = False
@@ -47,6 +58,50 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
4758
def to_dict(self) -> dict[str, JSON]:
    """Return this codec as a ``{"name": ..., "configuration": ...}`` dict.

    The name is always ``"gzip"`` and the configuration carries ``level``.
    """
    return {"name": "gzip", "configuration": {"level": self.level}}
4960

61+
@overload
62+
def to_json(self, zarr_format: Literal[2]) -> GZipConfig_V2: ...
63+
@overload
64+
def to_json(self, zarr_format: Literal[3]) -> GZipConfig_V3: ...
65+
66+
def to_json(self, zarr_format: ZarrFormat) -> GZipConfig_V2 | GZipConfig_V3:
67+
if zarr_format == 2:
68+
return {"id": "gzip", "level": self.level}
69+
elif zarr_format == 3:
70+
return {"name": "gzip", "configuration": {"level": self.level}}
71+
raise ValueError(
72+
f"Unsupported Zarr format {zarr_format}. Expected 2 or 3."
73+
) # pragma: no cover
74+
75+
@classmethod
76+
def _check_json_v2(cls, data: Mapping[str, object]) -> TypeGuard[GZipConfig_V2]:
77+
return (
78+
set(data.keys()) == {"id", "level"}
79+
and data["id"] == "gzip"
80+
and isinstance(data["level"], int)
81+
)
82+
83+
@classmethod
84+
def _check_json_v3(cls, data: Mapping[str, object]) -> TypeGuard[GZipConfig_V3]:
85+
return (
86+
set(data.keys()) == {"name", "configuration"}
87+
and data["name"] == "gzip"
88+
and isinstance(data["configuration"], dict)
89+
and "level" in data["configuration"]
90+
and isinstance(data["configuration"]["level"], int)
91+
)
92+
93+
@classmethod
94+
def _from_json_v2(cls, data: Mapping[str, object]) -> Self:
95+
if cls._check_json_v2(data):
96+
return cls(level=data["level"])
97+
raise ValueError(f"Invalid GZip JSON data for Zarr format 2: {data!r}")
98+
99+
@classmethod
100+
def _from_json_v3(cls, data: Mapping[str, object]) -> Self:
101+
if cls._check_json_v3(data):
102+
return cls(level=data["configuration"]["level"])
103+
raise ValueError(f"Invalid GZip JSON data for Zarr format 3: {data!r}")
104+
50105
async def _decode_single(
51106
self,
52107
chunk_bytes: Buffer,

src/zarr/codecs/numcodec.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""
2+
Utilities for interfacing with the numcodecs library.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
import asyncio
8+
from collections.abc import Mapping
9+
from dataclasses import dataclass
10+
from typing import TYPE_CHECKING, Literal, Self, overload
11+
12+
import numpy as np
13+
from typing_extensions import Protocol, runtime_checkable
14+
15+
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, CodecConfig_V2
16+
from zarr.core.array_spec import ArraySpec
17+
from zarr.core.buffer.core import Buffer, BufferPrototype, NDArrayLike, NDBuffer
18+
from zarr.core.buffer.cpu import as_numpy_array_wrapper
19+
20+
if TYPE_CHECKING:
21+
from zarr.core.array_spec import ArraySpec
22+
from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat
23+
24+
BufferOrNDArray = Buffer | np.ndarray[tuple[int, ...], np.dtype[np.generic]] | NDArrayLike
25+
26+
27+
def resolve_numcodec(config: CodecConfig_V2[str]) -> Numcodec:
    """Look up a codec via ``numcodecs.get_codec`` from its configuration.

    ``numcodecs`` is imported inside the function rather than at module
    import time.
    """
    import numcodecs

    codec = numcodecs.get_codec(config)
    return codec  # type: ignore[no-any-return]
31+
32+
33+
@runtime_checkable
class Numcodec(Protocol):
    """
    A protocol that models the ``numcodecs.abc.Codec`` interface.

    Because it is ``runtime_checkable``, it can be used with ``isinstance``.
    """

    # Identifier of the codec.
    codec_id: str

    def encode(self, buf: BufferOrNDArray) -> BufferOrNDArray:
        """Counterpart of ``numcodecs.abc.Codec.encode``."""
        ...

    def decode(
        self, buf: BufferOrNDArray, out: BufferOrNDArray | None = None
    ) -> BufferOrNDArray:
        """Counterpart of ``numcodecs.abc.Codec.decode``; ``out`` is optional."""
        ...

    def get_config(self) -> CodecConfig_V2[str]:
        """Return the codec configuration as an ``id``-keyed mapping."""
        ...

    @classmethod
    def from_config(cls, config: CodecConfig_V2[str]) -> Self:
        """Counterpart of ``numcodecs.abc.Codec.from_config``."""
        ...
51+
52+
53+
@dataclass(frozen=True, kw_only=True)
54+
class NumcodecsAdapter:
55+
_codec: Numcodec
56+
57+
@overload
58+
def to_json(self, zarr_format: Literal[2]) -> CodecConfig_V2[str]: ...
59+
@overload
60+
def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ...
61+
62+
def to_json(
63+
self, zarr_format: ZarrFormat
64+
) -> CodecConfig_V2[str] | NamedConfig[str, BaseConfig]:
65+
if zarr_format == 2:
66+
return self._codec.get_config()
67+
elif zarr_format == 3:
68+
config = self._codec.get_config()
69+
config_no_id = {k: v for k, v in config.items() if k != "id"}
70+
return {"name": config["id"], "configuration": config_no_id}
71+
raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover
72+
73+
@classmethod
74+
def _from_json_v2(cls, data: Mapping[str, object]) -> Self:
75+
return cls(_codec=resolve_numcodec(data)) # type: ignore[arg-type]
76+
77+
@classmethod
78+
def _from_json_v3(cls, data: Mapping[str, object]) -> Self:
79+
raise NotImplementedError(
80+
"This class does not support creating instances from JSON data for Zarr format 3."
81+
)
82+
83+
def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
84+
raise NotImplementedError
85+
86+
87+
# Decorated like the sibling adapter classes in this module so that all three
# are declared the same way.
@dataclass(kw_only=True, frozen=True)
class NumcodecsBytesBytesCodec(NumcodecsAdapter, BytesBytesCodec):
    """Bytes-to-bytes codec that delegates to the wrapped ``Numcodec``."""

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """Decode one chunk in a worker thread via ``as_numpy_array_wrapper``."""
        return await asyncio.to_thread(
            as_numpy_array_wrapper,
            self._codec.decode,
            chunk_data,
            chunk_spec.prototype,
        )

    def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer:
        """Encode ``chunk_bytes`` synchronously and wrap the result in a ``Buffer``."""
        encoded = self._codec.encode(chunk_bytes.as_array_like())
        if isinstance(encoded, np.ndarray):  # Required for checksum codecs
            return prototype.buffer.from_bytes(encoded.tobytes())
        return prototype.buffer.from_bytes(encoded)

    async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
        """Encode one chunk by running ``_encode`` in a worker thread."""
        return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype)
104+
105+
106+
@dataclass(kw_only=True, frozen=True)
class NumcodecsArrayCodec(NumcodecsAdapter, ArrayArrayCodec):
    """Array-to-array codec that delegates to the wrapped ``Numcodec``."""

    async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Decode one chunk in a worker thread and reshape it to the chunk shape."""
        source = chunk_data.as_ndarray_like()
        decoded = await asyncio.to_thread(self._codec.decode, source)
        nd_buffer_cls = chunk_spec.prototype.nd_buffer
        return nd_buffer_cls.from_ndarray_like(decoded.reshape(chunk_spec.shape))  # type: ignore[union-attr]

    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Encode one chunk in a worker thread and wrap the result as an ``NDBuffer``."""
        source = chunk_data.as_ndarray_like()
        encoded = await asyncio.to_thread(self._codec.encode, source)
        nd_buffer_cls = chunk_spec.prototype.nd_buffer
        return nd_buffer_cls.from_ndarray_like(encoded)  # type: ignore[arg-type]
117+
118+
119+
@dataclass(kw_only=True, frozen=True)
class NumcodecsArrayBytesCodec(NumcodecsAdapter, ArrayBytesCodec):
    """Array-to-bytes codec that delegates to the wrapped ``Numcodec``."""

    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Decode raw bytes in a worker thread and reshape to the chunk shape."""
        raw = chunk_data.to_bytes()
        decoded = await asyncio.to_thread(self._codec.decode, raw)
        nd_buffer_cls = chunk_spec.prototype.nd_buffer
        return nd_buffer_cls.from_ndarray_like(decoded.reshape(chunk_spec.shape))

    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer:
        """Encode one chunk in a worker thread and wrap the result as a ``Buffer``."""
        source = chunk_data.as_ndarray_like()
        encoded = await asyncio.to_thread(self._codec.encode, source)
        buffer_cls = chunk_spec.prototype.buffer
        return buffer_cls.from_bytes(encoded)

src/zarr/core/array.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from zarr.abc.store import Store, set_or_delete
3131
from zarr.codecs._v2 import V2Codec
3232
from zarr.codecs.bytes import BytesCodec
33+
from zarr.codecs.numcodec import Numcodec
3334
from zarr.core._info import ArrayInfo
3435
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config
3536
from zarr.core.attributes import Attributes
@@ -4717,11 +4718,11 @@ def _parse_chunk_encoding_v3(
47174718
elif compressors == "auto":
47184719
out_bytes_bytes = default_bytes_bytes
47194720
else:
4720-
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
4721-
if isinstance(compressors, dict | Codec):
4721+
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON] | Numcodec]
4722+
if isinstance(compressors, dict | Codec | Numcodec):
47224723
maybe_bytes_bytes = (compressors,)
47234724
else:
4724-
maybe_bytes_bytes = cast("Iterable[Codec | dict[str, JSON]]", compressors)
4725+
maybe_bytes_bytes = compressors # type: ignore[assignment]
47254726

47264727
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
47274728

src/zarr/core/common.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
overload,
2020
)
2121

22+
from typing_extensions import ReadOnly
23+
2224
from zarr.core.config import config as zarr_config
2325

2426
if TYPE_CHECKING:
@@ -43,13 +45,15 @@
4345
ANY_ACCESS_MODE: Final = "r", "r+", "a", "w", "w-"
4446
DimensionNames = Iterable[str | None] | None
4547

46-
TName = TypeVar("TName", bound=str)
47-
TConfig = TypeVar("TConfig", bound=Mapping[str, object])
48+
BaseConfig = Mapping[str, object]
49+
50+
TName_co = TypeVar("TName_co", bound=str, covariant=True)
51+
TConfig_co = TypeVar("TConfig_co", bound=BaseConfig, covariant=True)
4852

4953

50-
class NamedConfig(TypedDict, Generic[TName, TConfig]):
51-
name: TName
52-
configuration: TConfig
54+
class NamedConfig(TypedDict, Generic[TName_co, TConfig_co]):
    """Typed-dict shape pairing a ``name`` with a ``configuration`` mapping.

    Both keys are read-only, and the class is generic over the type of each.
    """

    # The name, typed by ``TName_co`` (bounded by ``str``).
    name: ReadOnly[TName_co]
    # The configuration, typed by ``TConfig_co`` (bounded by ``BaseConfig``).
    configuration: ReadOnly[TConfig_co]
5357

5458

5559
def product(tup: ChunkCoords) -> int:

0 commit comments

Comments
 (0)