diff --git a/changes/3892.feature.md b/changes/3892.feature.md new file mode 100644 index 0000000000..a602602209 --- /dev/null +++ b/changes/3892.feature.md @@ -0,0 +1 @@ +Add `codec_class_map` and `codec_pipeline_class` fields to the runtime array configuration. This allows explicitly declaring the codec classes and codec pipeline class to use when reading an array, as well as dynamically swapping out the codec classes or the codec pipeline class on an existing `zarr.Array`. \ No newline at end of file diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index c776176665..2664cf9dea 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1242,6 +1242,7 @@ async def open_array( zarr_format: ZarrFormat | None = None, path: PathLike = "", storage_options: dict[str, Any] | None = None, + config: ArrayConfigLike | None = None, **kwargs: Any, # TODO: type kwargs as valid args to save ) -> AnyAsyncArray: """Open an array using file-mode-like semantics. @@ -1261,6 +1262,8 @@ async def open_array( storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. + config : ArrayConfigLike + Declaration of the runtime configuration for the array. **kwargs Any keyword arguments to pass to [`create`][zarr.api.asynchronous.create]. 
@@ -1279,7 +1282,7 @@ async def open_array( _warn_write_empty_chunks_kwarg() try: - return await AsyncArray.open(store_path, zarr_format=zarr_format) + return await AsyncArray.open(store_path, zarr_format=zarr_format, config=config) except FileNotFoundError as err: if not store_path.read_only and mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index a865f97646..8f404d5eb6 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -1369,6 +1369,7 @@ def open_array( zarr_format: ZarrFormat | None = None, path: PathLike = "", storage_options: dict[str, Any] | None = None, + config: ArrayConfigLike | None = None, **kwargs: Any, ) -> AnyArray: """Open an array using file-mode-like semantics. @@ -1388,6 +1389,8 @@ def open_array( storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. + config : ArrayConfigLike + Declaration of the runtime configuration for the array. **kwargs Any keyword arguments to pass to [`create`][zarr.api.asynchronous.create]. 
@@ -1405,6 +1408,7 @@ def open_array( zarr_format=zarr_format, path=path, storage_options=storage_options, + config=config, **kwargs, ) ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 609e32f87d..03bbd0b89e 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -26,7 +26,7 @@ ) from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig, parse_codec_class_map from zarr.core.buffer import ( Buffer, BufferPrototype, @@ -319,10 +319,13 @@ def __init__( codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),), index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()), index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end, + codec_class_map: Mapping[str, type[Codec]] | None = None, ) -> None: + if codec_class_map is None: + codec_class_map = parse_codec_class_map(None) chunk_shape_parsed = parse_shapelike(chunk_shape) - codecs_parsed = parse_codecs(codecs) - index_codecs_parsed = parse_codecs(index_codecs) + codecs_parsed = parse_codecs(codecs, codec_class_map=codec_class_map) + index_codecs_parsed = parse_codecs(index_codecs, codec_class_map=codec_class_map) index_location_parsed = parse_index_location(index_location) object.__setattr__(self, "chunk_shape", chunk_shape_parsed) @@ -345,9 +348,16 @@ def __getstate__(self) -> dict[str, Any]: def __setstate__(self, state: dict[str, Any]) -> None: config = state["configuration"] + codec_class_map = parse_codec_class_map(None) object.__setattr__(self, "chunk_shape", parse_shapelike(config["chunk_shape"])) - object.__setattr__(self, "codecs", parse_codecs(config["codecs"])) - object.__setattr__(self, "index_codecs", parse_codecs(config["index_codecs"])) + object.__setattr__( + self, "codecs", parse_codecs(config["codecs"], codec_class_map=codec_class_map) + ) + object.__setattr__( 
+ self, + "index_codecs", + parse_codecs(config["index_codecs"], codec_class_map=codec_class_map), + ) object.__setattr__(self, "index_location", parse_index_location(config["index_location"])) # Use instance-local lru_cache to avoid memory leaks @@ -737,7 +747,7 @@ def _get_index_chunk_spec(self, chunks_per_shard: tuple[int, ...]) -> ArraySpec: shape=chunks_per_shard + (2,), dtype=UInt64(endianness="little"), fill_value=MAX_UINT_64, - config=ArrayConfig( + config=ArraySpecConfig( order="C", write_empty_chunks=False ), # Note: this is hard-coded for simplicity -- it is not surfaced into user code, prototype=default_buffer_prototype(), diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 4736805b9d..a01b21eb5f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -28,7 +28,13 @@ from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config +from zarr.core.array_spec import ( + ArrayConfig, + ArrayConfigLike, + ArraySpec, + ArraySpecConfig, + parse_array_config, +) from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -122,6 +128,7 @@ ChunkGridMetadata, RectilinearChunkGridMetadata, RegularChunkGridMetadata, + parse_codecs, parse_node_type_array, resolve_chunks, ) @@ -197,13 +204,19 @@ def _chunk_sizes_from_shape( return tuple(result) -def parse_array_metadata(data: Any) -> ArrayMetadata: - if isinstance(data, ArrayMetadata): +def parse_array_metadata(data: object, codec_class_map: Mapping[str, type[Codec]]) -> ArrayMetadata: + if isinstance(data, ArrayV3Metadata): + new_codecs = parse_codecs( + [c.to_dict() for c in data.codecs], codec_class_map=codec_class_map + ) + return replace(data, codecs=new_codecs) + elif isinstance(data, ArrayV2Metadata): + # V2 arrays get their codecs from numcodecs, for now. 
the codec class map is not used. return data elif isinstance(data, dict): zarr_format = data.get("zarr_format") if zarr_format == 3: - meta_out = ArrayV3Metadata.from_dict(data) + meta_out = ArrayV3Metadata.from_dict(data, codec_class_map=codec_class_map) if len(meta_out.storage_transformers) > 0: msg = ( f"Array metadata contains storage transformers: {meta_out.storage_transformers}." @@ -218,20 +231,31 @@ def parse_array_metadata(data: Any) -> ArrayMetadata: raise TypeError # pragma: no cover -def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None) -> CodecPipeline: +def create_codec_pipeline( + metadata: ArrayMetadata, + *, + store: Store | None = None, + config: ArrayConfig | None = None, +) -> CodecPipeline: + pipeline_class: type[CodecPipeline] + if config is not None: + pipeline_class = config.codec_pipeline_class + else: + pipeline_class = get_pipeline_class() + if store is not None: try: - return get_pipeline_class().from_array_metadata_and_store( + return pipeline_class.from_array_metadata_and_store( array_metadata=metadata, store=store ) except NotImplementedError: pass if isinstance(metadata, ArrayV3Metadata): - return get_pipeline_class().from_codecs(metadata.codecs) + return pipeline_class.from_codecs(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) - return get_pipeline_class().from_codecs([v2_codec]) + return pipeline_class.from_codecs([v2_codec]) raise TypeError # pragma: no cover @@ -353,8 +377,10 @@ def __init__( store_path: StorePath, config: ArrayConfigLike | None = None, ) -> None: - metadata_parsed = parse_array_metadata(metadata) config_parsed = parse_array_config(config) + metadata_parsed = parse_array_metadata( + metadata, codec_class_map=config_parsed.codec_class_map + ) object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) @@ -363,7 +389,9 @@ def __init__( 
object.__setattr__( self, "codec_pipeline", - create_codec_pipeline(metadata=metadata_parsed, store=store_path.store), + create_codec_pipeline( + metadata=metadata_parsed, store=store_path.store, config=config_parsed + ), ) # this overload defines the function signature when zarr_format is 2 @@ -779,6 +807,7 @@ def _create_metadata_v3( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: DimensionNamesLike = None, attributes: dict[str, JSON] | None = None, + codec_class_map: Mapping[str, type[Codec]] | None = None, ) -> ArrayV3Metadata: """Create an instance of ArrayV3Metadata.""" filters: tuple[ArrayArrayCodec, ...] @@ -816,6 +845,7 @@ def _create_metadata_v3( codecs=codecs_parsed, # type: ignore[arg-type] dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, + codec_class_map=codec_class_map, ) @classmethod @@ -863,6 +893,7 @@ async def _create_v3( codecs=codecs, dimension_names=dimension_names, attributes=attributes, + codec_class_map=config.codec_class_map, ) array = cls(metadata=metadata, store_path=store_path, config=config) @@ -987,7 +1018,9 @@ def from_dict( ValueError If the dictionary data is invalid or incompatible with either Zarr format 2 or 3 array creation. """ - metadata = parse_array_metadata(data) + from zarr.core.array_spec import parse_codec_class_map + + metadata = parse_array_metadata(data, codec_class_map=parse_codec_class_map(None)) return cls(metadata=metadata, store_path=store_path) @classmethod @@ -995,6 +1028,8 @@ async def open( cls, store: StoreLike, zarr_format: ZarrFormat | None = 3, + *, + config: ArrayConfigLike | None = None, ) -> AnyAsyncArray: """ Async method to open an existing Zarr array from a given store. @@ -1007,6 +1042,8 @@ async def open( for a description of all valid StoreLike values. zarr_format : ZarrFormat | None, optional The Zarr format version (default is 3). 
+ config : ArrayConfigLike | None, (default is None) + Runtime configuration for the array. Returns ------- @@ -1038,7 +1075,7 @@ async def example(): metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format) # TODO: remove this cast when we have better type hints _metadata_dict = cast("ArrayMetadataJSON_V3", metadata_dict) - return cls(store_path=store_path, metadata=_metadata_dict) + return cls(store_path=store_path, metadata=_metadata_dict, config=config) @property def store(self) -> Store: @@ -4704,7 +4741,7 @@ async def init_array( chunk_key_encoding: ChunkKeyEncodingLike | None = None, dimension_names: DimensionNamesLike = None, overwrite: bool = False, - config: ArrayConfigLike | None = None, + config: ArrayConfig | None = None, ) -> AnyAsyncArray: """Create and persist an array metadata document. @@ -4942,6 +4979,7 @@ async def init_array( codecs=codecs_out, dimension_names=dimension_names, attributes=attributes, + codec_class_map=config.codec_class_map if config is not None else None, ) arr = AsyncArray(metadata=meta, store_path=store_path, config=config) @@ -5139,7 +5177,7 @@ async def create_array( chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, overwrite=overwrite, - config=config, + config=parse_array_config(config), ) @@ -5769,11 +5807,16 @@ def _get_chunk_spec( spec = chunk_grid[chunk_coords] if spec is None: raise IndexError(f"Chunk coordinates {chunk_coords} are out of bounds.") + spec_config = ArraySpecConfig( + order=array_config.order, + read_missing_chunks=array_config.read_missing_chunks, + write_empty_chunks=array_config.write_empty_chunks, + ) return ArraySpec( shape=spec.codec_shape, dtype=metadata.dtype, fill_value=metadata.fill_value, - config=array_config, + config=spec_config, prototype=prototype, ) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 2b5eb0191c..a187356b5c 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -1,7 +1,9 @@ from 
__future__ import annotations from dataclasses import dataclass, fields -from typing import TYPE_CHECKING, Any, Literal, Self, TypedDict, cast +from typing import TYPE_CHECKING, Any, Final, Literal, Self, cast + +from typing_extensions import TypedDict from zarr.core.common import ( MemoryOrder, @@ -13,13 +15,42 @@ from zarr.core.config import config as zarr_config if TYPE_CHECKING: + from collections.abc import Mapping from typing import NotRequired + from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + BytesBytesCodec, + Codec, + CodecPipeline, + ) from zarr.core.buffer import BufferPrototype from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType -class ArrayConfigParams(TypedDict): +class CodecPipelineRequest(TypedDict): + """ + A dictionary model of a request for a codec pipeline. + """ + + class_path: str + options: NotRequired[dict[str, object]] + + +class ArrayConfigParams(TypedDict, closed=True): # type: ignore[call-arg] + """ + A TypedDict model of the attributes of an ArrayConfig class. + """ + + order: MemoryOrder + write_empty_chunks: bool + read_missing_chunks: bool + codec_class_map: Mapping[str, type[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec]] + codec_pipeline_class: type[CodecPipeline] + + +class ArrayConfigRequest(TypedDict, closed=True): # type: ignore[call-arg] """ A TypedDict model of the attributes of an ArrayConfig class, but with no required fields. 
This allows for partial construction of an ArrayConfig, with the assumption that the unset @@ -29,6 +60,29 @@ class ArrayConfigParams(TypedDict): order: NotRequired[MemoryOrder] write_empty_chunks: NotRequired[bool] read_missing_chunks: NotRequired[bool] + codec_class_map: NotRequired[ + Mapping[str, type[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec]] + ] + codec_pipeline_class: NotRequired[type[CodecPipeline]] + + +ArrayConfigKeys = Literal[ + "order", "write_empty_chunks", "read_missing_chunks", "codec_class_map", "codec_pipeline_class" +] + +ARRAY_CONFIG_PARAMS_KEYS: Final[set[str]] = { + "order", + "write_empty_chunks", + "read_missing_chunks", + "codec_class_map", + "codec_pipeline_class", +} +ARRAY_CONFIG_PARAMS_KEYS_STATIC: Final[set[str]] = { + "order", + "write_empty_chunks", + "read_missing_chunks", +} +"""The keys of the ArrayConfigParams object that are static and retrievable from the config""" @dataclass(frozen=True) @@ -38,6 +92,24 @@ class ArrayConfig: Parameters ---------- + order : MemoryOrder + The memory layout of the arrays returned when reading data from the store. + write_empty_chunks : bool + If True, empty chunks will be written to the store. + read_missing_chunks : bool, default is True + If True, missing chunks will be filled with the array's fill value on read. + If False, reading missing chunks will raise a ``ChunkNotFoundError``. + codec_class_map : Mapping[str, type[Codec]] | None, default is None + A mapping from codec name to codec class that defines the codec classes available + for array creation. Defaults to `None`, in which case the map is built from the + codec registry, using the codec names listed in the global config object. + codec_pipeline_class : type[CodecPipeline] | None, default is None + The codec pipeline class to be used for orchestrating chunk encoding and + decoding. Defaults to `None`, in which case the default codec pipeline class + is obtained from ``zarr.registry.get_pipeline_class``. 
+ + Attributes + ---------- order : MemoryOrder The memory layout of the arrays returned when reading data from the store. write_empty_chunks : bool @@ -45,37 +117,62 @@ class ArrayConfig: read_missing_chunks : bool If True, missing chunks will be filled with the array's fill value on read. If False, reading missing chunks will raise a ``ChunkNotFoundError``. + codec_class_map : Mapping[str, object] + A codec name : codec class mapping that defines the codec classes available + for array creation. + codec_pipeline_class : type[CodecPipeline] + A codec pipeline class that will be used for orchestrating chunk encoding and + decoding. """ order: MemoryOrder write_empty_chunks: bool read_missing_chunks: bool + codec_class_map: Mapping[str, type[Codec]] + codec_pipeline_class: type[CodecPipeline] def __init__( - self, order: MemoryOrder, write_empty_chunks: bool, *, read_missing_chunks: bool = True + self, + order: MemoryOrder, + write_empty_chunks: bool, + *, + read_missing_chunks: bool = True, + codec_class_map: Mapping[str, type[ArrayBytesCodec | ArrayArrayCodec | BytesBytesCodec]] + | None = None, + codec_pipeline_class: type[CodecPipeline] | None = None, ) -> None: order_parsed = parse_order(order) write_empty_chunks_parsed = parse_bool(write_empty_chunks) read_missing_chunks_parsed = parse_bool(read_missing_chunks) + codec_class_map_parsed = parse_codec_class_map(codec_class_map) + codec_pipeline_class_parsed = parse_codec_pipeline_class(codec_pipeline_class) object.__setattr__(self, "order", order_parsed) object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) object.__setattr__(self, "read_missing_chunks", read_missing_chunks_parsed) + object.__setattr__(self, "codec_class_map", codec_class_map_parsed) + object.__setattr__(self, "codec_pipeline_class", codec_pipeline_class_parsed) @classmethod - def from_dict(cls, data: ArrayConfigParams) -> Self: + def from_dict(cls, data: ArrayConfigRequest) -> Self: """ Create an ArrayConfig from a dict. 
The keys of that dict are a subset of the attributes of the ArrayConfig class. Any keys missing from that dict will be set to the the values in the ``array`` namespace of ``zarr.config``. """ - kwargs_out: ArrayConfigParams = {} + kwargs_out: ArrayConfigRequest = {} for f in fields(ArrayConfig): field_name = cast( - "Literal['order', 'write_empty_chunks', 'read_missing_chunks']", f.name + "Literal['order', 'write_empty_chunks', 'read_missing_chunks', 'codec_class_map', 'codec_pipeline_class']", + f.name, ) if field_name not in data: - kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") + if field_name in ARRAY_CONFIG_PARAMS_KEYS_STATIC: + kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") + elif field_name == "codec_class_map": + kwargs_out["codec_class_map"] = parse_codec_class_map(None) + elif field_name == "codec_pipeline_class": + kwargs_out["codec_pipeline_class"] = parse_codec_pipeline_class(None) else: kwargs_out[field_name] = data[field_name] return cls(**kwargs_out) @@ -88,10 +185,33 @@ def to_dict(self) -> ArrayConfigParams: "order": self.order, "write_empty_chunks": self.write_empty_chunks, "read_missing_chunks": self.read_missing_chunks, + "codec_class_map": self.codec_class_map, + "codec_pipeline_class": self.codec_pipeline_class, } -ArrayConfigLike = ArrayConfig | ArrayConfigParams +ArrayConfigLike = ArrayConfig | ArrayConfigRequest + + +def parse_codec_pipeline_class(obj: type[CodecPipeline] | None) -> type[CodecPipeline]: + if obj is None: + from zarr.registry import get_pipeline_class + + return get_pipeline_class() + return obj + + +def parse_codec_class_map(obj: Mapping[str, type[Codec]] | None) -> Mapping[str, type[Codec]]: + """ + Convert a request for a codec class map into an actual Mapping[str, type[Codec]]. + If the input is `None`, build the map from the codec registry. 
+ """ + if obj is None: + from zarr.registry import get_codec_class + + name_map: dict[str, str] = zarr_config.get("codecs", {}) + return {key: get_codec_class(key) for key in name_map} + return obj def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: @@ -106,12 +226,19 @@ def parse_array_config(data: ArrayConfigLike | None) -> ArrayConfig: return ArrayConfig.from_dict(data) +@dataclass(frozen=True) +class ArraySpecConfig: + order: MemoryOrder + write_empty_chunks: bool + read_missing_chunks: bool = False + + @dataclass(frozen=True) class ArraySpec: shape: tuple[int, ...] dtype: ZDType[TBaseDType, TBaseScalar] fill_value: Any - config: ArrayConfig + config: ArraySpecConfig prototype: BufferPrototype def __init__( @@ -119,12 +246,12 @@ def __init__( shape: tuple[int, ...], dtype: ZDType[TBaseDType, TBaseScalar], fill_value: Any, - config: ArrayConfig, + config: ArraySpecConfig, prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) fill_value_parsed = parse_fill_value(fill_value) - + assert isinstance(config, ArraySpecConfig) object.__setattr__(self, "shape", shape_parsed) object.__setattr__(self, "dtype", dtype) object.__setattr__(self, "fill_value", fill_value_parsed) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 8626d480a7..ce66e2368b 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -30,7 +30,7 @@ import numpy as np -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec, ArraySpecConfig from zarr.core.chunk_key_encodings import parse_separator from zarr.core.common import ( JSON, @@ -242,11 +242,16 @@ def to_dict(self) -> dict[str, JSON]: def get_chunk_spec( self, _chunk_coords: tuple[int, ...], array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: + spec_config = ArraySpecConfig( + order=array_config.order, + read_missing_chunks=array_config.read_missing_chunks, + 
write_empty_chunks=array_config.write_empty_chunks, + ) return ArraySpec( shape=self.chunks, dtype=self.dtype, fill_value=self.fill_value, - config=array_config, + config=spec_config, prototype=prototype, ) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 7773e2489d..c370680b41 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -7,7 +7,7 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.metadata import Metadata -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec, ArraySpecConfig from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, @@ -33,7 +33,6 @@ from zarr.core.dtype.common import check_dtype_spec_v3 from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError, UnknownCodecError -from zarr.registry import get_codec_class if TYPE_CHECKING: from typing import Self @@ -56,7 +55,7 @@ def parse_node_type_array(data: object) -> Literal["array"]: raise NodeTypeValidationError(msg) -def parse_codecs(data: object) -> tuple[Codec, ...]: +def parse_codecs(data: object, codec_class_map: Mapping[str, type[Codec]]) -> tuple[Codec, ...]: out: tuple[Codec, ...] 
= () if not isinstance(data, Iterable): @@ -71,7 +70,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: name_parsed, _ = parse_named_configuration(c, require_configuration=False) try: - out += (get_codec_class(name_parsed).from_dict(c),) + out += (codec_class_map[name_parsed].from_dict(c),) except KeyError as e: raise UnknownCodecError(f"Unknown codec: {e.args[0]!r}") from e @@ -460,11 +459,14 @@ def __init__( dimension_names: DimensionNamesLike, storage_transformers: Iterable[dict[str, JSON]] | None = None, extra_fields: Mapping[str, AllowedExtraField] | None = None, + codec_class_map: Mapping[str, type[Codec]] | None = None, ) -> None: """ Because the class is a frozen dataclass, we set attributes using object.__setattr__ """ + from zarr.core.array_spec import parse_codec_class_map + codec_class_map_parsed = parse_codec_class_map(codec_class_map) shape_parsed = parse_shapelike(shape) chunk_grid_parsed = parse_chunk_grid(chunk_grid) chunk_key_encoding_parsed = parse_chunk_key_encoding(chunk_key_encoding) @@ -472,14 +474,16 @@ def __init__( # Note: relying on a type method is numpy-specific fill_value_parsed = data_type.cast_scalar(fill_value) attributes_parsed = parse_attributes(attributes) - codecs_parsed_partial = parse_codecs(codecs) + codecs_parsed_partial = parse_codecs(codecs, codec_class_map_parsed) storage_transformers_parsed = parse_storage_transformers(storage_transformers) extra_fields_parsed = parse_extra_fields(extra_fields) array_spec = ArraySpec( shape=shape_parsed, dtype=data_type, fill_value=fill_value_parsed, - config=ArrayConfig.from_dict({}), # TODO: config is not needed here. + config=ArraySpecConfig( + write_empty_chunks=False, order="C" + ), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
) codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial) @@ -573,7 +577,9 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: } @classmethod - def from_dict(cls, data: dict[str, JSON]) -> Self: + def from_dict( + cls, data: dict[str, JSON], *, codec_class_map: Mapping[str, type[Codec]] | None = None + ) -> Self: # make a copy because we are modifying the dict _data = data.copy() @@ -626,6 +632,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: data_type=data_type, extra_fields=allowed_extra_fields, storage_transformers=_data_typed.get("storage_transformers", ()), # type: ignore[arg-type] + codec_class_map=codec_class_map, ) def to_dict(self) -> dict[str, JSON]: @@ -663,3 +670,12 @@ def update_shape(self, shape: tuple[int, ...]) -> Self: def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) + + def with_config(self, config: ArrayConfig | None) -> Self: + """ + Return a copy of this metadata with a new configuration object. 
+ """ + return type(self).from_dict( + self.to_dict(), + codec_class_map=config.codec_class_map if config is not None else None, + ) diff --git a/tests/test_array.py b/tests/test_array.py index f7f564f30e..4c412e1151 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -44,10 +44,11 @@ default_filters_v2, default_serializer_v3, ) -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, ArrayConfigRequest from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.chunk_grids import _auto_partition from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams +from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.common import JSON, ZarrFormat, ceildiv from zarr.core.dtype import ( DateTime64, @@ -2287,13 +2288,13 @@ def test_shard_write_num_gets(selection: slice, expected_gets: int) -> None: @pytest.mark.parametrize("config", [{}, {"write_empty_chunks": True}, {"order": "C"}]) -def test_with_config(config: ArrayConfigParams) -> None: +def test_with_config(config: ArrayConfigRequest) -> None: """ Test that `AsyncArray.with_config` and `Array.with_config` create a copy of the source array with a new runtime configuration. """ # the config we start with - source_config: ArrayConfigParams = {"write_empty_chunks": False, "order": "F"} + source_config: ArrayConfigRequest = {"write_empty_chunks": False, "order": "F"} source_array = zarr.create_array({}, shape=(1,), dtype="uint8", config=source_config) new_async_array_config_dict = source_array._async_array.with_config(config).config.to_dict() @@ -2314,10 +2315,45 @@ def test_with_config_polymorphism() -> None: objects. 
""" source_config: ArrayConfig = ArrayConfig.from_dict({"write_empty_chunks": False, "order": "F"}) - source_config_dict = source_config.to_dict() + source_config_dict: ArrayConfigParams = source_config.to_dict() arr = zarr.create_array({}, shape=(1,), dtype="uint8") arr_source_config = arr.with_config(source_config) - arr_source_config_dict = arr.with_config(source_config_dict) + arr_source_config_dict = arr.with_config(source_config_dict) # type: ignore[arg-type] assert arr_source_config.config == arr_source_config_dict.config + + +def test_array_config_specify_codecs() -> None: + """ + Test that we can use the array config to define the codec classes available to the array + """ + + class FakeGzipCodec(GzipCodec): ... + + store = {} # type: ignore[var-annotated] + arr = zarr.create_array(store, shape=(1,), dtype="uint8", compressors=GzipCodec()) + new_config: ArrayConfigRequest = { + "codec_class_map": {**arr.config.codec_class_map, "gzip": FakeGzipCodec} + } + arr_2 = arr.with_config(new_config) + assert isinstance(arr_2.compressors[0], FakeGzipCodec) + + arr_3 = zarr.open_array(store=store, config=new_config) + assert isinstance(arr_3.compressors[0], FakeGzipCodec) + + +def test_aray_config_specify_codecpipeline() -> None: + """ + Test that we can use the array configuration to open an array with a different codec pipeline + """ + store = {} # type: ignore[var-annotated] + + class FakeCodecPipeline(BatchedCodecPipeline): ... 
+ + arr = zarr.create_array( + store, shape=(1,), dtype="uint8", config={"codec_pipeline_class": FakeCodecPipeline} + ) + assert isinstance(arr.async_array.codec_pipeline, FakeCodecPipeline) + arr_2 = arr.with_config({"codec_pipeline_class": BatchedCodecPipeline}) + assert isinstance(arr_2.async_array.codec_pipeline, BatchedCodecPipeline) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 0201beb8de..717ac4574a 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -9,7 +9,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.codecs import BloscCodec from zarr.codecs.blosc import BloscShuffle, Shuffle -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import UInt16, get_data_type_from_native_dtype from zarr.storage import MemoryStore, StorePath @@ -125,7 +125,7 @@ def test_blosc_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) diff --git a/tests/test_codecs/test_crc32c.py b/tests/test_codecs/test_crc32c.py index 3ab1070f60..941531100e 100644 --- a/tests/test_codecs/test_crc32c.py +++ b/tests/test_codecs/test_crc32c.py @@ -4,7 +4,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype @@ -21,7 +21,7 @@ def test_crc32c_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - 
config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index c505cee828..9d043c6790 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -7,7 +7,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import BytesCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import NDBuffer, default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath @@ -49,7 +49,7 @@ def test_bytes_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) nd_buf: NDBuffer = default_buffer_prototype().nd_buffer.from_numpy_array(arr) diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 8932ba5e59..feeb9f9949 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -5,7 +5,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import GzipCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath @@ -40,7 +40,7 @@ def test_gzip_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + 
config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 949bb72a62..16fe2e6bb5 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -6,7 +6,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import TransposeCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import NDBuffer, default_buffer_prototype from zarr.core.common import MemoryOrder from zarr.core.dtype import get_data_type_from_native_dtype @@ -111,7 +111,7 @@ def test_transpose_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), ) nd_buf: NDBuffer = default_buffer_prototype().nd_buffer.from_numpy_array(arr) diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 3f3f15a41a..199b77a941 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -5,7 +5,7 @@ from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath @@ -41,7 +41,7 @@ def test_zstd_codec_sync_roundtrip() -> None: shape=arr.shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), 
prototype=default_buffer_prototype(), ) buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 8658b7b393..974f4d89c3 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -345,7 +345,7 @@ def test_parse_codecs_unknown_codec_raises(monkeypatch: pytest.MonkeyPatch) -> N codecs = [{"name": "unknown"}] with pytest.raises(UnknownCodecError): - parse_codecs(codecs) + parse_codecs(codecs, codec_class_map={}) @pytest.mark.parametrize( diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index 1bfde7c837..4e0d9f4484 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -11,7 +11,7 @@ from zarr.codecs.gzip import GzipCodec from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.array_spec import ArraySpec, ArraySpecConfig from zarr.core.buffer import Buffer, NDBuffer, default_buffer_prototype from zarr.core.codec_pipeline import ChunkTransform from zarr.core.dtype import get_data_type_from_native_dtype @@ -38,7 +38,7 @@ def _make_array_spec(shape: tuple[int, ...], dtype: np.dtype[np.generic]) -> Arr shape=shape, dtype=zdtype, fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), + config=ArraySpecConfig(order="C", write_empty_chunks=True), prototype=default_buffer_prototype(), )