-
Notifications
You must be signed in to change notification settings - Fork 3
feat: implement scale-offset and data type casting via codecs #154
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
d-v-b
wants to merge
5
commits into
EOPF-Explorer:main
Choose a base branch
from
d-v-b:feat/scale-offset-codecs
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
24477e0
feat: implement scale-offset and data type casting via codecs
d-v-b aee3fde
fix: fix dependency declaration
d-v-b df56d50
chore: use latest version of cast value
d-v-b a2f4386
chore: make cast-value a project dependency
d-v-b c16bdb6
test: expand test coverage
d-v-b File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
"""Register the ScaleOffset codec with zarr's global codec registry on import."""

from zarr.registry import register_codec

from eopf_geozarr.codecs.scale_offset import ScaleOffset

# Importing this package makes the codec resolvable by name from zarr array
# metadata, e.g. {"name": "scale_offset", "configuration": {...}}.
register_codec("scale_offset", ScaleOffset)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,110 @@ | ||
| """ | ||
| Zarr V3 array-to-array codec implementing the scale_offset extension. | ||
|
|
||
| Encode: out = (in - offset) * scale | ||
| Decode: out = (in / scale) + offset | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from dataclasses import dataclass | ||
| from typing import TYPE_CHECKING, Self | ||
|
|
||
| from zarr.abc.codec import ArrayArrayCodec | ||
| from zarr.core.common import JSON, parse_named_configuration | ||
|
|
||
| if TYPE_CHECKING: | ||
| from zarr.core.array_spec import ArraySpec | ||
| from zarr.core.chunk_grids import ChunkGrid | ||
| from zarr.core.dtype import ZDType | ||
| from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar | ||
| from zarr.core.ndbuffer import NDBuffer | ||
|
|
||
|
|
||
@dataclass(frozen=True)
class ScaleOffset(ArrayArrayCodec):
    """Array-to-array codec that applies a linear scale and offset transformation.

    Encode: out = (in - offset) * scale
    Decode: out = (in / scale) + offset

    Attributes
    ----------
    offset : float
        Value subtracted before scaling on encode and added back on decode.
    scale : float
        Multiplier applied on encode, divisor on decode. Must be non-zero.
    """

    # The elementwise transform never changes the chunk's shape or element
    # count, so the encoded size equals the input size.
    is_fixed_size = True

    offset: float
    scale: float

    def __init__(self, *, offset: float = 0.0, scale: float = 1.0) -> None:
        if float(scale) == 0.0:
            # A zero scale would make decode divide by zero, which under
            # numpy yields silent inf/nan rather than an error — reject it
            # up front at construction time.
            raise ValueError("scale must be non-zero")
        # The dataclass is frozen, so fields must be set via object.__setattr__.
        object.__setattr__(self, "offset", float(offset))
        object.__setattr__(self, "scale", float(scale))

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build a ScaleOffset from its zarr metadata dict (name + configuration)."""
        _, configuration_parsed = parse_named_configuration(data, "scale_offset")
        return cls(**configuration_parsed)

    def to_dict(self) -> dict[str, JSON]:
        """Return the zarr metadata representation of this codec."""
        return {
            "name": "scale_offset",
            "configuration": {"offset": self.offset, "scale": self.scale},
        }

    def validate(
        self,
        shape: tuple[int, ...],
        dtype: ZDType[TBaseDType, TBaseScalar],
        chunk_grid: ChunkGrid,
    ) -> None:
        """No array-level constraints: the transform is elementwise on any shape/dtype."""

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        """This codec needs no per-array specialization."""
        return self

    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        """Shape and dtype are unchanged by this codec."""
        return chunk_spec

    async def _decode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        # Pure numpy arithmetic; delegate to the synchronous implementation.
        return self._decode_sync(chunk_array, chunk_spec)

    def _decode_sync(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Apply ``out = (in / scale) + offset`` to the chunk."""
        data = chunk_array.as_numpy_array()
        decoded = (data / self.scale) + self.offset
        return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer | None:
        # Pure numpy arithmetic; delegate to the synchronous implementation.
        return self._encode_sync(chunk_array, chunk_spec)

    def _encode_sync(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer | None:
        """Apply ``out = (in - offset) * scale`` to the chunk.

        The result is built with the chunk spec's buffer prototype, mirroring
        the decode path. (Previously this called ``from_numpy_array`` on the
        input buffer instance, which ignored the spec's prototype.)
        """
        data = chunk_array.as_numpy_array()
        encoded = (data - self.offset) * self.scale
        return chunk_spec.prototype.nd_buffer.from_numpy_array(encoded)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        """Elementwise transform: encoded byte length equals the input byte length."""
        return input_byte_length
|
|
||
|
|
||
def scale_offset_from_cf(*, scale_factor: float, add_offset: float) -> ScaleOffset:
    """
    Build a ScaleOffset codec equivalent to CF-convention packing parameters.

    CF convention unpacks stored values as ``unpacked = packed * scale_factor + add_offset``.
    The ScaleOffset codec encodes as ``(in - offset) * scale`` and decodes as
    ``(in / scale) + offset``, so matching CF requires ``offset = add_offset``
    and ``scale = 1 / scale_factor``.
    """
    codec_scale = 1.0 / scale_factor
    return ScaleOffset(offset=add_offset, scale=codec_scale)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,86 @@ | ||
| # Test that the ScaleOffset codec combined with the CastValueRustV1 codec from the | ||
| # cast_value library works correctly. Correctness is measured by the ability of | ||
| # these two codecs to define a procedure that saves an array of floats ranging from -1000 to 1000 | ||
| # as uint16 array. The offset should be the minimum value of the array, but the scale can be 1 in | ||
| # this case. | ||
|
|
||
| import numpy as np | ||
| import zarr | ||
| import zarr.storage | ||
| from cast_value import CastValueRustV1 | ||
|
|
||
| from eopf_geozarr.codecs.scale_offset import ScaleOffset, scale_offset_from_cf | ||
|
|
||
|
|
||
def test_scale_offset_with_cast_value() -> None:
    """
    Round-trip test: write float64 data through ScaleOffset + CastValueRustV1,
    verify it is stored as uint16, and read back as the original float64 values.
    """
    original = np.linspace(-1000, 1000, 2001, dtype="float64")
    # Shift by the minimum so packed values are non-negative; a unit scale
    # suffices because the shifted range (0..2000) fits in uint16.
    shift = float(original.min())  # -1000.0

    memory_store = zarr.storage.MemoryStore()
    array = zarr.open_array(
        memory_store,
        mode="w",
        shape=original.shape,
        dtype="float64",
        codecs=[
            ScaleOffset(offset=shift, scale=1.0),
            CastValueRustV1(data_type="uint16", rounding="nearest-even"),
            zarr.codecs.BytesCodec(),
        ],
    )

    array[:] = original
    round_tripped = array[:]

    np.testing.assert_array_almost_equal(round_tripped, original)
|
|
||
|
|
||
def test_cf_scale_offset_pushed_into_codecs() -> None:
    """
    Given CF-convention scale_factor and add_offset, generate a ScaleOffset codec
    that replicates the CF behavior at the zarr chunk level, paired with a
    CastValueRustV1 codec for the packed integer dtype.

    CF convention: unpacked = packed * scale_factor + add_offset
    """
    cf_scale_factor = 0.01
    cf_add_offset = 273.15
    packed_dtype = "int16"

    # Build the "unpacked" (decoded) float data that the user sees.
    packed = np.arange(-1000, 1001, dtype=packed_dtype)
    unpacked = packed * cf_scale_factor + cf_add_offset

    # Derive the codec pair from the CF parameters.
    scale_offset_codec = scale_offset_from_cf(
        scale_factor=cf_scale_factor, add_offset=cf_add_offset
    )
    cast_codec = CastValueRustV1(data_type=packed_dtype, rounding="nearest-even")

    # Write the unpacked float data through the codec pipeline.
    array = zarr.open_array(
        zarr.storage.MemoryStore(),
        mode="w",
        shape=unpacked.shape,
        dtype=unpacked.dtype,
        codecs=[scale_offset_codec, cast_codec, zarr.codecs.BytesCodec()],
    )

    array[:] = unpacked
    round_tripped = array[:]

    # The round-trip should recover the original unpacked floats.
    np.testing.assert_array_almost_equal(round_tripped, unpacked)
|
|
||
|
|
||
def test_scale_offset_from_dict_round_trip() -> None:
    """ScaleOffset.to_dict / from_dict should round-trip."""
    original = ScaleOffset(offset=273.15, scale=100.0)
    rebuilt = ScaleOffset.from_dict(original.to_dict())
    assert rebuilt.offset == original.offset
    assert rebuilt.scale == original.scale
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did not make a stand-alone library for this codec, because it is so simple.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok but will it live in zarr-python or will users have to import eopf-geozarr to use the scale_offset?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
in the current state users have to import eopf-geozarr, but I don't like that outcome. I will see if we can get this into zarr-python