EOPF-Explorer
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/eopf_geozarr/cli.py‎
Lines changed: 6 additions & 0 deletions b/‎src/eopf_geozarr/cli.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/eopf_geozarr/codecs/__init__.py‎ b/‎src/eopf_geozarr/codecs/__init__.py‎
diff --git a/‎src/eopf_geozarr/codecs/scale_offset.py‎
Lines changed: 27 additions & 0 deletions b/‎src/eopf_geozarr/codecs/scale_offset.py‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎src/eopf_geozarr/conversion/geozarr.py‎
Lines changed: 13 additions & 2 deletions b/‎src/eopf_geozarr/conversion/geozarr.py‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎src/eopf_geozarr/conversion/utils.py‎
Lines changed: 35 additions & 0 deletions b/‎src/eopf_geozarr/conversion/utils.py‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎src/eopf_geozarr/s2_optimization/s2_converter.py‎
Lines changed: 3 additions & 0 deletions b/‎src/eopf_geozarr/s2_optimization/s2_converter.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/eopf_geozarr/s2_optimization/s2_multiscale.py‎
Lines changed: 70 additions & 6 deletions b/‎src/eopf_geozarr/s2_optimization/s2_multiscale.py‎
Lines changed: 70 additions & 6 deletions
diff --git a/‎src/eopf_geozarr/types.py‎
Lines changed: 1 addition & 0 deletions b/‎src/eopf_geozarr/types.py‎
Lines changed: 1 addition & 0 deletions
@@ -29,7 +29,7 @@ requires-python = ">=3.12"
 dependencies = [
     "pydantic-zarr>=0.8.0",
     "pydantic>=2.12",
-    "zarr>=3.1.1",
+    "zarr[cast-value-rs]>=3.2.0",
     "xarray>=2025.7.1",
     "dask[array,distributed]>=2026.1.0",
     "numpy>=2.3.1",
 
@@ -1165,6 +1165,11 @@ def add_s2_optimization_commands(subparsers: argparse._SubParsersAction) -> None
         scale-offset encoding will be re-saved as the decoded data type, i.e. floating point values.
         """,
     )
+    s2_parser.add_argument(
+        "--experimental-scale-offset-codec",
+        action="store_true",
+        help="Push CF scale-offset encoding into zarr codec pipeline instead of decoding to float.",
+    )
     s2_parser.add_argument(
         "--dask-cluster",
         action="store_true",
@@ -1197,6 +1202,7 @@ def convert_s2_optimized_command(args: argparse.Namespace) -> None:
             compression_level=args.compression_level,
             validate_output=not args.skip_validation,
             keep_scale_offset=args.keep_scale_offset,
+            experimental_scale_offset_codec=args.experimental_scale_offset_codec,
         )
 
         log.info("✅ S2 optimization completed", output_path=args.output_path)
 
@@ -0,0 +1,27 @@
+"""
+CF-to-zarr-codec helper for the `scale_offset` codec.
+
+The `scale_offset` codec itself ships with zarr-python >= 3.2.0
+(`zarr.codecs.ScaleOffset`); this module only provides the small mapping from
+CF-convention `scale_factor` / `add_offset` attributes to `ScaleOffset`
+constructor arguments.
+"""
+
+from __future__ import annotations
+
+from zarr.codecs import ScaleOffset
+
+
+def scale_offset_from_cf(*, scale_factor: float, add_offset: float) -> ScaleOffset:
+    """
+    Convert CF-convention scale_factor and add_offset to a ScaleOffset codec.
+
+    CF convention: unpacked = packed * scale_factor + add_offset
+
+    ScaleOffset convention:
+        encode: out = (in - offset) * scale
+        decode: out = (in / scale) + offset
+
+    To match CF: offset = add_offset, scale = 1 / scale_factor.
+
+    Args:
+        scale_factor: CF multiplicative factor. Must be finite and non-zero,
+            because the codec's scale is its reciprocal.
+        add_offset: CF additive offset, passed through as the codec offset.
+
+    Returns:
+        A ScaleOffset codec built with ``offset=add_offset`` and
+        ``scale=1 / scale_factor``.
+
+    Raises:
+        ValueError: If ``scale_factor`` is zero, NaN or infinite.
+    """
+    # Function-scope import keeps this block self-contained.
+    import math
+
+    # A zero factor has no reciprocal, and a NaN/infinite factor would yield a
+    # NaN/zero codec scale. Fail loudly here instead of writing an array whose
+    # decode step (in / scale) cannot reproduce the unpacked values.
+    if scale_factor == 0 or not math.isfinite(scale_factor):
+        raise ValueError(
+            f"CF scale_factor must be finite and non-zero, got {scale_factor!r}"
+        )
+    return ScaleOffset(offset=add_offset, scale=1.0 / scale_factor)
@@ -1479,7 +1479,14 @@ def _create_encoding(
             else:
                 chunking = (min(spatial_chunk, data_shape[-1]),)
 
-        encoding[var] = {"compressors": [compressor], "chunks": chunking}
+        var_encoding: XarrayEncodingJSON = {
+            "compressors": [compressor],
+            "chunks": chunking,
+        }
+        fv = utils.explicit_fill_value(ds[var])
+        if fv is not utils.UNSET:
+            var_encoding["fill_value"] = fv
+        encoding[var] = var_encoding
 
     # Add coordinate encoding
     for coord in ds.coords:
@@ -1565,11 +1572,15 @@ def _create_geozarr_encoding(
                             axis=i,
                         )
 
-            encoding[var] = {
+            var_encoding: XarrayEncodingJSON = {
                 "chunks": chunks,
                 "compressors": compressor,
                 "shards": shards,
             }
+            fv = utils.explicit_fill_value(ds[var])
+            if fv is not utils.UNSET:
+                var_encoding["fill_value"] = fv
+            encoding[var] = var_encoding
 
     # Add coordinate encoding
     for coord in ds.coords:
 
@@ -1,5 +1,7 @@
 """Utility functions for GeoZarr conversion."""
 
+from typing import Any
+
 import numpy as np
 import rasterio  # noqa: F401  # Import to enable .rio accessor
 import structlog
@@ -8,6 +10,39 @@
 log = structlog.get_logger()
 
 
+# Sentinel: distinguish "no explicit fill_value" from a legitimate `None`.
+# `explicit_fill_value` returns this object when the source variable carries no
+# `_FillValue`; callers compare by identity (`fv is not UNSET`) and leave the
+# encoding entry alone in that case.
+UNSET: Any = object()
+
+
+def explicit_fill_value(var: xr.DataArray) -> Any:
+    """Derive the zarr-level `fill_value` for `var` from its `_FillValue` encoding.
+
+    Pinning `fill_value` in the encoding dict keeps the on-disk metadata
+    independent of whatever default the installed xarray would otherwise
+    infer (per the original author's note, xarray releases differ on this:
+    some default floats to 0.0, others honour the source `_FillValue`).
+
+    Returns
+    -------
+    object
+        The value to store under `encoding["fill_value"]`:
+
+        * the sentinel `UNSET` when `var.encoding` has no `_FillValue`
+          (or it is `None`), meaning the caller should not set the key;
+        * `"NaN"`, `"Infinity"` or `"-Infinity"` when the `_FillValue` is a
+          non-finite float, i.e. the JSON-canonical string form that
+          zarr-python serialises;
+        * otherwise the `_FillValue` itself, unchanged.
+    """
+    declared = var.encoding.get("_FillValue")
+    if declared is None:
+        return UNSET
+
+    as_array = np.asarray(declared)
+    is_float = np.issubdtype(as_array.dtype, np.floating)
+    # Non-float and finite float fill values are passed through untouched.
+    if not is_float or np.isfinite(as_array):
+        return declared
+
+    # Only non-finite floats remain: NaN or a signed infinity.
+    if np.isnan(as_array):
+        return "NaN"
+    if as_array > 0:
+        return "Infinity"
+    return "-Infinity"
+
+
 def downsample_2d_array(
     source_data: np.ndarray,
     target_height: int,
 
@@ -186,6 +186,7 @@ def convert_s2_optimized(
     compression_level: int,
     validate_output: bool,
     keep_scale_offset: bool,
+    experimental_scale_offset_codec: bool = False,
     max_retries: int = 3,
 ) -> xr.DataTree:
     """
@@ -199,6 +200,7 @@ def convert_s2_optimized(
         compression_level: Compression level 1-9
         validate_output: Whether to validate the output
         keep_scale_offset: Whether to preserve scale-offset encoding of the source data.
+        experimental_scale_offset_codec: Push CF scale-offset into zarr codec pipeline.
         max_retries: Maximum number of retries for network operations
 
     Returns:
@@ -234,6 +236,7 @@ def convert_s2_optimized(
         enable_sharding=enable_sharding,
         crs=crs,
         keep_scale_offset=keep_scale_offset,
+        experimental_scale_offset_codec=experimental_scale_offset_codec,
     )
 
     log.info("Created multiscale pyramids", num_groups=len(datasets))
 
@@ -15,10 +15,12 @@
 from dask.array import from_delayed
 from pydantic.experimental.missing_sentinel import MISSING
 from pyproj import CRS
+from zarr.codecs import CastValue
 from zarr_cm import geo_proj
 from zarr_cm import multiscales as multiscales_cm
 from zarr_cm import spatial as spatial_cm
 
+from eopf_geozarr.conversion import utils
 from eopf_geozarr.conversion.fs_utils import sanitize_dataset_attributes
 from eopf_geozarr.conversion.geozarr import (
     _create_tile_matrix_limits,
@@ -84,8 +86,14 @@ def _coarsen_variable(var_name: str, var_data: xr.DataArray, factor: int) -> xr.
     else:
         raise ValueError(f"Unknown variable type {var_type}")
 
-    result.encoding = var_data.encoding
-    return result.astype(var_data.dtype)
+    # `xr.DataArray.astype` clears `.encoding`, so we capture it first and
+    # restore it on the cast result. Without this, downstream code that
+    # inspects encoding (e.g. to push CF scale-offset into a codec pipeline)
+    # would see an empty encoding on every coarsened level.
+    encoding = var_data.encoding
+    result = result.astype(var_data.dtype)
+    result.encoding = encoding
+    return result
 
 
 def inject_missing_bands(
@@ -170,6 +178,7 @@ def create_multiscale_from_datatree(
     spatial_chunk: int,
     crs: CRS | None = None,
     keep_scale_offset: bool,
+    experimental_scale_offset_codec: bool = False,
 ) -> dict[str, dict]:
     """
     Create multiscale versions preserving original structure.
@@ -239,11 +248,17 @@ def create_multiscale_from_datatree(
                 spatial_chunk=spatial_chunk,
                 enable_sharding=enable_sharding,
                 keep_scale_offset=keep_scale_offset,
+                experimental_scale_offset_codec=experimental_scale_offset_codec,
             )
-            # convert float64 arrays to float32
+            # convert float64 arrays to float32. `xr.DataArray.astype` clears
+            # encoding, so we capture and restore it — downstream pyramid
+            # levels are coarsened from this dataset and rely on the encoding
+            # to drive CF packing / codec filter generation.
             for data_var in dataset.data_vars:
                 if dataset[data_var].dtype in (np.dtype("<f8"), np.dtype(">f8")):
+                    var_encoding = dataset[data_var].encoding
                     dataset[data_var] = dataset[data_var].astype("float32")
+                    dataset[data_var].encoding = var_encoding
             # Clear _FillValue from the DataArray's own encoding to prevent
             # xarray from raising "Zarr does not support _FillValue in encoding".
             if not keep_scale_offset:
@@ -300,6 +315,7 @@ def create_multiscale_from_datatree(
             spatial_chunk=spatial_chunk,
             enable_sharding=enable_sharding,
             keep_scale_offset=keep_scale_offset,
+            experimental_scale_offset_codec=experimental_scale_offset_codec,
         )
 
         # Strip _FillValue from DataArray encoding for downsampled levels too
@@ -343,6 +359,7 @@ def create_measurements_encoding(
     spatial_chunk: int,
     enable_sharding: bool = True,
     keep_scale_offset: bool = True,
+    experimental_scale_offset_codec: bool = False,
 ) -> dict[str, XarrayDataArrayEncoding]:
     """
     Create optimized encoding for a pyramid level with advanced chunking and sharding.
@@ -390,7 +407,48 @@ def create_measurements_encoding(
         # Forward-propagate the existing encoding, minus keys that should be omitted
         keep_keys = XARRAY_ENCODING_KEYS - {"compressors", "shards", "chunks"}
 
-        if not keep_scale_offset:
+        if experimental_scale_offset_codec and not keep_scale_offset:
+            # Push CF scale-offset into the zarr codec pipeline instead of
+            # decoding to float. The data stays as packed integers on disk,
+            # but zarr transparently decodes on read.
+            scale_factor = var_data.encoding.get("scale_factor")
+            add_offset = var_data.encoding.get("add_offset")
+            packed_dtype = var_data.encoding.get("dtype")
+
+            if scale_factor is not None and add_offset is not None and packed_dtype is not None:
+                from eopf_geozarr.codecs.scale_offset import scale_offset_from_cf
+
+                so_codec = scale_offset_from_cf(
+                    scale_factor=float(scale_factor), add_offset=float(add_offset)
+                )
+                # CastValue refuses to cast NaN to integer without an explicit
+                # mapping, so we need a packed-dtype sentinel for NaN. Prefer
+                # the source's existing `_FillValue` (it already encodes the
+                # "no data" semantic via xarray's CF mask_and_scale loop), and
+                # fall back to the dtype's lowest representable integer.
+                packed_np_dtype = np.dtype(packed_dtype)
+                source_fill = var_data.encoding.get("_FillValue")
+                if source_fill is not None:
+                    nan_sentinel = int(source_fill)
+                else:
+                    nan_sentinel = int(np.iinfo(packed_np_dtype).min)
+                cv_codec = CastValue(
+                    data_type=packed_np_dtype.name,
+                    rounding="nearest-even",
+                    scalar_map={
+                        "encode": [("NaN", nan_sentinel)],
+                        "decode": [(nan_sentinel, "NaN")],
+                    },
+                )
+                var_encoding["filters"] = (so_codec, cv_codec)
+
+            # Strip CF keys and `filters` from `keep_keys` — the codecs handle
+            # encoding/decoding now, and we don't want the forward-propagation
+            # loop below to overwrite our freshly-set filters with whatever was
+            # on the source variable.
+            keep_keys = keep_keys - CF_SCALE_OFFSET_KEYS - {"_FillValue", "filters"}
+            var_encoding["fill_value"] = "NaN"
+        elif not keep_scale_offset:
             # When stripping scale/offset, also strip _FillValue since the original
             # _FillValue is in raw integer units and meaningless for decoded float data.
             keep_keys = keep_keys - CF_SCALE_OFFSET_KEYS - {"_FillValue"}
@@ -399,7 +457,7 @@ def create_measurements_encoding(
             # xarray's zarr backend uses "fill_value" (no underscore) in encoding
             # to set the zarr-level fill value, distinct from "_FillValue" which
             # controls CF-convention attribute masking.
-            var_encoding["fill_value"] = float("nan")
+            var_encoding["fill_value"] = "NaN"
 
         for key in keep_keys:
             if key in var_data.encoding:
@@ -805,9 +863,15 @@ def create_original_encoding(dataset: xr.Dataset) -> dict[str, XarrayDataArrayEn
         var_data = dataset.data_vars[var_name]
         var_encoding: XarrayDataArrayEncoding = {}
         var_encoding["compressors"] = (compressor,)
-        for key in XARRAY_ENCODING_KEYS - {"compressors"}:
+        for key in XARRAY_ENCODING_KEYS - {"compressors", "fill_value"}:
             if key in var_data.encoding:
                 var_encoding[key] = var_data.encoding[key]  # type: ignore[literal-required]
+        # Set the zarr-level `fill_value` explicitly rather than letting xarray
+        # decide — different xarray versions infer different defaults from the
+        # variable's `_FillValue`. See `explicit_fill_value` for the rationale.
+        fv = utils.explicit_fill_value(var_data)
+        if fv is not utils.UNSET:
+            var_encoding["fill_value"] = fv
         if len(set(var_data.encoding.keys()) - XARRAY_ENCODING_KEYS) > 0:
             log.warning(
                 "Unknown encoding keys in %s: %s",
 
@@ -12,6 +12,7 @@ class TileMatrixLimitJSON(TypedDict):
 
 
 class XarrayEncodingJSON(TypedDict):
+    fill_value: NotRequired[object]
     chunks: NotRequired[tuple[int, ...]]
     compressors: Any
     shards: NotRequired[Any]