1515from dask .array import from_delayed
1616from pydantic .experimental .missing_sentinel import MISSING
1717from pyproj import CRS
18+ from zarr .codecs import CastValue
1819from zarr_cm import geo_proj
1920from zarr_cm import multiscales as multiscales_cm
2021from zarr_cm import spatial as spatial_cm
2122
23+ from eopf_geozarr .conversion import utils
2224from eopf_geozarr .conversion .fs_utils import sanitize_dataset_attributes
2325from eopf_geozarr .conversion .geozarr import (
2426 _create_tile_matrix_limits ,
@@ -84,8 +86,14 @@ def _coarsen_variable(var_name: str, var_data: xr.DataArray, factor: int) -> xr.
8486 else :
8587 raise ValueError (f"Unknown variable type { var_type } " )
8688
87- result .encoding = var_data .encoding
88- return result .astype (var_data .dtype )
89+ # `xr.DataArray.astype` clears `.encoding`, so we capture it first and
90+ # restore it on the cast result. Without this, downstream code that
91+ # inspects encoding (e.g. to push CF scale-offset into a codec pipeline)
92+ # would see an empty encoding on every coarsened level.
93+ encoding = var_data .encoding
94+ result = result .astype (var_data .dtype )
95+ result .encoding = encoding
96+ return result
8997
9098
9199def inject_missing_bands (
@@ -170,6 +178,7 @@ def create_multiscale_from_datatree(
170178 spatial_chunk : int ,
171179 crs : CRS | None = None ,
172180 keep_scale_offset : bool ,
181+ experimental_scale_offset_codec : bool = False ,
173182) -> dict [str , dict ]:
174183 """
175184 Create multiscale versions preserving original structure.
@@ -239,11 +248,17 @@ def create_multiscale_from_datatree(
239248 spatial_chunk = spatial_chunk ,
240249 enable_sharding = enable_sharding ,
241250 keep_scale_offset = keep_scale_offset ,
251+ experimental_scale_offset_codec = experimental_scale_offset_codec ,
242252 )
243- # convert float64 arrays to float32
253+ # convert float64 arrays to float32. `xr.DataArray.astype` clears
254+ # encoding, so we capture and restore it — downstream pyramid
255+ # levels are coarsened from this dataset and rely on the encoding
256+ # to drive CF packing / codec filter generation.
244257 for data_var in dataset .data_vars :
245258 if dataset [data_var ].dtype in (np .dtype ("<f8" ), np .dtype (">f8" )):
259+ var_encoding = dataset [data_var ].encoding
246260 dataset [data_var ] = dataset [data_var ].astype ("float32" )
261+ dataset [data_var ].encoding = var_encoding
247262 # Clear _FillValue from the DataArray's own encoding to prevent
248263 # xarray from raising "Zarr does not support _FillValue in encoding".
249264 if not keep_scale_offset :
@@ -300,6 +315,7 @@ def create_multiscale_from_datatree(
300315 spatial_chunk = spatial_chunk ,
301316 enable_sharding = enable_sharding ,
302317 keep_scale_offset = keep_scale_offset ,
318+ experimental_scale_offset_codec = experimental_scale_offset_codec ,
303319 )
304320
305321 # Strip _FillValue from DataArray encoding for downsampled levels too
@@ -343,6 +359,7 @@ def create_measurements_encoding(
343359 spatial_chunk : int ,
344360 enable_sharding : bool = True ,
345361 keep_scale_offset : bool = True ,
362+ experimental_scale_offset_codec : bool = False ,
346363) -> dict [str , XarrayDataArrayEncoding ]:
347364 """
348365 Create optimized encoding for a pyramid level with advanced chunking and sharding.
@@ -390,7 +407,48 @@ def create_measurements_encoding(
390407 # Forward-propagate the existing encoding, minus keys that should be omitted
391408 keep_keys = XARRAY_ENCODING_KEYS - {"compressors" , "shards" , "chunks" }
392409
393- if not keep_scale_offset :
410+ if experimental_scale_offset_codec and not keep_scale_offset :
411+ # Push CF scale-offset into the zarr codec pipeline instead of
412+ # decoding to float. The data stays as packed integers on disk,
413+ # but zarr transparently decodes on read.
414+ scale_factor = var_data .encoding .get ("scale_factor" )
415+ add_offset = var_data .encoding .get ("add_offset" )
416+ packed_dtype = var_data .encoding .get ("dtype" )
417+
418+ if scale_factor is not None and add_offset is not None and packed_dtype is not None :
419+ from eopf_geozarr .codecs .scale_offset import scale_offset_from_cf
420+
421+ so_codec = scale_offset_from_cf (
422+ scale_factor = float (scale_factor ), add_offset = float (add_offset )
423+ )
424+ # CastValue refuses to cast NaN to integer without an explicit
425+ # mapping, so we need a packed-dtype sentinel for NaN. Prefer
426+ # the source's existing `_FillValue` (it already encodes the
427+ # "no data" semantic via xarray's CF mask_and_scale loop), and
428+ # fall back to the dtype's lowest representable integer.
429+ packed_np_dtype = np .dtype (packed_dtype )
430+ source_fill = var_data .encoding .get ("_FillValue" )
431+ if source_fill is not None :
432+ nan_sentinel = int (source_fill )
433+ else :
434+ nan_sentinel = int (np .iinfo (packed_np_dtype ).min )
435+ cv_codec = CastValue (
436+ data_type = packed_np_dtype .name ,
437+ rounding = "nearest-even" ,
438+ scalar_map = {
439+ "encode" : [("NaN" , nan_sentinel )],
440+ "decode" : [(nan_sentinel , "NaN" )],
441+ },
442+ )
443+ var_encoding ["filters" ] = (so_codec , cv_codec )
444+
445+ # Strip the CF scale/offset keys, `_FillValue`, and `filters` from
446+ # `keep_keys` — the codecs handle encoding/decoding now, and we don't
447+ # want the forward-propagation loop below to overwrite our freshly-set
448+ # filters with whatever was on the source variable.
449+ keep_keys = keep_keys - CF_SCALE_OFFSET_KEYS - {"_FillValue" , "filters" }
450+ var_encoding ["fill_value" ] = "NaN"
451+ elif not keep_scale_offset :
394452 # When stripping scale/offset, also strip _FillValue since the original
395453 # _FillValue is in raw integer units and meaningless for decoded float data.
396454 keep_keys = keep_keys - CF_SCALE_OFFSET_KEYS - {"_FillValue" }
@@ -399,7 +457,7 @@ def create_measurements_encoding(
399457 # xarray's zarr backend uses "fill_value" (no underscore) in encoding
400458 # to set the zarr-level fill value, distinct from "_FillValue" which
401459 # controls CF-convention attribute masking.
402- var_encoding ["fill_value" ] = float ( "nan" )
460+ var_encoding ["fill_value" ] = "NaN"
403461
404462 for key in keep_keys :
405463 if key in var_data .encoding :
@@ -805,9 +863,15 @@ def create_original_encoding(dataset: xr.Dataset) -> dict[str, XarrayDataArrayEn
805863 var_data = dataset .data_vars [var_name ]
806864 var_encoding : XarrayDataArrayEncoding = {}
807865 var_encoding ["compressors" ] = (compressor ,)
808- for key in XARRAY_ENCODING_KEYS - {"compressors" }:
866+ for key in XARRAY_ENCODING_KEYS - {"compressors" , "fill_value" }:
809867 if key in var_data .encoding :
810868 var_encoding [key ] = var_data .encoding [key ] # type: ignore[literal-required]
869+ # Set the zarr-level `fill_value` explicitly rather than letting xarray
870+ # decide — different xarray versions infer different defaults from the
871+ # variable's `_FillValue`. See `explicit_fill_value` for the rationale.
872+ fv = utils .explicit_fill_value (var_data )
873+ if fv is not utils .UNSET :
874+ var_encoding ["fill_value" ] = fv
811875 if len (set (var_data .encoding .keys ()) - XARRAY_ENCODING_KEYS ) > 0 :
812876 log .warning (
813877 "Unknown encoding keys in %s: %s" ,
0 commit comments