Skip to content

Commit e6e75cc

Browse files
committed
fix: pin fill value based on data type instead of relying on xarray
1 parent 3a6e73d commit e6e75cc

4 files changed

Lines changed: 57 additions & 3 deletions

File tree

src/eopf_geozarr/conversion/geozarr.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1479,7 +1479,14 @@ def _create_encoding(
14791479
else:
14801480
chunking = (min(spatial_chunk, data_shape[-1]),)
14811481

1482-
encoding[var] = {"compressors": [compressor], "chunks": chunking}
1482+
var_encoding: XarrayEncodingJSON = {
1483+
"compressors": [compressor],
1484+
"chunks": chunking,
1485+
}
1486+
fv = utils.explicit_fill_value(ds[var])
1487+
if fv is not utils.UNSET:
1488+
var_encoding["fill_value"] = fv
1489+
encoding[var] = var_encoding
14831490

14841491
# Add coordinate encoding
14851492
for coord in ds.coords:
@@ -1565,11 +1572,15 @@ def _create_geozarr_encoding(
15651572
axis=i,
15661573
)
15671574

1568-
encoding[var] = {
1575+
var_encoding: XarrayEncodingJSON = {
15691576
"chunks": chunks,
15701577
"compressors": compressor,
15711578
"shards": shards,
15721579
}
1580+
fv = utils.explicit_fill_value(ds[var])
1581+
if fv is not utils.UNSET:
1582+
var_encoding["fill_value"] = fv
1583+
encoding[var] = var_encoding
15731584

15741585
# Add coordinate encoding
15751586
for coord in ds.coords:

src/eopf_geozarr/conversion/utils.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Utility functions for GeoZarr conversion."""
22

3+
from typing import Any
4+
35
import numpy as np
46
import rasterio # noqa: F401 # Import to enable .rio accessor
57
import structlog
@@ -8,6 +10,39 @@
810
log = structlog.get_logger()
911

1012

13+
# Sentinel: distinguish "no explicit fill_value" from a legitimate `None`.
14+
UNSET: Any = object()
15+
16+
17+
def explicit_fill_value(var: xr.DataArray) -> Any:
    """Return the zarr-level `fill_value` to pin for `var`.

    The value is derived from `var.encoding["_FillValue"]`. Pinning it in the
    encoding dict keeps the on-disk metadata stable: per the rationale of the
    original change, xarray versions differ in which fill value they infer
    when the encoding leaves it unspecified.

    Parameters
    ----------
    var
        Variable whose `encoding` mapping is inspected.

    Returns
    -------
    object
        The value to assign to `encoding["fill_value"]`:

        * the sentinel `UNSET` when `_FillValue` is absent from the encoding
          or is `None` (the caller should leave the encoding entry alone);
        * `"NaN"`, `"Infinity"` or `"-Infinity"` when `_FillValue` is a
          non-finite float;
        * the `_FillValue` itself, unchanged, in every other case.
    """
    declared = var.encoding.get("_FillValue")
    if declared is None:
        return UNSET

    as_array = np.asarray(declared)
    is_float = np.issubdtype(as_array.dtype, np.floating)
    # `np.isfinite` is only evaluated for floating dtypes (short-circuit),
    # so non-numeric fill values pass straight through.
    if not is_float or np.isfinite(as_array):
        return declared

    # Non-finite float: map to its string spelling.
    if np.isnan(as_array):
        return "NaN"
    if as_array > 0:
        return "Infinity"
    return "-Infinity"
44+
45+
1146
def downsample_2d_array(
1247
source_data: np.ndarray,
1348
target_height: int,

src/eopf_geozarr/s2_optimization/s2_multiscale.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from zarr_cm import multiscales as multiscales_cm
2121
from zarr_cm import spatial as spatial_cm
2222

23+
from eopf_geozarr.conversion import utils
2324
from eopf_geozarr.conversion.fs_utils import sanitize_dataset_attributes
2425
from eopf_geozarr.conversion.geozarr import (
2526
_create_tile_matrix_limits,
@@ -862,9 +863,15 @@ def create_original_encoding(dataset: xr.Dataset) -> dict[str, XarrayDataArrayEn
862863
var_data = dataset.data_vars[var_name]
863864
var_encoding: XarrayDataArrayEncoding = {}
864865
var_encoding["compressors"] = (compressor,)
865-
for key in XARRAY_ENCODING_KEYS - {"compressors"}:
866+
for key in XARRAY_ENCODING_KEYS - {"compressors", "fill_value"}:
866867
if key in var_data.encoding:
867868
var_encoding[key] = var_data.encoding[key] # type: ignore[literal-required]
869+
# Set the zarr-level `fill_value` explicitly rather than letting xarray
870+
# decide — different xarray versions infer different defaults from the
871+
# variable's `_FillValue`. See `explicit_fill_value` for the rationale.
872+
fv = utils.explicit_fill_value(var_data)
873+
if fv is not utils.UNSET:
874+
var_encoding["fill_value"] = fv
868875
if len(set(var_data.encoding.keys()) - XARRAY_ENCODING_KEYS) > 0:
869876
log.warning(
870877
"Unknown encoding keys in %s: %s",

src/eopf_geozarr/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ class TileMatrixLimitJSON(TypedDict):
1212

1313

1414
class XarrayEncodingJSON(TypedDict):
15+
fill_value: NotRequired[object]
1516
chunks: NotRequired[tuple[int, ...]]
1617
compressors: Any
1718
shards: NotRequired[Any]

0 commit comments

Comments
 (0)