Skip to content

Commit fb20af7

Browse files
authored
Merge pull request #118 from EOPF-Explorer/fix-nan
Fix JSON compliance for NaN values in zarr attributes
2 parents 7f6efc9 + ab93efd commit fb20af7

10 files changed

Lines changed: 32708 additions & 32527 deletions

src/eopf_geozarr/conversion/fs_utils.py

Lines changed: 89 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import json
import math
import os
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse

import s3fs

from eopf_geozarr.types import S3Credentials, S3FsOptions

if TYPE_CHECKING:
    import xarray as xr
18+
19+
20+
def replace_json_invalid_floats(obj: object) -> object:
    """
    Recursively replace NaN and Infinity float values in a JSON-like object
    with their string representations, to make them JSON-compliant.

    Strict JSON (RFC 8259) has no representation for NaN or the infinities,
    so they are mapped to the strings "NaN", "Infinity" and "-Infinity".
    Dicts, lists and tuples are processed recursively; tuples stay tuples.

    Parameters
    ----------
    obj : object
        The JSON-like object to process

    Returns
    -------
    object
        The processed object with NaN and Infinity replaced
    """
    if isinstance(obj, float):
        # math.isnan/isinf are clearer than the self-inequality (obj != obj)
        # and float("inf") equality idioms used previously.
        if math.isnan(obj):
            return "NaN"
        if math.isinf(obj):
            # The sign distinguishes the two infinities.
            return "Infinity" if obj > 0 else "-Infinity"
        return obj
    if isinstance(obj, dict):
        return {k: replace_json_invalid_floats(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [replace_json_invalid_floats(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(replace_json_invalid_floats(item) for item in obj)

    return obj
51+
52+
53+
class NanCompatibleJSONEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that converts NaN, Inf, -Inf values to JSON-safe equivalents
    to ensure valid JSON output.

    Both ``encode`` and ``iterencode`` are overridden: ``json.dumps`` goes
    through ``encode``, but ``json.dump`` (streaming to a file) calls
    ``iterencode`` directly, so overriding ``encode`` alone would let
    non-finite floats slip through.
    """

    def encode(self, obj: Any) -> str:
        """
        Encode object to JSON string, converting NaN values to "NaN".
        """
        return super().encode(replace_json_invalid_floats(obj))

    def iterencode(self, obj: Any, _one_shot: bool = False) -> Any:
        """
        Iteratively encode object, converting NaN values to "NaN".

        Sanitizes first so streaming writers (``json.dump``) also produce
        strictly valid JSON.
        """
        return super().iterencode(replace_json_invalid_floats(obj), _one_shot)
66+
67+
68+
def sanitize_dataset_attributes(ds: "xr.Dataset") -> "xr.Dataset":
    """
    Return a copy of a dataset whose attribute values are JSON-safe.

    NaN and +/-Infinity floats in the dataset's own attributes, in every
    data variable's attributes, and in every coordinate's attributes are
    replaced with their string representations ("NaN", "Infinity",
    "-Infinity").

    Parameters
    ----------
    ds : xr.Dataset
        Dataset to sanitize

    Returns
    -------
    xr.Dataset
        Copy of the dataset with sanitized attributes
    """
    # Work on a copy so the caller's dataset is left untouched.
    sanitized = ds.copy()

    # Dataset-level attributes.
    sanitized.attrs = replace_json_invalid_floats(sanitized.attrs)

    # Data variables and coordinates get identical treatment, so handle
    # both collections in a single pass.
    for names in (sanitized.data_vars, sanitized.coords):
        for name in names:
            sanitized[name].attrs = replace_json_invalid_floats(sanitized[name].attrs)

    return sanitized
101+
16102

17103
def normalize_s3_path(s3_path: str) -> str:
18104
"""
@@ -210,7 +296,7 @@ def write_s3_json_metadata(s3_path: str, metadata: Mapping[str, Any], **s3_kwarg
210296
fs = s3fs.S3FileSystem(**s3_config)
211297

212298
# Write JSON content
213-
json_content = json.dumps(metadata, indent=2)
299+
json_content = json.dumps(metadata, indent=2, cls=NanCompatibleJSONEncoder)
214300
with fs.open(s3_path, "w") as f:
215301
f.write(json_content)
216302

@@ -426,7 +512,7 @@ def write_json_metadata(path: str, metadata: dict[str, Any], **kwargs: Any) -> N
426512
fs.makedirs(parent_dir, exist_ok=True)
427513

428514
# Write JSON content using fsspec
429-
json_content = json.dumps(metadata, indent=2)
515+
json_content = json.dumps(metadata, indent=2, cls=NanCompatibleJSONEncoder)
430516
with fs.open(path, "w") as f:
431517
f.write(json_content)
432518

src/eopf_geozarr/conversion/geozarr.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
)
4545

4646
from . import fs_utils, utils
47+
from .fs_utils import sanitize_dataset_attributes
4748
from .sentinel1_reprojection import reproject_sentinel1_with_gcps
4849

4950
log = structlog.get_logger()
@@ -350,6 +351,9 @@ def iterative_copy(
350351
if node.data_vars:
351352
log.info("Writing %s with data variables to GeoZarr DataTree", current_group_path)
352353

354+
# Sanitize NaN values in attributes before writing
355+
ds = sanitize_dataset_attributes(ds)
356+
353357
# Set up encoding
354358
encoding = _create_encoding(ds, compressor, spatial_chunk)
355359

@@ -718,6 +722,9 @@ def create_geozarr_compliant_multiscales(
718722
if not fs_utils.is_s3_path(overview_path):
719723
os.makedirs(os.path.dirname(overview_path), exist_ok=True)
720724

725+
# Sanitize NaN values in overview dataset attributes
726+
overview_ds = sanitize_dataset_attributes(overview_ds)
727+
721728
# Write the overview dataset
722729
overview_group = f"{group_name}/{level}"
723730
# When sharding enabled, let Dask rechunk to shard boundaries
@@ -1170,6 +1177,9 @@ def cleanup_prefix(prefix: str) -> None:
11701177
else:
11711178
single_var_ds[var] = single_var_ds[var].chunk()
11721179

1180+
# Sanitize NaN values in single variable dataset attributes
1181+
single_var_ds = sanitize_dataset_attributes(single_var_ds)
1182+
11731183
single_var_ds.to_zarr(
11741184
output_path,
11751185
group=group_name,

src/eopf_geozarr/s2_optimization/s2_multiscale.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pydantic.experimental.missing_sentinel import MISSING
1717
from pyproj import CRS
1818

19+
from eopf_geozarr.conversion.fs_utils import sanitize_dataset_attributes
1920
from eopf_geozarr.conversion.geozarr import (
2021
_create_tile_matrix_limits,
2122
create_native_crs_tile_matrix_set,
@@ -873,6 +874,9 @@ def stream_write_dataset(
873874
if "/measurements/" in path or "/quality/" in path:
874875
write_geo_metadata(dataset, crs=crs)
875876

877+
# Sanitize NaN values in dataset attributes before writing
878+
dataset = sanitize_dataset_attributes(dataset)
879+
876880
# Write with streaming computation and progress tracking
877881
# The to_zarr operation will trigger all lazy computations
878882
write_job = dataset.to_zarr(

0 commit comments

Comments
 (0)