Skip to content

Commit d752349

Browse files
feat/geozarr model (#10)
* add geozarr model * initial working pydantic models for geozarr * initial working pydantic models for geozarr * bump pydantic min version and define serialization by alias per class * fix broken test * wip * working mini roundtrip * refactor test layout * refactor v2 and v3 data structures * adapt to src layout, relax cf requirement * add array_dimensions kwarg to from_array * bump mypy python version * lint * add grid mapping * update multiscale models * use GridMappingVariable class, and pydantic experimental missing sentinel * lint * remove unnecessary dependency on types-simplejson in pre-commit config * simplify models via protocols * fix failing tests * add tile matrix limit json type * add json types * clean up types * update pre-commit configuration and improve code formatting in tests and notebook * remove obsolete Sentinel-2 L2A data structure analysis notebook --------- Co-authored-by: Emmanuel Mathot <emmanuel.mathot@gmail.com>
1 parent cdbce04 commit d752349

18 files changed

Lines changed: 15073 additions & 104 deletions

.pre-commit-config.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ repos:
55
- id: validate-pyproject
66

77
- repo: https://github.com/PyCQA/isort
8-
rev: 5.12.0
8+
rev: 6.0.1
99
hooks:
1010
- id: isort
1111
language_version: python
1212

1313
- repo: https://github.com/astral-sh/ruff-pre-commit
14-
rev: v0.8.4
14+
rev: v0.13.1
1515
hooks:
1616
- id: ruff
1717
args: ["--fix"]
@@ -24,6 +24,5 @@ repos:
2424
language_version: python
2525
exclude: tests/.*
2626
additional_dependencies:
27-
- types-simplejson
2827
- types-attrs
29-
- pydantic~=2.0
28+
- pydantic>=2.11

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ classifiers = [
2828
requires-python = ">=3.11"
2929
dependencies = [
3030
"pydantic-zarr>=0.8.0",
31+
"pydantic>=2.12.0a1",
3132
"zarr>=3.1.1",
3233
"xarray>=2025.7.1",
3334
"dask[array,distributed]>=2025.5.1",
@@ -111,7 +112,7 @@ use_parentheses = true
111112
ensure_newline_before_comments = true
112113

113114
[tool.mypy]
114-
python_version = "3.10"
115+
python_version = "3.11"
115116
warn_return_any = true
116117
warn_unused_configs = true
117118
disallow_untyped_defs = true

src/eopf_geozarr/cli.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,9 +410,9 @@ def render_node(node: Any, path: str = "", level: int = 0) -> str:
410410
# Generate HTML for this node
411411
node_html = f"""
412412
<div class="tree-node" style="margin-left: {level * 20}px;">
413-
<details class="tree-details" {'open' if level < 2 else ''}>
413+
<details class="tree-details" {"open" if level < 2 else ""}>
414414
<summary class="tree-summary">
415-
<span class="tree-icon">{'📁' if children_count > 0 else '📄'}</span>
415+
<span class="tree-icon">{"📁" if children_count > 0 else "📄"}</span>
416416
<span class="tree-name">{node_name}</span>
417417
<span class="tree-info">({summary})</span>
418418
</summary>
@@ -882,7 +882,7 @@ def _generate_html_output(
882882
</div>
883883
<div class="header-info-item">
884884
<div class="header-info-label">Generated</div>
885-
<div class="header-info-value">{__import__('datetime').datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
885+
<div class="header-info-value">{__import__("datetime").datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</div>
886886
</div>
887887
</div>
888888
</div>

src/eopf_geozarr/data_api/geozarr/__init__.py

Whitespace-only changes.
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
"""Common utilities for GeoZarr data API."""
2+
3+
import io
4+
import urllib
5+
import urllib.request
6+
from typing import Annotated, Any, Mapping, TypeVar
7+
8+
from cf_xarray.utils import parse_cf_standard_name_table
9+
from pydantic import AfterValidator, BaseModel
10+
from pydantic.experimental.missing_sentinel import MISSING
11+
from typing_extensions import Protocol, runtime_checkable
12+
13+
from eopf_geozarr.data_api.geozarr.types import ResamplingMethod
14+
15+
16+
class BaseDataArrayAttrs(BaseModel, extra="allow"):
    """
    Base attributes for a GeoZarr DataArray.

    Extra fields are permitted.

    Attributes
    ----------
    grid_mapping : str, optional
        The name of the grid mapping variable declared by this array. Defaults to the
        ``MISSING`` sentinel when the attribute is not provided.
    """

    grid_mapping: str | MISSING = MISSING
25+
26+
27+
class GridMappingAttrs(BaseModel, extra="allow"):
    """
    Attributes of a GeoZarr grid mapping variable.

    Only ``grid_mapping_name`` is required. Extra fields are permitted, because additional
    attributes might be present depending on the type of grid mapping.

    Attributes
    ----------
    grid_mapping_name : str
        The name of the grid mapping.

    References
    ----------
    https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#grid-mappings-and-projections
    """

    grid_mapping_name: str
46+
47+
48+
def get_cf_standard_names(url: str, timeout: float = 30.0) -> tuple[str, ...]:
    """
    Retrieve the CF standard names and return them as a tuple.

    Parameters
    ----------
    url : str
        Location of the CF standard name table (XML).
    timeout : float, default 30.0
        Maximum number of seconds to wait on blocking network operations, so that an
        unresponsive server cannot stall the caller indefinitely.

    Returns
    -------
    tuple[str, ...]
        The keys of the parsed standard name table.

    Raises
    ------
    urllib.error.URLError
        If the request fails. A stalled read may instead surface as ``TimeoutError``.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "eopf_geozarr"})

    # Errors from the request propagate unchanged; there is nothing useful to add by
    # catching and re-raising them here.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        content = response.read()  # Read the entire response body into memory

    _info, table, _aliases = parse_cf_standard_name_table(source=io.BytesIO(content))
    return tuple(table.keys())
64+
65+
66+
# URL of the CF standard names table (XML).
CF_STANDARD_NAME_URL = (
    "https://raw.githubusercontent.com/cf-convention/cf-convention.github.io/"
    "master/Data/cf-standard-names/current/src/cf-standard-name-table.xml"
)

# NOTE: this performs network I/O against GitHub when the module is imported. Consider
# storing this data locally instead if fetching it every time is problematic.
CF_STANDARD_NAMES = get_cf_standard_names(url=CF_STANDARD_NAME_URL)
75+
76+
77+
def check_standard_name(name: str) -> str:
    """
    Validate a standard name against the CF standard name table.

    Parameters
    ----------
    name : str
        The standard name to check.

    Returns
    -------
    str
        The same name, unchanged, when it is listed in ``CF_STANDARD_NAMES``.

    Raises
    ------
    ValueError
        If the name is not listed in ``CF_STANDARD_NAMES``.
    """
    # Guard clause: reject unknown names first, then fall through to the success path.
    if name not in CF_STANDARD_NAMES:
        raise ValueError(
            f"Invalid standard name: {name}. This name was not found in the list of CF standard names."
        )
    return name
102+
103+
104+
# A string type that pydantic validates with ``check_standard_name`` after its standard
# ``str`` validation.
CFStandardName = Annotated[str, AfterValidator(check_standard_name)]
105+
106+
107+
@runtime_checkable
class GroupLike(Protocol):
    """
    Structural type for group-like objects.

    Any object exposing both ``members`` and ``attributes`` satisfies this protocol.

    Attributes
    ----------
    members : Mapping[str, Any] | None
        The named members of the group, or None.
    attributes : Any
        The attributes of the group.
    """

    members: Mapping[str, Any] | None
    attributes: Any
111+
112+
113+
# Type variable bound to GroupLike, so validators can return the same concrete type they receive.
TGroupLike = TypeVar("TGroupLike", bound=GroupLike)
114+
115+
116+
def check_valid_coordinates(model: TGroupLike) -> TGroupLike:
    """
    Check if the coordinates of the DataArrayLike objects listed in GroupLike objects are valid.

    For each DataArrayLike in the model, we check the dimensions associated with the
    DataArrayLike. For each dimension, a member with the name of that dimension must be present
    in the group, it must not be a group itself, and the length of its first axis must equal
    the length of the corresponding axis of the array.

    Parameters
    ----------
    model : GroupLike
        An object that implements the GroupLike protocol.

    Returns
    -------
    GroupLike
        The same object, once its coordinates are known to be referentially valid.

    Raises
    ------
    ValueError
        If ``model.members`` is None, if a dimension has no matching member, if the matching
        member is a group or a zero-dimensional array, or if the lengths disagree.
    """
    if model.members is None:
        raise ValueError("Model members cannot be None")

    arrays: dict[str, DataArrayLike] = {
        k: v for k, v in model.members.items() if isinstance(v, DataArrayLike)
    }
    for key, array in arrays.items():
        for idx, dim in enumerate(array.array_dimensions):
            if dim not in model.members:
                raise ValueError(
                    f"Dimension '{dim}' for array '{key}' is not defined in the model members."
                )
            member = model.members[dim]
            if isinstance(member, GroupLike):
                # A coordinate must be an array; the member found here is a group.
                raise ValueError(
                    f"Dimension '{dim}' for array '{key}' should be an array. Found a group instead."
                )
            if len(member.shape) == 0:
                # Report scalar coordinates as a validation error instead of letting the
                # shape lookup below fail with a bare IndexError.
                raise ValueError(
                    f"Dimension '{dim}' for array '{key}' refers to a zero-dimensional array."
                )
            if member.shape[0] != array.shape[idx]:
                raise ValueError(
                    f"Dimension '{dim}' for array '{key}' has a shape mismatch: "
                    f"{member.shape[0]} != {array.shape[idx]}."
                )
    return model
157+
158+
159+
@runtime_checkable
class DataArrayLike(Protocol):
    """
    Structural type capturing the relevant properties of Zarr V2 and Zarr V3 DataArrays.

    Attributes
    ----------
    array_dimensions : tuple[str, ...]
        Read-only property giving the names of the array's dimensions.
    shape : tuple[int, ...]
        The shape of the array.
    attributes : BaseDataArrayAttrs
        The attributes of the array.
    """

    @property
    def array_dimensions(self) -> tuple[str, ...]: ...

    shape: tuple[int, ...]
    attributes: BaseDataArrayAttrs
170+
171+
172+
class TileMatrixLimit(BaseModel):
    """
    Tile index limits associated with a single tile matrix.

    NOTE(review): the field names appear to follow the OGC tile matrix set limits schema;
    confirm the exact semantics (e.g. inclusive bounds) against the specification.

    Attributes
    ----------
    tileMatrix : str
        Presumably the identifier of the tile matrix these limits apply to.
    minTileCol : int
        Minimum tile column.
    minTileRow : int
        Minimum tile row.
    maxTileCol : int
        Maximum tile column.
    maxTileRow : int
        Maximum tile row.
    """

    tileMatrix: str
    minTileCol: int
    minTileRow: int
    maxTileCol: int
    maxTileRow: int
180+
181+
182+
class TileMatrix(BaseModel):
    """
    A single tile matrix, i.e. one entry of ``TileMatrixSet.tileMatrices``.

    NOTE(review): the field names appear to follow the OGC tile matrix schema; confirm units
    and axis conventions against the specification.

    Attributes
    ----------
    id : str
        Identifier of the tile matrix.
    scaleDenominator : float
    cellSize : float
    pointOfOrigin : tuple[float, float]
    tileWidth : int
    tileHeight : int
    matrixWidth : int
    matrixHeight : int
    """

    id: str
    scaleDenominator: float
    cellSize: float
    pointOfOrigin: tuple[float, float]
    tileWidth: int
    tileHeight: int
    matrixWidth: int
    matrixHeight: int
191+
192+
193+
class TileMatrixSet(BaseModel):
    """
    A tile matrix set: an identifier, optional descriptive metadata, and its tile matrices.

    Attributes
    ----------
    id : str
        Identifier of the tile matrix set.
    title : str | None, optional
    crs : str | None, optional
    supportedCRS : str | None, optional
    orderedAxes : tuple[str, str] | None, optional
    tileMatrices : tuple[TileMatrix, ...]
        The tile matrices that make up the set.
    """

    id: str
    title: str | None = None
    crs: str | None = None
    supportedCRS: str | None = None
    orderedAxes: tuple[str, str] | None = None
    tileMatrices: tuple[TileMatrix, ...]
200+
201+
202+
class Multiscales(BaseModel, extra="allow"):
    """
    Multiscale metadata for a GeoZarr dataset.

    Extra fields are permitted.

    Attributes
    ----------
    tile_matrix_set : TileMatrixSet
        The tile matrix set for the multiscale dataset.
    resampling_method : ResamplingMethod
        The name of the resampling method for the multiscale dataset.
    tile_matrix_limits : dict[str, TileMatrixLimit] | None, optional
        The tile matrix limits for the multiscale dataset.
    """

    tile_matrix_set: TileMatrixSet
    resampling_method: ResamplingMethod
    # TODO: ensure that the keys match tile_matrix_set.tileMatrices[$index].id
    # TODO: ensure that the keys match the tileMatrix attribute
    tile_matrix_limits: dict[str, TileMatrixLimit] | None = None
221+
222+
223+
class DatasetAttrs(BaseModel, extra="allow"):
    """
    Attributes for a GeoZarr dataset.

    A dataset is a collection of DataArrays. This class models the attributes of a dataset.
    It declares no fields of its own; extra fields are permitted.
    """
231+
232+
233+
@runtime_checkable
class DatasetLike(Protocol):
    """
    Structural type for dataset-like objects.

    Attributes
    ----------
    members : Mapping[str, DataArrayLike] | None
        The named arrays of the dataset, or None.
    """

    members: Mapping[str, DataArrayLike] | None
236+
237+
238+
# Type variable bound to DatasetLike, so validators can return the same concrete type they receive.
TDataSetLike = TypeVar("TDataSetLike", bound=DatasetLike)
239+
240+
241+
def check_grid_mapping(model: TDataSetLike) -> TDataSetLike:
    """
    Ensure that the grid mapping declared by each member refers to a member of the model.

    Parameters
    ----------
    model : DatasetLike
        An object that implements the DatasetLike protocol.

    Returns
    -------
    DatasetLike
        The same object, unchanged. A model whose ``members`` is None is returned as-is.

    Raises
    ------
    ValueError
        If a member's ``attributes.grid_mapping`` is not a key of ``model.members``.
    """
    members = model.members
    if members is None:
        # Nothing to validate.
        return model

    for name, member in members.items():
        # NOTE(review): a member that leaves ``grid_mapping`` unset presumably fails this
        # membership test as well -- confirm that every member is required to declare one.
        if member.attributes.grid_mapping in members:
            continue
        msg = f"Grid mapping variable '{member.attributes.grid_mapping}' declared by {name} was not found in dataset members"
        raise ValueError(msg)
    return model
251+
252+
253+
class MultiscaleGroupAttrs(BaseModel, extra="allow"):
    """
    Attributes for a Multiscale GeoZarr dataset.

    A Multiscale dataset is a collection of Datasets. Extra fields are permitted.

    Attributes
    ----------
    multiscales : Multiscales
        The multiscale metadata for the group.
    """

    multiscales: Multiscales

0 commit comments

Comments
 (0)