Skip to content

Commit fbe48e0

Browse files
committed
Merge branch 'main' of https://github.com/eopf-explorer/data-model into chore/src-layout
2 parents ef6d420 + 536d1e8 commit fbe48e0

13 files changed

Lines changed: 4908 additions & 458 deletions

File tree

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,4 +206,7 @@ marimo/_static/
206206
marimo/_lsp/
207207
__marimo__/
208208

209-
tests-output/
209+
tests-output/
210+
211+
# uv
212+
uv.lock

.vscode/launch.json

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
"https://objectstore.eodc.eu:2222/e05ab01a9d56408d82ac32d69a5aae2a:sample-data/tutorial_data/cpm_v253/S2B_MSIL1C_20250113T103309_N0511_R108_T32TLQ_20250113T122458.zarr",
1717
"./tests-output/eopf_geozarr/s2b_test.zarr",
1818
"--groups", "/measurements/reflectance/r10m", "/measurements/reflectance/r20m", "/measurements/reflectance/r60m", "/quality/l1c_quicklook/r10m",
19-
"--spatial-chunk", "512",
20-
"--min-dimension", "128",
19+
"--spatial-chunk", "4096",
20+
"--min-dimension", "256",
2121
"--tile-width", "256",
2222
"--max-retries", "2",
2323
"--verbose"
@@ -38,25 +38,77 @@
3838
"args": [
3939
"convert",
4040
"https://objectstore.eodc.eu:2222/e05ab01a9d56408d82ac32d69a5aae2a:sample-data/tutorial_data/cpm_v253/S2B_MSIL1C_20250113T103309_N0511_R108_T32TLQ_20250113T122458.zarr",
41-
"s3://esa-zarr-sentinel-explorer/tests-output/eopf_geozarr/s2b_test.zarr",
41+
"s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/s2b_test.zarr",
4242
"--groups", "/measurements/reflectance/r10m", "/measurements/reflectance/r20m", "/measurements/reflectance/r60m", "/quality/l1c_quicklook/r10m",
4343
"--spatial-chunk", "4096",
4444
"--min-dimension", "256",
4545
"--tile-width", "256",
4646
"--max-retries", "2",
47+
"--dask-cluster",
48+
"--verbose"
49+
],
50+
"cwd": "${workspaceFolder}",
51+
"justMyCode": false,
52+
"console": "integratedTerminal",
53+
"env": {
54+
"PYTHONPATH": "${workspaceFolder}/.venv/bin",
55+
"AWS_PROFILE": "eopf-explorer",
56+
"AWS_DEFAULT_REGION": "de",
57+
"AWS_S3_ENDPOINT": "https://s3.de.io.cloud.ovh.net/"
58+
},
59+
60+
},
61+
{
62+
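// eopf_geozarr convert https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202507-s02msil2a/04/products/cpm_v256/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr --groups /measurements/reflectance/r10m /measurements/reflectance/r20m /measurements/reflectance/r60m /quality/l2a_quicklook/r10m --spatial-chunk 1024 --min-dimension 256 --tile-width 256 --max-retries 2 --dask-cluster --verbose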
63+
"name": "Convert to GeoZarr S2L2A (S3)",
64+
"type": "debugpy",
65+
"request": "launch",
66+
"module": "eopf_geozarr",
67+
"args": [
68+
"convert",
69+
"https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202507-s02msil2a/04/products/cpm_v256/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr",
70+
"s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr",
71+
"--groups", "/measurements/reflectance/r10m", "/measurements/reflectance/r20m", "/measurements/reflectance/r60m", "/quality/l2a_quicklook/r10m",
72+
"--spatial-chunk", "1024",
73+
"--min-dimension", "256",
74+
"--tile-width", "256",
75+
"--max-retries", "2",
76+
"--dask-cluster",
4777
"--verbose"
4878
],
4979
"cwd": "${workspaceFolder}",
5080
"justMyCode": false,
5181
"console": "integratedTerminal",
5282
"env": {
5383
"PYTHONPATH": "${workspaceFolder}/.venv/bin",
54-
"AWS_ACCESS_KEY_ID": "secret",
55-
"AWS_SECRET_ACCESS_KEY": "secret",
56-
"AWS_DEFAULT_REGION": "gra",
57-
"AWS_S3_ENDPOINT": "https://s3.gra.io.cloud.ovh.net/"
84+
"AWS_PROFILE": "eopf-explorer",
85+
"AWS_DEFAULT_REGION": "de",
86+
"AWS_S3_ENDPOINT": "https://s3.de.io.cloud.ovh.net/"
5887
},
59-
88+
89+
},
90+
{
91+
// eopf_geozarr info s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr --verbose --html-output dataset_info.html
92+
"name": "Info GeoZarr S2L2A (S3)",
93+
"type": "debugpy",
94+
"request": "launch",
95+
"module": "eopf_geozarr",
96+
"args": [
97+
"info",
98+
"s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr",
99+
"--verbose",
100+
"--html-output", "dataset_info.html"
101+
],
102+
"cwd": "${workspaceFolder}",
103+
"justMyCode": false,
104+
"console": "integratedTerminal",
105+
"env": {
106+
"PYTHONPATH": "${workspaceFolder}/.venv/bin",
107+
"AWS_PROFILE": "eopf-explorer",
108+
"AWS_DEFAULT_REGION": "de",
109+
"AWS_S3_ENDPOINT": "https://s3.de.io.cloud.ovh.net/"
110+
},
111+
60112
}
61113
]
62114
}

README.md

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ This library provides tools to convert EOPF datasets to GeoZarr-spec 0.4 complia
1414
- **CF Conventions**: Proper CF standard names and grid_mapping attributes
1515
- **Robust Processing**: Band-by-band writing with validation and retry logic
1616
- **S3 Support**: Direct output to Amazon S3 buckets with automatic credential validation
17+
- **Parallel Processing**: Optional dask cluster support for parallel chunk processing
18+
- **Chunk Alignment**: Automatic chunk alignment to prevent data corruption with dask
1719

1820
## GeoZarr Compliance Features
1921

@@ -51,6 +53,12 @@ eopf-geozarr convert input.zarr output.zarr
5153
# Convert EOPF dataset to GeoZarr format (S3 output)
5254
eopf-geozarr convert input.zarr s3://my-bucket/path/to/output.zarr
5355

56+
# Convert with parallel processing using dask cluster
57+
eopf-geozarr convert input.zarr output.zarr --dask-cluster
58+
59+
# Convert with dask cluster and verbose output
60+
eopf-geozarr convert input.zarr output.zarr --dask-cluster --verbose
61+
5462
# Get information about a dataset
5563
eopf-geozarr info input.zarr
5664

@@ -111,6 +119,42 @@ aws configure
111119
- **Error Handling**: Provides helpful error messages for S3 configuration issues
112120
- **Performance**: Optimized for S3 with proper chunking and retry logic
113121

122+
### Parallel Processing with Dask
123+
124+
The library supports parallel processing using dask clusters for improved performance on large datasets:
125+
126+
```bash
127+
# Enable dask cluster for parallel processing
128+
eopf-geozarr convert input.zarr output.zarr --dask-cluster
129+
130+
# With verbose output to see cluster information
131+
eopf-geozarr convert input.zarr output.zarr --dask-cluster --verbose
132+
```
133+
134+
#### Dask Features
135+
136+
- **Local Cluster**: Automatically starts a local dask cluster with multiple workers
137+
- **Dashboard Access**: Provides access to the dask dashboard for monitoring (shown in verbose mode)
138+
- **Automatic Cleanup**: Properly closes the cluster even if errors occur during processing
139+
- **Chunk Alignment**: Automatically aligns Zarr chunks with dask chunks to prevent data corruption
140+
- **Memory Efficiency**: Better memory management through parallel chunk processing
141+
- **Error Handling**: Graceful handling of dask import errors with helpful installation instructions
142+
143+
#### Chunk Alignment
144+
145+
The library includes advanced chunk alignment logic to prevent the common issue of overlapping chunks when using dask:
146+
147+
- **Smart Detection**: Automatically detects if data is dask-backed and uses existing chunk structure
148+
- **Aligned Calculation**: Uses `calculate_aligned_chunk_size()` to find optimal chunk sizes that divide evenly into data dimensions
149+
- **Proper Rechunking**: Ensures datasets are rechunked to match encoding before writing
150+
- **Fallback Logic**: For non-dask arrays, uses reasonable chunk sizes that don't exceed data dimensions
151+
152+
This prevents errors like:
153+
```
154+
❌ Failed to write tci after 2 attempts: Specified Zarr chunks encoding['chunks']=(1, 3660, 3660)
155+
for variable named 'tci' would overlap multiple Dask chunks
156+
```
157+
114158
#### S3 Python API
115159

116160
```python
@@ -202,7 +246,22 @@ Downsample a 2D array using block averaging.
202246

203247
#### `calculate_aligned_chunk_size`
204248

205-
Calculate a chunk size that aligns well with the data dimension.
249+
Calculate a chunk size that divides evenly into the dimension size. This ensures that Zarr chunks align properly with the data dimensions, preventing chunk overlap issues when writing with Dask.
250+
251+
**Parameters:**
252+
- `dimension_size` (int): Size of the dimension to chunk
253+
- `target_chunk_size` (int): Desired chunk size
254+
255+
**Returns:**
256+
- `int`: Aligned chunk size that divides evenly into dimension_size
257+
258+
**Example:**
259+
```python
260+
from eopf_geozarr.conversion.utils import calculate_aligned_chunk_size
261+
262+
# For a dimension of size 5490 with target chunk size 3660
263+
aligned_size = calculate_aligned_chunk_size(5490, 3660) # Returns 2745
264+
```
206265

207266
#### `is_grid_mapping_variable`
208267

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ classifiers = [
3030
requires-python = ">=3.11"
3131
dependencies = [
3232
"pydantic-zarr@git+https://github.com/zarr-developers/pydantic-zarr",
33-
"zarr>=3.0.10",
33+
# "zarr>=3.0.10",
34+
"zarr@git+https://github.com/zarr-developers/zarr-python",
3435
"xarray>=2025.7.1",
3536
"dask[array,distributed]>=2025.5.1",
3637
"numpy>=2.3.1",

0 commit comments

Comments
 (0)