Add sharding support for GeoZarr conversion and CLI (#38)

emmanuelmathot · d-v-b · web-flow · commit 01bdda8e1e56 · 2025-09-29T09:40:08.000+02:00
* feat: add sharding support for GeoZarr conversion and CLI

* update launch configurations for GeoZarr conversion with new data sources and adjusted parameters

* feat: enable sharding in GeoZarr conversion launch configuration

* fix: update sharding codec handling in _create_sharded_encoding function

* refactor: streamline sharding configuration in _create_geozarr_encoding function

* feat: enhance sharding logic in _create_geozarr_encoding and add _calculate_shard_dimension utility

* feat: improve sharding configuration and validation in _create_geozarr_encoding

* fix: refine shard dimension calculation and improve divisor check in utility functions

* Add dataset tree structure and test script for sharding fix

- Introduced a new dataset tree structure for Sentinel-2 data, detailing conditions, quality, and measurements.
- Added a comprehensive test script to verify the sharding fix for GeoZarr conversion.
- Implemented tests for shard dimension calculations and encoding creation with sharding enabled/disabled.
- Enhanced output for better debugging and validation of shard dimensions against chunk dimensions.

* feat: enable sharding in Dask cluster setup and enhance chunking logic for sharded variables

* Remove outdated dataset tree structure and test script for sharding fix

- Deleted the `dataset_tree_simplified.txt` file as it is no longer needed.
- Removed the `test_sharding_fix.py` script which was used to verify the sharding fix for GeoZarr conversion.

* feat: update GeoZarr encoding to include optional shards attribute in XarrayEncodingJSON

* fix: update test for calculate_aligned_chunk_size to assert exact target chunk size when no suitable divisor is found

* declare types for ambiguous variables

---------

Co-authored-by: Davis Vann Bennett <davis.v.bennett@gmail.com>
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -13,15 +13,16 @@
             "module": "eopf_geozarr",
             "args": [
                 "convert",
-                "https://objectstore.eodc.eu:2222/e05ab01a9d56408d82ac32d69a5aae2a:sample-data/tutorial_data/cpm_v253/S2B_MSIL1C_20250113T103309_N0511_R108_T32TLQ_20250113T122458.zarr",
-                "./tests-output/eopf_geozarr/s2b_test.zarr",
-                "--groups", "/measurements/reflectance/r10m", "/measurements/reflectance/r20m", "/measurements/reflectance/r60m", "/quality/l1c_quicklook/r10m",
+                "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202508-s02msil2a/11/products/cpm_v256/S2C_MSIL2A_20250811T112131_N0511_R037_T29TPF_20250811T152216.zarr",
+                "./tests-output/eopf_geozarr/s2l2_test.zarr",
+                "--groups", "/measurements/reflectance/r10m", "/measurements/reflectance/r20m", "/measurements/reflectance/r60m", "/quality/l2a_quicklook/r10m",
                 "--crs-groups", "/conditions/geometry",
-                "--spatial-chunk", "4096",
+                "--spatial-chunk", "512",
                 "--min-dimension", "256",
                 "--tile-width", "256",
                 "--max-retries", "2",
-                "--verbose"
+                "--verbose",
+                "--enable-sharding"
             ],
             "cwd": "${workspaceFolder}",
             "justMyCode": false,
@@ -99,18 +100,27 @@
                 // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202507-s02msil2a/04/products/cpm_v256/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr",
                 // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202508-s02msil2a/04/products/cpm_v256/S2B_MSIL2A_20250804T103629_N0511_R008_T31TDH_20250804T130722.zarr",
                 // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202508-s02msil2a/07/products/cpm_v256/S2B_MSIL2A_20250807T104619_N0511_R051_T31TDH_20250807T131144.zarr",
-                "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202508-s02msil2a/11/products/cpm_v256/S2C_MSIL2A_20250811T112131_N0511_R037_T29TPF_20250811T152216.zarr",
-                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr",
-                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S2B_MSIL2A_20250804T103629_N0511_R008_T31TDH_20250804T130722.zarr",
-                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S2B_MSIL2A_20250807T104619_N0511_R051_T31TDH_20250807T131144.zarr",
-                "s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S2C_MSIL2A_20250811T112131_N0511_R037_T29TPF_20250811T152216.zarr",
+                // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202508-s02msil2a/11/products/cpm_v256/S2C_MSIL2A_20250811T112131_N0511_R037_T29TPF_20250811T152216.zarr",
+                // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s02msil2a/13/products/cpm_v256/S2C_MSIL2A_20250913T095041_N0511_R079_T33TVF_20250913T151113.zarr",
+                // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s02msil2a/21/products/cpm_v256/S2B_MSIL2A_20250921T100029_N0511_R122_T32TQM_20250921T135752.zarr",
+                // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s02msil2a/21/products/cpm_v256/S2B_MSIL2A_20250921T100029_N0511_R122_T33TTG_20250921T135752.zarr",
+                "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s02msil2a/08/products/cpm_v256/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2B_MSIL2A_20250804T103629_N0511_R008_T31TDH_20250804T130722.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2B_MSIL2A_20250807T104619_N0511_R051_T31TDH_20250807T131144.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2C_MSIL2A_20250811T112131_N0511_R037_T29TPF_20250811T152216.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2C_MSIL2A_20250913T095041_N0511_R079_T33TVF_20250913T151113.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2B_MSIL2A_20250921T100029_N0511_R122_T32TQM_20250921T135752.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2B_MSIL2A_20250921T100029_N0511_R122_T33TTG_20250921T135752.zarr",
+                "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2A_MSIL2A_20250908T100041_N0511_R122_T32TQM_20250908T115116.zarr",
                 "--groups", "/measurements/reflectance/r10m", "/measurements/reflectance/r20m", "/measurements/reflectance/r60m", "/quality/l2a_quicklook/r10m",
                 "--crs-groups", "/conditions/geometry",
-                "--spatial-chunk", "512",
+                "--spatial-chunk", "256",
                 "--min-dimension", "256",
                 "--tile-width", "256",
                 "--max-retries", "2",
                 "--dask-cluster",
+                "--enable-sharding",
                 "--verbose"
             ],
             "cwd": "${workspaceFolder}",
@@ -156,8 +166,12 @@
             "module": "eopf_geozarr",
             "args": [
                 "convert",
-                "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s01siwgrh/12/products/cpm_v256/S1C_IW_GRDH_1SDV_20250912T053648_20250912T053713_004087_0081FD_5AA4.zarr",
-                "s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S1C_IW_GRDH_1SDV_20250912T053648_20250912T053713_004087_0081FD_5AA4.zarr",
+                // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:notebook-data/tutorial_data/cpm_v260/S1A_IW_GRDH_1SDV_20241124T180254_20241124T180319_056700_06F516_BA27.zarr",
+                "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:notebook-data/tutorial_data/cpm_v260/S1A_IW_GRDH_1SDV_20241218T180252_20241218T180317_057050_0702F2_0BC2.zarr",
+                // "https://objects.eodc.eu/e05ab01a9d56408d82ac32d69a5aae2a:202509-s01siwgrh/12/products/cpm_v256/S1A_IW_GRDH_1SDV_20241230T180251_20241230T180316_057225_0709DD_15AC.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S1A_IW_GRDH_1SDV_20241124T180254_20241124T180319_056700_06F516_BA27_2.zarr",
+                "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel1-l1-grd/S1A_IW_GRDH_1SDV_20241218T180252_20241218T180317_057050_0702F2_0BC2.zarr",
+                // "s3://esa-zarr-sentinel-explorer-fra/tests-output/eopf_geozarr/S1A_IW_GRDH_1SDV_20241230T180251_20241230T180316_057225_0709DD_15AC.zarr",
                 "--groups", "/measurements",
                 "--gcp-group", "/conditions/gcp",
                 // "--crs-groups", "/conditions/geometry",
@@ -205,7 +219,7 @@
             "module": "eopf_geozarr",
             "args": [
                 "info",
-                "./tests-output/eopf_geozarr/s2b_test.zarr",
+                "./tests-output/eopf_geozarr/s2l2_test.zarr",
                 "--verbose",
                 "--html-output", "dataset_info.html"
             ],
@@ -224,7 +238,7 @@
             "module": "eopf_geozarr",
             "args": [
                 "info",
-                "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2B_MSIL2A_20250921T100029_N0511_R122_T33TTG_20250921T135752.zarr",
+                "s3://esa-zarr-sentinel-explorer-fra/tests-output/sentinel-2-l2a/S2A_MSIL2A_20250704T094051_N0511_R036_T33SWB_20250704T115824.zarr",
                 "--verbose",
                 "--html-output", "dataset_info.html"
             ],
diff --git a/src/eopf_geozarr/cli.py b/src/eopf_geozarr/cli.py
@@ -51,8 +51,9 @@ def setup_dask_cluster(enable_dask: bool, verbose: bool = False) -> Optional[Any
     try:
         from dask.distributed import Client
 
-        # Set up local cluster
-        client = Client()  # set up local cluster
+        # Set up local cluster with high memory limits
+        client = Client(memory_limit="8GB")  # set up local cluster
+        # client = Client()  # set up local cluster
 
         if verbose:
             print(f"🚀 Dask cluster started: {client}")
@@ -175,6 +176,7 @@ def convert_command(args: argparse.Namespace) -> None:
             max_retries=args.max_retries,
             crs_groups=args.crs_groups,
             gcp_group=args.gcp_group,
+            enable_sharding=args.enable_sharding,
         )
 
         print("✅ Successfully converted EOPF dataset to GeoZarr format")
@@ -1109,6 +1111,11 @@ def create_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Start a local dask cluster for parallel processing of chunks",
     )
+    convert_parser.add_argument(
+        "--enable-sharding",
+        action="store_true",
+        help="Enable zarr sharding for spatial dimensions of each variable",
+    )
     convert_parser.set_defaults(func=convert_command)
 
     # Info command
diff --git a/src/eopf_geozarr/conversion/geozarr.py b/src/eopf_geozarr/conversion/geozarr.py
@@ -57,6 +57,7 @@ def create_geozarr_dataset(
     max_retries: int = 3,
     crs_groups: Iterable[str] | None = None,
     gcp_group: str | None = None,
+    enable_sharding: bool = False,
 ) -> xr.DataTree:
     """
     Create a GeoZarr-spec 0.4 compliant dataset from EOPF data.
@@ -81,6 +82,8 @@ def create_geozarr_dataset(
         Iterable of group names that need CRS information added on best-effort basis
     gcp_group : str, optional
         Group name where GCPs (Ground Control Points) are located.
+    enable_sharding : bool, default False
+        Enable zarr sharding for spatial dimensions of each variable
 
     Returns
     -------
@@ -90,6 +93,9 @@ def create_geozarr_dataset(
     dt = dt_input.copy()
     compressor = BloscCodec(cname="zstd", clevel=3, shuffle="shuffle", blocksize=0)
 
+    if enable_sharding:
+        print("🔧 Zarr sharding enabled for spatial dimensions")
+
     if _is_sentinel1(dt_input):
         if gcp_group is None:
             raise ValueError(
@@ -132,6 +138,7 @@ def create_geozarr_dataset(
         max_retries,
         crs_groups,
         gcp_group,
+        enable_sharding,
     )
 
     # Consolidate metadata at the root level AFTER all groups are written
@@ -230,6 +237,7 @@ def iterative_copy(
     max_retries: int = 3,
     crs_groups: Iterable[str] | None = None,
     gcp_group: str | None = None,
+    enable_sharding: bool = False,
 ) -> xr.DataTree:
     """
     Iteratively copy groups from original DataTree to GeoZarr DataTree.
@@ -301,6 +309,7 @@ def iterative_copy(
                 min_dimension=min_dimension,
                 tile_width=tile_width,
                 gcp_group=gcp_group,
+                enable_sharding=enable_sharding,
             )
             written_groups.add(current_group_path)
             continue
@@ -407,6 +416,7 @@ def write_geozarr_group(
     min_dimension: int = 256,
     tile_width: int = 256,
     gcp_group: str | None = None,
+    enable_sharding: bool = False,
 ) -> xr.DataTree:
     """
     Write a group to a GeoZarr dataset with multiscales support.
@@ -451,7 +461,7 @@ def write_geozarr_group(
     dt.attrs = ds.attrs.copy()
 
     # Create encoding for all variables
-    encoding = _create_geozarr_encoding(ds, compressor, spatial_chunk)
+    encoding = _create_geozarr_encoding(ds, compressor, spatial_chunk, enable_sharding)
 
     # Write native data in the group 0 (overview level 0)
     native_dataset_group_name = f"{group_name}/0"
@@ -492,6 +502,7 @@ def write_geozarr_group(
             tile_width=tile_width,
             spatial_chunk=spatial_chunk,
             ds_gcp=ds_gcp,
+            enable_sharding=enable_sharding,
         )
     except Exception as e:
         print(
@@ -517,6 +528,7 @@ def create_geozarr_compliant_multiscales(
     tile_width: int = 256,
     spatial_chunk: int = 4096,
     ds_gcp: xr.Dataset | None = None,
+    enable_sharding: bool = False,
 ) -> Dict[str, Any]:
     """
     Create GeoZarr-spec compliant multiscales following the specification exactly.
@@ -674,10 +686,13 @@ def create_geozarr_compliant_multiscales(
             native_bounds,
             data_vars,
             ds_gcp_overview,
+            enable_sharding,
         )
 
         # Create encoding for this overview level
-        encoding = _create_geozarr_encoding(overview_ds, compressor, spatial_chunk)
+        encoding = _create_geozarr_encoding(
+            overview_ds, compressor, spatial_chunk, enable_sharding
+        )
 
         # Write overview level
         overview_path = fs_utils.normalize_path(f"{output_path}/{group_name}/{level}")
@@ -885,6 +900,7 @@ def create_overview_dataset_all_vars(
     native_bounds: Tuple[float, float, float, float],
     data_vars: Sequence[Hashable],
     ds_gcp: xr.Dataset | None = None,
+    enable_sharding: bool = False,
 ) -> xr.Dataset:
     """
     Create an overview dataset containing all variables for a specific level.
@@ -1090,7 +1106,21 @@ def write_dataset_band_by_band_with_validation(
         for attempt in range(max_retries):
             try:
                 # Ensure the dataset is properly chunked to align with encoding
-                if var in var_encoding and "chunks" in var_encoding[var]:
+                if (
+                    var in var_encoding
+                    and "shards" in var_encoding[var]
+                    and var_encoding[var]["shards"] is not None
+                ):
+                    # For sharded variables, use the shards dimensions
+                    shard_dims = var_encoding[var].get("shards", None)
+                    if shard_dims is not None:
+                        var_dims = single_var_ds[var].dims
+                        chunk_dict = {}
+                        for i, dim in enumerate(var_dims):
+                            if i < len(shard_dims):
+                                chunk_dict[dim] = shard_dims[i]
+                        single_var_ds[var] = single_var_ds[var].chunk(chunk_dict)
+                elif var in var_encoding and "chunks" in var_encoding[var]:
                     target_chunks = var_encoding[var]["chunks"]
                     # Create chunk dict using the actual dimensions of the variable
                     var_dims = single_var_ds[var].dims
@@ -1442,10 +1472,11 @@ def _create_encoding(
 
 
 def _create_geozarr_encoding(
-    ds: xr.Dataset, compressor: Any, spatial_chunk: int
+    ds: xr.Dataset, compressor: Any, spatial_chunk: int, enable_sharding: bool = False
 ) -> dict[Hashable, XarrayEncodingJSON]:
     """Create encoding for GeoZarr dataset variables."""
     encoding: dict[Hashable, XarrayEncodingJSON] = {}
+    chunks: tuple[int, ...]
     for var in ds.data_vars:
         if utils.is_grid_mapping_variable(ds, var):
             encoding[var] = {"compressors": None}
@@ -1458,12 +1489,54 @@ def _create_geozarr_encoding(
                     utils.calculate_aligned_chunk_size(width, spatial_chunk),
                     utils.calculate_aligned_chunk_size(height, spatial_chunk),
                 )
+
+                if len(data_shape) == 3:
+                    chunks = (1, spatial_chunk_aligned, spatial_chunk_aligned)
+                else:
+                    chunks = (spatial_chunk_aligned, spatial_chunk_aligned)
             else:
                 spatial_chunk_aligned = spatial_chunk
+                chunks = (spatial_chunk_aligned,)
+
+            shards: tuple[int, ...] | None = None
+
+            if enable_sharding:
+                # Calculate shard dimensions that are divisible by chunk dimensions
+                if len(data_shape) == 3:
+                    # For 3D data (time, y, x), ensure shard dimensions are divisible by chunks
+                    shard_time = data_shape[0]  # Keep full time dimension
+                    shard_y = _calculate_shard_dimension(data_shape[1], chunks[1])
+                    shard_x = _calculate_shard_dimension(data_shape[2], chunks[2])
+                    shards = (shard_time, shard_y, shard_x)
+                    print(
+                        f"  🔧 Sharding config for {var}: data_shape={data_shape}, chunks={chunks}, shards={shards}"
+                    )
+                elif len(data_shape) == 2:
+                    # For 2D data (y, x), ensure shard dimensions are divisible by chunks
+                    shard_y = _calculate_shard_dimension(data_shape[0], chunks[0])
+                    shard_x = _calculate_shard_dimension(data_shape[1], chunks[1])
+                    shards = (shard_y, shard_x)
+                    print(
+                        f"  🔧 Sharding config for {var}: data_shape={data_shape}, chunks={chunks}, shards={shards}"
+                    )
+                else:
+                    # For 1D data, use the full dimension
+                    shards = (data_shape[0],)
+                    print(
+                        f"  🔧 Sharding config for {var}: data_shape={data_shape}, chunks={chunks}, shards={shards}"
+                    )
+
+                # Validate that shards are evenly divisible by chunks
+                for i, (shard_dim, chunk_dim) in enumerate(zip(shards, chunks)):
+                    if shard_dim % chunk_dim != 0:
+                        print(
+                            f"  ⚠️  Warning: Shard dimension {shard_dim} not evenly divisible by chunk dimension {chunk_dim} at axis {i}"
+                        )
 
             encoding[var] = {
-                "chunks": (spatial_chunk_aligned, spatial_chunk_aligned),
+                "chunks": chunks,
                 "compressors": compressor,
+                "shards": shards,
             }
 
     # Add coordinate encoding
@@ -1618,6 +1691,46 @@ def _add_grid_mapping_variable(
                 print(f"  Added grid_mapping attribute to {var_name}")
 
 
+def _calculate_shard_dimension(data_dim: int, chunk_dim: int) -> int:
+    """
+    Calculate a shard dimension that is compatible with the chunk dimension.
+
+    For Zarr v3 sharding with Dask, the shard dimension must be evenly
+    divisible by the chunk dimension to avoid checksum mismatches.
+
+    Parameters
+    ----------
+    data_dim : int
+        Size of the data dimension
+    chunk_dim : int
+        Size of the chunk dimension (assumed positive; it is not validated here)
+
+    Returns
+    -------
+    int
+        data_dim if chunk_dim >= data_dim, else the largest multiple of chunk_dim that fits
+    """
+    # If chunk is larger than (or equal to) data dimension, the effective chunk will be data_dim
+    # In this case, shard should also be data_dim to maintain divisibility
+    if chunk_dim >= data_dim:
+        return data_dim
+
+    # Calculate how many complete chunks fit in the data dimension
+    num_complete_chunks = data_dim // chunk_dim
+
+    # With 2+ complete chunks, walk multipliers downward and take the first multiple that fits
+    if num_complete_chunks >= 2:
+        # The range stops before 2, so a multiplier of 2 is left to the fallback below
+        for multiplier in range(num_complete_chunks + 1, 2, -1):
+            shard_size = multiplier * chunk_dim
+            if shard_size <= data_dim:
+                return shard_size
+
+    # Fallback (the loop above did not return): use the largest multiple of chunk_dim that fits
+    # If num_complete_chunks is not positive, use data_dim (this handles edge cases)
+    return num_complete_chunks * chunk_dim if num_complete_chunks > 0 else data_dim
+
+
 def _is_sentinel1(dt: xr.DataTree) -> bool:
     """Return True if the input DataTree represents a Sentinel-1 product."""
     stac_props = dt.attrs.get("stac_discovery", {}).get("properties", {})
diff --git a/src/eopf_geozarr/conversion/utils.py b/src/eopf_geozarr/conversion/utils.py
@@ -124,8 +124,8 @@ def calculate_aligned_chunk_size(dimension_size: int, target_chunk_size: int) ->
         return dimension_size
 
     # Find the largest divisor of dimension_size that is <= target_chunk_size
-    for chunk_size in range(target_chunk_size, 0, -1):
-        if dimension_size % chunk_size < 0.1 * chunk_size:
+    for chunk_size in range(target_chunk_size, int(target_chunk_size * 0.51), -1):
+        if dimension_size % chunk_size == 0:
             return chunk_size
 
     # If no divisor is found, return the closest value to target_chunk_size
diff --git a/src/eopf_geozarr/tests/test_conversion.py b/src/eopf_geozarr/tests/test_conversion.py
diff --git a/src/eopf_geozarr/types.py b/src/eopf_geozarr/types.py