Move manifest to s3 and split workflow

thodson-usgs · thodson-usgs · commit 351761b3f646 · 2024-10-25T11:47:51.000-05:00
diff --git a/examples/virtual-rechunk/README.md b/examples/virtual-rechunk/README.md
@@ -1,11 +1,17 @@
 # Rechunk a virtual dataset
 
-This example demonstrates how to rechunk a collection of necdf files on s3
-into a single zarr store.
+This example demonstrates how to rechunk a collection of netcdf files on s3 into a single zarr store.
+
+Most rechunking workflows can be conceptualized as two steps;
+keeping them separate typically provides greater flexibility than combining them.
+The first (staging) step is mostly embarrassingly parallel and prepares the input data.
+In this example, we construct a virtual zarr dataset using `lithops`,
+but we could incorporate data transfer and reprocessing as part of staging.
+
+The second (rechunking) step rechunks the staged data.
+Here, we rechunk the virtual zarr using `cubed`,
+but in theory, `dask` or other map-reduce frameworks may be used.
 
-First, lithops and Virtualizarr construct a virtual dataset comprised of the
-netcdf files on s3. Then, xarray-cubed rechunks the virtual dataset into a
-zarr.
 
 ## Credits
 Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook
@@ -14,31 +20,36 @@ by norlandrhagen.
 Please, contribute improvements.
 
 
-
 1. Set up a Python environment
 ```bash
 conda create --name virtualizarr-rechunk -y python=3.11
 conda activate virtualizarr-rechunk
 pip install -r requirements.txt
 ```
 
-1. Set up cubed executor for [lithops-aws](https://github.com/cubed-dev/cubed/blob/main/examples/lithops/aws/README.md) by editing `./lithops.yaml` with your `bucket` and `execution_role`.
-```bash
+2. Set up cubed executor for [lithops-aws](https://github.com/cubed-dev/cubed/blob/main/examples/lithops/aws/README.md) by editing `./lithops.yaml` with your `bucket` and `execution_role`.
 
-1. Build a runtime image for Cubed
+3. Build a runtime image for `cubed`
 ```bash
 export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml
-export CUBED_CONFIG=$(pwd)
+export CUBED_CONFIG=$(pwd)/cubed.yaml
+# set the s3 location for storing results (use a bucket you can write to)
+export BUCKET_URL=s3://wma-uncertainty/scratch
 lithops runtime build -b aws_lambda -f Dockerfile_virtualizarr virtualizarr-runtime
 ```
 
-1. Run the script
+4. Stage the virtual zarr using `lithops`
+```bash
+python create-virtualzarr.py
+```
+
+5. Rechunk the virtual zarr with `cubed` (using `lithops`)
 ```bash
 python cubed-rechunk.py
 ```
 
 ## Cleaning up
-To rebuild the Litops image, delete the existing one by running
+To rebuild the `lithops` image, delete the existing one by running
 ```bash
 lithops runtime delete -b aws_lambda -d virtualizarr-runtime
 ```
diff --git a/examples/virtual-rechunk/create-virtualzarr.py b/examples/virtual-rechunk/create-virtualzarr.py
@@ -0,0 +1,59 @@
+# Use lithops to construct a virtual zarr from netcdf files on s3.
+
+import fsspec
+import lithops
+import os
+import xarray as xr
+
+from virtualizarr import open_virtual_dataset
+
+bucket_url = os.getenv("BUCKET_URL")
+
+fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True)
+files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*")
+file_pattern = sorted(["s3://" + f for f in files_paths])
+
+# Truncate file_pattern while debugging
+file_pattern = file_pattern[:4]
+
+print(f"{len(file_pattern)} file paths were retrieved.")
+
+
+def map_references(fil):
+    """ Map function to open virtual datasets.
+    """
+    vds = open_virtual_dataset(
+        fil,
+        indexes={},
+        loadable_variables=['Time'],
+        cftime_variables=['Time'],
+    )
+    return vds
+
+
+def reduce_references(results):
+    """ Reduce to concat virtual datasets.
+    """
+    combined_vds = xr.combine_nested(
+        results,
+        concat_dim=["Time"],
+        coords="minimal",
+        compat="override",
+    )
+
+    return combined_vds
+
+
+fexec = lithops.FunctionExecutor(config_file="lithops.yaml")
+
+futures = fexec.map_reduce(
+    map_references,
+    file_pattern,
+    reduce_references,
+    spawn_reducer=100,
+)
+
+ds = futures.get_result()
+
+# Save the virtual zarr manifest to s3
+ds.virtualize.to_kerchunk(f"{bucket_url}/combined.json", format="json")
diff --git a/examples/virtual-rechunk/lithops.yaml b/examples/virtual-rechunk/lithops.yaml
@@ -11,4 +11,4 @@ aws_lambda:
     runtime_memory: 2000
 
 aws_s3:
-    bucket: arn:aws:s3:::cubed-thodson-temp
+    storage_bucket: cubed-thodson-temp
diff --git a/examples/virtual-rechunk/requirements.txt b/examples/virtual-rechunk/requirements.txt
@@ -1,4 +1,4 @@
-boto
+boto3
 cftime
 cubed
 cubed-xarray
diff --git a/examples/virtual-rechunk/virtual-rechunk.py b/examples/virtual-rechunk/virtual-rechunk.py
@@ -1,85 +1,34 @@
-# Rechunk a collection of necdf files on s3 into a single zarr store.
+# Rechunk a virtual zarr on s3 into a single zarr store using xarray-cubed.
 #
-# First, lithops and Virtualizarr construct a virtual dataset comprised of the
-# netcdf files on s3. Then, xarray-cubed rechunks the virtual dataset into a
-# zarr.
+# Prior to running this script, create the virtual zarr with
+# > python create-virtualzarr.py
 #
-# Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook
-# by norlandrhagen.
-#
-# Please, contribute improvements.
+# NOTE: In jupyter, open_dataset seems to cache the json, such that changes
+# aren't propagated until the kernel is restarted.
 
-import fsspec
-import lithops
+import os
 import xarray as xr
 
-from virtualizarr import open_virtual_dataset
-
-fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True)
-files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*")
-file_pattern = sorted(["s3://" + f for f in files_paths])
-
-# truncate file_pattern while debugging
-file_pattern = file_pattern[:4]
-
-print(f"{len(file_pattern)} file paths were retrieved.")
-
-
-def map_references(fil):
-    """ Map function to open virtual datasets.
-    """
-    vds = open_virtual_dataset(fil,
-                               indexes={},
-                               loadable_variables=['Time'],
-                               cftime_variables=['Time'],
-                               )
-    return vds
-
-
-def reduce_references(results):
-    """ Reduce to concat virtual datasets.
+bucket_url = os.getenv("BUCKET_URL")
 
-    """
-    combined_vds = xr.combine_nested(
-        results,
-        concat_dim=["Time"],
-        coords="minimal",
-        compat="override",
-    )
-    # possibly write parquet to s3 here
-    return combined_vds
-
-
-fexec = lithops.FunctionExecutor(config_file="lithops.yaml")
-
-futures = fexec.map_reduce(
-    map_references,
-    file_pattern,
-    reduce_references,
-    spawn_reducer=100,
+combined_ds = xr.open_dataset(
+    f"{bucket_url}/combined.json", # location must be accessible to workers
+    engine="kerchunk",
+    chunks={},
+    chunked_array_type="cubed",
 )
 
-ds = futures.get_result()
-ds.virtualize.to_kerchunk("combined.json", format="json")
-
-# NOTE: In jupyter, open_dataset seems to cache the json, such that changes
-# aren't propogated until the kernel is restarted.
-combined_ds = xr.open_dataset("combined.json",
-                              engine="kerchunk",
-                              chunks={},
-                              chunked_array_type="cubed",
-                              )
-
-combined_ds['Time'].attrs = {}  # to_zarr complains about attrs
+combined_ds['Time'].attrs = {}  # otherwise to_zarr complains about attrs
 
 rechunked_ds = combined_ds.chunk(
     chunks={'Time': 5, 'south_north': 25, 'west_east': 32},
     chunked_array_type="cubed",
 )
 
-rechunked_ds.to_zarr("rechunked.zarr",
-                     mode="w",
-                     encoding={},  # TODO
-                     consolidated=True,
-                     safe_chunks=False,
-                     )
+rechunked_ds.to_zarr(
+    f"{bucket_url}/rechunked.zarr",
+    mode="w",
+    encoding={},  # TODO
+    consolidated=True,
+    safe_chunks=False,
+)

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-boto`
	`1`	`+boto3`
`2`	`2`	`cftime`
`3`	`3`	`cubed`
`4`	`4`	`cubed-xarray`