From 0f0de9dc82e02491515f717839097bbb50f60b28 Mon Sep 17 00:00:00 2001
From: Tejas <tejasdahiya0007@gmail.com>
Date: Sun, 31 May 2026 16:15:28 -0500
Subject: [PATCH] Add PEcAn-to-ILAMB GeoTIFF conversion pipeline (GH-4019)

Adds a Python pipeline in PEcAn.benchmark to convert downscaled SDA
reanalysis GeoTIFF ensemble maps into ILAMB-compatible CF-1.8 netCDF
for cVeg, cSoil, mrsol, and lai, with unit conversions and 13 unit tests.
---
 CHANGELOG.md                                  |   1 +
 CITATION.cff                                  |   1 +
 modules/benchmark/NEWS.md                     |   4 +
 modules/benchmark/inst/ilamb/README.md        | 105 +++++++
 .../inst/ilamb/convert_geotiff_to_ilamb.py    | 269 ++++++++++++++++++
 modules/benchmark/inst/ilamb/test_convert.py  | 162 +++++++++++
 6 files changed, 542 insertions(+)
 create mode 100644 modules/benchmark/inst/ilamb/README.md
 create mode 100644 modules/benchmark/inst/ilamb/convert_geotiff_to_ilamb.py
 create mode 100644 modules/benchmark/inst/ilamb/test_convert.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 55550127e09..17f279b811c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ For more information about this file see also [Keep a Changelog](http://keepacha
 ## Unreleased
 
 ### Added
+- Added `inst/ilamb/` pipeline in PEcAn.benchmark to convert downscaled SDA reanalysis GeoTIFFs into ILAMB-compatible CF netCDF for carbon-cycle benchmarking (#4019).
 - Added PEcAn.PEPRMT model, including a demo run with example data
 - Add `format_try_for_ma()` and `try_trait_mapping()` to `PEcAn.data.remote` to convert trait data from the external TRY database into the tabular format required by the PEcAn meta-analysis module (#3717).
 - Add function `qsub_sda()` for submitting SDA batch jobs by splitting a large number of sites into multiple small groups of sites (#3634).
diff --git a/CITATION.cff b/CITATION.cff
index 23bb09dd918..b50e09b7227 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -30,6 +30,7 @@ authors:
   - given-names: David LeBauer
     affiliation: University of Arizona
     orcid: 'https://orcid.org/0000-0001-7228-053X'
+  - given-names: Tejas Dahiya
   - given-names: Michael Dietze
     orcid: 'https://orcid.org/0000-0002-2324-2518'
     affiliation: Boston University
diff --git a/modules/benchmark/NEWS.md b/modules/benchmark/NEWS.md
index 521e5560b30..c09da5ed37e 100644
--- a/modules/benchmark/NEWS.md
+++ b/modules/benchmark/NEWS.md
@@ -1,5 +1,9 @@
 # PEcAn.benchmark 1.7.5.9000
 
+## Added
+
+* Added Python pipeline (`inst/ilamb/`) to convert PEcAn SDA reanalysis GeoTIFFs to ILAMB-compatible CF netCDF (cVeg, cSoil, mrsol, lai) with unit tests (#4019).
+
 ## Fixed
 
 * `metric_PPMC()`: added `use = "pairwise.complete.obs"` to `stats::cor()` so the Pearson correlation is computed on available pairs rather than returning `NA` whenever any observation is missing. This matches the behaviour of `metric_cor()` in the same package.
diff --git a/modules/benchmark/inst/ilamb/README.md b/modules/benchmark/inst/ilamb/README.md
new file mode 100644
index 00000000000..3be450b9d94
--- /dev/null
+++ b/modules/benchmark/inst/ilamb/README.md
@@ -0,0 +1,105 @@
+# PEcAn-to-ILAMB Conversion
+
+Converts PEcAn SDA carbon reanalysis outputs (downscaled GeoTIFF ensemble maps)
+into ILAMB-compatible CF-convention netCDF files for benchmarking against
+TRENDY, CMIP, and observational datasets.
+
+## Overview
+
+The PEcAn North American carbon reanalysis (Zhang et al. 2026) provides
+downscaled 1 km ensemble maps of four state variables. This tool reads those
+GeoTIFFs, computes the ensemble mean, coarsens to ILAMB's default 0.5 degree
+resolution, applies unit conversions, and writes CF-1.8 compliant netCDF that
+ILAMB can ingest directly.
+
+## Input
+
+GeoTIFF ensemble maps organized as:
+
+```
+<input_dir>/<year>/<variable>_<year>/ensemble_<n>_<year>_<variable>.tiff
+```
+
+- 13 years (2012-2024), annual snapshots fixed to July 15
+- 4 variables, 100 ensemble members each
+- 1 km resolution (9360 x 19080), EPSG:4326
+
+## Variable mapping and unit conversions
+
+| PEcAn variable | CMOR name | Source units | Conversion | ILAMB units |
+|----------------|-----------|--------------|------------|-------------|
+| AbvGrndWood    | cVeg      | Mg C ha-1    | x 0.1      | kg m-2      |
+| TotSoilCarb    | cSoil     | kg C m-2     | none       | kg m-2      |
+| SoilMoistFrac  | mrsol     | vol. percent | x 9.98     | kg m-2      |
+| LAI            | lai       | m2 m-2       | none       | m2 m-2      |
+
+**Aboveground biomass** is already a carbon density (Mg C ha-1), so the
+conversion to kg m-2 is purely unit scaling: 1 Mg ha-1 = 0.1 kg m-2.
+
+**Soil moisture** is volumetric water content expressed as percent over the
+0-100 cm root zone. Conversion to mass per area:
+
+```
+kg m-2 = percent / 100 x 1.0 m depth x 998 kg/m3  =  percent x 9.98
+```
+
+The 0-100 cm root-zone depth was confirmed with the dataset author
+(D. Zhang, pers. comm.), and matches the depth span of the ILAMB Wang2021
+soil-moisture benchmark (0-10, 10-30, 30-50, 50-100 cm layers).
+
+## Output
+
+CF-1.8 compliant netCDF on a 0.5 degree regular grid (156 x 318) covering the
+North American study area (7-85N, 179-20W):
+
+- `<output_dir>/<cmor_name>/<cmor_name>_<year>.nc` (one file per year)
+- `<output_dir>/<cmor_name>.nc` (merged multi-year file)
+
+Latitude is monotonically increasing (south to north); coordinates are rounded
+to 0.01 degrees; time is encoded as days since 1850-01-01 with full-year
+bounds.
+
+## Usage
+
+```bash
+module load python3 gcc/13.2.0
+export PATH=$HOME/.local/bin:$PATH
+
+# Convert all variables, all years
+python convert_geotiff_to_ilamb.py \
+    --input_dir /path/to/NA_SDA_maps_zipped \
+    --output_dir /path/to/output
+
+# Single variable / year range
+python convert_geotiff_to_ilamb.py --variables AbvGrndWood --years 2014 2014
+
+# Skip the merge step
+python convert_geotiff_to_ilamb.py --skip-merge
+```
+
+On an HPC system, reading 100 full-resolution members per variable exceeds
+interactive CPU limits; submit the full run as a batch job.
+
+## Testing
+
+```bash
+pytest test_convert.py -v
+```
+
+13 tests cover file existence, CMOR variable naming, all four unit
+conversions, output grid shape, CF-1.8 compliance, latitude direction,
+spatial coverage, chronological multi-year merging, and ILAMB `ModelResult`
+loading. Set `ILAMB_OUTPUT_DIR` to point the tests at your output directory.
+
+## Dependencies
+
+`numpy`, `xarray`, `rasterio`, `netCDF4`, and `ILAMB` (for the loading test).
+
+## Notes
+
+- Fluxes (GPP, NEE) are not included; the downscaled product covers only the
+  four state variables above. Flux benchmarking will draw from the raw SDA
+  netCDF outputs in a later contribution.
+- A known structural discontinuity exists in the underlying LandTrendr input
+  around 2017-2018 (see the ORNL DAAC documentation); it is preserved as-is in
+  the converted output rather than adjusted here.
diff --git a/modules/benchmark/inst/ilamb/convert_geotiff_to_ilamb.py b/modules/benchmark/inst/ilamb/convert_geotiff_to_ilamb.py
new file mode 100644
index 00000000000..21913902d02
--- /dev/null
+++ b/modules/benchmark/inst/ilamb/convert_geotiff_to_ilamb.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+"""
+convert_geotiff_to_ilamb.py
+
+Convert PEcAn SDA carbon reanalysis outputs (downscaled GeoTIFF ensemble maps)
+into ILAMB-compatible CF-convention netCDF files.
+
+The PEcAn North American carbon reanalysis (Zhang et al. 2026) provides
+downscaled 1 km ensemble maps of four state variables. This tool:
+
+  1. Reads the 100 GeoTIFF ensemble members for each variable/year
+  2. Computes the ensemble mean (memory-safe, one member at a time)
+  3. Coarsens from 1 km to ILAMB's default 0.5 degree resolution
+  4. Applies unit conversions to CF/CMOR standards
+  5. Writes CF-1.8 compliant netCDF that ILAMB can ingest directly
+
+Input GeoTIFF layout:
+    <input_dir>/<year>/<variable>_<year>/ensemble_<n>_<year>_<variable>.tiff
+
+Output netCDF layout:
+    <output_dir>/<cmor_name>/<cmor_name>_<year>.nc   (annual files)
+    <output_dir>/<cmor_name>.nc                       (merged multi-year)
+
+Usage:
+    python convert_geotiff_to_ilamb.py --input_dir DIR --output_dir DIR
+    python convert_geotiff_to_ilamb.py --variables AbvGrndWood --years 2014 2014
+
+Author: Tejas Dahiya (Google Summer of Code 2026, PEcAn Project)
+"""
+
+import argparse
+import glob
+import os
+from datetime import datetime
+
+import numpy as np
+import rasterio
+import xarray as xr
+
+
+# -----------------------------------------------------------------------------
+# Configuration
+# -----------------------------------------------------------------------------
+
+# Maps each PEcAn GeoTIFF variable to its CMOR name, target units, the
+# multiplicative scale factor needed to reach those units, and a descriptive
+# long name. Unit conversions are documented in the README.
+#
+#   AbvGrndWood : Mg C ha-1 -> kg m-2     requires x 0.1
+#   TotSoilCarb : kg C m-2  -> kg m-2     no conversion
+#   SoilMoistFrac: vol. %   -> kg m-2     requires x 9.98
+#                  (percent / 100 x 1.0 m depth x 998 kg/m3 water density)
+#   LAI         : m2 m-2    -> m2 m-2     no conversion
+VARIABLE_MAP = {
+    "AbvGrndWood":   {"cmor": "cVeg",  "units": "kg m-2", "scale": 0.1,
+                      "long_name": "Carbon Mass in Vegetation"},
+    "TotSoilCarb":   {"cmor": "cSoil", "units": "kg m-2", "scale": 1.0,
+                      "long_name": "Carbon Mass in Soil Pool"},
+    "SoilMoistFrac": {"cmor": "mrsol", "units": "kg m-2", "scale": 9.98,
+                      "long_name": "Total Soil Moisture Content"},
+    "LAI":           {"cmor": "lai",   "units": "m2 m-2", "scale": 1.0,
+                      "long_name": "Leaf Area Index"},
+}
+
+# Coarsening factor: native 0.00833 deg (1 km) x 60 = 0.5 deg (ILAMB default).
+# The native grid (9360 x 19080) divides evenly: 156 x 318.
+COARSEN_FACTOR = 60
+
+# GeoTIFF NoData sentinel per the ORNL DAAC documentation. Files on the SCC
+# use NaN, but we guard against -9999 as well for robustness.
+NODATA_VALUE = -9999
+
+
+# -----------------------------------------------------------------------------
+# Core processing
+# -----------------------------------------------------------------------------
+
+def process_variable_year(base_dir, year, pecan_var, output_dir):
+    """Convert one variable for one year to an ILAMB-compatible netCDF.
+
+    Reads all ensemble GeoTIFFs, computes the ensemble mean one member at a
+    time (to stay within memory limits), applies the unit conversion, coarsens
+    to 0.5 degrees, orients latitude south-to-north, and writes a CF-1.8 file.
+
+    Parameters
+    ----------
+    base_dir : str
+        Root directory of the GeoTIFF ensemble maps.
+    year : int
+        Year to process (e.g. 2014).
+    pecan_var : str
+        PEcAn variable name (key in VARIABLE_MAP).
+    output_dir : str
+        Directory to write the netCDF output.
+    """
+    info = VARIABLE_MAP[pecan_var]
+    cmor_name = info["cmor"]
+    var_dir = os.path.join(base_dir, str(year), f"{pecan_var}_{year}")
+
+    if not os.path.isdir(var_dir):
+        print(f"  WARNING: {var_dir} not found, skipping")
+        return
+
+    files = sorted(
+        os.path.join(var_dir, f)
+        for f in os.listdir(var_dir)
+        if f.endswith((".tiff", ".tif"))
+    )
+    n_ens = len(files)
+    print(f"  {pecan_var} -> {cmor_name}: {n_ens} ensemble files")
+
+    # Grid metadata from the first member (all members share the same grid).
+    with rasterio.open(files[0]) as src:
+        shape = src.shape
+        bounds = src.bounds
+
+    # Accumulate the ensemble mean with a running sum / count so we never hold
+    # more than one full-resolution member in memory at once.
+    running_sum = np.zeros(shape, dtype=np.float64)
+    running_count = np.zeros(shape, dtype=np.float64)
+
+    for i, f in enumerate(files):
+        with rasterio.open(f) as src:
+            data = src.read(1).astype(np.float64)
+        data = np.where(data == NODATA_VALUE, np.nan, data)
+        valid = np.isfinite(data)
+        running_sum[valid] += data[valid]
+        running_count[valid] += 1
+        if (i + 1) % 25 == 0:
+            print(f"    Read {i + 1}/{n_ens} members")
+
+    ens_mean = np.where(running_count > 0, running_sum / running_count, np.nan)
+
+    # Apply unit conversion to reach CMOR units.
+    ens_mean *= info["scale"]
+
+    valid_vals = ens_mean[np.isfinite(ens_mean)]
+    print(f"    Mean range: {valid_vals.min():.3f} to "
+          f"{valid_vals.max():.3f} {info['units']}")
+
+    # Coarsen 1 km -> 0.5 deg by block-averaging COARSEN_FACTOR^2 pixels,
+    # ignoring NaN (ocean / non-land) within each block.
+    nr = shape[0] // COARSEN_FACTOR
+    nc = shape[1] // COARSEN_FACTOR
+    trimmed = ens_mean[:nr * COARSEN_FACTOR, :nc * COARSEN_FACTOR]
+    reshaped = trimmed.reshape(nr, COARSEN_FACTOR, nc, COARSEN_FACTOR)
+    with np.errstate(all="ignore"):
+        coarsened = np.nanmean(reshaped, axis=(1, 3))
+
+    # GeoTIFF rows run north-to-south; flip so latitude increases south-to-north
+    # (the convention expected by ILAMB and most CF tools). Coordinates are
+    # rounded to remove sub-millidegree floating point noise from the bounds.
+    coarsened = np.flip(coarsened, axis=0)
+    lat = np.round(
+        np.flip(np.linspace(bounds.top - 0.25, bounds.bottom + 0.25, nr)), 2)
+    lon = np.round(
+        np.linspace(bounds.left + 0.25, bounds.right - 0.25, nc), 2)
+
+    # Time: annual snapshot fixed to July 15 (per ORNL documentation), encoded
+    # as days since 1850-01-01 with full-year bounds.
+    time_val = (datetime(year, 7, 15) - datetime(1850, 1, 1)).days
+    tb_start = (datetime(year, 1, 1) - datetime(1850, 1, 1)).days
+    tb_end = (datetime(year, 12, 31) - datetime(1850, 1, 1)).days
+
+    ds = xr.Dataset(
+        {
+            cmor_name: xr.DataArray(
+                data=coarsened[np.newaxis, :, :].astype(np.float32),
+                dims=["time", "lat", "lon"],
+                attrs={"units": info["units"], "long_name": info["long_name"]},
+            ),
+            "time_bounds": xr.DataArray(
+                data=np.array([[tb_start, tb_end]], dtype=np.float64),
+                dims=["time", "nb"],
+            ),
+        },
+        coords={
+            "time": ("time", [time_val], {
+                "units": "days since 1850-01-01 00:00:00",
+                "calendar": "standard",
+                "bounds": "time_bounds",
+            }),
+            "lat": ("lat", lat, {"units": "degrees_north"}),
+            "lon": ("lon", lon, {"units": "degrees_east"}),
+        },
+        attrs={
+            "Conventions": "CF-1.8",
+            "source": ("PEcAn SDA Reanalysis (Zhang et al. 2026)"),
+        },
+    )
+
+    out_var_dir = os.path.join(output_dir, cmor_name)
+    os.makedirs(out_var_dir, exist_ok=True)
+    outpath = os.path.join(out_var_dir, f"{cmor_name}_{year}.nc")
+    ds.to_netcdf(outpath, encoding={cmor_name: {"zlib": True, "complevel": 4}})
+    print(f"    Saved: {outpath}")
+
+
+def merge_annual_files(output_dir):
+    """Merge per-year files into one multi-year netCDF per variable."""
+    for info in VARIABLE_MAP.values():
+        cmor_name = info["cmor"]
+        var_dir = os.path.join(output_dir, cmor_name)
+        if not os.path.isdir(var_dir):
+            continue
+        files = sorted(glob.glob(os.path.join(var_dir, f"{cmor_name}_????.nc")))
+        if len(files) < 2:
+            continue
+        print(f"\nMerging {len(files)} files for {cmor_name}...")
+        ds = xr.open_mfdataset(files, combine="by_coords")
+        merged_path = os.path.join(output_dir, f"{cmor_name}.nc")
+        ds.to_netcdf(merged_path,
+                     encoding={cmor_name: {"zlib": True, "complevel": 4}})
+        print(f"  Saved: {merged_path}")
+        ds.close()
+
+
+# -----------------------------------------------------------------------------
+# Command-line interface
+# -----------------------------------------------------------------------------
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert PEcAn SDA GeoTIFF outputs to ILAMB netCDF")
+    parser.add_argument(
+        "--input_dir",
+        default=("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/"
+                 "SDA_8k_site/NA_SDA_maps_zipped"),
+        help="Root directory of GeoTIFF ensemble maps")
+    parser.add_argument(
+        "--output_dir",
+        default="/projectnb/dietzelab/tdahiya/ilamb_outputs/pecan_v2",
+        help="Directory to write ILAMB-compatible netCDF")
+    parser.add_argument(
+        "--variables", nargs="+", default=None,
+        help="PEcAn variable names to convert (default: all four)")
+    parser.add_argument(
+        "--years", nargs=2, type=int, default=None, metavar=("START", "END"),
+        help="Inclusive year range to process (default: 2012 2024)")
+    parser.add_argument(
+        "--skip-merge", action="store_true",
+        help="Skip merging annual files into multi-year files")
+    args = parser.parse_args()
+
+    variables = args.variables or list(VARIABLE_MAP.keys())
+    years = (range(args.years[0], args.years[1] + 1)
+             if args.years else range(2012, 2025))
+
+    print("GeoTIFF -> ILAMB Pipeline")
+    print(f"  Input:     {args.input_dir}")
+    print(f"  Output:    {args.output_dir}")
+    print(f"  Variables: {variables}")
+    print(f"  Years:     {list(years)}")
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    for year in years:
+        print(f"\n=== Year {year} ===")
+        for var in variables:
+            process_variable_year(args.input_dir, year, var, args.output_dir)
+
+    if not args.skip_merge:
+        merge_annual_files(args.output_dir)
+
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/benchmark/inst/ilamb/test_convert.py b/modules/benchmark/inst/ilamb/test_convert.py
new file mode 100644
index 00000000000..86f10545653
--- /dev/null
+++ b/modules/benchmark/inst/ilamb/test_convert.py
@@ -0,0 +1,162 @@
+"""
+test_convert.py
+
+Unit tests for convert_geotiff_to_ilamb.py.
+
+Validates the converted ILAMB-compatible netCDF outputs: file existence,
+CMOR variable naming, unit conversions, output grid shape, CF-1.8 compliance,
+latitude orientation, spatial coverage, multi-year merging, and that ILAMB's
+ModelResult can load the results.
+
+Run:
+    module load python3 gcc/13.2.0
+    export PATH=$HOME/.local/bin:$PATH
+    pytest test_convert.py -v
+
+Author: Tejas Dahiya (Google Summer of Code 2026, PEcAn Project)
+"""
+
+import os
+
+import numpy as np
+import xarray as xr
+
+# Directory holding the converted outputs. Override with the ILAMB_OUTPUT_DIR
+# environment variable if your outputs live elsewhere.
+OUTPUT_DIR = os.environ.get(
+    "ILAMB_OUTPUT_DIR",
+    "/projectnb/dietzelab/tdahiya/ilamb_outputs/pecan_v2",
+)
+
+CMOR_VARS = ["cVeg", "cSoil", "mrsol", "lai"]
+YEARS = range(2012, 2025)
+
+
+def test_all_files_exist():
+    """Every variable has 13 annual files plus a merged multi-year file."""
+    for var in CMOR_VARS:
+        merged = os.path.join(OUTPUT_DIR, f"{var}.nc")
+        assert os.path.exists(merged), f"Missing merged file: {merged}"
+        for year in YEARS:
+            annual = os.path.join(OUTPUT_DIR, var, f"{var}_{year}.nc")
+            assert os.path.exists(annual), f"Missing annual file: {annual}"
+
+
+def test_variable_names():
+    """Each output uses the correct CMOR variable name."""
+    for cmor_name in CMOR_VARS:
+        ds = xr.open_dataset(
+            os.path.join(OUTPUT_DIR, cmor_name, f"{cmor_name}_2014.nc"))
+        assert cmor_name in ds.data_vars, f"{cmor_name} not in dataset"
+        ds.close()
+
+
+def test_output_shape():
+    """Coarsened grid is the expected 0.5 degree resolution (156 x 318)."""
+    ds = xr.open_dataset(os.path.join(OUTPUT_DIR, "cVeg", "cVeg_2014.nc"))
+    assert ds.sizes["time"] == 1
+    assert ds.sizes["lat"] == 156
+    assert ds.sizes["lon"] == 318
+    ds.close()
+
+
+def test_cveg_unit_conversion():
+    """cVeg (AGB) is converted from Mg C ha-1 to kg m-2 (x 0.1)."""
+    ds = xr.open_dataset(os.path.join(OUTPUT_DIR, "cVeg", "cVeg_2014.nc"))
+    valid = ds["cVeg"].values[np.isfinite(ds["cVeg"].values)]
+    assert valid.min() >= 0, "cVeg has negative values"
+    assert valid.max() < 60, f"cVeg max {valid.max()} too high (units?)"
+    assert valid.mean() > 0.1, "cVeg mean suspiciously low"
+    assert ds["cVeg"].attrs["units"] == "kg m-2"
+    ds.close()
+
+
+def test_csoil_values():
+    """cSoil (SOC) stays in kg m-2 with a physically reasonable range."""
+    ds = xr.open_dataset(os.path.join(OUTPUT_DIR, "cSoil", "cSoil_2014.nc"))
+    valid = ds["cSoil"].values[np.isfinite(ds["cSoil"].values)]
+    assert valid.min() > 0, "cSoil has zero/negative values"
+    assert valid.max() < 300, f"cSoil max {valid.max()} unreasonably high"
+    assert ds["cSoil"].attrs["units"] == "kg m-2"
+    ds.close()
+
+
+def test_lai_values():
+    """lai stays in m2 m-2 with a physically reasonable range."""
+    ds = xr.open_dataset(os.path.join(OUTPUT_DIR, "lai", "lai_2014.nc"))
+    valid = ds["lai"].values[np.isfinite(ds["lai"].values)]
+    assert valid.min() >= 0, "LAI has negative values"
+    assert valid.max() < 15, f"LAI max {valid.max()} unreasonably high"
+    assert ds["lai"].attrs["units"] == "m2 m-2"
+    ds.close()
+
+
+def test_mrsol_values():
+    """mrsol is converted from volumetric percent to kg m-2 (x 9.98)."""
+    ds = xr.open_dataset(os.path.join(OUTPUT_DIR, "mrsol", "mrsol_2014.nc"))
+    valid = ds["mrsol"].values[np.isfinite(ds["mrsol"].values)]
+    assert valid.min() >= 0, "mrsol has negative values"
+    assert valid.max() < 600, f"mrsol max {valid.max()} too high for kg m-2"
+    assert valid.mean() > 50, f"mrsol mean {valid.mean()} too low for kg m-2"
+    assert ds["mrsol"].attrs["units"] == "kg m-2"
+    ds.close()
+
+
+def test_cf_time_encoding():
+    """Time is CF-encoded as days since 1850-01-01 with bounds."""
+    ds = xr.open_dataset(
+        os.path.join(OUTPUT_DIR, "cVeg", "cVeg_2014.nc"), decode_times=False)
+    assert "time" in ds.coords
+    assert ds.time.attrs.get("units") == "days since 1850-01-01 00:00:00"
+    assert ds.time.attrs.get("calendar") == "standard"
+    assert "time_bounds" in ds.data_vars
+    ds.close()
+
+
+def test_cf_coordinates():
+    """Coordinate attributes and global Conventions are CF-compliant."""
+    ds = xr.open_dataset(os.path.join(OUTPUT_DIR, "cVeg", "cVeg_2014.nc"))
+    assert ds.lat.attrs.get("units") == "degrees_north"
+    assert ds.lon.attrs.get("units") == "degrees_east"
+    assert ds.attrs.get("Conventions") == "CF-1.8"
+    ds.close()
+
+
+def test_lat_increasing():
+    """Latitude is monotonically increasing (south to north)."""
+    ds = xr.open_dataset(os.path.join(OUTPUT_DIR, "cVeg", "cVeg_2014.nc"))
+    assert ds.lat.values[1] > ds.lat.values[0], "Latitude not increasing"
+    assert ds.lat.values[0] == 7.25, f"First lat {ds.lat.values[0]} != 7.25"
+    assert ds.lat.values[-1] == 84.75, f"Last lat {ds.lat.values[-1]} != 84.75"
+    ds.close()
+
+
+def test_spatial_coverage():
+    """Grid covers the North American study area (7-85N, 179-20W)."""
+    ds = xr.open_dataset(os.path.join(OUTPUT_DIR, "cVeg", "cVeg_2014.nc"))
+    assert float(ds.lat.min()) < 10, "Southern boundary too far north"
+    assert float(ds.lat.max()) > 80, "Northern boundary too far south"
+    assert float(ds.lon.min()) < -170, "Western boundary too far east"
+    assert float(ds.lon.max()) > -25, "Eastern boundary too far west"
+    ds.close()
+
+
+def test_merged_file_years():
+    """Merged file spans all 13 years in chronological order."""
+    ds = xr.open_dataset(
+        os.path.join(OUTPUT_DIR, "cVeg.nc"), decode_times=False)
+    assert ds.sizes["time"] == 13, f"Expected 13 years, got {ds.sizes['time']}"
+    times = ds.time.values
+    assert all(times[i] < times[i + 1] for i in range(len(times) - 1)), \
+        "Merged time axis is not monotonically increasing"
+    ds.close()
+
+
+def test_ilamb_loads():
+    """ILAMB's ModelResult can load all four converted variables."""
+    from ILAMB.ModelResult import ModelResult
+    m = ModelResult(OUTPUT_DIR, modelname="PEcAn")
+    assert m.name == "PEcAn"
+    expected = set(CMOR_VARS)
+    found = set(m.variables.keys()) & expected
+    assert found == expected, f"Missing variables: {expected - found}"