From 266c2d70ad266905d96241e7d2ee39166b431624 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 27 Feb 2026 10:43:28 -0500
Subject: [PATCH 01/67] Add CIMIS-ET workflow for California irrigation
 management
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Workflow:
1. Extract CIMIS reference ET (ETref) for each site and date range
2. Extract CHIRPS precipitation data for the same locations/dates
3. Map LandIQ parcels to BIS crop coefficients
4. Calculate crop evapotranspiration (ETc = ETref × Kc)
5. Apply water balance model to determine irrigation requirements
6. Write SIPNET-compatible event files for irrigation scheduling
---
 modules/data.land/DESCRIPTION                 |   2 +
 modules/data.land/NAMESPACE                   |   8 +
 modules/data.land/R/CHIRPS-precip.R           |  60 ++++
 modules/data.land/R/CIMIS-ET.R                |  91 +++++
 modules/data.land/R/get-landiq-parcels.R      |  41 +++
 modules/data.land/R/get-landiq.R              |  25 ++
 modules/data.land/R/irrigation-event-files.R  |  23 ++
 modules/data.land/R/water_balance.R           |  84 +++++
 modules/data.land/man/apply_water_balance.Rd  |  21 ++
 modules/data.land/man/calc_water_balance.Rd   |  25 ++
 modules/data.land/man/create_event_file.Rd    |  19 ++
 modules/data.land/man/download_cimis_et.Rd    |  21 ++
 .../data.land/man/extract_chirps_remote.Rd    |  32 ++
 modules/data.land/man/extract_cimis_date.Rd   |  31 ++
 modules/data.land/man/extract_cimis_dates.Rd  |  20 ++
 modules/data.land/man/get_landiq.Rd           |  22 ++
 .../data.land/man/get_landiq_parcel_ids.Rd    |  20 ++
 .../tests/testthat/test-CHIRPS-precip.R       |  64 ++++
 .../data.land/vignettes/CIMIS-event-files.qmd | 312 ++++++++++++++++++
 19 files changed, 921 insertions(+)
 create mode 100644 modules/data.land/R/CHIRPS-precip.R
 create mode 100644 modules/data.land/R/CIMIS-ET.R
 create mode 100644 modules/data.land/R/get-landiq-parcels.R
 create mode 100644 modules/data.land/R/get-landiq.R
 create mode 100644 modules/data.land/R/irrigation-event-files.R
 create mode 100644 modules/data.land/R/water_balance.R
 create mode 100644 modules/data.land/man/apply_water_balance.Rd
 create mode 100644 modules/data.land/man/calc_water_balance.Rd
 create mode 100644 modules/data.land/man/create_event_file.Rd
 create mode 100644 modules/data.land/man/download_cimis_et.Rd
 create mode 100644 modules/data.land/man/extract_chirps_remote.Rd
 create mode 100644 modules/data.land/man/extract_cimis_date.Rd
 create mode 100644 modules/data.land/man/extract_cimis_dates.Rd
 create mode 100644 modules/data.land/man/get_landiq.Rd
 create mode 100644 modules/data.land/man/get_landiq_parcel_ids.Rd
 create mode 100644 modules/data.land/tests/testthat/test-CHIRPS-precip.R
 create mode 100644 modules/data.land/vignettes/CIMIS-event-files.qmd

diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION
index 13691a43277..a6f6c261a63 100644
--- a/modules/data.land/DESCRIPTION
+++ b/modules/data.land/DESCRIPTION
@@ -28,6 +28,8 @@ Imports:
     coda,
     curl,
     dplyr,
+    duckdb,
+    duckspatial,
     foreach,
     fs,
     future,
diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index c3214ba9e67..e90edad0df3 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -9,11 +9,15 @@ export(InventoryGrowthFusionDiagnostics)
 export(Read.IC.info.BADM)
 export(Read_Tucson)
 export(Soilgrids_SoilC_prep)
+export(apply_water_balance)
 export(buildJAGSdata_InventoryRings)
+export(calc_water_balance)
 export(clip_and_save_raster_file)
 export(cohort2pool)
+export(create_event_file)
 export(dataone_download)
 export(download.SM_CDS)
+export(download_cimis_et)
 export(download_package_rm)
 export(ens_veg_module)
 export(eto_to_etc)
@@ -22,6 +26,8 @@ export(extract.stringCode)
 export(extract_FIA)
 export(extract_NEON_veg)
 export(extract_SM_CDS)
+export(extract_cimis_date)
+export(extract_cimis_dates)
 export(extract_soil_gssurgo)
 export(extract_soil_nc)
 export(extract_veg)
@@ -33,6 +39,8 @@ export(gSSURGO.Query)
 export(generate_soilgrids_ensemble)
 export(get.attributes)
 export(get.soil)
+export(get_landiq)
+export(get_landiq_parcel_ids)
 export(get_resource_map)
 export(get_veg_module)
 export(ic_process)
diff --git a/modules/data.land/R/CHIRPS-precip.R b/modules/data.land/R/CHIRPS-precip.R
new file mode 100644
index 00000000000..c451c195f52
--- /dev/null
+++ b/modules/data.land/R/CHIRPS-precip.R
@@ -0,0 +1,60 @@
+#' Extract CHIRPS Precipitation Data from Remote NetCDF
+#'
+#' Downloads and extracts daily precipitation data from the CHIRPS (Climate
+#' Hazards group InfraRed Precipitation with Station data) dataset via remote
+#' NetCDF file access using vsicurl.
+#'
+#' @param design_points A data frame or tibble containing columns `lon` and `lat`
+#'   specifying the geographic coordinates of points to extract precipitation for.
+#' @param dates A vector of dates or date-time objects specifying the days for which
+#'   to extract precipitation data.
+#' @returns A modified version of `design_points` with new rows added for each date,
+#'   plus two new columns:
+#'   \item{date}{The date of the extracted data (same as the input `date`).}
+#'   \item{precip_mm_day}{Precipitation in millimeters for the specified day.}
+#' @examples
+#' \dontrun{
+#' pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
+#' result <- extract_chirps_remote(pts, as.Date(c("2020-06-15", "2021-06-15")))
+#' }
+extract_chirps_remote <- function(design_points, dates) {
+  CHIRPS_REMOTE_ROOT <- "https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_daily/netcdf/p05"
+
+  dates <- lubridate::as_date(dates)
+  years <- lubridate::year(dates)
+
+  result_list <- list()
+
+  for (yr in unique(years)) {
+    dates_yr <- dates[years == yr]
+    day_of_year_yr <- lubridate::yday(dates_yr)
+
+    url <- glue::glue("{CHIRPS_REMOTE_ROOT}/chirps-v2.0.{yr}.days_p05.nc")
+    vsicurl_path <- paste0("/vsicurl/", url)
+
+    # Suppress only the "no extent" warning. We set the extent on the next
+    # line. Other warnings should still fire.
+    withCallingHandlers({
+      r <- terra::rast(vsicurl_path)
+    },
+    warning = function(w) {
+      if (grepl("unknown extent", w$message)) {
+        invokeRestart("muffleWarning")
+      }
+    })
+    terra::ext(r) <- c(-180, 180, -50, 50)
+    terra::crs(r) <- "EPSG:4326"
+
+    r_days <- r[[day_of_year_yr]]
+
+    pts <- as.matrix(design_points[, c("lon", "lat")])
+    vals <- terra::extract(r_days, pts)
+    vals_vec <- as.vector(t(t(vals)))
+
+    result_list[[as.character(yr)]] <- design_points |>
+      tidyr::expand_grid(date = dates_yr) |>
+      dplyr::mutate(precip_mm_day = vals_vec)
+  }
+
+  dplyr::bind_rows(result_list)
+}
diff --git a/modules/data.land/R/CIMIS-ET.R b/modules/data.land/R/CIMIS-ET.R
new file mode 100644
index 00000000000..f1503e7bd1d
--- /dev/null
+++ b/modules/data.land/R/CIMIS-ET.R
@@ -0,0 +1,91 @@
+#' Download CIMIS ETo data
+#'
+#' Read raw ETo.asc.gz directly from CIMIS spatial portal, add CRS, and save
+#' locally as Cloud-optimized GeoTIFF. Outputs will be saved to
+#' `<local_root_dir>/CIMIS-ETo-YYYY-MM-DD.tif`.
+#'
+#' @param date Date to download
+#' @param local_root_dir Root directory for storing outputs.
+#'
+#' @return Path to saved TIF file (invisibly)
+#' @export
+download_cimis_et <- function(date, local_root_dir) {
+  date_str <- format(date, "%Y/%m/%d")
+  date_filename <- format(date, "%Y-%m-%d")
+
+  base_url <- "https://spatialcimis.water.ca.gov/cimis"
+  remote_path <- file.path(base_url, date_str, "ETo.asc.gz")
+  vsicurl_path <- paste0("/vsigzip//vsicurl/", remote_path)
+
+  tif_path <- file.path(
+    local_root_dir,
+    paste0("CIMIS-ETo-", date_filename, ".tif")
+  )
+
+  r <- terra::rast(vsicurl_path)
+  terra::crs(r) <- "EPSG:3310"
+
+  terra::writeRaster(r, tif_path, filetype = "COG", overwrite = TRUE)
+
+  invisible(tif_path)
+}
+
+#' Extract CIMIS daily reference ETo values
+#'
+#' @param design_points `data.frame` of design points with columns
+#' `location_id`, `lat`, and `lon`
+#' @param download_missing If `TRUE` and the local COG is missing, download it.
+#' If `FALSE` and the file is missing, throw an error.
+#'
+#' @inheritParams download_cimis_et
+#'
+#' @return `design_points` `data.frame` with additional columns `date`, and
+#' `etref_mm_day` (reference ET, mm/day)
+#' @export
+extract_cimis_date <- function(
+  design_points,
+  date,
+  local_root_dir,
+  download_missing = FALSE
+) {
+  date_filename <- format(date, "%Y-%m-%d")
+  tif_path <- file.path(
+    local_root_dir,
+    paste0("CIMIS-ET-", date_filename, ".tif")
+  )
+
+  if (!file.exists(tif_path)) {
+    if (!download_missing) {
+      stop("Missing file ", tif_path)
+    }
+    download_cimis_et(date, local_root_dir)
+  }
+
+  r <- terra::rast(tif_path)
+
+  pts_sf <- sf::st_as_sf(design_points, coords = c("lon", "lat"), crs = 4326)
+  pts_albers <- sf::st_transform(pts_sf, crs = 3310)
+  coords <- sf::st_coordinates(pts_albers)
+
+  vals <- terra::extract(r, coords)
+
+  design_points |>
+    dplyr::mutate(date = date, etref_mm_day = vals[, 1])
+}
+
+#' Extract CIMIS reference ET for multiple dates
+#'
+#' @param dates Sequence of dates for which to extract data
+#' @inheritParams extract_cimis_date
+#'
+#' @return `design_points` `data.frame` extended with ETref data for all dates.
+#' @export
+extract_cimis_dates <- function(design_points, dates, ...) {
+  df_list <- purrr::map(
+    dates,
+    purrr::possibly(extract_cimis_date, NULL, quiet = FALSE),
+    design_points = design_points,
+    ...
+  )
+  dplyr::bind_rows(df_list)
+}
diff --git a/modules/data.land/R/get-landiq-parcels.R b/modules/data.land/R/get-landiq-parcels.R
new file mode 100644
index 00000000000..74aef0c2802
--- /dev/null
+++ b/modules/data.land/R/get-landiq-parcels.R
@@ -0,0 +1,41 @@
+#' Get Parcel IDs from LandIQ
+#'
+#' @param design_points `data.frame` of coordinates to extract. Must contain
+#' columns `id`, `lat`, and `lon`.
+#' @param parcels_file Path to harmonized LandIQ parcels (GPKG) file
+#'
+#' @return `design_points` `data.frame` with harmonized LandIQ parcel_IDs
+#' @export
+get_landiq_parcel_ids <- function(design_points, parcels_file) {
+  parcel_crs <- sf::st_layers(parcels_file)[["crs"]][[1]]
+  pts_sf <- design_points |>
+    dplyr::select("id", "lat", "lon") |>
+    sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |>
+    sf::st_transform(parcel_crs)
+  conn <- duckspatial::ddbs_create_conn()
+  duckspatial::ddbs_write_vector(
+    conn = conn,
+    data = pts_sf,
+    name = "design_points"
+  )
+  duckdb::dbSendQuery(
+    conn,
+    glue::glue(
+      "
+      CREATE TABLE merged AS
+      SELECT dp.*, p.parcel_id,
+      FROM design_points dp
+      LEFT JOIN ST_Read('{parcels_file}', layer='parcels') p
+      ON ST_Within(dp.geometry, p.geom)
+      "
+    )
+  )
+  dp_with_parcels <- duckspatial::ddbs_read_vector(
+    conn = conn,
+    name = "merged"
+  ) |>
+    sf::st_drop_geometry(dp_parcels) |>
+    dplyr::right_join(design_points, by = "id")
+
+  dp_with_parcels
+}
diff --git a/modules/data.land/R/get-landiq.R b/modules/data.land/R/get-landiq.R
new file mode 100644
index 00000000000..638ef5a0519
--- /dev/null
+++ b/modules/data.land/R/get-landiq.R
@@ -0,0 +1,25 @@
+#' Get LandIQ parcels and crop data
+#'
+#' @param design_points `data.frame` of coordinates to extract. Must contain
+#' columns `id`, `lat`, and `lon`.
+#' @param parcels_file Path to harmonized LandIQ parcels (GPKG) file
+#' @param crops_file Path to LandIQ crops parquet file
+#'
+#' @return `design_points` `data.frame` with LandIQ parcel IDs, year, season, CLASS, and SUBCLASS
+#' @export
+get_landiq <- function(design_points, parcels_file, crops_file) {
+  dp_with_parcels <- get_landiq_parcel_ids(design_points, parcels_file)
+
+  crops <- arrow::read_parquet(crops_file) |>
+    dplyr::semi_join(dp_with_parcels, by = "parcel_id") |>
+    dplyr::mutate(
+      dplyr::across(c("CLASS", "SUBCLASS"), ~ dplyr::na_if(.x, "**")),
+      SUBCLASS = as.integer(.data$SUBCLASS)
+    ) |>
+    dplyr::select("parcel_id", "year", "season", "CLASS", "SUBCLASS")
+
+  dp_with_crops <- dp_with_parcels |>
+    dplyr::left_join(crops, by = "parcel_id")
+
+  dp_with_crops
+}
diff --git a/modules/data.land/R/irrigation-event-files.R b/modules/data.land/R/irrigation-event-files.R
new file mode 100644
index 00000000000..e7eb3475411
--- /dev/null
+++ b/modules/data.land/R/irrigation-event-files.R
@@ -0,0 +1,23 @@
+#' Create SIPNET event files from water balance data
+#'
+#' Aggregates irrigation to weekly values and formats for SIPNET.
+#' Irrigation is summed by week and reported on the first day of each week.
+#' Units are converted from mm to cm.
+#'
+#' @param df Data frame with columns: date, year, week, day_of_year, irr
+#' @return Data frame with columns: loc, year, doy, event_type, irr_cm, type
+#' @export
+create_event_file <- function(df) {
+  df |>
+    dplyr::summarize(
+      loc = 0,
+      year = dplyr::first(.data$year),
+      doy = dplyr::first(.data$day_of_year),
+      event_type = "irrig",
+      irr_mm_week = sum(.data$irr, na.rm = TRUE),
+      type = 1,
+      .by = c(.data$year, .data$week)
+    ) |>
+    dplyr::mutate(irr_cm = .data$irr_mm_week / 10) |>
+    dplyr::select("loc", "year", "doy", "event_type", "irr_cm", "type")
+}
diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
new file mode 100644
index 00000000000..6b4d592557a
--- /dev/null
+++ b/modules/data.land/R/water_balance.R
@@ -0,0 +1,84 @@
+#' Calculate water balance for a time series at a single site
+#'
+#' This is the core water balance calculation that operates on primitive
+#' numeric vectors for easy testing and debugging. Each input is a time series
+#' of daily values for a single location (one date per row).
+#'
+#' @param et Vector of evapotranspiration values (mm/day)
+#' @param precip Vector of precipitation values (mm/day)
+#' @param whc Water holding capacity (mm), default 500
+#' @param w_min_frac Fraction of WHC for minimum water level, default 0.15
+#' @return List with vectors: W_t (water balance), irr (irrigation), runoff
+#' @export
+calc_water_balance <- function(et, precip, whc = 500, w_min_frac = 0.15) {
+  n <- length(et)
+  if (length(precip) != n) {
+    stop("et and precip must have the same length")
+  }
+
+  w_min <- w_min_frac * whc
+  field_capacity <- whc / 2
+
+  W_t <- numeric(n) #nolint: object_name_linter
+  W0_t <- numeric(n) #nolint: object_name_linter
+  irr <- numeric(n)
+  runoff <- numeric(n)
+
+  W_t[1] <- field_capacity #nolint: object_name_linter
+
+  for (t in seq_len(n)) {
+    if (t == 1) {
+      W_prev <- field_capacity #nolint: object_name_linter
+    } else {
+      W_prev <- W_t[t - 1] #nolint: object_name_linter
+    }
+
+    W0_t[t] <- W_prev + precip[t] - et[t] #nolint: object_name_linter
+
+    irr[t] <- max(w_min - W0_t[t], 0)
+
+    runoff[t] <- max(W0_t[t] - whc, 0)
+
+    W_t[t] <- W_prev + precip[t] + irr[t] - et[t] - runoff[t] #nolint: object_name_linter
+  }
+
+  list(
+    W_t = W_t,
+    irr = irr,
+    runoff = runoff
+  )
+}
+
+#' Apply water balance calculations to a data frame with multiple sites
+#'
+#' Groups by location and applies calc_water_balance to each group.
+#'
+#' @param df Data frame with columns: date, location_id, et_mm_day, precip_mm_day
+#' @param idcol Column name for grouping (typically, `location_id`, `parcel_id` or similar)
+#' @param whc Water holding capacity (mm)
+#' @return Data frame with added columns: W_t, irr, runoff
+#' @export
+apply_water_balance <- function(df, idcol, whc = 500) {
+  need_cols <- c("etc_mm_day", "precip_mm_day", "date")
+  missing_cols <- need_cols[!(need_cols %in% colnames(df))]
+  if (length(missing_cols) > 0) {
+    PEcAn.logger::logger.severe(
+      "Missing the following required columns: ",
+      paste(missing_cols, collapse = ", ")
+    )
+  }
+  df |>
+    dplyr::arrange(.data[[idcol]], .data$date) |>
+    dplyr::mutate(
+      year = as.integer(format(.data$date, "%Y")),
+      week = as.integer(format(.data$date, "%U")),
+      day_of_year = as.integer(format(.data$date, "%j")),
+      results = tibble::as_tibble(calc_water_balance(
+        .data$etc_mm_day,
+        .data$precip_mm_day,
+        whc = whc
+      )),
+      .by = dplyr::all_of(idcol)
+    ) |>
+    tidyr::unpack(results)
+}
diff --git a/modules/data.land/man/apply_water_balance.Rd b/modules/data.land/man/apply_water_balance.Rd
new file mode 100644
index 00000000000..bb90f1ec785
--- /dev/null
+++ b/modules/data.land/man/apply_water_balance.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/water_balance.R
+\name{apply_water_balance}
+\alias{apply_water_balance}
+\title{Apply water balance calculations to a data frame with multiple sites}
+\usage{
+apply_water_balance(df, idcol, whc = 500)
+}
+\arguments{
+\item{df}{Data frame with columns: date, location_id, et_mm_day, precip_mm_day}
+
+\item{idcol}{Column name for grouping (typically, `location_id`, `parcel_id` or similar)}
+
+\item{whc}{Water holding capacity (mm)}
+}
+\value{
+Data frame with added columns: W_t, irr, runoff
+}
+\description{
+Groups by location and applies calc_water_balance to each group.
+}
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
new file mode 100644
index 00000000000..d18b98c830b
--- /dev/null
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/water_balance.R
+\name{calc_water_balance}
+\alias{calc_water_balance}
+\title{Calculate water balance for a time series at a single site}
+\usage{
+calc_water_balance(et, precip, whc = 500, w_min_frac = 0.15)
+}
+\arguments{
+\item{et}{Vector of evapotranspiration values (mm/day)}
+
+\item{precip}{Vector of precipitation values (mm/day)}
+
+\item{whc}{Water holding capacity (mm), default 500}
+
+\item{w_min_frac}{Fraction of WHC for minimum water level, default 0.15}
+}
+\value{
+List with vectors: W_t (water balance), irr (irrigation), runoff
+}
+\description{
+This is the core water balance calculation that operates on primitive
+numeric vectors for easy testing and debugging. Each input is a time series
+of daily values for a single location (one date per row).
+}
diff --git a/modules/data.land/man/create_event_file.Rd b/modules/data.land/man/create_event_file.Rd
new file mode 100644
index 00000000000..1031fb712e0
--- /dev/null
+++ b/modules/data.land/man/create_event_file.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/irrigation-event-files.R
+\name{create_event_file}
+\alias{create_event_file}
+\title{Create SIPNET event files from water balance data}
+\usage{
+create_event_file(df)
+}
+\arguments{
+\item{df}{Data frame with columns: date, year, week, day_of_year, irr}
+}
+\value{
+Data frame with columns: loc, year, doy, event_type, irr_cm, type
+}
+\description{
+Aggregates irrigation to weekly values and formats for SIPNET.
+Irrigation is summed by week and reported on the first day of each week.
+Units are converted from mm to cm.
+}
diff --git a/modules/data.land/man/download_cimis_et.Rd b/modules/data.land/man/download_cimis_et.Rd
new file mode 100644
index 00000000000..dca753c5bef
--- /dev/null
+++ b/modules/data.land/man/download_cimis_et.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CIMIS-ET.R
+\name{download_cimis_et}
+\alias{download_cimis_et}
+\title{Download CIMIS ETo data}
+\usage{
+download_cimis_et(date, local_root_dir)
+}
+\arguments{
+\item{date}{Date to download}
+
+\item{local_root_dir}{Root directory for storing outputs.}
+}
+\value{
+Path to saved TIF file (invisibly)
+}
+\description{
+Read raw ETo.asc.gz directly from CIMIS spatial portal, add CRS, and save
+locally as Cloud-optimized GeoTIFF. Outputs will be saved to
+`<local_root_dir>/CIMIS-ETo-YYYY-MM-DD.tif`.
+}
diff --git a/modules/data.land/man/extract_chirps_remote.Rd b/modules/data.land/man/extract_chirps_remote.Rd
new file mode 100644
index 00000000000..ff9a4d69dfd
--- /dev/null
+++ b/modules/data.land/man/extract_chirps_remote.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CHIRPS-precip.R
+\name{extract_chirps_remote}
+\alias{extract_chirps_remote}
+\title{Extract CHIRPS Precipitation Data from Remote NetCDF}
+\usage{
+extract_chirps_remote(design_points, dates)
+}
+\arguments{
+\item{design_points}{A data frame or tibble containing columns `lon` and `lat`
+specifying the geographic coordinates of points to extract precipitation for.}
+
+\item{dates}{A vector of dates or date-time objects specifying the days for which
+to extract precipitation data.}
+}
+\value{
+A modified version of `design_points` with new rows added for each date,
+  plus two new columns:
+  \item{date}{The date of the extracted data (same as the input `date`).}
+  \item{precip_mm_day}{Precipitation in millimeters for the specified day.}
+}
+\description{
+Downloads and extracts daily precipitation data from the CHIRPS (Climate
+Hazards group InfraRed Precipitation with Station data) dataset via remote
+NetCDF file access using vsicurl.
+}
+\examples{
+\dontrun{
+pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
+result <- extract_chirps_remote(pts, as.Date(c("2020-06-15", "2021-06-15")))
+}
+}
diff --git a/modules/data.land/man/extract_cimis_date.Rd b/modules/data.land/man/extract_cimis_date.Rd
new file mode 100644
index 00000000000..445161d55ca
--- /dev/null
+++ b/modules/data.land/man/extract_cimis_date.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CIMIS-ET.R
+\name{extract_cimis_date}
+\alias{extract_cimis_date}
+\title{Extract CIMIS daily reference ETo values}
+\usage{
+extract_cimis_date(
+  design_points,
+  date,
+  local_root_dir,
+  download_missing = FALSE
+)
+}
+\arguments{
+\item{design_points}{`data.frame` of design points with columns
+`location_id`, `lat`, and `lon`}
+
+\item{date}{Date to download}
+
+\item{local_root_dir}{Root directory for storing outputs.}
+
+\item{download_missing}{If `TRUE` and the local COG is missing, download it.
+If `FALSE` and the file is missing, throw an error.}
+}
+\value{
+`design_points` `data.frame` with additional columns `date`, and
+`etref_mm_day` (reference ET, mm/day)
+}
+\description{
+Extract CIMIS daily reference ETo values
+}
diff --git a/modules/data.land/man/extract_cimis_dates.Rd b/modules/data.land/man/extract_cimis_dates.Rd
new file mode 100644
index 00000000000..24f3a4ef66d
--- /dev/null
+++ b/modules/data.land/man/extract_cimis_dates.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/CIMIS-ET.R
+\name{extract_cimis_dates}
+\alias{extract_cimis_dates}
+\title{Extract CIMIS reference ET for multiple dates}
+\usage{
+extract_cimis_dates(design_points, dates, ...)
+}
+\arguments{
+\item{design_points}{`data.frame` of design points with columns
+`location_id`, `lat`, and `lon`}
+
+\item{dates}{Sequence of dates for which to extract data}
+}
+\value{
+`design_points` `data.frame` extended with ETref data for all dates.
+}
+\description{
+Extract CIMIS reference ET for multiple dates
+}
diff --git a/modules/data.land/man/get_landiq.Rd b/modules/data.land/man/get_landiq.Rd
new file mode 100644
index 00000000000..e15e3395071
--- /dev/null
+++ b/modules/data.land/man/get_landiq.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get-landiq.R
+\name{get_landiq}
+\alias{get_landiq}
+\title{Get LandIQ parcels and crop data}
+\usage{
+get_landiq(design_points, parcels_file, crops_file)
+}
+\arguments{
+\item{design_points}{`data.frame` of coordinates to extract. Must contain
+columns `id`, `lat`, and `lon`.}
+
+\item{parcels_file}{Path to harmonized LandIQ parcels (GPKG) file}
+
+\item{crops_file}{Path to LandIQ crops parquet file}
+}
+\value{
+`design_points` `data.frame` with LandIQ parcel IDs, year, season, CLASS, and SUBCLASS
+}
+\description{
+Get LandIQ parcels and crop data
+}
diff --git a/modules/data.land/man/get_landiq_parcel_ids.Rd b/modules/data.land/man/get_landiq_parcel_ids.Rd
new file mode 100644
index 00000000000..f3cefe20a33
--- /dev/null
+++ b/modules/data.land/man/get_landiq_parcel_ids.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get-landiq-parcels.R
+\name{get_landiq_parcel_ids}
+\alias{get_landiq_parcel_ids}
+\title{Get Parcel IDs from LandIQ}
+\usage{
+get_landiq_parcel_ids(design_points, parcels_file)
+}
+\arguments{
+\item{design_points}{`data.frame` of coordinates to extract. Must contain
+columns `id`, `lat`, and `lon`.}
+
+\item{parcels_file}{Path to harmonized LandIQ parcels (GPKG) file}
+}
+\value{
+`design_points` `data.frame` with harmonized LandIQ parcel_IDs
+}
+\description{
+Get Parcel IDs from LandIQ
+}
diff --git a/modules/data.land/tests/testthat/test-CHIRPS-precip.R b/modules/data.land/tests/testthat/test-CHIRPS-precip.R
new file mode 100644
index 00000000000..64715fc9f11
--- /dev/null
+++ b/modules/data.land/tests/testthat/test-CHIRPS-precip.R
@@ -0,0 +1,64 @@
+test_that("extract_chirps_remote returns data for single date", {
+  skip_if_offline()
+
+  pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
+
+  result <- extract_chirps_remote(pts, as.Date("2020-06-15"))
+
+  expect_s3_class(result, "data.frame")
+  expect_equal(nrow(result), 2)
+  expect_true("date" %in% names(result))
+  expect_true("precip_mm_day" %in% names(result))
+  expect_true(all(result$date == as.Date("2020-06-15")))
+  expect_type(result$precip_mm_day, "double")
+  expect_true(all(result$precip_mm_day >= 0, na.rm = TRUE))
+})
+
+test_that("extract_chirps_remote handles multiple dates in same year", {
+  skip_if_offline()
+
+  pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
+  dates <- seq(as.Date("2020-06-01"), as.Date("2020-06-30"), by = "1 day")
+
+  result <- extract_chirps_remote(pts, dates)
+
+  expect_equal(nrow(result), 2 * 30)
+  expect_equal(unique(lubridate::year(result$date)), 2020)
+  expect_equal(sort(unique(result$date)), sort(dates))
+  expect_type(result$precip_mm_day, "double")
+  expect_true(all(result$precip_mm_day >= 0, na.rm = TRUE))
+})
+
+test_that("extract_chirps_remote handles dates spanning multiple years", {
+  skip_if_offline()
+
+  pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
+  dates <- c(
+    as.Date("2020-12-15"),
+    as.Date("2020-12-31"),
+    as.Date("2021-01-01"),
+    as.Date("2021-01-15")
+  )
+
+  result <- extract_chirps_remote(pts, dates)
+
+  expect_equal(nrow(result), 2 * 4)
+  expect_equal(sort(unique(result$date)), sort(dates))
+  expect_true(all(result$precip_mm_day >= 0, na.rm = TRUE))
+})
+
+test_that("extract_chirps_remote output has correct structure", {
+  skip_if_offline()
+
+  pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
+  dates <- seq(as.Date("2020-06-01"), as.Date("2020-06-10"), by = "1 day")
+
+  result <- extract_chirps_remote(pts, dates)
+
+  expect_equal(nrow(result), nrow(pts) * length(dates))
+  expect_true("date" %in% names(result))
+  expect_true("precip_mm_day" %in% names(result))
+  expect_true(all(result$date %in% dates))
+  expect_true(is.numeric(result$precip_mm_day))
+  expect_true(all(is.na(result$precip_mm_day) | result$precip_mm_day >= 0))
+})
diff --git a/modules/data.land/vignettes/CIMIS-event-files.qmd b/modules/data.land/vignettes/CIMIS-event-files.qmd
new file mode 100644
index 00000000000..0afc6136bc6
--- /dev/null
+++ b/modules/data.land/vignettes/CIMIS-event-files.qmd
@@ -0,0 +1,312 @@
+---
+title: "Generating SIPNET event files from CIMIS and CHIRPS data"
+author: "Alexey N. Shiklomanov"
+---
+
+```{r}
+library(PEcAn.data.land)
+# devtools::load_all("modules/data.land")
+```
+
+Start from a range of dates (2020 to present) and locations (`design_points.csv`).
+
+```{r}
+dates <- seq.Date(as.Date("2020-03-01"), as.Date("2020-11-30"), "day")
+design_points <- readr::read_csv("~/projects/cimis-to-irrigation/design_points.csv") |>
+  head(10)
+```
+
+# CIMIS ETref
+
+For each site, extract its reference ETref from the CIMIS data.
+
+```{r}
+etref <- design_points |>
+  extract_cimis_dates(
+    dates,
+    "~/data/CIMIS-ETo-COG",
+    .progress = TRUE
+  )
+```
+
+# CHIRPS Precipitation
+
+Also, extract precipitation from CHIRPS v2.
+
+```{r}
+precip <- extract_chirps_remote(design_points, dates)
+```
+
+# BIS Kc coefficients
+
+For each site, get LandIQ parcel and crop data.
+
+```{r}
+dp_with_crops <- get_landiq(
+  design_points,
+  parcels_file = "~/data/LandIQ-harmonized-v3/parcels.gpkg",
+  crops_file = "~/data/LandIQ-harmonized-v3/crops_all_years.parq"
+) |>
+  tibble::as_tibble()
+```
+
+Map `CLASS/SUBCLASS` to `crop_name` using `bism_kc_by_crop`.
+
+**NOTE:** Some LandIQ classes/subclasses map onto *multiple BISM crop types*.
+
+```{r}
+bism_kc_by_crop |>
+  dplyr::summarize(n_unique = dplyr::n(), .by = c("landiq_class", "landiq_subclass")) |>
+  dplyr::filter(n_unique > 1) |>
+  dplyr::left_join(bism_kc_by_crop) |>
+  dplyr::summarize(
+    crops = paste(crop_name, collapse = ", "),
+    .by = c("landiq_class", "landiq_subclass", "n_unique")
+  )
+```
+
+So, below, we introduce a **HACK** to select just the first crop in any of these groups.
+The more correct fix is to do some kind of averaging later.
+
+```{r}
+bism_crop_unique <- bism_kc_by_crop |>
+  dplyr::distinct(landiq_class, landiq_subclass, crop_name) |>
+  # WARNING: Hack here!
+  dplyr::slice(1, .by = c("landiq_class", "landiq_subclass"))
+design_point_crops <- dp_with_crops |>
+  dplyr::left_join(
+    bism_crop_unique,
+    by = c("CLASS" = "landiq_class", "SUBCLASS" = "landiq_subclass")
+  )
+```
+
+For demonstration purposes, we will expand this naively using `tidyr::fill` and hard-code dates for the 4 seasons to January 1, April 1, July 1, October 1.
+In reality, you would resolve these more finely using phenology data (e.g., from remote sensing).
+
+```{r}
+fill_season <- function(year, season) {
+  if (season == 1) {
+    start <- lubridate::make_date(year, 1, 1)
+    end <- lubridate::make_date(year, 3, 31)
+  } else if (season == 2) {
+    start <- lubridate::make_date(year, 4, 1)
+    end <- lubridate::make_date(year, 6, 30)
+  } else if (season == 3) {
+    start <- lubridate::make_date(year, 7, 1)
+    end <- lubridate::make_date(year, 9, 30)
+  } else if (season == 4) {
+    start <- lubridate::make_date(year, 10, 1)
+    end <- lubridate::make_date(year, 12, 31)
+  }
+  seq.Date(start, end, "day")
+}
+
+dp_crops_filled <- design_point_crops |>
+  dplyr::filter(!is.na(season)) |>
+  tidyr::fill(
+    "CLASS",
+    "SUBCLASS",
+    "crop_name",
+    .direction = "downup",
+    .by = "parcel_id"
+  ) |>
+  dplyr::mutate(date = purrr::map2(year, season, fill_season)) |>
+  tidyr::unnest(date) |>
+  dplyr::filter(date %in% !!dates)
+```
+
+Identify and warn about parcels with no matching BIS crop.
+
+```{r}
+missing_crops <- dp_crops_filled |> dplyr::filter(is.na(crop_name))
+if (nrow(missing_crops) > 0) {
+  missing_crop_strs <- missing_crops |>
+    dplyr::distinct(CLASS, SUBCLASS) |>
+    dplyr::mutate(string = glue::glue("CLASS: {CLASS} SUBCLASS: {SUBCLASS}")) |>
+    dplyr::pull(string)
+  missing_crop_str <- sprintf("[%s]", paste(missing_crop_strs, collapse = "; "))
+  warning(
+    "Skipping ",
+    nrow(missing_crops),
+    " rows with no matching BIS crop. Relevant pairs are: ",
+    missing_crop_str
+  )
+}
+
+dp_with_cropname <- dp_crops_filled |> dplyr::filter(!is.na(crop_name))
+```
+
+# Join with ETo data
+
+Join with ETref data.
+
+```{r}
+dp_with_eto <- dp_with_cropname |>
+  dplyr::left_join(etref, by = c("id", "date"))
+```
+
+Calculate ETc directly using eto_to_etc_bism. Group by crop_name and apply since eto_to_etc_bism takes a single crop at a time.
+
+```{r}
+dp_with_etc <- dp_with_eto |>
+  dplyr::mutate(
+    etc_mm_day = eto_to_etc_bism(
+      eto = etref_mm_day,
+      crop_name = crop_name[[1]],
+      date = date
+    ),
+    .by = "crop_name"
+  ) |>
+  dplyr::select(
+    dplyr::any_of(c("id", "parcel_id", "lat", "lon")),
+    "date",
+    "etc_mm_day"
+  )
+```
+
+Handle multi-crop parcels (double-cropping) - placeholder logic that warns and averages ETc values.
+
+```{r}
+resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
+  id_sym <- rlang::sym(id_col)
+  date_sym <- rlang::sym(date_col)
+
+  multicrop_counts <- etc_data |>
+    dplyr::add_count(!!id_sym, !!date_sym, name = "n") |>
+    dplyr::filter(.data$n > 1) |>
+    dplyr::summarize(
+      n_multicrop = dplyr::n_distinct(!!id_sym, !!date_sym),
+      .groups = "drop"
+    )
+
+  if (multicrop_counts$n_multicrop > 0) {
+    message(
+      "Multi-crop parcels: ",
+      multicrop_counts$n_multicrop,
+      " date-parcel combinations have multiple crops. Averaging ETc values."
+    )
+  }
+
+  etc_data |>
+    dplyr::group_by(!!id_sym, !!date_sym) |>
+    dplyr::summarize(
+      etc_mm_day = mean(.data$etc_mm_day, na.rm = TRUE),
+      .groups = "drop"
+    )
+}
+
+dp_with_etc <- resolve_multicrop(dp_with_etc)
+```
+
+Join with precipitation data (inner_join to ensure matching dates). Rename etc_mm_day to et_mm_day for apply_water_balance compatibility.
+
+```{r}
+dp_crops_all <- dp_with_etc |>
+  dplyr::inner_join(precip, by = c("id", "date")) |>
+  dplyr::select(c("id", "lat", "lon", "date", "etc_mm_day", "precip_mm_day"))
+```
+
+# Calculate water balance
+
+```{r}
+dpwb <- apply_water_balance(dp_crops_all, "id")
+```
+
+Check crop evapotranspiration values are reasonable.
+
+```{r}
+etc_summary <- dp_crops_all |>
+  dplyr::summarize(
+    etc_min = min(.data$etc_mm_day, na.rm = TRUE),
+    etc_max = max(.data$etc_mm_day, na.rm = TRUE),
+    etc_mean = mean(.data$etc_mm_day, na.rm = TRUE),
+    .by = "id"
+  )
+print(etc_summary)
+```
+
+Check that water balance calculations are reasonable.
+
+```{r}
+wb_summary <- dpwb |>
+  dplyr::group_by(.data$id) |>
+  dplyr::summarize(
+    irr_total = sum(.data$irr, na.rm = TRUE),
+    irr_max = max(.data$irr, na.rm = TRUE),
+    irr_mean = mean(.data$irr, na.rm = TRUE),
+    runoff_total = sum(.data$runoff, na.rm = TRUE),
+    W_t_min = min(.data$W_t, na.rm = TRUE),
+    W_t_max = max(.data$W_t, na.rm = TRUE),
+    .groups = "drop"
+  )
+print(wb_summary)
+```
+
+Check for other issues.
+
+```{r}
+if (any(wb_summary$irr_max < 0)) {
+  warning("Negative irrigation values detected!")
+} else {
+  message("Irrigation values are non-negative")
+}
+
+if (any(wb_summary$W_t_min < 0)) {
+  warning("Negative soil water values detected!")
+} else {
+  message("Soil water values are non-negative")
+}
+```
+
+Seasonal variation check - irrigation should be higher in summer.
+
+```{r}
+monthly_irr <- dpwb |>
+  dplyr::mutate(month = lubridate::month(.data$date)) |>
+  dplyr::group_by(.data$month) |>
+  dplyr::summarize(irr_mean = mean(.data$irr, na.rm = TRUE), .groups = "drop")
+print(monthly_irr)
+```
+
+# Plot results
+
+```{r}
+library(ggplot2)
+dpwb |>
+  ggplot() +
+  aes(x = date, y = irr, color = id) +
+  geom_line() +
+  labs(title = "Irrigation Requirements by Site", y = "Irrigation (mm/day)")
+```
+
+# Write event files
+
+Example of a single event data frame.
+
+```{r}
+dpwb |>
+  dplyr::filter(id == id[[1]]) |>
+  create_event_file()
+```
+
+Write all event files.
+
+```{r}
+outdir <- tempfile(pattern = "events_")
+dir.create(outdir)
+dpwb |>
+  dplyr::group_nest(.data$id) |>
+  dplyr::mutate(fname = purrr::map2(
+    id,
+    data,
+    \(id, dat) readr::write_delim(
+      create_event_file(dat),
+      file.path(outdir, glue::glue("{id}_events.txt")),
+      delim = " ",
+      col_names = FALSE
+    )
+  ))
+
+fnames <- list.files(outdir, full.names = TRUE)
+cat(readr::read_file(fnames[[1]]))
+```

From 161451d6ac0434e0d9131a1da788d5717bf13dfa Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 27 Feb 2026 10:59:44 -0500
Subject: [PATCH 02/67] increase default water balance WHC value

https://github.com/PecanProject/pecan/issues/3763
---
 modules/data.land/R/water_balance.R         | 5 +++--
 modules/data.land/man/calc_water_balance.Rd | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 6b4d592557a..263ce34af82 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -7,10 +7,11 @@
 #' @param et Vector of evapotranspiration values (mm/day)
 #' @param precip Vector of precipitation values (mm/day)
 #' @param whc Water holding capacity (mm), default 500
-#' @param w_min_frac Fraction of WHC for minimum water level, default 0.15
+#' @param w_min_frac Fraction of WHC for minimum water level, default 0.375 is
+#' halfway between the recommended defaults for woody PFTs (0.25) and annuals (0.50)
 #' @return List with vectors: W_t (water balance), irr (irrigation), runoff
 #' @export
-calc_water_balance <- function(et, precip, whc = 500, w_min_frac = 0.15) {
+calc_water_balance <- function(et, precip, whc = 500, w_min_frac = 0.375) {
   n <- length(et)
   if (length(precip) != n) {
     stop("et and precip must have the same length")
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index d18b98c830b..7b40159a458 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -4,7 +4,7 @@
 \alias{calc_water_balance}
 \title{Calculate water balance for a time series at a single site}
 \usage{
-calc_water_balance(et, precip, whc = 500, w_min_frac = 0.15)
+calc_water_balance(et, precip, whc = 500, w_min_frac = 0.375)
 }
 \arguments{
 \item{et}{Vector of evapotranspiration values (mm/day)}
@@ -13,7 +13,8 @@ calc_water_balance(et, precip, whc = 500, w_min_frac = 0.15)
 
 \item{whc}{Water holding capacity (mm), default 500}
 
-\item{w_min_frac}{Fraction of WHC for minimum water level, default 0.15}
+\item{w_min_frac}{Fraction of WHC for minimum water level, default 0.375 is
+halfway between the recommended defaults for woody PFTs (0.25) and annuals (0.50)}
 }
 \value{
 List with vectors: W_t (water balance), irr (irrigation), runoff

From acbf01b3a5cec02eb15360e3d636a25bd7b373ef Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 27 Feb 2026 11:32:48 -0500
Subject: [PATCH 03/67] add crop WHC data

---
 modules/data.land/R/data.R              |  23 ++++++++
 modules/data.land/data-raw/crop_whc.R   |  23 ++++++++
 modules/data.land/data-raw/crop_whc.csv |  67 ++++++++++++++++++++++++
 modules/data.land/data/crop_whc.rda     | Bin 0 -> 2696 bytes
 modules/data.land/man/crop_whc.Rd       |  34 ++++++++++++
 5 files changed, 147 insertions(+)
 create mode 100644 modules/data.land/data-raw/crop_whc.R
 create mode 100644 modules/data.land/data-raw/crop_whc.csv
 create mode 100644 modules/data.land/data/crop_whc.rda
 create mode 100644 modules/data.land/man/crop_whc.Rd

diff --git a/modules/data.land/R/data.R b/modules/data.land/R/data.R
index a49b3eb76c7..2da17bd6a6e 100644
--- a/modules/data.land/R/data.R
+++ b/modules/data.land/R/data.R
@@ -287,3 +287,26 @@
 #'   \code{\link{look_up_fertilizer_components}} for fertilizer nutrient
 #'   composition (N/C fractions) from the SWAT/DayCent database.
 "ca_compost_amendment"
+
+#' Crop-specific minimum water holding capacity (WHC) thresholds
+#'
+#' Minimum soil water content (fraction of available water-holding capacity)
+#' that should be maintained for each crop to avoid yield loss or quality issues.
+#' Values are based on crop root depth, drought tolerance, and sensitivity to
+#' water stress.
+#'
+#' @format A tibble with one row per crop and the following columns:
+#' \describe{
+#'   \item{crop_number}{BIS crop number (character). Blank for crops not in BIS.}
+#'   \item{crop_name}{Crop name.}
+#'   \item{Category}{Crop category (e.g., Woody Perennial, Annual (Hardy)).}
+#'   \item{minWHC}{Minimum WHC (fraction, 0-1).}
+#'   \item{Comments}{Rationale for the minimum WHC value.}
+#' }
+#' @source Expert knowledge; California irrigation scheduling guidelines.
+#' @examples
+#' data(crop_whc)
+#' head(crop_whc)
+#'
+#' @keywords datasets
+"crop_whc"
diff --git a/modules/data.land/data-raw/crop_whc.R b/modules/data.land/data-raw/crop_whc.R
new file mode 100644
index 00000000000..4435b77fcd8
--- /dev/null
+++ b/modules/data.land/data-raw/crop_whc.R
@@ -0,0 +1,23 @@
+#!/usr/bin/env Rscript
+
+library(readr)
+
+# Convert warnings to errors
+options(warn = 2)
+
+raw_csv <- file.path("data-raw", "crop_whc.csv")
+stopifnot(file.exists(raw_csv))
+
+crop_whc <- read_csv(
+  raw_csv,
+  col_types = cols(
+    crop_number = col_character(),
+    crop_name = col_character(),
+    Category = col_character(),
+    minWHC = col_double(),
+    Comments = col_character()
+  ),
+  progress = FALSE
+)
+
+usethis::use_data(crop_whc, overwrite = TRUE)
diff --git a/modules/data.land/data-raw/crop_whc.csv b/modules/data.land/data-raw/crop_whc.csv
new file mode 100644
index 00000000000..88f3cf2097a
--- /dev/null
+++ b/modules/data.land/data-raw/crop_whc.csv
@@ -0,0 +1,67 @@
+"crop_number","crop_name","Category","minWHC","Comments"
+"3.01","Almonds","Woody Perennial","0.35","Deep roots but managed to avoid nut shrivel during heat."
+"3.02","Apple","Woody Perennial","0.35","Deep roots but managed for fruit sizing."
+"1.02","Artichokes","Non-woody Perennial","0.4","Large established crown systems with moderate drought tolerance."
+"1.03","Asparagus","Non-woody Perennial","0.4","Deep-rooted herbaceous perennial with high storage capacity."
+"1.01","Alfalfa (cycle)","Non-woody Perennial","0.4","Deep taproots can scavenge moisture; tolerant of moderate depletion."
+"2.01","Alfalfa (annual)","Non-woody Perennial","0.4","Deep taproots can scavenge moisture; tolerant of moderate depletion."
+"4.01","Avocado","Woody Perennial","0.35","Note: Sensitive to low WHC due to shallow roots without root hairs."
+"1.04","Barley","Annual (Hardy)","0.45","Scavenging root system; relatively drought-tolerant small grain."
+"1.05","Beans (pinto)","Annual (Hardy)","0.45","Can tolerate moderate tension compared to succulent fresh beans."
+"1.06","Beans (dry)","Annual (Hardy)","0.45","Can tolerate moderate tension compared to succulent fresh beans."
+"","Beans (Fresh)","Annual (Sensitive)","0.6","Shallow roots; moisture stress quickly impacts pod quality."
+"1.08","Beets (table)","Annual (Sensitive)","0.6","Consistent moisture required for root development and texture."
+"1.09","Broccoli","Annual (Sensitive)","0.6","Shallow roots; high ET demand to maintain vegetative growth."
+"1.1","Cabbage","Annual (Sensitive)","0.6","High water content crop; very sensitive to head splitting/wilting."
+"1.11","Carrots","Annual (Sensitive)","0.6","Requires high soil moisture to prevent woody texture and forking."
+"1.12","Celery","Annual (Sensitive)","0.6","Extremely shallow roots; highest moisture requirement for crispness."
+"4.02","Citrus (>3.8 m tall)","Woody Perennial","0.35","Moderate depletion allowed but high frequency preferred in summer."
+"4.03","Citrus (<3.0 m tall)","Woody Perennial","0.35","Moderate depletion allowed but high frequency preferred in summer."
+"1.13","Corn (grain)","Annual (Hardy)","0.45","Deep roots; can mine moisture but sensitive during pollination."
+"1.14","Corn (silage)","Annual (Hardy)","0.45","Deep roots; can mine moisture but sensitive during pollination."
+"1.15","Cotton","Annual (Hardy)","0.45","Taproot system allows for significant soil moisture extraction."
+"1.16","Cucumber","Annual (Sensitive)","0.6","High water turnover; stress leads to bitter fruit."
+"4.04","Date Palm","Woody Perennial","0.35","Extremely hardy but requires managed floors for fruit yield."
+"1.17","Eggplant","Annual (Hardy)","0.45","Deeper rooted than most solanaceous vegetables."
+"4.05","Evergreen","Woody Perennial","0.35","Stable root volume allows for lower management floor."
+"1.18","Flax","Annual (Hardy)","0.45","Moderate drought tolerance once established."
+"1.19","Grains (small)","Annual (Hardy)","0.45","General category for cereals with scavenging root architectures."
+"1.20","Grains (winter)","Annual (Hardy)","0.45","General category for cereals with scavenging root architectures."
+"3.03","Grapes (wine)","Woody Perennial","0.35","Deep roots; often managed with Regulated Deficit Irrigation (RDI)."
+"3.04","Grapes (table)","Woody Perennial","0.35","Deep roots; often managed with Regulated Deficit Irrigation (RDI)."
+"3.05","Grapes (raisin)","Woody Perennial","0.35","Deep roots; often managed with Regulated Deficit Irrigation (RDI)."
+"2.02","Improved Pasture","Non-woody Perennial","0.4","Dense root mat; requires moderate floor to maintain regrowth."
+"3.06","Kiwifruit","Woody Perennial","0.35","Vines require stable moisture but can access larger soil volumes."
+"1.21","Lentil","Annual (Hardy)","0.45","Cool-season legume with reasonable drought resistance."
+"1.22","Lettuce","Annual (Sensitive)","0.6","Very shallow roots; moisture stress causes tipburn and bolting."
+"1.23","Melon","Annual (Hardy)","0.45","Deep taproots; managed depletion can increase sugar content (Brix)."
+"1.24","Millet","Annual (Hardy)","0.45","Highly efficient water user; scavenging roots."
+"1.25","Mustard","Annual (Hardy)","0.45","Rapid growth and reasonable scavenging ability."
+"1.26","Oats","Annual (Hardy)","0.45","Standard cereal tolerance for moderate soil moisture depletion."
+"4.06","Olives","Woody Perennial","0.35","Highly drought tolerant; 0.35 is a conservative management floor."
+"1.27","Onion (dry)","Annual (Sensitive)","0.6","Very shallow, fibrous roots; sensitive to bulb sizing issues."
+"1.28","Onion (green)","Annual (Sensitive)","0.6","Very shallow, fibrous roots; sensitive to bulb sizing issues."
+"1.29","Peas","Annual (Sensitive)","0.6","Shallow roots; moisture critical during flowering and pod fill."
+"1.3","Peppers","Annual (Sensitive)","0.6","Sensitive to blossom end rot if moisture fluctuates."
+"1.31","Potato","Annual (Sensitive)","0.6","Shallow roots; stress causes internal defects and yield loss."
+"1.32","Radishes","Annual (Sensitive)","0.6","Fast growing and shallow; requires constant moisture for quality."
+"1.34","Safflower","Annual (Hardy)","0.45","Deep taproot (up to 6ft+); excellent scavenger."
+"1.35","Sisal","Annual (Hardy)","0.45","Succulent-like properties with high drought tolerance."
+"1.36","Sorghum","Annual (Hardy)","0.45","Exceptional drought tolerance; can extract water at high tension."
+"1.37","Spinach","Annual (Sensitive)","0.6","Extremely shallow roots; sensitive to rapid ET changes."
+"1.38","Squash","Annual (Hardy)","0.45","Large leaf area but deep/wide-ranging root system."
+"3.07","Stone fruits","Woody Perennial","0.35","Deep roots; floor managed for fruit sizing and bud set."
+"1.39","Strawberries w/mulch","Annual (Sensitive)","0.6","Shallow roots; usually drip-irrigated with high frequency in CA."
+"1.47","Sudan grass","Annual (Hardy)","0.45","Hardy forage crop with extensive scavenging roots."
+"1.4","Sugarbeet","Annual (Hardy)","0.45","Deep taproot; can tolerate moderate stress before harvest."
+"3.48","Sugarcane","Non-woody Perennial","0.4","High biomass requires steady water but has established root depth."
+"1.41","Sunflower","Annual (Hardy)","0.45","Deep taproot; very effective at mining subsoil moisture."
+"1.42","Sweet Potatoes","Annual (Hardy)","0.45","Deeper and more resilient root system than Irish potatoes."
+"1.43","Tomato","Annual (Sensitive)","0.6","Fresh market types require high WHC for fruit quality/sizing."
+"2.03","Turfgrass (cool-season)","Non-woody Perennial","0.4","Fibrous mat; 0.4 prevents dormancy/browning in CA heat."
+"2.04","Turfgrass (warm-season)","Non-woody Perennial","0.4","Fibrous mat; 0.4 prevents dormancy/browning in CA heat."
+"1.44","Vegetables","Annual (Sensitive)","0.6","General category for high-turnover, shallow-rooted garden crops."
+"3.08","Walnuts","Woody Perennial","0.35","Large trees with deep roots but highly sensitive to late-season stress."
+"1.46","Watermelon","Annual (Hardy)","0.45","Deep roots; can handle more depletion than other melons."
+"1.45","Wheat","Annual (Hardy)","0.45","Standard grain scavenging profile."
+"1.33","Rice","Rice","1.0","WHC logic should be bypassed for this crop."
diff --git a/modules/data.land/data/crop_whc.rda b/modules/data.land/data/crop_whc.rda
new file mode 100644
index 0000000000000000000000000000000000000000..d22ad6545a98fcd40e0a14656486f93e73ad388d
GIT binary patch
literal 2696
zcmV;33U~EFT4*^jL0KkKSs)xY+5iZDfA9bQ-GESU|NsB*-@w1`|L{Nn05AXn00sdh
z2mn9<;0xa@cs2pH$=iU^t#;i5AP|&Mxw;J-gpvXi)M=^dG9=!jdM2Klk12@I+D3<z
z^*v8DPe{!tk(3QKlhFW50%$@MPbl>?dQAWxkYWI600w{opa1~lLqkJAGzK7O41fSM
z&;Za3fB*ntCZd9wq}4W~ZBxoLJrgIXgwWGK&;S4c000^TjSUSA0MHnLpfUge(?A12
zG5`Ppgoyx?O$5Y`O;r7=o;6HQDd`#-Xc}k$4FDPd0Q5?47phz8n_0l5Ewz@E2xve^
zf|5<7h3hb10~h3CNcesrCf>hdwFDDsAz(XU3;1-%lf&|7^kPCZ+Vq(~F*L`b+V%Q-
zv%8yN^`QYlghUWFR1=}qI=??BQ0jGh*x?0<l~~6uuT@i<%}rBP6>C~z^H)6=@?QA6
z66P|$Z_(C24u3zP$=zNozjdz?=h(lg$;Vxte*E3KELu-w)^^;Lzc+E2of%Ra5teQJ
z@z=uM1{o?N1z52CoqN<63XcU8a!{ozL1GF}R(S2cjDea-4tx#CAtv}pKxBa@@CWq*
zT$159xe-)URbsFjfoy;f-y%{34A@9Weay)uf!#2M0A)lCrsskL;)w(BwYZBYGTv9g
zXAMV5YQr#)_^t2h)_QAI^_=(hT_&z<W{kGl<tvUU8|?*?bf!0&sUpV&a;|o0%)(IE
zn$_`&B|-``bgFZt#13$4F4d#-%_a07ocSYJGYccKy5%fxLeOojyff3k4Lb#GZ3;DJ
z)>zI3xuyA}ZC6R%Q>QmGn__k_jh)g=NZe%1*h=RXq;beiaI#87URQCR2E4>yYw3!_
zXGtzKE4$#f9u@|F(K#SsWECi6tQWgv%fQCM+gC%4`j*_kn5Gvn){UjKRb6-NJC_CI
zI0*&!=__q@yfws!WK*Az*juF8)+$(wLuWy-k(FyUwqt|N_9B1=n)BXoZQ-UH_GGLT
zNaB=?nhtXefs;M^9E*mO+00;nn@UuZ==_e2xPX=Lo(-;d6}nh$6s=>n(so@{U3GHn
zt6klt+g7YIOH5+~WqPk1ja7WS?!|T*tER5*?(V6kmO^vbBPAR?E*hH3cy!m-@1jCz
z>4uvzonShQfUp)!&>Xd_^tKBKY?gv*Y6%cJeT*P{RFViE@PNpbmCUi3ONgLp6t!w~
zmPUD*Hiw&9!SBjctgCw|Qncqo#-oysa3iN|`OF<imu>y_19Eo!e<ccgYePttO$Z``
zNlZ~`9L#~4pa*p_H+Z;rkS`d6Z|~oYK(ui2F?ScC9hEY!L84dW9Cr8ol$2{<z*~!S
z<uqqpoDuVx{y0Y}^RaFp6zislIPcewOQ^lwVY752y)+9b>U@^2)<C}u`TDxur%=%j
zP&4M2zmpLOxJZj$xbyX(veaLHf;EUb{f!-Y<;G+DXU(thY-YK3InPyiKSLLC7aE)9
zV~Zr}*b;WXgft9z0p3?}o+^Dkw_5R;Cvm);f@%ErIeMoel?Zi=%o~S|+nl0Du^|sr
zDM-fQm3A{&g^VH5w^>RA=y_NiHNe}fMV7WftOn#-&Yqn}y=JQ04ifiTKQ03^j07<c
zj_8FM9+gXcJQ`Kp&6R~al~#GPO9;v<9(shU3$U7M)M{H5sA>Cab{0K;<^We>nr^ow
z6WX<GBxJOZz6Ob%L+TWg@vvQ|I>q8>Gl&<FFDo@l#UZrP=7`lR-+TH(g=mXw`aZh*
z;@u7Xoy;QHxyZ5fOr&To(I>(UYA%d!qTKr^K{jz{la(P-cH-dX>TFUuRu*;h(D7au
zL=7Yz#?4KZfvlY5iU}1Qp#sqA{2-OKoo7D=ytaoz23biW^9sv(Vp>wtK-T_gIxt@F
zszVMVuNCxjz%|uHkm>JU_KI6mpN0+2wAyuVXeMFNoV&oD5udJCU|$s=oK8bgDH?hJ
z8zWuxp)-*sFJ%i6F7mby&TU}Hm!+mMup?5^8pvx!$=a&!1z?M$jrLrRl)<(jgIjAd
zH$%L8R3vUR1Z+#W%UB#5@pkJ5M|6TY%aqn}tlut+at#t`hBR9QjaJVyp`{G<SngUK
zXWyl6x0qyZiE`rjWuAL`i~!o;)~z(|S!*`z(M>7%yLBsug=TRQ>$vQoWk#5jVqT#r
zu0}<xMR|%_p#eAenkvQbSyV#er$EVMP;ijo!ur%eSt?$jPN?0Eif^0OW-m&EXA#4z
zn-+?tV<}lsrlxnus-hP!7&E_nEXHG1odmp#juzM(xspTMHj}J*NPywg<EPCvYvYZY
zT4B`s4JI*<b}y+C(fOm-%Grk@jI;9_s8U6IIQq9uo5>JODM;FQJLVV5J+)XG&|_lF
z@<IFZ#}4anUKN#0jzAfBlSW3=D^($R%5BUp_9CqUvzPsp(dEuXrKTE4508n2n;}?q
zB5#J<y=xs{AiY_=g|TxWPIn~o1&NcwS>}CG*>T0pEL^MlY$D*T5J@1z&WpVcF!D)E
zhXtx@#Hh-0!+hcj+LEpa*vLBuB1EEXI;}~RofgKqg-ZQ$s^ydf*3B?KJx8A0n->|Z
zve|`&RG`5dPWG-|7effK66qn0_(xN*k%mO9<p8o;+_>{L(Ft=D7IH}rhQl5+86n3Z
z4EBt~(u`?}5;Z_7A~CYWwGBZbp}T)hsRmxl+<b$?HZ5r)rV6!-_|aevcRJ4ip$ky^
z?o)TW<|<}lf+{wD;2|g;no_pAolxUqAgaZ?!W;nzxf7uSfrBba!+x}JFjt~5T0+AE
z+Bol$+ei?xl#=AmY;hdS(4nNkjZ6c+{ACS`rq0%SD<xA+x4N@9dU#gpV($o-SVy?>
zN$a&qEl{DLkeDV?Dhm;cq6(msiov3^os)kl#>1wB)+(eWhjf76ESNp;L_$)U?zv6{
z3>fSxLaow9wJ;=%i5_xgMP)+)o;4xNJbuQ9KW`XgF3ZC*qcam=&Ey0P18Eon98!sP
zLOXZP1C|ATHEZpLtMVmy=}2(Ng9zP5)sYHtieBYPCAis6SW*@l)%J)~6o+!J-I7>a
zinezZ#xl&BXj?^urgsHgLoXDF8u6f3+=A0fIx;DRXCNRN?v$#bp((FBDAy)p<mU4B
zyIE|=*ohZ&Rb^ecGY$Q?bRmUFod98LF@Zi5z1zf;r6Ta)gkB`9;f^Lvm8W}<)QBfg
zqAeXLSDaS_-LFX(ZOh5nRFbKziVg(lW&M4g%z!6dM3j>hk*#5S*^}56UN%j)QwzPI
z?t)8Gnb5Zr7+Zr>5Vc}>C^QMF0A2`7qYL&}7MlrD6orEZnR1@IsEUyYL<<k4@S-B9
z=DJ-4z#<>wS#Ot3pMQhP9V?|B<C%n}BZ5>`Je1nFbd83Wd71RugnA93q>Jn{7X62&
zEP`y%uLP2v2<>+f3TL@OmGfF<M4Q<Nl#Q%A{4nh|WGGy~h(+t3Z8Xfr_oWUQODc`h
z=v26f6Pg$ctnP1gHN)-wz+dPdusyvg2tVWr=oWzhbAqWYqG*3}{}*yaI8cxv95>px
CZ}i~+

literal 0
HcmV?d00001

diff --git a/modules/data.land/man/crop_whc.Rd b/modules/data.land/man/crop_whc.Rd
new file mode 100644
index 00000000000..f9e26ae3db2
--- /dev/null
+++ b/modules/data.land/man/crop_whc.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{crop_whc}
+\alias{crop_whc}
+\title{Crop-specific minimum water holding capacity (WHC) thresholds}
+\format{
+A tibble with one row per crop and the following columns:
+\describe{
+  \item{crop_number}{BIS crop number (character). Blank for crops not in BIS.}
+  \item{crop_name}{Crop name.}
+  \item{Category}{Crop category (e.g., Woody Perennial, Annual (Hardy)).}
+  \item{minWHC}{Minimum WHC (fraction, 0-1).}
+  \item{Comments}{Rationale for the minimum WHC value.}
+}
+}
+\source{
+Expert knowledge; California irrigation scheduling guidelines.
+}
+\usage{
+crop_whc
+}
+\description{
+Minimum soil water content (fraction of available water-holding capacity)
+that should be maintained for each crop to avoid yield loss or quality issues.
+Values are based on crop root depth, drought tolerance, and sensitivity to
+water stress.
+}
+\examples{
+data(crop_whc)
+head(crop_whc)
+
+}
+\keyword{datasets}

From 7547021a557a513e72cb3c6e983b0dcc1b721491 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 27 Feb 2026 14:31:40 -0500
Subject: [PATCH 04/67] more consistent variable names with units

---
 modules/data.land/R/data.R                    |  2 +-
 modules/data.land/R/water_balance.R           | 73 +++++++++++++------
 modules/data.land/data-raw/crop_whc.R         |  2 +-
 modules/data.land/data-raw/crop_whc.csv       |  2 +-
 modules/data.land/man/apply_water_balance.Rd  | 12 +--
 modules/data.land/man/calc_water_balance.Rd   | 15 ++--
 modules/data.land/man/crop_whc.Rd             |  2 +-
 .../data.land/vignettes/CIMIS-event-files.qmd | 18 +++--
 8 files changed, 83 insertions(+), 43 deletions(-)

diff --git a/modules/data.land/R/data.R b/modules/data.land/R/data.R
index 2da17bd6a6e..304d3b96ee6 100644
--- a/modules/data.land/R/data.R
+++ b/modules/data.land/R/data.R
@@ -300,7 +300,7 @@
 #'   \item{crop_number}{BIS crop number (character). Blank for crops not in BIS.}
 #'   \item{crop_name}{Crop name.}
 #'   \item{Category}{Crop category (e.g., Woody Perennial, Annual (Hardy)).}
-#'   \item{minWHC}{Minimum WHC (fraction, 0-1).}
+#'   \item{whc_min_frac}{Minimum WHC as fraction of total WHC (0-1).}
 #'   \item{Comments}{Rationale for the minimum WHC value.}
 #' }
 #' @source Expert knowledge; California irrigation scheduling guidelines.
diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 263ce34af82..f61d69b974b 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -2,16 +2,17 @@
 #'
 #' This is the core water balance calculation that operates on primitive
 #' numeric vectors for easy testing and debugging. Each input is a time series
-#' of daily values for a single location (one date per row).
+#' of daily values for a single location (one date per row). The units for all
+#' quantities are arbitrary, but they should be consistent (distance / time;
+#' e.g., most commonly, mm/day).
 #'
-#' @param et Vector of evapotranspiration values (mm/day)
-#' @param precip Vector of precipitation values (mm/day)
-#' @param whc Water holding capacity (mm), default 500
-#' @param w_min_frac Fraction of WHC for minimum water level, default 0.375 is
-#' halfway between the recommended defaults for woody PFTs (0.25) and annuals (0.50)
+#' @param et Vector of evapotranspiration values (distance / time)
+#' @param precip Vector of precipitation values (distance / time)
+#' @param whc Water holding capacity (distance)
+#' @param w_min_frac Fraction of WHC for minimum water level
 #' @return List with vectors: W_t (water balance), irr (irrigation), runoff
 #' @export
-calc_water_balance <- function(et, precip, whc = 500, w_min_frac = 0.375) {
+calc_water_balance <- function(et, precip, whc, w_min_frac) {
   n <- length(et)
   if (length(precip) != n) {
     stop("et and precip must have the same length")
@@ -20,29 +21,32 @@ calc_water_balance <- function(et, precip, whc = 500, w_min_frac = 0.375) {
   w_min <- w_min_frac * whc
   field_capacity <- whc / 2
 
-  W_t <- numeric(n) #nolint: object_name_linter
-  W0_t <- numeric(n) #nolint: object_name_linter
+  #nolint start: object_name_linter
+  W_t <- numeric(n)
+  W0_t <- numeric(n)
   irr <- numeric(n)
   runoff <- numeric(n)
 
-  W_t[1] <- field_capacity #nolint: object_name_linter
+  W_t[1] <- field_capacity
 
   for (t in seq_len(n)) {
     if (t == 1) {
-      W_prev <- field_capacity #nolint: object_name_linter
+      W_prev <- field_capacity
     } else {
-      W_prev <- W_t[t - 1] #nolint: object_name_linter
+      W_prev <- W_t[t - 1]
     }
 
-    W0_t[t] <- W_prev + precip[t] - et[t] #nolint: object_name_linter
+    W0_t[t] <- W_prev + precip[t] - et[t]
 
     irr[t] <- max(w_min - W0_t[t], 0)
 
     runoff[t] <- max(W0_t[t] - whc, 0)
 
-    W_t[t] <- W_prev + precip[t] + irr[t] - et[t] - runoff[t] #nolint: object_name_linter
+    W_t[t] <- W_prev + precip[t] + irr[t] - et[t] - runoff[t]
   }
 
+  # nolint end: object_name_linter
+
   list(
     W_t = W_t,
     irr = irr,
@@ -52,22 +56,46 @@ calc_water_balance <- function(et, precip, whc = 500, w_min_frac = 0.375) {
 
 #' Apply water balance calculations to a data frame with multiple sites
 #'
-#' Groups by location and applies calc_water_balance to each group.
+#' Groups by location and applies calc_water_balance to each group. Unlike
+#' `calc_water_balance`, the units here *do* matter -- they should be `mm_day`.
 #'
-#' @param df Data frame with columns: date, location_id, et_mm_day, precip_mm_day
+#' @param df Data frame with columns: `date`, `location_id`, `etc_mm_day`,
+#' `precip_mm_day`, and `whc_min_frac`
 #' @param idcol Column name for grouping (typically, `location_id`, `parcel_id` or similar)
-#' @param whc Water holding capacity (mm)
-#' @return Data frame with added columns: W_t, irr, runoff
+#' @param whc_mm Water holding capacity (mm)
+#' @return Data frame with added columns: `W_t`, `irr`, `runoff`
 #' @export
-apply_water_balance <- function(df, idcol, whc = 500) {
+apply_water_balance <- function(df, idcol, whc_mm = 500) {
   need_cols <- c("etc_mm_day", "precip_mm_day", "date")
   missing_cols <- need_cols[!(need_cols %in% colnames(df))]
+  default_whc_min_frac <- 0.375
   if (length(missing_cols) > 0) {
     PEcAn.logger::logger.severe(
       "Missing the following required columns: ",
       paste(missing_cols, collapse = ", ")
     )
   }
+
+  if ("whc_min_frac" %in% colnames(df)) {
+    w_min_frac <- df$whc_min_frac
+    n_na <- sum(is.na(w_min_frac))
+    if (n_na > 0) {
+      PEcAn.logger::logger.warn(
+        sprintf(
+          "whc_min_frac has %d NA values. Replacing with default = %.3f",
+          n_na,
+          default_whc_min_frac
+        )
+      )
+      w_min_frac <- tidyr::replace_na(w_min_frac, default_whc_min_frac)
+    }
+  } else {
+    PEcAn.logger::logger.warn(
+      "whc_min_frac column not found in input data. Using default = 0.375"
+    )
+    w_min_frac <- default_whc_min_frac
+  }
+
   df |>
     dplyr::arrange(.data[[idcol]], .data$date) |>
     dplyr::mutate(
@@ -75,9 +103,10 @@ apply_water_balance <- function(df, idcol, whc = 500) {
       week = as.integer(format(.data$date, "%U")),
       day_of_year = as.integer(format(.data$date, "%j")),
       results = tibble::as_tibble(calc_water_balance(
-        .data$etc_mm_day,
-        .data$precip_mm_day,
-        whc = whc
+        et = .data$etc_mm_day,
+        precip = .data$precip_mm_day,
+        whc = whc_mm,
+        w_min_frac = w_min_frac
       )),
       .by = dplyr::all_of(idcol)
     ) |>
diff --git a/modules/data.land/data-raw/crop_whc.R b/modules/data.land/data-raw/crop_whc.R
index 4435b77fcd8..800e4e567be 100644
--- a/modules/data.land/data-raw/crop_whc.R
+++ b/modules/data.land/data-raw/crop_whc.R
@@ -14,7 +14,7 @@ crop_whc <- read_csv(
     crop_number = col_character(),
     crop_name = col_character(),
     Category = col_character(),
-    minWHC = col_double(),
+    whc_min_frac = col_double(),
     Comments = col_character()
   ),
   progress = FALSE
diff --git a/modules/data.land/data-raw/crop_whc.csv b/modules/data.land/data-raw/crop_whc.csv
index 88f3cf2097a..33be8826ebb 100644
--- a/modules/data.land/data-raw/crop_whc.csv
+++ b/modules/data.land/data-raw/crop_whc.csv
@@ -1,4 +1,4 @@
-"crop_number","crop_name","Category","minWHC","Comments"
+"crop_number","crop_name","Category","whc_min_frac","Comments"
 "3.01","Almonds","Woody Perennial","0.35","Deep roots but managed to avoid nut shrivel during heat."
 "3.02","Apple","Woody Perennial","0.35","Deep roots but managed for fruit sizing."
 "1.02","Artichokes","Non-woody Perennial","0.4","Large established crown systems with moderate drought tolerance."
diff --git a/modules/data.land/man/apply_water_balance.Rd b/modules/data.land/man/apply_water_balance.Rd
index bb90f1ec785..42bba5888f7 100644
--- a/modules/data.land/man/apply_water_balance.Rd
+++ b/modules/data.land/man/apply_water_balance.Rd
@@ -4,18 +4,20 @@
 \alias{apply_water_balance}
 \title{Apply water balance calculations to a data frame with multiple sites}
 \usage{
-apply_water_balance(df, idcol, whc = 500)
+apply_water_balance(df, idcol, whc_mm = 500)
 }
 \arguments{
-\item{df}{Data frame with columns: date, location_id, et_mm_day, precip_mm_day}
+\item{df}{Data frame with columns: `date`, `location_id`, `etc_mm_day`,
+`precip_mm_day`, and `whc_min_frac`}
 
 \item{idcol}{Column name for grouping (typically, `location_id`, `parcel_id` or similar)}
 
-\item{whc}{Water holding capacity (mm)}
+\item{whc_mm}{Water holding capacity (mm)}
 }
 \value{
-Data frame with added columns: W_t, irr, runoff
+Data frame with added columns: `W_t`, `irr`, `runoff`
 }
 \description{
-Groups by location and applies calc_water_balance to each group.
+Groups by location and applies calc_water_balance to each group. Unlike
+`calc_water_balance`, the units here *do* matter -- they should be `mm_day`.
 }
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index 7b40159a458..a7a298f32a2 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -4,17 +4,16 @@
 \alias{calc_water_balance}
 \title{Calculate water balance for a time series at a single site}
 \usage{
-calc_water_balance(et, precip, whc = 500, w_min_frac = 0.375)
+calc_water_balance(et, precip, whc, w_min_frac)
 }
 \arguments{
-\item{et}{Vector of evapotranspiration values (mm/day)}
+\item{et}{Vector of evapotranspiration values (distance / time)}
 
-\item{precip}{Vector of precipitation values (mm/day)}
+\item{precip}{Vector of precipitation values (distance / time)}
 
-\item{whc}{Water holding capacity (mm), default 500}
+\item{whc}{Water holding capacity (distance)}
 
-\item{w_min_frac}{Fraction of WHC for minimum water level, default 0.375 is
-halfway between the recommended defaults for woody PFTs (0.25) and annuals (0.50)}
+\item{w_min_frac}{Fraction of WHC for minimum water level}
 }
 \value{
 List with vectors: W_t (water balance), irr (irrigation), runoff
@@ -22,5 +21,7 @@ List with vectors: W_t (water balance), irr (irrigation), runoff
 \description{
 This is the core water balance calculation that operates on primitive
 numeric vectors for easy testing and debugging. Each input is a time series
-of daily values for a single location (one date per row).
+of daily values for a single location (one date per row). The units for all
+quantities are arbitrary, but they should be consistent (distance / time;
+e.g., most commonly, mm/day).
 }
diff --git a/modules/data.land/man/crop_whc.Rd b/modules/data.land/man/crop_whc.Rd
index f9e26ae3db2..a9525569cc7 100644
--- a/modules/data.land/man/crop_whc.Rd
+++ b/modules/data.land/man/crop_whc.Rd
@@ -10,7 +10,7 @@ A tibble with one row per crop and the following columns:
   \item{crop_number}{BIS crop number (character). Blank for crops not in BIS.}
   \item{crop_name}{Crop name.}
   \item{Category}{Crop category (e.g., Woody Perennial, Annual (Hardy)).}
-  \item{minWHC}{Minimum WHC (fraction, 0-1).}
+  \item{whc_min_frac}{Minimum WHC as fraction of total WHC (0-1).}
   \item{Comments}{Rationale for the minimum WHC value.}
 }
 }
diff --git a/modules/data.land/vignettes/CIMIS-event-files.qmd b/modules/data.land/vignettes/CIMIS-event-files.qmd
index 0afc6136bc6..23e4cbdac6c 100644
--- a/modules/data.land/vignettes/CIMIS-event-files.qmd
+++ b/modules/data.land/vignettes/CIMIS-event-files.qmd
@@ -133,7 +133,12 @@ if (nrow(missing_crops) > 0) {
   )
 }
 
-dp_with_cropname <- dp_crops_filled |> dplyr::filter(!is.na(crop_name))
+dp_with_cropname <- dp_crops_filled |> 
+  dplyr::filter(!is.na(crop_name)) |>
+  dplyr::left_join(
+    crop_whc |> dplyr::select("crop_name", "whc_min_frac"),
+    by = "crop_name"
+  )
 ```
 
 # Join with ETo data
@@ -142,7 +147,9 @@ Join with ETref data.
 
 ```{r}
 dp_with_eto <- dp_with_cropname |>
-  dplyr::left_join(etref, by = c("id", "date"))
+  dplyr::left_join((
+    etref |> dplyr::select("id", "date", "etref_mm_day")
+  ), by = c("id", "date"))
 ```
 
 Calculate ETc directly using eto_to_etc_bism. Group by crop_name and apply since eto_to_etc_bism takes a single crop at a time.
@@ -160,7 +167,8 @@ dp_with_etc <- dp_with_eto |>
   dplyr::select(
     dplyr::any_of(c("id", "parcel_id", "lat", "lon")),
     "date",
-    "etc_mm_day"
+    "etc_mm_day",
+    "whc_min_frac"
   )
 ```
 
@@ -198,12 +206,12 @@ resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
 dp_with_etc <- resolve_multicrop(dp_with_etc)
 ```
 
-Join with precipitation data (inner_join to ensure matching dates). Rename etc_mm_day to et_mm_day for apply_water_balance compatibility.
+Join with precipitation data (inner_join to ensure matching dates).
 
 ```{r}
 dp_crops_all <- dp_with_etc |>
   dplyr::inner_join(precip, by = c("id", "date")) |>
-  dplyr::select(c("id", "lat", "lon", "date", "etc_mm_day", "precip_mm_day"))
+  dplyr::select(c("id", "lat", "lon", "date", "etc_mm_day", "precip_mm_day", "whc_min_frac"))
 ```
 
 # Calculate water balance

From 988d400a472f81d70f5c832937bacd7201f6a5dd Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 27 Feb 2026 16:43:07 -0500
Subject: [PATCH 05/67] rewrite water balance logic. add some tests.

---
 modules/data.land/R/water_balance.R           | 97 ++++++++++++-------
 modules/data.land/man/calc_water_balance.Rd   | 30 +++++-
 .../tests/testthat/test-water_balance.R       | 58 +++++++++++
 3 files changed, 148 insertions(+), 37 deletions(-)
 create mode 100644 modules/data.land/tests/testthat/test-water_balance.R

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index f61d69b974b..688c3cdc63d 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -6,52 +6,82 @@
 #' quantities are arbitrary, but they should be consistent (distance / time;
 #' e.g., most commonly, mm/day).
 #'
+#' The handling of rice here is crude and primitive: setting `whc_min_frac` =
+#' 1.0 (as set in `crop_whc` for rice) means a near-constant need for
+#' irrigation to balance ET + seepage, which roughly mimics the behavior of
+#' maintaining a standing flood. However, proper treatment of rice requires
+#' maintaining a field *above* field capacity, and has other complications.
+#' These will be implemented in the future.
+#'
 #' @param et Vector of evapotranspiration values (distance / time)
 #' @param precip Vector of precipitation values (distance / time)
-#' @param whc Water holding capacity (distance)
-#' @param w_min_frac Fraction of WHC for minimum water level
-#' @return List with vectors: W_t (water balance), irr (irrigation), runoff
+#' @param whc Water holding capacity (distance); interpreted as the plant-
+#'   available range from wilting point to field capacity
+#' @param whc_min_frac Fraction of WHC for minimum water level (irrigation
+#'   trigger)
+#' @param seepage_rate Daily seepage loss for rice paddies (distance / time);
+#'   only used when is_rice = TRUE
+#' @param is_rice Logical; if TRUE, applies a constant seepage loss (mm/day)
+#' @return List with vectors: W_t (soil water), irr (irrigation), runoff
 #' @export
-calc_water_balance <- function(et, precip, whc, w_min_frac) {
+calc_water_balance <- function(
+  et,
+  precip,
+  whc,
+  whc_min_frac,
+  seepage_rate = NULL,
+  is_rice = FALSE
+) {
+  # nolint start: object_name_linter
   n <- length(et)
-  if (length(precip) != n) {
-    stop("et and precip must have the same length")
+
+  if (is_rice && is.null(seepage_rate)) {
+    stop("Seepage rate must be defined for rice fields")
   }
 
-  w_min <- w_min_frac * whc
-  field_capacity <- whc / 2
+  # Field capacity is the upper management target (= WHC under the convention
+  # that WHC spans wilting point to field capacity)
+  field_capacity <- whc
+  w_min <- whc_min_frac * whc
 
-  #nolint start: object_name_linter
   W_t <- numeric(n)
-  W0_t <- numeric(n)
   irr <- numeric(n)
   runoff <- numeric(n)
 
-  W_t[1] <- field_capacity
+  # Initialize at field capacity
+  W_prev <- field_capacity
 
   for (t in seq_len(n)) {
-    if (t == 1) {
-      W_prev <- field_capacity
-    } else {
-      W_prev <- W_t[t - 1]
-    }
+    # Only water above w_min is available for seepage
+    seepage <- if (is_rice) min(seepage_rate, max(0, W_prev - w_min)) else 0.0
 
-    W0_t[t] <- W_prev + precip[t] - et[t]
+    # Potential state after precip and ET
+    W0 <- W_prev + precip[t] - et[t] - seepage
 
-    irr[t] <- max(w_min - W0_t[t], 0)
+    # If W0 falls below w_min (e.g., high ET and seepage; low precip), irrigate
+    # to field capacity.
+    if (W0 < w_min) {
+      irr[t] <- field_capacity - W0
+      W0 <- field_capacity
+    } else {
+      irr[t] <- 0
+    }
 
-    runoff[t] <- max(W0_t[t] - whc, 0)
+    # If W0 exceeds field capacity, the difference is runoff.
+    if (W0 > field_capacity) {
+      runoff[t] <- W0 - field_capacity
+      W_t[t] <- field_capacity
+    } else {
+      runoff[t] <- 0
+      W_t[t] <- max(W0, w_min)
+    }
 
-    W_t[t] <- W_prev + precip[t] + irr[t] - et[t] - runoff[t]
+    W_prev <- W_t[t]
   }
 
   # nolint end: object_name_linter
 
-  list(
-    W_t = W_t,
-    irr = irr,
-    runoff = runoff
-  )
+  list(W_t = W_t, irr = irr, runoff = runoff)
 }
 
 #' Apply water balance calculations to a data frame with multiple sites
@@ -77,8 +107,8 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
   }
 
   if ("whc_min_frac" %in% colnames(df)) {
-    w_min_frac <- df$whc_min_frac
-    n_na <- sum(is.na(w_min_frac))
+    whc_min_frac <- df$whc_min_frac
+    n_na <- sum(is.na(whc_min_frac))
     if (n_na > 0) {
       PEcAn.logger::logger.warn(
         sprintf(
@@ -87,17 +117,18 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
           default_whc_min_frac
         )
       )
-      w_min_frac <- tidyr::replace_na(w_min_frac, default_whc_min_frac)
+      whc_min_frac <- tidyr::replace_na(whc_min_frac, default_whc_min_frac)
     }
   } else {
     PEcAn.logger::logger.warn(
-      "whc_min_frac column not found in input data. Using default = 0.375"
+      "whc_min_frac column not found in input data. Using default = ",
+      default_whc_min_frac
     )
-    w_min_frac <- default_whc_min_frac
+    whc_min_frac <- default_whc_min_frac
   }
 
   df |>
-    dplyr::arrange(.data[[idcol]], .data$date) |>
+    dplyr::arrange(.data[[idcol]], .data$date) |> # nolint: object_usage_linter
     dplyr::mutate(
       year = as.integer(format(.data$date, "%Y")),
       week = as.integer(format(.data$date, "%U")),
@@ -106,9 +137,9 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
         et = .data$etc_mm_day,
         precip = .data$precip_mm_day,
         whc = whc_mm,
-        w_min_frac = w_min_frac
+        whc_min_frac = whc_min_frac
       )),
       .by = dplyr::all_of(idcol)
     ) |>
-    tidyr::unpack(results)
+    tidyr::unpack(.data$results)
 }
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index a7a298f32a2..a7c459d8b06 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -4,19 +4,33 @@
 \alias{calc_water_balance}
 \title{Calculate water balance for a time series at a single site}
 \usage{
-calc_water_balance(et, precip, whc, w_min_frac)
+calc_water_balance(
+  et,
+  precip,
+  whc,
+  whc_min_frac,
+  seepage_rate = NULL,
+  is_rice = FALSE
+)
 }
 \arguments{
 \item{et}{Vector of evapotranspiration values (distance / time)}
 
 \item{precip}{Vector of precipitation values (distance / time)}
 
-\item{whc}{Water holding capacity (distance)}
+\item{whc}{Water holding capacity (distance); interpreted as the plant-
+available range from wilting point to field capacity}
 
-\item{w_min_frac}{Fraction of WHC for minimum water level}
+\item{whc_min_frac}{Fraction of WHC for minimum water level (irrigation
+trigger)}
+
+\item{seepage_rate}{Daily seepage loss for rice paddies (distance / time);
+only used when is_rice = TRUE}
+
+\item{is_rice}{Logical; if TRUE, applies a constant seepage loss (mm/day)}
 }
 \value{
-List with vectors: W_t (water balance), irr (irrigation), runoff
+List with vectors: W_t (soil water), irr (irrigation), runoff
 }
 \description{
 This is the core water balance calculation that operates on primitive
@@ -25,3 +39,11 @@ of daily values for a single location (one date per row). The units for all
 quantities are arbitrary, but they should be consistent (distance / time;
 e.g., most commonly, mm/day).
 }
+\details{
+The handling of rice here is crude and primitive: setting `whc_min_frac` =
+1.0 (as set in `crop_whc` for rice) means a near-constant need for
+irrigation to balance ET + seepage, which roughly mimics the behavior of
+maintaining a standing flood. However, proper treatment of rice requires
+maintaining a field *above* field capacity, and has other complications.
+These will be implemented in the future.
+}
diff --git a/modules/data.land/tests/testthat/test-water_balance.R b/modules/data.land/tests/testthat/test-water_balance.R
new file mode 100644
index 00000000000..f55af788f09
--- /dev/null
+++ b/modules/data.land/tests/testthat/test-water_balance.R
@@ -0,0 +1,58 @@
+context("Water balance calculations")
+
+expect_nonnegative <- function(result) {
+  testthat::expect_true(all(result$W_t >= 0))
+  testthat::expect_true(all(result$irr >= 0))
+  testthat::expect_true(all(result$runoff >= 0))
+}
+
+test_that("calc_water_balance: more precip leads to more runoff", {
+  n <- 10
+  et <- rep(5, n)
+  whc <- 100
+  whc_min_frac <- 0.5
+
+  precip_low <- c(rep(5, 5), rep(0, 5))
+  precip_high <- c(rep(15, 5), rep(0, 5))
+
+  result_low <- calc_water_balance(et, precip_low, whc, whc_min_frac)
+  result_high <- calc_water_balance(et, precip_high, whc, whc_min_frac)
+
+  expect_true(sum(result_high$runoff) > sum(result_low$runoff))
+  expect_nonnegative(result_low)
+  expect_nonnegative(result_high)
+})
+
+test_that("calc_water_balance: more ET leads to less runoff", {
+  n <- 10
+  precip <- c(rep(10, 5), rep(0, 5))
+  whc <- 100
+  whc_min_frac <- 0.5
+
+  et_low <- rep(2, n)
+  et_high <- rep(8, n)
+
+  result_low <- calc_water_balance(et_low, precip, whc, whc_min_frac)
+  result_high <- calc_water_balance(et_high, precip, whc, whc_min_frac)
+
+  expect_true(sum(result_high$runoff) < sum(result_low$runoff))
+  expect_nonnegative(result_low)
+  expect_nonnegative(result_high)
+})
+
+test_that("calc_water_balance: more ET leads to more irrigation", {
+  n <- 60
+  precip <- rep(0, n)
+  whc <- 100
+  whc_min_frac <- 0.5
+
+  et_low <- rep(1, n)
+  et_high <- rep(5, n)
+
+  result_low <- calc_water_balance(et_low, precip, whc, whc_min_frac)
+  result_high <- calc_water_balance(et_high, precip, whc, whc_min_frac)
+
+  expect_true(sum(result_high$irr) > sum(result_low$irr))
+  expect_nonnegative(result_low)
+  expect_nonnegative(result_high)
+})

From 21dd0572ef0adee93246c8d02bae64348c5b7b2b Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sun, 1 Mar 2026 09:43:55 -0500
Subject: [PATCH 06/67] update crop_whc.rda

---
 modules/data.land/data/crop_whc.rda | Bin 2696 -> 2703 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/data.land/data/crop_whc.rda b/modules/data.land/data/crop_whc.rda
index d22ad6545a98fcd40e0a14656486f93e73ad388d..10f58d95f836c43504dfdca450ce87e132e0982e 100644
GIT binary patch
delta 2698
zcmV;53U&2}6^|7WLRx4!F+o`-Q&~?jzIp%%eUT9&fB#H*uG|6Bi#TwVoZt-#5-6b7
znmTp~B{DS9r?k^l{S7E>0x&=T00E)uXlOJ&5C%ZdG|&)4Ad|{^o|8{d0qG5-0MGyp
z00x=>032v&XlMq2#0>$E00x=>8Uc_101PD5NJUS{sx*3fQ1uO|rh_JfKs`VJGynhq
z&^<tLf1#nFpc(@ZGzLHb8fXA$20#D+Fp(e-nhA<&iKRRg^qG`288M{OKzf5sG5`jF
zk)cik`Q^f?wWJC{-C1iuhJ*x&DJ0quzKasd!t=0_H~+vrsnwcbK`SJM5Vk}u%*qLz
z9+r3F>JfpgNt43sKyoUMr{|jr^6NS%KwKFhe?$OWgacB6IXVYN(Z|*uZjQb8oJfcz
z@6(%ARASLlRJB5;W**U29+P=ZYuQ9Fz7K2Z>yN{Ke}j*yXOr8TJTUuD7Yz2Z+r2MM
zM8M`5jWtTyi!fDIMJVfc!q)DI)y4J-SZ1h<6=K7%_FAYgDm>~a<e^Gbg2WV{tuozx
ze<=e*k{#>~=piaWAp;-<7D)ha0s26ml5mzR^i>rVSgZ!1TOb58B2okkG=wC77OKDz
z(9ns2SP*J+d(bYF#2+U!IWWwg%fPI9#u%iTaGz+9=~sh#N<Ohs&5@gxWZsD4iPu{S
zEEo{`WyGe)OYIsF0KVj5Rv&c?L!|}0e^!lq(1HVs8Dy$q*-+tz<&&*tx}z}J4ZPS7
zqzpNL%2?qfY(XwH(izpMn|6C5wVa9?D(MO_aBMSfcnhJ-Y^cvaI}L9di=yLX2zU$9
z5fLFPcugY)4w_uRgm<(ZgBymN;q9%ndb{H=EEbD1Tus@_g>TU~+$aiwr3Fg3f3=qb
z4B<wg&WwYk-W=>dm?Gy6viid0lcwE6Ft7)VZV2Mra|U$M&bA2bs7C$()EgvA7pPG3
z>x_lEJ_^d<q&emGkdOcaII9-e&zIGNl29s<$tf7(sTd4y`hQI8?c3GjHeb)$=eIlC
zd@CDv04IU#Jzee@?=HC_w0<ncf2TrN=)%WEvogaji&s^JhrPqgU7|(Rye?ja=qjwG
zS(%xcRH;%@7lEOTC^>mKLdb{9JiecKF$*`G^U{mV2ddHr&^DEjNUdutI)y}RRhdgt
z(<mVGd>BFS*-#LF;RGdYo>YS>D~W-e?Co1g(iAjA4PByUK=_%WT5U+te=*4exkc^~
zc@!Y!;(kW}=2h`(_>dM(Znu_&Jj+8!l}!jDgGo$LX(?bUfIN7^*Z4aA%n9(2IepJ>
zEkLq)b{MZ?;JB&;`$I}cWRTU@*|{{aUqGtweD@hKt*<1$^B+C(;a>JCr?m*&$oUS8
zZ*X{PKR|1L3!MZD2_?>je@jaESxh$bG&H2RWPi8}cFm`L==ew^O4cu4JlCu<mRZ%7
zA`I;unb5;{z37HrPCm7@tsS%P9tPoenEK3EO^YUmsFa$0V8}m6{v+B6#C&JFv(f5C
zBsP`_d~+``J9{}o{`RS*#70f9_s@bo|JWpa$q16Mrj(}ftYqFue_Mio<J!Tdfo%_o
zgbi^wY!PL#kZb|D7PF_HG+wh+ZHF0qtsf2pxdvqQB~Q8`Qm4qY+wJ;zV|#qqp<14g
zWwTN;m0rg<ltQ{p3Z!DHRfxg+jFhpv#NYz1>=S0~JvpMPdx{m;3o9Y!aCQk?xJ(Mu
zhXN83yr_dnfp`+|f3o$-RD6>C^c~kkx+U_4h7m{Q{oTm)Mc)m6PkNDZYmpE-r4BBN
z(I&qkN|JO&xm9guMHo(d3vVi!V%(fMI@=VEb&EV*HXbw2b|7q!cbj##S_c7)Trwmx
z8>ka;(0Wu|ZXHK{{HJ?LU??I;fcA$fZ3-b_CIf|^=99Z|e``4*)G$UoHp!<03aCmT
z@hsNKA&RFnxn<^Bp0u?FSUhXXa8C%+%2#M#6(F2WLs2Oj`oIm5vii`O%&M==p^_`>
zt+SVS^XJE(%Ir0eQtN>7pMsF1y0`jPLN1axS#vzh8)67HHnbyrJIAj=M)Occ)VkcZ
z0mZJK=D=iif5;=MT;{ZmhY5Bq=ru{E7}aeOGq#!?hO{)!$6D0y&$n9La4^W<66eYB
z%S|_Rm;trLto7HeWw711RWzs5+|67yD>IOne|gbB%8fB5#J+JTu0}<xMR|%_;Q=?{
zKP)25UiH<4E^2fP)@26?4h)yuq5{cM^#XNA40#!;e|bow)&owHmN=&1h-(>2%7ryF
zzC~3Kxp_gI<b0-OGOE%6N~DvVoPg)KBz=={I>&^F4n6ahzgW*_EN#})4*Bdfn8rQ$
zzG#-O#U5O(pmP|@KM}r#Bv;4F=ifGPBtbN#BXQ$*&@Yyos<1Vv#>J!NgZO2RJ$B%{
zb(Ktxe_$DOri_iqR;ohvN^a0D_M<ig(ZBBT4vyP2z1rCz9GdVJrA-GSu<-7?*s<S$
zg7s$x7RAhgIoy-T7A8*#X`$#Xmn>Yu#mc{Jgj^M(2_zVLQFfum9!V*X;I&Pdl^ITW
zZ>k`zsVd-&tb?FpBuXaJs?9Q!qT1I`RIiAtf4L$6Sv8!`lLxrumW|P5S!~6HRG`8(
zoON>XbTEq%E|M8u!>88F#$>G}0J2)#xb}Ofgt?LnIub*{p-z&CBc&l0)M|!sQW|U$
zaR4^PF{dI!B-mgQH8=3<?MO27z2}VX4U1w(se-Ly{V1>pcIRyH3J|j&dgV9zJS9yG
ze^5qZoWN9+JGW(?nwiuN;fR8(7Vi+}0uyx?fd=-5p(w+CNaSFzVlY}l!vpG^xJhoL
z2w2KVb7xZ=M^T~_G?+1|fOg-Mp|N&t?laY_l}&eE>dfQj<6EVRydqurN3ilq?6pgq
zDpLZ;giNZSu^6f#stGubnS+rUFRp84e`b$Sgw`sgC5Lo?;4GLv5JW;!n(MhvB@7to
zDnhN&M#V5BjENsenNe9#z-f%o<{jTtQ>*M_3}xAPW>jW^Y#G3SfuL<807H#5OT-lW
zT^5-OHPN|kmf81Z#V&#?QU;i8A6Isyrv~X+_A<pIlrz0HP0kp$uH^v^%qrZ%e*}YI
zOtS*PMio;z6O!psDFb+%J~R+%$pyA^3r#I3$fgyXfPia`?>S5)#OfwY-kS9jhr#9L
z&u(VaNW0prD&*$W_G>>1Cka{rCncc3pBi56;!0AHcyK~55>{}>6DG>jwa98j6R1%Z
zj>;A4ir|^6LkkjHn}A4Uk*HV&e?k|e{JoA000YiLBOw%#v0;1K6WSGB)=jrl3$>u@
zf=f3uu(u-^Tf;aZXNlmT&?ck-cp)yv7wE7pc9Nti3kD5x=RKKG6(SIcj&jcUVoj1X
z&gggqkLsDHp~{z4sg3#bs2VKtDLh5=rBE~_yszd-QLWyFotu!4Slk-OaK6(~aN2p|
z$R^E7@kuGrj^nt9Q$3m#uZq(uB;MFWq;6r`={%V`(cKCs7!^?0p9cdNoC5aJLwu!`
zM-;jhE+YW21IB6#9cm~>-slT_4bVS-l!PDT3CI_iKpSAEOO`<n=KsaqkxmpO6U=X(
EfNRI;a{vGU

delta 2691
zcmV-}3VijC6^IoPLRx4!F+o`-Q&}J!H`)LQe~}R)f8Q*4HUYND+kn!ocHILY5R_55
zx(yqIk^&RdX{qTlB;KNWCZ3v)DTvV8Mu(L3Jx?`HNX;jalnpkM(Ev&UXhIZEDD^aY
zO#mK{VgP6W27m#e0084dLqk9`1|Vn*fB-bm0MHD8003boqJo*E)i$GTQ_3_w6DO&J
z(9=NBe*gdg000^TjSUSA0MHnLpfUge(?A12G5`Ppgoyx?O$5Y`O;r7=o;6HQDd`#-
zXc}k$4FDPd0Q5?47phz8n_0l5Ewz@E2xve^f|5<7h3hb10~h3CNcesrCf>hdwFDDs
zAz(XU3;1-%lf&|7^kPCZ+Vq(~F*L`b+V%Q-f3v%rVfCQ_L4-sQHdGU#)jGdFCs68j
zdf4FwiIrH#Ew5Emo6SvARTXPmV)Iu$7xG^CyAtLyzi-jjKMsFCp~>A|EWdTH66e^z
zsmaG(oqqh?x-42xWY%`vmA^M}nVlI@8xfXm{qfhr-Ub;eBL!Hn{hfQ%7z&RC6mn3d
ze=0#@3Q$&f?Y@kGnn@0P4ap%U_((uxfhX_>^#WXy;W)VwR8&=Buo{7EfDqpzQUnax
zNJxFm$s~c@FopnSL=C3rf&}7;1Ms!DizqVQSHWiuM@eeKFp&7I@9EZhYgP4}_w`*S
zu54zEw%X+@jwu`M1(S59H=3y;#{_b&e|Bih!cf?n)$xiYLJBl=s&k~o4sdHO)uZ&y
zCG;Ph`6F2~3nQ|+<t%PO&~2=|Gt<8fI|XfR3N>cdSk48xrTL_7S4rJdr#Ca3Vs<c%
zozhH5++@tyO6L}&amY<@vPwi=S8<*Oyu@E?>59Z>NiH=jyWqAS76yOOIUrzUe-$WX
ztQWgv%fQCM+gC%4`j*_kn5Gvn){UjKRb6-NJC_CII0*&!=__q@yfws!WK*Az*juF8
z)+$(wLuWy-k(FyUwqt|N_9B1=n)BXoZQ-UH_GGLTNaB=?nhtXefs;M^9E*mO+00;n
zn@UuZ==_e2xPX=Lo(-;d6}nh$e-y1_w$gT8Rb6#*>#JSerQ24lGfPZk1Z8@!9F0|c
zyza$z8mp$R?(XiXrj|l;*drwzJuVuW%6N3w*YBc2Xz7NVF`ZyKjDWBfOwb&)tn{`E
z2yB*uYHA4(I(>{Fd{mMMAMk+4m6gn~nM;VEX%w|;b(ThXnKp-;TEXwie^jiidnr=1
z=R?M$l8$gAr)>Gm9Y~jL{q_TLcKm-O3VUlqNR>?pB7;dxQE42^ftjEObuu@2xOb2*
z7=v%`-;F@DaPcvB7oi=MGOj_QSL7Ua_xzNUYhS=yi*)5QXIz{S^O^oQM=JBNZXXou
zrieK2*N;o6z1?B6bR)eqe+ww;e3q`(K)($6`nuhxP|*%hGv=4SlMxBHNQ+*$^Yx&z
z)L(ysHHbR>jU9RA#$)_v&9CrmX1R7b&sBIoLl<%v8k^>0izMpU5_Z3YGz@qF-dAy+
zDt$e-TJf1DalD;^Y5ev%dZ!|l2z89i8;6bCoT5juArDh2NXFrne|9rig^VH5w^>RA
z=y_NiHNe}fMV7WftOn#-&Yqn}y=JQ04ifiTKQ03^j07<cj_8FM9+gXcJQ`Kp&6R~a
zl~#GPO9;v<9(shU3$U7M)M{H5sA>Cab{0K;<^We>nr^ow6WX<GBxJOZz6Ob%L+TWg
z@vvQ|I>q8>Gl&<Fe=jRFO2r|x(&mWOE8lziLWO9HYx+LA`{LaV{hiDr*}2HE^h~5^
zF3~5#4QeioZlc`#C_y%HXp@y8Qg-6t=IU%xIaU^R^U(2L7eoyt9mdT~mVvCC<cbLu
z9H9cx>ii&;ww-4`2E4Y1LIzn$BJ&E%d16{p(m>Y!X*w`ofAFe94kND>^mD*9)kTo$
z?_TzbTT`Eg4bHUMb#G`UVbPqsz@8DGu2*1R6(F2WLs2OjdH@?EUG$+dktHu>3lT2z
zwhzv2V9A%IrZTW2Qqmg8YemW0s_q3~i=>VAT#uB&wjhIBYce-Oyn9q6ZZrgJOS#Kf
z92)U<>jp=3e}Xy7l-6;q-!6-C4H9XFG+P9XR?jn`r4030?phsZ-=%K1m}G8=a^m=9
zo_l+Y0NUWztu*dgYc}lBO)2=hbt{I2W^oehxa^>1MwpXgUZE(iMn$Vdd5T-10XO)X
zD#h<vR6^pXK*?lKaFF1_`qV&KDqf&YsNIf=Z=2U<e=kadXA#4zn-+?tV<}lsrlxnu
zs-hP!7&E_nEXHG1odmp#juzM(xspTMHj}J*NPywg<EPCvYvYZYT4B`s4JI*<b}y+C
z(fOm-%Grk@jI;9_s8U6IIQq9uo5>JODM;FQJLVV5J+)XG&|_lF@<IFZ#}4anUKN#0
zjzAfBf0IT=)GJjXdCG0fF7_g=0<)L>l+oqRMWv=1NDq&RgqtB)bRutt+r4WYU?9C&
zy@j!JAWnBA@&$>L!dd2hQrU6E%q(21`fMWLtq@5d!_JGn4lwdbOos)kYs9F^a>IP$
z3fhvc2-wIw1|meFZ91(<l${pFxrIvoa;oK&e+1UeFh4y<p4^)k8LYC|g@sh0!5dEY
zu3i^I2(c3BA&vM)Q?ZeTM6BfivRd4@^ES~5a}*YGNe+g?9y1vs#~}>%jKtE6X^Ijx
zKq?|Jvc$CwK_Q{Le@>|eUd!BkgTyv1X(FZywTt-CU=DXW&j6tdQ2Xvvce~~)W@3UW
ze>Q*MAt)Z2QntIDP~&1Cs>Qp)903Tq6QKivgDOhHezb8gSE4XlLc;^vIPa3%ND#4<
zlH|^8aU9Ihp`^i$Oas3BWetm_&enP>B~wkey0bWXcvk6R?+BMzN4WAy>$OQOP@$lZ
zm?lyx3lWN<3ZRpU!J@RClYc43!={ARe=4LUhjf76ESNp;L_$)U?zv6{3>fSxLaow9
zwJ;=%i5_xgMP)+)o;4xNJbuQ9KW`XgF3ZC*qcam=&Ey0P18Eon98!sPLOXZP1C|AT
zHEZpLtMVmy=}2(Ng9zP5)sYHtieBYPCAis6SW*@l)%J)~6o+!J-I7>ainezZf5tM*
znrK@^gr;`|TthDuh#K*rR@{QqOFA+sg=Zij8t#;;p`j_SJ1EyCV&vxX_PbeZ$k>S&
zb5&(sxHAp?xO5?fNu2;;YcYX76usNTl%*o@;DlZztl^F(O_is6kkp7LP@*jzC|8_U
z1l_Mm7j4VQ*i@3Ktcnf<=VkqUf1b>MCtXC8lN6DyVSCw=*cDzjO}A4Ey`b)bOH-N9
zw-Xp!gH#Z;Vt6Pt38?^H2uq_2_E;912~re=g9e#$p1i1vkqAT!52f&;BB<uNT?N1*
zAL3bWmrb94gUcN&r5)p$gr*~cR8~Bc+PHL$hL?Gn^xA}a4WXoq>@*hrX@{mPf^5*Q
z1d^Qy?ROChXSqU^^IBy@o7o7IjjTKTFzq*FC|tmZMeCkzG|a~Lr4AWODvi?URJe!}
xnivbL?r(H8!|naRU+5mNJ-sOiKjaDM7J&eBf~hT{Xn%A67ji{7P>>)TH`<K~@XP=J


From 13f6cd316b8a589ff471459cccac80d764f45503 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sun, 1 Mar 2026 09:44:23 -0500
Subject: [PATCH 07/67] export CHIRPS-precip-remote function

---
 modules/data.land/NAMESPACE         | 1 +
 modules/data.land/R/CHIRPS-precip.R | 1 +
 2 files changed, 2 insertions(+)

diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index e90edad0df3..03a2a764f05 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -26,6 +26,7 @@ export(extract.stringCode)
 export(extract_FIA)
 export(extract_NEON_veg)
 export(extract_SM_CDS)
+export(extract_chirps_remote)
 export(extract_cimis_date)
 export(extract_cimis_dates)
 export(extract_soil_gssurgo)
diff --git a/modules/data.land/R/CHIRPS-precip.R b/modules/data.land/R/CHIRPS-precip.R
index c451c195f52..3c27d4fad5b 100644
--- a/modules/data.land/R/CHIRPS-precip.R
+++ b/modules/data.land/R/CHIRPS-precip.R
@@ -17,6 +17,7 @@
 #' pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
 #' result <- extract_chirps_remote(pts, as.Date(c("2020-06-15", "2021-06-15")))
 #' }
+#' @export
 extract_chirps_remote <- function(design_points, dates) {
   CHIRPS_REMOTE_ROOT <- "https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_daily/netcdf/p05"
 

From b5a587560d24aba5096b00d61310d90b4970de47 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sun, 1 Mar 2026 10:24:58 -0500
Subject: [PATCH 08/67] fix length mismatch from variable WHC

---
 modules/data.land/R/water_balance.R           | 34 +++++++++++++++----
 .../data.land/vignettes/CIMIS-event-files.qmd |  1 +
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 688c3cdc63d..917715df8c9 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -35,8 +35,28 @@ calc_water_balance <- function(
   # nolint start: object_name_linter
   n <- length(et)
 
+  if (!length(precip) == n) {
+    PEcAn.logger::logger.severe(
+      "Precip and ET have different lengths. ",
+      "length(precip) = ", length(precip), "  ",
+      "length(et) = ", n
+    )
+  }
+
+  if (!length(whc) == 1) {
+    PEcAn.logger::logger.severe(
+      "whc must have length 1; actual length = ", length(whc)
+    )
+  }
+
+  if (!length(whc_min_frac) == 1) {
+    PEcAn.logger::logger.severe(
+      "whc_min_frac must have length 1; actual length = ", length(whc_min_frac)
+    )
+  }
+
   if (is_rice && is.null(seepage_rate)) {
-    stop("Seepage rate must be defined for rice fields")
+    PEcAn.logger::logger.severe("Seepage rate must be defined for rice fields")
   }
 
   # Field capacity is the upper management target (= WHC under the convention
@@ -107,8 +127,7 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
   }
 
   if ("whc_min_frac" %in% colnames(df)) {
-    whc_min_frac <- df$whc_min_frac
-    n_na <- sum(is.na(whc_min_frac))
+    n_na <- sum(is.na(df$whc_min_frac))
     if (n_na > 0) {
       PEcAn.logger::logger.warn(
         sprintf(
@@ -117,14 +136,13 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
           default_whc_min_frac
         )
       )
-      whc_min_frac <- tidyr::replace_na(whc_min_frac, default_whc_min_frac)
     }
   } else {
     PEcAn.logger::logger.warn(
       "whc_min_frac column not found in input data. Using default = ",
       default_whc_min_frac
     )
-    whc_min_frac <- default_whc_min_frac
+    df[["whc_min_frac"]] <- default_whc_min_frac
   }
 
   df |>
@@ -133,11 +151,15 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
       year = as.integer(format(.data$date, "%Y")),
       week = as.integer(format(.data$date, "%U")),
       day_of_year = as.integer(format(.data$date, "%j")),
+      whc_min_frac = tidyr::replace_na(.data$whc_min_frac, default_whc_min_frac),
       results = tibble::as_tibble(calc_water_balance(
         et = .data$etc_mm_day,
         precip = .data$precip_mm_day,
         whc = whc_mm,
-        whc_min_frac = whc_min_frac
+        # NOTE: Use unique here because, in a merged crop data frame, this gets
+        # expanded to a vector of values. They *should* all be unique per
+        # `idcol`; if they're not, `calc_water_balance` should fail loudly.
+        whc_min_frac = unique(.data$whc_min_frac)
       )),
       .by = dplyr::all_of(idcol)
     ) |>
diff --git a/modules/data.land/vignettes/CIMIS-event-files.qmd b/modules/data.land/vignettes/CIMIS-event-files.qmd
index 23e4cbdac6c..d04edbd2ff9 100644
--- a/modules/data.land/vignettes/CIMIS-event-files.qmd
+++ b/modules/data.land/vignettes/CIMIS-event-files.qmd
@@ -199,6 +199,7 @@ resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
     dplyr::group_by(!!id_sym, !!date_sym) |>
     dplyr::summarize(
       etc_mm_day = mean(.data$etc_mm_day, na.rm = TRUE),
+      whc_min_frac = mean(.data$whc_min_frac, na.rm = TRUE),
       .groups = "drop"
     )
 }

From 2b1b94f6d709065d59703ee2ca259e01951fdbbf Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sun, 1 Mar 2026 10:35:37 -0500
Subject: [PATCH 09/67] add more customizability to `calc_water_balance`

Can now optionally set field_capacity, W_initial, and w_min
---
 modules/data.land/R/water_balance.R         | 36 +++++++++++++++------
 modules/data.land/man/calc_water_balance.Rd | 19 +++++++++--
 2 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 917715df8c9..06d260ab9ce 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -15,12 +15,19 @@
 #'
 #' @param et Vector of evapotranspiration values (distance / time)
 #' @param precip Vector of precipitation values (distance / time)
-#' @param whc Water holding capacity (distance); interpreted as the plant-
+#' @param whc Water holding capacity (WHC) (distance); interpreted as the plant-
 #'   available range from wilting point to field capacity
 #' @param whc_min_frac Fraction of WHC for minimum water level (irrigation
-#'   trigger)
+#'   trigger); unused if `w_min` is explicitly specified
+#' @param field_capacity Maximum water holding capacity at field capacity
+#'   (distance); defaults to `whc` if NULL
+#' @param W_initial Initial soil water content at start of time series
+#'   (distance); defaults to `field_capacity` if NULL
+#' @param w_min Minimum water level threshold (distance); irrigation is
+#'   triggered when soil water falls below this level; defaults to
+#'   `whc_min_frac * whc` if NULL
 #' @param seepage_rate Daily seepage loss for rice paddies (distance / time);
-#'   only used when is_rice = TRUE
+#'   only used when `is_rice = TRUE`
 #' @param is_rice Logical; if TRUE, applies a constant seepage loss (mm/day)
 #' @return List with vectors: W_t (soil water), irr (irrigation), runoff
 #' @export
@@ -29,6 +36,9 @@ calc_water_balance <- function(
   precip,
   whc,
   whc_min_frac,
+  field_capacity = NULL,
+  W_initial = NULL, #nolint: object_name_linter
+  w_min = NULL,
   seepage_rate = NULL,
   is_rice = FALSE
 ) {
@@ -59,18 +69,24 @@ calc_water_balance <- function(
     PEcAn.logger::logger.severe("Seepage rate must be defined for rice fields")
   }
 
-  # Field capacity is the upper management target (= WHC under the convention
-  # that WHC spans wilting point to field capacity)
-  field_capacity <- whc
-  w_min <- whc_min_frac * whc
+  if (is.null(field_capacity)) {
+    field_capacity <- whc
+  }
+  if (is.null(w_min)) {
+    w_min <- whc_min_frac * whc
+  }
+
+  if (is.null(W_initial)) {
+    # Initialize at field capacity
+    W_prev <- field_capacity
+  } else {
+    W_prev <- W_initial
+  }
 
   W_t <- numeric(n)
   irr <- numeric(n)
   runoff <- numeric(n)
 
-  # Initialize at field capacity
-  W_prev <- field_capacity
-
   for (t in seq_len(n)) {
     # Only water above w_min is available for seepage
     seepage <- if (is_rice) min(seepage_rate, max(0, W_prev - w_min)) else 0.0
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index a7c459d8b06..602688d7975 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -9,6 +9,9 @@ calc_water_balance(
   precip,
   whc,
   whc_min_frac,
+  field_capacity = NULL,
+  W_initial = NULL,
+  w_min = NULL,
   seepage_rate = NULL,
   is_rice = FALSE
 )
@@ -18,14 +21,24 @@ calc_water_balance(
 
 \item{precip}{Vector of precipitation values (distance / time)}
 
-\item{whc}{Water holding capacity (distance); interpreted as the plant-
+\item{whc}{Water holding capacity (WHC) (distance); interpreted as the plant-
 available range from wilting point to field capacity}
 
 \item{whc_min_frac}{Fraction of WHC for minimum water level (irrigation
-trigger)}
+trigger); unused if `w_min` is explicitly specified}
+
+\item{field_capacity}{Maximum water holding capacity at field capacity
+(distance); defaults to `whc` if NULL}
+
+\item{W_initial}{Initial soil water content at start of time series
+(distance); defaults to `field_capacity` if NULL}
+
+\item{w_min}{Minimum water level threshold (distance); irrigation is
+triggered when soil water falls below this level; defaults to
+`whc_min_frac * whc` if NULL}
 
 \item{seepage_rate}{Daily seepage loss for rice paddies (distance / time);
-only used when is_rice = TRUE}
+only used when `is_rice = TRUE`}
 
 \item{is_rice}{Logical; if TRUE, applies a constant seepage loss (mm/day)}
 }

From bc3633f1a99ff7d6fbebe5f3c3de41c72a753107 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sun, 1 Mar 2026 11:14:03 -0500
Subject: [PATCH 10/67] clarify WHC vs. field capacity in water_balance

---
 modules/data.land/R/water_balance.R         | 47 +++++++++++++--------
 modules/data.land/man/calc_water_balance.Rd | 28 +++++++++---
 2 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 06d260ab9ce..035ecc12558 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -6,6 +6,11 @@
 #' quantities are arbitrary, but they should be consistent (distance / time;
 #' e.g., most commonly, mm/day).
 #'
+#' This function operates in *relative* WHC space, where:
+#' - `w = 0` represents wilting point (no plant-available water)
+#' - `w = whc` represents field capacity (maximum plant-available water)
+#' - `whc = (field_capacity - wilting_point) * rooting_depth` (the plant-available range)
+#'
 #' The handling of rice here is crude and primitive: setting `whc_min_frac` =
 #' 1.0 (as set in `crop_whc` for rice) means a near-constant need for
 #' irrigation to balance ET + seepage, which roughly mimics the behavior of
@@ -15,14 +20,12 @@
 #'
 #' @param et Vector of evapotranspiration values (distance / time)
 #' @param precip Vector of precipitation values (distance / time)
-#' @param whc Water holding capacity (WHC) (distance); interpreted as the plant-
-#'   available range from wilting point to field capacity
+#' @param whc Water holding capacity (distance); the plant-available range from
+#'   wilting point to field capacity (i.e., `whc = field_capacity - wilting_point`)
 #' @param whc_min_frac Fraction of WHC for minimum water level (irrigation
 #'   trigger); unused if `w_min` is explicitly specified
-#' @param field_capacity Maximum water holding capacity at field capacity
-#'   (distance); defaults to `whc` if NULL
 #' @param W_initial Initial soil water content at start of time series
-#'   (distance); defaults to `field_capacity` if NULL
+#'   (distance); defaults to `whc` (field capacity) if NULL
 #' @param w_min Minimum water level threshold (distance); irrigation is
 #'   triggered when soil water falls below this level; defaults to
 #'   `whc_min_frac * whc` if NULL
@@ -30,13 +33,24 @@
 #'   only used when `is_rice = TRUE`
 #' @param is_rice Logical; if TRUE, applies a constant seepage loss (mm/day)
 #' @return List with vectors: W_t (soil water), irr (irrigation), runoff
+#' @examples
+#' # Calculate WHC from field capacity, wilting point, and rooting depth
+#' field_capacity <- 0.30  # volumetric (m3/m3)
+#' wilting_point <- 0.10   # volumetric (m3/m3)
+#' rooting_depth <- 1000   # mm
+#' whc <- (field_capacity - wilting_point) * rooting_depth  # mm
+#'
+#' # Run water balance with 5 days of ET and precip data
+#' et <- c(4, 5, 6, 4, 3)  # mm/day
+#' precip <- c(0, 0, 10, 0, 0)  # mm/day
+#' result <- calc_water_balance(et, precip, whc = whc, whc_min_frac = 0.5)
+#' str(result)
 #' @export
 calc_water_balance <- function(
   et,
   precip,
   whc,
   whc_min_frac,
-  field_capacity = NULL,
   W_initial = NULL, #nolint: object_name_linter
   w_min = NULL,
   seepage_rate = NULL,
@@ -69,16 +83,13 @@ calc_water_balance <- function(
     PEcAn.logger::logger.severe("Seepage rate must be defined for rice fields")
   }
 
-  if (is.null(field_capacity)) {
-    field_capacity <- whc
-  }
   if (is.null(w_min)) {
     w_min <- whc_min_frac * whc
   }
 
   if (is.null(W_initial)) {
-    # Initialize at field capacity
-    W_prev <- field_capacity
+    # Initialize at field capacity (i.e., full WHC)
+    W_prev <- whc
   } else {
     W_prev <- W_initial
   }
@@ -95,18 +106,18 @@ calc_water_balance <- function(
     W0 <- W_prev + precip[t] - et[t] - seepage
 
     # If W0 falls below w_min (e.g., high ET and seepage; low precip), irrigate
-    # to field capacity.
+    # to field capacity (i.e., full WHC).
     if (W0 < w_min) {
-      irr[t] <- field_capacity - W0
-      W0 <- field_capacity
+      irr[t] <- whc - W0
+      W0 <- whc
     } else {
       irr[t] <- 0
     }
 
-    # If W0 exceeds field capacity, the difference is runoff.
-    if (W0 > field_capacity) {
-      runoff[t] <- W0 - field_capacity
-      W_t[t] <- field_capacity
+    # If W0 exceeds field capacity (i.e., whc), the difference is runoff.
+    if (W0 > whc) {
+      runoff[t] <- W0 - whc
+      W_t[t] <- whc
     } else {
       runoff[t] <- 0
       W_t[t] <- max(W0, w_min)
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index 602688d7975..3a44b20d259 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -9,7 +9,6 @@ calc_water_balance(
   precip,
   whc,
   whc_min_frac,
-  field_capacity = NULL,
   W_initial = NULL,
   w_min = NULL,
   seepage_rate = NULL,
@@ -21,17 +20,14 @@ calc_water_balance(
 
 \item{precip}{Vector of precipitation values (distance / time)}
 
-\item{whc}{Water holding capacity (WHC) (distance); interpreted as the plant-
-available range from wilting point to field capacity}
+\item{whc}{Water holding capacity (distance); the plant-available range from
+wilting point to field capacity (i.e., `whc = field_capacity - wilting_point`)}
 
 \item{whc_min_frac}{Fraction of WHC for minimum water level (irrigation
 trigger); unused if `w_min` is explicitly specified}
 
-\item{field_capacity}{Maximum water holding capacity at field capacity
-(distance); defaults to `whc` if NULL}
-
 \item{W_initial}{Initial soil water content at start of time series
-(distance); defaults to `field_capacity` if NULL}
+(distance); defaults to `whc` (field capacity) if NULL}
 
 \item{w_min}{Minimum water level threshold (distance); irrigation is
 triggered when soil water falls below this level; defaults to
@@ -53,6 +49,11 @@ quantities are arbitrary, but they should be consistent (distance / time;
 e.g., most commonly, mm/day).
 }
 \details{
+This function operates in *relative* WHC space, where:
+- `w = 0` represents wilting point (no plant-available water)
+- `w = whc` represents field capacity (maximum plant-available water)
+- `whc = (field_capacity - wilting_point) * rooting_depth` (the plant-available range)
+
 The handling of rice here is crude and primitive: setting `whc_min_frac` =
 1.0 (as set in `crop_whc` for rice) means a near-constant need for
 irrigation to balance ET + seepage, which roughly mimics the behavior of
@@ -60,3 +61,16 @@ maintaining a standing flood. However, proper treatment of rice requires
 maintaining a field *above* field capacity, and has other complications.
 These will be implemented in the future.
 }
+\examples{
+# Calculate WHC from field capacity, wilting point, and rooting depth
+field_capacity <- 0.30  # volumetric (m3/m3)
+wilting_point <- 0.10   # volumetric (m3/m3)
+rooting_depth <- 1000   # mm
+whc <- (field_capacity - wilting_point) * rooting_depth  # mm
+
+# Run water balance with 5 days of ET and precip data
+et <- c(4, 5, 6, 4, 3)  # mm/day
+precip <- c(0, 0, 10, 0, 0)  # mm/day
+result <- calc_water_balance(et, precip, whc = whc, whc_min_frac = 0.5)
+str(result)
+}

From bd2339d9a20522823c5d8c46d6cc97907e8989cd Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 11:01:23 -0500
Subject: [PATCH 11/67] draft SSURGO get mukey function

---
 modules/data.land/R/gSSURGO_Query.R           | 220 ++++++++++++++++++
 .../data.land/tests/testthat/test-ssurgo.R    | 161 +++++++++++++
 2 files changed, 381 insertions(+)
 create mode 100644 modules/data.land/tests/testthat/test-ssurgo.R

diff --git a/modules/data.land/R/gSSURGO_Query.R b/modules/data.land/R/gSSURGO_Query.R
index d90d1e02c0f..05c36969ad6 100644
--- a/modules/data.land/R/gSSURGO_Query.R
+++ b/modules/data.land/R/gSSURGO_Query.R
@@ -138,4 +138,224 @@ gSSURGO.Query <- function(mukeys,
   
 }
 
+#' Get map unit keys (mukeys) from gSSURGO using spatial filters
+#'
+#' Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
+#' based on spatial filters: bounding box, polygon, or point with distance.
+#'
+#' @param bbox Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).
+#'   Features that intersect the bounding box are returned.
+#' @param polygon Polygon coordinates in WGS84. Can be:
+#'   - An `sf` object with a single polygon geometry
+#'   - A numeric matrix with columns x (lon) and y (lat), where the first and
+#'     last points are identical (closed ring)
+#'   Features that intersect the polygon are returned.
+#' @param point Numeric vector of length 2: c(lon, lat) in WGS84.
+#'   Must be used with `distance`.
+#' @param distance Numeric. Distance in meters from the point.
+#'   Must be used with `point`. Use 0 for exact point intersection.
+#'
+#' @return Character vector of unique map unit keys (mukeys).
+#'
+#' @details
+#' This function uses the NRCS SDM Data Access Web Feature Service:
+#' \url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
+#'
+#' The total extent of any spatial filter cannot exceed 10,100,000,000 square
+#' meters (~3,900 square miles).
+#'
+#' @examples
+#' \dontrun{
+#' # Bounding box query
+#' mukeys <- ssurgo_mukeys(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
+#'
+#' # Point with distance (600m radius)
+#' mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 600)
+#'
+#' # Point with zero distance (exact intersection)
+#' mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 0)
+#'
+#' # Polygon as matrix
+#' poly <- rbind(
+#'   c(-88.0865046533, 37.5555143852),
+#'   c(-88.0860204771, 37.5600435404),
+#'   c(-88.0782858287, 37.5595392364),
+#'   c(-88.0787704736, 37.5550101113),
+#'   c(-88.0865046533, 37.5555143852)
+#' )
+#' mukeys <- ssurgo_mukeys(polygon = poly)
+#'
+#' # Polygon as sf object
+#' poly_sf <- sf::st_polygon(list(poly))
+#' mukeys <- ssurgo_mukeys(polygon = poly_sf)
+#' }
+#' @export
+ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance = NULL) {
+  n_provided <- sum(c(!is.null(bbox), !is.null(polygon), !is.null(point)))
+
+  if (n_provided == 0) {
+    stop("Must provide one of: bbox, polygon, or point")
+  }
+
+  if (n_provided > 1) {
+    stop("Only one of bbox, polygon, or point may be provided")
+  }
+
+  if (!is.null(point)) {
+    if (length(point) != 2) {
+      stop("point must be a numeric vector of length 2: c(lon, lat)")
+    }
+    if (is.null(distance)) {
+      stop("distance is required when point is provided")
+    }
+    if (!is.numeric(distance) || distance < 0) {
+      stop("distance must be a non-negative numeric value")
+    }
+  }
+
+  if (!is.null(distance) && is.null(point)) {
+    stop("distance requires point to be provided")
+  }
+
+  filter_xml <- if (!is.null(bbox)) {
+    if (!is.numeric(bbox) || length(bbox) != 4) {
+      stop("bbox must be a numeric vector of length 4: c(xmin, ymin, xmax, ymax)")
+    }
+    xmin <- bbox[1]
+    ymin <- bbox[2]
+    xmax <- bbox[3]
+    ymax <- bbox[4]
+
+    if (xmin >= xmax || ymin >= ymax) {
+      stop("bbox must have xmin < xmax and ymin < ymax")
+    }
 
+    paste0(
+      "<Filter>",
+      "<BBOX>",
+      "<PropertyName>Geometry</PropertyName>",
+      "<Box srsName='EPSG:4326'>",
+      "<coordinates>", xmin, ",", ymin, " ", xmax, ",", ymax, "</coordinates>",
+      "</Box>",
+      "</BBOX>",
+      "</Filter>"
+    )
+  } else if (!is.null(polygon)) {
+    coords <- if (inherits(polygon, "sfc")) {
+      if (length(polygon) != 1) {
+        stop("polygon (sfc) must contain exactly one geometry")
+      }
+      geom <- polygon[[1]]
+      if (inherits(geom, "POLYGON")) {
+        as.vector(t(geom))
+      } else {
+        stop("sfc object must contain a POLYGON geometry")
+      }
+    } else if (inherits(polygon, "sfg")) {
+      if (inherits(polygon, "POLYGON")) {
+        as.vector(t(polygon))
+      } else {
+        stop("sfg object must be a POLYGON")
+      }
+    } else if (inherits(polygon, "sf")) {
+      if (nrow(polygon) != 1) {
+        stop("polygon (sf) must contain exactly one feature")
+      }
+      geom <- sf::st_geometry(polygon)[[1]]
+      if (inherits(geom, "POLYGON")) {
+        as.vector(t(geom))
+      } else {
+        stop("sf object must contain a POLYGON geometry")
+      }
+    } else if (is.matrix(polygon) || is.data.frame(polygon)) {
+      if (ncol(polygon) != 2) {
+        stop("polygon matrix must have 2 columns: x (lon) and y (lat)")
+      }
+      as.vector(t(as.matrix(polygon)))
+    } else {
+      stop("polygon must be an sf/sfc object or a matrix/data.frame with coordinates")
+    }
+
+    coords_str <- paste(coords, collapse = " ")
+
+    paste0(
+      "<Filter>",
+      "<Intersect>",
+      "<PropertyName>Geometry</PropertyName>",
+      "<gml:Polygon>",
+      "<gml:outerBoundaryIs>",
+      "<gml:LinearRing>",
+      "<gml:coordinates>", coords_str, "</gml:coordinates>",
+      "</gml:LinearRing>",
+      "</gml:outerBoundaryIs>",
+      "</gml:Polygon>",
+      "</Intersect>",
+      "</Filter>"
+    )
+  } else if (!is.null(point)) {
+    lon <- point[1]
+    lat <- point[2]
+
+    paste0(
+      "<Filter>",
+      "<DWithin>",
+      "<PropertyName>Geometry</PropertyName>",
+      "<gml:Point>",
+      "<gml:coordinates>", lon, ",", lat, "</gml:coordinates>",
+      "</gml:Point>",
+      "<Distance units=\"m\">", distance, "</Distance>",
+      "</DWithin>",
+      "</Filter>"
+    )
+  }
+
+  base_url <- "https://sdmdataaccess.nrcs.usda.gov/Spatial/SDMWGS84Geographic.wfs"
+
+  if (!is.null(bbox)) {
+    query <- list(
+      SERVICE = "WFS",
+      VERSION = "1.1.0",
+      REQUEST = "GetFeature",
+      TYPENAME = "MapunitPoly",
+      BBOX = paste(bbox, collapse = ","),
+      OUTPUTFORMAT = "XMLMukeyList"
+    )
+    resp <- httr2::request(base_url) |>
+      httr2::req_url_query(!!!query) |>
+      httr2::req_perform()
+  } else {
+    query <- list(
+      SERVICE = "WFS",
+      VERSION = "1.1.0",
+      REQUEST = "GetFeature",
+      TYPENAME = "MapunitPoly",
+      FILTER = filter_xml,
+      OUTPUTFORMAT = "XMLMukeyList"
+    )
+    resp <- httr2::request(base_url) |>
+      httr2::req_url_query(!!!query) |>
+      httr2::req_perform()
+  }
+
+  httr2::resp_check_status(resp)
+
+  resp_text <- httr2::resp_body_string(resp)
+
+  resp_xml <- XML::xmlParse(resp_text)
+
+  mukey_nodes <- XML::getNodeSet(resp_xml, "//MapUnitKeyList")
+
+  if (length(mukey_nodes) == 0) {
+    return(character(0))
+  }
+
+  mukey_str <- XML::xmlValue(mukey_nodes[[1]])
+
+  if (is.null(mukey_str) || nchar(trimws(mukey_str)) == 0) {
+    return(character(0))
+  }
+
+  mukeys <- unique(strsplit(trimws(mukey_str), ",")[[1]])
+
+  mukeys
+}
diff --git a/modules/data.land/tests/testthat/test-ssurgo.R b/modules/data.land/tests/testthat/test-ssurgo.R
new file mode 100644
index 00000000000..27f4a24643b
--- /dev/null
+++ b/modules/data.land/tests/testthat/test-ssurgo.R
@@ -0,0 +1,161 @@
+context("ssurgo_mukeys")
+
+test_that("ssurgo_mukeys requires exactly one spatial filter", {
+  expect_error(ssurgo_mukeys(), "Must provide one of")
+  expect_error(ssurgo_mukeys(bbox = c(1, 2, 3, 4), point = c(1, 2)), "Only one of")
+  expect_error(ssurgo_mukeys(polygon = matrix(1:10, ncol = 2), point = c(1, 2)), "Only one of")
+})
+
+test_that("ssurgo_mukeys validates bbox input", {
+  expect_error(ssurgo_mukeys(bbox = "not numeric"), "numeric vector of length 4")
+  expect_error(ssurgo_mukeys(bbox = c(1, 2)), "numeric vector of length 4")
+  expect_error(ssurgo_mukeys(bbox = c(3, 2, 1, 4)), "xmin < xmax")
+  expect_error(ssurgo_mukeys(bbox = c(1, 4, 3, 2)), "ymin < ymax")
+})
+
+test_that("ssurgo_mukeys validates point and distance", {
+  expect_error(ssurgo_mukeys(point = c(1, 2, 3)), "length 2")
+  expect_error(ssurgo_mukeys(point = c(1, 2)), "distance is required")
+  expect_error(ssurgo_mukeys(distance = 100), "Must provide one of")
+  expect_error(ssurgo_mukeys(point = c(1, 2), distance = -10), "non-negative")
+  expect_error(ssurgo_mukeys(point = c(1, 2), distance = "100"), "non-negative")
+})
+
+test_that("ssurgo_mukeys validates polygon input", {
+  expect_error(ssurgo_mukeys(polygon = 1:5), "sf/sfc object or a matrix")
+  expect_error(ssurgo_mukeys(polygon = matrix(1:6, ncol = 3)), "2 columns")
+})
+
+test_that("ssurgo_mukeys bbox returns mukeys for valid location", {
+  skip_on_cran()
+  skip_on_ci()
+
+  mukeys <- ssurgo_mukeys(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
+
+  expect_type(mukeys, "character")
+  expect_gt(length(mukeys), 0)
+})
+
+test_that("ssurgo_mukeys point with distance returns mukeys", {
+  skip_on_cran()
+  skip_on_ci()
+
+  mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 600)
+
+  expect_type(mukeys, "character")
+  expect_gt(length(mukeys), 0)
+})
+
+test_that("ssurgo_mukeys point with zero distance returns mukeys", {
+  skip_on_cran()
+  skip_on_ci()
+
+  mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 0)
+
+  expect_type(mukeys, "character")
+})
+
+test_that("ssurgo_mukeys polygon as matrix returns mukeys", {
+  skip_on_cran()
+  skip_on_ci()
+
+  poly <- rbind(
+    c(-88.0865046533, 37.5555143852),
+    c(-88.0860204771, 37.5600435404),
+    c(-88.0782858287, 37.5595392364),
+    c(-88.0787704736, 37.5550101113),
+    c(-88.0865046533, 37.5555143852)
+  )
+
+  mukeys <- tryCatch(
+    ssurgo_mukeys(polygon = poly),
+    error = function(e) {
+      skip(paste("API error:", e$message))
+    }
+  )
+
+  expect_type(mukeys, "character")
+  expect_gt(length(mukeys), 0)
+})
+
+test_that("ssurgo_mukeys polygon as sf returns mukeys", {
+  skip_on_cran()
+  skip_on_ci()
+  skip_if_not_installed("sf")
+  library(sf)
+
+  poly <- rbind(
+    c(-88.0865046533, 37.5555143852),
+    c(-88.0860204771, 37.5600435404),
+    c(-88.0782858287, 37.5595392364),
+    c(-88.0787704736, 37.5550101113),
+    c(-88.0865046533, 37.5555143852)
+  )
+  poly_sf <- sf::st_polygon(list(poly))
+
+  mukeys <- tryCatch(
+    ssurgo_mukeys(polygon = poly_sf),
+    error = function(e) {
+      skip(paste("API error:", e$message))
+    }
+  )
+
+  expect_type(mukeys, "character")
+  expect_gt(length(mukeys), 0)
+})
+
+test_that("ssurgo_mukeys returns unique mukeys", {
+  skip_on_cran()
+  skip_on_ci()
+
+  mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 600)
+
+  expect_equal(length(mukeys), length(unique(mukeys)))
+})
+
+test_that("ssurgo_mukeys handles area with no soil data gracefully", {
+  skip_on_cran()
+  skip_on_ci()
+
+  mukeys <- ssurgo_mukeys(bbox = c(0, 0, 0.001, 0.001))
+
+  expect_type(mukeys, "character")
+  expect_equal(length(mukeys), 0)
+})
+
+test_that("ssurgo_mukeys bbox and point return consistent results for same area", {
+  skip_on_cran()
+  skip_on_ci()
+
+  center_lon <- -91.22
+  center_lat <- 38.46
+  distance <- 600
+
+  bbox_mukeys <- ssurgo_mukeys(
+    bbox = c(
+      center_lon - 0.01,
+      center_lat - 0.01,
+      center_lon + 0.01,
+      center_lat + 0.01
+    )
+  )
+  point_mukeys <- ssurgo_mukeys(
+    point = c(center_lon, center_lat),
+    distance = distance
+  )
+
+  expect_type(bbox_mukeys, "character")
+  expect_type(point_mukeys, "character")
+  expect_gt(length(bbox_mukeys), length(point_mukeys))
+})
+
+test_that("real bounding boxes for CA", {
+  # devtools::load_all("modules/data.land")
+  bbox_01 <- c(-123.569131, 39.638344, -121.234281, 41.461763)
+  bbox_02 <- c(-124.064177, 38.994921, -120.102514, 42.088592)
+
+  expect_error(ssurgo_mukeys(bbox = bbox_01))
+  expect_error(ssurgo_mukeys(bbox = bbox_02))
+  expect_no_error(ssurgo_mukeys_bigbbox(bbox_01))
+  expect_no_error(ssurgo_mukeys_bigbbox(bbox_02))
+})

From 35d7e40ecc8793f42d39ce56425939bdb2164de1 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 11:17:36 -0500
Subject: [PATCH 12/67] ssurgo mukey function for big bboxes

---
 modules/data.land/R/gSSURGO_Query.R           | 155 +++++++++++++++++-
 .../data.land/tests/testthat/test-ssurgo.R    |   1 -
 2 files changed, 154 insertions(+), 2 deletions(-)

diff --git a/modules/data.land/R/gSSURGO_Query.R b/modules/data.land/R/gSSURGO_Query.R
index 05c36969ad6..ce1bc9733cd 100644
--- a/modules/data.land/R/gSSURGO_Query.R
+++ b/modules/data.land/R/gSSURGO_Query.R
@@ -217,6 +217,8 @@ ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance =
     stop("distance requires point to be provided")
   }
 
+  wgs84_crs <- sf::st_crs(4326)
+
   filter_xml <- if (!is.null(bbox)) {
     if (!is.numeric(bbox) || length(bbox) != 4) {
       stop("bbox must be a numeric vector of length 4: c(xmin, ymin, xmax, ymax)")
@@ -230,6 +232,26 @@ ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance =
       stop("bbox must have xmin < xmax and ymin < ymax")
     }
 
+    MAX_AREA <- 10100000000
+    albers_crs <- sf::st_crs(5070)
+
+    bbox_poly <- sf::st_polygon(list(
+      matrix(c(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin), ncol = 2, byrow = TRUE)
+    ))
+    bbox_sf <- sf::st_sfc(bbox_poly, crs = wgs84_crs)
+    bbox_albers <- sf::st_transform(bbox_sf, albers_crs)
+    area <- as.numeric(sf::st_area(bbox_albers))
+
+    if (area > MAX_AREA) {
+      stop(
+        paste0(
+          "Bounding box area (", format(area, scientific = FALSE),
+          " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
+          " m²). Use ssurgo_mukeys_bigbbox() for large bounding boxes."
+        )
+      )
+    }
+
     paste0(
       "<Filter>",
       "<BBOX>",
@@ -271,11 +293,45 @@ ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance =
       if (ncol(polygon) != 2) {
         stop("polygon matrix must have 2 columns: x (lon) and y (lat)")
       }
-      as.vector(t(as.matrix(polygon)))
+      if (nrow(polygon) < 4) {
+        stop("polygon matrix must have at least 4 rows (3 unique vertices + closing point)")
+      }
+      poly_matrix <- as.matrix(polygon)
+      if (!identical(poly_matrix[1, ], poly_matrix[nrow(poly_matrix), ])) {
+        stop("polygon matrix first and last points must be identical (closed ring)")
+      }
+      as.vector(t(poly_matrix))
     } else {
       stop("polygon must be an sf/sfc object or a matrix/data.frame with coordinates")
     }
 
+    poly_sf <- if (inherits(polygon, "sfc")) {
+      polygon
+    } else if (inherits(polygon, "sfg")) {
+      sf::st_sfc(polygon, crs = wgs84_crs)
+    } else if (inherits(polygon, "sf")) {
+      sf::st_geometry(polygon)
+    } else {
+      poly_matrix <- as.matrix(polygon)
+      poly_obj <- sf::st_polygon(list(poly_matrix))
+      sf::st_sfc(poly_obj, crs = wgs84_crs)
+    }
+
+    MAX_AREA <- 10100000000
+    albers_crs <- sf::st_crs(5070)
+    poly_albers <- sf::st_transform(poly_sf, albers_crs)
+    area <- as.numeric(sf::st_area(poly_albers))
+
+    if (area > MAX_AREA) {
+      stop(
+        paste0(
+          "Polygon area (", format(area, scientific = FALSE),
+          " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
+          " m²)."
+        )
+      )
+    }
+
     coords_str <- paste(coords, collapse = " ")
 
     paste0(
@@ -296,6 +352,18 @@ ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance =
     lon <- point[1]
     lat <- point[2]
 
+    MAX_AREA <- 10100000000
+    circle_area <- pi * (distance^2)
+    if (circle_area > MAX_AREA) {
+      stop(
+        paste0(
+          "Search radius area (", format(circle_area, scientific = FALSE),
+          " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
+          " m²)."
+        )
+      )
+    }
+
     paste0(
       "<Filter>",
       "<DWithin>",
@@ -359,3 +427,88 @@ ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance =
 
   mukeys
 }
+
+#' Get map unit keys (mukeys) from gSSURGO for large bounding boxes
+#'
+#' Queries the NRCS gSSURGO Web Feature Service for large bounding boxes
+#' by dividing the area into smaller cells that comply with the API's
+#' 10,100,000,000 m² extent limit.
+#'
+#' @param bbox Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).
+#' @param ... Additional arguments passed to \code{ssurgo_mukeys()}.
+#'   Currently supports \code{distance} for point queries (ignored for bbox queries).
+#'
+#' @return Character vector of unique map unit keys (mukeys).
+#'
+#' @details
+#' This function divides a large bounding box into smaller cells,
+#' each with an area less than 10,100,000,000 square meters,
+#' then queries each cell individually and combines the results.
+#'
+#' The grid is created using Albers Equal Area projection (EPSG:5070)
+#' to ensure accurate area calculations, then transformed back to
+#' WGS84 (EPSG:4326) for the API query.
+#'
+#' @examples
+#' \dontrun{
+#' # Large bounding box covering a significant area
+#' mukeys <- ssurgo_mukeys_bigbbox(bbox = c(-120, 35, -110, 45))
+#' }
+#' @export
+ssurgo_mukeys_bigbbox <- function(bbox, ...) {
+  if (!is.numeric(bbox) || length(bbox) != 4) {
+    stop("bbox must be a numeric vector of length 4: c(xmin, ymin, xmax, ymax)")
+  }
+
+  xmin <- bbox[1]
+  ymin <- bbox[2]
+  xmax <- bbox[3]
+  ymax <- bbox[4]
+
+  if (xmin >= xmax || ymin >= ymax) {
+    stop("bbox must have xmin < xmax and ymin < ymax")
+  }
+
+  MAX_AREA <- 10100000000
+  wgs84_crs <- sf::st_crs(4326)
+
+  bbox_poly <- sf::st_polygon(list(
+    matrix(c(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin), ncol = 2, byrow = TRUE)
+  ))
+  bbox_sf <- sf::st_sfc(bbox_poly, crs = wgs84_crs)
+
+  bbox_area <- as.numeric(sf::st_area(bbox_sf))
+
+  bbox_wgs84_box <- sf::st_bbox(bbox_sf)
+  width_deg <- bbox_wgs84_box["xmax"] - bbox_wgs84_box["xmin"]
+  height_deg <- bbox_wgs84_box["ymax"] - bbox_wgs84_box["ymin"]
+
+  aspect_ratio <- width_deg / height_deg
+
+  n_cells <- ceiling(bbox_area / MAX_AREA)
+  cells_per_side <- sqrt(n_cells)
+
+  ncol_cells <- ceiling(cells_per_side * sqrt(aspect_ratio))
+  nrow_cells <- ceiling(cells_per_side / sqrt(aspect_ratio))
+
+  if (ncol_cells < 1) ncol_cells <- 1
+  if (nrow_cells < 1) nrow_cells <- 1
+
+  grid_wgs84 <- sf::st_make_grid(
+    bbox_sf,
+    n = c(ncol_cells, nrow_cells),
+    crs = wgs84_crs
+  )
+
+  cell_bboxes <- purrr::map(grid_wgs84, sf::st_bbox)
+
+  results <- purrr::map(cell_bboxes, function(cell_bbox) {
+    cell_vec <- c(
+      cell_bbox["xmin"], cell_bbox["ymin"],
+      cell_bbox["xmax"], cell_bbox["ymax"]
+    )
+    ssurgo_mukeys(bbox = cell_vec, ...)
+  }, .progress = "Querying grid cells")
+
+  unique(unlist(results, use.names = FALSE))
+}
diff --git a/modules/data.land/tests/testthat/test-ssurgo.R b/modules/data.land/tests/testthat/test-ssurgo.R
index 27f4a24643b..754c7ad97e8 100644
--- a/modules/data.land/tests/testthat/test-ssurgo.R
+++ b/modules/data.land/tests/testthat/test-ssurgo.R
@@ -82,7 +82,6 @@ test_that("ssurgo_mukeys polygon as sf returns mukeys", {
   skip_on_cran()
   skip_on_ci()
   skip_if_not_installed("sf")
-  library(sf)
 
   poly <- rbind(
     c(-88.0865046533, 37.5555143852),

From 42eadab3160b695a1e426ec47466c8a8bcf519d6 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 11:31:36 -0500
Subject: [PATCH 13/67] separate ssurgo mukey functions for different APIs

---
 modules/data.land/R/gSSURGO_Query.R           | 423 ++++++++----------
 .../data.land/tests/testthat/test-ssurgo.R    | 131 ++----
 2 files changed, 233 insertions(+), 321 deletions(-)

diff --git a/modules/data.land/R/gSSURGO_Query.R b/modules/data.land/R/gSSURGO_Query.R
index ce1bc9733cd..a18bcdad5f2 100644
--- a/modules/data.land/R/gSSURGO_Query.R
+++ b/modules/data.land/R/gSSURGO_Query.R
@@ -138,22 +138,13 @@ gSSURGO.Query <- function(mukeys,
   
 }
 
-#' Get map unit keys (mukeys) from gSSURGO using spatial filters
+#' Get map unit keys (mukeys) from gSSURGO using a bounding box
 #'
 #' Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
-#' based on spatial filters: bounding box, polygon, or point with distance.
+#' based on a bounding box spatial filter.
 #'
 #' @param bbox Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).
 #'   Features that intersect the bounding box are returned.
-#' @param polygon Polygon coordinates in WGS84. Can be:
-#'   - An `sf` object with a single polygon geometry
-#'   - A numeric matrix with columns x (lon) and y (lat), where the first and
-#'     last points are identical (closed ring)
-#'   Features that intersect the polygon are returned.
-#' @param point Numeric vector of length 2: c(lon, lat) in WGS84.
-#'   Must be used with `distance`.
-#' @param distance Numeric. Distance in meters from the point.
-#'   Must be used with `point`. Use 0 for exact point intersection.
 #'
 #' @return Character vector of unique map unit keys (mukeys).
 #'
@@ -161,249 +152,164 @@ gSSURGO.Query <- function(mukeys,
 #' This function uses the NRCS SDM Data Access Web Feature Service:
 #' \url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
 #'
-#' The total extent of any spatial filter cannot exceed 10,100,000,000 square
-#' meters (~3,900 square miles).
+#' The total extent of the bounding box cannot exceed 10,100,000,000 square
+#' meters (~3,900 square miles). Use \code{ssurgo_mukeys_bigbbox()} for large
+#' bounding boxes.
 #'
 #' @examples
 #' \dontrun{
 #' # Bounding box query
-#' mukeys <- ssurgo_mukeys(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
-#'
-#' # Point with distance (600m radius)
-#' mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 600)
-#'
-#' # Point with zero distance (exact intersection)
-#' mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 0)
-#'
-#' # Polygon as matrix
-#' poly <- rbind(
-#'   c(-88.0865046533, 37.5555143852),
-#'   c(-88.0860204771, 37.5600435404),
-#'   c(-88.0782858287, 37.5595392364),
-#'   c(-88.0787704736, 37.5550101113),
-#'   c(-88.0865046533, 37.5555143852)
-#' )
-#' mukeys <- ssurgo_mukeys(polygon = poly)
-#'
-#' # Polygon as sf object
-#' poly_sf <- sf::st_polygon(list(poly))
-#' mukeys <- ssurgo_mukeys(polygon = poly_sf)
+#' mukeys <- ssurgo_mukeys_bbox(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
 #' }
 #' @export
-ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance = NULL) {
-  n_provided <- sum(c(!is.null(bbox), !is.null(polygon), !is.null(point)))
-
-  if (n_provided == 0) {
-    stop("Must provide one of: bbox, polygon, or point")
+ssurgo_mukeys_bbox <- function(bbox) {
+  if (!is.numeric(bbox) || length(bbox) != 4) {
+    stop("bbox must be a numeric vector of length 4: c(xmin, ymin, xmax, ymax)")
   }
 
-  if (n_provided > 1) {
-    stop("Only one of bbox, polygon, or point may be provided")
+  xmin <- bbox[1]
+  ymin <- bbox[2]
+  xmax <- bbox[3]
+  ymax <- bbox[4]
+
+  if (xmin >= xmax || ymin >= ymax) {
+    stop("bbox must have xmin < xmax and ymin < ymax")
   }
 
-  if (!is.null(point)) {
-    if (length(point) != 2) {
-      stop("point must be a numeric vector of length 2: c(lon, lat)")
-    }
-    if (is.null(distance)) {
-      stop("distance is required when point is provided")
-    }
-    if (!is.numeric(distance) || distance < 0) {
-      stop("distance must be a non-negative numeric value")
-    }
+  MAX_AREA <- 10100000000
+  wgs84_crs <- sf::st_crs(4326)
+
+  bbox_poly <- sf::st_polygon(list(
+    matrix(c(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin), ncol = 2, byrow = TRUE)
+  ))
+  bbox_sf <- sf::st_sfc(bbox_poly, crs = wgs84_crs)
+  area <- as.numeric(sf::st_area(bbox_sf))
+
+  if (area > MAX_AREA) {
+    stop(
+      paste0(
+        "Bounding box area (", format(area, scientific = FALSE),
+        " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
+        " m²). Use ssurgo_mukeys_bigbbox() for large bounding boxes."
+      )
+    )
   }
 
-  if (!is.null(distance) && is.null(point)) {
-    stop("distance requires point to be provided")
+  base_url <- "https://sdmdataaccess.nrcs.usda.gov/Spatial/SDMWGS84Geographic.wfs"
+
+  query <- list(
+    SERVICE = "WFS",
+    VERSION = "1.1.0",
+    REQUEST = "GetFeature",
+    TYPENAME = "MapunitPoly",
+    BBOX = paste(bbox, collapse = ","),
+    OUTPUTFORMAT = "XMLMukeyList"
+  )
+
+  resp <- httr2::request(base_url) |>
+    httr2::req_url_query(!!!query) |>
+    httr2::req_perform()
+
+  httr2::resp_check_status(resp)
+
+  resp_text <- httr2::resp_body_string(resp)
+
+  resp_xml <- XML::xmlParse(resp_text)
+
+  mukey_nodes <- XML::getNodeSet(resp_xml, "//MapUnitKeyList")
+
+  if (length(mukey_nodes) == 0) {
+    return(character(0))
   }
 
-  wgs84_crs <- sf::st_crs(4326)
+  mukey_str <- XML::xmlValue(mukey_nodes[[1]])
 
-  filter_xml <- if (!is.null(bbox)) {
-    if (!is.numeric(bbox) || length(bbox) != 4) {
-      stop("bbox must be a numeric vector of length 4: c(xmin, ymin, xmax, ymax)")
-    }
-    xmin <- bbox[1]
-    ymin <- bbox[2]
-    xmax <- bbox[3]
-    ymax <- bbox[4]
+  if (is.null(mukey_str) || nchar(trimws(mukey_str)) == 0) {
+    return(character(0))
+  }
 
-    if (xmin >= xmax || ymin >= ymax) {
-      stop("bbox must have xmin < xmax and ymin < ymax")
-    }
+  mukeys <- unique(strsplit(trimws(mukey_str), ",")[[1]])
 
-    MAX_AREA <- 10100000000
-    albers_crs <- sf::st_crs(5070)
-
-    bbox_poly <- sf::st_polygon(list(
-      matrix(c(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin), ncol = 2, byrow = TRUE)
-    ))
-    bbox_sf <- sf::st_sfc(bbox_poly, crs = wgs84_crs)
-    bbox_albers <- sf::st_transform(bbox_sf, albers_crs)
-    area <- as.numeric(sf::st_area(bbox_albers))
-
-    if (area > MAX_AREA) {
-      stop(
-        paste0(
-          "Bounding box area (", format(area, scientific = FALSE),
-          " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
-          " m²). Use ssurgo_mukeys_bigbbox() for large bounding boxes."
-        )
-      )
-    }
+  mukeys
+}
 
-    paste0(
-      "<Filter>",
-      "<BBOX>",
-      "<PropertyName>Geometry</PropertyName>",
-      "<Box srsName='EPSG:4326'>",
-      "<coordinates>", xmin, ",", ymin, " ", xmax, ",", ymax, "</coordinates>",
-      "</Box>",
-      "</BBOX>",
-      "</Filter>"
-    )
-  } else if (!is.null(polygon)) {
-    coords <- if (inherits(polygon, "sfc")) {
-      if (length(polygon) != 1) {
-        stop("polygon (sfc) must contain exactly one geometry")
-      }
-      geom <- polygon[[1]]
-      if (inherits(geom, "POLYGON")) {
-        as.vector(t(geom))
-      } else {
-        stop("sfc object must contain a POLYGON geometry")
-      }
-    } else if (inherits(polygon, "sfg")) {
-      if (inherits(polygon, "POLYGON")) {
-        as.vector(t(polygon))
-      } else {
-        stop("sfg object must be a POLYGON")
-      }
-    } else if (inherits(polygon, "sf")) {
-      if (nrow(polygon) != 1) {
-        stop("polygon (sf) must contain exactly one feature")
-      }
-      geom <- sf::st_geometry(polygon)[[1]]
-      if (inherits(geom, "POLYGON")) {
-        as.vector(t(geom))
-      } else {
-        stop("sf object must contain a POLYGON geometry")
-      }
-    } else if (is.matrix(polygon) || is.data.frame(polygon)) {
-      if (ncol(polygon) != 2) {
-        stop("polygon matrix must have 2 columns: x (lon) and y (lat)")
-      }
-      if (nrow(polygon) < 4) {
-        stop("polygon matrix must have at least 4 rows (3 unique vertices + closing point)")
-      }
-      poly_matrix <- as.matrix(polygon)
-      if (!identical(poly_matrix[1, ], poly_matrix[nrow(poly_matrix), ])) {
-        stop("polygon matrix first and last points must be identical (closed ring)")
-      }
-      as.vector(t(poly_matrix))
-    } else {
-      stop("polygon must be an sf/sfc object or a matrix/data.frame with coordinates")
-    }
+#' Get map unit keys (mukeys) from gSSURGO using a point with distance
+#'
+#' Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
+#' based on a point and distance (DWithin) spatial filter.
+#'
+#' @param point Numeric vector of length 2: c(lon, lat) in WGS84 (EPSG:4326).
+#' @param distance Numeric. Distance in meters from the point.
+#'   Use 0 for exact point intersection.
+#'
+#' @return Character vector of unique map unit keys (mukeys).
+#'
+#' @details
+#' This function uses the NRCS SDM Data Access Web Feature Service:
+#' \url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
+#'
+#' The search radius area (π × distance²) cannot exceed 10,100,000,000 square
+#' meters (~3,900 square miles).
+#'
+#' @examples
+#' \dontrun{
+#' # Point with distance (600m radius)
+#' mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 600)
+#'
+#' # Point with zero distance (exact intersection)
+#' mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 0)
+#' }
+#' @export
+ssurgo_mukeys_point <- function(point, distance) {
+  if (length(point) != 2) {
+    stop("point must be a numeric vector of length 2: c(lon, lat)")
+  }
 
-    poly_sf <- if (inherits(polygon, "sfc")) {
-      polygon
-    } else if (inherits(polygon, "sfg")) {
-      sf::st_sfc(polygon, crs = wgs84_crs)
-    } else if (inherits(polygon, "sf")) {
-      sf::st_geometry(polygon)
-    } else {
-      poly_matrix <- as.matrix(polygon)
-      poly_obj <- sf::st_polygon(list(poly_matrix))
-      sf::st_sfc(poly_obj, crs = wgs84_crs)
-    }
+  if (!is.numeric(distance) || distance < 0) {
+    stop("distance must be a non-negative numeric value")
+  }
 
-    MAX_AREA <- 10100000000
-    albers_crs <- sf::st_crs(5070)
-    poly_albers <- sf::st_transform(poly_sf, albers_crs)
-    area <- as.numeric(sf::st_area(poly_albers))
-
-    if (area > MAX_AREA) {
-      stop(
-        paste0(
-          "Polygon area (", format(area, scientific = FALSE),
-          " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
-          " m²)."
-        )
-      )
-    }
+  lon <- point[1]
+  lat <- point[2]
 
-    coords_str <- paste(coords, collapse = " ")
-
-    paste0(
-      "<Filter>",
-      "<Intersect>",
-      "<PropertyName>Geometry</PropertyName>",
-      "<gml:Polygon>",
-      "<gml:outerBoundaryIs>",
-      "<gml:LinearRing>",
-      "<gml:coordinates>", coords_str, "</gml:coordinates>",
-      "</gml:LinearRing>",
-      "</gml:outerBoundaryIs>",
-      "</gml:Polygon>",
-      "</Intersect>",
-      "</Filter>"
-    )
-  } else if (!is.null(point)) {
-    lon <- point[1]
-    lat <- point[2]
-
-    MAX_AREA <- 10100000000
-    circle_area <- pi * (distance^2)
-    if (circle_area > MAX_AREA) {
-      stop(
-        paste0(
-          "Search radius area (", format(circle_area, scientific = FALSE),
-          " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
-          " m²)."
-        )
+  MAX_AREA <- 10100000000
+  circle_area <- pi * (distance^2)
+  if (circle_area > MAX_AREA) {
+    stop(
+      paste0(
+        "Search radius area (", format(circle_area, scientific = FALSE),
+        " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
+        " m²)."
       )
-    }
-
-    paste0(
-      "<Filter>",
-      "<DWithin>",
-      "<PropertyName>Geometry</PropertyName>",
-      "<gml:Point>",
-      "<gml:coordinates>", lon, ",", lat, "</gml:coordinates>",
-      "</gml:Point>",
-      "<Distance units=\"m\">", distance, "</Distance>",
-      "</DWithin>",
-      "</Filter>"
     )
   }
 
+  filter_xml <- paste0(
+    "<Filter>",
+    "<DWithin>",
+    "<PropertyName>Geometry</PropertyName>",
+    "<gml:Point>",
+    "<gml:coordinates>", lon, ",", lat, "</gml:coordinates>",
+    "</gml:Point>",
+    "<Distance units=\"m\">", distance, "</Distance>",
+    "</DWithin>",
+    "</Filter>"
+  )
+
   base_url <- "https://sdmdataaccess.nrcs.usda.gov/Spatial/SDMWGS84Geographic.wfs"
 
-  if (!is.null(bbox)) {
-    query <- list(
-      SERVICE = "WFS",
-      VERSION = "1.1.0",
-      REQUEST = "GetFeature",
-      TYPENAME = "MapunitPoly",
-      BBOX = paste(bbox, collapse = ","),
-      OUTPUTFORMAT = "XMLMukeyList"
-    )
-    resp <- httr2::request(base_url) |>
-      httr2::req_url_query(!!!query) |>
-      httr2::req_perform()
-  } else {
-    query <- list(
-      SERVICE = "WFS",
-      VERSION = "1.1.0",
-      REQUEST = "GetFeature",
-      TYPENAME = "MapunitPoly",
-      FILTER = filter_xml,
-      OUTPUTFORMAT = "XMLMukeyList"
-    )
-    resp <- httr2::request(base_url) |>
-      httr2::req_url_query(!!!query) |>
-      httr2::req_perform()
-  }
+  query <- list(
+    SERVICE = "WFS",
+    VERSION = "1.1.0",
+    REQUEST = "GetFeature",
+    TYPENAME = "MapunitPoly",
+    FILTER = filter_xml,
+    OUTPUTFORMAT = "XMLMukeyList"
+  )
+
+  resp <- httr2::request(base_url) |>
+    httr2::req_url_query(!!!query) |>
+    httr2::req_perform()
 
   httr2::resp_check_status(resp)
 
@@ -435,19 +341,16 @@ ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance =
 #' 10,100,000,000 m² extent limit.
 #'
 #' @param bbox Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).
-#' @param ... Additional arguments passed to \code{ssurgo_mukeys()}.
-#'   Currently supports \code{distance} for point queries (ignored for bbox queries).
 #'
 #' @return Character vector of unique map unit keys (mukeys).
 #'
 #' @details
 #' This function divides a large bounding box into smaller cells,
 #' each with an area less than 10,100,000,000 square meters,
-#' then queries each cell individually and combines the results.
+#' then queries each cell individually in parallel and combines the results.
 #'
-#' The grid is created using Albers Equal Area projection (EPSG:5070)
-#' to ensure accurate area calculations, then transformed back to
-#' WGS84 (EPSG:4326) for the API query.
+#' The grid is created using \code{sf::st_area()} to calculate the bbox area
+#' in meters, then divided into appropriately-sized cells.
 #'
 #' @examples
 #' \dontrun{
@@ -455,7 +358,7 @@ ssurgo_mukeys <- function(bbox = NULL, polygon = NULL, point = NULL, distance =
 #' mukeys <- ssurgo_mukeys_bigbbox(bbox = c(-120, 35, -110, 45))
 #' }
 #' @export
-ssurgo_mukeys_bigbbox <- function(bbox, ...) {
+ssurgo_mukeys_bigbbox <- function(bbox) {
   if (!is.numeric(bbox) || length(bbox) != 4) {
     stop("bbox must be a numeric vector of length 4: c(xmin, ymin, xmax, ymax)")
   }
@@ -502,13 +405,57 @@ ssurgo_mukeys_bigbbox <- function(bbox, ...) {
 
   cell_bboxes <- purrr::map(grid_wgs84, sf::st_bbox)
 
-  results <- purrr::map(cell_bboxes, function(cell_bbox) {
+  base_url <- "https://sdmdataaccess.nrcs.usda.gov/Spatial/SDMWGS84Geographic.wfs"
+
+  reqs <- purrr::map(cell_bboxes, function(cell_bbox) {
     cell_vec <- c(
       cell_bbox["xmin"], cell_bbox["ymin"],
       cell_bbox["xmax"], cell_bbox["ymax"]
     )
-    ssurgo_mukeys(bbox = cell_vec, ...)
-  }, .progress = "Querying grid cells")
+    query <- list(
+      SERVICE = "WFS",
+      VERSION = "1.1.0",
+      REQUEST = "GetFeature",
+      TYPENAME = "MapunitPoly",
+      BBOX = paste(cell_vec, collapse = ","),
+      OUTPUTFORMAT = "XMLMukeyList"
+    )
+    httr2::request(base_url) |>
+      httr2::req_url_query(!!!query)
+  })
+
+  reqs_throttled <- purrr::map(reqs, ~ .x |> httr2::req_throttle(10 / 60))
+
+  resps <- httr2::req_perform_parallel(
+    reqs_throttled,
+    on_error = "continue",
+    max_active = 10,
+    progress = TRUE
+  )
+
+  parse_mukeys <- function(resp) {
+    if (inherits(resp, "httr2_response")) {
+      tryCatch({
+        resp_text <- httr2::resp_body_string(resp)
+        resp_xml <- XML::xmlParse(resp_text)
+        mukey_nodes <- XML::getNodeSet(resp_xml, "//MapUnitKeyList")
+        if (length(mukey_nodes) == 0) {
+          return(character(0))
+        }
+        mukey_str <- XML::xmlValue(mukey_nodes[[1]])
+        if (is.null(mukey_str) || nchar(trimws(mukey_str)) == 0) {
+          return(character(0))
+        }
+        strsplit(trimws(mukey_str), ",")[[1]]
+      }, error = function(e) {
+        character(0)
+      })
+    } else {
+      character(0)
+    }
+  }
+
+  mukeys_list <- purrr::map(resps, parse_mukeys)
 
-  unique(unlist(results, use.names = FALSE))
+  unique(unlist(mukeys_list, use.names = FALSE))
 }
diff --git a/modules/data.land/tests/testthat/test-ssurgo.R b/modules/data.land/tests/testthat/test-ssurgo.R
index 754c7ad97e8..2c13a0e2f3a 100644
--- a/modules/data.land/tests/testthat/test-ssurgo.R
+++ b/modules/data.land/tests/testthat/test-ssurgo.R
@@ -1,128 +1,77 @@
 context("ssurgo_mukeys")
 
-test_that("ssurgo_mukeys requires exactly one spatial filter", {
-  expect_error(ssurgo_mukeys(), "Must provide one of")
-  expect_error(ssurgo_mukeys(bbox = c(1, 2, 3, 4), point = c(1, 2)), "Only one of")
-  expect_error(ssurgo_mukeys(polygon = matrix(1:10, ncol = 2), point = c(1, 2)), "Only one of")
+test_that("ssurgo_mukeys_bbox validates bbox input", {
+  expect_error(ssurgo_mukeys_bbox("not numeric"), "numeric vector of length 4")
+  expect_error(ssurgo_mukeys_bbox(c(1, 2)), "numeric vector of length 4")
+  expect_error(ssurgo_mukeys_bbox(c(3, 2, 1, 4)), "xmin < xmax")
+  expect_error(ssurgo_mukeys_bbox(c(1, 4, 3, 2)), "ymin < ymax")
 })
 
-test_that("ssurgo_mukeys validates bbox input", {
-  expect_error(ssurgo_mukeys(bbox = "not numeric"), "numeric vector of length 4")
-  expect_error(ssurgo_mukeys(bbox = c(1, 2)), "numeric vector of length 4")
-  expect_error(ssurgo_mukeys(bbox = c(3, 2, 1, 4)), "xmin < xmax")
-  expect_error(ssurgo_mukeys(bbox = c(1, 4, 3, 2)), "ymin < ymax")
+test_that("ssurgo_mukeys_point validates point and distance", {
+  expect_error(ssurgo_mukeys_point(c(1, 2, 3)), "length 2")
+  expect_error(ssurgo_mukeys_point(c(1, 2)), "missing")
+  expect_error(ssurgo_mukeys_point(point = c(1, 2), distance = -10), "non-negative")
+  expect_error(ssurgo_mukeys_point(point = c(1, 2), distance = "100"), "non-negative")
 })
 
-test_that("ssurgo_mukeys validates point and distance", {
-  expect_error(ssurgo_mukeys(point = c(1, 2, 3)), "length 2")
-  expect_error(ssurgo_mukeys(point = c(1, 2)), "distance is required")
-  expect_error(ssurgo_mukeys(distance = 100), "Must provide one of")
-  expect_error(ssurgo_mukeys(point = c(1, 2), distance = -10), "non-negative")
-  expect_error(ssurgo_mukeys(point = c(1, 2), distance = "100"), "non-negative")
-})
-
-test_that("ssurgo_mukeys validates polygon input", {
-  expect_error(ssurgo_mukeys(polygon = 1:5), "sf/sfc object or a matrix")
-  expect_error(ssurgo_mukeys(polygon = matrix(1:6, ncol = 3)), "2 columns")
-})
-
-test_that("ssurgo_mukeys bbox returns mukeys for valid location", {
+test_that("ssurgo_mukeys_bbox returns mukeys for valid location", {
   skip_on_cran()
   skip_on_ci()
 
-  mukeys <- ssurgo_mukeys(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
+  mukeys <- ssurgo_mukeys_bbox(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
 
   expect_type(mukeys, "character")
   expect_gt(length(mukeys), 0)
 })
 
-test_that("ssurgo_mukeys point with distance returns mukeys", {
+test_that("ssurgo_mukeys_point with distance returns mukeys", {
   skip_on_cran()
   skip_on_ci()
 
-  mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 600)
+  mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 600)
 
   expect_type(mukeys, "character")
   expect_gt(length(mukeys), 0)
 })
 
-test_that("ssurgo_mukeys point with zero distance returns mukeys", {
-  skip_on_cran()
-  skip_on_ci()
-
-  mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 0)
-
-  expect_type(mukeys, "character")
-})
-
-test_that("ssurgo_mukeys polygon as matrix returns mukeys", {
+test_that("ssurgo_mukeys_point with zero distance returns mukeys", {
   skip_on_cran()
   skip_on_ci()
 
-  poly <- rbind(
-    c(-88.0865046533, 37.5555143852),
-    c(-88.0860204771, 37.5600435404),
-    c(-88.0782858287, 37.5595392364),
-    c(-88.0787704736, 37.5550101113),
-    c(-88.0865046533, 37.5555143852)
-  )
-
-  mukeys <- tryCatch(
-    ssurgo_mukeys(polygon = poly),
-    error = function(e) {
-      skip(paste("API error:", e$message))
-    }
-  )
+  mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 0)
 
   expect_type(mukeys, "character")
-  expect_gt(length(mukeys), 0)
 })
 
-test_that("ssurgo_mukeys polygon as sf returns mukeys", {
+test_that("ssurgo_mukeys_bbox returns unique mukeys", {
   skip_on_cran()
   skip_on_ci()
-  skip_if_not_installed("sf")
-
-  poly <- rbind(
-    c(-88.0865046533, 37.5555143852),
-    c(-88.0860204771, 37.5600435404),
-    c(-88.0782858287, 37.5595392364),
-    c(-88.0787704736, 37.5550101113),
-    c(-88.0865046533, 37.5555143852)
-  )
-  poly_sf <- sf::st_polygon(list(poly))
 
-  mukeys <- tryCatch(
-    ssurgo_mukeys(polygon = poly_sf),
-    error = function(e) {
-      skip(paste("API error:", e$message))
-    }
-  )
+  mukeys <- ssurgo_mukeys_bbox(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
 
-  expect_type(mukeys, "character")
-  expect_gt(length(mukeys), 0)
+  expect_equal(length(mukeys), length(unique(mukeys)))
 })
 
-test_that("ssurgo_mukeys returns unique mukeys", {
+test_that("ssurgo_mukeys_point returns unique mukeys", {
   skip_on_cran()
   skip_on_ci()
 
-  mukeys <- ssurgo_mukeys(point = c(-91.22, 38.46), distance = 600)
+  mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 600)
 
   expect_equal(length(mukeys), length(unique(mukeys)))
 })
 
-test_that("ssurgo_mukeys handles area with no soil data gracefully", {
+test_that("ssurgo_mukeys_bbox handles area with no soil data gracefully", {
   skip_on_cran()
   skip_on_ci()
 
-  mukeys <- ssurgo_mukeys(bbox = c(0, 0, 0.001, 0.001))
+  mukeys <- ssurgo_mukeys_bbox(bbox = c(0, 0, 0.001, 0.001))
 
   expect_type(mukeys, "character")
   expect_equal(length(mukeys), 0)
 })
 
-test_that("ssurgo_mukeys bbox and point return consistent results for same area", {
+test_that("ssurgo_mukeys_bbox and ssurgo_mukeys_point return consistent results for same area", {
   skip_on_cran()
   skip_on_ci()
 
@@ -130,7 +79,7 @@ test_that("ssurgo_mukeys bbox and point return consistent results for same area"
   center_lat <- 38.46
   distance <- 600
 
-  bbox_mukeys <- ssurgo_mukeys(
+  bbox_mukeys <- ssurgo_mukeys_bbox(
     bbox = c(
       center_lon - 0.01,
       center_lat - 0.01,
@@ -138,7 +87,7 @@ test_that("ssurgo_mukeys bbox and point return consistent results for same area"
       center_lat + 0.01
     )
   )
-  point_mukeys <- ssurgo_mukeys(
+  point_mukeys <- ssurgo_mukeys_point(
     point = c(center_lon, center_lat),
     distance = distance
   )
@@ -148,13 +97,29 @@ test_that("ssurgo_mukeys bbox and point return consistent results for same area"
   expect_gt(length(bbox_mukeys), length(point_mukeys))
 })
 
-test_that("real bounding boxes for CA", {
-  # devtools::load_all("modules/data.land")
+test_that("big bounding boxes exceed area limit", {
+  skip_on_cran()
+  skip_on_ci()
+
+  bbox_01 <- c(-123.569131, 39.638344, -121.234281, 41.461763)
+  bbox_02 <- c(-124.064177, 38.994921, -120.102514, 42.088592)
+
+  expect_error(ssurgo_mukeys_bbox(bbox = bbox_01), "exceeds maximum allowed area")
+  expect_error(ssurgo_mukeys_bbox(bbox = bbox_02), "exceeds maximum allowed area")
+})
+
+test_that("ssurgo_mukeys_bigbbox returns mukeys for large bounding boxes", {
+  skip_on_cran()
+  skip_on_ci()
+
   bbox_01 <- c(-123.569131, 39.638344, -121.234281, 41.461763)
   bbox_02 <- c(-124.064177, 38.994921, -120.102514, 42.088592)
 
-  expect_error(ssurgo_mukeys(bbox = bbox_01))
-  expect_error(ssurgo_mukeys(bbox = bbox_02))
-  expect_no_error(ssurgo_mukeys_bigbbox(bbox_01))
-  expect_no_error(ssurgo_mukeys_bigbbox(bbox_02))
+  mukeys_01 <- ssurgo_mukeys_bigbbox(bbox_01)
+  expect_type(mukeys_01, "character")
+  expect_gt(length(mukeys_01), 0)
+
+  mukeys_02 <- ssurgo_mukeys_bigbbox(bbox_02)
+  expect_type(mukeys_02, "character")
+  expect_gt(length(mukeys_02), 0)
 })

From 6c718c20ce63e85123e782a3d57e957e37389f20 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 11:32:15 -0500
Subject: [PATCH 14/67] add ssurgo mukey function docs

---
 modules/data.land/NAMESPACE                   |  3 ++
 modules/data.land/man/ssurgo_mukeys_bbox.Rd   | 33 +++++++++++++++++
 .../data.land/man/ssurgo_mukeys_bigbbox.Rd    | 33 +++++++++++++++++
 modules/data.land/man/ssurgo_mukeys_point.Rd  | 37 +++++++++++++++++++
 4 files changed, 106 insertions(+)
 create mode 100644 modules/data.land/man/ssurgo_mukeys_bbox.Rd
 create mode 100644 modules/data.land/man/ssurgo_mukeys_bigbbox.Rd
 create mode 100644 modules/data.land/man/ssurgo_mukeys_point.Rd

diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index 03a2a764f05..944442ae67b 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -77,6 +77,9 @@ export(soil_process)
 export(soilgrids_ic_process)
 export(soilgrids_soilC_extract)
 export(soilgrids_texture_extraction)
+export(ssurgo_mukeys_bbox)
+export(ssurgo_mukeys_bigbbox)
+export(ssurgo_mukeys_point)
 export(subset_layer)
 export(to.Tag)
 export(to.TreeCode)
diff --git a/modules/data.land/man/ssurgo_mukeys_bbox.Rd b/modules/data.land/man/ssurgo_mukeys_bbox.Rd
new file mode 100644
index 00000000000..adae7f0f524
--- /dev/null
+++ b/modules/data.land/man/ssurgo_mukeys_bbox.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gSSURGO_Query.R
+\name{ssurgo_mukeys_bbox}
+\alias{ssurgo_mukeys_bbox}
+\title{Get map unit keys (mukeys) from gSSURGO using a bounding box}
+\usage{
+ssurgo_mukeys_bbox(bbox)
+}
+\arguments{
+\item{bbox}{Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).
+Features that intersect the bounding box are returned.}
+}
+\value{
+Character vector of unique map unit keys (mukeys).
+}
+\description{
+Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
+based on a bounding box spatial filter.
+}
+\details{
+This function uses the NRCS SDM Data Access Web Feature Service:
+\url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
+
+The total extent of the bounding box cannot exceed 10,100,000,000 square
+meters (~3,900 square miles). Use \code{ssurgo_mukeys_bigbbox()} for large
+bounding boxes.
+}
+\examples{
+\dontrun{
+# Bounding box query
+mukeys <- ssurgo_mukeys_bbox(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
+}
+}
diff --git a/modules/data.land/man/ssurgo_mukeys_bigbbox.Rd b/modules/data.land/man/ssurgo_mukeys_bigbbox.Rd
new file mode 100644
index 00000000000..d9128d23910
--- /dev/null
+++ b/modules/data.land/man/ssurgo_mukeys_bigbbox.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gSSURGO_Query.R
+\name{ssurgo_mukeys_bigbbox}
+\alias{ssurgo_mukeys_bigbbox}
+\title{Get map unit keys (mukeys) from gSSURGO for large bounding boxes}
+\usage{
+ssurgo_mukeys_bigbbox(bbox)
+}
+\arguments{
+\item{bbox}{Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).}
+}
+\value{
+Character vector of unique map unit keys (mukeys).
+}
+\description{
+Queries the NRCS gSSURGO Web Feature Service for large bounding boxes
+by dividing the area into smaller cells that comply with the API's
+10,100,000,000 m² extent limit.
+}
+\details{
+This function divides a large bounding box into smaller cells,
+each with an area less than 10,100,000,000 square meters,
+then queries each cell individually in parallel and combines the results.
+
+The grid is created using \code{sf::st_area()} to calculate the bbox area
+in meters, then divided into appropriately-sized cells.
+}
+\examples{
+\dontrun{
+# Large bounding box covering a significant area
+mukeys <- ssurgo_mukeys_bigbbox(bbox = c(-120, 35, -110, 45))
+}
+}
diff --git a/modules/data.land/man/ssurgo_mukeys_point.Rd b/modules/data.land/man/ssurgo_mukeys_point.Rd
new file mode 100644
index 00000000000..fd65da8849d
--- /dev/null
+++ b/modules/data.land/man/ssurgo_mukeys_point.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gSSURGO_Query.R
+\name{ssurgo_mukeys_point}
+\alias{ssurgo_mukeys_point}
+\title{Get map unit keys (mukeys) from gSSURGO using a point with distance}
+\usage{
+ssurgo_mukeys_point(point, distance)
+}
+\arguments{
+\item{point}{Numeric vector of length 2: c(lon, lat) in WGS84 (EPSG:4326).}
+
+\item{distance}{Numeric. Distance in meters from the point.
+Use 0 for exact point intersection.}
+}
+\value{
+Character vector of unique map unit keys (mukeys).
+}
+\description{
+Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
+based on a point and distance (DWithin) spatial filter.
+}
+\details{
+This function uses the NRCS SDM Data Access Web Feature Service:
+\url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
+
+The search radius area (π × distance²) cannot exceed 10,100,000,000 square
+meters (~3,900 square miles).
+}
+\examples{
+\dontrun{
+# Point with distance (600m radius)
+mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 600)
+
+# Point with zero distance (exact intersection)
+mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 0)
+}
+}

From 3dc2fec1f14f5c03777b13250497c27564301471 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 11:49:18 -0500
Subject: [PATCH 15/67] Add example SSURGO script

---
 modules/data.land/inst/ssurgo-soil-inputs.R | 30 +++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 modules/data.land/inst/ssurgo-soil-inputs.R

diff --git a/modules/data.land/inst/ssurgo-soil-inputs.R b/modules/data.land/inst/ssurgo-soil-inputs.R
new file mode 100644
index 00000000000..63ed5f5c55a
--- /dev/null
+++ b/modules/data.land/inst/ssurgo-soil-inputs.R
@@ -0,0 +1,30 @@
+# Example of querying available water content from SSURGO
+
+devtools::load_all("~/projects/pecan/cimis-et/modules/data.land")
+
+design_points <- readr::read_csv("~/projects/cimis-to-irrigation/design_points.csv") |>
+  head(10)
+
+mukeys_list <- purrr::map2(
+  design_points$lon, design_points$lat,
+  ~ PEcAn.data.land::ssurgo_mukeys_point(point = c(.x, .y), distance = 20)
+)
+
+all_mukeys <- unique(unlist(mukeys_list))
+
+soil_data <- PEcAn.data.land::gSSURGO.Query(
+  mukeys = all_mukeys,
+  fields = c("chorizon.awc_r")
+)
+
+result <- design_points |>
+  dplyr::mutate(mukey = mukeys_list) |>
+  tidyr::unnest(mukey) |>
+  dplyr::mutate(mukey = as.numeric(mukey)) |>
+  dplyr::left_join(
+    soil_data |>
+      dplyr::select(mukey, awc_r),
+    by = "mukey"
+  )
+
+readr::write_csv(result, "_test_soil_inputs.csv")

From cdaacd480f26397b538a009b5b28e725b5174e81 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 12:36:18 -0500
Subject: [PATCH 16/67] update crop_whc table to use FAO-56 Table 22 vals

Source: https://www.professormendoncauenf.com.br/wp-content/uploads/2021/03/ag_fao_56_ingles.pdf
---
 modules/data.land/data-raw/crop_whc.R   |   4 +-
 modules/data.land/data-raw/crop_whc.csv | 134 ++++++++++++------------
 modules/data.land/data/crop_whc.rda     | Bin 2703 -> 1624 bytes
 3 files changed, 70 insertions(+), 68 deletions(-)

diff --git a/modules/data.land/data-raw/crop_whc.R b/modules/data.land/data-raw/crop_whc.R
index 800e4e567be..05fcf4fcd34 100644
--- a/modules/data.land/data-raw/crop_whc.R
+++ b/modules/data.land/data-raw/crop_whc.R
@@ -14,8 +14,10 @@ crop_whc <- read_csv(
     crop_number = col_character(),
     crop_name = col_character(),
     Category = col_character(),
+    rooting_depth_m = col_double(),
     whc_min_frac = col_double(),
-    Comments = col_character()
+    whc_notes = col_character(),
+    rooting_depth_notes = col_character()
   ),
   progress = FALSE
 )
diff --git a/modules/data.land/data-raw/crop_whc.csv b/modules/data.land/data-raw/crop_whc.csv
index 33be8826ebb..19877b247ea 100644
--- a/modules/data.land/data-raw/crop_whc.csv
+++ b/modules/data.land/data-raw/crop_whc.csv
@@ -1,67 +1,67 @@
-"crop_number","crop_name","Category","whc_min_frac","Comments"
-"3.01","Almonds","Woody Perennial","0.35","Deep roots but managed to avoid nut shrivel during heat."
-"3.02","Apple","Woody Perennial","0.35","Deep roots but managed for fruit sizing."
-"1.02","Artichokes","Non-woody Perennial","0.4","Large established crown systems with moderate drought tolerance."
-"1.03","Asparagus","Non-woody Perennial","0.4","Deep-rooted herbaceous perennial with high storage capacity."
-"1.01","Alfalfa (cycle)","Non-woody Perennial","0.4","Deep taproots can scavenge moisture; tolerant of moderate depletion."
-"2.01","Alfalfa (annual)","Non-woody Perennial","0.4","Deep taproots can scavenge moisture; tolerant of moderate depletion."
-"4.01","Avocado","Woody Perennial","0.35","Note: Sensitive to low WHC due to shallow roots without root hairs."
-"1.04","Barley","Annual (Hardy)","0.45","Scavenging root system; relatively drought-tolerant small grain."
-"1.05","Beans (pinto)","Annual (Hardy)","0.45","Can tolerate moderate tension compared to succulent fresh beans."
-"1.06","Beans (dry)","Annual (Hardy)","0.45","Can tolerate moderate tension compared to succulent fresh beans."
-"","Beans (Fresh)","Annual (Sensitive)","0.6","Shallow roots; moisture stress quickly impacts pod quality."
-"1.08","Beets (table)","Annual (Sensitive)","0.6","Consistent moisture required for root development and texture."
-"1.09","Broccoli","Annual (Sensitive)","0.6","Shallow roots; high ET demand to maintain vegetative growth."
-"1.1","Cabbage","Annual (Sensitive)","0.6","High water content crop; very sensitive to head splitting/wilting."
-"1.11","Carrots","Annual (Sensitive)","0.6","Requires high soil moisture to prevent woody texture and forking."
-"1.12","Celery","Annual (Sensitive)","0.6","Extremely shallow roots; highest moisture requirement for crispness."
-"4.02","Citrus (>3.8 m tall)","Woody Perennial","0.35","Moderate depletion allowed but high frequency preferred in summer."
-"4.03","Citrus (<3.0 m tall)","Woody Perennial","0.35","Moderate depletion allowed but high frequency preferred in summer."
-"1.13","Corn (grain)","Annual (Hardy)","0.45","Deep roots; can mine moisture but sensitive during pollination."
-"1.14","Corn (silage)","Annual (Hardy)","0.45","Deep roots; can mine moisture but sensitive during pollination."
-"1.15","Cotton","Annual (Hardy)","0.45","Taproot system allows for significant soil moisture extraction."
-"1.16","Cucumber","Annual (Sensitive)","0.6","High water turnover; stress leads to bitter fruit."
-"4.04","Date Palm","Woody Perennial","0.35","Extremely hardy but requires managed floors for fruit yield."
-"1.17","Eggplant","Annual (Hardy)","0.45","Deeper rooted than most solanaceous vegetables."
-"4.05","Evergreen","Woody Perennial","0.35","Stable root volume allows for lower management floor."
-"1.18","Flax","Annual (Hardy)","0.45","Moderate drought tolerance once established."
-"1.19","Grains (small)","Annual (Hardy)","0.45","General category for cereals with scavenging root architectures."
-"1.20","Grains (winter)","Annual (Hardy)","0.45","General category for cereals with scavenging root architectures."
-"3.03","Grapes (wine)","Woody Perennial","0.35","Deep roots; often managed with Regulated Deficit Irrigation (RDI)."
-"3.04","Grapes (table)","Woody Perennial","0.35","Deep roots; often managed with Regulated Deficit Irrigation (RDI)."
-"3.05","Grapes (raisin)","Woody Perennial","0.35","Deep roots; often managed with Regulated Deficit Irrigation (RDI)."
-"2.02","Improved Pasture","Non-woody Perennial","0.4","Dense root mat; requires moderate floor to maintain regrowth."
-"3.06","Kiwifruit","Woody Perennial","0.35","Vines require stable moisture but can access larger soil volumes."
-"1.21","Lentil","Annual (Hardy)","0.45","Cool-season legume with reasonable drought resistance."
-"1.22","Lettuce","Annual (Sensitive)","0.6","Very shallow roots; moisture stress causes tipburn and bolting."
-"1.23","Melon","Annual (Hardy)","0.45","Deep taproots; managed depletion can increase sugar content (Brix)."
-"1.24","Millet","Annual (Hardy)","0.45","Highly efficient water user; scavenging roots."
-"1.25","Mustard","Annual (Hardy)","0.45","Rapid growth and reasonable scavenging ability."
-"1.26","Oats","Annual (Hardy)","0.45","Standard cereal tolerance for moderate soil moisture depletion."
-"4.06","Olives","Woody Perennial","0.35","Highly drought tolerant; 0.35 is a conservative management floor."
-"1.27","Onion (dry)","Annual (Sensitive)","0.6","Very shallow, fibrous roots; sensitive to bulb sizing issues."
-"1.28","Onion (green)","Annual (Sensitive)","0.6","Very shallow, fibrous roots; sensitive to bulb sizing issues."
-"1.29","Peas","Annual (Sensitive)","0.6","Shallow roots; moisture critical during flowering and pod fill."
-"1.3","Peppers","Annual (Sensitive)","0.6","Sensitive to blossom end rot if moisture fluctuates."
-"1.31","Potato","Annual (Sensitive)","0.6","Shallow roots; stress causes internal defects and yield loss."
-"1.32","Radishes","Annual (Sensitive)","0.6","Fast growing and shallow; requires constant moisture for quality."
-"1.34","Safflower","Annual (Hardy)","0.45","Deep taproot (up to 6ft+); excellent scavenger."
-"1.35","Sisal","Annual (Hardy)","0.45","Succulent-like properties with high drought tolerance."
-"1.36","Sorghum","Annual (Hardy)","0.45","Exceptional drought tolerance; can extract water at high tension."
-"1.37","Spinach","Annual (Sensitive)","0.6","Extremely shallow roots; sensitive to rapid ET changes."
-"1.38","Squash","Annual (Hardy)","0.45","Large leaf area but deep/wide-ranging root system."
-"3.07","Stone fruits","Woody Perennial","0.35","Deep roots; floor managed for fruit sizing and bud set."
-"1.39","Strawberries w/mulch","Annual (Sensitive)","0.6","Shallow roots; usually drip-irrigated with high frequency in CA."
-"1.47","Sudan grass","Annual (Hardy)","0.45","Hardy forage crop with extensive scavenging roots."
-"1.4","Sugarbeet","Annual (Hardy)","0.45","Deep taproot; can tolerate moderate stress before harvest."
-"3.48","Sugarcane","Non-woody Perennial","0.4","High biomass requires steady water but has established root depth."
-"1.41","Sunflower","Annual (Hardy)","0.45","Deep taproot; very effective at mining subsoil moisture."
-"1.42","Sweet Potatoes","Annual (Hardy)","0.45","Deeper and more resilient root system than Irish potatoes."
-"1.43","Tomato","Annual (Sensitive)","0.6","Fresh market types require high WHC for fruit quality/sizing."
-"2.03","Turfgrass (cool-season)","Non-woody Perennial","0.4","Fibrous mat; 0.4 prevents dormancy/browning in CA heat."
-"2.04","Turfgrass (warm-season)","Non-woody Perennial","0.4","Fibrous mat; 0.4 prevents dormancy/browning in CA heat."
-"1.44","Vegetables","Annual (Sensitive)","0.6","General category for high-turnover, shallow-rooted garden crops."
-"3.08","Walnuts","Woody Perennial","0.35","Large trees with deep roots but highly sensitive to late-season stress."
-"1.46","Watermelon","Annual (Hardy)","0.45","Deep roots; can handle more depletion than other melons."
-"1.45","Wheat","Annual (Hardy)","0.45","Standard grain scavenging profile."
-"1.33","Rice","Rice","1.0","WHC logic should be bypassed for this crop."
+crop_number,crop_name,Category,rooting_depth_m,whc_min_frac,whc_notes,rooting_depth_notes
+2.01,Alfalfa (annual),Non-woody Perennial,1.5,0.45,FAO-56 Table 22 (hay),FAO-56 Table 22
+1.01,Alfalfa (cycle),Non-woody Perennial,2,0.4,FAO-56 Table 22 (seed),FAO-56 Table 22
+3.01,Almonds,Woody Perennial,1.5,0.6,FAO-56 Table 22,FAO-56 Table 22
+3.02,Apple,Woody Perennial,1.5,0.5,FAO-56 Table 22,FAO-56 Table 22
+1.02,Artichokes,Non-woody Perennial,0.75,0.55,FAO-56 Table 22,FAO-56 Table 22
+1.03,Asparagus,Non-woody Perennial,1.5,0.55,FAO-56 Table 22,FAO-56 Table 22
+4.01,Avocado,Woody Perennial,0.75,0.3,FAO-56 Table 22,FAO-56 Table 22
+1.04,Barley,Annual (Hardy),1.25,0.45,FAO-56 Table 22,FAO-56 Table 22
+1.06,Beans (dry),Annual (Hardy),0.75,0.55,FAO-56 Table 22,FAO-56 Table 22
+,Beans (Fresh),Annual (Sensitive),0.6,0.55,FAO-56 Table 22,FAO-56 Table 22
+1.05,Beans (pinto),Annual (Hardy),0.6,0.55,FAO-56 Table 22,FAO-56 Table 22
+1.08,Beets (table),Annual (Sensitive),0.8,0.5,FAO-56 Table 22,FAO-56 Table 22
+1.09,Broccoli,Annual (Sensitive),0.5,0.55,FAO-56 Table 22,FAO-56 Table 22
+1.1,Cabbage,Annual (Sensitive),0.65,0.55,FAO-56 Table 22,FAO-56 Table 22
+1.11,Carrots,Annual (Sensitive),0.75,0.65,FAO-56 Table 22,FAO-56 Table 22
+1.12,Celery,Annual (Sensitive),0.4,0.80,FAO-56 Table 22,FAO-56 Table 22
+4.03,Citrus (<3.0 m tall),Woody Perennial,0.95,0.50,FAO-56 Table 22,FAO-56 Table 22
+4.02,Citrus (>3.8 m tall),Woody Perennial,1.35,0.50,FAO-56 Table 22,FAO-56 Table 22
+1.13,Corn (grain),Annual (Hardy),1.35,0.45,FAO-56 Table 22,FAO-56 Table 22
+1.14,Corn (silage),Annual (Hardy),1.0,0.50,FAO-56 Table 22,FAO-56 Table 22
+1.15,Cotton,Annual (Hardy),1.35,0.35,FAO-56 Table 22,FAO-56 Table 22
+1.16,Cucumber,Annual (Sensitive),0.95,0.50,FAO-56 Table 22,FAO-56 Table 22
+4.04,Date Palm,Woody Perennial,2.0,0.50,FAO-56 Table 22,FAO-56 Table 22
+1.17,Eggplant,Annual (Hardy),0.95,0.55,FAO-56 Table 22,FAO-56 Table 22
+4.05,Evergreen,Woody Perennial,1.25,0.35,FAO-56 Table 22,FAO-56 Table 22
+1.18,Flax,Annual (Hardy),1.25,0.50,FAO-56 Table 22,FAO-56 Table 22
+1.19,Grains (small),Annual (Hardy),1.25,0.45,FAO-56 Table 22,FAO-56 Table 22
+1.2,Grains (winter),Annual (Hardy),1.65,0.45,FAO-56 Table 22,FAO-56 Table 22
+3.05,Grapes (raisin),Woody Perennial,1.5,0.65,FAO-56 Table 22,FAO-56 Table 22
+3.04,Grapes (table),Woody Perennial,1.5,0.35,FAO-56 Table 22,FAO-56 Table 22
+3.03,Grapes (wine),Woody Perennial,1.5,0.55,FAO-56 Table 22,FAO-56 Table 22
+2.02,Improved Pasture,Non-woody Perennial,1.0,0.40,FAO-56 Table 22,FAO-56 Table 22
+3.06,Kiwifruit,Woody Perennial,1.0,0.65,FAO-56 Table 22,FAO-56 Table 22
+1.21,Lentil,Annual (Hardy),0.7,0.50,FAO-56 Table 22,FAO-56 Table 22
+1.22,Lettuce,Annual (Sensitive),0.4,0.70,FAO-56 Table 22,FAO-56 Table 22
+1.23,Melon,Annual (Hardy),1.2,0.55,FAO-56 Table 22,FAO-56 Table 22
+1.24,Millet,Annual (Hardy),1.5,0.45,FAO-56 Table 22,FAO-56 Table 22
+1.25,Mustard,Annual (Hardy),60,0.45,Rapid growth and reasonable scavenging ability.,"Limited CA data; 0.6 m estimated from brassica analogues (canola, rapeseed)"
+1.26,Oats,Annual (Hardy),1.25,0.45,FAO-56 Table 22,FAO-56 Table 22
+4.06,Olives,Woody Perennial,1.45,0.35,FAO-56 Table 22,FAO-56 Table 22
+1.27,Onion (dry),Annual (Sensitive),0.45,0.70,FAO-56 Table 22,FAO-56 Table 22
+1.28,Onion (green),Annual (Sensitive),0.45,0.70,FAO-56 Table 22,FAO-56 Table 22
+1.29,Peas,Annual (Sensitive),0.8,0.65,FAO-56 Table 22,FAO-56 Table 22
+1.3,Peppers,Annual (Sensitive),0.75,0.70,FAO-56 Table 22,FAO-56 Table 22
+1.31,Potato,Annual (Sensitive),0.5,0.65,FAO-56 Table 22,FAO-56 Table 22
+1.32,Radishes,Annual (Sensitive),0.4,0.70,FAO-56 Table 22,FAO-56 Table 22
+1.33,Rice,Rice,0.75,1.0,FAO-56 Table 22,FAO value is 0.8 but WHC logic should be bypassed
+1.34,Safflower,Annual (Hardy),1.5,0.40,FAO-56 Table 22,FAO-56 Table 22
+1.35,Sisal,Annual (Hardy),0.75,0.20,FAO-56 Table 22,FAO-56 Table 22
+1.36,Sorghum,Annual (Hardy),1.5,0.45,FAO-56 Table 22,FAO-56 Table 22
+1.37,Spinach,Annual (Sensitive),0.4,0.80,FAO-56 Table 22,FAO-56 Table 22
+1.38,Squash,Annual (Hardy),0.8,0.50,FAO-56 Table 22,FAO-56 Table 22
+3.07,Stone fruits,Woody Perennial,1.5,0.50,FAO-56 Table 22,FAO-56 Table 22
+1.39,Strawberries w/mulch,Annual (Sensitive),0.25,0.80,FAO-56 Table 22,FAO-56 Table 22
+1.47,Sudan grass,Annual (Hardy),1.25,0.45,FAO-56 Table 22,FAO-56 Table 22
+1.4,Sugarbeet,Annual (Hardy),0.95,0.45,FAO-56 Table 22,FAO-56 Table 22
+3.48,Sugarcane,Non-woody Perennial,1.6,0.35,FAO-56 Table 22,FAO-56 Table 22
+1.41,Sunflower,Annual (Hardy),1.15,0.55,FAO-56 Table 22,FAO-56 Table 22
+1.42,Sweet Potatoes,Annual (Hardy),1.25,0.35,FAO-56 Table 22,FAO-56 Table 22
+1.43,Tomato,Annual (Sensitive),1.1,0.60,FAO-56 Table 22,FAO-56 Table 22
+2.03,Turfgrass (cool-season),Non-woody Perennial,0.75,0.60,FAO-56 Table 22,FAO-56 Table 22
+2.04,Turfgrass (warm-season),Non-woody Perennial,0.75,0.50,FAO-56 Table 22,FAO-56 Table 22
+1.44,Vegetables,Annual (Sensitive),0.4,0.70,FAO-56 Table 22,FAO-56 Table 22
+3.08,Walnuts,Woody Perennial,2.05,0.50,FAO-56 Table 22,FAO-56 Table 22
+1.46,Watermelon,Annual (Hardy),1.15,0.60,FAO-56 Table 22,FAO-56 Table 22
+1.45,Wheat,Annual (Hardy),1.25,0.45,FAO-56 Table 22,FAO-56 Table 22
diff --git a/modules/data.land/data/crop_whc.rda b/modules/data.land/data/crop_whc.rda
index 10f58d95f836c43504dfdca450ce87e132e0982e..bc94230fe19c1c2a413143b9052ea91131634547 100644
GIT binary patch
literal 1624
zcmV-e2B-N#T4*^jL0KkKS*P?>@Bj;$fA9bQ-+(}8|Ly<p-@w1`|KLCX05AXn047N!
zZDD@#;0B%W=EXIkIg7275h#$#Q_zUoo|<JINE!j4&<2A;K+tF#Q_^Sv00000N+4(=
zCYmy6(<*+7ekuSOXf)6=05+fofuI5E27nB30000000000000000009ZY3V4YqZ3Uu
zG|7<DAPkrSG-;zC$YKD<G%y5Y0iy>1000000000000000001%si3pk|1ZZTZl=D>b
znWXfdpay7w2dEkkP#QfzWc58qsL<0-2~C1ab7Sy7vS=a#ME{gTx+T%B@rp_!m}0+9
zuO-yl&wwNXT32O`fkDZF0vc>eWQahG#wFWg@tR2p=jahPlXG{6j?x5H(Qz9Def=!r
zsSD+D?qWc-+5u>wYG}a9$pMDUJYJ1|N8RPhz(8jWB!r6?kQZ2kYp%NgS6cY$-tjO@
zru1T~9Ws=qDAA(K=4N4;r5IPuqA@B=XOSN|FU!~D#JI6PCU%D+_9Zw)=f=4r?~=WV
z!P(qOB_UfRuA>igoh(3=9K}2uYV}=rHDFaw2{?*?a2tRCfKjSey%ZqOV0!woHY6s*
zh82ymjAp&^laR@=u?>w9i6Z$)MB_cywa{s_0$4<ZkibG#FiCE(7wVE+V2U6l0@QlV
zuwXVc;D+02q#;8Y0E9pUK*((hOhAEF10ke<w9-OC;cBKJ>of@pVk(4@ePKvP;RAZg
z{0<;n?;-#Us<{M1PNV<@`YWPZHPfv4KKDDC&Fgovmbx8$UZMaPuGSA6ux6bmHip{<
z$fd#UQjBSg7z7gO>2a%JB$D)3RH{i-<??QN(uXokgPE4bF{FXx9nLLh)W$Iiq1R=r
zC~Se5plt(a8%nem>nNRd(srnlNmdiEZ6uOuPH|mTwFowTI}m)8st7yW+>nl13f<Rs
z)E8-RObC@us-uAb15rv-&_DrP<w~lqB6O81B9xwo0m;>2j+n;OHG?dan-hit0AmsX
zwgOGK1ehU#0MI)kLI4611Q416h#@GGTnGRR$K}l+8UTTVpPa3UFHU^{{p(mEA!;Iw
z6sgG$0y(3!NhIw(B0GwNFVdj^#;9H+Tyc__)BqDg058)m?1b$=kw@Cn?bKevAiZH`
zT}!pBeapxIEBi}+Ebc_I2}cjAjqvP{c1P@<*~UQx6o~jpm%wgc>7W3&HEf2ByItSf
z-4XBb2_L{DUM_pDi>rH*+yOnH&6+2cV1V1CI~YWiKr-~|;kbgj=9LucMHP-x2N+8F
z{2SK49~YIu8m2f|F^&WP=2lJ8uQRwF=+}ph^oa?M;uB=`>vBL6h{P7x%&IR0CJ1Z*
z8CkTLsAfa}wZpP>F^f26ab}2-bpq7U=|L$|0)~JCQ&xB<w`3?pf{1NRnhR;#lK_R&
zT%*P>)or}<HoqS|_ZQqE9pHh_yFkH)^b~WTA`^5mTKEgQmLH7dWtw)&xcxK4WO${B
z+d?2G0ukTu4UM$lr_2Ah(9N6Wt&Nkdi+Ny%GG-srxupYtWiJ`;YwNJt!m18IwIR@q
z7VOwWLC^6Dap9}u$*!=|hFqm-a|DON^B%oY&07c?2jI9sUatmvV^jblszKtA!;{jf
zA_^b@1sDNytFqMUmX*9%qyQf1uJXvxO87IYpp4Sk+6<FOsKjU4w^3*lRh!<zpeU;O
zoSpmPs~ai}#3C?|Bw<KJWw2_ZVhD(J0z}K%bk**ox+F@5<g~0}3cD2*+%cShJ!$O*
z!%S_a+hezF1i&^n494kvFa>naF-XEXBL4GPw9$8`dU-PRay%NqIS^1eN{LR%XgNE0
zCcLZ&nxhh;A+iOZjfXtN;os6aXBrfjO8VJyRNOBi#4iR{1L9o#RgA9DR3}4LQ-U1g
zPFbPo@D$5SZmOwCRv?O&4<E79fmI9K;{{(;BNC(TF5Nl0m?vdS$Xw<Pydl3EL?$3H
z1>e<B1&Ju)e&iLY3IZYfNRG<yXB=QSn+DCxhPQN9r$bw2#2{aPMfxEe^1%a#(K$*n
zs0mLl1(_r<fTf|27i;uFNbmRqmG~fr!b6T}E_0$HaotHT@0*YkT<1dpv4bcGnn@+T
WC4B?JCWG_#h1`)&6eKDA6?_0=<=Gkl

literal 2703
zcmV;A3UKv8T4*^jL0KkKSx+&(dH@K0fA9bQ-GESU|NsB*-@w1`|L{Nn05AXn00sdh
z2mn9<;0ymud9K_6(~CH8m7L%W3KA%w)|xtY2_-T#(x<f3RQ(MoZ2~Yr0004@>S$;*
zJrD*!&@|8xL?DyOdY+R{Pyy);qyW$W4FCq30010lXlQ5#fW!>}kN^gn02%?1000am
z)JR27$*MGZdQkNZsiuP_gFrn%05kvq0MI=^aiO81pc(@ZGzLHb8fXA$20#D+Fp(e-
znhA<&iKRRg^qG`288M{OKzf5sG5`jFk)cik`Q^f?wWJC{-C1iuhJ*x&DJ0quzKasd
z!t=0_H~+vrsnwcbK`SJM5Vk}u%*qLz9+r3F>JfpgNt43sKyoUMr{|jr^6NS%KwKFh
zL;zfb15$xGItNG5$JQNgj=lGsNQfox)0<UPV$o4lwL+$59?@1FlX*>R*+ekD4{Pb`
zkHdd|gO90aliQm-F#Asz4ED3zy)R8fz~&i^HA>lwFjZDXDC>8^*6xYb#r6tVW~hu6
zV#BcZTBtB8JnAUqp-NPO#1x>dGTnVC14NP?><#E4DnTIwAO#jl0B-^MK%SCtmMrvD
z6%|;l2B2FY1T!L11Pe5TB!3pFz!A{UiGf%UYIA$gE|kO{Co?%P%%01@ta`>6q?vG^
zXprewgL+Cnu~E&Do0VkVh~kOYTM8@~5c_4srpQa}8W8}#<Y87HbqqtL1-w>`d(eUd
ziWy|8VcAgOhUJs3WxAs<*$uqd52Oq^fXZ0mBy2%0HPRW?shf6tBDI`~8Y<}uF>q`%
zZg>l!%xtL7KRXR?8jGUiWC(Z*(h(6MD|k&K1`e8Bz=U_S9fKQ&oZ;=QvwFMZFf0~}
zGh9vC%Y|>zINT@-fTaaWxV4u94B<wg&WwYk-W=>dm?Gy6viid0lcwE6Ft7)VZV2Mr
za|U$M&bA2bs7C$()EgvA7pPG3>x_lEJ_^d<q&emGkdOcaII9-e&zIGNl29s<$tf7(
zsTd4y`hQI8?c3GjHeb)$=eIlCd@CDv04IU#Jzee@?=HC_w0<ncr$Sih!pB9kGQ%#5
zS5<|Fy~E30qD9rbE?$M`Dy*ehnVFeXsZvrGfuW5kIe9rk$cM{3zMpw93pbqe(u>Ro
zs?r9~HkFV_t!pbfg+y#snM+dBC?NBE7(wvaP!NCN1SM>qRD&ujiGiH#?ORFG6f{H)
zU7}_{_?e<wZAj5E$pg7X?h$zuAm!qIM*!wk@oM;x7EW%rmW4daLr9fP2qJ??Oi^hm
zU@L$;c*EEDI{wTF@Q^uu&u=Y2vUzqGuVdi2ss#H(N=Ian)z{g%G_hYms_%UF88NM|
zB);<>J@VmR_9~~f2;9i|4vcSbcxyjEYkv!!1Pciz&V@@#_*qOg^E5Q1xMY9040g?@
ze(3l}BudsVUOd;VGnQG^mLd%89GTF=c)jR`T~0o=wXGeq?;Zx>cbNLjSWSy2hNzU9
zePGBxNd6<*3B-J7yR*^iMkF?t34C)eF*|!XLjLxtrNl-}u=me`J^$Dwe8~usv8I%!
z@vLOtNLzw`<J!Tdfo%_ogbi^wY!PL#kZb|D7PF_HG+wh+ZHF0qtsf2pxdvqQB~Q8`
zQm4qY+wJ;zV|#qqp<14gWwTN;m0rg<ltQ{p3Z!DHRfxg+jFhpv#NYz1>=S0~JvpMP
zdx{m;3o9Y!aCQk?xJ(MuhXN83yr_dnfp`+|vh~SSe3Je29oIy<CGv)b5l7|y-N^Js
z-wl3GdXaK#kq|ni4lat(Cchy{l5|G7Rc&TP7*2Z&Zz`E$+?+Z(+Z2v<i#%O69y8B&
zAZ(C#n{~EY2LX&+G9)t_s1tF}dQ@F*9Y=orr+Z3ZC?ZIJ_J=BM3L#-81BIXFle=+i
zIU&?AMm#pjrvnP8N+9tp*2y7?r!%=_=31V#wFX!`Ys+v?2-C_}XkQf|oK8bgDH{5~
z4Uw|?(3#Atug#&7E9<SZmw5B%$DhjVHIP#4fbyS$kfXY{`c^_Nk~mp&Jj@$n2sSpf
zBYZo@uR=!iP)5|c+_nM5uAb(=WOc|Rs$Ax@jfV+#E$B5#rWn<25;L}%9fq_t&c|BR
z@6Wed-Ec6--xBA^@yks&b(jIQ#jN$$t!1#?w^cN!)7;HmH7hfamw$QDK+26VCd9sR
zD6U3Dt3`Q=Tj2pW;Xf=Q%wF}?gf41y4Ax}_2@VXG+oA%=QuP9LM+|uxsd-4F)&owH
zmN=&1h-(>2%7ryFzC~3Kxp_gI<b0-OGOE%6N~DvVoPg)KBz=={I>&^F4n6ahzgW*_
zEN#})4*Bdfn8rQ$zG#-O#U5O(pmP|@KM}r#Bv;4F=ifGPBtbN#BXQ$*&@Yyos<1Vv
z#>J!NgZO2RJ$B%{b(KtxU>S6#jE%@vszUWjZqP3Fqc#N5zwYr4j@vZ7+Swo+n(!8-
zO$Q>d@b0_VvEP7#^=Aea#ms>@+>^)_CQk`zq3A7_EL_6H%D-)dTos}TBp7;8cA>@|
zNhy%vwN03n8BTa_svxbYD&UQ*gP>w0N+#2)%`%gs+SgE2uZXI-A^=%6oX?X7xa5|N
z(PUX{#f4O$!Zw_Ba`AL9ixDo88D7Ju*38Catt9}mTHLtyd#HrDk_$Q#L&2d=l8Gaw
zAr{nXhHz3EY!Yz*HpVfhB10tDU=lSq@ayeJGV{IXjP4DKVo0fitz!Ksum^VMZ14&Y
zvmbipH~Ks!O$<;*Vw}KKlsmU&o|>7|4&jJ`s}}DN=K>RT7l8)$hM_3Ken{kCuVOG-
zLc;^<oVZDDqzG8bNpojY97j>26f~GIsepFhl%cV9ZSFJGtd&i7Uh2%_=HpwXi@YLT
z_(!nvN$j;tn<`TR$b?L)ps^ULAgT#Cj+ujz8ZWMEWoD02gw`sgC5Lo?;4GLv5JW;!
zn(MhvB@7toDnhN&M#V5BjENsenNe9#z-f%o<{jTtQ>*M_3}xAPW>jW^Y#G3SfuL<8
z07H#5OT-lWT^5-OHPN|kmf81Z#V&#?QU;i8A6Isyrv~X+_A<pIlrz0HP0kp$uH^v^
z%qrZ%1cP8qvjV|J6;nAAlIc+?19+T1G!SXY1-5ewO)V(MrWKrkfNPHLIZPzP>LyIy
zn)MTh!R6)8Zf4X-yV|QN<mS}&Yd;Dn30eRrC7{5c8eZ+<N>Y({a6&H<R&d7?Cd$*b
z$ZA9rs8JS<$`$I0;F+sK3ldwKfJkJKs8|I;7o_~Xjtl?;&O;+16p^uEd)gD)6<yX%
zw^IwXpzDH5H#4xeBN$u5I3Z_=;GobZqycy#F2)z=uq}3yq$vvq4RYr_nNbxY5Q&a*
z&iG<Yk~Gffcm$8?nWv%3msP2a`SYk6Eb=KlMf9amG$p*R=1Ebl-iDo<kdIj08pyuW
zQE=LM;>ae=O7Tf4(2nD{h*Le96t9ZYDkR?6M5JzE+vz-+JJH<=Cm0n_*PjOi7@Pw3
z(nEZul}8l16)qzHumi?w3ms}GM&9TPd=1b)f0TqD<O#?Zm_Qp~r%RSW4(9*G+>uTc
JBooYUo`8`#?STLQ


From bbf555b0c17d426054c56e70b754835f274c6be7 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 12:52:05 -0500
Subject: [PATCH 17/67] simplify ssurgo function documentation

---
 modules/data.land/R/gSSURGO_Query.R           | 96 ++++++-------------
 modules/data.land/man/ssurgo_mukeys.Rd        | 50 ++++++++++
 modules/data.land/man/ssurgo_mukeys_bbox.Rd   | 33 -------
 .../data.land/man/ssurgo_mukeys_bigbbox.Rd    | 33 -------
 modules/data.land/man/ssurgo_mukeys_point.Rd  | 37 -------
 5 files changed, 80 insertions(+), 169 deletions(-)
 create mode 100644 modules/data.land/man/ssurgo_mukeys.Rd
 delete mode 100644 modules/data.land/man/ssurgo_mukeys_bbox.Rd
 delete mode 100644 modules/data.land/man/ssurgo_mukeys_bigbbox.Rd
 delete mode 100644 modules/data.land/man/ssurgo_mukeys_point.Rd

diff --git a/modules/data.land/R/gSSURGO_Query.R b/modules/data.land/R/gSSURGO_Query.R
index a18bcdad5f2..21d6be6865b 100644
--- a/modules/data.land/R/gSSURGO_Query.R
+++ b/modules/data.land/R/gSSURGO_Query.R
@@ -138,29 +138,43 @@ gSSURGO.Query <- function(mukeys,
   
 }
 
-#' Get map unit keys (mukeys) from gSSURGO using a bounding box
+SSURGO_API_MAX_AREA_M2 <- 10100000000
+
+#' Get map unit keys (mukeys) from gSSURGO
 #'
-#' Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
-#' based on a bounding box spatial filter.
+#' These functions query the NRCS gSSURGO Web Feature Service to retrieve map
+#' unit keys based on different spatial filters.
 #'
-#' @param bbox Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).
-#'   Features that intersect the bounding box are returned.
+#' @param bbox Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84
+#'   (EPSG:4326). Features that intersect the bounding box are returned.
+#' @param point Numeric vector of length 2: c(lon, lat) in WGS84 (EPSG:4326).
+#' @param distance Numeric. Distance in meters from the point. Use 0 for exact
+#'   point intersection.
 #'
 #' @return Character vector of unique map unit keys (mukeys).
 #'
 #' @details
-#' This function uses the NRCS SDM Data Access Web Feature Service:
+#' These functions use the NRCS SDM Data Access Web Feature Service:
 #' \url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
 #'
-#' The total extent of the bounding box cannot exceed 10,100,000,000 square
-#' meters (~3,900 square miles). Use \code{ssurgo_mukeys_bigbbox()} for large
-#' bounding boxes.
+#' The total extent cannot exceed 10,100,000,000 square meters (~3,900 square
+#' miles). Use `ssurgo_mukeys_bigbbox()` for large bounding boxes.
 #'
 #' @examples
 #' \dontrun{
 #' # Bounding box query
 #' mukeys <- ssurgo_mukeys_bbox(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
+#'
+#' # Point with distance (600m radius)
+#' mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 600)
+#'
+#' # Large bounding box
+#' mukeys <- ssurgo_mukeys_bigbbox(bbox = c(-120, 35, -110, 45))
 #' }
+#' @name ssurgo_mukeys
+NULL
+
+#' @rdname ssurgo_mukeys
 #' @export
 ssurgo_mukeys_bbox <- function(bbox) {
   if (!is.numeric(bbox) || length(bbox) != 4) {
@@ -176,7 +190,6 @@ ssurgo_mukeys_bbox <- function(bbox) {
     stop("bbox must have xmin < xmax and ymin < ymax")
   }
 
-  MAX_AREA <- 10100000000
   wgs84_crs <- sf::st_crs(4326)
 
   bbox_poly <- sf::st_polygon(list(
@@ -185,11 +198,11 @@ ssurgo_mukeys_bbox <- function(bbox) {
   bbox_sf <- sf::st_sfc(bbox_poly, crs = wgs84_crs)
   area <- as.numeric(sf::st_area(bbox_sf))
 
-  if (area > MAX_AREA) {
+  if (area > SSURGO_API_MAX_AREA_M2) {
     stop(
       paste0(
         "Bounding box area (", format(area, scientific = FALSE),
-        " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
+        " m²) exceeds maximum allowed area (", format(SSURGO_API_MAX_AREA_M2, scientific = FALSE),
         " m²). Use ssurgo_mukeys_bigbbox() for large bounding boxes."
       )
     )
@@ -233,32 +246,7 @@ ssurgo_mukeys_bbox <- function(bbox) {
   mukeys
 }
 
-#' Get map unit keys (mukeys) from gSSURGO using a point with distance
-#'
-#' Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
-#' based on a point and distance (DWithin) spatial filter.
-#'
-#' @param point Numeric vector of length 2: c(lon, lat) in WGS84 (EPSG:4326).
-#' @param distance Numeric. Distance in meters from the point.
-#'   Use 0 for exact point intersection.
-#'
-#' @return Character vector of unique map unit keys (mukeys).
-#'
-#' @details
-#' This function uses the NRCS SDM Data Access Web Feature Service:
-#' \url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
-#'
-#' The search radius area (π × distance²) cannot exceed 10,100,000,000 square
-#' meters (~3,900 square miles).
-#'
-#' @examples
-#' \dontrun{
-#' # Point with distance (600m radius)
-#' mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 600)
-#'
-#' # Point with zero distance (exact intersection)
-#' mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 0)
-#' }
+#' @rdname ssurgo_mukeys
 #' @export
 ssurgo_mukeys_point <- function(point, distance) {
   if (length(point) != 2) {
@@ -272,13 +260,12 @@ ssurgo_mukeys_point <- function(point, distance) {
   lon <- point[1]
   lat <- point[2]
 
-  MAX_AREA <- 10100000000
   circle_area <- pi * (distance^2)
-  if (circle_area > MAX_AREA) {
+  if (circle_area > SSURGO_API_MAX_AREA_M2) {
     stop(
       paste0(
         "Search radius area (", format(circle_area, scientific = FALSE),
-        " m²) exceeds maximum allowed area (", format(MAX_AREA, scientific = FALSE),
+        " m²) exceeds maximum allowed area (", format(SSURGO_API_MAX_AREA_M2, scientific = FALSE),
         " m²)."
       )
     )
@@ -334,29 +321,7 @@ ssurgo_mukeys_point <- function(point, distance) {
   mukeys
 }
 
-#' Get map unit keys (mukeys) from gSSURGO for large bounding boxes
-#'
-#' Queries the NRCS gSSURGO Web Feature Service for large bounding boxes
-#' by dividing the area into smaller cells that comply with the API's
-#' 10,100,000,000 m² extent limit.
-#'
-#' @param bbox Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).
-#'
-#' @return Character vector of unique map unit keys (mukeys).
-#'
-#' @details
-#' This function divides a large bounding box into smaller cells,
-#' each with an area less than 10,100,000,000 square meters,
-#' then queries each cell individually in parallel and combines the results.
-#'
-#' The grid is created using \code{sf::st_area()} to calculate the bbox area
-#' in meters, then divided into appropriately-sized cells.
-#'
-#' @examples
-#' \dontrun{
-#' # Large bounding box covering a significant area
-#' mukeys <- ssurgo_mukeys_bigbbox(bbox = c(-120, 35, -110, 45))
-#' }
+#' @rdname ssurgo_mukeys
 #' @export
 ssurgo_mukeys_bigbbox <- function(bbox) {
   if (!is.numeric(bbox) || length(bbox) != 4) {
@@ -372,7 +337,6 @@ ssurgo_mukeys_bigbbox <- function(bbox) {
     stop("bbox must have xmin < xmax and ymin < ymax")
   }
 
-  MAX_AREA <- 10100000000
   wgs84_crs <- sf::st_crs(4326)
 
   bbox_poly <- sf::st_polygon(list(
@@ -388,7 +352,7 @@ ssurgo_mukeys_bigbbox <- function(bbox) {
 
   aspect_ratio <- width_deg / height_deg
 
-  n_cells <- ceiling(bbox_area / MAX_AREA)
+  n_cells <- ceiling(bbox_area / SSURGO_API_MAX_AREA_M2)
   cells_per_side <- sqrt(n_cells)
 
   ncol_cells <- ceiling(cells_per_side * sqrt(aspect_ratio))
diff --git a/modules/data.land/man/ssurgo_mukeys.Rd b/modules/data.land/man/ssurgo_mukeys.Rd
new file mode 100644
index 00000000000..de370953f53
--- /dev/null
+++ b/modules/data.land/man/ssurgo_mukeys.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gSSURGO_Query.R
+\name{ssurgo_mukeys}
+\alias{ssurgo_mukeys}
+\alias{ssurgo_mukeys_bbox}
+\alias{ssurgo_mukeys_point}
+\alias{ssurgo_mukeys_bigbbox}
+\title{Get map unit keys (mukeys) from gSSURGO}
+\usage{
+ssurgo_mukeys_bbox(bbox)
+
+ssurgo_mukeys_point(point, distance)
+
+ssurgo_mukeys_bigbbox(bbox)
+}
+\arguments{
+\item{bbox}{Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84
+(EPSG:4326). Features that intersect the bounding box are returned.}
+
+\item{point}{Numeric vector of length 2: c(lon, lat) in WGS84 (EPSG:4326).}
+
+\item{distance}{Numeric. Distance in meters from the point. Use 0 for exact
+point intersection.}
+}
+\value{
+Character vector of unique map unit keys (mukeys).
+}
+\description{
+These functions query the NRCS gSSURGO Web Feature Service to retrieve map
+unit keys based on different spatial filters.
+}
+\details{
+These functions use the NRCS SDM Data Access Web Feature Service:
+\url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
+
+The total extent cannot exceed 10,100,000,000 square meters (~3,900 square
+miles). Use `ssurgo_mukeys_bigbbox()` for large bounding boxes.
+}
+\examples{
+\dontrun{
+# Bounding box query
+mukeys <- ssurgo_mukeys_bbox(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
+
+# Point with distance (600m radius)
+mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 600)
+
+# Large bounding box
+mukeys <- ssurgo_mukeys_bigbbox(bbox = c(-120, 35, -110, 45))
+}
+}
diff --git a/modules/data.land/man/ssurgo_mukeys_bbox.Rd b/modules/data.land/man/ssurgo_mukeys_bbox.Rd
deleted file mode 100644
index adae7f0f524..00000000000
--- a/modules/data.land/man/ssurgo_mukeys_bbox.Rd
+++ /dev/null
@@ -1,33 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gSSURGO_Query.R
-\name{ssurgo_mukeys_bbox}
-\alias{ssurgo_mukeys_bbox}
-\title{Get map unit keys (mukeys) from gSSURGO using a bounding box}
-\usage{
-ssurgo_mukeys_bbox(bbox)
-}
-\arguments{
-\item{bbox}{Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).
-Features that intersect the bounding box are returned.}
-}
-\value{
-Character vector of unique map unit keys (mukeys).
-}
-\description{
-Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
-based on a bounding box spatial filter.
-}
-\details{
-This function uses the NRCS SDM Data Access Web Feature Service:
-\url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
-
-The total extent of the bounding box cannot exceed 10,100,000,000 square
-meters (~3,900 square miles). Use \code{ssurgo_mukeys_bigbbox()} for large
-bounding boxes.
-}
-\examples{
-\dontrun{
-# Bounding box query
-mukeys <- ssurgo_mukeys_bbox(bbox = c(-114.006, 32.1823, -113.806, 32.2823))
-}
-}
diff --git a/modules/data.land/man/ssurgo_mukeys_bigbbox.Rd b/modules/data.land/man/ssurgo_mukeys_bigbbox.Rd
deleted file mode 100644
index d9128d23910..00000000000
--- a/modules/data.land/man/ssurgo_mukeys_bigbbox.Rd
+++ /dev/null
@@ -1,33 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gSSURGO_Query.R
-\name{ssurgo_mukeys_bigbbox}
-\alias{ssurgo_mukeys_bigbbox}
-\title{Get map unit keys (mukeys) from gSSURGO for large bounding boxes}
-\usage{
-ssurgo_mukeys_bigbbox(bbox)
-}
-\arguments{
-\item{bbox}{Numeric vector of length 4: c(xmin, ymin, xmax, ymax) in WGS84 (EPSG:4326).}
-}
-\value{
-Character vector of unique map unit keys (mukeys).
-}
-\description{
-Queries the NRCS gSSURGO Web Feature Service for large bounding boxes
-by dividing the area into smaller cells that comply with the API's
-10,100,000,000 m² extent limit.
-}
-\details{
-This function divides a large bounding box into smaller cells,
-each with an area less than 10,100,000,000 square meters,
-then queries each cell individually in parallel and combines the results.
-
-The grid is created using \code{sf::st_area()} to calculate the bbox area
-in meters, then divided into appropriately-sized cells.
-}
-\examples{
-\dontrun{
-# Large bounding box covering a significant area
-mukeys <- ssurgo_mukeys_bigbbox(bbox = c(-120, 35, -110, 45))
-}
-}
diff --git a/modules/data.land/man/ssurgo_mukeys_point.Rd b/modules/data.land/man/ssurgo_mukeys_point.Rd
deleted file mode 100644
index fd65da8849d..00000000000
--- a/modules/data.land/man/ssurgo_mukeys_point.Rd
+++ /dev/null
@@ -1,37 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gSSURGO_Query.R
-\name{ssurgo_mukeys_point}
-\alias{ssurgo_mukeys_point}
-\title{Get map unit keys (mukeys) from gSSURGO using a point with distance}
-\usage{
-ssurgo_mukeys_point(point, distance)
-}
-\arguments{
-\item{point}{Numeric vector of length 2: c(lon, lat) in WGS84 (EPSG:4326).}
-
-\item{distance}{Numeric. Distance in meters from the point.
-Use 0 for exact point intersection.}
-}
-\value{
-Character vector of unique map unit keys (mukeys).
-}
-\description{
-Queries the NRCS gSSURGO Web Feature Service to retrieve map unit keys
-based on a point and distance (DWithin) spatial filter.
-}
-\details{
-This function uses the NRCS SDM Data Access Web Feature Service:
-\url{https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm}
-
-The search radius area (π × distance²) cannot exceed 10,100,000,000 square
-meters (~3,900 square miles).
-}
-\examples{
-\dontrun{
-# Point with distance (600m radius)
-mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 600)
-
-# Point with zero distance (exact intersection)
-mukeys <- ssurgo_mukeys_point(point = c(-91.22, 38.46), distance = 0)
-}
-}

From b7e8410dadf5d317a4ed4267803c54419a27ad2c Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 13:25:57 -0500
Subject: [PATCH 18/67] allow time-varying WHC

necessary to support crop changes (which change the rooting depth).
---
 modules/data.land/R/water_balance.R           | 71 +++++++++++--------
 modules/data.land/man/apply_water_balance.Rd  |  5 +-
 modules/data.land/man/calc_water_balance.Rd   | 11 +--
 .../tests/testthat/test-water_balance.R       | 28 ++++++++
 4 files changed, 81 insertions(+), 34 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 035ecc12558..ec2c04c1435 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -21,14 +21,17 @@
 #' @param et Vector of evapotranspiration values (distance / time)
 #' @param precip Vector of precipitation values (distance / time)
 #' @param whc Water holding capacity (distance); the plant-available range from
-#'   wilting point to field capacity (i.e., `whc = field_capacity - wilting_point`)
+#'   wilting point to field capacity (i.e., `whc = field_capacity - wilting_point`).
+#'   Can be a single value or a vector of the same length as `et`.
 #' @param whc_min_frac Fraction of WHC for minimum water level (irrigation
-#'   trigger); unused if `w_min` is explicitly specified
+#'   trigger); unused if `w_min` is explicitly specified.
+#'   Can be a single value or a vector of the same length as `et`.
 #' @param W_initial Initial soil water content at start of time series
-#'   (distance); defaults to `whc` (field capacity) if NULL
+#'   (distance); defaults to `whc[1]` (field capacity) if NULL
 #' @param w_min Minimum water level threshold (distance); irrigation is
 #'   triggered when soil water falls below this level; defaults to
-#'   `whc_min_frac * whc` if NULL
+#'   `whc_min_frac * whc` if NULL.
+#'   Can be a single value or a vector of the same length as `et`.
 #' @param seepage_rate Daily seepage loss for rice paddies (distance / time);
 #'   only used when `is_rice = TRUE`
 #' @param is_rice Logical; if TRUE, applies a constant seepage loss (mm/day)
@@ -67,16 +70,21 @@ calc_water_balance <- function(
     )
   }
 
-  if (!length(whc) == 1) {
-    PEcAn.logger::logger.severe(
-      "whc must have length 1; actual length = ", length(whc)
-    )
+  ensure_vec <- function(x, n, name) {
+    if (length(x) == 1) {
+      rep(x, n)
+    } else if (length(x) == n) {
+      x
+    } else {
+      PEcAn.logger::logger.severe(
+        sprintf("%s must have length 1 or %d; actual length = %d", name, n, length(x))
+      )
+    }
   }
 
-  if (!length(whc_min_frac) == 1) {
-    PEcAn.logger::logger.severe(
-      "whc_min_frac must have length 1; actual length = ", length(whc_min_frac)
-    )
+  whc <- ensure_vec(whc, n, "whc")
+  if (!is.null(whc_min_frac)) {
+    whc_min_frac <- ensure_vec(whc_min_frac, n, "whc_min_frac")
   }
 
   if (is_rice && is.null(seepage_rate)) {
@@ -84,12 +92,17 @@ calc_water_balance <- function(
   }
 
   if (is.null(w_min)) {
+    if (is.null(whc_min_frac)) {
+      PEcAn.logger::logger.severe("Either whc_min_frac or w_min must be provided")
+    }
     w_min <- whc_min_frac * whc
+  } else {
+    w_min <- ensure_vec(w_min, n, "w_min")
   }
 
   if (is.null(W_initial)) {
     # Initialize at field capacity (i.e., full WHC)
-    W_prev <- whc
+    W_prev <- whc[1]
   } else {
     W_prev <- W_initial
   }
@@ -100,27 +113,27 @@ calc_water_balance <- function(
 
   for (t in seq_len(n)) {
     # Only water above w_min is available for seepage
-    seepage <- if (is_rice) min(seepage_rate, max(0, W_prev - w_min)) else 0.0
+    seepage <- if (is_rice) min(seepage_rate, max(0, W_prev - w_min[t])) else 0.0
 
     # Potential state after precip and ET
     W0 <- W_prev + precip[t] - et[t] - seepage
 
     # If W0 falls below w_min (e.g., high ET and seepage; low precip), irrigate
     # to field capacity (i.e., full WHC).
-    if (W0 < w_min) {
-      irr[t] <- whc - W0
-      W0 <- whc
+    if (W0 < w_min[t]) {
+      irr[t] <- whc[t] - W0
+      W0 <- whc[t]
     } else {
       irr[t] <- 0
     }
 
     # If W0 exceeds field capacity (i.e., whc), the difference is runoff.
-    if (W0 > whc) {
-      runoff[t] <- W0 - whc
-      W_t[t] <- whc
+    if (W0 > whc[t]) {
+      runoff[t] <- W0 - whc[t]
+      W_t[t] <- whc[t]
     } else {
       runoff[t] <- 0
-      W_t[t] <- max(W0, w_min)
+      W_t[t] <- max(W0, w_min[t])
     }
 
     W_prev <- W_t[t]
@@ -137,9 +150,10 @@ calc_water_balance <- function(
 #' `calc_water_balance`, the units here *do* matter -- they should be `mm_day`.
 #'
 #' @param df Data frame with columns: `date`, `location_id`, `etc_mm_day`,
-#' `precip_mm_day`, and `whc_min_frac`
+#' `precip_mm_day`, and `whc_min_frac` (optional, defaults to 0.375).
+#' If a `whc_mm` column is present, it is used as the water holding capacity.
 #' @param idcol Column name for grouping (typically, `location_id`, `parcel_id` or similar)
-#' @param whc_mm Water holding capacity (mm)
+#' @param whc_mm Water holding capacity (mm); ignored if `whc_mm` is a column in `df`.
 #' @return Data frame with added columns: `W_t`, `irr`, `runoff`
 #' @export
 apply_water_balance <- function(df, idcol, whc_mm = 500) {
@@ -172,6 +186,10 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
     df[["whc_min_frac"]] <- default_whc_min_frac
   }
 
+  if (!("whc_mm" %in% colnames(df))) {
+    df[["whc_mm"]] <- whc_mm
+  }
+
   df |>
     dplyr::arrange(.data[[idcol]], .data$date) |> # nolint: object_usage_linter
     dplyr::mutate(
@@ -182,11 +200,8 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
       results = tibble::as_tibble(calc_water_balance(
         et = .data$etc_mm_day,
         precip = .data$precip_mm_day,
-        whc = whc_mm,
-        # NOTE: Use unique here because, in a merged crop data frame, this gets
-        # expanded to a vector of values. They *should* all be unique per
-        # `idcol`; if they're not, `calc_water_balance` should fail loudly.
-        whc_min_frac = unique(.data$whc_min_frac)
+        whc = .data$whc_mm,
+        whc_min_frac = .data$whc_min_frac
       )),
       .by = dplyr::all_of(idcol)
     ) |>
diff --git a/modules/data.land/man/apply_water_balance.Rd b/modules/data.land/man/apply_water_balance.Rd
index 42bba5888f7..29848a143aa 100644
--- a/modules/data.land/man/apply_water_balance.Rd
+++ b/modules/data.land/man/apply_water_balance.Rd
@@ -8,11 +8,12 @@ apply_water_balance(df, idcol, whc_mm = 500)
 }
 \arguments{
 \item{df}{Data frame with columns: `date`, `location_id`, `etc_mm_day`,
-`precip_mm_day`, and `whc_min_frac`}
+`precip_mm_day`, and `whc_min_frac` (optional, defaults to 0.375).
+If a `whc_mm` column is present, it is used as the water holding capacity.}
 
 \item{idcol}{Column name for grouping (typically, `location_id`, `parcel_id` or similar)}
 
-\item{whc_mm}{Water holding capacity (mm)}
+\item{whc_mm}{Water holding capacity (mm); ignored if `whc_mm` is a column in `df`.}
 }
 \value{
 Data frame with added columns: `W_t`, `irr`, `runoff`
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index 3a44b20d259..8ab5cb9dac3 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -21,17 +21,20 @@ calc_water_balance(
 \item{precip}{Vector of precipitation values (distance / time)}
 
 \item{whc}{Water holding capacity (distance); the plant-available range from
-wilting point to field capacity (i.e., `whc = field_capacity - wilting_point`)}
+wilting point to field capacity (i.e., `whc = field_capacity - wilting_point`).
+Can be a single value or a vector of the same length as `et`.}
 
 \item{whc_min_frac}{Fraction of WHC for minimum water level (irrigation
-trigger); unused if `w_min` is explicitly specified}
+trigger); unused if `w_min` is explicitly specified.
+Can be a single value or a vector of the same length as `et`.}
 
 \item{W_initial}{Initial soil water content at start of time series
-(distance); defaults to `whc` (field capacity) if NULL}
+(distance); defaults to `whc[1]` (field capacity) if NULL}
 
 \item{w_min}{Minimum water level threshold (distance); irrigation is
 triggered when soil water falls below this level; defaults to
-`whc_min_frac * whc` if NULL}
+`whc_min_frac * whc` if NULL.
+Can be a single value or a vector of the same length as `et`.}
 
 \item{seepage_rate}{Daily seepage loss for rice paddies (distance / time);
 only used when `is_rice = TRUE`}
diff --git a/modules/data.land/tests/testthat/test-water_balance.R b/modules/data.land/tests/testthat/test-water_balance.R
index f55af788f09..998c4e9dda8 100644
--- a/modules/data.land/tests/testthat/test-water_balance.R
+++ b/modules/data.land/tests/testthat/test-water_balance.R
@@ -56,3 +56,31 @@ test_that("calc_water_balance: more ET leads to more irrigation", {
   expect_nonnegative(result_low)
   expect_nonnegative(result_high)
 })
+
+test_that("calc_water_balance: vector parameters work", {
+  n <- 10
+  et <- rep(5, n)
+  precip <- rep(0, n)
+  
+  # WHC decreases halfway through
+  whc_vec <- c(rep(100, 5), rep(50, 5))
+  whc_min_frac <- 0.5
+  
+  # Constant WHC for comparison
+  result_const100 <- calc_water_balance(et, precip, 100, whc_min_frac)
+  result_const50 <- calc_water_balance(et, precip, 50, whc_min_frac)
+  result_vec <- calc_water_balance(et, precip, whc_vec, whc_min_frac)
+  
+  # Irrigation should match the first half of 100
+  expect_equal(result_vec$irr[1:5], result_const100$irr[1:5])
+  # Step 6 might differ because it depends on W_t[5], which is different.
+  # But subsequent steps (7:10) should be identical because step 6 will force W_t[6] to whc[6] (50).
+  expect_equal(result_vec$irr[7:10], result_const50$irr[7:10])
+  
+  # w_min vector
+  w_min_vec <- c(rep(50, 5), rep(25, 5))
+  # whc_min_frac is required but ignored if w_min is provided
+  result_wmin_vec <- calc_water_balance(et, precip, 100, whc_min_frac = 0.5, w_min = w_min_vec)
+  expect_equal(result_wmin_vec$irr[1:5], result_const100$irr[1:5])
+})
+

From 3fb11276d29b0dd051a06e47f3708995273b89ab Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 13:53:23 -0500
Subject: [PATCH 19/67] Replace CIMIS ET vignette with a script

The vignette won't run with absolute file paths. But the script is still
useful as reference (and is a bit easier to test and debug than Quarto).
---
 .../CIMIS-event-files.R}                      | 257 +++++++++++-------
 1 file changed, 153 insertions(+), 104 deletions(-)
 rename modules/data.land/{vignettes/CIMIS-event-files.qmd => inst/CIMIS-event-files.R} (50%)

diff --git a/modules/data.land/vignettes/CIMIS-event-files.qmd b/modules/data.land/inst/CIMIS-event-files.R
similarity index 50%
rename from modules/data.land/vignettes/CIMIS-event-files.qmd
rename to modules/data.land/inst/CIMIS-event-files.R
index d04edbd2ff9..585affd65a9 100644
--- a/modules/data.land/vignettes/CIMIS-event-files.qmd
+++ b/modules/data.land/inst/CIMIS-event-files.R
@@ -1,74 +1,73 @@
----
-title: "Generating SIPNET event files from CIMIS and CHIRPS data"
-author: "Alexey N. Shiklomanov"
----
+#' ---
+#' title: "Example workflow generating SIPNET event files from CIMIS and CHIRPS data"
+#' author: "Alexey N. Shiklomanov"
+#' ---
 
-```{r}
-library(PEcAn.data.land)
-# devtools::load_all("modules/data.land")
-```
+if (interactive()) {
+  devtools::load_all("modules/data.land")
+} else {
+  library(PEcAn.data.land)
+}
 
-Start from a range of dates (2020 to present) and locations (`design_points.csv`).
+#' Define paths to relevant external data files
+design_points_path <- "~/projects/cimis-to-irrigation/design_points.csv"
+cimis_eto_cog_path <- "~/data/CIMIS-ETo-COG"
+parcels_path <- "~/data/LandIQ-harmonized-v3/parcels.gpkg"
+crops_path <- "~/data/LandIQ-harmonized-v3/crops_all_years.parq"
+
+#' Start from a range of dates (2020 to present) and locations (`design_points.csv`).
 
-```{r}
 dates <- seq.Date(as.Date("2020-03-01"), as.Date("2020-11-30"), "day")
-design_points <- readr::read_csv("~/projects/cimis-to-irrigation/design_points.csv") |>
+design_points <- readr::read_csv(design_points_path) |>
   head(10)
-```
-
-# CIMIS ETref
 
-For each site, extract its reference ETref from the CIMIS data.
+#' # CIMIS ETref
+#'
+#' For each site, extract its reference ETref from the CIMIS data.
 
-```{r}
 etref <- design_points |>
   extract_cimis_dates(
     dates,
-    "~/data/CIMIS-ETo-COG",
+    cimis_eto_cog_path,
     .progress = TRUE
   )
-```
 
-# CHIRPS Precipitation
+#' # CHIRPS Precipitation
+#'
+#' Also, extract precipitation from CHIRPS v2.
 
-Also, extract precipitation from CHIRPS v2.
-
-```{r}
 precip <- extract_chirps_remote(design_points, dates)
-```
-
-# BIS Kc coefficients
 
-For each site, get LandIQ parcel and crop data.
+#' # BIS Kc coefficients
+#'
+#' For each site, get LandIQ parcel and crop data.
 
-```{r}
 dp_with_crops <- get_landiq(
   design_points,
-  parcels_file = "~/data/LandIQ-harmonized-v3/parcels.gpkg",
-  crops_file = "~/data/LandIQ-harmonized-v3/crops_all_years.parq"
+  parcels_file = parcels_path,
+  crops_file = crops_path
 ) |>
   tibble::as_tibble()
-```
-
-Map `CLASS/SUBCLASS` to `crop_name` using `bism_kc_by_crop`.
 
-**NOTE:** Some LandIQ classes/subclasses map onto *multiple BISM crop types*.
+#' Map `CLASS/SUBCLASS` to `crop_name` using `bism_kc_by_crop`.
+#'
+#' **NOTE:** Some LandIQ classes/subclasses map onto *multiple BISM crop types*.
 
-```{r}
 bism_kc_by_crop |>
-  dplyr::summarize(n_unique = dplyr::n(), .by = c("landiq_class", "landiq_subclass")) |>
+  dplyr::summarize(
+    n_unique = dplyr::n(),
+    .by = c("landiq_class", "landiq_subclass")
+  ) |>
   dplyr::filter(n_unique > 1) |>
   dplyr::left_join(bism_kc_by_crop) |>
   dplyr::summarize(
     crops = paste(crop_name, collapse = ", "),
     .by = c("landiq_class", "landiq_subclass", "n_unique")
   )
-```
 
-So, below, we introduce a **HACK** to select just the first crop in any of these groups.
-The more correct fix is to do some kind of averaging later.
+#' So, below, we introduce a **HACK** to select just the first crop in any of these groups.
+#' The more correct fix is to do some kind of averaging later.
 
-```{r}
 bism_crop_unique <- bism_kc_by_crop |>
   dplyr::distinct(landiq_class, landiq_subclass, crop_name) |>
   # WARNING: Hack here!
@@ -78,12 +77,10 @@ design_point_crops <- dp_with_crops |>
     bism_crop_unique,
     by = c("CLASS" = "landiq_class", "SUBCLASS" = "landiq_subclass")
   )
-```
 
-For demonstration purposes, we will expand this naively using `tidyr::fill` and hard-code dates for the 4 seasons to January 1, April 1, July 1, October 1.
-In reality, you would resolve these more finely using phenology data (e.g., from remote sensing).
+#' For demonstration purposes, we will expand this naively using `tidyr::fill` and hard-code dates for the 4 seasons to January 1, April 1, July 1, October 1.
+#' In reality, you would resolve these more finely using phenology data (e.g., from remote sensing).
 
-```{r}
 fill_season <- function(year, season) {
   if (season == 1) {
     start <- lubridate::make_date(year, 1, 1)
@@ -113,11 +110,9 @@ dp_crops_filled <- design_point_crops |>
   dplyr::mutate(date = purrr::map2(year, season, fill_season)) |>
   tidyr::unnest(date) |>
   dplyr::filter(date %in% !!dates)
-```
 
-Identify and warn about parcels with no matching BIS crop.
+#' Identify and warn about parcels with no matching BIS crop.
 
-```{r}
 missing_crops <- dp_crops_filled |> dplyr::filter(is.na(crop_name))
 if (nrow(missing_crops) > 0) {
   missing_crop_strs <- missing_crops |>
@@ -133,28 +128,88 @@ if (nrow(missing_crops) > 0) {
   )
 }
 
-dp_with_cropname <- dp_crops_filled |> 
+dp_with_cropname <- dp_crops_filled |>
   dplyr::filter(!is.na(crop_name)) |>
   dplyr::left_join(
-    crop_whc |> dplyr::select("crop_name", "whc_min_frac"),
+    crop_whc |> dplyr::select("crop_name", "whc_min_frac", "rooting_depth_m"),
     by = "crop_name"
   )
-```
 
-# Join with ETo data
+#' # SSURGO Soil Data
+#'
+#' Calculate site-specific water holding capacity (WHC) from SSURGO soil data and crop rooting depth.
+
+calc_effective_awc <- function(
+  hzdept_r_cm,
+  hzdepb_r_cm,
+  awc_r,
+  rooting_depth_cm
+) {
+  # Clip each horizon to the rooting depth
+  effective_top <- pmin(hzdept_r_cm, rooting_depth_cm)
+  effective_bottom <- pmin(hzdepb_r_cm, rooting_depth_cm)
+  thickness_cm <- pmax(0, effective_bottom - effective_top)
+
+  # awc_r is cm water / cm soil, so multiply by thickness to get cm water.
+  # Convert cm water to mm water by multiplying by 10.
+  sum(awc_r * thickness_cm, na.rm = TRUE) * 10
+}
 
-Join with ETref data.
+# 1. Get mukeys for all design points
+design_points_sf <- design_points |>
+  dplyr::distinct(id, lon, lat)
+
+mukeys_list <- purrr::map2(
+  design_points_sf$lon,
+  design_points_sf$lat,
+  ~ ssurgo_mukeys_point(point = c(.x, .y), distance = 20)
+)
+
+# 2. Query gSSURGO for soil data
+all_mukeys <- unique(unlist(mukeys_list))
+soil_raw <- gSSURGO.Query(
+  mukeys = all_mukeys,
+  fields = c("chorizon.awc_r", "chorizon.hzdept_r", "chorizon.hzdepb_r")
+)
+
+# 3. Calculate effective WHC for each site-crop combination
+# We use the dominant soil component for each map unit.
+soil_dominant <- soil_raw |>
+  dplyr::filter(cokey == cokey[which.max(comppct_r)], .by = "mukey")
+
+dp_with_whc <- dp_with_cropname |>
+  dplyr::mutate(mukey = mukeys_list[match(id, design_points_sf$id)]) |>
+  tidyr::unnest(mukey) |>
+  dplyr::mutate(mukey = as.numeric(mukey)) |>
+  dplyr::left_join(
+    soil_dominant,
+    by = "mukey",
+    relationship = "many-to-many"
+  ) |>
+  dplyr::summarize(
+    whc_mm = calc_effective_awc(
+      hzdept_r,
+      hzdepb_r,
+      awc_r,
+      rooting_depth_cm = rooting_depth_m[[1]] * 100
+    ),
+    .by = c("id", "parcel_id", "date", "crop_name", "whc_min_frac")
+  ) |>
+  # Fallback to default if WHC is 0 or NA
+  dplyr::mutate(whc_mm = dplyr::if_else(whc_mm > 0, whc_mm, 500, missing = 500))
+
+#' # Join with ETo data
+#'
+#' Join with ETref data.
 
-```{r}
-dp_with_eto <- dp_with_cropname |>
-  dplyr::left_join((
-    etref |> dplyr::select("id", "date", "etref_mm_day")
-  ), by = c("id", "date"))
-```
+dp_with_eto <- dp_with_whc |>
+  dplyr::left_join(
+    (etref |> dplyr::select("id", "date", "etref_mm_day")),
+    by = c("id", "date")
+  )
 
-Calculate ETc directly using eto_to_etc_bism. Group by crop_name and apply since eto_to_etc_bism takes a single crop at a time.
+#' Calculate ETc directly using eto_to_etc_bism. Group by crop_name and apply since eto_to_etc_bism takes a single crop at a time.
 
-```{r}
 dp_with_etc <- dp_with_eto |>
   dplyr::mutate(
     etc_mm_day = eto_to_etc_bism(
@@ -168,13 +223,12 @@ dp_with_etc <- dp_with_eto |>
     dplyr::any_of(c("id", "parcel_id", "lat", "lon")),
     "date",
     "etc_mm_day",
-    "whc_min_frac"
+    "whc_min_frac",
+    "whc_mm"
   )
-```
 
-Handle multi-crop parcels (double-cropping) - placeholder logic that warns and averages ETc values.
+#' Handle multi-crop parcels (double-cropping) - placeholder logic that warns and averages ETc values.
 
-```{r}
 resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
   id_sym <- rlang::sym(id_col)
   date_sym <- rlang::sym(date_col)
@@ -191,7 +245,7 @@ resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
     message(
       "Multi-crop parcels: ",
       multicrop_counts$n_multicrop,
-      " date-parcel combinations have multiple crops. Averaging ETc values."
+      " date-parcel combinations have multiple crops. Averaging ETc and WHC values."
     )
   }
 
@@ -200,30 +254,34 @@ resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
     dplyr::summarize(
       etc_mm_day = mean(.data$etc_mm_day, na.rm = TRUE),
       whc_min_frac = mean(.data$whc_min_frac, na.rm = TRUE),
+      whc_mm = mean(.data$whc_mm, na.rm = TRUE),
       .groups = "drop"
     )
 }
 
 dp_with_etc <- resolve_multicrop(dp_with_etc)
-```
 
-Join with precipitation data (inner_join to ensure matching dates).
+#' Join with precipitation data (inner_join to ensure matching dates).
 
-```{r}
 dp_crops_all <- dp_with_etc |>
   dplyr::inner_join(precip, by = c("id", "date")) |>
-  dplyr::select(c("id", "lat", "lon", "date", "etc_mm_day", "precip_mm_day", "whc_min_frac"))
-```
+  dplyr::select(c(
+    "id",
+    "lat",
+    "lon",
+    "date",
+    "etc_mm_day",
+    "precip_mm_day",
+    "whc_min_frac",
+    "whc_mm"
+  ))
 
-# Calculate water balance
+#' # Calculate water balance
 
-```{r}
 dpwb <- apply_water_balance(dp_crops_all, "id")
-```
 
-Check crop evapotranspiration values are reasonable.
+#' Check crop evapotranspiration values are reasonable.
 
-```{r}
 etc_summary <- dp_crops_all |>
   dplyr::summarize(
     etc_min = min(.data$etc_mm_day, na.rm = TRUE),
@@ -232,11 +290,9 @@ etc_summary <- dp_crops_all |>
     .by = "id"
   )
 print(etc_summary)
-```
 
-Check that water balance calculations are reasonable.
+#' Check that water balance calculations are reasonable.
 
-```{r}
 wb_summary <- dpwb |>
   dplyr::group_by(.data$id) |>
   dplyr::summarize(
@@ -249,11 +305,9 @@ wb_summary <- dpwb |>
     .groups = "drop"
   )
 print(wb_summary)
-```
 
-Check for other issues.
+#' Check for other issues.
 
-```{r}
 if (any(wb_summary$irr_max < 0)) {
   warning("Negative irrigation values detected!")
 } else {
@@ -265,57 +319,52 @@ if (any(wb_summary$W_t_min < 0)) {
 } else {
   message("Soil water values are non-negative")
 }
-```
 
-Seasonal variation check - irrigation should be higher in summer.
+#' Seasonal variation check - irrigation should be higher in summer.
 
-```{r}
 monthly_irr <- dpwb |>
   dplyr::mutate(month = lubridate::month(.data$date)) |>
   dplyr::group_by(.data$month) |>
   dplyr::summarize(irr_mean = mean(.data$irr, na.rm = TRUE), .groups = "drop")
 print(monthly_irr)
-```
 
-# Plot results
+#' # Plot results
 
-```{r}
 library(ggplot2)
 dpwb |>
   ggplot() +
   aes(x = date, y = irr, color = id) +
   geom_line() +
   labs(title = "Irrigation Requirements by Site", y = "Irrigation (mm/day)")
-```
-
-# Write event files
 
-Example of a single event data frame.
+#' # Write event files
+#'
+#' Example of a single event data frame.
 
-```{r}
 dpwb |>
   dplyr::filter(id == id[[1]]) |>
   create_event_file()
-```
 
-Write all event files.
+#' Write all event files.
 
-```{r}
 outdir <- tempfile(pattern = "events_")
 dir.create(outdir)
 dpwb |>
   dplyr::group_nest(.data$id) |>
-  dplyr::mutate(fname = purrr::map2(
-    id,
-    data,
-    \(id, dat) readr::write_delim(
-      create_event_file(dat),
-      file.path(outdir, glue::glue("{id}_events.txt")),
-      delim = " ",
-      col_names = FALSE
+  dplyr::mutate(
+    fname = purrr::map2(
+      id,
+      data,
+      \(id, dat) {
+        readr::write_delim(
+          create_event_file(dat),
+          file.path(outdir, glue::glue("{id}_events.txt")),
+          delim = " ",
+          col_names = FALSE
+        )
+      }
     )
-  ))
+  )
 
 fnames <- list.files(outdir, full.names = TRUE)
 cat(readr::read_file(fnames[[1]]))
-```

From f58e7eb895595d6a4f2e5a528738b03b76f4af11 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 13:55:45 -0500
Subject: [PATCH 20/67] add horizon depths to SSURGO test script

---
 modules/data.land/inst/ssurgo-soil-inputs.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/data.land/inst/ssurgo-soil-inputs.R b/modules/data.land/inst/ssurgo-soil-inputs.R
index 63ed5f5c55a..04edd40a201 100644
--- a/modules/data.land/inst/ssurgo-soil-inputs.R
+++ b/modules/data.land/inst/ssurgo-soil-inputs.R
@@ -14,7 +14,11 @@ all_mukeys <- unique(unlist(mukeys_list))
 
 soil_data <- PEcAn.data.land::gSSURGO.Query(
   mukeys = all_mukeys,
-  fields = c("chorizon.awc_r")
+  fields = c(
+    "chorizon.hzdept_r",
+    "chorizon.hzdepb_r",
+    "chorizon.awc_r"
+  )
 )
 
 result <- design_points |>

From c3926547be9d8ac10d2beedcefc2007a4c359877 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 15:03:32 -0500
Subject: [PATCH 21/67] add OpenET download function and comparison script

---
 modules/data.land/NAMESPACE                   |  1 +
 modules/data.land/R/openET.R                  | 82 +++++++++++++++++++
 modules/data.land/inst/compare-cimis-openet.R | 41 ++++++++++
 modules/data.land/man/extract_openet_daily.Rd | 32 ++++++++
 4 files changed, 156 insertions(+)
 create mode 100644 modules/data.land/R/openET.R
 create mode 100644 modules/data.land/inst/compare-cimis-openet.R
 create mode 100644 modules/data.land/man/extract_openet_daily.Rd

diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index 944442ae67b..2cfe32be5a2 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -29,6 +29,7 @@ export(extract_SM_CDS)
 export(extract_chirps_remote)
 export(extract_cimis_date)
 export(extract_cimis_dates)
+export(extract_openet_daily)
 export(extract_soil_gssurgo)
 export(extract_soil_nc)
 export(extract_veg)
diff --git a/modules/data.land/R/openET.R b/modules/data.land/R/openET.R
new file mode 100644
index 00000000000..66e39b62918
--- /dev/null
+++ b/modules/data.land/R/openET.R
@@ -0,0 +1,82 @@
+#' Extract daily ET data from OpenET
+#'
+#' Note that this requires the environment variable `OPENET_API_KEY` to be set.
+#' A convenient way to do this is via a `.Renviron`, either globally
+#' (`~/.Renviron`) or in the current working directory (`./.Renviron`), with
+#' contents like:
+#'
+#' ```
+#' OPENET_API_KEY="abcdefg123456"
+#' ```
+#'
+#' You can obtain an OpenET API key from the OpenET data portal.
+#'
+#' @param design_points `data.frame` of design points with columns `lat` and `lon`
+#' @param start_date Start date for data extraction
+#' @param end_date End date for data extraction
+#'
+#' @return `design_points` `data.frame` with additional columns `date`, and
+#' `et_mm_day` (ET, mm/day)
+#' @export
+extract_openet_daily <- function(design_points, start_date, end_date) {
+  api_key <- Sys.getenv("OPENET_API_KEY")
+  if (api_key == "") {
+    stop("OPENET_API_KEY environment variable is not set")
+  }
+
+  start_date_str <- format(start_date, "%Y-%m-%d")
+  end_date_str <- format(end_date, "%Y-%m-%d")
+
+  request_body_template <- list(
+    date_range = c(start_date_str, end_date_str),
+    interval = "daily",
+    model = "Ensemble",
+    variable = "ET",
+    reference_et = "gridMET",
+    units = "mm",
+    file_format = "JSON"
+  )
+
+  prep_request <- function(lon, lat) {
+    request_body <- request_body_template
+    request_body$geometry <- c(lon, lat)
+
+    httr2::request("https://openet-api.org/raster/timeseries/point") |>
+      httr2::req_headers(Authorization = api_key) |>
+      httr2::req_body_json(request_body) |>
+      httr2::req_throttle(capacity = 10, fill_time_s = 1) |>
+      httr2::req_retry(max_tries = 3) |>
+      httr2::req_timeout(seconds = 150)
+  }
+
+  raw_results <- design_points |>
+    dplyr::mutate(
+      reqs = purrr::map2(.data$lon, .data$lat, prep_request),
+      resps = httr2::req_perform_parallel(
+        raw_results[["reqs"]],
+        max_active = 10,
+        on_error = "continue"
+      )
+    )
+
+  parse_response <- function(resp) {
+    if (!inherits(resp, "httr2_response")) {
+      return(NULL)
+    }
+    data <- httr2::resp_body_json(resp)
+    if (length(data) == 0 || is.null(data[[1]]$time)) {
+      return(NULL)
+    }
+    tibble::tibble(
+      date = as.Date(purrr::map_chr(data, "time")),
+      et_mm_day = purrr::map_dbl(data, "et")
+    )
+  }
+
+  results <- raw_results |>
+    dplyr::mutate(results = purrr::map(.data$resps, parse_response)) |>
+    dplyr::select(-c("reqs", "resps")) |>
+    tidyr::unnest("results", keep_empty = TRUE)
+
+  results
+}
diff --git a/modules/data.land/inst/compare-cimis-openet.R b/modules/data.land/inst/compare-cimis-openet.R
new file mode 100644
index 00000000000..2f81c123c73
--- /dev/null
+++ b/modules/data.land/inst/compare-cimis-openet.R
@@ -0,0 +1,41 @@
+#!/usr/bin/env Rscript
+
+if (interactive()) {
+  devtools::load_all("modules/data.land")
+} else {
+  library(PEcAn.data.land)
+}
+
+design_points_path <- "~/projects/cimis-to-irrigation/design_points.csv"
+cimis_eto_cog_path <- "~/data/CIMIS-ETo-COG"
+
+dates <- seq.Date(as.Date("2020-03-01"), as.Date("2020-11-30"), "day")
+design_points <- readr::read_csv(design_points_path) |>
+  head(10)
+
+cimis_et <- extract_cimis_dates(
+  design_points,
+  dates,
+  cimis_eto_cog_path,
+  .progress = TRUE
+)
+
+openet_et <- extract_openet_daily(
+  design_points,
+  min(dates),
+  max(dates)
+)
+
+combined <- cimis_et |>
+  dplyr::rename(cimis = etref_mm_day) |>
+  dplyr::full_join(openet_et |> dplyr::rename(openet = et_mm_day))
+
+combined_long <- combined |>
+  tidyr::pivot_longer(c(cimis, openet), names_to = "source", values_to = "et_mm_day")
+
+library(ggplot2)
+ggplot(combined_long) +
+  aes(x = date, y = et_mm_day, color = source) +
+  geom_line() +
+  facet_wrap(~id) +
+  theme_bw()
diff --git a/modules/data.land/man/extract_openet_daily.Rd b/modules/data.land/man/extract_openet_daily.Rd
new file mode 100644
index 00000000000..7099a0d504c
--- /dev/null
+++ b/modules/data.land/man/extract_openet_daily.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/openET.R
+\name{extract_openet_daily}
+\alias{extract_openet_daily}
+\title{Extract daily ET data from OpenET}
+\usage{
+extract_openet_daily(design_points, start_date, end_date)
+}
+\arguments{
+\item{design_points}{`data.frame` of design points with columns `lat` and `lon`}
+
+\item{start_date}{Start date for data extraction}
+
+\item{end_date}{End date for data extraction}
+}
+\value{
+`design_points` `data.frame` with additional columns `date`, and
+`et_mm_day` (ET, mm/day)
+}
+\description{
+Note that this requires the environment variable `OPENET_API_KEY` to be set.
+A convenient way to do this is via a `.Renviron`, either globally
+(`~/.Renviron`) or in the current working directory (`./.Renviron`), with
+contents like:
+}
+\details{
+```
+OPENET_API_KEY="abcdefg123456"
+```
+
+You can obtain an OpenET API key from the OpenET data portal.
+}

From 422166aa8f55915fd13f66ee7a4e299957fce3ba Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 15:35:21 -0500
Subject: [PATCH 22/67] manually propagate duckspatial to dependencies

---
 docker/depends/pecan_package_dependencies.csv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv
index 32a5c8d7003..04bca4c5ea9 100644
--- a/docker/depends/pecan_package_dependencies.csv
+++ b/docker/depends/pecan_package_dependencies.csv
@@ -70,6 +70,7 @@
 "dplyr",">= 1.1.2","base/db","Imports",FALSE
 "dygraphs","*","models/peprmt","Suggests",FALSE
 "earthdatalogin","*","models/peprmt","Suggests",FALSE
+"duckspatial","*","modules/data.land","Imports",FALSE
 "ecmwfr",">= 2.0.0","modules/data.atmosphere","Suggests",FALSE
 "ellipse","*","modules/assim.batch","Imports",FALSE
 "exactextractr","*","modules/assim.sequential","Suggests",FALSE

From 8f222c40a049b7c4d7450207db7555bd48f014c3 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 15:52:27 -0500
Subject: [PATCH 23/67] whitespace-only documentation change

---
 modules/data.land/R/data.R                           | 12 ++++++------
 modules/data.land/man/bism_kc_by_crop.Rd             |  2 +-
 modules/data.land/man/fertilizer_composition_data.Rd |  6 +++---
 modules/data.land/man/landiq_crop_mapping_codes.Rd   |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/modules/data.land/R/data.R b/modules/data.land/R/data.R
index 304d3b96ee6..2442e89d127 100644
--- a/modules/data.land/R/data.R
+++ b/modules/data.land/R/data.R
@@ -80,8 +80,8 @@
 "soil_class"
 
 #' Fertilizer Nutrient Composition Table
-#' 
-#' A dataset of fertilizer and organic matter addition types 
+#'
+#' A dataset of fertilizer and organic matter addition types
 #' and their nitrogen and carbon composition, based on the SWAT model's
 #' `fertilizer.frt` table and DayCent model defaults for organic matter
 #' C:N ratio parameters.
@@ -101,9 +101,9 @@
 #' }
 #'
 #' @details
-#' This table is based on SWAT model's \code{fertilizer.frt} file, and uses 
+#' This table is based on SWAT model's \code{fertilizer.frt} file, and uses
 #' C:N ratios (\code{cn_ratio}) from DayCent model default parameter files.
-#' \code{fraction_nh3_n} and \code{fraction_no3_n} represent the fraction of 
+#' \code{fraction_nh3_n} and \code{fraction_no3_n} represent the fraction of
 #' fertilizer by mass that is ammonium-N and nitrate-N, respectively. This is different from
 #' the SWAT model's definition of \code{fraction_nh3_n} as a fraction of the total mineral N.
 #'
@@ -123,7 +123,7 @@
 #'   \item{SUBCLASS}{LandIQ subclass code.}
 #'   \item{subclass_name}{LandIQ subclass name.}
 #' }
-#' @source California Department of Water Resources. (2023). Statewide Crop Mapping—California 
+#' @source California Department of Water Resources. (2023). Statewide Crop Mapping—California
 #' Natural Resources Agency Open Data. Metadata retrieved from https://data.cnra.ca.gov/dataset/statewide-crop-mapping and manually extracted into `data-raw/landiq_crop_mapping_codes.tsv`.
 "landiq_crop_mapping_codes"
 
@@ -132,7 +132,7 @@
 #' Crop and growth stage specific coefficients (Kc) from the Basic Irrigation Scheduling
 #' (BIS) Excel workbook (Snyder et. al., 2014).
 #' The dataset is an export of the BISm.xlsx workbook's `CropRef` worksheet, with columns renamed
-#' and columns added that map to LandIQ CADWR land use dataset 
+#' and columns added that map to LandIQ CADWR land use dataset
 #' (\code{\link{landiq_crop_mapping_codes}}; California Department of Water Resources, 2023).
 #' This dataset provides the information needed to reconstruct a stage-based daily Kc curve when
 #' combined with grass-reference evapotranspiration (ETo), such as that provided
diff --git a/modules/data.land/man/bism_kc_by_crop.Rd b/modules/data.land/man/bism_kc_by_crop.Rd
index 3edcb768305..c6771fb5642 100644
--- a/modules/data.land/man/bism_kc_by_crop.Rd
+++ b/modules/data.land/man/bism_kc_by_crop.Rd
@@ -36,7 +36,7 @@ bism_kc_by_crop
 Crop and growth stage specific coefficients (Kc) from the Basic Irrigation Scheduling
 (BIS) Excel workbook (Snyder et. al., 2014).
 The dataset is an export of the BISm.xlsx workbook's `CropRef` worksheet, with columns renamed
-and columns added that map to LandIQ CADWR land use dataset 
+and columns added that map to LandIQ CADWR land use dataset
 (\code{\link{landiq_crop_mapping_codes}}; California Department of Water Resources, 2023).
 This dataset provides the information needed to reconstruct a stage-based daily Kc curve when
 combined with grass-reference evapotranspiration (ETo), such as that provided
diff --git a/modules/data.land/man/fertilizer_composition_data.Rd b/modules/data.land/man/fertilizer_composition_data.Rd
index 4a7c41a23b2..50954070e8b 100644
--- a/modules/data.land/man/fertilizer_composition_data.Rd
+++ b/modules/data.land/man/fertilizer_composition_data.Rd
@@ -28,15 +28,15 @@ DayCent model default parameter file: `omad.100` obtained from the Soil Carbon S
 fertilizer_composition_data
 }
 \description{
-A dataset of fertilizer and organic matter addition types 
+A dataset of fertilizer and organic matter addition types
 and their nitrogen and carbon composition, based on the SWAT model's
 `fertilizer.frt` table and DayCent model defaults for organic matter
 C:N ratio parameters.
 }
 \details{
-This table is based on SWAT model's \code{fertilizer.frt} file, and uses 
+This table is based on SWAT model's \code{fertilizer.frt} file, and uses
 C:N ratios (\code{cn_ratio}) from DayCent model default parameter files.
-\code{fraction_nh3_n} and \code{fraction_no3_n} represent the fraction of 
+\code{fraction_nh3_n} and \code{fraction_no3_n} represent the fraction of
 fertilizer by mass that is ammonium-N and nitrate-N, respectively. This is different from
 the SWAT model's definition of \code{fraction_nh3_n} as a fraction of the total mineral N.
 }
diff --git a/modules/data.land/man/landiq_crop_mapping_codes.Rd b/modules/data.land/man/landiq_crop_mapping_codes.Rd
index 8cbd354c43b..7de715dfdc4 100644
--- a/modules/data.land/man/landiq_crop_mapping_codes.Rd
+++ b/modules/data.land/man/landiq_crop_mapping_codes.Rd
@@ -15,7 +15,7 @@ A data frame with 203 rows and 4 columns:
 }
 }
 \source{
-California Department of Water Resources. (2023). Statewide Crop Mapping—California 
+California Department of Water Resources. (2023). Statewide Crop Mapping—California
 Natural Resources Agency Open Data. Metadata retrieved from https://data.cnra.ca.gov/dataset/statewide-crop-mapping and manually extracted into `data-raw/landiq_crop_mapping_codes.tsv`.
 }
 \usage{

From 166a420a8bee4caafb8fd8dbf8592e9e41966838 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 15:52:41 -0500
Subject: [PATCH 24/67] update crop_whc values and docs

better value + reference for mustard
---
 modules/data.land/R/data.R              |  21 +++++++++++++--------
 modules/data.land/data-raw/crop_whc.csv |   2 +-
 modules/data.land/data/crop_whc.rda     | Bin 1624 -> 1694 bytes
 modules/data.land/man/crop_whc.Rd       |  21 +++++++++++++--------
 4 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/modules/data.land/R/data.R b/modules/data.land/R/data.R
index 2442e89d127..3e8651f848e 100644
--- a/modules/data.land/R/data.R
+++ b/modules/data.land/R/data.R
@@ -288,22 +288,27 @@
 #'   composition (N/C fractions) from the SWAT/DayCent database.
 "ca_compost_amendment"
 
-#' Crop-specific minimum water holding capacity (WHC) thresholds
+#' Crop-specific rooting depths and water-depletion thresholds
 #'
-#' Minimum soil water content (fraction of available water-holding capacity)
-#' that should be maintained for each crop to avoid yield loss or quality issues.
-#' Values are based on crop root depth, drought tolerance, and sensitivity to
-#' water stress.
+#' Maximum effective rooting depth and minimum soil water content thresholds
+#' for various crops. The `whc_min_frac` column represents the fraction of
+#' total available water (TAW) that should remain in the root zone to avoid
+#' moisture stress (equivalent to 1 - p, where p is the depletion fraction
+#' from FAO-56).
 #'
 #' @format A tibble with one row per crop and the following columns:
 #' \describe{
 #'   \item{crop_number}{BIS crop number (character). Blank for crops not in BIS.}
 #'   \item{crop_name}{Crop name.}
 #'   \item{Category}{Crop category (e.g., Woody Perennial, Annual (Hardy)).}
-#'   \item{whc_min_frac}{Minimum WHC as fraction of total WHC (0-1).}
-#'   \item{Comments}{Rationale for the minimum WHC value.}
+#'   \item{rooting_depth_m}{Maximum effective rooting depth in meters.}
+#'   \item{whc_min_frac}{Minimum soil water as fraction of available water-holding capacity (0-1).}
+#'   \item{whc_notes}{Rationale or source for the minimum WHC value.}
+#'   \item{rooting_depth_notes}{Rationale or source for the rooting depth value.}
 #' }
-#' @source Expert knowledge; California irrigation scheduling guidelines.
+#' @source Allen, R. G., Pereira, L. S., Raes, D., & Smith, M.
+#' \emph{FAO Irrigation and Drainage Paper No. 56}.
+#' https://www.professormendoncauenf.com.br/wp-content/uploads/2021/03/ag_fao_56_ingles.pdf
 #' @examples
 #' data(crop_whc)
 #' head(crop_whc)
diff --git a/modules/data.land/data-raw/crop_whc.csv b/modules/data.land/data-raw/crop_whc.csv
index 19877b247ea..766fbf01ea5 100644
--- a/modules/data.land/data-raw/crop_whc.csv
+++ b/modules/data.land/data-raw/crop_whc.csv
@@ -36,7 +36,7 @@ crop_number,crop_name,Category,rooting_depth_m,whc_min_frac,whc_notes,rooting_de
 1.22,Lettuce,Annual (Sensitive),0.4,0.70,FAO-56 Table 22,FAO-56 Table 22
 1.23,Melon,Annual (Hardy),1.2,0.55,FAO-56 Table 22,FAO-56 Table 22
 1.24,Millet,Annual (Hardy),1.5,0.45,FAO-56 Table 22,FAO-56 Table 22
-1.25,Mustard,Annual (Hardy),60,0.45,Rapid growth and reasonable scavenging ability.,"Limited CA data; 0.6 m estimated from brassica analogues (canola, rapeseed)"
+1.25,Mustard,Annual (Hardy),1.2,0.45,Common value for other hardy annuals,"Dharmasri, Jong, Cowell. 1993. https://harvest.usask.ca/server/api/core/bitstreams/fc5e68a2-951c-4d34-9cfe-b5d2a383d449/content"
 1.26,Oats,Annual (Hardy),1.25,0.45,FAO-56 Table 22,FAO-56 Table 22
 4.06,Olives,Woody Perennial,1.45,0.35,FAO-56 Table 22,FAO-56 Table 22
 1.27,Onion (dry),Annual (Sensitive),0.45,0.70,FAO-56 Table 22,FAO-56 Table 22
diff --git a/modules/data.land/data/crop_whc.rda b/modules/data.land/data/crop_whc.rda
index bc94230fe19c1c2a413143b9052ea91131634547..98f506de888f18c70764bee3254aa4949e499fec 100644
GIT binary patch
literal 1694
zcmV;P24VR^T4*^jL0KkKS*jIn`v423|L_0*-+)AC|Mma>-@w1`|KPv?05AXn047N!
zZDD@#;0C?$@){I-x$b*oN|Y5;hLr}I35_Nb5t9&PVhuDJXks)QQv*maMuDavGGq(^
zDFHMD(^Jx6H8!E3&>o{EL7-?202*naGypU*001JV<YJmc8Vv&=&;S4dpa1{>0iXZ?
z00y3vl{C``VqrAM88TodLre$;1YsBiVHgNt1OpIY00000000000000000004i3lM!
zrh`)^YMwMEP|4}2GBO5(N2nNp(?HM-G-%P~8flGAl^RR=_~aiFXd(haf2pEQX)<rT
zl*FQFXN^9kMg-c<fFzO@JQC2LP;y{^hMN+ZA`l~%#J;92mXbm|Tw*5lxqIE7<%khm
zMf7Wt_VTb5NM9S5YdHeeU<(=sri|8BNDNlu{dx8E{>}MX2n+*BAtKg<24fIybQeRh
z=v+FwI=Qnc(V>W)5+WiSl+{U9R25YXui-iZf;r$;0fQcJ@a{*3FyXQ!RcvvRk2KgD
z;}FY%`7bo0kRgMb{1^!d$_yeVLmLU}61JDSQHNmjDwbghR@<^6Hrr{jcAf`~2s9WM
z^<r#DS*%Eaux*TEHLfb0hE0u#Y-p586_k`tGo5Q$2O*#m#3Uq!0ur^PCFL{=wvt`I
zMGz7Ja$ZK;7!8d$A-39S2vEjAK!z|BF^#ZNVgw4X84V-_jU*&4%T+N4J)lTa5mY3L
zmL!CI(B4IK4k0d{ng9*a>2d^(Erb9LCACFvrLy=b!A!d4or|;jU;&1h0q9F-Vsx0=
z8*CdQn4V=Q#+bo?)09(hUcax_gpx}uvtvI=K|Yc@igI$!NOF-W1C*rL#t9&J-{a$U
z@^v(D^K<ojmRqlph*(E<EZkH#0NZFAK-vb<tp%F<_LZk?wa)btNh|k)mxj_wCYH5T
zO4}`J1a9YbZ=mZ~sswQOd$5lIOPFGQJ{(UJt1P{g2(XPn-6cK>gcoJ#02;eyqoD|5
zpa4c|4H3jmHXINd_98VQJYySNJS@Zq6bML2f&m5F2xOCDaKJzeVn8;)Nw)x#1TY{P
z2i+tP0D=+(3K*smm6C*#Ueq`E$O4`S2wdEzX%S651VIkO!5A~7Q0g@<x@y@Lk|VI)
zfe|7ndq;VK<AqV}Y64!dH8tH@^nlb50G+pHjBHC02P;Om63!q^0v7Jg5?R%q00J#S
zOm@n)S`ioH&qIJZWFFA{^PYDIAcBz{V}f3nJFCh-0Y!Me(Yt4_0<#Ego(PA?1VLz3
zEf=%XEd>ie;f`~>)TD8xEvW{%VCH5ZCU;er)h$;RV$niRitJ1hmg$jQi?YpQlYc_v
z<Vx;LmWYu64$`1v6EMf&($IE<C{&O+l2xvQcRE?Q89;Fm&M@$<WU1z99fU0aRTXS1
zS4%^o7XvcYu_hEuVZ0&~2rRl=&`MOqp`ZZEwRImh+LR?WBQ3fQd2=_7z2zVSBfPj2
zZY0C6QH74Rtvtk{A(kLVtyF;}4nacdWtWJ8mg=gW!-YcYbmMsh-P3u|R9;m~sJem+
z2JQ7`Asy}!v9_Dw@PEzmvu_b*&pVWP*yS{1EHQr<wIo}~j&basI?M&|S`f~;41_bH
zgcS+#r@>f=H;hK<S03fLvf3}#SWt;yFm7yBr+7iSM&<DaXAN4JEv=verpdVxJUTsF
z<wV*539_&Uk;aYV-7nSZO+<huW0i>G0QSzJCCPw+Eh>Z32*nQ@avv$5fo}*mIFrCM
z4O(U2olck{15ilLEfO=55m|T|sF;Esa)(+HC}HI3tUgP$NR>H~(nvjQlq$eu<Qzam
z_^sOvtueNnZH_RYpcxnv%b0DO<}laK3)N#ZWJ^GzGyPV!bpsCV)78n))6tk&fCxxz
zU}G<e_Ys0{q^zP6*sfD75;rJPS7+FzGMwH)o?k)^<U%!CvQja#3%n|eO9niZl)-lu
z0qbIjp{qts&LriQFI_7Ag1s*z1-0u^FNi57XKr-5Q&zO8>0xG9N~J0EK4G(stkdY&
zf=P7+E#X)z6{`va3KDs`_5eg+sIIKpT<5hbE4+k{?xK5l^bD|EYFBjh%tbEfl0yz&
zR4$c>6h)fF<F(i9*lJ$u8$w8pm=MO9Y=D@+Qpby<*Zk0uJKgZ`KKu~aNO9qk@f>jz
okJ%)+ygkrKc#bv$V+K$WFaRzRxdu@L8GHUN<ce^iAyg{c_ApV>IRF3v

literal 1624
zcmV-e2B-N#T4*^jL0KkKS*P?>@Bj;$fA9bQ-+(}8|Ly<p-@w1`|KLCX05AXn047N!
zZDD@#;0B%W=EXIkIg7275h#$#Q_zUoo|<JINE!j4&<2A;K+tF#Q_^Sv00000N+4(=
zCYmy6(<*+7ekuSOXf)6=05+fofuI5E27nB30000000000000000009ZY3V4YqZ3Uu
zG|7<DAPkrSG-;zC$YKD<G%y5Y0iy>1000000000000000001%si3pk|1ZZTZl=D>b
znWXfdpay7w2dEkkP#QfzWc58qsL<0-2~C1ab7Sy7vS=a#ME{gTx+T%B@rp_!m}0+9
zuO-yl&wwNXT32O`fkDZF0vc>eWQahG#wFWg@tR2p=jahPlXG{6j?x5H(Qz9Def=!r
zsSD+D?qWc-+5u>wYG}a9$pMDUJYJ1|N8RPhz(8jWB!r6?kQZ2kYp%NgS6cY$-tjO@
zru1T~9Ws=qDAA(K=4N4;r5IPuqA@B=XOSN|FU!~D#JI6PCU%D+_9Zw)=f=4r?~=WV
z!P(qOB_UfRuA>igoh(3=9K}2uYV}=rHDFaw2{?*?a2tRCfKjSey%ZqOV0!woHY6s*
zh82ymjAp&^laR@=u?>w9i6Z$)MB_cywa{s_0$4<ZkibG#FiCE(7wVE+V2U6l0@QlV
zuwXVc;D+02q#;8Y0E9pUK*((hOhAEF10ke<w9-OC;cBKJ>of@pVk(4@ePKvP;RAZg
z{0<;n?;-#Us<{M1PNV<@`YWPZHPfv4KKDDC&Fgovmbx8$UZMaPuGSA6ux6bmHip{<
z$fd#UQjBSg7z7gO>2a%JB$D)3RH{i-<??QN(uXokgPE4bF{FXx9nLLh)W$Iiq1R=r
zC~Se5plt(a8%nem>nNRd(srnlNmdiEZ6uOuPH|mTwFowTI}m)8st7yW+>nl13f<Rs
z)E8-RObC@us-uAb15rv-&_DrP<w~lqB6O81B9xwo0m;>2j+n;OHG?dan-hit0AmsX
zwgOGK1ehU#0MI)kLI4611Q416h#@GGTnGRR$K}l+8UTTVpPa3UFHU^{{p(mEA!;Iw
z6sgG$0y(3!NhIw(B0GwNFVdj^#;9H+Tyc__)BqDg058)m?1b$=kw@Cn?bKevAiZH`
zT}!pBeapxIEBi}+Ebc_I2}cjAjqvP{c1P@<*~UQx6o~jpm%wgc>7W3&HEf2ByItSf
z-4XBb2_L{DUM_pDi>rH*+yOnH&6+2cV1V1CI~YWiKr-~|;kbgj=9LucMHP-x2N+8F
z{2SK49~YIu8m2f|F^&WP=2lJ8uQRwF=+}ph^oa?M;uB=`>vBL6h{P7x%&IR0CJ1Z*
z8CkTLsAfa}wZpP>F^f26ab}2-bpq7U=|L$|0)~JCQ&xB<w`3?pf{1NRnhR;#lK_R&
zT%*P>)or}<HoqS|_ZQqE9pHh_yFkH)^b~WTA`^5mTKEgQmLH7dWtw)&xcxK4WO${B
z+d?2G0ukTu4UM$lr_2Ah(9N6Wt&Nkdi+Ny%GG-srxupYtWiJ`;YwNJt!m18IwIR@q
z7VOwWLC^6Dap9}u$*!=|hFqm-a|DON^B%oY&07c?2jI9sUatmvV^jblszKtA!;{jf
zA_^b@1sDNytFqMUmX*9%qyQf1uJXvxO87IYpp4Sk+6<FOsKjU4w^3*lRh!<zpeU;O
zoSpmPs~ai}#3C?|Bw<KJWw2_ZVhD(J0z}K%bk**ox+F@5<g~0}3cD2*+%cShJ!$O*
z!%S_a+hezF1i&^n494kvFa>naF-XEXBL4GPw9$8`dU-PRay%NqIS^1eN{LR%XgNE0
zCcLZ&nxhh;A+iOZjfXtN;os6aXBrfjO8VJyRNOBi#4iR{1L9o#RgA9DR3}4LQ-U1g
zPFbPo@D$5SZmOwCRv?O&4<E79fmI9K;{{(;BNC(TF5Nl0m?vdS$Xw<Pydl3EL?$3H
z1>e<B1&Ju)e&iLY3IZYfNRG<yXB=QSn+DCxhPQN9r$bw2#2{aPMfxEe^1%a#(K$*n
zs0mLl1(_r<fTf|27i;uFNbmRqmG~fr!b6T}E_0$HaotHT@0*YkT<1dpv4bcGnn@+T
WC4B?JCWG_#h1`)&6eKDA6?_0=<=Gkl

diff --git a/modules/data.land/man/crop_whc.Rd b/modules/data.land/man/crop_whc.Rd
index a9525569cc7..70a03fd0000 100644
--- a/modules/data.land/man/crop_whc.Rd
+++ b/modules/data.land/man/crop_whc.Rd
@@ -3,28 +3,33 @@
 \docType{data}
 \name{crop_whc}
 \alias{crop_whc}
-\title{Crop-specific minimum water holding capacity (WHC) thresholds}
+\title{Crop-specific rooting depths and water-depletion thresholds}
 \format{
 A tibble with one row per crop and the following columns:
 \describe{
   \item{crop_number}{BIS crop number (character). Blank for crops not in BIS.}
   \item{crop_name}{Crop name.}
   \item{Category}{Crop category (e.g., Woody Perennial, Annual (Hardy)).}
-  \item{whc_min_frac}{Minimum WHC as fraction of total WHC (0-1).}
-  \item{Comments}{Rationale for the minimum WHC value.}
+  \item{rooting_depth_m}{Maximum effective rooting depth in meters.}
+  \item{whc_min_frac}{Minimum soil water as fraction of available water-holding capacity (0-1).}
+  \item{whc_notes}{Rationale or source for the minimum WHC value.}
+  \item{rooting_depth_notes}{Rationale or source for the rooting depth value.}
 }
 }
 \source{
-Expert knowledge; California irrigation scheduling guidelines.
+Allen, R. G., Pereira, L. S., Raes, D., & Smith, M.
+\emph{FAO Irrigation and Drainage Paper No. 56}.
+https://www.professormendoncauenf.com.br/wp-content/uploads/2021/03/ag_fao_56_ingles.pdf
 }
 \usage{
 crop_whc
 }
 \description{
-Minimum soil water content (fraction of available water-holding capacity)
-that should be maintained for each crop to avoid yield loss or quality issues.
-Values are based on crop root depth, drought tolerance, and sensitivity to
-water stress.
+Maximum effective rooting depth and minimum soil water content thresholds
+for various crops. The `whc_min_frac` column represents the fraction of
+total available water (TAW) that should remain in the root zone to avoid
+moisture stress (equivalent to 1 - p, where p is the depletion fraction
+from FAO-56).
 }
 \examples{
 data(crop_whc)

From 9c93527f015dd3a13a0386b2623521257cad84b2 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 16:05:13 -0500
Subject: [PATCH 25/67] replace duckspatial with terra + sf

duckspatial was causing CI/CD problems.
---
 modules/data.land/DESCRIPTION            |  1 +
 modules/data.land/R/get-landiq-parcels.R | 34 +++++-------------------
 2 files changed, 8 insertions(+), 27 deletions(-)

diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION
index a6f6c261a63..95409ed8f18 100644
--- a/modules/data.land/DESCRIPTION
+++ b/modules/data.land/DESCRIPTION
@@ -30,6 +30,7 @@ Imports:
     dplyr,
     duckdb,
     duckspatial,
+    dplR,
     foreach,
     fs,
     future,
diff --git a/modules/data.land/R/get-landiq-parcels.R b/modules/data.land/R/get-landiq-parcels.R
index 74aef0c2802..29d5d575784 100644
--- a/modules/data.land/R/get-landiq-parcels.R
+++ b/modules/data.land/R/get-landiq-parcels.R
@@ -7,35 +7,15 @@
 #' @return `design_points` `data.frame` with harmonized LandIQ parcel_IDs
 #' @export
 get_landiq_parcel_ids <- function(design_points, parcels_file) {
-  parcel_crs <- sf::st_layers(parcels_file)[["crs"]][[1]]
+  parcels_vect <- terra::vect(parcels_file)
   pts_sf <- design_points |>
     dplyr::select("id", "lat", "lon") |>
     sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |>
-    sf::st_transform(parcel_crs)
-  conn <- duckspatial::ddbs_create_conn()
-  duckspatial::ddbs_write_vector(
-    conn = conn,
-    data = pts_sf,
-    name = "design_points"
-  )
-  duckdb::dbSendQuery(
-    conn,
-    glue::glue(
-      "
-      CREATE TABLE merged AS
-      SELECT dp.*, p.parcel_id,
-      FROM design_points dp
-      LEFT JOIN ST_Read('{parcels_file}', layer='parcels') p
-      ON ST_Within(dp.geometry, p.geom)
-      "
-    )
-  )
-  dp_with_parcels <- duckspatial::ddbs_read_vector(
-    conn = conn,
-    name = "merged"
-  ) |>
-    sf::st_drop_geometry(dp_parcels) |>
-    dplyr::right_join(design_points, by = "id")
-
+    sf::st_transform(sf::st_crs(parcels_vect))
+  dp_vect <- terra::vect(pts_sf)
+  matched <- terra::intersect(dp_vect, parcels_vect)
+  matched_sf <- sf::st_as_sf(matched)
+  dp_with_parcels <- design_points |>
+    dplyr::left_join(matched_sf, by = "id")
   dp_with_parcels
 }

From 79a53f8b50eeb80f8b0ddf334ea328498d1e8359 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 17:07:00 -0500
Subject: [PATCH 26/67] fix R CMD check issues

---
 modules/data.land/DESCRIPTION                | 4 ++++
 modules/data.land/R/CIMIS-ET.R               | 1 +
 modules/data.land/man/extract_cimis_dates.Rd | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION
index 95409ed8f18..9deef73793c 100644
--- a/modules/data.land/DESCRIPTION
+++ b/modules/data.land/DESCRIPTION
@@ -25,6 +25,7 @@ URL: https://pecanproject.github.io
 BugReports: https://github.com/PecanProject/pecan/issues
 Depends: R (>= 4.1.0)
 Imports:
+    arrow,
     coda,
     curl,
     dplyr,
@@ -35,6 +36,8 @@ Imports:
     fs,
     future,
     furrr,
+    httr,
+    httr2,
     lubridate,
     magrittr,
     ncdf4 (>= 1.15),
@@ -47,6 +50,7 @@ Imports:
     sf,
     stringr,
     terra,
+    tibble,
     tidyr,
     tidyselect,
     XML (>= 3.98-1.4)
diff --git a/modules/data.land/R/CIMIS-ET.R b/modules/data.land/R/CIMIS-ET.R
index f1503e7bd1d..26462631dae 100644
--- a/modules/data.land/R/CIMIS-ET.R
+++ b/modules/data.land/R/CIMIS-ET.R
@@ -76,6 +76,7 @@ extract_cimis_date <- function(
 #' Extract CIMIS reference ET for multiple dates
 #'
 #' @param dates Sequence of dates for which to extract data
+#' @param ... Additional arguments to `purrr::map`
 #' @inheritParams extract_cimis_date
 #'
 #' @return `design_points` `data.frame` extended with ETref data for all dates.
diff --git a/modules/data.land/man/extract_cimis_dates.Rd b/modules/data.land/man/extract_cimis_dates.Rd
index 24f3a4ef66d..caa1671ef14 100644
--- a/modules/data.land/man/extract_cimis_dates.Rd
+++ b/modules/data.land/man/extract_cimis_dates.Rd
@@ -11,6 +11,8 @@ extract_cimis_dates(design_points, dates, ...)
 `location_id`, `lat`, and `lon`}
 
 \item{dates}{Sequence of dates for which to extract data}
+
+\item{...}{Additional arguments to `purrr::map`}
 }
 \value{
 `design_points` `data.frame` extended with ETref data for all dates.

From 6b97e3dd85e6433a3fd6f9fce2bee1d09e9f65d9 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 5 Mar 2026 17:20:51 -0500
Subject: [PATCH 27/67] fix non-unicode chars in data.land

---
 modules/data.land/R/gSSURGO_Query.R    | 14 +++++++-------
 modules/data.land/man/gSSURGO.Query.Rd |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/modules/data.land/R/gSSURGO_Query.R b/modules/data.land/R/gSSURGO_Query.R
index 21d6be6865b..1f8d4942b67 100644
--- a/modules/data.land/R/gSSURGO_Query.R
+++ b/modules/data.land/R/gSSURGO_Query.R
@@ -30,10 +30,10 @@
 #' | `chorizon.om_r`        | Organic matter (<2 mm soil)               | %            |
 #' | `chorizon.hzdept_r`    | Horizon top depth                         | cm           |
 #' | `chfrags.fragvol_r`    | Rock fragments                            | % (by volume)|
-#' | `chorizon.dbthirdbar_r`| Bulk density at field capacity            | g/cm³        |
+#' | `chorizon.dbthirdbar_r`| Bulk density at field capacity            | g/cm3        |
 #' | `chorizon.ph1to1h2o_r` | Soil pH (1:1 H2O)                         | pH (unitless)|
-#' | `chorizon.cokey`       | Component key (identifier)                | —            |
-#' | `chorizon.chkey`       | Horizon key (identifier)                  | —            |
+#' | `chorizon.cokey`       | Component key (identifier)                | -            |
+#' | `chorizon.chkey`       | Horizon key (identifier)                  | -            |
 #'
 #' **API stability:** The NRCS occasionally modifies the API schema. If queries fail,
 #'   adjustments may be required here to align with the updated structure. 
@@ -202,8 +202,8 @@ ssurgo_mukeys_bbox <- function(bbox) {
     stop(
       paste0(
         "Bounding box area (", format(area, scientific = FALSE),
-        " m²) exceeds maximum allowed area (", format(SSURGO_API_MAX_AREA_M2, scientific = FALSE),
-        " m²). Use ssurgo_mukeys_bigbbox() for large bounding boxes."
+        " m2) exceeds maximum allowed area (", format(SSURGO_API_MAX_AREA_M2, scientific = FALSE),
+        " m2). Use ssurgo_mukeys_bigbbox() for large bounding boxes."
       )
     )
   }
@@ -265,8 +265,8 @@ ssurgo_mukeys_point <- function(point, distance) {
     stop(
       paste0(
         "Search radius area (", format(circle_area, scientific = FALSE),
-        " m²) exceeds maximum allowed area (", format(SSURGO_API_MAX_AREA_M2, scientific = FALSE),
-        " m²)."
+        " m2) exceeds maximum allowed area (", format(SSURGO_API_MAX_AREA_M2, scientific = FALSE),
+        " m2)."
       )
     )
   }
diff --git a/modules/data.land/man/gSSURGO.Query.Rd b/modules/data.land/man/gSSURGO.Query.Rd
index 51d6def7dce..336bcd20e56 100644
--- a/modules/data.land/man/gSSURGO.Query.Rd
+++ b/modules/data.land/man/gSSURGO.Query.Rd
@@ -44,10 +44,10 @@ for full list):\tabular{lll}{
    \code{chorizon.om_r} \tab Organic matter (<2 mm soil) \tab \% \cr
    \code{chorizon.hzdept_r} \tab Horizon top depth \tab cm \cr
    \code{chfrags.fragvol_r} \tab Rock fragments \tab \% (by volume) \cr
-   \code{chorizon.dbthirdbar_r} \tab Bulk density at field capacity \tab g/cm³ \cr
+   \code{chorizon.dbthirdbar_r} \tab Bulk density at field capacity \tab g/cm3 \cr
    \code{chorizon.ph1to1h2o_r} \tab Soil pH (1:1 H2O) \tab pH (unitless) \cr
-   \code{chorizon.cokey} \tab Component key (identifier) \tab — \cr
-   \code{chorizon.chkey} \tab Horizon key (identifier) \tab — \cr
+   \code{chorizon.cokey} \tab Component key (identifier) \tab - \cr
+   \code{chorizon.chkey} \tab Horizon key (identifier) \tab - \cr
 }
 
 
From fabd6c94b57cdbb603f1103e442c0145f52777c9 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 6 Mar 2026 08:32:43 -0500
Subject: [PATCH 28/67] Update PEcAn dependencies

---
 docker/depends/pecan_package_dependencies.csv | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv
index 04bca4c5ea9..95cc5ccc033 100644
--- a/docker/depends/pecan_package_dependencies.csv
+++ b/docker/depends/pecan_package_dependencies.csv
@@ -6,6 +6,7 @@
 "abind",">= 1.4.5","modules/data.atmosphere","Imports",FALSE
 "amerifluxr","*","modules/data.atmosphere","Imports",FALSE
 "arrow","*","modules/data.atmosphere","Imports",FALSE
+"arrow","*","modules/data.land","Imports",FALSE
 "assertthat","*","models/ed","Imports",FALSE
 "assertthat","*","modules/data.atmosphere","Imports",FALSE
 "BayesianTools","*","modules/assim.batch","Imports",FALSE
@@ -134,6 +135,7 @@
 "httr","*","modules/data.atmosphere","Imports",FALSE
 "httr","*","modules/data.land","Suggests",FALSE
 "httr","*","modules/data.remote","Suggests",FALSE
+"httr2","*","modules/data.land","Imports",FALSE
 "IDPmisc","*","modules/assim.batch","Imports",FALSE
 "imager","*","models/peprmt","Suggests",FALSE
 "itertools","*","modules/assim.sequential","Suggests",FALSE
@@ -669,6 +671,7 @@
 "tibble","*","models/fates","Imports",FALSE
 "tibble","*","models/lpjguess","Imports",FALSE
 "tibble","*","modules/data.atmosphere","Imports",FALSE
+"tibble","*","modules/data.land","Imports",FALSE
 "tibble","*","modules/data.remote","Suggests",FALSE
 "tibble","*","modules/meta.analysis","Suggests",FALSE
 "tictoc","*","modules/assim.sequential","Suggests",FALSE

From 4e63faebfc7b552b17dbc5996312555e88ad19cb Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 6 Mar 2026 09:40:25 -0500
Subject: [PATCH 29/67] minor cleanup of ssurgo mukey code

---
 modules/data.land/R/gSSURGO_Query.R           | 105 ++++++++----------
 .../data.land/man/SSURGO_API_MAX_AREA_M2.Rd   |  16 +++
 modules/data.land/man/parse_mukey_response.Rd |  14 +++
 3 files changed, 79 insertions(+), 56 deletions(-)
 create mode 100644 modules/data.land/man/SSURGO_API_MAX_AREA_M2.Rd
 create mode 100644 modules/data.land/man/parse_mukey_response.Rd

diff --git a/modules/data.land/R/gSSURGO_Query.R b/modules/data.land/R/gSSURGO_Query.R
index 1f8d4942b67..bc3fbac72f4 100644
--- a/modules/data.land/R/gSSURGO_Query.R
+++ b/modules/data.land/R/gSSURGO_Query.R
@@ -138,7 +138,8 @@ gSSURGO.Query <- function(mukeys,
   
 }
 
-SSURGO_API_MAX_AREA_M2 <- 10100000000
+#' Maximum area for SSURGO API requests
+SSURGO_API_MAX_AREA_M2 <- 10100000000  # nolint: object_name_linter
 
 #' Get map unit keys (mukeys) from gSSURGO
 #'
@@ -192,9 +193,16 @@ ssurgo_mukeys_bbox <- function(bbox) {
 
   wgs84_crs <- sf::st_crs(4326)
 
-  bbox_poly <- sf::st_polygon(list(
-    matrix(c(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin), ncol = 2, byrow = TRUE)
-  ))
+  # Calculate the area of the bbox to make sure that it's smaller than the
+  # SSURGO limit (`SSURGO_API_MAX_AREA_M2`).
+  bbox_matrix <- rbind(
+    c(xmin, ymin),
+    c(xmax, ymin),
+    c(xmax, ymax),
+    c(xmin, ymax),
+    c(xmin, ymin)
+  )
+  bbox_poly <- sf::st_polygon(list(bbox_matrix))
   bbox_sf <- sf::st_sfc(bbox_poly, crs = wgs84_crs)
   area <- as.numeric(sf::st_area(bbox_sf))
 
@@ -225,23 +233,7 @@ ssurgo_mukeys_bbox <- function(bbox) {
 
   httr2::resp_check_status(resp)
 
-  resp_text <- httr2::resp_body_string(resp)
-
-  resp_xml <- XML::xmlParse(resp_text)
-
-  mukey_nodes <- XML::getNodeSet(resp_xml, "//MapUnitKeyList")
-
-  if (length(mukey_nodes) == 0) {
-    return(character(0))
-  }
-
-  mukey_str <- XML::xmlValue(mukey_nodes[[1]])
-
-  if (is.null(mukey_str) || nchar(trimws(mukey_str)) == 0) {
-    return(character(0))
-  }
-
-  mukeys <- unique(strsplit(trimws(mukey_str), ",")[[1]])
+  mukeys <- unique(parse_mukey_response(resp))
 
   mukeys
 }
@@ -300,23 +292,7 @@ ssurgo_mukeys_point <- function(point, distance) {
 
   httr2::resp_check_status(resp)
 
-  resp_text <- httr2::resp_body_string(resp)
-
-  resp_xml <- XML::xmlParse(resp_text)
-
-  mukey_nodes <- XML::getNodeSet(resp_xml, "//MapUnitKeyList")
-
-  if (length(mukey_nodes) == 0) {
-    return(character(0))
-  }
-
-  mukey_str <- XML::xmlValue(mukey_nodes[[1]])
-
-  if (is.null(mukey_str) || nchar(trimws(mukey_str)) == 0) {
-    return(character(0))
-  }
-
-  mukeys <- unique(strsplit(trimws(mukey_str), ",")[[1]])
+  mukeys <- unique(parse_mukey_response(resp))
 
   mukeys
 }
@@ -339,16 +315,21 @@ ssurgo_mukeys_bigbbox <- function(bbox) {
 
   wgs84_crs <- sf::st_crs(4326)
 
-  bbox_poly <- sf::st_polygon(list(
-    matrix(c(xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin), ncol = 2, byrow = TRUE)
-  ))
+  # Get the total bbox area.
+  bbox_matrix <- rbind(
+    c(xmin, ymin),
+    c(xmax, ymin),
+    c(xmax, ymax),
+    c(xmin, ymax),
+    c(xmin, ymin)
+  )
+  bbox_poly <- sf::st_polygon(list(bbox_matrix))
   bbox_sf <- sf::st_sfc(bbox_poly, crs = wgs84_crs)
 
   bbox_area <- as.numeric(sf::st_area(bbox_sf))
 
-  bbox_wgs84_box <- sf::st_bbox(bbox_sf)
-  width_deg <- bbox_wgs84_box["xmax"] - bbox_wgs84_box["xmin"]
-  height_deg <- bbox_wgs84_box["ymax"] - bbox_wgs84_box["ymin"]
+  width_deg <- xmax - xmin
+  height_deg <- ymax - ymin
 
   aspect_ratio <- width_deg / height_deg
 
@@ -388,7 +369,11 @@ ssurgo_mukeys_bigbbox <- function(bbox) {
       httr2::req_url_query(!!!query)
   })
 
-  reqs_throttled <- purrr::map(reqs, ~ .x |> httr2::req_throttle(10 / 60))
+  reqs_throttled <- reqs |>
+    # max 10 tries per minute
+    purrr::map(httr2::req_throttle, capacity = 10) |>
+    # keep trying for 2 minutes before giving up
+    purrr::map(httr2::req_retry, max_seconds = 120)
 
   resps <- httr2::req_perform_parallel(
     reqs_throttled,
@@ -400,17 +385,7 @@ ssurgo_mukeys_bigbbox <- function(bbox) {
   parse_mukeys <- function(resp) {
     if (inherits(resp, "httr2_response")) {
       tryCatch({
-        resp_text <- httr2::resp_body_string(resp)
-        resp_xml <- XML::xmlParse(resp_text)
-        mukey_nodes <- XML::getNodeSet(resp_xml, "//MapUnitKeyList")
-        if (length(mukey_nodes) == 0) {
-          return(character(0))
-        }
-        mukey_str <- XML::xmlValue(mukey_nodes[[1]])
-        if (is.null(mukey_str) || nchar(trimws(mukey_str)) == 0) {
-          return(character(0))
-        }
-        strsplit(trimws(mukey_str), ",")[[1]]
+        parse_mukey_response(resp)
       }, error = function(e) {
         character(0)
       })
@@ -423,3 +398,21 @@ ssurgo_mukeys_bigbbox <- function(bbox) {
 
   unique(unlist(mukeys_list, use.names = FALSE))
 }
+
+#' Parse responses from the mukey WFS service
+#'
+#' @params resp `httr2` response object from SSURGO mukey WFS API
+#' @return character vector of mukeys
+parse_mukey_response <- function(resp) {
+  resp_text <- httr2::resp_body_string(resp)
+  resp_xml <- XML::xmlParse(resp_text)
+  mukey_nodes <- XML::getNodeSet(resp_xml, "//MapUnitKeyList")
+  if (length(mukey_nodes) == 0) {
+    return(character(0))
+  }
+  mukey_str <- XML::xmlValue(mukey_nodes[[1]])
+  if (is.null(mukey_str) || nchar(trimws(mukey_str)) == 0) {
+    return(character(0))
+  }
+  strsplit(trimws(mukey_str), ",")[[1]]
+}
diff --git a/modules/data.land/man/SSURGO_API_MAX_AREA_M2.Rd b/modules/data.land/man/SSURGO_API_MAX_AREA_M2.Rd
new file mode 100644
index 00000000000..c07c2d0249d
--- /dev/null
+++ b/modules/data.land/man/SSURGO_API_MAX_AREA_M2.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gSSURGO_Query.R
+\docType{data}
+\name{SSURGO_API_MAX_AREA_M2}
+\alias{SSURGO_API_MAX_AREA_M2}
+\title{Maximum area for SSURGO API requests}
+\format{
+An object of class \code{numeric} of length 1.
+}
+\usage{
+SSURGO_API_MAX_AREA_M2
+}
+\description{
+Maximum area for SSURGO API requests
+}
+\keyword{datasets}
diff --git a/modules/data.land/man/parse_mukey_response.Rd b/modules/data.land/man/parse_mukey_response.Rd
new file mode 100644
index 00000000000..0967a8c602d
--- /dev/null
+++ b/modules/data.land/man/parse_mukey_response.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gSSURGO_Query.R
+\name{parse_mukey_response}
+\alias{parse_mukey_response}
+\title{Parse responses from the mukey WFS service}
+\usage{
+parse_mukey_response(resp)
+}
+\value{
+character vector of mukeys
+}
+\description{
+Parse responses from the mukey WFS service
+}

From 763acb36623b60c47822f5ceddfd539ea0a708ed Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 10 Mar 2026 13:29:19 -0400
Subject: [PATCH 30/67] remove duckdb and duckspatial (again)

...as part of the rebase against master.
---
 modules/data.land/DESCRIPTION | 2 --
 1 file changed, 2 deletions(-)

diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION
index 9deef73793c..948827429aa 100644
--- a/modules/data.land/DESCRIPTION
+++ b/modules/data.land/DESCRIPTION
@@ -29,8 +29,6 @@ Imports:
     coda,
     curl,
     dplyr,
-    duckdb,
-    duckspatial,
     dplR,
     foreach,
     fs,

From 32944090d42ae28c4241e1320023e0ab605cf0e2 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 3 Apr 2026 14:41:59 -0400
Subject: [PATCH 31/67] update pecan_package_dependencies

---
 docker/depends/pecan_package_dependencies.csv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv
index 95cc5ccc033..e0ffb6791c7 100644
--- a/docker/depends/pecan_package_dependencies.csv
+++ b/docker/depends/pecan_package_dependencies.csv
@@ -49,6 +49,7 @@
 "doSNOW","*","modules/data.atmosphere","Suggests",FALSE
 "doSNOW","*","modules/data.land","Suggests",FALSE
 "doSNOW","*","modules/data.remote","Suggests",FALSE
+"dplR","*","modules/data.land","Imports",FALSE
 "dplR","*","modules/data.land","Suggests",FALSE
 "dplyr","*","base/qaqc","Imports",FALSE
 "dplyr","*","base/remote","Imports",FALSE
@@ -71,7 +72,6 @@
 "dplyr",">= 1.1.2","base/db","Imports",FALSE
 "dygraphs","*","models/peprmt","Suggests",FALSE
 "earthdatalogin","*","models/peprmt","Suggests",FALSE
-"duckspatial","*","modules/data.land","Imports",FALSE
 "ecmwfr",">= 2.0.0","modules/data.atmosphere","Suggests",FALSE
 "ellipse","*","modules/assim.batch","Imports",FALSE
 "exactextractr","*","modules/assim.sequential","Suggests",FALSE
@@ -133,6 +133,7 @@
 "here","*","base/db","Suggests",FALSE
 "httr","*","base/remote","Imports",FALSE
 "httr","*","modules/data.atmosphere","Imports",FALSE
+"httr","*","modules/data.land","Imports",FALSE
 "httr","*","modules/data.land","Suggests",FALSE
 "httr","*","modules/data.remote","Suggests",FALSE
 "httr2","*","modules/data.land","Imports",FALSE

From 3199406c7df7e8e3b90a49f13d32174c6dd8849c Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sat, 4 Apr 2026 13:42:28 -0400
Subject: [PATCH 32/67] skip CHIRPS tests on CI

On a Docker-based CI, this seems to fail with this error:

```
Opening a /vsi file with the netCDF driver requires Linux userfaultfd to
be available. If running from Docker, --security-opt seccomp=unconfined
might be needed. Or you may set the GDAL_SKIP=netCDF configuration
option to force the use of the HDF5 driver. (GDAL error 1)
```
---
 modules/data.land/tests/testthat/test-CHIRPS-precip.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/data.land/tests/testthat/test-CHIRPS-precip.R b/modules/data.land/tests/testthat/test-CHIRPS-precip.R
index 64715fc9f11..db981116d21 100644
--- a/modules/data.land/tests/testthat/test-CHIRPS-precip.R
+++ b/modules/data.land/tests/testthat/test-CHIRPS-precip.R
@@ -1,5 +1,6 @@
 test_that("extract_chirps_remote returns data for single date", {
   skip_if_offline()
+  skip_on_ci()
 
   pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
 
@@ -16,6 +17,7 @@ test_that("extract_chirps_remote returns data for single date", {
 
 test_that("extract_chirps_remote handles multiple dates in same year", {
   skip_if_offline()
+  skip_on_ci()
 
   pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
   dates <- seq(as.Date("2020-06-01"), as.Date("2020-06-30"), by = "1 day")
@@ -31,6 +33,7 @@ test_that("extract_chirps_remote handles multiple dates in same year", {
 
 test_that("extract_chirps_remote handles dates spanning multiple years", {
   skip_if_offline()
+  skip_on_ci()
 
   pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
   dates <- c(
@@ -49,6 +52,7 @@ test_that("extract_chirps_remote handles dates spanning multiple years", {
 
 test_that("extract_chirps_remote output has correct structure", {
   skip_if_offline()
+  skip_on_ci()
 
   pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
   dates <- seq(as.Date("2020-06-01"), as.Date("2020-06-10"), by = "1 day")

From 4f059c4acdfc259d0ee59e49dcd13d70a0e863fb Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sat, 4 Apr 2026 13:46:38 -0400
Subject: [PATCH 33/67] documentation bugfix

---
 modules/data.land/R/gSSURGO_Query.R           | 2 +-
 modules/data.land/man/parse_mukey_response.Rd | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/data.land/R/gSSURGO_Query.R b/modules/data.land/R/gSSURGO_Query.R
index bc3fbac72f4..83050d9e918 100644
--- a/modules/data.land/R/gSSURGO_Query.R
+++ b/modules/data.land/R/gSSURGO_Query.R
@@ -401,7 +401,7 @@ ssurgo_mukeys_bigbbox <- function(bbox) {
 
 #' Parse responses from the mukey WFS service
 #'
-#' @params resp `httr2` response object from SSURGO mukey WFS API
+#' @param resp `httr2` response object from SSURGO mukey WFS API
 #' @return character vector of mukeys
 parse_mukey_response <- function(resp) {
   resp_text <- httr2::resp_body_string(resp)
diff --git a/modules/data.land/man/parse_mukey_response.Rd b/modules/data.land/man/parse_mukey_response.Rd
index 0967a8c602d..596f532e332 100644
--- a/modules/data.land/man/parse_mukey_response.Rd
+++ b/modules/data.land/man/parse_mukey_response.Rd
@@ -6,6 +6,9 @@
 \usage{
 parse_mukey_response(resp)
 }
+\arguments{
+\item{resp}{`httr2` response object from SSURGO mukey WFS API}
+}
 \value{
 character vector of mukeys
 }

From e31027aeaf2cfe44dfd26ca036192b297482a5de Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 27 Feb 2026 11:32:48 -0500
Subject: [PATCH 34/67] add crop WHC data

---
 modules/data.land/data/crop_whc.rda | Bin 1694 -> 1673 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/data.land/data/crop_whc.rda b/modules/data.land/data/crop_whc.rda
index 98f506de888f18c70764bee3254aa4949e499fec..db521d6444f59c96a120e5a9b075be67f6a5c75f 100644
GIT binary patch
delta 1657
zcmV-<28Q{b4T%j7LRx4!F+o`-Q&|E1`BRY&C4Zgp=E({X*t*SalnSnfik^hT852!3
z$(n|PKmoNHXahq)^$i*T003kF0BAv=h|^O|JyEqDqtwx$9-*KbXnKGE003wJGyniW
zfB*mh0000000000000008Kk6BCzJs70j7We27#ahKmY&$15GqEG#WGzU;qFB00000
z0Dk}g0000000fE%(lp3ShC-fA8f6TgrbZ({pfq9_ngc^k05oVB0BE(JNq+J2N%c_!
zVM3p)9ZM<}d?{E{vBL>V{UxcjmH<U0En8!nVKK#m0u44LA|wzaki@=5E-OhP9uN3L
zo7d!U_WQIzikdK^aEHyuw5lTa9DdxW7Ju6STBsV5F|x8CWV0vV&adG4c~P(s7L5o<
zSeS#2&s_e`mvHLyHgAzXGIMTwbyWAg(N`*>s;E_0xw6L+Q24+r1A`$G@po|L9m|Zp
zeOrX|Uk15@z;~RU8EC<&eFNMg*-W2AR51}4h77>71y%j5&MeL9))l(VDBfTT>3<ji
zFi#Y^UTLMFU}4#Wn2@otA^}0RF^t>1s!<sxHX*U1Fp4WD7)lvkYi$OzKqZh!NemF0
zTY;qUn{VQiRy371)|kKmAQBNqIV6fCAvi&U5eNcshy<Mpq=8%mAksilq=bdrYNjCP
zGzkhqDuPJ8WJpKP2KJTk93YoRLVo}b0qg4t8vH~64iV!`S+Qh&sr0A2xKp!!r$hiQ
zouKnai^P=VHip{<$WJ51IkBcNU^L?i?|nP???3_A(X_dsyKy97t9JI(iBwets;LM-
zZKPg3nw)G7wtKvNO`feR+}dIl!@njxlQ+QLXd6J<2GXqsH#t@nr)8DP1b@;=EI0yq
z2GU6;mfclG%2?C{aAoUZLD!<F5#IlQI3xTgdkl}myCcbl21=uFpcXKfy6U5)A(Es3
z9nL(I=}Ky#0NRviRaQb0oM&+xdbe)NcW<p29o3P$5s0A)6N?U^Nk}N*2m%mD2_Okc
z7$pG^1`!0+6bu0aNDwClV1JcuQ%NO>+2ZpWPqdm?bMsTQBGk{IVut#G&`yZj*mjlT
zobPF&*Hnl^L_}Mf*9}BW8~)iKT5q==Z4igL2WUVBDQLFBRlrD_>}HvV5Q5qi*3oTi
zCvy@23j0P63Rqzr;!y4SDCq}8f$I<1ImdK?0tDd>xzdlF&FgW{0DnzjFtl(}@AL6Z
z9L|`L`2a*14;()0Wa)*N1157eZ6r*)CnAuG7&3`u0ZnlghVl)FFIu!z-gZU_jH;x4
zJv8XI!@tV0@+-DbSs_S(0CkQ7AU!~lF_eH#dauoOMCIVpAeF5a<lcZVh)ow*F(n#-
zu(Jd+fMu^VOs}cy^na`Jd71TvXv<C=B1OU$?!OEO#(5MP01e(X^L{;*C?zH%cFfO(
za(GSd6@UUZX@v{CIb!$LF)`UXRL~{}hHxMwrCflOrw}levg5%JG<9|2yqQfB_w1QP
z$j7F2<utsp%4k9$65rX7gnu33u*}HMJMZYv_{EF>td{JAxPNF!gfOUqNqEQ=QkHxa
zL*=0ch42uBGwg=2hI+yW3UcYebP)#V2)bz;*woCyHBcNO93{K_O0-}Qa1h_ZAjRIR
zV`|FU04i*gi3wewt%RtPKmjIJ0QQ_S;5vpHx)M-`1+kmdnYKN&m{gVl0XmKkC=(1k
zHpqOaegt?BGk<C@=mtO<u-)ZKr#oQ)*b*|92^mrmS%4a-kb(}l!PbO{mub+h@m+#M
zXUUGbLFsKws?%3k?|}(-?A{I5nA=UZ#|w~54W<NlaT~`d)eX%+!B(lBBy$r+r=Zst
zU@qXkTRU9+8RQhS2qZQzHB2JG-z}6BRVIYEmBvK`jeq%*jrx>Pk&m0uJriXVpF);A
z`3z)bX$>Q}=V~<=V~&v5bSi{d37a!yoS_qD6qU9w7%iI`BDfrr1Ou32yxe%&Q8GGe
zV7WI;%-?lZlEh0f@AU7Y!J!Tn24H%!fLtJ%7msp)5K0~u^)lJFF_2jhA|F?V>t}Rf
zL2PBMx?*aeEcQZD7_+%_&}I-_Up6rCr9GUN$!Y4GV+CgD6xg+KKulmNy`JIs{mnMl
z=P57Yl@tW26E2_if26Www%$%&LP;+H|1cXEGGK`S0eF?@Ib{(YKlr<nDZ+$+5AVvD
Dd5z0#

delta 1678
zcmV;9266d`4W11SLRx4!F+o`-Q(3ANZTpc9C4ar}@){I-x$b*oN|Y5;hLr}I35_Nb
z5t9&PVhuDJXks)QQv*maMuDavGGq(^DFHMD(^Jx6H8!E3&>o{EL7-?202*naGypU*
z001JV<YJmc8Vv&=&;S4dpa1{>0iXZ?00y3vl{C``VqrAM88TodLre$;1YsBiVHgNt
z1b+h%U;qFB0000000000000001c?YCHl~A9CTgBECQ!-gs4_AJgGZ<sfYU(G4K!%c
z<r-;?PL&!<`S|1?5@;d<LVu~EPH8f4yp+VEXlIQ+rA7qW&VVG67CaKrpipvPfQFkA
znIaG)mc+iMEtZl(JX~TX^tpT8p5=%UTYp9LYmxTyuoXyO8<%T20@h#)8V06})>cRi
zR^t76_4WSE`C14J14$tw)`SLQ5N&i9L$T;wI=VW!vnkP`h@BE5A{&&|NmW!8RSmD<
zIs$?@;8p>H9&qsPM}{!rvLsb(agvWT*c;;z%YpeXG@_6pgPQyp2?@#!A|^u{34iMn
zwwJq6hhXz6mSG82+p-}x+i9_Oo(GKxG#D54Vr)oRtVn>cZH!|zt}2{{O^t|bXp~76
zl$1_0ooiVKA)pe(BqW9c61Aiy<unVnl3l<>5E21$UPjv(4UIS<w%Tb3P{u$&hA<Q{
zjj&Q;1PZVj4I~APBqT1&RWS!WpnphH5mY3LmL!CI(B4IK4k0d{ng9*a>2d^(Erb9L
zCACFvrLy=b!A!d4or|;jU;&1h0q9F-Vsx0=8*CdQn4V=Q#+bo?)09(hUcax_gpx}u
zvtvI=K|Yc@igI$!NOF-W1C*rL#t9&J-{a$U@^v(D^K<ojmRqlph*(E<EPvcoHvrpc
z8$j9y(yaxW`}UQmZMDwz5=krff|rKUNhX%HRZ80}Y6NcQb#I{SSgHhY_<OLA0ZW)-
zem)#e6ss(~lnAhmK;0!i3WOJB=l~kKW}~49VxRy<YYh>^O*R}58ulVJAv|LnTs$nq
z2NVcMNP+<c+X!TnVsOAf41Z!kHo!@@0FwkTAQ}hVBoF|C5(EkurV^Esgpyv=H~7c`
zo(KqB+@@&}O+5rb4#dG2Go(=JH7>ep*%p!`u-$<XA}4!Cd4l7GQSE91Ua>Vb-CFd3
z)DQrjw`PoNOA!YvMz<2qAWZ@m?#&We)tvwWEkaCo%C=e&7vs-EfPXq<9?<>so_7c!
zf{`6#f?k(9tI9wDMR>l^yJxQgvj}XS2#3f7L1<Mi7qinX1q(poj&r=!q;aJ!sRp@V
z=4K!!cU70wEmsy|(Lzs(>`W4t>5*NFvdv?Ye?sHrO72XSh>-ve(x74!FvsE2&~}6<
zRFFB6Rjz|~I$60HK!0%%&M@$<WU1z99fU0aRTXS1S4%^o7XvcYu_hEuVZ0&~2rRl=
z&`MOqp`ZZEwRImh+LR?WBQ3fQd2=_7z2zVSBfPj2ZY0C6QH74Rtvtk{A(kLVtyF;}
z4nacdWtWJ8mg=gW!-YcYbmMsh-P3u|R9;m~sJem+2JQ7`A%7k25wW(L;P8LV@w0Cc
zX3sm6dD!JNV=OU$7qui?$&PXCo;u70@LCYgxeSCeqJ$L*@u$I9h&PNz=~o`*xw6_X
z*H}=AUodWLRi}7Cx<=*k24@XgnJulL0;b8i5j;9QTjfOB012|N2a(2&<J~XS>P<v|
zCS#R|;{f)~qJJgHfPpP4gVG4a4;ykHDW8FF2sb#Bz%&h7W#64nm?8sENX{)1Gm;Tm
zcp9jff*o>)S`sK><ms$FOSDLpIg-*yJ#3UJz+>bbKt%Yh+YGHSwwrB^Frc6r7!u2v
zZJXvW*Ut;pV>Dz-K%z7KR<?Bm4(-#`$<Wi$m|1`bNPlc#V=s#L5rS}}tfCUwu2U=$
zHz-n9XV|4OoZdm6UqTM#LN!{lQZcg&yef-J20WFN!FLq_>tcwZt42-EB;}SbT`K*8
zy)PpLwd+zZh$$v#Zgjd+R<x<<VP;oKr7842VY7{_)9BcONp%G+;aDpbs|o}P5_!7z
z07PJ@u6L~2T<5hbE4+k{?xK5l^bD|EYFBjh%tbEfl0yz&R4$c>6h)fF<F(i9*lJ$u
z8$w8pm=MO9Y=D@+Qpby<*Zk0uJKgZ`KKu~aNO9qk@f>jzkJ%)+ygkrKc#bv$V+K$W
YFaRzRxdu@L8GHUN<ce^iAyg{c_E$E>O8@`>


From be6ab35bfdd7277fb0ea4905d714e626b6aed68f Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 13 Mar 2026 12:43:38 -0400
Subject: [PATCH 35/67] remove landiq class/subclass fixes

these are already incorporated into harmonized LandIQ v4.1
---
 modules/data.land/R/get-landiq.R | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/modules/data.land/R/get-landiq.R b/modules/data.land/R/get-landiq.R
index 638ef5a0519..0e6a50a832d 100644
--- a/modules/data.land/R/get-landiq.R
+++ b/modules/data.land/R/get-landiq.R
@@ -12,10 +12,6 @@ get_landiq <- function(design_points, parcels_file, crops_file) {
 
   crops <- arrow::read_parquet(crops_file) |>
     dplyr::semi_join(dp_with_parcels, by = "parcel_id") |>
-    dplyr::mutate(
-      dplyr::across(c("CLASS", "SUBCLASS"), ~ dplyr::na_if(.x, "**")),
-      SUBCLASS = as.integer(.data$SUBCLASS)
-    ) |>
     dplyr::select("parcel_id", "year", "season", "CLASS", "SUBCLASS")
 
   dp_with_crops <- dp_with_parcels |>

From b1ca4a4337bf54c7773c0c1a2ae478386a07f0b4 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 13 Mar 2026 12:44:17 -0400
Subject: [PATCH 36/67] working version of irrigation workflow in targets

---
 .../inst/CIMIS-event-files-targets.R          | 429 ++++++++++++++++++
 1 file changed, 429 insertions(+)
 create mode 100644 modules/data.land/inst/CIMIS-event-files-targets.R

diff --git a/modules/data.land/inst/CIMIS-event-files-targets.R b/modules/data.land/inst/CIMIS-event-files-targets.R
new file mode 100644
index 00000000000..b974be2317f
--- /dev/null
+++ b/modules/data.land/inst/CIMIS-event-files-targets.R
@@ -0,0 +1,429 @@
+#' ---
+#' title: "Example workflow generating SIPNET event files from CIMIS and CHIRPS data"
+#' author: "Alexey N. Shiklomanov"
+#' ---
+
+library(targets)
+
+targets_file <- here::here("_targets.R")
+targets_store <- here::here("_targets/")
+tar_config_set(
+  script = targets_file,
+  store = targets_store
+)
+
+#' Write the targets pipeline script to _targets.R in this directory.
+tar_script(
+  code = {
+    library(targets)
+    library(tarchetypes)
+
+    # if (interactive()) {
+    devtools::load_all(here::here("modules/data.land"))
+    # } else {
+    #   library(PEcAn.data.land)
+    # }
+
+    # -------------------------------------------------------------------------
+    # Helper functions
+    # -------------------------------------------------------------------------
+
+    #' Generate a sequence of dates for a given year/season (1=winter, 2=spring,
+    #' 3=summer, 4=fall).
+    fill_season <- function(year, season) {
+      if (season == 1) {
+        start <- lubridate::make_date(year, 1, 1)
+        end   <- lubridate::make_date(year, 3, 31)
+      } else if (season == 2) {
+        start <- lubridate::make_date(year, 4, 1)
+        end   <- lubridate::make_date(year, 6, 30)
+      } else if (season == 3) {
+        start <- lubridate::make_date(year, 7, 1)
+        end   <- lubridate::make_date(year, 9, 30)
+      } else if (season == 4) {
+        start <- lubridate::make_date(year, 10, 1)
+        end   <- lubridate::make_date(year, 12, 31)
+      }
+      seq.Date(start, end, "day")
+    }
+
+    #' Calculate effective available water capacity (mm) for a soil profile
+    #' clipped to a given rooting depth.
+    calc_effective_awc <- function(
+      hzdept_r_cm,
+      hzdepb_r_cm,
+      awc_r,
+      rooting_depth_cm
+    ) {
+      effective_top    <- pmin(hzdept_r_cm, rooting_depth_cm)
+      effective_bottom <- pmin(hzdepb_r_cm, rooting_depth_cm)
+      thickness_cm     <- pmax(0, effective_bottom - effective_top)
+      # awc_r is cm water / cm soil; multiply by thickness -> cm water -> mm water
+      sum(awc_r * thickness_cm, na.rm = TRUE) * 10
+    }
+
+    #' Average ETc and WHC across multi-crop parcels (double-cropping hack).
+    resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
+      id_sym   <- rlang::sym(id_col)
+      date_sym <- rlang::sym(date_col)
+
+      multicrop_counts <- etc_data |>
+        dplyr::add_count(!!id_sym, !!date_sym, name = "n") |>
+        dplyr::filter(.data$n > 1) |>
+        dplyr::summarize(
+          n_multicrop = dplyr::n_distinct(!!id_sym, !!date_sym),
+          .groups = "drop"
+        )
+
+      if (multicrop_counts$n_multicrop > 0) {
+        message(
+          "Multi-crop parcels: ",
+          multicrop_counts$n_multicrop,
+          " date-parcel combinations have multiple crops. Averaging ETc and WHC values."
+        )
+      }
+
+      etc_data |>
+        dplyr::group_by(!!id_sym, !!date_sym) |>
+        dplyr::summarize(
+          etc_mm_day   = mean(.data$etc_mm_day,   na.rm = TRUE),
+          whc_min_frac = mean(.data$whc_min_frac, na.rm = TRUE),
+          whc_mm       = mean(.data$whc_mm,       na.rm = TRUE),
+          .groups = "drop"
+        )
+    }
+
+    # -------------------------------------------------------------------------
+    # Package options
+    # -------------------------------------------------------------------------
+
+    tar_option_set(
+      packages = c(
+        "dplyr", "tidyr", "purrr", "readr", "tibble",
+        "lubridate", "glue", "ggplot2", "arrow", "rlang"
+      )
+    )
+
+    # -------------------------------------------------------------------------
+    # Pipeline
+    # -------------------------------------------------------------------------
+
+    list(
+
+      # --- Inputs from environment variables ---------------------------------
+
+      tar_target(design_points_path, Sys.getenv("DESIGN_POINTS")),
+      tar_target(cimis_eto_cog_path, Sys.getenv("CIMIS_ETO_COG")),
+      tar_target(parcels_path,       Sys.getenv("LANDIQ_PARCELS")),
+      tar_target(crops_path,         Sys.getenv("LANDIQ_CROPS")),
+      tar_target(event_output_dir,   path.expand(Sys.getenv("EVENT_OUTPUT_DIR"))),
+
+      # --- Validate all input paths exist ------------------------------------
+
+      tar_target(validated_paths, {
+        stopifnot(
+          file.exists(design_points_path),
+          dir.exists(cimis_eto_cog_path),
+          file.exists(parcels_path),
+          file.exists(crops_path)
+        )
+        dir.create(event_output_dir, showWarnings = FALSE, recursive = TRUE)
+        TRUE
+      }),
+
+      # --- Base inputs -------------------------------------------------------
+
+      tar_target(
+        dates,
+        seq.Date(as.Date("2020-03-01"), as.Date("2020-11-30"), "day")
+      ),
+
+      tar_target(
+        design_points,
+        readr::read_csv(design_points_path, show_col_types = FALSE) |> head(10)
+      ),
+
+      # --- Remote data extractions (slow; most benefit from caching) ---------
+
+      tar_target(
+        etref,
+        design_points |>
+          extract_cimis_dates(dates, cimis_eto_cog_path, .progress = TRUE)
+      ),
+
+      tar_target(
+        precip,
+        extract_chirps_remote(design_points, dates)
+      ),
+
+      # --- LandIQ crop data --------------------------------------------------
+
+      tar_target(
+        dp_with_crops,
+        get_landiq(
+          design_points,
+          parcels_file = parcels_path,
+          crops_file   = crops_path
+        ) |>
+          tibble::as_tibble()
+      ),
+
+      #' NOTE: Some LandIQ classes/subclasses map onto multiple BISM crop types.
+      #' HACK: select just the first crop per class/subclass group.
+      tar_target(
+        bism_crop_unique,
+        bism_kc_by_crop |>
+          dplyr::distinct(landiq_class, landiq_subclass, crop_name) |>
+          dplyr::slice(1, .by = c("landiq_class", "landiq_subclass"))
+      ),
+
+      tar_target(
+        design_point_crops,
+        dp_with_crops |>
+          dplyr::left_join(
+            bism_crop_unique,
+            by = c("CLASS" = "landiq_class", "SUBCLASS" = "landiq_subclass")
+          )
+      ),
+
+      #' Expand crop seasons to daily rows using hard-coded quarterly dates.
+      #' In reality these would be resolved from phenology data.
+      tar_target(
+        dp_crops_filled,
+        design_point_crops |>
+          dplyr::filter(!is.na(season)) |>
+          tidyr::fill(
+            "CLASS", "SUBCLASS", "crop_name",
+            .direction = "downup",
+            .by = "parcel_id"
+          ) |>
+          dplyr::mutate(date = purrr::map2(year, season, fill_season)) |>
+          tidyr::unnest(date) |>
+          dplyr::filter(date %in% dates)
+      ),
+
+      #' Warn about parcels with no matching BIS crop, then filter them out.
+      tar_target(
+        dp_with_cropname, {
+          missing_crops <- dp_crops_filled |> dplyr::filter(is.na(crop_name))
+          if (nrow(missing_crops) > 0) {
+            missing_crop_strs <- missing_crops |>
+              dplyr::distinct(CLASS, SUBCLASS) |>
+              dplyr::mutate(
+                string = glue::glue("CLASS: {CLASS} SUBCLASS: {SUBCLASS}")
+              ) |>
+              dplyr::pull(string)
+            warning(
+              "Skipping ", nrow(missing_crops),
+              " rows with no matching BIS crop. Relevant pairs are: [",
+              paste(missing_crop_strs, collapse = "; "), "]"
+            )
+          }
+          dp_crops_filled |>
+            dplyr::filter(!is.na(crop_name)) |>
+            dplyr::left_join(
+              crop_whc |>
+                dplyr::select("crop_name", "whc_min_frac", "rooting_depth_m"),
+              by = "crop_name"
+            )
+        }
+      ),
+
+      # --- SSURGO soil data --------------------------------------------------
+
+      tar_target(
+        design_points_sf,
+        dplyr::distinct(design_points, id, lon, lat)
+      ),
+
+      tar_target(
+        mukeys_list,
+        purrr::map2(
+          design_points_sf$lon,
+          design_points_sf$lat,
+          ~ ssurgo_mukeys_point(point = c(.x, .y), distance = 20)
+        )
+      ),
+
+      tar_target(
+        soil_raw,
+        gSSURGO.Query(
+          mukeys = unique(unlist(mukeys_list)),
+          fields = c("chorizon.awc_r", "chorizon.hzdept_r", "chorizon.hzdepb_r")
+        )
+      ),
+
+      tar_target(
+        soil_dominant,
+        soil_raw |>
+          dplyr::filter(cokey == cokey[which.max(comppct_r)], .by = "mukey")
+      ),
+
+      tar_target(
+        dp_with_whc,
+        dp_with_cropname |>
+          dplyr::mutate(
+            mukey = mukeys_list[match(id, design_points_sf$id)]
+          ) |>
+          tidyr::unnest(mukey) |>
+          dplyr::mutate(mukey = as.numeric(mukey)) |>
+          dplyr::left_join(
+            soil_dominant,
+            by = "mukey",
+            relationship = "many-to-many"
+          ) |>
+          dplyr::summarize(
+            whc_mm = calc_effective_awc(
+              hzdept_r, hzdepb_r, awc_r,
+              rooting_depth_cm = rooting_depth_m[[1]] * 100
+            ),
+            .by = c("id", "parcel_id", "date", "crop_name", "whc_min_frac")
+          ) |>
+          dplyr::mutate(
+            whc_mm = dplyr::if_else(whc_mm > 0, whc_mm, 500, missing = 500)
+          )
+      ),
+
+      # --- ETc and water balance ---------------------------------------------
+
+      tar_target(
+        dp_with_eto,
+        dp_with_whc |>
+          dplyr::left_join(
+            etref |> dplyr::select("id", "date", "etref_mm_day"),
+            by = c("id", "date")
+          )
+      ),
+
+      tar_target(
+        dp_with_etc,
+        dp_with_eto |>
+          dplyr::mutate(
+            etc_mm_day = eto_to_etc_bism(
+              eto       = etref_mm_day,
+              crop_name = crop_name[[1]],
+              date      = date
+            ),
+            .by = "crop_name"
+          ) |>
+          dplyr::select(
+            dplyr::any_of(c("id", "parcel_id", "lat", "lon")),
+            "date", "etc_mm_day", "whc_min_frac", "whc_mm"
+          ) |>
+          resolve_multicrop()
+      ),
+
+      tar_target(
+        dp_crops_all,
+        dp_with_etc |>
+          dplyr::inner_join(precip, by = c("id", "date")) |>
+          dplyr::select(
+            "id", "lat", "lon", "date",
+            "etc_mm_day", "precip_mm_day", "whc_min_frac", "whc_mm"
+          )
+      ),
+
+      tar_target(
+        dpwb,
+        apply_water_balance(dp_crops_all, "id")
+      ),
+
+      # --- Diagnostics -------------------------------------------------------
+
+      tar_target(
+        etc_summary,
+        dp_crops_all |>
+          dplyr::summarize(
+            etc_min  = min(.data$etc_mm_day,  na.rm = TRUE),
+            etc_max  = max(.data$etc_mm_day,  na.rm = TRUE),
+            etc_mean = mean(.data$etc_mm_day, na.rm = TRUE),
+            .by = "id"
+          )
+      ),
+
+      tar_target(
+        wb_summary,
+        dpwb |>
+          dplyr::group_by(.data$id) |>
+          dplyr::summarize(
+            irr_total    = sum(.data$irr,   na.rm = TRUE),
+            irr_max      = max(.data$irr,   na.rm = TRUE),
+            irr_mean     = mean(.data$irr,  na.rm = TRUE),
+            runoff_total = sum(.data$runoff, na.rm = TRUE),
+            W_t_min      = min(.data$W_t,   na.rm = TRUE),
+            W_t_max      = max(.data$W_t,   na.rm = TRUE),
+            .groups = "drop"
+          ) |>
+          (\(x) {
+            print(x)
+            if (any(x$irr_max < 0))  warning("Negative irrigation values detected!")
+            else                      message("Irrigation values are non-negative")
+            if (any(x$W_t_min < 0))  warning("Negative soil water values detected!")
+            else                      message("Soil water values are non-negative")
+            x
+          })()
+      ),
+
+      tar_target(
+        monthly_irr,
+        dpwb |>
+          dplyr::mutate(month = lubridate::month(.data$date)) |>
+          dplyr::group_by(.data$month) |>
+          dplyr::summarize(irr_mean = mean(.data$irr, na.rm = TRUE), .groups = "drop") |>
+          (\(x) { print(x); x })()
+      ),
+
+      # --- Plot (saved as PNG) -----------------------------------------------
+
+      tar_target(
+        irrigation_plot, {
+          p <- dpwb |>
+            ggplot2::ggplot() +
+            ggplot2::aes(x = date, y = irr, color = id) +
+            ggplot2::geom_line() +
+            ggplot2::labs(
+              title = "Irrigation Requirements by Site",
+              y     = "Irrigation (mm/day)"
+            )
+          path <- file.path(event_output_dir, "irrigation_plot.png")
+          ggplot2::ggsave(path, p, width = 10, height = 6)
+          path
+        },
+        format = "file"
+      ),
+
+      # --- Write SIPNET event files ------------------------------------------
+
+      tar_target(
+        event_files, {
+          dpwb |>
+            dplyr::group_nest(.data$id) |>
+            dplyr::mutate(
+              fname = purrr::map2(
+                id, data,
+                \(site_id, dat) {
+                  readr::write_delim(
+                    create_event_file(dat),
+                    file.path(
+                      event_output_dir,
+                      glue::glue("{site_id}_events.txt")
+                    ),
+                    delim      = " ",
+                    col_names  = FALSE
+                  )
+                }
+              )
+            )
+          list.files(event_output_dir, full.names = TRUE,
+                     pattern = "_events\\.txt$")
+        },
+        format = "file"
+      )
+
+    )
+  },
+  ask = FALSE
+)
+
+#' Run the pipeline. Targets that are already up-to-date will be skipped.
+tar_make()

From b2f633a950c9525a7ab50a5c5fd30d637e7a078a Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 13 Mar 2026 16:56:59 -0400
Subject: [PATCH 37/67] Add mslsp_to_canopycover

---
 modules/data.land/NAMESPACE                   |  1 +
 modules/data.land/R/mslsp_to_canopycover.R    | 79 +++++++++++++++++++
 modules/data.land/man/mslsp_to_canopycover.Rd | 25 ++++++
 3 files changed, 105 insertions(+)
 create mode 100644 modules/data.land/R/mslsp_to_canopycover.R
 create mode 100644 modules/data.land/man/mslsp_to_canopycover.Rd

diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index 2cfe32be5a2..863ee2e96f5 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -55,6 +55,7 @@ export(matchInventoryRings)
 export(match_pft)
 export(match_species_id)
 export(mpot2smoist)
+export(mslsp_to_canopycover)
 export(ndti_to_sipnet_tillage)
 export(netcdf.writer.BADM)
 export(om2soc)
diff --git a/modules/data.land/R/mslsp_to_canopycover.R b/modules/data.land/R/mslsp_to_canopycover.R
new file mode 100644
index 00000000000..8c0ce7b1cd7
--- /dev/null
+++ b/modules/data.land/R/mslsp_to_canopycover.R
@@ -0,0 +1,79 @@
+#' Convert MSLSP phenology data to tidy canopy cover
+#'
+#' @param mslsp_path Path to directory containing MSLSP outputs (in parquet format)
+#' @param parcel_ids Vector of parcel IDs for filtering. If `NULL`, use all parcels.
+#' @param years Vector of years for filtering. If `NULL`, use all years.
+#'
+#' @return `data.frame` of `parcel_id`, `year`, `season`, `date`, and
+#' `canopy_cover` (fraction). `date` is a sequence from the MSLSP greenness
+#' onset (growing season start) to the greenness minimum (growing season end).
+#' `canopy_cover` is the fractional canpoy cover (0 to 1), suitable for ingest
+#' into [eto_to_etc_bism()] in "canopy cover mode".
+#' @export
+mslsp_to_canopycover <- function(mslsp_path, parcel_ids = NULL, years = NULL) {
+  stopifnot(dir.exists(mslsp_path))
+  mslsp_files <- list.files(mslsp_path, "\\.parquet", full.names = TRUE)
+  mslsp_dat <- arrow::open_dataset(mslsp_files)
+  if (!is.null(parcel_ids)) {
+    mslsp_dat <- mslsp_dat |>
+      dplyr::filter(.data[["parcel_id"]] %in% unique(parcel_ids))
+  }
+  if (!is.null(years)) {
+    mslsp_dat <- mslsp_dat |>
+      dplyr::filter(.data[["year"]] %in% years)
+  }
+  common_cols <- c(
+    "parcel_id",
+    "year",
+    "season",
+    "mslsp_cycle",
+    "landiq_PFT",
+    "landiq_CLASS",
+    "landiq_SUBCLASS"
+  )
+  mslsp_tbl <- mslsp_dat |>
+    dplyr::filter(!is.na(.data[["mslsp_cycle"]])) |>
+    dplyr::select(
+      dplyr::all_of(common_cols),
+      dplyr::all_of(.MSLSP_DATE_MAPPING[["date_name"]])
+    ) |>
+    dplyr::collect() |>
+    tibble::as_tibble()
+
+  result <- mslsp_tbl |>
+    dplyr::mutate(
+      cc_nested = purrr::pmap(
+        dplyr::pick(dplyr::all_of(.MSLSP_DATE_MAPPING[["date_name"]])),
+        \(...) expand_mslsp_cycle(list(...)),
+        .progress = TRUE
+      )
+    ) |>
+    dplyr::select(dplyr::all_of(common_cols), "cc_nested") |>
+    tidyr::unnest("cc_nested")
+
+  result
+}
+
+expand_mslsp_cycle <- function(mslsp_row) {
+  dates <- as.Date(unlist(mslsp_row[.MSLSP_DATE_MAPPING[["date_name"]]]))
+  all_dates <- seq(min(dates), max(dates), by = "1 day")
+  tibble::tibble(
+    date = all_dates,
+    canopy_cover = approx(
+      x = dates,
+      y = .MSLSP_DATE_MAPPING$canopy_cover,
+      xout = all_dates
+    )[["y"]]
+  )
+}
+
+.MSLSP_DATE_MAPPING <- tibble::tribble(
+  ~date_name, ~canopy_cover,
+  "mslsp_OGI", 0.15,
+  "mslsp_50PCGI", 0.5,
+  "mslsp_OGMx", 0.9,
+  "mslsp_Peak", 1.0,
+  "mslsp_OGD", 0.9,
+  "mslsp_50PCGD", 0.5,
+  "mslsp_OGMn", 0.15
+)
diff --git a/modules/data.land/man/mslsp_to_canopycover.Rd b/modules/data.land/man/mslsp_to_canopycover.Rd
new file mode 100644
index 00000000000..9998f68cb07
--- /dev/null
+++ b/modules/data.land/man/mslsp_to_canopycover.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mslsp_to_canopycover.R
+\name{mslsp_to_canopycover}
+\alias{mslsp_to_canopycover}
+\title{Convert MSLSP phenology data to tidy canopy cover}
+\usage{
+mslsp_to_canopycover(mslsp_path, parcel_ids = NULL, years = NULL)
+}
+\arguments{
+\item{mslsp_path}{Path to directory containing MSLSP outputs (in parquet format)}
+
+\item{parcel_ids}{Vector of parcel IDs for filtering. If `NULL`, use all parcels.}
+
+\item{years}{Vector of years for filtering. If `NULL`, use all years.}
+}
+\value{
+`data.frame` of `parcel_id`, `year`, `season`, `date`, and
+`canopy_cover` (fraction). `date` is a sequence from the MSLSP greenness
+onset (growing season start) to the greenness minimum (growing season end).
+`canopy_cover` is the fractional canpoy cover (0 to 1), suitable for ingest
+into [eto_to_etc_bism()] in "canopy cover mode".
+}
+\description{
+Convert MSLSP phenology data to tidy canopy cover
+}

From 87c4dda35ccf1dd9cc19c161f6043d2ab62ab884 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 13 Mar 2026 16:57:27 -0400
Subject: [PATCH 38/67] WIP targets pipeline for more sites

Works up to the ET download. ET download is slow --- need to just
pre-process existing CIMIS download on BU.
---
 modules/data.land/R/CIMIS-ET.R                |   6 +-
 modules/data.land/R/get-landiq-parcels.R      |   3 +-
 modules/data.land/R/get-landiq.R              |  10 +-
 .../inst/CIMIS-event-files-targets.R          | 221 ++++++++----------
 modules/data.land/man/extract_cimis_dates.Rd  |   8 +-
 5 files changed, 120 insertions(+), 128 deletions(-)

diff --git a/modules/data.land/R/CIMIS-ET.R b/modules/data.land/R/CIMIS-ET.R
index 26462631dae..265cd52b897 100644
--- a/modules/data.land/R/CIMIS-ET.R
+++ b/modules/data.land/R/CIMIS-ET.R
@@ -76,16 +76,18 @@ extract_cimis_date <- function(
 #' Extract CIMIS reference ET for multiple dates
 #'
 #' @param dates Sequence of dates for which to extract data
-#' @param ... Additional arguments to `purrr::map`
+#' @param ... Additional arguments to [extract_cimis_date()]
 #' @inheritParams extract_cimis_date
+#' @inheritParams purrr::map
 #'
 #' @return `design_points` `data.frame` extended with ETref data for all dates.
 #' @export
-extract_cimis_dates <- function(design_points, dates, ...) {
+extract_cimis_dates <- function(design_points, dates, .progress = FALSE, ...) {
   df_list <- purrr::map(
     dates,
     purrr::possibly(extract_cimis_date, NULL, quiet = FALSE),
     design_points = design_points,
+    .progress = .progress,
     ...
   )
   dplyr::bind_rows(df_list)
diff --git a/modules/data.land/R/get-landiq-parcels.R b/modules/data.land/R/get-landiq-parcels.R
index 29d5d575784..f5ca39ba987 100644
--- a/modules/data.land/R/get-landiq-parcels.R
+++ b/modules/data.land/R/get-landiq-parcels.R
@@ -14,7 +14,8 @@ get_landiq_parcel_ids <- function(design_points, parcels_file) {
     sf::st_transform(sf::st_crs(parcels_vect))
   dp_vect <- terra::vect(pts_sf)
   matched <- terra::intersect(dp_vect, parcels_vect)
-  matched_sf <- sf::st_as_sf(matched)
+  matched_sf <- sf::st_as_sf(matched) |>
+    sf::st_drop_geometry()
   dp_with_parcels <- design_points |>
     dplyr::left_join(matched_sf, by = "id")
   dp_with_parcels
diff --git a/modules/data.land/R/get-landiq.R b/modules/data.land/R/get-landiq.R
index 0e6a50a832d..e6ed76a72e8 100644
--- a/modules/data.land/R/get-landiq.R
+++ b/modules/data.land/R/get-landiq.R
@@ -10,12 +10,14 @@
 get_landiq <- function(design_points, parcels_file, crops_file) {
   dp_with_parcels <- get_landiq_parcel_ids(design_points, parcels_file)
 
-  crops <- arrow::read_parquet(crops_file) |>
-    dplyr::semi_join(dp_with_parcels, by = "parcel_id") |>
-    dplyr::select("parcel_id", "year", "season", "CLASS", "SUBCLASS")
+  crops <- arrow::open_dataset(crops_file) |>
+    dplyr::filter(.data$parcel_id %in% unique(dp_with_parcels[["parcel_id"]])) |> 
+    dplyr::select("parcel_id", "year", "season", "CLASS", "SUBCLASS") |>
+    dplyr::collect()
 
   dp_with_crops <- dp_with_parcels |>
-    dplyr::left_join(crops, by = "parcel_id")
+    dplyr::left_join(crops, by = "parcel_id") |>
+    tibble::as_tibble()
 
   dp_with_crops
 }
diff --git a/modules/data.land/inst/CIMIS-event-files-targets.R b/modules/data.land/inst/CIMIS-event-files-targets.R
index b974be2317f..0cd22a2b9e6 100644
--- a/modules/data.land/inst/CIMIS-event-files-targets.R
+++ b/modules/data.land/inst/CIMIS-event-files-targets.R
@@ -5,6 +5,10 @@
 
 library(targets)
 
+devtools::document("modules/data.land")
+devtools::install("modules/data.land", upgrade = FALSE)
+devtools::reload("modules/data.land")
+
 targets_file <- here::here("_targets.R")
 targets_store <- here::here("_targets/")
 tar_config_set(
@@ -18,35 +22,10 @@ tar_script(
     library(targets)
     library(tarchetypes)
 
-    # if (interactive()) {
-    devtools::load_all(here::here("modules/data.land"))
-    # } else {
-    #   library(PEcAn.data.land)
-    # }
-
     # -------------------------------------------------------------------------
     # Helper functions
     # -------------------------------------------------------------------------
 
-    #' Generate a sequence of dates for a given year/season (1=winter, 2=spring,
-    #' 3=summer, 4=fall).
-    fill_season <- function(year, season) {
-      if (season == 1) {
-        start <- lubridate::make_date(year, 1, 1)
-        end   <- lubridate::make_date(year, 3, 31)
-      } else if (season == 2) {
-        start <- lubridate::make_date(year, 4, 1)
-        end   <- lubridate::make_date(year, 6, 30)
-      } else if (season == 3) {
-        start <- lubridate::make_date(year, 7, 1)
-        end   <- lubridate::make_date(year, 9, 30)
-      } else if (season == 4) {
-        start <- lubridate::make_date(year, 10, 1)
-        end   <- lubridate::make_date(year, 12, 31)
-      }
-      seq.Date(start, end, "day")
-    }
-
     #' Calculate effective available water capacity (mm) for a soil profile
     #' clipped to a given rooting depth.
     calc_effective_awc <- function(
@@ -97,12 +76,7 @@ tar_script(
     # Package options
     # -------------------------------------------------------------------------
 
-    tar_option_set(
-      packages = c(
-        "dplyr", "tidyr", "purrr", "readr", "tibble",
-        "lubridate", "glue", "ggplot2", "arrow", "rlang"
-      )
-    )
+    tar_option_set(packages = c("ggplot2", "rlang"))
 
     # -------------------------------------------------------------------------
     # Pipeline
@@ -110,119 +84,95 @@ tar_script(
 
     list(
 
-      # --- Inputs from environment variables ---------------------------------
+      # --- Input paths
 
-      tar_target(design_points_path, Sys.getenv("DESIGN_POINTS")),
-      tar_target(cimis_eto_cog_path, Sys.getenv("CIMIS_ETO_COG")),
-      tar_target(parcels_path,       Sys.getenv("LANDIQ_PARCELS")),
-      tar_target(crops_path,         Sys.getenv("LANDIQ_CROPS")),
+      tar_target(design_points_path, path.expand(Sys.getenv("DESIGN_POINTS"))),
+      tar_target(cimis_eto_cog_path, path.expand(Sys.getenv("CIMIS_ETO_COG"))),
+      tar_target(parcels_path,       path.expand(Sys.getenv("LANDIQ_PARCELS"))),
+      tar_target(crops_path,         path.expand(Sys.getenv("LANDIQ_CROPS"))),
+      tar_target(mslsp_path,         path.expand(Sys.getenv("LANDIQ_TIMESERIES"))),
       tar_target(event_output_dir,   path.expand(Sys.getenv("EVENT_OUTPUT_DIR"))),
 
-      # --- Validate all input paths exist ------------------------------------
-
       tar_target(validated_paths, {
         stopifnot(
           file.exists(design_points_path),
           dir.exists(cimis_eto_cog_path),
           file.exists(parcels_path),
-          file.exists(crops_path)
+          file.exists(crops_path),
+          dir.exists(mslsp_path),
+          length(list.files(mslsp_path, "\\.parquet")) == 7
         )
         dir.create(event_output_dir, showWarnings = FALSE, recursive = TRUE)
         TRUE
       }),
 
-      # --- Base inputs -------------------------------------------------------
-
-      tar_target(
-        dates,
-        seq.Date(as.Date("2020-03-01"), as.Date("2020-11-30"), "day")
-      ),
-
       tar_target(
         design_points,
-        readr::read_csv(design_points_path, show_col_types = FALSE) |> head(10)
+        readr::read_csv(design_points_path, show_col_types = FALSE) |>
+          # Remove duplicate IDs
+          dplyr::slice(1, .by = "id")
       ),
 
-      # --- Remote data extractions (slow; most benefit from caching) ---------
-
       tar_target(
-        etref,
-        design_points |>
-          extract_cimis_dates(dates, cimis_eto_cog_path, .progress = TRUE)
+        dp_with_parcels,
+        PEcAn.data.land::get_landiq_parcel_ids(design_points, parcels_path) |>
+          dplyr::mutate(parcel_id = as.character(parcel_id))
       ),
 
       tar_target(
-        precip,
-        extract_chirps_remote(design_points, dates)
-      ),
-
-      # --- LandIQ crop data --------------------------------------------------
-
-      tar_target(
-        dp_with_crops,
-        get_landiq(
-          design_points,
-          parcels_file = parcels_path,
-          crops_file   = crops_path
+        dp_with_phenology,
+        PEcAn.data.land::mslsp_to_canopycover(
+          mslsp_path,
+          parcel_ids = unique(dp_with_parcels[["parcel_id"]])
         ) |>
-          tibble::as_tibble()
+          dplyr::mutate(
+            landiq_SUBCLASS = as.integer(.data$landiq_SUBCLASS)
+          ) |>
+          dplyr::inner_join(dp_with_parcels, by = "parcel_id") |>
+          dplyr::select(-dplyr::starts_with("UniqueID_"))
       ),
 
-      #' NOTE: Some LandIQ classes/subclasses map onto multiple BISM crop types.
-      #' HACK: select just the first crop per class/subclass group.
-      tar_target(
-        bism_crop_unique,
-        bism_kc_by_crop |>
-          dplyr::distinct(landiq_class, landiq_subclass, crop_name) |>
-          dplyr::slice(1, .by = c("landiq_class", "landiq_subclass"))
-      ),
+      # --- LandIQ crop data --------------------------------------------------
 
       tar_target(
         design_point_crops,
-        dp_with_crops |>
-          dplyr::left_join(
-            bism_crop_unique,
-            by = c("CLASS" = "landiq_class", "SUBCLASS" = "landiq_subclass")
-          )
-      ),
-
-      #' Expand crop seasons to daily rows using hard-coded quarterly dates.
-      #' In reality these would be resolved from phenology data.
-      tar_target(
-        dp_crops_filled,
-        design_point_crops |>
-          dplyr::filter(!is.na(season)) |>
-          tidyr::fill(
-            "CLASS", "SUBCLASS", "crop_name",
-            .direction = "downup",
-            .by = "parcel_id"
-          ) |>
-          dplyr::mutate(date = purrr::map2(year, season, fill_season)) |>
-          tidyr::unnest(date) |>
-          dplyr::filter(date %in% dates)
-      ),
+        {
+          #' NOTE: Some LandIQ classes/subclasses map onto multiple BISM crop types.
+          #' HACK: select just the first crop per class/subclass group.
+          bism_crop_unique <- PEcAn.data.land::bism_kc_by_crop |>
+            dplyr::distinct(.data$landiq_class, .data$landiq_subclass, .data$crop_name) |>
+            dplyr::slice(1, .by = c("landiq_class", "landiq_subclass"))
+
+          dp_crops <- dp_with_phenology |>
+            dplyr::left_join(
+              bism_crop_unique,
+              by = c(
+                "landiq_CLASS" = "landiq_class",
+                "landiq_SUBCLASS" = "landiq_subclass"
+              )
+            )
 
-      #' Warn about parcels with no matching BIS crop, then filter them out.
-      tar_target(
-        dp_with_cropname, {
-          missing_crops <- dp_crops_filled |> dplyr::filter(is.na(crop_name))
+          missing_crops <- dp_crops |> dplyr::filter(is.na(crop_name))
           if (nrow(missing_crops) > 0) {
             missing_crop_strs <- missing_crops |>
-              dplyr::distinct(CLASS, SUBCLASS) |>
+              dplyr::distinct(.data$landiq_CLASS, .data$landiq_SUBCLASS) |>
               dplyr::mutate(
-                string = glue::glue("CLASS: {CLASS} SUBCLASS: {SUBCLASS}")
+                string = glue::glue(
+                  "CLASS: {.data$landiq_CLASS} ",
+                  "SUBCLASS: {.data$landiq_SUBCLASS}"
+                )
               ) |>
-              dplyr::pull(string)
+              dplyr::pull(.data$string)
             warning(
               "Skipping ", nrow(missing_crops),
               " rows with no matching BIS crop. Relevant pairs are: [",
               paste(missing_crop_strs, collapse = "; "), "]"
             )
           }
-          dp_crops_filled |>
-            dplyr::filter(!is.na(crop_name)) |>
+          dp_crops |>
+            dplyr::filter(!is.na(.data$crop_name)) |>
             dplyr::left_join(
-              crop_whc |>
+              PEcAn.data.land::crop_whc |>
                 dplyr::select("crop_name", "whc_min_frac", "rooting_depth_m"),
               by = "crop_name"
             )
@@ -231,23 +181,25 @@ tar_script(
 
       # --- SSURGO soil data --------------------------------------------------
 
-      tar_target(
-        design_points_sf,
-        dplyr::distinct(design_points, id, lon, lat)
-      ),
-
       tar_target(
         mukeys_list,
-        purrr::map2(
-          design_points_sf$lon,
-          design_points_sf$lat,
-          ~ ssurgo_mukeys_point(point = c(.x, .y), distance = 20)
-        )
+        {
+          design_points_sf <- design_points |> 
+            dplyr::distinct(id, lon, lat)
+          purrr::map2(
+            design_points_sf$lon,
+            design_points_sf$lat,
+            ~ PEcAn.data.land::ssurgo_mukeys_point(
+              point = c(.x, .y),
+              distance = 10
+            )
+          )
+        }
       ),
 
       tar_target(
         soil_raw,
-        gSSURGO.Query(
+        PEcAn.data.land::gSSURGO.Query(
           mukeys = unique(unlist(mukeys_list)),
           fields = c("chorizon.awc_r", "chorizon.hzdept_r", "chorizon.hzdepb_r")
         )
@@ -261,9 +213,9 @@ tar_script(
 
       tar_target(
         dp_with_whc,
-        dp_with_cropname |>
+        design_point_crops |>
           dplyr::mutate(
-            mukey = mukeys_list[match(id, design_points_sf$id)]
+            mukey = mukeys_list[match(id, design_points$id)]
           ) |>
           tidyr::unnest(mukey) |>
           dplyr::mutate(mukey = as.numeric(mukey)) |>
@@ -284,6 +236,30 @@ tar_script(
           )
       ),
 
+      # --- Remote data extractions (slow; most benefit from caching) ---------
+
+      tar_target(
+        precip_et_dates,
+        with(design_point_crops, seq(min(date), max(date), by = "1 day"))
+      ),
+
+      tar_target(
+        etref,
+        design_point_crops |>
+          PEcAn.data.land::extract_cimis_dates(
+            precip_et_dates,
+            cimis_eto_cog_path,
+            download_missing = TRUE,
+            .progress = TRUE
+          )
+      ),
+
+      tar_target(
+        precip,
+        PEcAn.data.land::extract_chirps_remote(design_points, precip_et_dates)
+      ),
+
+
       # --- ETc and water balance ---------------------------------------------
 
       tar_target(
@@ -426,4 +402,11 @@ tar_script(
 )
 
 #' Run the pipeline. Targets that are already up-to-date will be skipped.
-tar_make()
+# tar_make()
+# tar_invalidate(dp_with_crops)
+tar_make(c(precip))
+
+if (interactive()) {
+  # tar_load(c("design_points", "dp_with_crops", "phenology"))
+  tar_load_everything()
+}
diff --git a/modules/data.land/man/extract_cimis_dates.Rd b/modules/data.land/man/extract_cimis_dates.Rd
index caa1671ef14..17096384e1e 100644
--- a/modules/data.land/man/extract_cimis_dates.Rd
+++ b/modules/data.land/man/extract_cimis_dates.Rd
@@ -4,7 +4,7 @@
 \alias{extract_cimis_dates}
 \title{Extract CIMIS reference ET for multiple dates}
 \usage{
-extract_cimis_dates(design_points, dates, ...)
+extract_cimis_dates(design_points, dates, .progress = FALSE, ...)
 }
 \arguments{
 \item{design_points}{`data.frame` of design points with columns
@@ -12,7 +12,11 @@ extract_cimis_dates(design_points, dates, ...)
 
 \item{dates}{Sequence of dates for which to extract data}
 
-\item{...}{Additional arguments to `purrr::map`}
+\item{.progress}{Whether to show a progress bar. Use \code{TRUE} to turn on
+a basic progress bar, use a string to give it a name, or see
+\link[purrr]{progress_bars} for more details.}
+
+\item{...}{Additional arguments to [extract_cimis_date()]}
 }
 \value{
 `design_points` `data.frame` extended with ETref data for all dates.

From 5dcdaa7b3f8830f714433aa112dcd43077a6f8a9 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 20 Mar 2026 16:11:32 -0400
Subject: [PATCH 39/67] add basic flooded rice irrigation implementation

---
 modules/data.land/NAMESPACE                   |   1 +
 modules/data.land/R/water_balance.R           | 235 ++++++++++++++++--
 modules/data.land/man/apply_water_balance.Rd  |   8 +-
 modules/data.land/man/calc_water_balance.Rd   |   9 +-
 .../data.land/man/calc_water_balance_rice.Rd  |  77 ++++++
 5 files changed, 295 insertions(+), 35 deletions(-)
 create mode 100644 modules/data.land/man/calc_water_balance_rice.Rd

diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index 863ee2e96f5..3d72ff060e5 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -12,6 +12,7 @@ export(Soilgrids_SoilC_prep)
 export(apply_water_balance)
 export(buildJAGSdata_InventoryRings)
 export(calc_water_balance)
+export(calc_water_balance_rice)
 export(clip_and_save_raster_file)
 export(cohort2pool)
 export(create_event_file)
diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index ec2c04c1435..c086244c808 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -32,9 +32,6 @@
 #'   triggered when soil water falls below this level; defaults to
 #'   `whc_min_frac * whc` if NULL.
 #'   Can be a single value or a vector of the same length as `et`.
-#' @param seepage_rate Daily seepage loss for rice paddies (distance / time);
-#'   only used when `is_rice = TRUE`
-#' @param is_rice Logical; if TRUE, applies a constant seepage loss (mm/day)
 #' @return List with vectors: W_t (soil water), irr (irrigation), runoff
 #' @examples
 #' # Calculate WHC from field capacity, wilting point, and rooting depth
@@ -55,10 +52,9 @@ calc_water_balance <- function(
   whc,
   whc_min_frac,
   W_initial = NULL, #nolint: object_name_linter
-  w_min = NULL,
-  seepage_rate = NULL,
-  is_rice = FALSE
+  w_min = NULL
 ) {
+
   # nolint start: object_name_linter
   n <- length(et)
 
@@ -77,7 +73,12 @@ calc_water_balance <- function(
       x
     } else {
       PEcAn.logger::logger.severe(
-        sprintf("%s must have length 1 or %d; actual length = %d", name, n, length(x))
+        sprintf(
+          "%s must have length 1 or %d; actual length = %d",
+          name,
+          n,
+          length(x)
+        )
       )
     }
   }
@@ -87,13 +88,11 @@ calc_water_balance <- function(
     whc_min_frac <- ensure_vec(whc_min_frac, n, "whc_min_frac")
   }
 
-  if (is_rice && is.null(seepage_rate)) {
-    PEcAn.logger::logger.severe("Seepage rate must be defined for rice fields")
-  }
-
   if (is.null(w_min)) {
     if (is.null(whc_min_frac)) {
-      PEcAn.logger::logger.severe("Either whc_min_frac or w_min must be provided")
+      PEcAn.logger::logger.severe(
+        "Either whc_min_frac or w_min must be provided"
+      )
     }
     w_min <- whc_min_frac * whc
   } else {
@@ -112,13 +111,10 @@ calc_water_balance <- function(
   runoff <- numeric(n)
 
   for (t in seq_len(n)) {
-    # Only water above w_min is available for seepage
-    seepage <- if (is_rice) min(seepage_rate, max(0, W_prev - w_min[t])) else 0.0
-
     # Potential state after precip and ET
-    W0 <- W_prev + precip[t] - et[t] - seepage
+    W0 <- W_prev + precip[t] - et[t]
 
-    # If W0 falls below w_min (e.g., high ET and seepage; low precip), irrigate
+    # If W0 falls below w_min (e.g., high ET; low precip), irrigate
     # to field capacity (i.e., full WHC).
     if (W0 < w_min[t]) {
       irr[t] <- whc[t] - W0
@@ -144,20 +140,158 @@ calc_water_balance <- function(
   list(W_t = W_t, irr = irr, runoff = runoff)
 }
 
+#' Calculate water balance for a flooded rice paddy
+#'
+#' Models the water balance of a flooded rice system with a two-layer
+#' structure: a ponded water layer above a saturated soil profile. This is
+#' physically distinct from the upland soil water balance in
+#' calc_water_balance(). Water is managed to maintain a target flood depth,
+#' with support for mid-season drainage events.
+#'
+#' State variable:
+#'   - pond_depth: depth of standing water above the soil surface
+#'
+#' The soil profile is assumed to be continuously saturated during flooded
+#' periods, so plant-available soil water is not tracked separately. ET is
+#' applied directly to the pond layer (open-water ET during flooded periods).
+#'
+#' Irrigation is triggered when pond_depth falls below flood_min. Farmers
+#' refill to flood_target. Runoff (bund overflow) occurs when pond_depth
+#' exceeds flood_max.
+#'
+#' Mid-season drainage is specified as a logical vector (drain[t] = TRUE means
+#' the field is intentionally drained on day t). During drainage days, the
+#' pond is drawn down to pond_depth = 0 and irrigation is suppressed. This
+#' represents practices such as weed control or pre-harvest drainage.
+#'
+#' @param et        Numeric vector. Daily reference ET. During flooded
+#'                  periods this is treated as open-water ET; no crop
+#'                  coefficient is applied here but you can pre-multiply if
+#'                  needed.
+#' @param precip    Numeric vector. Daily precipitation.
+#' @param flood_target Numeric scalar. Target ponded water depth.
+#'                  Irrigation refills to this level.
+#' @param flood_min Numeric scalar. Minimum acceptable pond depth before
+#'                  irrigation is triggered.
+#' @param flood_max Numeric scalar. Maximum pond depth before bund
+#'                  overflow / runoff occurs.
+#' @param seepage   Numeric scalar. Daily seepage + percolation loss
+#'                  Represents losses through the bund and downward percolation
+#'                  through the hardpan (if any). Typical range: 1-5 mm/day
+#'                  for well-puddled California soils.
+#' @param drain     Logical vector (same length as et). TRUE on days when an
+#'                  intentional drainage event occurs (e.g., mid-season drain,
+#'                  pre-harvest drawdown). Pond is set to 0 on these days and
+#'                  irrigation is suppressed.
+#' @param pond_init Numeric scalar. Initial pond depth at t = 1.
+#'                  Defaults to flood_target.
+#'
+#' @return A list with numeric vectors of length n:
+#'   \item{pond_depth}{Ponded water depth at end of each day}
+#'   \item{irr}{Irrigation applied}
+#'   \item{runoff}{Bund overflow / surface runoff}
+#'
+#' @export
+calc_water_balance_rice <- function(
+  et,
+  precip,
+  flood_target,
+  flood_min,
+  flood_max,
+  seepage,
+  drain = NULL,
+  pond_init = flood_target
+) {
+  n <- length(et)
+
+  if (length(precip) != n) {
+    stop("et and precip must be the same length")
+  }
+  if (is.null(drain)) {
+    drain <- rep(FALSE, n)
+  }
+  if (length(drain) != n) {
+    stop("drain must be the same length as et")
+  }
+  if (flood_min >= flood_target) {
+    stop("flood_min must be less than flood_target")
+  }
+  if (flood_target >= flood_max) {
+    stop("flood_target must be less than flood_max")
+  }
+  if (seepage < 0) {
+    stop("seepage must be non-negative")
+  }
+
+  pond_depth <- numeric(n)
+  irr <- numeric(n)
+  runoff <- numeric(n)
+
+  pond_prev <- pond_init
+
+  for (t in seq_len(n)) {
+    # --- Intentional drainage day -----------------------------------------
+    # The field is deliberately drained (mid-season weed control, pre-harvest,
+    # etc.). All water in the pond is released as managed drainage, counted
+    # as runoff. Irrigation is suppressed for the day.
+    if (drain[t]) {
+      runoff[t] <- pond_prev + precip[t] # drain existing pond + any rain
+      irr[t] <- 0
+      pond_depth[t] <- 0
+      pond_prev <- 0
+      next
+    }
+
+    # --- Normal flooded day -----------------------------------------------
+
+    # 1. Fluxes: precip adds, ET and seepage remove.
+    #    Seepage is capped at available pond depth so we don't go below zero
+    #    before irrigation is assessed.
+    actual_seepage <- min(seepage, max(0, pond_prev))
+    pond0 <- pond_prev + precip[t] - et[t] - actual_seepage
+
+    # 2. Irrigation: refill to flood_target if pond drops below flood_min.
+    #    Note that pond0 can be negative if ET is very high (e.g., early
+    #    season before the pond is established). Irrigation covers the full
+    #    deficit back to the target.
+    if (pond0 < flood_min) {
+      irr[t] <- flood_target - pond0
+      pond0 <- flood_target
+    } else {
+      irr[t] <- 0
+    }
+
+    # 3. Runoff (bund overflow): any depth exceeding flood_max spills over.
+    if (pond0 > flood_max) {
+      runoff[t] <- pond0 - flood_max
+      pond_depth[t] <- flood_max
+    } else {
+      runoff[t] <- 0
+      pond_depth[t] <- max(pond0, 0)
+    }
+
+    pond_prev <- pond_depth[t]
+  }
+
+  list(pond_depth = pond_depth, irr = irr, runoff = runoff)
+}
+
 #' Apply water balance calculations to a data frame with multiple sites
 #'
 #' Groups by location and applies calc_water_balance to each group. Unlike
 #' `calc_water_balance`, the units here *do* matter -- they should be `mm_day`.
 #'
 #' @param df Data frame with columns: `date`, `location_id`, `etc_mm_day`,
-#' `precip_mm_day`, and `whc_min_frac` (optional, defaults to 0.375).
+#' `precip_mm_day`, `crop_name`, and `whc_min_frac` (optional, defaults to 0.375).
 #' If a `whc_mm` column is present, it is used as the water holding capacity.
-#' @param idcol Column name for grouping (typically, `location_id`, `parcel_id` or similar)
-#' @param whc_mm Water holding capacity (mm); ignored if `whc_mm` is a column in `df`.
+#' @param idcol Column name for grouping (typically, `location_id`, `parcel_id`
+#' or similar)
+#' @param whc_mm Water holding capacity (mm); ignored if `whc_mm` is a column
+#' in `df`.
 #' @return Data frame with added columns: `W_t`, `irr`, `runoff`
 #' @export
 apply_water_balance <- function(df, idcol, whc_mm = 500) {
-  need_cols <- c("etc_mm_day", "precip_mm_day", "date")
+  need_cols <- c("etc_mm_day", "precip_mm_day", "date", "crop_name")
   missing_cols <- need_cols[!(need_cols %in% colnames(df))]
   default_whc_min_frac <- 0.375
   if (length(missing_cols) > 0) {
@@ -190,14 +324,65 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
     df[["whc_mm"]] <- whc_mm
   }
 
-  df |>
+  try_wb_rice <- function(...) {
+    tryCatch(
+      calc_water_balance_rice(...),
+      error = function(e) {
+        warning("Hit the following error: \n\n", e$message)
+        list(
+          pond_depth = NA_real_,
+          irr = NA_real_,
+          runoff = NA_real_
+        )
+      }
+    )
+  }
+
+  rice <- df |>
+    dplyr::filter(.data$crop_name == "Rice") |>
+    dplyr::arrange(.data[[idcol]], .data$date) |> # nolint: object_usage_linter
+    dplyr::mutate(
+      year = as.integer(format(.data$date, "%Y")),
+      week = as.integer(format(.data$date, "%U")),
+      day_of_year = as.integer(format(.data$date, "%j")),
+      results = tibble::as_tibble(try_wb_rice(
+        et = .data$etc_mm_day,
+        precip = .data$precip_mm_day,
+        flood_target = 125,
+        flood_min = 62.5,
+        flood_max = 175,
+        seepage = 2.5
+      )),
+      .by = dplyr::all_of(idcol)
+    ) |>
+    tidyr::unpack(.data$results)
+
+  try_wb <- function(...) {
+    tryCatch(
+      calc_water_balance(...),
+      error = function(e) {
+        warning("Hit the following error: \n\n", e$message)
+        list(
+          W_t = NA_real_,
+          irr = NA_real_,
+          runoff = NA_real_
+        )
+      }
+    )
+  }
+
+  others <- df |>
+    dplyr::filter(.data$crop_name != "Rice") |>
     dplyr::arrange(.data[[idcol]], .data$date) |> # nolint: object_usage_linter
     dplyr::mutate(
       year = as.integer(format(.data$date, "%Y")),
       week = as.integer(format(.data$date, "%U")),
       day_of_year = as.integer(format(.data$date, "%j")),
-      whc_min_frac = tidyr::replace_na(.data$whc_min_frac, default_whc_min_frac),
-      results = tibble::as_tibble(calc_water_balance(
+      whc_min_frac = tidyr::replace_na(
+        .data$whc_min_frac,
+        default_whc_min_frac
+      ),
+      results = tibble::as_tibble(try_wb(
         et = .data$etc_mm_day,
         precip = .data$precip_mm_day,
         whc = .data$whc_mm,
@@ -206,4 +391,6 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
       .by = dplyr::all_of(idcol)
     ) |>
     tidyr::unpack(.data$results)
+
+  dplyr::bind_rows(rice, others)
 }
diff --git a/modules/data.land/man/apply_water_balance.Rd b/modules/data.land/man/apply_water_balance.Rd
index 29848a143aa..9fe13466d00 100644
--- a/modules/data.land/man/apply_water_balance.Rd
+++ b/modules/data.land/man/apply_water_balance.Rd
@@ -8,12 +8,14 @@ apply_water_balance(df, idcol, whc_mm = 500)
 }
 \arguments{
 \item{df}{Data frame with columns: `date`, `location_id`, `etc_mm_day`,
-`precip_mm_day`, and `whc_min_frac` (optional, defaults to 0.375).
+`precip_mm_day`, `crop_name`, and `whc_min_frac` (optional, defaults to 0.375).
 If a `whc_mm` column is present, it is used as the water holding capacity.}
 
-\item{idcol}{Column name for grouping (typically, `location_id`, `parcel_id` or similar)}
+\item{idcol}{Column name for grouping (typically, `location_id`, `parcel_id`
+or similar)}
 
-\item{whc_mm}{Water holding capacity (mm); ignored if `whc_mm` is a column in `df`.}
+\item{whc_mm}{Water holding capacity (mm); ignored if `whc_mm` is a column
+in `df`.}
 }
 \value{
 Data frame with added columns: `W_t`, `irr`, `runoff`
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index 8ab5cb9dac3..563333117ea 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -10,9 +10,7 @@ calc_water_balance(
   whc,
   whc_min_frac,
   W_initial = NULL,
-  w_min = NULL,
-  seepage_rate = NULL,
-  is_rice = FALSE
+  w_min = NULL
 )
 }
 \arguments{
@@ -35,11 +33,6 @@ Can be a single value or a vector of the same length as `et`.}
 triggered when soil water falls below this level; defaults to
 `whc_min_frac * whc` if NULL.
 Can be a single value or a vector of the same length as `et`.}
-
-\item{seepage_rate}{Daily seepage loss for rice paddies (distance / time);
-only used when `is_rice = TRUE`}
-
-\item{is_rice}{Logical; if TRUE, applies a constant seepage loss (mm/day)}
 }
 \value{
 List with vectors: W_t (soil water), irr (irrigation), runoff
diff --git a/modules/data.land/man/calc_water_balance_rice.Rd b/modules/data.land/man/calc_water_balance_rice.Rd
new file mode 100644
index 00000000000..e50a00c21a2
--- /dev/null
+++ b/modules/data.land/man/calc_water_balance_rice.Rd
@@ -0,0 +1,77 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/water_balance.R
+\name{calc_water_balance_rice}
+\alias{calc_water_balance_rice}
+\title{Calculate water balance for a flooded rice paddy}
+\usage{
+calc_water_balance_rice(
+  et,
+  precip,
+  flood_target,
+  flood_min,
+  flood_max,
+  seepage,
+  drain = NULL,
+  pond_init = flood_target
+)
+}
+\arguments{
+\item{et}{Numeric vector. Daily reference ET. During flooded
+periods this is treated as open-water ET; no crop
+coefficient is applied here but you can pre-multiply if
+needed.}
+
+\item{precip}{Numeric vector. Daily precipitation.}
+
+\item{flood_target}{Numeric scalar. Target ponded water depth.
+Irrigation refills to this level.}
+
+\item{flood_min}{Numeric scalar. Minimum acceptable pond depth before
+irrigation is triggered.}
+
+\item{flood_max}{Numeric scalar. Maximum pond depth before bund
+overflow / runoff occurs.}
+
+\item{seepage}{Numeric scalar. Daily seepage + percolation loss
+Represents losses through the bund and downward percolation
+through the hardpan (if any). Typical range: 1-5 mm/day
+for well-puddled California soils.}
+
+\item{drain}{Logical vector (same length as et). TRUE on days when an
+intentional drainage event occurs (e.g., mid-season drain,
+pre-harvest drawdown). Pond is set to 0 on these days and
+irrigation is suppressed.}
+
+\item{pond_init}{Numeric scalar. Initial pond depth at t = 1.
+Defaults to flood_target.}
+}
+\value{
+A list with numeric vectors of length n:
+  \item{pond_depth}{Ponded water depth at end of each day}
+  \item{irr}{Irrigation applied}
+  \item{runoff}{Bund overflow / surface runoff}
+}
+\description{
+Models the water balance of a flooded rice system with a two-layer
+structure: a ponded water layer above a saturated soil profile. This is
+physically distinct from the upland soil water balance in
+calc_water_balance(). Water is managed to maintain a target flood depth,
+with support for mid-season drainage events.
+}
+\details{
+State variable:
+  - pond_depth: depth of standing water above the soil surface
+
+The soil profile is assumed to be continuously saturated during flooded
+periods, so plant-available soil water is not tracked separately. ET is
+applied directly to the pond layer (open-water ET during flooded periods).
+
+Irrigation is triggered when pond_depth falls below flood_min. Farmers
+refill to flood_target. Runoff (bund overflow) occurs when pond_depth
+exceeds flood_max.
+
+Mid-season drainage is specified as a logical vector (drain[t] = TRUE means
+the field is intentionally drained on day t). During drainage days, the
+pond is drawn down to pond_depth = 0 and irrigation is suppressed. This
+represents practices such as weed control or pre-harvest drainage.
+}

From 7773eac66faa926282db9b0c7589767cd002fd71 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 20 Mar 2026 16:34:15 -0400
Subject: [PATCH 40/67] Complete working statewide irrigation pipeline

---
 .../inst/irrigation-statewide/.gitignore      |   3 +
 .../inst/irrigation-statewide/R/calc_awc.R    |  85 +++++++++
 .../inst/irrigation-statewide/R/get.R         | 108 ++++++++++++
 .../inst/irrigation-statewide/R/misc.R        |  96 +++++++++++
 .../R/resolve_multicrop.R                     |  30 ++++
 .../inst/irrigation-statewide/README.md       |  50 ++++++
 .../inst/irrigation-statewide/_targets.R      | 161 ++++++++++++++++++
 .../inst/irrigation-statewide/_targets.yaml   |   3 +
 .../inst/irrigation-statewide/run-pipeline.R  |  19 +++
 9 files changed, 555 insertions(+)
 create mode 100644 modules/data.land/inst/irrigation-statewide/.gitignore
 create mode 100644 modules/data.land/inst/irrigation-statewide/R/calc_awc.R
 create mode 100644 modules/data.land/inst/irrigation-statewide/R/get.R
 create mode 100644 modules/data.land/inst/irrigation-statewide/R/misc.R
 create mode 100644 modules/data.land/inst/irrigation-statewide/R/resolve_multicrop.R
 create mode 100644 modules/data.land/inst/irrigation-statewide/README.md
 create mode 100644 modules/data.land/inst/irrigation-statewide/_targets.R
 create mode 100644 modules/data.land/inst/irrigation-statewide/_targets.yaml
 create mode 100644 modules/data.land/inst/irrigation-statewide/run-pipeline.R

diff --git a/modules/data.land/inst/irrigation-statewide/.gitignore b/modules/data.land/inst/irrigation-statewide/.gitignore
new file mode 100644
index 00000000000..ebb71e51f7c
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/.gitignore
@@ -0,0 +1,3 @@
+_logs/
+_targets/
+zz-*
diff --git a/modules/data.land/inst/irrigation-statewide/R/calc_awc.R b/modules/data.land/inst/irrigation-statewide/R/calc_awc.R
new file mode 100644
index 00000000000..e83e1222ff1
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/R/calc_awc.R
@@ -0,0 +1,85 @@
+#' Calculate effective available water capacity (mm) for a soil profile
+#' clipped to a given rooting depth.
+calc_effective_awc <- function(
+  hzdept_r_cm,
+  hzdepb_r_cm,
+  awc_r,
+  rooting_depth_cm
+) {
+  effective_top <- pmin(hzdept_r_cm, rooting_depth_cm)
+  effective_bottom <- pmin(hzdepb_r_cm, rooting_depth_cm)
+  thickness_cm <- pmax(0, effective_bottom - effective_top)
+  # awc_r is cm water / cm soil;
+  # multiply by thickness -> cm water -> mm water
+  sum(awc_r * thickness_cm, na.rm = TRUE) * 10
+}
+
+add_soil_awc <- function(
+  crop_info,
+  ssurgo_weights_path,
+  ssurgo_gdb_path
+) {
+  parcel_ids <- unique(crop_info[["parcel_id"]])
+  message("Reading SSURGO weights")
+  weights <- arrow::open_dataset(ssurgo_weights_path) |>
+    dplyr::filter(.data$parcel_id %in% parcel_ids) |>
+    dplyr::collect()
+
+  component <- sf::read_sf(
+    ssurgo_gdb_path,
+    layer = "component",
+    as_tibble = TRUE
+  ) |>
+    dplyr::semi_join(weights, by = "mukey")
+
+  chorizon <- sf::read_sf(
+    ssurgo_gdb_path,
+    layer = "chorizon",
+    as_tibble = TRUE
+  ) |>
+    dplyr::semi_join(component, by = "cokey")
+
+  combined <- weights |>
+    dplyr::left_join(
+      dplyr::select(crop_info, "parcel_id", "rooting_depth_m"),
+      by = "parcel_id",
+      relationship = "many-to-many"
+    ) |>
+    dplyr::left_join(
+      component,
+      by = "mukey",
+      relationship = "many-to-many"
+    ) |>
+    dplyr::left_join(
+      chorizon,
+      by = "cokey",
+      relationship = "many-to-many"
+    )
+
+  awc <- combined |>
+    dplyr::filter(
+      !is.na(.data$awc_r),
+      !is.na(.data$hzdept_r),
+      !is.na(.data$hzdepb_r)
+    ) |>
+    dplyr::mutate(
+      rooting_depth_cm = .data$rooting_depth_m * 100,
+      .keep = "unused"
+    ) |>
+    dplyr::summarize(
+      whc_mm = calc_effective_awc(
+        .data$hzdept_r,
+        .data$hzdepb_r,
+        .data$awc_r,
+        .data$rooting_depth_cm
+      ),
+      .by = c("parcel_id", "mukey", "cokey", "area_m2", "weight")
+    ) |>
+    dplyr::summarize(
+      whc_mm = sum(.data$whc_mm * .data$weight),
+      .by = "parcel_id"
+    )
+
+  crops_with_soil <- dplyr::left_join(crop_info, awc, by = "parcel_id")
+  crops_with_soil
+}
diff --git a/modules/data.land/inst/irrigation-statewide/R/get.R b/modules/data.land/inst/irrigation-statewide/R/get.R
new file mode 100644
index 00000000000..c4b09f38068
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/R/get.R
@@ -0,0 +1,108 @@
+#!/usr/bin/env Rscript
+# library(PEcAn.data.land)
+# parcel_ids <- parcel_id_batches[[1]]
+
+get_parcel_ids <- function(crops_path, n_parcels = NULL) {
+  parcel_ids <- arrow::open_dataset(crops_path) |>
+    dplyr::distinct(.data$parcel_id) |>
+    dplyr::pull(as_vector = TRUE)
+  if (is.null(n_parcels)) return(parcel_ids)
+  sample(parcel_ids, n_parcels)
+}
+
+get_phenology <- function(mslsp_path, parcel_ids = NULL) {
+  dat_raw <- mslsp_to_canopycover(mslsp_path, parcel_ids)
+  phenology_raw <- dat_raw |>
+    dplyr::mutate(
+      parcel_id = as.integer(parcel_id),
+      landiq_SUBCLASS = as.integer(.data$landiq_SUBCLASS)
+    )
+  # Resolve overlapping canopy_cover values. If a single date has multiple
+  # rows, we take the row with the largest canopy cover.
+  phenology <- phenology_raw |>
+    dplyr::slice_max(.data$canopy_cover, n = 1, by = c("parcel_id", "date")) |>
+    dplyr::relocate("parcel_id", "year", "date")
+  phenology
+}
+
+get_etref <- function(cimis_etref_path, parcel_ids = NULL) {
+  dat <- arrow::open_dataset(cimis_etref_path)
+  if (!is.null(parcel_ids)) {
+    dat <- dplyr::filter(dat, .data$parcel_id %in% parcel_ids)
+  }
+  etref <- dat |>
+    dplyr::arrange(.data$parcel_id, .data$date) |>
+    dplyr::collect()
+  etref
+}
+
+get_precip <- function(chirps_precip_path, parcel_ids = NULL) {
+  dat <- arrow::open_dataset(chirps_precip_path)
+  if (!is.null(parcel_ids)) {
+    dat <- dplyr::filter(dat, .data$parcel_id %in% parcel_ids)
+  }
+  precip <- dat |>
+    dplyr::arrange(.data$parcel_id, .data$date) |>
+    dplyr::collect()
+  precip
+}
+
+#' @importFrom PEcAn.data.land bism_kc_by_crop crop_whc
+get_crop_info <- function(crops_path, parcel_ids = NULL) {
+  # parcel_ids <- parcel_id_batches[[1]]
+  dat <- arrow::open_dataset(crops_path)
+  if (!is.null(parcel_ids)) {
+    dat <- dplyr::filter(dat, .data$parcel_id %in% .env$parcel_ids)
+  }
+  dlocal <- dat |>
+    dplyr::collect()
+
+  #' NOTE: Some LandIQ classes/subclasses map onto multiple BISM crop
+  #' types. HACK: select just the first crop per class/subclass group.
+  bism_crop_unique <- bism_kc_by_crop |>
+    dplyr::distinct(
+      .data$landiq_class,
+      .data$landiq_subclass,
+      .data$crop_name
+    ) |>
+    dplyr::slice(1, .by = c("landiq_class", "landiq_subclass"))
+
+  crops <- dlocal |>
+    dplyr::left_join(
+      bism_crop_unique,
+      by = c(
+        "CLASS" = "landiq_class",
+        "SUBCLASS" = "landiq_subclass"
+      )
+    )
+
+  missing_crops <- dplyr::filter(crops, is.na(.data$crop_name))
+  if (nrow(missing_crops) > 0) {
+    missing_crop_strs <- missing_crops |>
+      dplyr::distinct(.data$CLASS, .data$SUBCLASS) |>
+      dplyr::mutate(
+        string = glue::glue(
+          "CLASS: {.data$CLASS} ",
+          "SUBCLASS: {.data$SUBCLASS}"
+        )
+      ) |>
+      dplyr::pull(.data$string)
+    warning(
+      "Skipping ",
+      nrow(missing_crops),
+      " rows with no matching BIS crop. Relevant pairs are: [",
+      paste(missing_crop_strs, collapse = "; "),
+      "]"
+    )
+  }
+  crop_whc_sub <- dplyr::select(
+    crop_whc,
+    "crop_name",
+    "whc_min_frac",
+    "rooting_depth_m"
+  )
+  crop_info <- crops |>
+    dplyr::filter(!is.na(.data$crop_name)) |>
+    dplyr::left_join(crop_whc_sub, by = "crop_name")
+  crop_info
+}
diff --git a/modules/data.land/inst/irrigation-statewide/R/misc.R b/modules/data.land/inst/irrigation-statewide/R/misc.R
new file mode 100644
index 00000000000..3c8e6e2c4d7
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/R/misc.R
@@ -0,0 +1,96 @@
+split_into_batches <- function(x, batch_size) {
+  split(x, ceiling(seq_along(x) / batch_size))
+}
+
+make_crop_timeseries <- function(crops_with_soil, phenology, precip, etref) {
+  crop_cols <- c(
+    "parcel_id",
+    "year",
+    "crop_name",
+    "whc_min_frac",
+    "whc_mm",
+    "whc_min_frac"
+  )
+
+  crop_soil_timeseries <- crops_with_soil |>
+    dplyr::select(dplyr::all_of(crop_cols)) |>
+    dplyr::inner_join(
+      phenology,
+      by = c("parcel_id", "year"),
+      relationship = "many-to-many"
+    ) |>
+    dplyr::slice_max(
+      .data$canopy_cover,
+      n = 1,
+      by = c("parcel_id", "date")
+    )
+
+  check_unique <- crop_soil_timeseries |>
+    dplyr::group_by(.data$parcel_id, .data$date) |>
+    dplyr::count() |>
+    dplyr::filter(.data$n > 1)
+  if (nrow(check_unique) > 1) {
+    bad_parcels <- unique(check_unique[["parcel_id"]])
+    warning(
+      "The parcels below have some non-unique values ",
+      "even after `slice_max(canopy_cover)`. ",
+      "This is likely because of non-unique ",
+      "landIQ crop --> crop_type mappings. ",
+      "Selecting only the first row in each of these cases.",
+      "\n",
+      paste(bad_parcels, collapse = ", ")
+    )
+    crop_soil_timeseries <- crop_soil_timeseries |>
+      dplyr::slice_max(
+        .data$canopy_cover,
+        n = 1,
+        by = c("parcel_id", "date"),
+        with_ties = FALSE
+      )
+  }
+
+  complete_crop_timeseries <- crop_soil_timeseries |>
+    dplyr::left_join(precip, by = c("parcel_id", "date")) |>
+    dplyr::left_join(
+      dplyr::select(etref, -"year"),
+      by = c("parcel_id", "date")
+    ) |>
+    dplyr::arrange(.data$parcel_id, .data$date) |>
+    tidyr::fill(.data$etref_mm_day) |>
+    dplyr::mutate(
+      etc_mm_day = eto_to_etc_bism(
+        eto = .data$etref_mm_day,
+        crop_name = .data$crop_name[[1]],
+        date = .data$date
+      ),
+      .by = "crop_name"
+    )
+
+  complete_crop_timeseries
+}
+
+make_event_df <- function(parcel_waterbalance, outfile) {
+  pw_sub <- parcel_waterbalance |>
+    dplyr::filter(.data$irr > 0) |>
+    dplyr::relocate("parcel_id", "date", "crop_name", "canopy_cover", "irr")
+
+  irr_events <- pw_sub |>
+    dplyr::select(
+      "parcel_id",
+      "date",
+      "crop_name",
+      amount_mm = "irr",
+      "canopy_cover"
+    ) |>
+    dplyr::mutate(
+      crop_code = crop_name,
+      method = dplyr::case_when(
+        crop_code == "Rice" ~ "flood",
+        TRUE ~ "canopy"
+      )
+    ) |>
+    dplyr::select("parcel_id", "date", "amount_mm", "method")
+
+  arrow::write_parquet(irr_events, outfile)
+  invisible(outfile)
+}
diff --git a/modules/data.land/inst/irrigation-statewide/R/resolve_multicrop.R b/modules/data.land/inst/irrigation-statewide/R/resolve_multicrop.R
new file mode 100644
index 00000000000..ed1c4cfb3d9
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/R/resolve_multicrop.R
@@ -0,0 +1,30 @@
+#' Average ETc and WHC across multi-crop parcels (double-cropping hack).
+resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
+  id_sym   <- rlang::sym(id_col)
+  date_sym <- rlang::sym(date_col)
+
+  multicrop_counts <- etc_data |>
+    dplyr::add_count(!!id_sym, !!date_sym, name = "n") |>
+    dplyr::filter(.data$n > 1) |>
+    dplyr::summarize(
+      n_multicrop = dplyr::n_distinct(!!id_sym, !!date_sym),
+      .groups = "drop"
+    )
+
+  if (multicrop_counts$n_multicrop > 0) {
+    message(
+      "Multi-crop parcels: ",
+      multicrop_counts$n_multicrop,
+      " date-parcel combinations have multiple crops. Averaging ETc and WHC values."
+    )
+  }
+
+  etc_data |>
+    dplyr::group_by(!!id_sym, !!date_sym) |>
+    dplyr::summarize(
+      etc_mm_day   = mean(.data$etc_mm_day,   na.rm = TRUE),
+      whc_min_frac = mean(.data$whc_min_frac, na.rm = TRUE),
+      whc_mm       = mean(.data$whc_mm,       na.rm = TRUE),
+      .groups = "drop"
+    )
+}
diff --git a/modules/data.land/inst/irrigation-statewide/README.md b/modules/data.land/inst/irrigation-statewide/README.md
new file mode 100644
index 00000000000..d758da7f481
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/README.md
@@ -0,0 +1,50 @@
+# Statewide irrigation events workflow
+
+This generates PEcAn event files for irrigation events across all of California.
+The spatial unit is harmonized LandIQ parcels.
+
+The workflow uses `targets` for reproducibility, scalability, and incremental execution.
+The pipeline is defined in `_targets.R` and can be executed via the `run-pipeline.R` script (which is just a thin wrapper around `library(targets); tar_make()`).
+
+# Setup
+
+The workflow requires the following environment variables to be set:
+
+- `TAR_CONFIG` --- path to the `_targets.yaml` config file in this directory.
+- `N_PARCELS` --- the number of parcels to process (sampled at random) or `all` to run for all LandIQ parcels.
+- `BATCH_SIZE` --- number of parcels per "batch". Each batch gets its own target. Note that having too many small batches creates a lot of overhead.
+- `N_REMOTE_WORKERS` --- number of remote workers (SGE jobs) to spawn for execution 
+- `EXEC_TYPE` --- execution type. Either `local` (run on current machine, with `NSLOTS` parallel processes) or `cluster` (to run using SGE jobs)
+
+- `LANDIQ_CROPS` --- path to harmonized LandIQ crops file (`crops_all_years.parq`)
+- `LANDIQ_TIMESERIES` --- path to HLS-based phenology (MSLSP) parquet files
+
+- `EVENT_OUTPUT_DIR` --- output directory where final event files will be written. If it doesn't exist, it will be created.
+- `EVENT_FILENAME` --- name of event file to be created. Should have `.parquet` extension. It will be placed in `EVENT_OUTPUT_DIR`
+
+- `CHIRPS_PRECIP` --- path to pre-extracted CHIRPS precipitation data (folder containing parquet files)
+- `CIMIS_ETREF` --- path to pre-extracted CIMIS evapotranspiration data (folder containing parquet files)
+- `SSURGO_WEIGHTS` --- path to pre-computed SSURGO weights for LandIQ parcels (single parquet file)
+- `SSURGO_GDB` --- path to complete SSURGO geodatabase (geodatabase; folder with `.gdb` extension)
+
+A good way to set these is via a project-local `.Renviron` file that looks like this:
+
+```
+TAR_CONFIG=modules/data.land/inst/irrigation-statewide/_targets.yaml
+
+LANDIQ_PARCELS=/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1/parcels.gpkg
+LANDIQ_CROPS=/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1/crops_all_years.parq
+LANDIQ_TIMESERIES=/projectnb/dietzelab/ccmmf/management/phenology/matched_landiq_mslsp_v4.1
+EVENT_OUTPUT_DIR=/projectnb/dietzelab/ccmmf/usr/ashiklom/event-outputs
+
+CHIRPS_PRECIP=/projectnb/dietzelab/ccmmf/data/chirps-extracted
+CIMIS_ETREF=/projectnb/dietzelab/ccmmf/data/cimis-extracted
+SSURGO_WEIGHTS=/projectnb/dietzelab/ccmmf/data_raw/ssurgo/ssurgo-weights.parquet
+SSURGO_GDB=/projectnb/dietzelab/ccmmf/data_raw/ssurgo/gSSURGO_CA.gdb
+```
+
+Use R commands like `Sys.getenv("TAR_CONFIG")` from inside your R session to confirm these variables are set correctly.
+
+# Execution
+
+Assuming the variables above are set, you can run the pipeline with just `Rscript -e 'targets::tar_make()'`.
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
new file mode 100644
index 00000000000..a6146ec30cc
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -0,0 +1,161 @@
+#!/usr/bin/env Rscript
+
+library(targets)
+library(tarchetypes)
+library(crew)
+library(crew.cluster)
+
+root_dir <- here::here("modules/data.land/inst/irrigation-statewide/")
+logdir <- file.path(root_dir, "_logs")
+dir.create(logdir, showWarnings = FALSE, recursive = TRUE)
+
+n_parcels <- Sys.getenv("N_PARCELS", 1000)
+if (tolower(n_parcels) == "all") {
+  n_parcels <- NULL
+} else {
+  n_parcels <- as.integer(n_parcels)
+}
+batch_size <- as.integer(Sys.getenv("BATCH_SIZE", 100))
+n_remote_workers <- as.integer(Sys.getenv("N_REMOTE_WORKERS", 10))
+n_local_workers <- as.integer(Sys.getenv("NSLOTS", 1))
+exec_type <- Sys.getenv("EXEC_TYPE", "local")
+event_filename <- Sys.getenv("EVENT_FILENAME", "irrigation_small.parquet")
+
+stopifnot(exec_type %in% c("cluster", "local"))
+
+ctrl_local <- crew_controller_local(
+  name = "local",
+  workers = n_local_workers
+)
+
+ctrl_sge <- crew_controller_sge(
+  name = "sge",
+  workers = n_remote_workers,
+  options_cluster = crew_options_sge(
+    envvars = TRUE, # Needed for pixi
+    log_output = logdir
+  )
+)
+
+res_local <- tar_resources(
+  crew = tar_resources_crew(controller = "local")
+)
+res_sge <- tar_resources(
+  crew = tar_resources_crew(controller = "sge")
+)
+
+res_default <- if (exec_type == "local") {
+  message("Running locally")
+  res_local
+} else if (exec_type == "cluster") {
+  message("Running via SGE cluster")
+  res_sge
+} else {
+  stop("Unknown exec_type ", shQuote(exec_type))
+}
+
+tar_option_set(
+  controller = crew_controller_group(ctrl_local, ctrl_sge),
+  resources = res_default,
+  packages = c("ggplot2", "rlang", "PEcAn.data.land"),
+  imports = c("PEcAn.data.land")
+)
+
+if (exec_type == "cluster") {
+  tar_option_set(storage = "worker", retrieval = "worker")
+}
+
+tar_source(file.path(root_dir, "R"))
+
+list(
+  tar_target(crops_path, path.expand(Sys.getenv("LANDIQ_CROPS"))),
+  tar_target(mslsp_path, path.expand(Sys.getenv("LANDIQ_TIMESERIES"))),
+  tar_target(cimis_etref_path, path.expand(Sys.getenv("CIMIS_ETREF"))),
+  tar_target(chirps_precip_path, path.expand(Sys.getenv("CHIRPS_PRECIP"))),
+  tar_target(ssurgo_weights_path, path.expand(Sys.getenv("SSURGO_WEIGHTS"))),
+  tar_target(ssurgo_gdb_path, path.expand(Sys.getenv("SSURGO_GDB"))),
+
+  tar_target(event_output_dir, path.expand(Sys.getenv("EVENT_OUTPUT_DIR"))),
+
+  tar_target(validated_paths, {
+    stopifnot(
+      file.exists(crops_path),
+      dir.exists(mslsp_path),
+      length(list.files(mslsp_path, "\\.parquet")) == 7,
+      dir.exists(cimis_etref_path),
+      dir.exists(chirps_precip_path),
+      file.exists(ssurgo_weights_path),
+      dir.exists(ssurgo_gdb_path)
+    )
+    dir.create(event_output_dir, showWarnings = FALSE, recursive = TRUE)
+    TRUE
+  }),
+
+  tar_target(parcel_ids, get_parcel_ids(crops_path, n_parcels)),
+
+  tar_target(
+    parcel_id_batches,
+    split_into_batches(parcel_ids, batch_size),
+    iteration = "list"
+  ),
+
+  tar_target(
+    phenology,
+    get_phenology(mslsp_path, parcel_id_batches),
+    pattern = map(parcel_id_batches),
+    format = "parquet"
+  ),
+
+  tar_target(
+    etref,
+    get_etref(cimis_etref_path, parcel_id_batches),
+    pattern = map(parcel_id_batches),
+    format = "parquet"
+  ),
+
+  tar_target(
+    precip,
+    get_precip(chirps_precip_path, parcel_id_batches),
+    pattern = map(parcel_id_batches),
+    format = "parquet"
+  ),
+
+  tar_target(
+    crop_info,
+    get_crop_info(crops_path, parcel_id_batches),
+    pattern = map(parcel_id_batches),
+    format = "parquet"
+  ),
+
+  tar_target(
+    crops_with_soil,
+    add_soil_awc(crop_info, ssurgo_weights_path, ssurgo_gdb_path),
+    pattern = map(crop_info),
+    format = "parquet"
+  ),
+
+  tar_target(
+    complete_crop_timeseries,
+    make_crop_timeseries(crops_with_soil, phenology, precip, etref),
+    pattern = map(crops_with_soil, phenology, precip, etref),
+    format = "parquet"
+  ),
+
+  tar_target(
+    parcel_waterbalance,
+    apply_water_balance(complete_crop_timeseries, "parcel_id"),
+    pattern = map(complete_crop_timeseries),
+    format = "parquet"
+  ),
+
+  tar_target(
+    irr_events,
+    make_event_df(
+      parcel_waterbalance,
+      file.path(event_output_dir, event_filename)
+    ),
+    format = "file"
+  ),
+
+  NULL
+)
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.yaml b/modules/data.land/inst/irrigation-statewide/_targets.yaml
new file mode 100644
index 00000000000..63b4f4a8910
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/_targets.yaml
@@ -0,0 +1,3 @@
+main:
+  script: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets.R
+  store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets/
diff --git a/modules/data.land/inst/irrigation-statewide/run-pipeline.R b/modules/data.land/inst/irrigation-statewide/run-pipeline.R
new file mode 100644
index 00000000000..89ebb3b1885
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/run-pipeline.R
@@ -0,0 +1,19 @@
+#' ---
+#' title: "Statewide irrigation workflow"
+#' author: "Alexey N. Shiklomanov"
+#' ---
+
+library(targets)
+
+# devtools::document("modules/data.land")
+# devtools::install("modules/data.land", upgrade = FALSE, reload = TRUE)
+
+#' Run the pipeline. Targets that are already up-to-date will be skipped.
+tar_make()
+if (interactive()) {
+  tar_load_everything()
+}
+
+# tar_invalidate(dp_with_crops)
+# tar_load("phenology_crops")
+# tar_load(c("design_points", "dp_with_crops", "phenology"))

From 347989e5099499039566ad42f54357233cfc4c1c Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 20 Mar 2026 16:43:19 -0400
Subject: [PATCH 41/67] Add information message to targets pipeline

---
 .../data.land/inst/irrigation-statewide/_targets.R   | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
index a6146ec30cc..a21c9ab9d05 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -23,6 +23,18 @@ event_filename <- Sys.getenv("EVENT_FILENAME", "irrigation_small.parquet")
 
 stopifnot(exec_type %in% c("cluster", "local"))
 
+message(glue::glue(
+  "Running {n_parcels} parcels in batches of {batch_size} parcels each.\n",
+  "Execution type: {exec_type} with ",
+  if (exec_type == "local") {
+    "{n_local_workers} workers.\n"
+  } else {
+    "{n_remote_workers} workers.\n"
+  },
+  "Output will be saved to ",
+  "{file.path(Sys.getenv('EVENT_OUTPUT_DIR'), event_filename)}"
+))
+
 ctrl_local <- crew_controller_local(
   name = "local",
   workers = n_local_workers

From 091559575d7e1cbb30345e8238b17baa624a0417 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 20 Mar 2026 20:07:52 -0400
Subject: [PATCH 42/67] replace stop with pecan.logger

---
 modules/data.land/R/water_balance.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index c086244c808..4b81380fbea 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -205,22 +205,22 @@ calc_water_balance_rice <- function(
   n <- length(et)
 
   if (length(precip) != n) {
-    stop("et and precip must be the same length")
+    PEcAn.logger::logger.severe("et and precip must be the same length")
   }
   if (is.null(drain)) {
     drain <- rep(FALSE, n)
   }
   if (length(drain) != n) {
-    stop("drain must be the same length as et")
+    PEcAn.logger::logger.severe("drain must be the same length as et")
   }
   if (flood_min >= flood_target) {
-    stop("flood_min must be less than flood_target")
+    PEcAn.logger::logger.severe("flood_min must be less than flood_target")
   }
   if (flood_target >= flood_max) {
-    stop("flood_target must be less than flood_max")
+    PEcAn.logger::logger.severe("flood_target must be less than flood_max")
   }
   if (seepage < 0) {
-    stop("seepage must be non-negative")
+    PEcAn.logger::logger.severe("seepage must be non-negative")
   }
 
   pond_depth <- numeric(n)

From 581b9264e84a2c8be8970435adb1c47c52494696 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 20 Mar 2026 20:12:20 -0400
Subject: [PATCH 43/67] propagate rice args through apply_water_balance

---
 modules/data.land/R/water_balance.R          | 28 +++++++++++------
 modules/data.land/man/apply_water_balance.Rd | 33 +++++++++++++++++---
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 4b81380fbea..7c8fe7d36fd 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -282,15 +282,25 @@ calc_water_balance_rice <- function(
 #' `calc_water_balance`, the units here *do* matter -- they should be `mm_day`.
 #'
 #' @param df Data frame with columns: `date`, `location_id`, `etc_mm_day`,
-#' `precip_mm_day`, `crop_name`, and `whc_min_frac` (optional, defaults to 0.375).
-#' If a `whc_mm` column is present, it is used as the water holding capacity.
+#' `precip_mm_day`, `crop_name`, and `whc_min_frac` (optional, defaults to
+#' 0.375). If a `whc_mm` column is present, it is used as the water holding
+#' capacity.
 #' @param idcol Column name for grouping (typically, `location_id`, `parcel_id`
-#' or similar)
+#' or similar).
 #' @param whc_mm Water holding capacity (mm); ignored if `whc_mm` is a column
 #' in `df`.
-#' @return Data frame with added columns: `W_t`, `irr`, `runoff`
+#' @inheritParams calc_water_balance_rice
+#' @return Data frame with added columns: `W_t` / `pond_depth`, `irr`, `runoff`
 #' @export
-apply_water_balance <- function(df, idcol, whc_mm = 500) {
+apply_water_balance <- function(
+  df,
+  idcol,
+  whc_mm = 500,
+  flood_target = 125,
+  flood_min = 62.5,
+  flood_max = 175,
+  seepage = 2.5
+) {
   need_cols <- c("etc_mm_day", "precip_mm_day", "date", "crop_name")
   missing_cols <- need_cols[!(need_cols %in% colnames(df))]
   default_whc_min_frac <- 0.375
@@ -348,10 +358,10 @@ apply_water_balance <- function(df, idcol, whc_mm = 500) {
       results = tibble::as_tibble(try_wb_rice(
         et = .data$etc_mm_day,
         precip = .data$precip_mm_day,
-        flood_target = 125,
-        flood_min = 62.5,
-        flood_max = 175,
-        seepage = 2.5
+        flood_target = .env$flood_target,
+        flood_min = .env$flood_min,
+        flood_max = .env$flood_max,
+        seepage = .env$seepage
       )),
       .by = dplyr::all_of(idcol)
     ) |>
diff --git a/modules/data.land/man/apply_water_balance.Rd b/modules/data.land/man/apply_water_balance.Rd
index 9fe13466d00..eee6122c6d6 100644
--- a/modules/data.land/man/apply_water_balance.Rd
+++ b/modules/data.land/man/apply_water_balance.Rd
@@ -4,21 +4,44 @@
 \alias{apply_water_balance}
 \title{Apply water balance calculations to a data frame with multiple sites}
 \usage{
-apply_water_balance(df, idcol, whc_mm = 500)
+apply_water_balance(
+  df,
+  idcol,
+  whc_mm = 500,
+  flood_target = 125,
+  flood_min = 62.5,
+  flood_max = 175,
+  seepage = 2.5
+)
 }
 \arguments{
 \item{df}{Data frame with columns: `date`, `location_id`, `etc_mm_day`,
-`precip_mm_day`, `crop_name`, and `whc_min_frac` (optional, defaults to 0.375).
-If a `whc_mm` column is present, it is used as the water holding capacity.}
+`precip_mm_day`, `crop_name`, and `whc_min_frac` (optional, defaults to
+0.375). If a `whc_mm` column is present, it is used as the water holding
+capacity.}
 
 \item{idcol}{Column name for grouping (typically, `location_id`, `parcel_id`
-or similar)}
+or similar).}
 
 \item{whc_mm}{Water holding capacity (mm); ignored if `whc_mm` is a column
 in `df`.}
+
+\item{flood_target}{Numeric scalar. Target ponded water depth.
+Irrigation refills to this level.}
+
+\item{flood_min}{Numeric scalar. Minimum acceptable pond depth before
+irrigation is triggered.}
+
+\item{flood_max}{Numeric scalar. Maximum pond depth before bund
+overflow / runoff occurs.}
+
+\item{seepage}{Numeric scalar. Daily seepage + percolation loss
+Represents losses through the bund and downward percolation
+through the hardpan (if any). Typical range: 1-5 mm/day
+for well-puddled California soils.}
 }
 \value{
-Data frame with added columns: `W_t`, `irr`, `runoff`
+Data frame with added columns: `W_t` / `pond_depth`, `irr`, `runoff`
 }
 \description{
 Groups by location and applies calc_water_balance to each group. Unlike

From ed792244c9a444645fdce481dd2564d7441cba2e Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 20 Mar 2026 20:12:54 -0400
Subject: [PATCH 44/67] fix tidyselect call in tidyr::fill

---
 modules/data.land/inst/irrigation-statewide/R/misc.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/data.land/inst/irrigation-statewide/R/misc.R b/modules/data.land/inst/irrigation-statewide/R/misc.R
index 3c8e6e2c4d7..e7dc3d68f24 100644
--- a/modules/data.land/inst/irrigation-statewide/R/misc.R
+++ b/modules/data.land/inst/irrigation-statewide/R/misc.R
@@ -56,7 +56,7 @@ make_crop_timeseries <- function(crops_with_soil, phenology, precip, etref) {
       by = c("parcel_id", "date")
     ) |>
     dplyr::arrange(.data$parcel_id, .data$date) |>
-    tidyr::fill(.data$etref_mm_day) |>
+    tidyr::fill("etref_mm_day") |>
     dplyr::mutate(
       etc_mm_day = eto_to_etc_bism(
         eto = .data$etref_mm_day,

From 29f60c7fd96521d4c4396bfade21ada530f05276 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 20 Mar 2026 20:59:11 -0400
Subject: [PATCH 45/67] targets project shortcuts

TAR_PROJECT = "small" for local tests, "medium" for larger tests
---
 modules/data.land/.Rbuildignore               |  1 +
 .../inst/irrigation-statewide/.gitignore      |  2 +-
 .../inst/irrigation-statewide/_targets.R      | 60 ++++++++++++++++---
 .../inst/irrigation-statewide/_targets.yaml   |  6 ++
 4 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/modules/data.land/.Rbuildignore b/modules/data.land/.Rbuildignore
index edd7e377255..530eaa07a82 100644
--- a/modules/data.land/.Rbuildignore
+++ b/modules/data.land/.Rbuildignore
@@ -2,3 +2,4 @@ contrib
 data-raw
 ^docs$
 .*venv/
+.*/_targets.*/
diff --git a/modules/data.land/inst/irrigation-statewide/.gitignore b/modules/data.land/inst/irrigation-statewide/.gitignore
index ebb71e51f7c..42959cecde2 100644
--- a/modules/data.land/inst/irrigation-statewide/.gitignore
+++ b/modules/data.land/inst/irrigation-statewide/.gitignore
@@ -1,3 +1,3 @@
 _logs/
-_targets/
+_targets*/
 zz-*
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
index a21c9ab9d05..2131234e08c 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -9,21 +9,56 @@ root_dir <- here::here("modules/data.land/inst/irrigation-statewide/")
 logdir <- file.path(root_dir, "_logs")
 dir.create(logdir, showWarnings = FALSE, recursive = TRUE)
 
-n_parcels <- Sys.getenv("N_PARCELS", 1000)
+tar_project <- Sys.getenv("TAR_PROJECT", "main")
+proj_defaults <- list(
+  small = list(
+    n_parcels = 1000,
+    batch_size = 100,
+    n_remote_workers = 1,  # dummy value
+    exec_type = "local",
+    event_filename = "irrigation_1000.parquet"
+  ),
+  medium = list(
+    n_parcels = 10000,
+    batch_size = 1000,
+    n_remote_workers = 15,
+    exec_type = "cluster",
+    event_filename = "irrigation_10000.parquet"
+  )
+)
+proj_defaults[["main"]] <- proj_defaults[["small"]]
+
+n_parcels <- Sys.getenv(
+  "N_PARCELS",
+  proj_defaults[[tar_project]][["n_parcels"]]
+)
 if (tolower(n_parcels) == "all") {
   n_parcels <- NULL
 } else {
   n_parcels <- as.integer(n_parcels)
 }
-batch_size <- as.integer(Sys.getenv("BATCH_SIZE", 100))
-n_remote_workers <- as.integer(Sys.getenv("N_REMOTE_WORKERS", 10))
+batch_size <- as.integer(Sys.getenv(
+  "BATCH_SIZE",
+  proj_defaults[[tar_project]][["batch_size"]]
+))
+n_remote_workers <- as.integer(Sys.getenv(
+  "N_REMOTE_WORKERS",
+  proj_defaults[[tar_project]][["n_remote_workers"]]
+))
 n_local_workers <- as.integer(Sys.getenv("NSLOTS", 1))
-exec_type <- Sys.getenv("EXEC_TYPE", "local")
-event_filename <- Sys.getenv("EVENT_FILENAME", "irrigation_small.parquet")
+exec_type <- Sys.getenv(
+  "EXEC_TYPE",
+  proj_defaults[[tar_project]][["exec_type"]]
+)
+event_filename <- Sys.getenv(
+  "EVENT_FILENAME",
+  proj_defaults[[tar_project]][["event_filename"]]
+)
 
 stopifnot(exec_type %in% c("cluster", "local"))
 
 message(glue::glue(
+  "PROJECT: {tar_project}\n",
   "Running {n_parcels} parcels in batches of {batch_size} parcels each.\n",
   "Execution type: {exec_type} with ",
   if (exec_type == "local") {
@@ -32,7 +67,8 @@ message(glue::glue(
     "{n_remote_workers} workers.\n"
   },
   "Output will be saved to ",
-  "{file.path(Sys.getenv('EVENT_OUTPUT_DIR'), event_filename)}"
+  "{file.path(Sys.getenv('EVENT_OUTPUT_DIR'), event_filename)}\n",
+  "Targets output will be stored in ", tar_config_get("store")
 ))
 
 ctrl_local <- crew_controller_local(
@@ -45,7 +81,17 @@ ctrl_sge <- crew_controller_sge(
   workers = n_remote_workers,
   options_cluster = crew_options_sge(
     envvars = TRUE, # Needed for pixi
-    log_output = logdir
+    log_output = logdir,
+    script_lines = c(
+      # Try to prevent multiple threads
+      "#$ -pe omp 1",
+      "export OMP_NUM_THREADS=1",
+      "export OPENBLAS_NUM_THREADS=1",
+      "export MKL_NUM_THREADS=1",
+      "export RCPP_PARALLEL_NUM_THREADS=1",
+      "export GOTO_NUM_THREADS=1",
+      "export USE_SIMPLE_THREADED_LEVEL3=1"
+    )
   )
 )
 
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.yaml b/modules/data.land/inst/irrigation-statewide/_targets.yaml
index 63b4f4a8910..f5b210885a6 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.yaml
+++ b/modules/data.land/inst/irrigation-statewide/_targets.yaml
@@ -1,3 +1,9 @@
 main:
   script: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets.R
   store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets/
+small:
+  inherits: main
+  store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets_small/
+medium:
+  inherits: main
+  store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets_medium/

From a6c9543667b124407105627c0dd97e6d6a3abb7b Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 20 Mar 2026 21:38:50 -0400
Subject: [PATCH 46/67] add crude uncertainty propagation

Proof-of-concept to show that we can work with ensembles of irrigation
files. Uncertainty is estimated as just `amount_mm * rnorm(n, 1, frac)`
(where frac defaults to 0.1), applied uniformly to each parcel's
irrigation time series.
---
 .../irrigation-statewide/R/crop_timeseries.R  | 68 +++++++++++++
 .../inst/irrigation-statewide/R/events_df.R   | 56 +++++++++++
 .../inst/irrigation-statewide/R/misc.R        | 95 +------------------
 .../inst/irrigation-statewide/_targets.R      |  6 +-
 4 files changed, 130 insertions(+), 95 deletions(-)
 create mode 100644 modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
 create mode 100644 modules/data.land/inst/irrigation-statewide/R/events_df.R

diff --git a/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R b/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
new file mode 100644
index 00000000000..6015d6951c1
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
@@ -0,0 +1,68 @@
+#!/usr/bin/env Rscript
+
+make_crop_timeseries <- function(crops_with_soil, phenology, precip, etref) {
+  crop_cols <- c(
+    "parcel_id",
+    "year",
+    "crop_name",
+    "whc_min_frac",
+    "whc_mm",
+    "whc_min_frac"
+  )
+
+  crop_soil_timeseries <- crops_with_soil |>
+    dplyr::select(dplyr::all_of(crop_cols)) |>
+    dplyr::inner_join(
+      phenology,
+      by = c("parcel_id", "year"),
+      relationship = "many-to-many"
+    ) |>
+    dplyr::slice_max(
+      .data$canopy_cover,
+      n = 1,
+      by = c("parcel_id", "date")
+    )
+
+  check_unique <- crop_soil_timeseries |>
+    dplyr::group_by(.data$parcel_id, .data$date) |>
+    dplyr::count() |>
+    dplyr::filter(.data$n > 1)
+  if (nrow(check_unique) > 1) {
+    bad_parcels <- unique(check_unique[["parcel_id"]])
+    warning(
+      "The parcels below have some non-unique values ",
+      "even after `slice_max(canopy_cover)`. ",
+      "This is likely because of non-unique ",
+      "landIQ crop --> crop_type mappings. ",
+      "Selecting only the first row in each of these cases.",
+      "\n",
+      paste(bad_parcels, collapse = ", ")
+    )
+    crop_soil_timeseries <- crop_soil_timeseries |>
+      dplyr::slice_max(
+        .data$canopy_cover,
+        n = 1,
+        by = c("parcel_id", "date"),
+        with_ties = FALSE
+      )
+  }
+
+  complete_crop_timeseries <- crop_soil_timeseries |>
+    dplyr::left_join(precip, by = c("parcel_id", "date")) |>
+    dplyr::left_join(
+      dplyr::select(etref, -"year"),
+      by = c("parcel_id", "date")
+    ) |>
+    dplyr::arrange(.data$parcel_id, .data$date) |>
+    tidyr::fill("etref_mm_day") |>
+    dplyr::mutate(
+      etc_mm_day = eto_to_etc_bism(
+        eto = .data$etref_mm_day,
+        crop_name = .data$crop_name[[1]],
+        date = .data$date
+      ),
+      .by = "crop_name"
+    )
+
+  complete_crop_timeseries
+}
diff --git a/modules/data.land/inst/irrigation-statewide/R/events_df.R b/modules/data.land/inst/irrigation-statewide/R/events_df.R
new file mode 100644
index 00000000000..982c3e9b8b2
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/R/events_df.R
@@ -0,0 +1,56 @@
+#!/usr/bin/env Rscript
+
+make_event_df <- function(
+  parcel_waterbalance,
+  outfile,
+  n_ensemble = NULL,
+  frac_uncertainty = 0.1
+) {
+  pw_sub <- parcel_waterbalance |>
+    dplyr::filter(.data$irr > 0) |>
+    dplyr::relocate("parcel_id", "date", "crop_name", "canopy_cover", "irr")
+
+  irr_events <- pw_sub |>
+    dplyr::mutate(
+      crop_code = .data$crop_name,
+      method = dplyr::case_when(
+        .data$crop_code == "Rice" ~ "flood",
+        TRUE ~ "canopy"
+      )
+    ) |>
+    dplyr::select("parcel_id", "date", "amount_mm" = "irr", "method")
+
+  if (is.null(n_ensemble)) {
+    arrow::write_parquet(irr_events, outfile)
+    return(invisible(outfile))
+  }
+
+  # Crude uncertainty propagation. We apply a uniform uncertainty multiplier
+  # across the entire irrigation time series.
+  unc_table <- irr_events |>
+    dplyr::distinct(.data$parcel_id) |>
+    dplyr::mutate(
+      unc_multi = purrr::map(
+        .data$parcel_id,
+        ~rnorm(n_ensemble, 1.0, frac_uncertainty)
+      ),
+      ens_id = purrr::map(.data$unc_multi, seq_along)
+    ) |>
+    tidyr::unnest(c("unc_multi", "ens_id")) |>
+    dplyr::mutate(ens_id = sprintf("irr_ens_%03d", .data$ens_id))
+
+  irr_events_unc <- irr_events |>
+    dplyr::left_join(
+      unc_table,
+      by = "parcel_id",
+      relationship = "many-to-many"
+    ) |>
+    dplyr::mutate(
+      amount_mm = .data$amount_mm * .data$unc_multi,
+      .keep = "unused"
+    ) |>
+    dplyr::relocate("parcel_id", "ens_id", "date")
+
+  arrow::write_parquet(irr_events_unc, outfile)
+  invisible(outfile)
+}
diff --git a/modules/data.land/inst/irrigation-statewide/R/misc.R b/modules/data.land/inst/irrigation-statewide/R/misc.R
index e7dc3d68f24..eaa4cdfed78 100644
--- a/modules/data.land/inst/irrigation-statewide/R/misc.R
+++ b/modules/data.land/inst/irrigation-statewide/R/misc.R
@@ -1,96 +1,5 @@
+#!/usr/bin/env Rscript
+
 split_into_batches <- function(x, batch_size) {
   split(x, ceiling(seq_along(x) / batch_size))
 }
-
-make_crop_timeseries <- function(crops_with_soil, phenology, precip, etref) {
-  crop_cols <- c(
-    "parcel_id",
-    "year",
-    "crop_name",
-    "whc_min_frac",
-    "whc_mm",
-    "whc_min_frac"
-  )
-
-  crop_soil_timeseries <- crops_with_soil |>
-    dplyr::select(dplyr::all_of(crop_cols)) |>
-    dplyr::inner_join(
-      phenology,
-      by = c("parcel_id", "year"),
-      relationship = "many-to-many"
-    ) |>
-    dplyr::slice_max(
-      .data$canopy_cover,
-      n = 1,
-      by = c("parcel_id", "date")
-    )
-
-  check_unique <- crop_soil_timeseries |>
-    dplyr::group_by(.data$parcel_id, .data$date) |>
-    dplyr::count() |>
-    dplyr::filter(.data$n > 1)
-  if (nrow(check_unique) > 1) {
-    bad_parcels <- unique(check_unique[["parcel_id"]])
-    warning(
-      "The parcels below have some non-unique values ",
-      "even after `slice_max(canopy_cover)`. ",
-      "This is likely because of non-unique ",
-      "landIQ crop --> crop_type mappings. ",
-      "Selecting only the first row in each of these cases.",
-      "\n",
-      paste(bad_parcels, collapse = ", ")
-    )
-    crop_soil_timeseries <- crop_soil_timeseries |>
-      dplyr::slice_max(
-        .data$canopy_cover,
-        n = 1,
-        by = c("parcel_id", "date"),
-        with_ties = FALSE
-      )
-  }
-
-  complete_crop_timeseries <- crop_soil_timeseries |>
-    dplyr::left_join(precip, by = c("parcel_id", "date")) |>
-    dplyr::left_join(
-      dplyr::select(etref, -"year"),
-      by = c("parcel_id", "date")
-    ) |>
-    dplyr::arrange(.data$parcel_id, .data$date) |>
-    tidyr::fill("etref_mm_day") |>
-    dplyr::mutate(
-      etc_mm_day = eto_to_etc_bism(
-        eto = .data$etref_mm_day,
-        crop_name = .data$crop_name[[1]],
-        date = .data$date
-      ),
-      .by = "crop_name"
-    )
-
-  complete_crop_timeseries
-}
-
-make_event_df <- function(parcel_waterbalance, outfile) {
-  pw_sub <- parcel_waterbalance |>
-    dplyr::filter(.data$irr > 0) |>
-    dplyr::relocate("parcel_id", "date", "crop_name", "canopy_cover", "irr")
-
-  irr_events <- pw_sub |>
-    dplyr::select(
-      "parcel_id",
-      "date",
-      "crop_name",
-      amount_mm = "irr",
-      "canopy_cover"
-    ) |>
-    dplyr::mutate(
-      crop_code = crop_name,
-      method = dplyr::case_when(
-        crop_code == "Rice" ~ "flood",
-        TRUE ~ "canopy"
-      )
-    ) |>
-    dplyr::select("parcel_id", "date", "amount_mm", "method")
-
-  arrow::write_parquet(irr_events, outfile)
-  invisible(outfile)
-}
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
index 2131234e08c..24ec43d328c 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -5,7 +5,7 @@ library(tarchetypes)
 library(crew)
 library(crew.cluster)
 
-root_dir <- here::here("modules/data.land/inst/irrigation-statewide/")
+root_dir <- here::here("modules/data.land/inst/irrigation-statewide")
 logdir <- file.path(root_dir, "_logs")
 dir.create(logdir, showWarnings = FALSE, recursive = TRUE)
 
@@ -210,7 +210,9 @@ list(
     irr_events,
     make_event_df(
       parcel_waterbalance,
-      file.path(event_output_dir, event_filename)
+      file.path(event_output_dir, event_filename),
+      n_ensemble = 20,
+      frac_uncertainty = 0.1
     ),
     format = "file"
   ),

From 82eeefb287d7fab0ee285b999010e1e7235b966b Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sun, 22 Mar 2026 15:26:13 -0400
Subject: [PATCH 47/67] make full crop time series, filling missing data

instead of left-joining etref and precip into (incomplete) phenology
time series, left-join phenology *into* *complete* etref and precip time
series. then, fill.

default canopy_cover value is 0. for other values, fill "down" first (if
missing, assume previous), then, fill "up" (if missing, assume next
non-missing --- basically, fill the beginning of the time series with
the first value).
---
 .../irrigation-statewide/R/crop_timeseries.R     | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R b/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
index 6015d6951c1..5a47da4d3e5 100644
--- a/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
+++ b/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
@@ -47,14 +47,26 @@ make_crop_timeseries <- function(crops_with_soil, phenology, precip, etref) {
       )
   }
 
-  complete_crop_timeseries <- crop_soil_timeseries |>
-    dplyr::left_join(precip, by = c("parcel_id", "date")) |>
+  start_date <- min(crop_soil_timeseries[["date"]])
+  end_date <- max(crop_soil_timeseries[["date"]])
+
+  complete_crop_timeseries <- precip |>
+    dplyr::filter(
+      .data$date >= .env$start_date,
+      .data$date <= .env$end_date
+    ) |>
     dplyr::left_join(
       dplyr::select(etref, -"year"),
       by = c("parcel_id", "date")
     ) |>
     dplyr::arrange(.data$parcel_id, .data$date) |>
     tidyr::fill("etref_mm_day") |>
+    dplyr::left_join(crop_soil_timeseries, by = c("parcel_id", "date")) |>
+    tidyr::replace_na(list(canopy_cover = 0)) |>
+    tidyr::fill(
+      c("whc_min_frac", "whc_mm", "crop_name"),
+      .direction = "downup"
+    ) |>
     dplyr::mutate(
       etc_mm_day = eto_to_etc_bism(
         eto = .data$etref_mm_day,

From f76ad1222ed7a48dc8ca6ac914b51d8fade5e5d9 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sun, 22 Mar 2026 20:13:51 -0400
Subject: [PATCH 48/67] irrigation: add config for all parcels

---
 modules/data.land/inst/irrigation-statewide/_targets.R | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
index 24ec43d328c..ce12c75c597 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -24,6 +24,13 @@ proj_defaults <- list(
     n_remote_workers = 15,
     exec_type = "cluster",
     event_filename = "irrigation_10000.parquet"
+  ),
+  all = list(
+    n_parcels = "all",
+    batch_size = 5000,
+    n_remote_workers = 24,
+    exec_type = "cluster",
+    event_filename = "irrigation_all.parquet"
   )
 )
 proj_defaults[["main"]] <- proj_defaults[["small"]]

From ef4318ebd9d7dea067b9207ecc4a5d3bf3efac09 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Sun, 22 Mar 2026 21:13:45 -0400
Subject: [PATCH 49/67] fix SGE workers crashing on init

This is caused by an insidious threading bug between OpenBLAS and
nanonext (the driver behind TLS that crew/mirai uses). See below Claude
debugging session for more details.

https://claude.ai/share/39915b2f-9047-4367-8f9f-c310a6441b75
---
 .../inst/irrigation-statewide/_targets.R      | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
index ce12c75c597..7432daf8a06 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -86,18 +86,19 @@ ctrl_local <- crew_controller_local(
 ctrl_sge <- crew_controller_sge(
   name = "sge",
   workers = n_remote_workers,
+  # TLS causes weird allocator bugs. We're on an internal network, so no TLS is
+  # probably fine.
+  tls = crew::crew_tls(mode = "none"),
   options_cluster = crew_options_sge(
-    envvars = TRUE, # Needed for pixi
     log_output = logdir,
     script_lines = c(
-      # Try to prevent multiple threads
-      "#$ -pe omp 1",
-      "export OMP_NUM_THREADS=1",
-      "export OPENBLAS_NUM_THREADS=1",
-      "export MKL_NUM_THREADS=1",
-      "export RCPP_PARALLEL_NUM_THREADS=1",
-      "export GOTO_NUM_THREADS=1",
-      "export USE_SIMPLE_THREADED_LEVEL3=1"
+      # Activate pixi
+      'eval "$(pixi shell-hook -s bash)"',
+      # Diagnostics
+      "echo 'PIXI environment:'",
+      "env | grep PIXI",
+      "echo 'R .libPaths():'",
+      "Rscript -e '.libPaths()'"
     )
   )
 )

From 6ebc3449d7188d1d86d5442d4e27d5ca42bc9552 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 26 Mar 2026 12:14:39 -0400
Subject: [PATCH 50/67] update crop_whc citation

---
 modules/data.land/R/data.R        | 4 ++--
 modules/data.land/man/crop_whc.Rd | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/data.land/R/data.R b/modules/data.land/R/data.R
index 3e8651f848e..34f5c931c72 100644
--- a/modules/data.land/R/data.R
+++ b/modules/data.land/R/data.R
@@ -307,8 +307,8 @@
 #'   \item{rooting_depth_notes}{Rationale or source for the rooting depth value.}
 #' }
 #' @source Allen, R. G., Pereira, L. S., Raes, D., & Smith, M.
-#' \emph{FAO Irrigation and Drainage Paper No. 56}.
-#' https://www.professormendoncauenf.com.br/wp-content/uploads/2021/03/ag_fao_56_ingles.pdf
+#' \emph{FAO Irrigation and Drainage Paper No. 56: Crop evapotranspiration}. Chapter 8. Table 22.
+#' https://www.fao.org/4/x0490e/x0490e0e.htm#chapter%208%20%20%20etc%20under%20soil%20water%20stress%20conditions
 #' @examples
 #' data(crop_whc)
 #' head(crop_whc)
diff --git a/modules/data.land/man/crop_whc.Rd b/modules/data.land/man/crop_whc.Rd
index 70a03fd0000..875dfb16b11 100644
--- a/modules/data.land/man/crop_whc.Rd
+++ b/modules/data.land/man/crop_whc.Rd
@@ -18,8 +18,8 @@ A tibble with one row per crop and the following columns:
 }
 \source{
 Allen, R. G., Pereira, L. S., Raes, D., & Smith, M.
-\emph{FAO Irrigation and Drainage Paper No. 56}.
-https://www.professormendoncauenf.com.br/wp-content/uploads/2021/03/ag_fao_56_ingles.pdf
+\emph{FAO Irrigation and Drainage Paper No. 56: Crop evapotranspiration}. Chapter 8. Table 22.
+https://www.fao.org/4/x0490e/x0490e0e.htm#chapter%208%20%20%20etc%20under%20soil%20water%20stress%20conditions
 }
 \usage{
 crop_whc

From 25466ac54afd7c9d1e73405cc85a7043f3328aa4 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 26 Mar 2026 13:31:59 -0400
Subject: [PATCH 51/67] calc_awc needs to also aggregate components

---
 .../inst/irrigation-statewide/R/calc_awc.R        | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/modules/data.land/inst/irrigation-statewide/R/calc_awc.R b/modules/data.land/inst/irrigation-statewide/R/calc_awc.R
index e83e1222ff1..dfd671a44f4 100644
--- a/modules/data.land/inst/irrigation-statewide/R/calc_awc.R
+++ b/modules/data.land/inst/irrigation-statewide/R/calc_awc.R
@@ -66,17 +66,26 @@ add_soil_awc <- function(
       rooting_depth_cm = .data$rooting_depth_m * 100,
       .keep = "unused"
     ) |>
+    # Aggregate horizons (by component)
     dplyr::summarize(
-      whc_mm = calc_effective_awc(
+      whc_mm_cmp = calc_effective_awc(
         .data$hzdept_r,
         .data$hzdepb_r,
         .data$awc_r,
         .data$rooting_depth_cm
       ),
-      .by = c("parcel_id", "mukey", "cokey", "area_m2", "weight")
+      .by = c("parcel_id", "mukey", "cokey", "area_m2", "weight", "comppct_r")
     ) |>
+    # Aggregate components (by mapping unit)
     dplyr::summarize(
-      whc_mm = sum(.data$whc_mm * .data$weight),
+      whc_mm_mu = sum(
+        .data$whc_mm_cmp * .data$comppct_r / sum(.data$comppct_r)
+      ),
+      .by = c("parcel_id", "mukey", "area_m2", "weight")
+    ) |>
+    # Aggregate mapping units (by parcel)
+    dplyr::summarize(
+      whc_mm = sum(.data$whc_mm_mu * .data$weight),
       .by = "parcel_id"
     )
 

From 30ba3954601433b09a59106a49f0776f0fdbd03b Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 9 Apr 2026 10:10:00 -0400
Subject: [PATCH 52/67] write final outputs to parquet

---
 .../irrigation-statewide/R/crop_timeseries.R  |  3 ++
 .../inst/irrigation-statewide/R/events_df.R   |  7 ++---
 .../inst/irrigation-statewide/_targets.R      | 31 ++++++++++++++++---
 .../inst/irrigation-statewide/_targets.yaml   |  3 ++
 .../inst/irrigation-statewide/push-to-carb.sh |  5 +++
 .../inst/irrigation-statewide/run-pipeline.R  |  6 ++++
 6 files changed, 45 insertions(+), 10 deletions(-)
 create mode 100644 modules/data.land/inst/irrigation-statewide/push-to-carb.sh

diff --git a/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R b/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
index 5a47da4d3e5..d706558c0ef 100644
--- a/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
+++ b/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
@@ -1,5 +1,8 @@
 #!/usr/bin/env Rscript
 
+# Sys.setenv(TAR_PROJECT = "small")
+# targets::tar_load(c(crops_with_soil, phenology, precip, etref))
+
 make_crop_timeseries <- function(crops_with_soil, phenology, precip, etref) {
   crop_cols <- c(
     "parcel_id",
diff --git a/modules/data.land/inst/irrigation-statewide/R/events_df.R b/modules/data.land/inst/irrigation-statewide/R/events_df.R
index 982c3e9b8b2..693ac9b4643 100644
--- a/modules/data.land/inst/irrigation-statewide/R/events_df.R
+++ b/modules/data.land/inst/irrigation-statewide/R/events_df.R
@@ -2,7 +2,6 @@
 
 make_event_df <- function(
   parcel_waterbalance,
-  outfile,
   n_ensemble = NULL,
   frac_uncertainty = 0.1
 ) {
@@ -21,8 +20,7 @@ make_event_df <- function(
     dplyr::select("parcel_id", "date", "amount_mm" = "irr", "method")
 
   if (is.null(n_ensemble)) {
-    arrow::write_parquet(irr_events, outfile)
-    return(invisible(outfile))
+    return(irr_events)
   }
 
   # Crude uncertainty propagation. We apply a uniform uncertainty multiplier
@@ -51,6 +49,5 @@ make_event_df <- function(
     ) |>
     dplyr::relocate("parcel_id", "ens_id", "date")
 
-  arrow::write_parquet(irr_events_unc, outfile)
-  invisible(outfile)
+  irr_events_unc
 }
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
index 7432daf8a06..fc2af3d6802 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -28,8 +28,8 @@ proj_defaults <- list(
   all = list(
     n_parcels = "all",
     batch_size = 5000,
-    n_remote_workers = 24,
-    exec_type = "cluster",
+    n_remote_workers = 60,
+    exec_type = "local",
     event_filename = "irrigation_all.parquet"
   )
 )
@@ -98,7 +98,9 @@ ctrl_sge <- crew_controller_sge(
       "echo 'PIXI environment:'",
       "env | grep PIXI",
       "echo 'R .libPaths():'",
-      "Rscript -e '.libPaths()'"
+      "Rscript -e '.libPaths()'",
+      # prevent arrow parallelism
+      "export OMP_NUM_THREADS=1"
     )
   )
 )
@@ -215,13 +217,32 @@ list(
   ),
 
   tar_target(
-    irr_events,
+    irr_events_df,
     make_event_df(
       parcel_waterbalance,
-      file.path(event_output_dir, event_filename),
       n_ensemble = 20,
       frac_uncertainty = 0.1
     ),
+    pattern = map(parcel_waterbalance),
+    format = "parquet"
+  ),
+
+  tar_target(
+    irr_events_files,
+    {
+      parcel_ids <- unique(irr_events_df[["parcel_id"]])
+      pid_min <- min(parcel_ids)
+      pid_max <- max(parcel_ids)
+      out_dir <- file.path(
+        event_output_dir,
+        gsub("\\.parquet$", "", event_filename)
+      )
+      dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
+      out_file <- file.path(out_dir, sprintf("%d_%d.parquet", pid_min, pid_max))
+      arrow::write_parquet(irr_events_df, out_file)
+      out_file
+    },
+    pattern = map(irr_events_df),
     format = "file"
   ),
 
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.yaml b/modules/data.land/inst/irrigation-statewide/_targets.yaml
index f5b210885a6..2fe59f8cee4 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.yaml
+++ b/modules/data.land/inst/irrigation-statewide/_targets.yaml
@@ -7,3 +7,6 @@ small:
 medium:
   inherits: main
   store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets_medium/
+all:
+  inherits: main
+  store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets_all/
diff --git a/modules/data.land/inst/irrigation-statewide/push-to-carb.sh b/modules/data.land/inst/irrigation-statewide/push-to-carb.sh
new file mode 100644
index 00000000000..ef0822b44fd
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/push-to-carb.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+aws s3 sync --profile ccmmf \
+  /projectnb/dietzelab/ccmmf/management/irrigation_event_files/ \
+  s3://carb/management/irrigation/v1.0/
diff --git a/modules/data.land/inst/irrigation-statewide/run-pipeline.R b/modules/data.land/inst/irrigation-statewide/run-pipeline.R
index 89ebb3b1885..27f51f23023 100644
--- a/modules/data.land/inst/irrigation-statewide/run-pipeline.R
+++ b/modules/data.land/inst/irrigation-statewide/run-pipeline.R
@@ -3,6 +3,11 @@
 #' author: "Alexey N. Shiklomanov"
 #' ---
 
+Sys.setenv(
+  "TAR_PROJECT" = "all",
+  "OMP_NUM_THREADS" = 1
+)
+
 library(targets)
 
 # devtools::document("modules/data.land")
@@ -10,6 +15,7 @@ library(targets)
 
 #' Run the pipeline. Targets that are already up-to-date will be skipped.
 tar_make()
+
 if (interactive()) {
   tar_load_everything()
 }

From ed673d14d9165f2cce2e4c61ee76e9075d155f66 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Thu, 9 Apr 2026 10:21:43 -0400
Subject: [PATCH 53/67] write irrigation output directly to named parquet

don't add extra steps.
---
 .../inst/irrigation-statewide/R/events_df.R   | 15 ++++++++++
 .../inst/irrigation-statewide/_targets.R      | 28 ++++---------------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/modules/data.land/inst/irrigation-statewide/R/events_df.R b/modules/data.land/inst/irrigation-statewide/R/events_df.R
index 693ac9b4643..085e38c51bc 100644
--- a/modules/data.land/inst/irrigation-statewide/R/events_df.R
+++ b/modules/data.land/inst/irrigation-statewide/R/events_df.R
@@ -1,5 +1,20 @@
 #!/usr/bin/env Rscript
 
+make_event_df_parquet <- function(output_dir, ..., out_file = NULL) {
+  result <- make_event_df(...)
+  if (is.null(out_file)) {
+    pid_min <- min(result[["parcel_id"]], na.rm = TRUE)
+    pid_max <- max(result[["parcel_id"]], na.rm = TRUE)
+    dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
+    out_file <- file.path(
+      output_dir,
+      sprintf("%d_%d.parquet", pid_min, pid_max)
+    )
+  }
+  arrow::write_parquet(result, out_file)
+  invisible(out_file)
+}
+
 make_event_df <- function(
   parcel_waterbalance,
   n_ensemble = NULL,
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
index fc2af3d6802..005a4428a77 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -16,21 +16,21 @@ proj_defaults <- list(
     batch_size = 100,
     n_remote_workers = 1,  # dummy value
     exec_type = "local",
-    event_filename = "irrigation_1000.parquet"
+    event_filename = "irrigation_1000"
   ),
   medium = list(
     n_parcels = 10000,
     batch_size = 1000,
     n_remote_workers = 15,
     exec_type = "cluster",
-    event_filename = "irrigation_10000.parquet"
+    event_filename = "irrigation_10000"
   ),
   all = list(
     n_parcels = "all",
     batch_size = 5000,
     n_remote_workers = 60,
     exec_type = "local",
-    event_filename = "irrigation_all.parquet"
+    event_filename = "irrigation_all"
   )
 )
 proj_defaults[["main"]] <- proj_defaults[["small"]]
@@ -218,31 +218,13 @@ list(
 
   tar_target(
     irr_events_df,
-    make_event_df(
+    make_event_df_parquet(
+      file.path(event_output_dir, event_filename),
       parcel_waterbalance,
       n_ensemble = 20,
       frac_uncertainty = 0.1
     ),
     pattern = map(parcel_waterbalance),
-    format = "parquet"
-  ),
-
-  tar_target(
-    irr_events_files,
-    {
-      parcel_ids <- unique(irr_events_df[["parcel_id"]])
-      pid_min <- min(parcel_ids)
-      pid_max <- max(parcel_ids)
-      out_dir <- file.path(
-        event_output_dir,
-        gsub("\\.parquet$", "", event_filename)
-      )
-      dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
-      out_file <- file.path(out_dir, sprintf("%d_%d.parquet", pid_min, pid_max))
-      arrow::write_parquet(irr_events_df, out_file)
-      out_file
-    },
-    pattern = map(irr_events_df),
     format = "file"
   ),
 

From c6fcc46beeebe6484aa3b2ab2dd16e858db0e981 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 14:37:29 -0400
Subject: [PATCH 54/67] use config for workflow configuration

---
 .../inst/irrigation-statewide/_targets.R      | 87 ++++++-------------
 .../inst/irrigation-statewide/config.yml      | 23 +++++
 .../irrigation-statewide/config_paths.yml     |  8 ++
 3 files changed, 59 insertions(+), 59 deletions(-)
 create mode 100644 modules/data.land/inst/irrigation-statewide/config.yml
 create mode 100644 modules/data.land/inst/irrigation-statewide/config_paths.yml

diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/modules/data.land/inst/irrigation-statewide/_targets.R
index 005a4428a77..c9e499954c8 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/modules/data.land/inst/irrigation-statewide/_targets.R
@@ -9,63 +9,32 @@ root_dir <- here::here("modules/data.land/inst/irrigation-statewide")
 logdir <- file.path(root_dir, "_logs")
 dir.create(logdir, showWarnings = FALSE, recursive = TRUE)
 
-tar_project <- Sys.getenv("TAR_PROJECT", "main")
-proj_defaults <- list(
-  small = list(
-    n_parcels = 1000,
-    batch_size = 100,
-    n_remote_workers = 1,  # dummy value
-    exec_type = "local",
-    event_filename = "irrigation_1000"
-  ),
-  medium = list(
-    n_parcels = 10000,
-    batch_size = 1000,
-    n_remote_workers = 15,
-    exec_type = "cluster",
-    event_filename = "irrigation_10000"
-  ),
-  all = list(
-    n_parcels = "all",
-    batch_size = 5000,
-    n_remote_workers = 60,
-    exec_type = "local",
-    event_filename = "irrigation_all"
-  )
-)
-proj_defaults[["main"]] <- proj_defaults[["small"]]
+targets_config <- Sys.getenv("TAR_CONFIG", file.path(root_dir, "_targets.yaml"))
+Sys.setenv(TAR_CONFIG = targets_config)
 
-n_parcels <- Sys.getenv(
-  "N_PARCELS",
-  proj_defaults[[tar_project]][["n_parcels"]]
+project <- Sys.getenv("TAR_PROJECT", "small")
+config_base <- config::get(
+  file = file.path(root_dir, "config.yml"),
+  config = project
 )
-if (tolower(n_parcels) == "all") {
-  n_parcels <- NULL
-} else {
-  n_parcels <- as.integer(n_parcels)
-}
-batch_size <- as.integer(Sys.getenv(
-  "BATCH_SIZE",
-  proj_defaults[[tar_project]][["batch_size"]]
-))
-n_remote_workers <- as.integer(Sys.getenv(
-  "N_REMOTE_WORKERS",
-  proj_defaults[[tar_project]][["n_remote_workers"]]
-))
-n_local_workers <- as.integer(Sys.getenv("NSLOTS", 1))
-exec_type <- Sys.getenv(
-  "EXEC_TYPE",
-  proj_defaults[[tar_project]][["exec_type"]]
-)
-event_filename <- Sys.getenv(
-  "EVENT_FILENAME",
-  proj_defaults[[tar_project]][["event_filename"]]
+config_paths <- config::get(
+  file = file.path(root_dir, "config_paths.yml"),
+  config = Sys.getenv("IRRIGATION_PATHS_CONFIG", "default")
 )
+config <- config::merge(config_base, config_paths)
 
+n_parcels <- config[["n_parcels"]]
+batch_size <- config[["batch_size"]]
+n_remote_workers <- config[["n_remote_workers"]]
+n_local_workers <- as.integer(Sys.getenv("NSLOTS", 1))
+exec_type <- config[["exec_type"]]
 stopifnot(exec_type %in% c("cluster", "local"))
+event_output_dir <- config[["event_output_dir"]]
+event_filename <- config[["event_filename"]]
+n_irr_ensemble <- config[["n_irr_ensemble"]]
 
 message(glue::glue(
-  "PROJECT: {tar_project}\n",
+  "PROJECT: {project}\n",
   "Running {n_parcels} parcels in batches of {batch_size} parcels each.\n",
   "Execution type: {exec_type} with ",
   if (exec_type == "local") {
@@ -74,7 +43,7 @@ message(glue::glue(
     "{n_remote_workers} workers.\n"
   },
   "Output will be saved to ",
-  "{file.path(Sys.getenv('EVENT_OUTPUT_DIR'), event_filename)}\n",
+  "{file.path(event_output_dir, event_filename)}\n",
   "Targets output will be stored in ", tar_config_get("store")
 ))
 
@@ -136,14 +105,14 @@ if (exec_type == "cluster") {
 tar_source(file.path(root_dir, "R"))
 
 list(
-  tar_target(crops_path, path.expand(Sys.getenv("LANDIQ_CROPS"))),
-  tar_target(mslsp_path, path.expand(Sys.getenv("LANDIQ_TIMESERIES"))),
-  tar_target(cimis_etref_path, path.expand(Sys.getenv("CIMIS_ETREF"))),
-  tar_target(chirps_precip_path, path.expand(Sys.getenv("CHIRPS_PRECIP"))),
-  tar_target(ssurgo_weights_path, path.expand(Sys.getenv("SSURGO_WEIGHTS"))),
-  tar_target(ssurgo_gdb_path, path.expand(Sys.getenv("SSURGO_GDB"))),
+  tar_target(crops_path, path.expand(config[["crops_path"]])),
+  tar_target(mslsp_path, path.expand(config[["mslsp_path"]])),
+  tar_target(cimis_etref_path, path.expand(config[["cimis_etref_path"]])),
+  tar_target(chirps_precip_path, path.expand(config[["chirps_precip_path"]])),
+  tar_target(ssurgo_weights_path, path.expand(config[["ssurgo_weights_path"]])),
+  tar_target(ssurgo_gdb_path, path.expand(config[["ssurgo_gdb_path"]])),
 
-  tar_target(event_output_dir, path.expand(Sys.getenv("EVENT_OUTPUT_DIR"))),
+  tar_target(event_output_dir, path.expand(config[["event_output_dir"]])),
 
   tar_target(validated_paths, {
     stopifnot(
@@ -221,7 +190,7 @@ list(
     make_event_df_parquet(
       file.path(event_output_dir, event_filename),
       parcel_waterbalance,
-      n_ensemble = 20,
+      n_ensemble = n_irr_ensemble,
       frac_uncertainty = 0.1
     ),
     pattern = map(parcel_waterbalance),
diff --git a/modules/data.land/inst/irrigation-statewide/config.yml b/modules/data.land/inst/irrigation-statewide/config.yml
new file mode 100644
index 00000000000..dd79356bc9c
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/config.yml
@@ -0,0 +1,23 @@
+default:
+  n_parcels: 1000
+  batch_size: 100
+  n_remote_workers: 1
+  exec_type: "local"
+  event_filename: "irrigation_1000"
+  n_irr_ensembe: 20
+
+small:
+
+medium:
+  n_parcels: 10000
+  batch_size: 1000
+  n_remote_workers: 15
+  exec_type: "cluster"
+  event_filename: "irrigation_10000"
+
+all:
+  n_parcels: null   # `null` means "all"
+  batch_size: 5000
+  n_remote_workers: 60
+  exec_type: "local"
+  event_filename: "irrigation_all"
diff --git a/modules/data.land/inst/irrigation-statewide/config_paths.yml b/modules/data.land/inst/irrigation-statewide/config_paths.yml
new file mode 100644
index 00000000000..64e981f73df
--- /dev/null
+++ b/modules/data.land/inst/irrigation-statewide/config_paths.yml
@@ -0,0 +1,8 @@
+default:
+  event_output_dir: "/projectnb/dietzelab/ccmmf/usr/ashiklom/event-outputs"
+  crops_path: "/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1/crops_all_years.parq"
+  mslsp_path: "/projectnb/dietzelab/ccmmf/management/phenology/matched_landiq_mslsp_v4.1"
+  cimis_etref_path: "/projectnb/dietzelab/ccmmf/data/cimis-extracted"
+  chirps_precip_path: "/projectnb/dietzelab/ccmmf/data/chirps-extracted"
+  ssurgo_weights_path: "/projectnb/dietzelab/ccmmf/data_raw/ssurgo/ssurgo-weights.parquet"
+  ssurgo_gdb_path: "/projectnb/dietzelab/ccmmf/data_raw/ssurgo/gSSURGO_CA.gdb"

From f54974d4350051accc80c556bd18e0b0f25f2250 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 14:38:00 -0400
Subject: [PATCH 55/67] add irrigation maximum parameter

---
 modules/data.land/R/water_balance.R          | 22 +++++++++++++++-----
 modules/data.land/man/apply_water_balance.Rd |  5 +++++
 modules/data.land/man/calc_water_balance.Rd  |  6 +++++-
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 7c8fe7d36fd..1421bbcf1bc 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -32,6 +32,8 @@
 #'   triggered when soil water falls below this level; defaults to
 #'   `whc_min_frac * whc` if NULL.
 #'   Can be a single value or a vector of the same length as `et`.
+#' @param irrigation_max If set, maximum amount of irrigation to apply at a
+#'   time (distance).
 #' @return List with vectors: W_t (soil water), irr (irrigation), runoff
 #' @examples
 #' # Calculate WHC from field capacity, wilting point, and rooting depth
@@ -52,7 +54,8 @@ calc_water_balance <- function(
   whc,
   whc_min_frac,
   W_initial = NULL, #nolint: object_name_linter
-  w_min = NULL
+  w_min = NULL,
+  irrigation_max = NULL
 ) {
 
   # nolint start: object_name_linter
@@ -115,10 +118,10 @@ calc_water_balance <- function(
     W0 <- W_prev + precip[t] - et[t]
 
     # If W0 falls below w_min (e.g., high ET; low precip), irrigate
-    # to field capacity (i.e., full WHC).
+    # to field capacity (i.e., full WHC), but no more than irrigation_max.
     if (W0 < w_min[t]) {
-      irr[t] <- whc[t] - W0
-      W0 <- whc[t]
+      irr[t] <- min(whc[t] - W0, irrigation_max)
+      W0 <- W0 + irr[t]
     } else {
       irr[t] <- 0
     }
@@ -289,6 +292,9 @@ calc_water_balance_rice <- function(
 #' or similar).
 #' @param whc_mm Water holding capacity (mm); ignored if `whc_mm` is a column
 #' in `df`.
+#' @param irrigation_max_mm Maximum irrigation to be applied at a time. See
+#' `irrigation_max` argument of [calc_water_balance()]. Ignored if
+#' `irrigation_max_mm` is a column of `df`.
 #' @inheritParams calc_water_balance_rice
 #' @return Data frame with added columns: `W_t` / `pond_depth`, `irr`, `runoff`
 #' @export
@@ -296,6 +302,7 @@ apply_water_balance <- function(
   df,
   idcol,
   whc_mm = 500,
+  irrigation_max_mm = 150,
   flood_target = 125,
   flood_min = 62.5,
   flood_max = 175,
@@ -334,6 +341,10 @@ apply_water_balance <- function(
     df[["whc_mm"]] <- whc_mm
   }
 
+  if (!("irrigation_max_mm" %in% colnames(df))) {
+    df[["irrigation_max_mm"]] <- irrigation_max_mm
+  }
+
   try_wb_rice <- function(...) {
     tryCatch(
       calc_water_balance_rice(...),
@@ -396,7 +407,8 @@ apply_water_balance <- function(
         et = .data$etc_mm_day,
         precip = .data$precip_mm_day,
         whc = .data$whc_mm,
-        whc_min_frac = .data$whc_min_frac
+        whc_min_frac = .data$whc_min_frac,
+        irrigation_max = .data$irrigation_max_mm
       )),
       .by = dplyr::all_of(idcol)
     ) |>
diff --git a/modules/data.land/man/apply_water_balance.Rd b/modules/data.land/man/apply_water_balance.Rd
index eee6122c6d6..dd27e9963fb 100644
--- a/modules/data.land/man/apply_water_balance.Rd
+++ b/modules/data.land/man/apply_water_balance.Rd
@@ -8,6 +8,7 @@ apply_water_balance(
   df,
   idcol,
   whc_mm = 500,
+  irrigation_max_mm = 150,
   flood_target = 125,
   flood_min = 62.5,
   flood_max = 175,
@@ -26,6 +27,10 @@ or similar).}
 \item{whc_mm}{Water holding capacity (mm); ignored if `whc_mm` is a column
 in `df`.}
 
+\item{irrigation_max_mm}{Maximum irrigation to be applied at a time. See
+`irrigation_max` argument of [calc_water_balance()]. Ignored if
+`irrigation_max_mm` is a column of `df`.}
+
 \item{flood_target}{Numeric scalar. Target ponded water depth.
 Irrigation refills to this level.}
 
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index 563333117ea..35c1be843c3 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -10,7 +10,8 @@ calc_water_balance(
   whc,
   whc_min_frac,
   W_initial = NULL,
-  w_min = NULL
+  w_min = NULL,
+  irrigation_max = NULL
 )
 }
 \arguments{
@@ -33,6 +34,9 @@ Can be a single value or a vector of the same length as `et`.}
 triggered when soil water falls below this level; defaults to
 `whc_min_frac * whc` if NULL.
 Can be a single value or a vector of the same length as `et`.}
+
+\item{irrigation_max}{If set, maximum amount of irrigation to apply at a
+time (distance).}
 }
 \value{
 List with vectors: W_t (soil water), irr (irrigation), runoff

From 70481bcde1bfa930e6b4812fbf56715399de2d2c Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 14:40:01 -0400
Subject: [PATCH 56/67] move irrigation workflow to workflows/

---
 modules/data.land/.Rbuildignore               |   1 -
 .../inst/irrigation-statewide/_targets.yaml   |  12 --
 .../irrigation-statewide/.gitignore           |   0
 .../irrigation-statewide/R/calc_awc.R         |   0
 .../irrigation-statewide/R/crop_timeseries.R  |   0
 .../irrigation-statewide/R/events_df.R        |   0
 .../irrigation-statewide/R/get.R              |   0
 .../irrigation-statewide/R/misc.R             |   0
 .../R/resolve_multicrop.R                     |   0
 .../irrigation-statewide/README.md            |   0
 .../irrigation-statewide/_targets.R           |   5 +-
 workflows/irrigation-statewide/_targets.yaml  |  12 ++
 workflows/irrigation-statewide/check-result.R |  16 +++
 .../irrigation-statewide/config.yml           |   2 +-
 .../irrigation-statewide/config_paths.yml     |   0
 workflows/irrigation-statewide/figures.R      | 115 ++++++++++++++++++
 workflows/irrigation-statewide/figures2.R     |  75 ++++++++++++
 .../irrigation-statewide/push-to-carb.sh      |   0
 .../irrigation-statewide/run-pipeline.R       |   0
 19 files changed, 221 insertions(+), 17 deletions(-)
 delete mode 100644 modules/data.land/inst/irrigation-statewide/_targets.yaml
 rename {modules/data.land/inst => workflows}/irrigation-statewide/.gitignore (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/R/calc_awc.R (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/R/crop_timeseries.R (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/R/events_df.R (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/R/get.R (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/R/misc.R (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/R/resolve_multicrop.R (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/README.md (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/_targets.R (96%)
 create mode 100644 workflows/irrigation-statewide/_targets.yaml
 create mode 100644 workflows/irrigation-statewide/check-result.R
 rename {modules/data.land/inst => workflows}/irrigation-statewide/config.yml (94%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/config_paths.yml (100%)
 create mode 100644 workflows/irrigation-statewide/figures.R
 create mode 100644 workflows/irrigation-statewide/figures2.R
 rename {modules/data.land/inst => workflows}/irrigation-statewide/push-to-carb.sh (100%)
 rename {modules/data.land/inst => workflows}/irrigation-statewide/run-pipeline.R (100%)

diff --git a/modules/data.land/.Rbuildignore b/modules/data.land/.Rbuildignore
index 530eaa07a82..edd7e377255 100644
--- a/modules/data.land/.Rbuildignore
+++ b/modules/data.land/.Rbuildignore
@@ -2,4 +2,3 @@ contrib
 data-raw
 ^docs$
 .*venv/
-.*/_targets.*/
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.yaml b/modules/data.land/inst/irrigation-statewide/_targets.yaml
deleted file mode 100644
index 2fe59f8cee4..00000000000
--- a/modules/data.land/inst/irrigation-statewide/_targets.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-main:
-  script: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets.R
-  store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets/
-small:
-  inherits: main
-  store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets_small/
-medium:
-  inherits: main
-  store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets_medium/
-all:
-  inherits: main
-  store: /projectnb/dietzelab/ccmmf/usr/ashiklom/pecan/cimis-et-complete/modules/data.land/inst/irrigation-statewide/_targets_all/
diff --git a/modules/data.land/inst/irrigation-statewide/.gitignore b/workflows/irrigation-statewide/.gitignore
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/.gitignore
rename to workflows/irrigation-statewide/.gitignore
diff --git a/modules/data.land/inst/irrigation-statewide/R/calc_awc.R b/workflows/irrigation-statewide/R/calc_awc.R
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/R/calc_awc.R
rename to workflows/irrigation-statewide/R/calc_awc.R
diff --git a/modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R b/workflows/irrigation-statewide/R/crop_timeseries.R
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/R/crop_timeseries.R
rename to workflows/irrigation-statewide/R/crop_timeseries.R
diff --git a/modules/data.land/inst/irrigation-statewide/R/events_df.R b/workflows/irrigation-statewide/R/events_df.R
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/R/events_df.R
rename to workflows/irrigation-statewide/R/events_df.R
diff --git a/modules/data.land/inst/irrigation-statewide/R/get.R b/workflows/irrigation-statewide/R/get.R
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/R/get.R
rename to workflows/irrigation-statewide/R/get.R
diff --git a/modules/data.land/inst/irrigation-statewide/R/misc.R b/workflows/irrigation-statewide/R/misc.R
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/R/misc.R
rename to workflows/irrigation-statewide/R/misc.R
diff --git a/modules/data.land/inst/irrigation-statewide/R/resolve_multicrop.R b/workflows/irrigation-statewide/R/resolve_multicrop.R
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/R/resolve_multicrop.R
rename to workflows/irrigation-statewide/R/resolve_multicrop.R
diff --git a/modules/data.land/inst/irrigation-statewide/README.md b/workflows/irrigation-statewide/README.md
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/README.md
rename to workflows/irrigation-statewide/README.md
diff --git a/modules/data.land/inst/irrigation-statewide/_targets.R b/workflows/irrigation-statewide/_targets.R
similarity index 96%
rename from modules/data.land/inst/irrigation-statewide/_targets.R
rename to workflows/irrigation-statewide/_targets.R
index c9e499954c8..12dfaccfd4a 100644
--- a/modules/data.land/inst/irrigation-statewide/_targets.R
+++ b/workflows/irrigation-statewide/_targets.R
@@ -5,7 +5,7 @@ library(tarchetypes)
 library(crew)
 library(crew.cluster)
 
-root_dir <- here::here("modules/data.land/inst/irrigation-statewide")
+root_dir <- here::here("workflows/irrigation-statewide")
 logdir <- file.path(root_dir, "_logs")
 dir.create(logdir, showWarnings = FALSE, recursive = TRUE)
 
@@ -29,7 +29,6 @@ n_remote_workers <- config[["n_remote_workers"]]
 n_local_workers <- as.integer(Sys.getenv("NSLOTS", 1))
 exec_type <- config[["exec_type"]]
 stopifnot(exec_type %in% c("cluster", "local"))
-event_output_dir <- config[["event_output_dir"]]
 event_filename <- config[["event_filename"]]
 n_irr_ensemble <- config[["n_irr_ensemble"]]
 
@@ -43,7 +42,7 @@ message(glue::glue(
     "{n_remote_workers} workers.\n"
   },
   "Output will be saved to ",
-  "{file.path(event_output_dir, event_filename)}\n",
+  "{file.path(config[['event_output_dir']], event_filename)}\n",
   "Targets output will be stored in ", tar_config_get("store")
 ))
 
diff --git a/workflows/irrigation-statewide/_targets.yaml b/workflows/irrigation-statewide/_targets.yaml
new file mode 100644
index 00000000000..5958a0c0f64
--- /dev/null
+++ b/workflows/irrigation-statewide/_targets.yaml
@@ -0,0 +1,12 @@
+main:
+  script: workflows/irrigation-statewide/_targets.R
+  store: workflows/irrigation-statewide/_targets/
+small:
+  inherits: main
+  store: workflows/irrigation-statewide/_targets_small/
+medium:
+  inherits: main
+  store: workflows/irrigation-statewide/_targets_medium/
+all:
+  inherits: main
+  store: workflows/irrigation-statewide/_targets_all/
diff --git a/workflows/irrigation-statewide/check-result.R b/workflows/irrigation-statewide/check-result.R
new file mode 100644
index 00000000000..79bca2627d5
--- /dev/null
+++ b/workflows/irrigation-statewide/check-result.R
@@ -0,0 +1,16 @@
+#!/usr/bin/env Rscript
+
+fname <- "../../event-outputs/irrigation_all"
+dat <- arrow::open_dataset(fname)
+
+dat |>
+  head(20) |>
+  dplyr::collect()
+
+dat |>
+  dplyr::filter(parcel_id == 3657) |>
+  dplyr::collect()
+
+pids <- dat |>
+  dplyr::distinct(.data$parcel_id) |>
+  dplyr::pull()
diff --git a/modules/data.land/inst/irrigation-statewide/config.yml b/workflows/irrigation-statewide/config.yml
similarity index 94%
rename from modules/data.land/inst/irrigation-statewide/config.yml
rename to workflows/irrigation-statewide/config.yml
index dd79356bc9c..66443d207a4 100644
--- a/modules/data.land/inst/irrigation-statewide/config.yml
+++ b/workflows/irrigation-statewide/config.yml
@@ -19,5 +19,5 @@ all:
   n_parcels: null   # `null` means "all"
   batch_size: 5000
   n_remote_workers: 60
-  exec_type: "local"
+  exec_type: "cluster"
   event_filename: "irrigation_all"
diff --git a/modules/data.land/inst/irrigation-statewide/config_paths.yml b/workflows/irrigation-statewide/config_paths.yml
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/config_paths.yml
rename to workflows/irrigation-statewide/config_paths.yml
diff --git a/workflows/irrigation-statewide/figures.R b/workflows/irrigation-statewide/figures.R
new file mode 100644
index 00000000000..e2d63675fdb
--- /dev/null
+++ b/workflows/irrigation-statewide/figures.R
@@ -0,0 +1,115 @@
+#!/usr/bin/env Rscript
+
+library(ggplot2)
+library(patchwork)
+
+Sys.setenv(TAR_PROJECT = "medium")
+# targets::tar_load(phenology)
+targets::tar_load(parcel_waterbalance)
+targets::tar_load(complete_crop_timeseries)
+targets::tar_load(etref)
+
+head(unique(parcel_waterbalance$parcel_id), 20)
+
+# pid <- c(39011, 59465)
+pid <- c(39230, 86888)
+# dsub <- complete_crop_timeseries |>
+#   dplyr::filter(parcel_id %in% pid)
+wbsub <- parcel_waterbalance |>
+  dplyr::filter(parcel_id %in% pid)
+wbsub_long <- wbsub |>
+  dplyr::select(
+    "parcel_id", "date", "precip_mm_day", "etref_mm_day",
+    "etc_mm_day", "canopy_cover", "irr", "pond_depth",
+    "runoff", "W_t", "crop_name"
+  ) |>
+  tidyr::pivot_longer(
+    -c("parcel_id", "date", "crop_name"),
+    names_to = "variable",
+    values_to = "value"
+  )
+vcols <- c(
+  "Water balance (mm)" = "W_t",
+  "Pond depth (mm)" = "pond_depth",
+  "Irrigation (mm)" = "irr",
+  "Runoff (mm)" = "runoff"
+)
+irrplot <- wbsub_long |>
+  dplyr::filter(variable %in% vcols) |>
+  dplyr::mutate(variable = factor(variable, vcols, names(vcols))) |>
+  ggplot() +
+  aes(x = date, color = crop_name, y = value, group = 1) +
+  geom_line() +
+  scale_color_brewer(palette = "Dark2") +
+  facet_grid(rows = vars(variable), cols = vars(parcel_id), scales = "free") +
+  theme_bw() +
+  theme(legend.position = "bottom")
+ggsave("~/irrigation.png", irrplot, width = 10, height = 8, units = "in")
+etref_plot <- wbsub_long |>
+  dplyr::filter(
+    variable %in% c("etref_mm_day", "etc_mm_day")
+  ) |>
+  dplyr::mutate(
+    variable = factor(variable, c("etref_mm_day", "etc_mm_day"), c(
+      "ET[ref] ~ (mm/day)",
+      "ET ~ (mm/day)"
+    ))
+  ) |>
+  ggplot() +
+  aes(x = date, y = value, color = crop_name, group = 1) +
+  geom_line() +
+  facet_grid(
+    rows = vars(variable),
+    cols = vars(parcel_id),
+    scales = "free",
+    labeller = label_parsed
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  theme_bw() +
+  theme(legend.position = "bottom")
+ggsave("~/etref-etc.png", etref_plot, width = 8, height = 6, units = "in")
+
+# wbsub_long |>
+#   dplyr::filter(
+#     variable %in% c("precip_mm_day", "etc_mm_day", "irr", "W_t")
+#   ) |>
+#   dplyr::mutate(
+#     variable = factor(
+#       variable,
+#       c("")
+#     )
+#   )
+# ggplot(wbsub_long) +
+#   aes(x = date, y = value, color = crop_name, group = 1) +
+#   geom_line() +
+#   facet_grid(rows = vars(variable), cols = vars(parcel_id), scales = "free") +
+#   theme_bw()
+#
+# dsub_long <- dsub |>
+#   dplyr::select(
+#     "parcel_id", "date", "precip_mm_day", "etref_mm_day", "etc_mm_day",
+#     "whc_min_frac", "whc_mm", "canopy_cover", "crop_name"
+#   ) |>
+#   tidyr::pivot_longer(
+#     -c("parcel_id", "date", "crop_name"),
+#     names_to = "variable",
+#     values_to = "value"
+#   )
+#
+# ggplot(dsub_long) +
+#   aes(x = date, y = value, color = crop_name, group = 1) +
+#   geom_line() +
+#   facet_grid(rows = vars(variable), cols = vars(parcel_id), scales = "free")
+#
+# etsub <- etref |>
+#   dplyr::filter(parcel_id %in% pid)
+#
+# ggplot(etsub) +
+#   aes(x = date, y = etref_mm_day) +
+#   geom_line() +
+#   facet_grid(rows = "parcel_id")
+#
+#
+# ggplot(pheno_sub) +
+#   aes(x = date, y = canopy_cover, color = parcel_id) +
+#   geom_line()
diff --git a/workflows/irrigation-statewide/figures2.R b/workflows/irrigation-statewide/figures2.R
new file mode 100644
index 00000000000..903c5331e6c
--- /dev/null
+++ b/workflows/irrigation-statewide/figures2.R
@@ -0,0 +1,75 @@
+#!/usr/bin/env Rscript
+
+library(ggplot2)
+library(patchwork)
+
+Sys.setenv(TAR_PROJECT = "medium")
+# targets::tar_load(phenology)
+targets::tar_load(parcel_waterbalance)
+targets::tar_load(complete_crop_timeseries)
+targets::tar_load(etref)
+
+head(unique(parcel_waterbalance$parcel_id), 20)
+
+pid <- 114897
+dmin <- "2018-06-01"
+dmax <- "2025-12-30"
+wbsub <- parcel_waterbalance |>
+  dplyr::filter(parcel_id == pid)
+wbsub_long <- wbsub |>
+  dplyr::select(
+    "date", "precip_mm_day", "etref_mm_day",
+    "etc_mm_day", "canopy_cover", "irr", "pond_depth",
+    "runoff", "W_t", "crop_name"
+  ) |>
+  tidyr::pivot_longer(
+    -c("date", "crop_name"),
+    names_to = "variable",
+    values_to = "value"
+  )
+vcols <- c(
+  "Precipitation (mm/day)" = "precip_mm_day",
+  "ET (mm/day)" = "etc_mm_day",
+  "Water balance (mm)" = "W_t",
+  "Irrigation (mm)" = "irr",
+  "Runoff (mm)" = "runoff"
+)
+irrplot <- wbsub_long |>
+  dplyr::filter(
+    variable %in% vcols,
+    date <= dmax,
+    date >= dmin
+  ) |>
+  dplyr::mutate(variable = factor(variable, vcols, names(vcols))) |>
+  ggplot() +
+  aes(x = date, color = crop_name, y = value, group = 1) +
+  geom_line() +
+  scale_color_brewer(palette = "Dark2") +
+  facet_wrap(vars(variable), scales = "free") +
+  theme_bw() +
+  theme(legend.position = "bottom")
+ggsave(sprintf("~/irrigation-%d.png", pid), irrplot, width = 10, height = 8, units = "in")
+etref_plot <- wbsub_long |>
+  dplyr::filter(
+    variable %in% c("etref_mm_day", "etc_mm_day"),
+    date >= dmin,
+    date <= dmax
+  ) |>
+  dplyr::mutate(
+    variable = factor(variable, c("etref_mm_day", "etc_mm_day"), c(
+      "ET[ref] ~ (mm/day)",
+      "ET ~ (mm/day)"
+    ))
+  ) |>
+  ggplot() +
+  aes(x = date, y = value, color = crop_name, group = 1) +
+  geom_line() +
+  facet_grid(
+    rows = vars(variable),
+    scales = "free",
+    labeller = label_parsed
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  theme_bw() +
+  theme(legend.position = "bottom")
+ggsave(sprintf("~/etref-etc-%d.png", pid), etref_plot, width = 8, height = 6, units = "in")
diff --git a/modules/data.land/inst/irrigation-statewide/push-to-carb.sh b/workflows/irrigation-statewide/push-to-carb.sh
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/push-to-carb.sh
rename to workflows/irrigation-statewide/push-to-carb.sh
diff --git a/modules/data.land/inst/irrigation-statewide/run-pipeline.R b/workflows/irrigation-statewide/run-pipeline.R
similarity index 100%
rename from modules/data.land/inst/irrigation-statewide/run-pipeline.R
rename to workflows/irrigation-statewide/run-pipeline.R

From 007d6c76fc0276db3ed912936da86a76dbc6e945 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 15:27:27 -0400
Subject: [PATCH 57/67] Update README

---
 workflows/irrigation-statewide/README.md | 51 +++++++++---------------
 1 file changed, 19 insertions(+), 32 deletions(-)

diff --git a/workflows/irrigation-statewide/README.md b/workflows/irrigation-statewide/README.md
index d758da7f481..04a0c3de040 100644
--- a/workflows/irrigation-statewide/README.md
+++ b/workflows/irrigation-statewide/README.md
@@ -4,47 +4,34 @@ This generates PEcAn event files for irrigation events across all of California.
 The spatial unit is harmonized LandIQ parcels.
 
 The workflow uses `targets` for reproducibility, scalability, and incremental execution.
-The pipeline is defined in `_targets.R` and can be executed via the `run-pipeline.R` script (which is just a thin wrapper around `library(targets); tar_make()`).
 
 # Setup
 
-The workflow requires the following environment variables to be set:
+If you are running this code from inside this directory, you should not need to set any environment variables.
 
-- `TAR_CONFIG` --- path to the `_targets.yaml` config file in this directory.
-- `N_PARCELS` --- the number of parcels to process (sampled at random) or `all` to run for all LandIQ parcels.
-- `BATCH_SIZE` --- number of parcels per "batch". Each batch gets its own target. Note that having too many small batches creates a lot of overhead.
-- `N_REMOTE_WORKERS` --- number of remote workers (SGE jobs) to spawn for execution 
-- `EXEC_TYPE` --- execution type. Either `local` (run on current machine, with `NSLOTS` parallel processes) or `cluster` (to run using SGE jobs)
+If you are running the code from _outside_ this directory (e.g., from the PEcAn root directory), you will need to set the `TAR_CONFIG` environment variable to point to the `_targets.yaml` file in this directory.
+One way to do this is to create a `.Renviron` file in the PEcAn project root with the following contents:
 
-- `LANDIQ_CROPS` --- path to harmonized LandIQ crops file (`crops_all_years.parq`)
-- `LANDIQ_TIMESERIES` --- path to HLS-based phenology (MSLSP) parquet files
-
-- `EVENT_OUTPUT_DIR` --- output directory where final event files will be written. If it doesn't exist, it will be created.
-- `EVENT_FILENAME` --- name of event file to be created. Should have `.parquet` extension. It will be placed in `EVENT_OUTPUT_DIR`
-
-- `CHIRPS_PRECIP` --- path to pre-extracted CHIRPS precipitation data (folder containing parquet files)
-- `CIMIS_ETREF` --- path to pre-extracted CIMIS evapotranspiration data (folder containing parquet files)
-- `SSURGO_WEIGHTS` --- path to pre-computed SSURGO weights for LandIQ parcels (single parquet file)
-- `SSURGO_GDB` --- path to complete SSURGO geodatabase (geodatabase; folder with `.gdb` extension)
+```
+TAR_CONFIG=workflows/irrigation-statewide/_targets.yaml
+```
 
-A good way to set these is via a project-local `.Renviron` file that looks like this:
+If you are running this code on the BU SCC, all the paths have already been preconfigured for you.
+If you are running the code on another system, or if the paths have changed, modify the `config_paths.yml` file accordingly.
 
-```
-TAR_CONFIG=modules/data.land/inst/irrigation-statewide/_targets.yaml
+# Execution
 
-LANDIQ_PARCELS=/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1/parcels.gpkg
-LANDIQ_CROPS=/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1/crops_all_years.parq
-LANDIQ_TIMESERIES=/projectnb/dietzelab/ccmmf/management/phenology/matched_landiq_mslsp_v4.1
-EVENT_OUTPUT_DIR=/projectnb/dietzelab/ccmmf/usr/ashiklom/event-outputs
+This code ships with three different configurations (defined in `config.yml`) of the irrigation pipeline:
 
-CHIRPS_PRECIP=/projectnb/dietzelab/ccmmf/data/chirps-extracted
-CIMIS_ETREF=/projectnb/dietzelab/ccmmf/data/cimis-extracted
-SSURGO_WEIGHTS=/projectnb/dietzelab/ccmmf/data_raw/ssurgo/ssurgo-weights.parquet
-SSURGO_GDB=/projectnb/dietzelab/ccmmf/data_raw/ssurgo/gSSURGO_CA.gdb
-```
+- `small` (default) --- 1000 randomly selected parcels split into batches of 100 parcels each. The code will run locally, using as many CPUs as are defined by the `NSLOTS` environment variable (or 1 CPU, if `NSLOTS` is unset).
+- `medium` --- 10,000 randomly selected parcels split into batches of 1000 parcels each. This will run across `n_remote_workers` (set to 15) SGE array jobs.
+- `all` --- This will run all (~600,000) parcels in California in batches of 5000 parcels each. This will run on 60 SGE array jobs.
 
-Use R commands like `Sys.getenv("TAR_CONFIG")` from inside your R session to confirm these variables are set correctly.
+These values can be modified by modifying the `config.yml` file.
 
-# Execution
+To run a specific configuration, set the `TAR_PROJECT` environment variable accordingly and run `Rscript -e 'targets::tar_make()'`.
+For example, to run the full statewide pipeline (all 600K parcels), you can use a command like:
 
-Assuming the variables above are set, you can run the pipeline with just `Rscript -e 'targets::tar_make()'`.
+```
+TAR_PROJECT=all Rscript -e "targets::tar_make()"
+```

From fa5aab2ad8968d1195aa5d4f81bb5b75eda357fe Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 15:30:13 -0400
Subject: [PATCH 58/67] fix README paths. clean up some unnecessary files.

---
 workflows/irrigation-statewide/README.md      |   5 +-
 workflows/irrigation-statewide/figures.R      | 115 ------------------
 workflows/irrigation-statewide/figures2.R     |  75 ------------
 workflows/irrigation-statewide/run-pipeline.R |  25 ----
 4 files changed, 2 insertions(+), 218 deletions(-)
 delete mode 100644 workflows/irrigation-statewide/figures.R
 delete mode 100644 workflows/irrigation-statewide/figures2.R
 delete mode 100644 workflows/irrigation-statewide/run-pipeline.R

diff --git a/workflows/irrigation-statewide/README.md b/workflows/irrigation-statewide/README.md
index 04a0c3de040..9e172ef4c2b 100644
--- a/workflows/irrigation-statewide/README.md
+++ b/workflows/irrigation-statewide/README.md
@@ -7,9 +7,8 @@ The workflow uses `targets` for reproducibility, scalability, and incremental ex
 
 # Setup
 
-If you are running this code from inside this directory, you should not need to set any environment variables.
-
-If you are running the code from _outside_ this directory (e.g., from the PEcAn root directory), you will need to set the `TAR_CONFIG` environment variable to point to the `_targets.yaml` file in this directory.
+This code assumes that you are running from the PEcAn root directory.
+To ensure discovery of the targets script and store directories, you will need to set the `TAR_CONFIG` environment variable to point to the `_targets.yaml` file in this directory.
 One way to do this is to create a `.Renviron` file in the PEcAn project root with the following contents:
 
 ```
diff --git a/workflows/irrigation-statewide/figures.R b/workflows/irrigation-statewide/figures.R
deleted file mode 100644
index e2d63675fdb..00000000000
--- a/workflows/irrigation-statewide/figures.R
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/usr/bin/env Rscript
-
-library(ggplot2)
-library(patchwork)
-
-Sys.setenv(TAR_PROJECT = "medium")
-# targets::tar_load(phenology)
-targets::tar_load(parcel_waterbalance)
-targets::tar_load(complete_crop_timeseries)
-targets::tar_load(etref)
-
-head(unique(parcel_waterbalance$parcel_id), 20)
-
-# pid <- c(39011, 59465)
-pid <- c(39230, 86888)
-# dsub <- complete_crop_timeseries |>
-#   dplyr::filter(parcel_id %in% pid)
-wbsub <- parcel_waterbalance |>
-  dplyr::filter(parcel_id %in% pid)
-wbsub_long <- wbsub |>
-  dplyr::select(
-    "parcel_id", "date", "precip_mm_day", "etref_mm_day",
-    "etc_mm_day", "canopy_cover", "irr", "pond_depth",
-    "runoff", "W_t", "crop_name"
-  ) |>
-  tidyr::pivot_longer(
-    -c("parcel_id", "date", "crop_name"),
-    names_to = "variable",
-    values_to = "value"
-  )
-vcols <- c(
-  "Water balance (mm)" = "W_t",
-  "Pond depth (mm)" = "pond_depth",
-  "Irrigation (mm)" = "irr",
-  "Runoff (mm)" = "runoff"
-)
-irrplot <- wbsub_long |>
-  dplyr::filter(variable %in% vcols) |>
-  dplyr::mutate(variable = factor(variable, vcols, names(vcols))) |>
-  ggplot() +
-  aes(x = date, color = crop_name, y = value, group = 1) +
-  geom_line() +
-  scale_color_brewer(palette = "Dark2") +
-  facet_grid(rows = vars(variable), cols = vars(parcel_id), scales = "free") +
-  theme_bw() +
-  theme(legend.position = "bottom")
-ggsave("~/irrigation.png", irrplot, width = 10, height = 8, units = "in")
-etref_plot <- wbsub_long |>
-  dplyr::filter(
-    variable %in% c("etref_mm_day", "etc_mm_day")
-  ) |>
-  dplyr::mutate(
-    variable = factor(variable, c("etref_mm_day", "etc_mm_day"), c(
-      "ET[ref] ~ (mm/day)",
-      "ET ~ (mm/day)"
-    ))
-  ) |>
-  ggplot() +
-  aes(x = date, y = value, color = crop_name, group = 1) +
-  geom_line() +
-  facet_grid(
-    rows = vars(variable),
-    cols = vars(parcel_id),
-    scales = "free",
-    labeller = label_parsed
-  ) +
-  scale_color_brewer(palette = "Dark2") +
-  theme_bw() +
-  theme(legend.position = "bottom")
-ggsave("~/etref-etc.png", etref_plot, width = 8, height = 6, units = "in")
-
-# wbsub_long |>
-#   dplyr::filter(
-#     variable %in% c("precip_mm_day", "etc_mm_day", "irr", "W_t")
-#   ) |>
-#   dplyr::mutate(
-#     variable = factor(
-#       variable,
-#       c("")
-#     )
-#   )
-# ggplot(wbsub_long) +
-#   aes(x = date, y = value, color = crop_name, group = 1) +
-#   geom_line() +
-#   facet_grid(rows = vars(variable), cols = vars(parcel_id), scales = "free") +
-#   theme_bw()
-#
-# dsub_long <- dsub |>
-#   dplyr::select(
-#     "parcel_id", "date", "precip_mm_day", "etref_mm_day", "etc_mm_day",
-#     "whc_min_frac", "whc_mm", "canopy_cover", "crop_name"
-#   ) |>
-#   tidyr::pivot_longer(
-#     -c("parcel_id", "date", "crop_name"),
-#     names_to = "variable",
-#     values_to = "value"
-#   )
-#
-# ggplot(dsub_long) +
-#   aes(x = date, y = value, color = crop_name, group = 1) +
-#   geom_line() +
-#   facet_grid(rows = vars(variable), cols = vars(parcel_id), scales = "free")
-#
-# etsub <- etref |>
-#   dplyr::filter(parcel_id %in% pid)
-#
-# ggplot(etsub) +
-#   aes(x = date, y = etref_mm_day) +
-#   geom_line() +
-#   facet_grid(rows = "parcel_id")
-#
-#
-# ggplot(pheno_sub) +
-#   aes(x = date, y = canopy_cover, color = parcel_id) +
-#   geom_line()
diff --git a/workflows/irrigation-statewide/figures2.R b/workflows/irrigation-statewide/figures2.R
deleted file mode 100644
index 903c5331e6c..00000000000
--- a/workflows/irrigation-statewide/figures2.R
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env Rscript
-
-library(ggplot2)
-library(patchwork)
-
-Sys.setenv(TAR_PROJECT = "medium")
-# targets::tar_load(phenology)
-targets::tar_load(parcel_waterbalance)
-targets::tar_load(complete_crop_timeseries)
-targets::tar_load(etref)
-
-head(unique(parcel_waterbalance$parcel_id), 20)
-
-pid <- 114897
-dmin <- "2018-06-01"
-dmax <- "2025-12-30"
-wbsub <- parcel_waterbalance |>
-  dplyr::filter(parcel_id == pid)
-wbsub_long <- wbsub |>
-  dplyr::select(
-    "date", "precip_mm_day", "etref_mm_day",
-    "etc_mm_day", "canopy_cover", "irr", "pond_depth",
-    "runoff", "W_t", "crop_name"
-  ) |>
-  tidyr::pivot_longer(
-    -c("date", "crop_name"),
-    names_to = "variable",
-    values_to = "value"
-  )
-vcols <- c(
-  "Precipitation (mm/day)" = "precip_mm_day",
-  "ET (mm/day)" = "etc_mm_day",
-  "Water balance (mm)" = "W_t",
-  "Irrigation (mm)" = "irr",
-  "Runoff (mm)" = "runoff"
-)
-irrplot <- wbsub_long |>
-  dplyr::filter(
-    variable %in% vcols,
-    date <= dmax,
-    date >= dmin
-  ) |>
-  dplyr::mutate(variable = factor(variable, vcols, names(vcols))) |>
-  ggplot() +
-  aes(x = date, color = crop_name, y = value, group = 1) +
-  geom_line() +
-  scale_color_brewer(palette = "Dark2") +
-  facet_wrap(vars(variable), scales = "free") +
-  theme_bw() +
-  theme(legend.position = "bottom")
-ggsave(sprintf("~/irrigation-%d.png", pid), irrplot, width = 10, height = 8, units = "in")
-etref_plot <- wbsub_long |>
-  dplyr::filter(
-    variable %in% c("etref_mm_day", "etc_mm_day"),
-    date >= dmin,
-    date <= dmax
-  ) |>
-  dplyr::mutate(
-    variable = factor(variable, c("etref_mm_day", "etc_mm_day"), c(
-      "ET[ref] ~ (mm/day)",
-      "ET ~ (mm/day)"
-    ))
-  ) |>
-  ggplot() +
-  aes(x = date, y = value, color = crop_name, group = 1) +
-  geom_line() +
-  facet_grid(
-    rows = vars(variable),
-    scales = "free",
-    labeller = label_parsed
-  ) +
-  scale_color_brewer(palette = "Dark2") +
-  theme_bw() +
-  theme(legend.position = "bottom")
-ggsave(sprintf("~/etref-etc-%d.png", pid), etref_plot, width = 8, height = 6, units = "in")
diff --git a/workflows/irrigation-statewide/run-pipeline.R b/workflows/irrigation-statewide/run-pipeline.R
deleted file mode 100644
index 27f51f23023..00000000000
--- a/workflows/irrigation-statewide/run-pipeline.R
+++ /dev/null
@@ -1,25 +0,0 @@
-#' ---
-#' title: "Statewide irrigation workflow"
-#' author: "Alexey N. Shiklomanov"
-#' ---
-
-Sys.setenv(
-  "TAR_PROJECT" = "all",
-  "OMP_NUM_THREADS" = 1
-)
-
-library(targets)
-
-# devtools::document("modules/data.land")
-# devtools::install("modules/data.land", upgrade = FALSE, reload = TRUE)
-
-#' Run the pipeline. Targets that are already up-to-date will be skipped.
-tar_make()
-
-if (interactive()) {
-  tar_load_everything()
-}
-
-# tar_invalidate(dp_with_crops)
-# tar_load("phenology_crops")
-# tar_load(c("design_points", "dp_with_crops", "phenology"))

From f90b15855d5460599b45f21e5a04ae8aec46ffbc Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 15:36:01 -0400
Subject: [PATCH 59/67] resolve some r cmd check notes

---
 modules/data.land/DESCRIPTION              | 2 --
 modules/data.land/R/mslsp_to_canopycover.R | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION
index 948827429aa..2422e87eb6e 100644
--- a/modules/data.land/DESCRIPTION
+++ b/modules/data.land/DESCRIPTION
@@ -29,12 +29,10 @@ Imports:
     coda,
     curl,
     dplyr,
-    dplR,
     foreach,
     fs,
     future,
     furrr,
-    httr,
     httr2,
     lubridate,
     magrittr,
diff --git a/modules/data.land/R/mslsp_to_canopycover.R b/modules/data.land/R/mslsp_to_canopycover.R
index 8c0ce7b1cd7..074b49e77e2 100644
--- a/modules/data.land/R/mslsp_to_canopycover.R
+++ b/modules/data.land/R/mslsp_to_canopycover.R
@@ -59,7 +59,7 @@ expand_mslsp_cycle <- function(mslsp_row) {
   all_dates <- seq(min(dates), max(dates), by = "1 day")
   tibble::tibble(
     date = all_dates,
-    canopy_cover = approx(
+    canopy_cover = stats::approx(
       x = dates,
       y = .MSLSP_DATE_MAPPING$canopy_cover,
       xout = all_dates

From 97f6cea4c23908ba6dc8c74739267f10b7497747 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 16:13:31 -0400
Subject: [PATCH 60/67] remove some old scripts

---
 .../inst/CIMIS-event-files-targets.R          | 412 ------------------
 modules/data.land/inst/CIMIS-event-files.R    | 370 ----------------
 2 files changed, 782 deletions(-)
 delete mode 100644 modules/data.land/inst/CIMIS-event-files-targets.R
 delete mode 100644 modules/data.land/inst/CIMIS-event-files.R

diff --git a/modules/data.land/inst/CIMIS-event-files-targets.R b/modules/data.land/inst/CIMIS-event-files-targets.R
deleted file mode 100644
index 0cd22a2b9e6..00000000000
--- a/modules/data.land/inst/CIMIS-event-files-targets.R
+++ /dev/null
@@ -1,412 +0,0 @@
-#' ---
-#' title: "Example workflow generating SIPNET event files from CIMIS and CHIRPS data"
-#' author: "Alexey N. Shiklomanov"
-#' ---
-
-library(targets)
-
-devtools::document("modules/data.land")
-devtools::install("modules/data.land", upgrade = FALSE)
-devtools::reload("modules/data.land")
-
-targets_file <- here::here("_targets.R")
-targets_store <- here::here("_targets/")
-tar_config_set(
-  script = targets_file,
-  store = targets_store
-)
-
-#' Write the targets pipeline script to _targets.R in this directory.
-tar_script(
-  code = {
-    library(targets)
-    library(tarchetypes)
-
-    # -------------------------------------------------------------------------
-    # Helper functions
-    # -------------------------------------------------------------------------
-
-    #' Calculate effective available water capacity (mm) for a soil profile
-    #' clipped to a given rooting depth.
-    calc_effective_awc <- function(
-      hzdept_r_cm,
-      hzdepb_r_cm,
-      awc_r,
-      rooting_depth_cm
-    ) {
-      effective_top    <- pmin(hzdept_r_cm, rooting_depth_cm)
-      effective_bottom <- pmin(hzdepb_r_cm, rooting_depth_cm)
-      thickness_cm     <- pmax(0, effective_bottom - effective_top)
-      # awc_r is cm water / cm soil; multiply by thickness -> cm water -> mm water
-      sum(awc_r * thickness_cm, na.rm = TRUE) * 10
-    }
-
-    #' Average ETc and WHC across multi-crop parcels (double-cropping hack).
-    resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
-      id_sym   <- rlang::sym(id_col)
-      date_sym <- rlang::sym(date_col)
-
-      multicrop_counts <- etc_data |>
-        dplyr::add_count(!!id_sym, !!date_sym, name = "n") |>
-        dplyr::filter(.data$n > 1) |>
-        dplyr::summarize(
-          n_multicrop = dplyr::n_distinct(!!id_sym, !!date_sym),
-          .groups = "drop"
-        )
-
-      if (multicrop_counts$n_multicrop > 0) {
-        message(
-          "Multi-crop parcels: ",
-          multicrop_counts$n_multicrop,
-          " date-parcel combinations have multiple crops. Averaging ETc and WHC values."
-        )
-      }
-
-      etc_data |>
-        dplyr::group_by(!!id_sym, !!date_sym) |>
-        dplyr::summarize(
-          etc_mm_day   = mean(.data$etc_mm_day,   na.rm = TRUE),
-          whc_min_frac = mean(.data$whc_min_frac, na.rm = TRUE),
-          whc_mm       = mean(.data$whc_mm,       na.rm = TRUE),
-          .groups = "drop"
-        )
-    }
-
-    # -------------------------------------------------------------------------
-    # Package options
-    # -------------------------------------------------------------------------
-
-    tar_option_set(packages = c("ggplot2", "rlang"))
-
-    # -------------------------------------------------------------------------
-    # Pipeline
-    # -------------------------------------------------------------------------
-
-    list(
-
-      # --- Input paths
-
-      tar_target(design_points_path, path.expand(Sys.getenv("DESIGN_POINTS"))),
-      tar_target(cimis_eto_cog_path, path.expand(Sys.getenv("CIMIS_ETO_COG"))),
-      tar_target(parcels_path,       path.expand(Sys.getenv("LANDIQ_PARCELS"))),
-      tar_target(crops_path,         path.expand(Sys.getenv("LANDIQ_CROPS"))),
-      tar_target(mslsp_path,         path.expand(Sys.getenv("LANDIQ_TIMESERIES"))),
-      tar_target(event_output_dir,   path.expand(Sys.getenv("EVENT_OUTPUT_DIR"))),
-
-      tar_target(validated_paths, {
-        stopifnot(
-          file.exists(design_points_path),
-          dir.exists(cimis_eto_cog_path),
-          file.exists(parcels_path),
-          file.exists(crops_path),
-          dir.exists(mslsp_path),
-          length(list.files(mslsp_path, "\\.parquet")) == 7
-        )
-        dir.create(event_output_dir, showWarnings = FALSE, recursive = TRUE)
-        TRUE
-      }),
-
-      tar_target(
-        design_points,
-        readr::read_csv(design_points_path, show_col_types = FALSE) |>
-          # Remove duplicate IDs
-          dplyr::slice(1, .by = "id")
-      ),
-
-      tar_target(
-        dp_with_parcels,
-        PEcAn.data.land::get_landiq_parcel_ids(design_points, parcels_path) |>
-          dplyr::mutate(parcel_id = as.character(parcel_id))
-      ),
-
-      tar_target(
-        dp_with_phenology,
-        PEcAn.data.land::mslsp_to_canopycover(
-          mslsp_path,
-          parcel_ids = unique(dp_with_parcels[["parcel_id"]])
-        ) |>
-          dplyr::mutate(
-            landiq_SUBCLASS = as.integer(.data$landiq_SUBCLASS)
-          ) |>
-          dplyr::inner_join(dp_with_parcels, by = "parcel_id") |>
-          dplyr::select(-dplyr::starts_with("UniqueID_"))
-      ),
-
-      # --- LandIQ crop data --------------------------------------------------
-
-      tar_target(
-        design_point_crops,
-        {
-          #' NOTE: Some LandIQ classes/subclasses map onto multiple BISM crop types.
-          #' HACK: select just the first crop per class/subclass group.
-          bism_crop_unique <- PEcAn.data.land::bism_kc_by_crop |>
-            dplyr::distinct(.data$landiq_class, .data$landiq_subclass, .data$crop_name) |>
-            dplyr::slice(1, .by = c("landiq_class", "landiq_subclass"))
-
-          dp_crops <- dp_with_phenology |>
-            dplyr::left_join(
-              bism_crop_unique,
-              by = c(
-                "landiq_CLASS" = "landiq_class",
-                "landiq_SUBCLASS" = "landiq_subclass"
-              )
-            )
-
-          missing_crops <- dp_crops |> dplyr::filter(is.na(crop_name))
-          if (nrow(missing_crops) > 0) {
-            missing_crop_strs <- missing_crops |>
-              dplyr::distinct(.data$landiq_CLASS, .data$landiq_SUBCLASS) |>
-              dplyr::mutate(
-                string = glue::glue(
-                  "CLASS: {.data$landiq_CLASS} ",
-                  "SUBCLASS: {.data$landiq_SUBCLASS}"
-                )
-              ) |>
-              dplyr::pull(.data$string)
-            warning(
-              "Skipping ", nrow(missing_crops),
-              " rows with no matching BIS crop. Relevant pairs are: [",
-              paste(missing_crop_strs, collapse = "; "), "]"
-            )
-          }
-          dp_crops |>
-            dplyr::filter(!is.na(.data$crop_name)) |>
-            dplyr::left_join(
-              PEcAn.data.land::crop_whc |>
-                dplyr::select("crop_name", "whc_min_frac", "rooting_depth_m"),
-              by = "crop_name"
-            )
-        }
-      ),
-
-      # --- SSURGO soil data --------------------------------------------------
-
-      tar_target(
-        mukeys_list,
-        {
-          design_points_sf <- design_points |> 
-            dplyr::distinct(id, lon, lat)
-          purrr::map2(
-            design_points_sf$lon,
-            design_points_sf$lat,
-            ~ PEcAn.data.land::ssurgo_mukeys_point(
-              point = c(.x, .y),
-              distance = 10
-            )
-          )
-        }
-      ),
-
-      tar_target(
-        soil_raw,
-        PEcAn.data.land::gSSURGO.Query(
-          mukeys = unique(unlist(mukeys_list)),
-          fields = c("chorizon.awc_r", "chorizon.hzdept_r", "chorizon.hzdepb_r")
-        )
-      ),
-
-      tar_target(
-        soil_dominant,
-        soil_raw |>
-          dplyr::filter(cokey == cokey[which.max(comppct_r)], .by = "mukey")
-      ),
-
-      tar_target(
-        dp_with_whc,
-        design_point_crops |>
-          dplyr::mutate(
-            mukey = mukeys_list[match(id, design_points$id)]
-          ) |>
-          tidyr::unnest(mukey) |>
-          dplyr::mutate(mukey = as.numeric(mukey)) |>
-          dplyr::left_join(
-            soil_dominant,
-            by = "mukey",
-            relationship = "many-to-many"
-          ) |>
-          dplyr::summarize(
-            whc_mm = calc_effective_awc(
-              hzdept_r, hzdepb_r, awc_r,
-              rooting_depth_cm = rooting_depth_m[[1]] * 100
-            ),
-            .by = c("id", "parcel_id", "date", "crop_name", "whc_min_frac")
-          ) |>
-          dplyr::mutate(
-            whc_mm = dplyr::if_else(whc_mm > 0, whc_mm, 500, missing = 500)
-          )
-      ),
-
-      # --- Remote data extractions (slow; most benefit from caching) ---------
-
-      tar_target(
-        precip_et_dates,
-        with(design_point_crops, seq(min(date), max(date), by = "1 day"))
-      ),
-
-      tar_target(
-        etref,
-        design_point_crops |>
-          PEcAn.data.land::extract_cimis_dates(
-            precip_et_dates,
-            cimis_eto_cog_path,
-            download_missing = TRUE,
-            .progress = TRUE
-          )
-      ),
-
-      tar_target(
-        precip,
-        PEcAn.data.land::extract_chirps_remote(design_points, precip_et_dates)
-      ),
-
-
-      # --- ETc and water balance ---------------------------------------------
-
-      tar_target(
-        dp_with_eto,
-        dp_with_whc |>
-          dplyr::left_join(
-            etref |> dplyr::select("id", "date", "etref_mm_day"),
-            by = c("id", "date")
-          )
-      ),
-
-      tar_target(
-        dp_with_etc,
-        dp_with_eto |>
-          dplyr::mutate(
-            etc_mm_day = eto_to_etc_bism(
-              eto       = etref_mm_day,
-              crop_name = crop_name[[1]],
-              date      = date
-            ),
-            .by = "crop_name"
-          ) |>
-          dplyr::select(
-            dplyr::any_of(c("id", "parcel_id", "lat", "lon")),
-            "date", "etc_mm_day", "whc_min_frac", "whc_mm"
-          ) |>
-          resolve_multicrop()
-      ),
-
-      tar_target(
-        dp_crops_all,
-        dp_with_etc |>
-          dplyr::inner_join(precip, by = c("id", "date")) |>
-          dplyr::select(
-            "id", "lat", "lon", "date",
-            "etc_mm_day", "precip_mm_day", "whc_min_frac", "whc_mm"
-          )
-      ),
-
-      tar_target(
-        dpwb,
-        apply_water_balance(dp_crops_all, "id")
-      ),
-
-      # --- Diagnostics -------------------------------------------------------
-
-      tar_target(
-        etc_summary,
-        dp_crops_all |>
-          dplyr::summarize(
-            etc_min  = min(.data$etc_mm_day,  na.rm = TRUE),
-            etc_max  = max(.data$etc_mm_day,  na.rm = TRUE),
-            etc_mean = mean(.data$etc_mm_day, na.rm = TRUE),
-            .by = "id"
-          )
-      ),
-
-      tar_target(
-        wb_summary,
-        dpwb |>
-          dplyr::group_by(.data$id) |>
-          dplyr::summarize(
-            irr_total    = sum(.data$irr,   na.rm = TRUE),
-            irr_max      = max(.data$irr,   na.rm = TRUE),
-            irr_mean     = mean(.data$irr,  na.rm = TRUE),
-            runoff_total = sum(.data$runoff, na.rm = TRUE),
-            W_t_min      = min(.data$W_t,   na.rm = TRUE),
-            W_t_max      = max(.data$W_t,   na.rm = TRUE),
-            .groups = "drop"
-          ) |>
-          (\(x) {
-            print(x)
-            if (any(x$irr_max < 0))  warning("Negative irrigation values detected!")
-            else                      message("Irrigation values are non-negative")
-            if (any(x$W_t_min < 0))  warning("Negative soil water values detected!")
-            else                      message("Soil water values are non-negative")
-            x
-          })()
-      ),
-
-      tar_target(
-        monthly_irr,
-        dpwb |>
-          dplyr::mutate(month = lubridate::month(.data$date)) |>
-          dplyr::group_by(.data$month) |>
-          dplyr::summarize(irr_mean = mean(.data$irr, na.rm = TRUE), .groups = "drop") |>
-          (\(x) { print(x); x })()
-      ),
-
-      # --- Plot (saved as PNG) -----------------------------------------------
-
-      tar_target(
-        irrigation_plot, {
-          p <- dpwb |>
-            ggplot2::ggplot() +
-            ggplot2::aes(x = date, y = irr, color = id) +
-            ggplot2::geom_line() +
-            ggplot2::labs(
-              title = "Irrigation Requirements by Site",
-              y     = "Irrigation (mm/day)"
-            )
-          path <- file.path(event_output_dir, "irrigation_plot.png")
-          ggplot2::ggsave(path, p, width = 10, height = 6)
-          path
-        },
-        format = "file"
-      ),
-
-      # --- Write SIPNET event files ------------------------------------------
-
-      tar_target(
-        event_files, {
-          dpwb |>
-            dplyr::group_nest(.data$id) |>
-            dplyr::mutate(
-              fname = purrr::map2(
-                id, data,
-                \(site_id, dat) {
-                  readr::write_delim(
-                    create_event_file(dat),
-                    file.path(
-                      event_output_dir,
-                      glue::glue("{site_id}_events.txt")
-                    ),
-                    delim      = " ",
-                    col_names  = FALSE
-                  )
-                }
-              )
-            )
-          list.files(event_output_dir, full.names = TRUE,
-                     pattern = "_events\\.txt$")
-        },
-        format = "file"
-      )
-
-    )
-  },
-  ask = FALSE
-)
-
-#' Run the pipeline. Targets that are already up-to-date will be skipped.
-# tar_make()
-# tar_invalidate(dp_with_crops)
-tar_make(c(precip))
-
-if (interactive()) {
-  # tar_load(c("design_points", "dp_with_crops", "phenology"))
-  tar_load_everything()
-}
diff --git a/modules/data.land/inst/CIMIS-event-files.R b/modules/data.land/inst/CIMIS-event-files.R
deleted file mode 100644
index 585affd65a9..00000000000
--- a/modules/data.land/inst/CIMIS-event-files.R
+++ /dev/null
@@ -1,370 +0,0 @@
-#' ---
-#' title: "Example workflow generating SIPNET event files from CIMIS and CHIRPS data"
-#' author: "Alexey N. Shiklomanov"
-#' ---
-
-if (interactive()) {
-  devtools::load_all("modules/data.land")
-} else {
-  library(PEcAn.data.land)
-}
-
-#' Define paths to relevant external data files
-design_points_path <- "~/projects/cimis-to-irrigation/design_points.csv"
-cimis_eto_cog_path <- "~/data/CIMIS-ETo-COG"
-parcels_path <- "~/data/LandIQ-harmonized-v3/parcels.gpkg"
-crops_path <- "~/data/LandIQ-harmonized-v3/crops_all_years.parq"
-
-#' Start from a range of dates (2020 to present) and locations (`design_points.csv`).
-
-dates <- seq.Date(as.Date("2020-03-01"), as.Date("2020-11-30"), "day")
-design_points <- readr::read_csv(design_points_path) |>
-  head(10)
-
-#' # CIMIS ETref
-#'
-#' For each site, extract its reference ETref from the CIMIS data.
-
-etref <- design_points |>
-  extract_cimis_dates(
-    dates,
-    cimis_eto_cog_path,
-    .progress = TRUE
-  )
-
-#' # CHIRPS Precipitation
-#'
-#' Also, extract precipitation from CHIRPS v2.
-
-precip <- extract_chirps_remote(design_points, dates)
-
-#' # BIS Kc coefficients
-#'
-#' For each site, get LandIQ parcel and crop data.
-
-dp_with_crops <- get_landiq(
-  design_points,
-  parcels_file = parcels_path,
-  crops_file = crops_path
-) |>
-  tibble::as_tibble()
-
-#' Map `CLASS/SUBCLASS` to `crop_name` using `bism_kc_by_crop`.
-#'
-#' **NOTE:** Some LandIQ classes/subclasses map onto *multiple BISM crop types*.
-
-bism_kc_by_crop |>
-  dplyr::summarize(
-    n_unique = dplyr::n(),
-    .by = c("landiq_class", "landiq_subclass")
-  ) |>
-  dplyr::filter(n_unique > 1) |>
-  dplyr::left_join(bism_kc_by_crop) |>
-  dplyr::summarize(
-    crops = paste(crop_name, collapse = ", "),
-    .by = c("landiq_class", "landiq_subclass", "n_unique")
-  )
-
-#' So, below, we introduce a **HACK** to select just the first crop in any of these groups.
-#' The more correct fix is to do some kind of averaging later.
-
-bism_crop_unique <- bism_kc_by_crop |>
-  dplyr::distinct(landiq_class, landiq_subclass, crop_name) |>
-  # WARNING: Hack here!
-  dplyr::slice(1, .by = c("landiq_class", "landiq_subclass"))
-design_point_crops <- dp_with_crops |>
-  dplyr::left_join(
-    bism_crop_unique,
-    by = c("CLASS" = "landiq_class", "SUBCLASS" = "landiq_subclass")
-  )
-
-#' For demonstration purposes, we will expand this naively using `tidyr::fill` and hard-code dates for the 4 seasons to January 1, April 1, July 1, October 1.
-#' In reality, you would resolve these more finely using phenology data (e.g., from remote sensing).
-
-fill_season <- function(year, season) {
-  if (season == 1) {
-    start <- lubridate::make_date(year, 1, 1)
-    end <- lubridate::make_date(year, 3, 31)
-  } else if (season == 2) {
-    start <- lubridate::make_date(year, 4, 1)
-    end <- lubridate::make_date(year, 6, 30)
-  } else if (season == 3) {
-    start <- lubridate::make_date(year, 7, 1)
-    end <- lubridate::make_date(year, 9, 30)
-  } else if (season == 4) {
-    start <- lubridate::make_date(year, 10, 1)
-    end <- lubridate::make_date(year, 12, 31)
-  }
-  seq.Date(start, end, "day")
-}
-
-dp_crops_filled <- design_point_crops |>
-  dplyr::filter(!is.na(season)) |>
-  tidyr::fill(
-    "CLASS",
-    "SUBCLASS",
-    "crop_name",
-    .direction = "downup",
-    .by = "parcel_id"
-  ) |>
-  dplyr::mutate(date = purrr::map2(year, season, fill_season)) |>
-  tidyr::unnest(date) |>
-  dplyr::filter(date %in% !!dates)
-
-#' Identify and warn about parcels with no matching BIS crop.
-
-missing_crops <- dp_crops_filled |> dplyr::filter(is.na(crop_name))
-if (nrow(missing_crops) > 0) {
-  missing_crop_strs <- missing_crops |>
-    dplyr::distinct(CLASS, SUBCLASS) |>
-    dplyr::mutate(string = glue::glue("CLASS: {CLASS} SUBCLASS: {SUBCLASS}")) |>
-    dplyr::pull(string)
-  missing_crop_str <- sprintf("[%s]", paste(missing_crop_strs, collapse = "; "))
-  warning(
-    "Skipping ",
-    nrow(missing_crops),
-    " rows with no matching BIS crop. Relevant pairs are: ",
-    missing_crop_str
-  )
-}
-
-dp_with_cropname <- dp_crops_filled |>
-  dplyr::filter(!is.na(crop_name)) |>
-  dplyr::left_join(
-    crop_whc |> dplyr::select("crop_name", "whc_min_frac", "rooting_depth_m"),
-    by = "crop_name"
-  )
-
-#' # SSURGO Soil Data
-#'
-#' Calculate site-specific water holding capacity (WHC) from SSURGO soil data and crop rooting depth.
-
-calc_effective_awc <- function(
-  hzdept_r_cm,
-  hzdepb_r_cm,
-  awc_r,
-  rooting_depth_cm
-) {
-  # Clip each horizon to the rooting depth
-  effective_top <- pmin(hzdept_r_cm, rooting_depth_cm)
-  effective_bottom <- pmin(hzdepb_r_cm, rooting_depth_cm)
-  thickness_cm <- pmax(0, effective_bottom - effective_top)
-
-  # awc_r is cm water / cm soil, so multiply by thickness to get cm water.
-  # Convert cm water to mm water by multiplying by 10.
-  sum(awc_r * thickness_cm, na.rm = TRUE) * 10
-}
-
-# 1. Get mukeys for all design points
-design_points_sf <- design_points |>
-  dplyr::distinct(id, lon, lat)
-
-mukeys_list <- purrr::map2(
-  design_points_sf$lon,
-  design_points_sf$lat,
-  ~ ssurgo_mukeys_point(point = c(.x, .y), distance = 20)
-)
-
-# 2. Query gSSURGO for soil data
-all_mukeys <- unique(unlist(mukeys_list))
-soil_raw <- gSSURGO.Query(
-  mukeys = all_mukeys,
-  fields = c("chorizon.awc_r", "chorizon.hzdept_r", "chorizon.hzdepb_r")
-)
-
-# 3. Calculate effective WHC for each site-crop combination
-# We use the dominant soil component for each map unit.
-soil_dominant <- soil_raw |>
-  dplyr::filter(cokey == cokey[which.max(comppct_r)], .by = "mukey")
-
-dp_with_whc <- dp_with_cropname |>
-  dplyr::mutate(mukey = mukeys_list[match(id, design_points_sf$id)]) |>
-  tidyr::unnest(mukey) |>
-  dplyr::mutate(mukey = as.numeric(mukey)) |>
-  dplyr::left_join(
-    soil_dominant,
-    by = "mukey",
-    relationship = "many-to-many"
-  ) |>
-  dplyr::summarize(
-    whc_mm = calc_effective_awc(
-      hzdept_r,
-      hzdepb_r,
-      awc_r,
-      rooting_depth_cm = rooting_depth_m[[1]] * 100
-    ),
-    .by = c("id", "parcel_id", "date", "crop_name", "whc_min_frac")
-  ) |>
-  # Fallback to default if WHC is 0 or NA
-  dplyr::mutate(whc_mm = dplyr::if_else(whc_mm > 0, whc_mm, 500, missing = 500))
-
-#' # Join with ETo data
-#'
-#' Join with ETref data.
-
-dp_with_eto <- dp_with_whc |>
-  dplyr::left_join(
-    (etref |> dplyr::select("id", "date", "etref_mm_day")),
-    by = c("id", "date")
-  )
-
-#' Calculate ETc directly using eto_to_etc_bism. Group by crop_name and apply since eto_to_etc_bism takes a single crop at a time.
-
-dp_with_etc <- dp_with_eto |>
-  dplyr::mutate(
-    etc_mm_day = eto_to_etc_bism(
-      eto = etref_mm_day,
-      crop_name = crop_name[[1]],
-      date = date
-    ),
-    .by = "crop_name"
-  ) |>
-  dplyr::select(
-    dplyr::any_of(c("id", "parcel_id", "lat", "lon")),
-    "date",
-    "etc_mm_day",
-    "whc_min_frac",
-    "whc_mm"
-  )
-
-#' Handle multi-crop parcels (double-cropping) - placeholder logic that warns and averages ETc values.
-
-resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
-  id_sym <- rlang::sym(id_col)
-  date_sym <- rlang::sym(date_col)
-
-  multicrop_counts <- etc_data |>
-    dplyr::add_count(!!id_sym, !!date_sym, name = "n") |>
-    dplyr::filter(.data$n > 1) |>
-    dplyr::summarize(
-      n_multicrop = dplyr::n_distinct(!!id_sym, !!date_sym),
-      .groups = "drop"
-    )
-
-  if (multicrop_counts$n_multicrop > 0) {
-    message(
-      "Multi-crop parcels: ",
-      multicrop_counts$n_multicrop,
-      " date-parcel combinations have multiple crops. Averaging ETc and WHC values."
-    )
-  }
-
-  etc_data |>
-    dplyr::group_by(!!id_sym, !!date_sym) |>
-    dplyr::summarize(
-      etc_mm_day = mean(.data$etc_mm_day, na.rm = TRUE),
-      whc_min_frac = mean(.data$whc_min_frac, na.rm = TRUE),
-      whc_mm = mean(.data$whc_mm, na.rm = TRUE),
-      .groups = "drop"
-    )
-}
-
-dp_with_etc <- resolve_multicrop(dp_with_etc)
-
-#' Join with precipitation data (inner_join to ensure matching dates).
-
-dp_crops_all <- dp_with_etc |>
-  dplyr::inner_join(precip, by = c("id", "date")) |>
-  dplyr::select(c(
-    "id",
-    "lat",
-    "lon",
-    "date",
-    "etc_mm_day",
-    "precip_mm_day",
-    "whc_min_frac",
-    "whc_mm"
-  ))
-
-#' # Calculate water balance
-
-dpwb <- apply_water_balance(dp_crops_all, "id")
-
-#' Check crop evapotranspiration values are reasonable.
-
-etc_summary <- dp_crops_all |>
-  dplyr::summarize(
-    etc_min = min(.data$etc_mm_day, na.rm = TRUE),
-    etc_max = max(.data$etc_mm_day, na.rm = TRUE),
-    etc_mean = mean(.data$etc_mm_day, na.rm = TRUE),
-    .by = "id"
-  )
-print(etc_summary)
-
-#' Check that water balance calculations are reasonable.
-
-wb_summary <- dpwb |>
-  dplyr::group_by(.data$id) |>
-  dplyr::summarize(
-    irr_total = sum(.data$irr, na.rm = TRUE),
-    irr_max = max(.data$irr, na.rm = TRUE),
-    irr_mean = mean(.data$irr, na.rm = TRUE),
-    runoff_total = sum(.data$runoff, na.rm = TRUE),
-    W_t_min = min(.data$W_t, na.rm = TRUE),
-    W_t_max = max(.data$W_t, na.rm = TRUE),
-    .groups = "drop"
-  )
-print(wb_summary)
-
-#' Check for other issues.
-
-if (any(wb_summary$irr_max < 0)) {
-  warning("Negative irrigation values detected!")
-} else {
-  message("Irrigation values are non-negative")
-}
-
-if (any(wb_summary$W_t_min < 0)) {
-  warning("Negative soil water values detected!")
-} else {
-  message("Soil water values are non-negative")
-}
-
-#' Seasonal variation check - irrigation should be higher in summer.
-
-monthly_irr <- dpwb |>
-  dplyr::mutate(month = lubridate::month(.data$date)) |>
-  dplyr::group_by(.data$month) |>
-  dplyr::summarize(irr_mean = mean(.data$irr, na.rm = TRUE), .groups = "drop")
-print(monthly_irr)
-
-#' # Plot results
-
-library(ggplot2)
-dpwb |>
-  ggplot() +
-  aes(x = date, y = irr, color = id) +
-  geom_line() +
-  labs(title = "Irrigation Requirements by Site", y = "Irrigation (mm/day)")
-
-#' # Write event files
-#'
-#' Example of a single event data frame.
-
-dpwb |>
-  dplyr::filter(id == id[[1]]) |>
-  create_event_file()
-
-#' Write all event files.
-
-outdir <- tempfile(pattern = "events_")
-dir.create(outdir)
-dpwb |>
-  dplyr::group_nest(.data$id) |>
-  dplyr::mutate(
-    fname = purrr::map2(
-      id,
-      data,
-      \(id, dat) {
-        readr::write_delim(
-          create_event_file(dat),
-          file.path(outdir, glue::glue("{id}_events.txt")),
-          delim = " ",
-          col_names = FALSE
-        )
-      }
-    )
-  )
-
-fnames <- list.files(outdir, full.names = TRUE)
-cat(readr::read_file(fnames[[1]]))

From f74c8afe97b13bd59a85670471f28a4f7aa813d8 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 16:22:39 -0400
Subject: [PATCH 61/67] remove unused code

---
 modules/data.land/NAMESPACE                   |  6 --
 modules/data.land/R/CHIRPS-precip.R           | 61 ------------
 modules/data.land/R/CIMIS-ET.R                | 94 -------------------
 modules/data.land/R/get-landiq-parcels.R      | 22 -----
 modules/data.land/R/get-landiq.R              | 23 -----
 modules/data.land/man/download_cimis_et.Rd    | 21 -----
 .../data.land/man/extract_chirps_remote.Rd    | 32 -------
 modules/data.land/man/extract_cimis_date.Rd   | 31 ------
 modules/data.land/man/extract_cimis_dates.Rd  | 26 -----
 modules/data.land/man/get_landiq.Rd           | 22 -----
 .../data.land/man/get_landiq_parcel_ids.Rd    | 20 ----
 .../tests/testthat/test-CHIRPS-precip.R       | 68 --------------
 .../R/resolve_multicrop.R                     | 30 ------
 13 files changed, 456 deletions(-)
 delete mode 100644 modules/data.land/R/CHIRPS-precip.R
 delete mode 100644 modules/data.land/R/CIMIS-ET.R
 delete mode 100644 modules/data.land/R/get-landiq-parcels.R
 delete mode 100644 modules/data.land/R/get-landiq.R
 delete mode 100644 modules/data.land/man/download_cimis_et.Rd
 delete mode 100644 modules/data.land/man/extract_chirps_remote.Rd
 delete mode 100644 modules/data.land/man/extract_cimis_date.Rd
 delete mode 100644 modules/data.land/man/extract_cimis_dates.Rd
 delete mode 100644 modules/data.land/man/get_landiq.Rd
 delete mode 100644 modules/data.land/man/get_landiq_parcel_ids.Rd
 delete mode 100644 modules/data.land/tests/testthat/test-CHIRPS-precip.R
 delete mode 100644 workflows/irrigation-statewide/R/resolve_multicrop.R

diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index 3d72ff060e5..6680fcb4b85 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -18,7 +18,6 @@ export(cohort2pool)
 export(create_event_file)
 export(dataone_download)
 export(download.SM_CDS)
-export(download_cimis_et)
 export(download_package_rm)
 export(ens_veg_module)
 export(eto_to_etc)
@@ -27,9 +26,6 @@ export(extract.stringCode)
 export(extract_FIA)
 export(extract_NEON_veg)
 export(extract_SM_CDS)
-export(extract_chirps_remote)
-export(extract_cimis_date)
-export(extract_cimis_dates)
 export(extract_openet_daily)
 export(extract_soil_gssurgo)
 export(extract_soil_nc)
@@ -42,8 +38,6 @@ export(gSSURGO.Query)
 export(generate_soilgrids_ensemble)
 export(get.attributes)
 export(get.soil)
-export(get_landiq)
-export(get_landiq_parcel_ids)
 export(get_resource_map)
 export(get_veg_module)
 export(ic_process)
diff --git a/modules/data.land/R/CHIRPS-precip.R b/modules/data.land/R/CHIRPS-precip.R
deleted file mode 100644
index 3c27d4fad5b..00000000000
--- a/modules/data.land/R/CHIRPS-precip.R
+++ /dev/null
@@ -1,61 +0,0 @@
-#' Extract CHIRPS Precipitation Data from Remote NetCDF
-#'
-#' Downloads and extracts daily precipitation data from the CHIRPS (Climate
-#' Hazards group InfraRed Precipitation with Station data) dataset via remote
-#' NetCDF file access using vsicurl.
-#'
-#' @param design_points A data frame or tibble containing columns `lon` and `lat`
-#'   specifying the geographic coordinates of points to extract precipitation for.
-#' @param dates A vector of dates or date-time objects specifying the days for which
-#'   to extract precipitation data.
-#' @returns A modified version of `design_points` with new rows added for each date,
-#'   plus two new columns:
-#'   \item{date}{The date of the extracted data (same as the input `date`).}
-#'   \item{precip_mm_day}{Precipitation in millimeters for the specified day.}
-#' @examples
-#' \dontrun{
-#' pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
-#' result <- extract_chirps_remote(pts, as.Date(c("2020-06-15", "2021-06-15")))
-#' }
-#' @export
-extract_chirps_remote <- function(design_points, dates) {
-  CHIRPS_REMOTE_ROOT <- "https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_daily/netcdf/p05"
-
-  dates <- lubridate::as_date(dates)
-  years <- lubridate::year(dates)
-
-  result_list <- list()
-
-  for (yr in unique(years)) {
-    dates_yr <- dates[years == yr]
-    day_of_year_yr <- lubridate::yday(dates_yr)
-
-    url <- glue::glue("{CHIRPS_REMOTE_ROOT}/chirps-v2.0.{yr}.days_p05.nc")
-    vsicurl_path <- paste0("/vsicurl/", url)
-
-    # Suppress only the "no extent" warning. We set the extent on the next
-    # line. Other warnings should still fire.
-    withCallingHandlers({
-      r <- terra::rast(vsicurl_path)
-    },
-    warning = function(w) {
-      if (grepl("unknown extent", w$message)) {
-        invokeRestart("muffleWarning")
-      }
-    })
-    terra::ext(r) <- c(-180, 180, -50, 50)
-    terra::crs(r) <- "EPSG:4326"
-
-    r_days <- r[[day_of_year_yr]]
-
-    pts <- as.matrix(design_points[, c("lon", "lat")])
-    vals <- terra::extract(r_days, pts)
-    vals_vec <- as.vector(t(t(vals)))
-
-    result_list[[as.character(yr)]] <- design_points |>
-      tidyr::expand_grid(date = dates_yr) |>
-      dplyr::mutate(precip_mm_day = vals_vec)
-  }
-
-  dplyr::bind_rows(result_list)
-}
diff --git a/modules/data.land/R/CIMIS-ET.R b/modules/data.land/R/CIMIS-ET.R
deleted file mode 100644
index 265cd52b897..00000000000
--- a/modules/data.land/R/CIMIS-ET.R
+++ /dev/null
@@ -1,94 +0,0 @@
-#' Download CIMIS ETo data
-#'
-#' Read raw ETo.asc.gz directly from CIMIS spatial portal, add CRS, and save
-#' locally as Cloud-optimized GeoTIFF. Outputs will be saved to
-#' `<local_root_dir>/CIMIS-ETo-YYYY-MM-DD.tif`.
-#'
-#' @param date Date to download
-#' @param local_root_dir Root directory for storing outputs.
-#'
-#' @return Path to saved TIF file (invisibly)
-#' @export
-download_cimis_et <- function(date, local_root_dir) {
-  date_str <- format(date, "%Y/%m/%d")
-  date_filename <- format(date, "%Y-%m-%d")
-
-  base_url <- "https://spatialcimis.water.ca.gov/cimis"
-  remote_path <- file.path(base_url, date_str, "ETo.asc.gz")
-  vsicurl_path <- paste0("/vsigzip//vsicurl/", remote_path)
-
-  tif_path <- file.path(
-    local_root_dir,
-    paste0("CIMIS-ETo-", date_filename, ".tif")
-  )
-
-  r <- terra::rast(vsicurl_path)
-  terra::crs(r) <- "EPSG:3310"
-
-  terra::writeRaster(r, tif_path, filetype = "COG", overwrite = TRUE)
-
-  invisible(tif_path)
-}
-
-#' Extract CIMIS daily reference ETo values
-#'
-#' @param design_points `data.frame` of design points with columns
-#' `location_id`, `lat`, and `lon`
-#' @param download_missing If `TRUE` and the local COG is missing, download it.
-#' If `FALSE` and the file is missing, throw an error.
-#'
-#' @inheritParams download_cimis_et
-#'
-#' @return `design_points` `data.frame` with additional columns `date`, and
-#' `etref_mm_day` (reference ET, mm/day)
-#' @export
-extract_cimis_date <- function(
-  design_points,
-  date,
-  local_root_dir,
-  download_missing = FALSE
-) {
-  date_filename <- format(date, "%Y-%m-%d")
-  tif_path <- file.path(
-    local_root_dir,
-    paste0("CIMIS-ET-", date_filename, ".tif")
-  )
-
-  if (!file.exists(tif_path)) {
-    if (!download_missing) {
-      stop("Missing file ", tif_path)
-    }
-    download_cimis_et(date, local_root_dir)
-  }
-
-  r <- terra::rast(tif_path)
-
-  pts_sf <- sf::st_as_sf(design_points, coords = c("lon", "lat"), crs = 4326)
-  pts_albers <- sf::st_transform(pts_sf, crs = 3310)
-  coords <- sf::st_coordinates(pts_albers)
-
-  vals <- terra::extract(r, coords)
-
-  design_points |>
-    dplyr::mutate(date = date, etref_mm_day = vals[, 1])
-}
-
-#' Extract CIMIS reference ET for multiple dates
-#'
-#' @param dates Sequence of dates for which to extract data
-#' @param ... Additional arguments to [extract_cimis_date()]
-#' @inheritParams extract_cimis_date
-#' @inheritParams purrr::map
-#'
-#' @return `design_points` `data.frame` extended with ETref data for all dates.
-#' @export
-extract_cimis_dates <- function(design_points, dates, .progress = FALSE, ...) {
-  df_list <- purrr::map(
-    dates,
-    purrr::possibly(extract_cimis_date, NULL, quiet = FALSE),
-    design_points = design_points,
-    .progress = .progress,
-    ...
-  )
-  dplyr::bind_rows(df_list)
-}
diff --git a/modules/data.land/R/get-landiq-parcels.R b/modules/data.land/R/get-landiq-parcels.R
deleted file mode 100644
index f5ca39ba987..00000000000
--- a/modules/data.land/R/get-landiq-parcels.R
+++ /dev/null
@@ -1,22 +0,0 @@
-#' Get Parcel IDs from LandIQ
-#'
-#' @param design_points `data.frame` of coordinates to extract. Must contain
-#' columns `id`, `lat`, and `lon`.
-#' @param parcels_file Path to harmonized LandIQ parcels (GPKG) file
-#'
-#' @return `design_points` `data.frame` with harmonized LandIQ parcel_IDs
-#' @export
-get_landiq_parcel_ids <- function(design_points, parcels_file) {
-  parcels_vect <- terra::vect(parcels_file)
-  pts_sf <- design_points |>
-    dplyr::select("id", "lat", "lon") |>
-    sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |>
-    sf::st_transform(sf::st_crs(parcels_vect))
-  dp_vect <- terra::vect(pts_sf)
-  matched <- terra::intersect(dp_vect, parcels_vect)
-  matched_sf <- sf::st_as_sf(matched) |>
-    sf::st_drop_geometry()
-  dp_with_parcels <- design_points |>
-    dplyr::left_join(matched_sf, by = "id")
-  dp_with_parcels
-}
diff --git a/modules/data.land/R/get-landiq.R b/modules/data.land/R/get-landiq.R
deleted file mode 100644
index e6ed76a72e8..00000000000
--- a/modules/data.land/R/get-landiq.R
+++ /dev/null
@@ -1,23 +0,0 @@
-#' Get LandIQ parcels and crop data
-#'
-#' @param design_points `data.frame` of coordinates to extract. Must contain
-#' columns `id`, `lat`, and `lon`.
-#' @param parcels_file Path to harmonized LandIQ parcels (GPKG) file
-#' @param crops_file Path to LandIQ crops parquet file
-#'
-#' @return `design_points` `data.frame` with LandIQ parcel IDs, year, season, CLASS, and SUBCLASS
-#' @export
-get_landiq <- function(design_points, parcels_file, crops_file) {
-  dp_with_parcels <- get_landiq_parcel_ids(design_points, parcels_file)
-
-  crops <- arrow::open_dataset(crops_file) |>
-    dplyr::filter(.data$parcel_id %in% unique(dp_with_parcels[["parcel_id"]])) |> 
-    dplyr::select("parcel_id", "year", "season", "CLASS", "SUBCLASS") |>
-    dplyr::collect()
-
-  dp_with_crops <- dp_with_parcels |>
-    dplyr::left_join(crops, by = "parcel_id") |>
-    tibble::as_tibble()
-
-  dp_with_crops
-}
diff --git a/modules/data.land/man/download_cimis_et.Rd b/modules/data.land/man/download_cimis_et.Rd
deleted file mode 100644
index dca753c5bef..00000000000
--- a/modules/data.land/man/download_cimis_et.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/CIMIS-ET.R
-\name{download_cimis_et}
-\alias{download_cimis_et}
-\title{Download CIMIS ETo data}
-\usage{
-download_cimis_et(date, local_root_dir)
-}
-\arguments{
-\item{date}{Date to download}
-
-\item{local_root_dir}{Root directory for storing outputs.}
-}
-\value{
-Path to saved TIF file (invisibly)
-}
-\description{
-Read raw ETo.asc.gz directly from CIMIS spatial portal, add CRS, and save
-locally as Cloud-optimized GeoTIFF. Outputs will be saved to
-`<local_root_dir>/CIMIS-ETo-YYYY-MM-DD.tif`.
-}
diff --git a/modules/data.land/man/extract_chirps_remote.Rd b/modules/data.land/man/extract_chirps_remote.Rd
deleted file mode 100644
index ff9a4d69dfd..00000000000
--- a/modules/data.land/man/extract_chirps_remote.Rd
+++ /dev/null
@@ -1,32 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/CHIRPS-precip.R
-\name{extract_chirps_remote}
-\alias{extract_chirps_remote}
-\title{Extract CHIRPS Precipitation Data from Remote NetCDF}
-\usage{
-extract_chirps_remote(design_points, dates)
-}
-\arguments{
-\item{design_points}{A data frame or tibble containing columns `lon` and `lat`
-specifying the geographic coordinates of points to extract precipitation for.}
-
-\item{dates}{A vector of dates or date-time objects specifying the days for which
-to extract precipitation data.}
-}
-\value{
-A modified version of `design_points` with new rows added for each date,
-  plus two new columns:
-  \item{date}{The date of the extracted data (same as the input `date`).}
-  \item{precip_mm_day}{Precipitation in millimeters for the specified day.}
-}
-\description{
-Downloads and extracts daily precipitation data from the CHIRPS (Climate
-Hazards group InfraRed Precipitation with Station data) dataset via remote
-NetCDF file access using vsicurl.
-}
-\examples{
-\dontrun{
-pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
-result <- extract_chirps_remote(pts, as.Date(c("2020-06-15", "2021-06-15")))
-}
-}
diff --git a/modules/data.land/man/extract_cimis_date.Rd b/modules/data.land/man/extract_cimis_date.Rd
deleted file mode 100644
index 445161d55ca..00000000000
--- a/modules/data.land/man/extract_cimis_date.Rd
+++ /dev/null
@@ -1,31 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/CIMIS-ET.R
-\name{extract_cimis_date}
-\alias{extract_cimis_date}
-\title{Extract CIMIS daily reference ETo values}
-\usage{
-extract_cimis_date(
-  design_points,
-  date,
-  local_root_dir,
-  download_missing = FALSE
-)
-}
-\arguments{
-\item{design_points}{`data.frame` of design points with columns
-`location_id`, `lat`, and `lon`}
-
-\item{date}{Date to download}
-
-\item{local_root_dir}{Root directory for storing outputs.}
-
-\item{download_missing}{If `TRUE` and the local COG is missing, download it.
-If `FALSE` and the file is missing, throw an error.}
-}
-\value{
-`design_points` `data.frame` with additional columns `date`, and
-`etref_mm_day` (reference ET, mm/day)
-}
-\description{
-Extract CIMIS daily reference ETo values
-}
diff --git a/modules/data.land/man/extract_cimis_dates.Rd b/modules/data.land/man/extract_cimis_dates.Rd
deleted file mode 100644
index 17096384e1e..00000000000
--- a/modules/data.land/man/extract_cimis_dates.Rd
+++ /dev/null
@@ -1,26 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/CIMIS-ET.R
-\name{extract_cimis_dates}
-\alias{extract_cimis_dates}
-\title{Extract CIMIS reference ET for multiple dates}
-\usage{
-extract_cimis_dates(design_points, dates, .progress = FALSE, ...)
-}
-\arguments{
-\item{design_points}{`data.frame` of design points with columns
-`location_id`, `lat`, and `lon`}
-
-\item{dates}{Sequence of dates for which to extract data}
-
-\item{.progress}{Whether to show a progress bar. Use \code{TRUE} to turn on
-a basic progress bar, use a string to give it a name, or see
-\link[purrr]{progress_bars} for more details.}
-
-\item{...}{Additional arguments to [extract_cimis_date()]}
-}
-\value{
-`design_points` `data.frame` extended with ETref data for all dates.
-}
-\description{
-Extract CIMIS reference ET for multiple dates
-}
diff --git a/modules/data.land/man/get_landiq.Rd b/modules/data.land/man/get_landiq.Rd
deleted file mode 100644
index e15e3395071..00000000000
--- a/modules/data.land/man/get_landiq.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/get-landiq.R
-\name{get_landiq}
-\alias{get_landiq}
-\title{Get LandIQ parcels and crop data}
-\usage{
-get_landiq(design_points, parcels_file, crops_file)
-}
-\arguments{
-\item{design_points}{`data.frame` of coordinates to extract. Must contain
-columns `id`, `lat`, and `lon`.}
-
-\item{parcels_file}{Path to harmonized LandIQ parcels (GPKG) file}
-
-\item{crops_file}{Path to LandIQ crops parquet file}
-}
-\value{
-`design_points` `data.frame` with LandIQ parcel IDs, year, season, CLASS, and SUBCLASS
-}
-\description{
-Get LandIQ parcels and crop data
-}
diff --git a/modules/data.land/man/get_landiq_parcel_ids.Rd b/modules/data.land/man/get_landiq_parcel_ids.Rd
deleted file mode 100644
index f3cefe20a33..00000000000
--- a/modules/data.land/man/get_landiq_parcel_ids.Rd
+++ /dev/null
@@ -1,20 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/get-landiq-parcels.R
-\name{get_landiq_parcel_ids}
-\alias{get_landiq_parcel_ids}
-\title{Get Parcel IDs from LandIQ}
-\usage{
-get_landiq_parcel_ids(design_points, parcels_file)
-}
-\arguments{
-\item{design_points}{`data.frame` of coordinates to extract. Must contain
-columns `id`, `lat`, and `lon`.}
-
-\item{parcels_file}{Path to harmonized LandIQ parcels (GPKG) file}
-}
-\value{
-`design_points` `data.frame` with harmonized LandIQ parcel_IDs
-}
-\description{
-Get Parcel IDs from LandIQ
-}
diff --git a/modules/data.land/tests/testthat/test-CHIRPS-precip.R b/modules/data.land/tests/testthat/test-CHIRPS-precip.R
deleted file mode 100644
index db981116d21..00000000000
--- a/modules/data.land/tests/testthat/test-CHIRPS-precip.R
+++ /dev/null
@@ -1,68 +0,0 @@
-test_that("extract_chirps_remote returns data for single date", {
-  skip_if_offline()
-  skip_on_ci()
-
-  pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
-
-  result <- extract_chirps_remote(pts, as.Date("2020-06-15"))
-
-  expect_s3_class(result, "data.frame")
-  expect_equal(nrow(result), 2)
-  expect_true("date" %in% names(result))
-  expect_true("precip_mm_day" %in% names(result))
-  expect_true(all(result$date == as.Date("2020-06-15")))
-  expect_type(result$precip_mm_day, "double")
-  expect_true(all(result$precip_mm_day >= 0, na.rm = TRUE))
-})
-
-test_that("extract_chirps_remote handles multiple dates in same year", {
-  skip_if_offline()
-  skip_on_ci()
-
-  pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
-  dates <- seq(as.Date("2020-06-01"), as.Date("2020-06-30"), by = "1 day")
-
-  result <- extract_chirps_remote(pts, dates)
-
-  expect_equal(nrow(result), 2 * 30)
-  expect_equal(unique(lubridate::year(result$date)), 2020)
-  expect_equal(sort(unique(result$date)), sort(dates))
-  expect_type(result$precip_mm_day, "double")
-  expect_true(all(result$precip_mm_day >= 0, na.rm = TRUE))
-})
-
-test_that("extract_chirps_remote handles dates spanning multiple years", {
-  skip_if_offline()
-  skip_on_ci()
-
-  pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
-  dates <- c(
-    as.Date("2020-12-15"),
-    as.Date("2020-12-31"),
-    as.Date("2021-01-01"),
-    as.Date("2021-01-15")
-  )
-
-  result <- extract_chirps_remote(pts, dates)
-
-  expect_equal(nrow(result), 2 * 4)
-  expect_equal(sort(unique(result$date)), sort(dates))
-  expect_true(all(result$precip_mm_day >= 0, na.rm = TRUE))
-})
-
-test_that("extract_chirps_remote output has correct structure", {
-  skip_if_offline()
-  skip_on_ci()
-
-  pts <- tibble::tibble(lon = c(-120, -110), lat = c(35, 40), site_id = 1:2)
-  dates <- seq(as.Date("2020-06-01"), as.Date("2020-06-10"), by = "1 day")
-
-  result <- extract_chirps_remote(pts, dates)
-
-  expect_equal(nrow(result), nrow(pts) * length(dates))
-  expect_true("date" %in% names(result))
-  expect_true("precip_mm_day" %in% names(result))
-  expect_true(all(result$date %in% dates))
-  expect_true(is.numeric(result$precip_mm_day))
-  expect_true(all(is.na(result$precip_mm_day) | result$precip_mm_day >= 0))
-})
diff --git a/workflows/irrigation-statewide/R/resolve_multicrop.R b/workflows/irrigation-statewide/R/resolve_multicrop.R
deleted file mode 100644
index ed1c4cfb3d9..00000000000
--- a/workflows/irrigation-statewide/R/resolve_multicrop.R
+++ /dev/null
@@ -1,30 +0,0 @@
-#' Average ETc and WHC across multi-crop parcels (double-cropping hack).
-resolve_multicrop <- function(etc_data, id_col = "id", date_col = "date") {
-  id_sym   <- rlang::sym(id_col)
-  date_sym <- rlang::sym(date_col)
-
-  multicrop_counts <- etc_data |>
-    dplyr::add_count(!!id_sym, !!date_sym, name = "n") |>
-    dplyr::filter(.data$n > 1) |>
-    dplyr::summarize(
-      n_multicrop = dplyr::n_distinct(!!id_sym, !!date_sym),
-      .groups = "drop"
-    )
-
-  if (multicrop_counts$n_multicrop > 0) {
-    message(
-      "Multi-crop parcels: ",
-      multicrop_counts$n_multicrop,
-      " date-parcel combinations have multiple crops. Averaging ETc and WHC values."
-    )
-  }
-
-  etc_data |>
-    dplyr::group_by(!!id_sym, !!date_sym) |>
-    dplyr::summarize(
-      etc_mm_day   = mean(.data$etc_mm_day,   na.rm = TRUE),
-      whc_min_frac = mean(.data$whc_min_frac, na.rm = TRUE),
-      whc_mm       = mean(.data$whc_mm,       na.rm = TRUE),
-      .groups = "drop"
-    )
-}

From 367950dffe8e34fd673662733dd103e72d916a7c Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 16:44:47 -0400
Subject: [PATCH 62/67] add preprocessing scripts

---
 .../preprocessing/.clustermq_sge.tmpl         |  14 ++
 .../preprocessing/README.md                   |  24 ++++
 .../preprocessing/chirps-preprocess.R         |  77 +++++++++++
 .../preprocessing/cimis-01-weights.R          |  79 +++++++++++
 .../preprocessing/cimis-02-extract.R          |  98 +++++++++++++
 .../preprocessing/cimis-03-combine.sql        |  27 ++++
 .../preprocessing/ssurgo-01-spatial-weights.R | 130 ++++++++++++++++++
 .../preprocessing/ssurgo-02-combine.R         |  47 +++++++
 8 files changed, 496 insertions(+)
 create mode 100644 workflows/irrigation-statewide/preprocessing/.clustermq_sge.tmpl
 create mode 100644 workflows/irrigation-statewide/preprocessing/README.md
 create mode 100644 workflows/irrigation-statewide/preprocessing/chirps-preprocess.R
 create mode 100644 workflows/irrigation-statewide/preprocessing/cimis-01-weights.R
 create mode 100644 workflows/irrigation-statewide/preprocessing/cimis-02-extract.R
 create mode 100644 workflows/irrigation-statewide/preprocessing/cimis-03-combine.sql
 create mode 100644 workflows/irrigation-statewide/preprocessing/ssurgo-01-spatial-weights.R
 create mode 100644 workflows/irrigation-statewide/preprocessing/ssurgo-02-combine.R

diff --git a/workflows/irrigation-statewide/preprocessing/.clustermq_sge.tmpl b/workflows/irrigation-statewide/preprocessing/.clustermq_sge.tmpl
new file mode 100644
index 00000000000..881add9c826
--- /dev/null
+++ b/workflows/irrigation-statewide/preprocessing/.clustermq_sge.tmpl
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+#$ -N {{ job_name }}               # job name
+##$ -q default                      # submit to queue named "default"
+#$ -j y                            # combine stdout/error in one file
+#$ -o {{ log_file | /dev/null }}   # output file
+#$ -cwd                            # use pwd as work dir
+#$ -V                              # use environment variable
+#$ -t 1-{{ n_jobs }}               # submit jobs as array
+#$ -pe smp {{ cores | 1 }}         # number of cores to use per job
+#$ -l h_rt={{ walltime | 01:00:00 }}
+
+# ulimit -v $(( 1024 * {{ memory | 4096 }} ))
+
+CMQ_AUTH={{ auth }} pixi run Rscript -e "clustermq:::worker('{{ master }}')"
diff --git a/workflows/irrigation-statewide/preprocessing/README.md b/workflows/irrigation-statewide/preprocessing/README.md
new file mode 100644
index 00000000000..d4c090af2c5
--- /dev/null
+++ b/workflows/irrigation-statewide/preprocessing/README.md
@@ -0,0 +1,24 @@
+# Data preprocessing workflows for irrigation inputs
+
+This directory uses `pixi` for dependency management and `clustermq` for parallelizing work automatically across an SGE cluster (like the BU SCC).
+
+## CIMIS reference ET
+
+Prerequisites: Raw CIMIS ETref data downloaded from spatialcimis.water.ca.gov.
+
+- `cimis-01-weights.R` --- Pre-calculate the area weights of CIMIS ETref pixels for each harmonized LandIQ polygon using `exactextractr` (slow)
+- `cimis-02-extract.R` --- Apply the weights to each CIMITS ETref raster file (fast)
+- `cimis-03-combine.sql` --- Recombine the results into a properly hive-partitioned parquet dataset.
+
+## CHIRPS
+
+Prerequisites: Raw CHIRPS v2 data downloaded from https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_daily/netcdf/p05/.
+
+- `chirps-preprocess.R` --- Entire workflow. For each year, use `exactextractr::exact_extract` to pull out the area-weighted values of CHIRPS for each harmonized LandIQ parcel.
+
+## SSURGO soil data
+
+Prerequisites: A downloaded copy of gSSURGO for all of California (https://nrcs.app.box.com/v/soils/folder/233398887779).
+
+- `ssurgo-01-spatial-weights.R` --- Calculate the area weights of each SSURGO mapping unit for each harmonized LandIQ parcel. Parallelized across batches of parcels.
+- `ssurgo-02-combine.R` --- Recombine the batches into a single parquet file. 
diff --git a/workflows/irrigation-statewide/preprocessing/chirps-preprocess.R b/workflows/irrigation-statewide/preprocessing/chirps-preprocess.R
new file mode 100644
index 00000000000..87b03239a25
--- /dev/null
+++ b/workflows/irrigation-statewide/preprocessing/chirps-preprocess.R
@@ -0,0 +1,77 @@
+#!/usr/bin/env Rscript
+
+chirps_dir <- "/projectnb/dietzelab/ccmmf/management/irrigation/"
+chirpsfiles <- list.files(
+  chirps_dir,
+  "chirps-v2.0.*.nc",
+  full.names = TRUE
+)
+
+parcel_file <- "/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1/parcels.gpkg"
+
+extract_chirps <- function(fname, parcel_file, outdir = "_results_chirps") {
+  # fname <- chirpsfiles[[1]]
+  parcels_sf <- sf::read_sf(parcel_file, use_stream = TRUE)
+  # parcels_sf <- head(parcels_sf_full, 500)
+
+  dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
+
+  year <- basename(fname) |>
+    gsub(
+      pattern = "chirps-v2\\.0\\.(\\d+)\\.days_p05\\.nc",
+      replacement = "\\1"
+    ) |>
+    as.numeric()
+
+  outfile <- file.path(outdir, paste0("chirps-", year, ".parquet"))
+  if (file.exists(outfile)) {
+    message("File exists ", outfile)
+    return(outfile)
+  }
+
+  r <- terra::rast(fname)
+  terra::ext(r) <- c(-180, 180, -50, 50)
+  terra::crs(r) <- "EPSG:4326"
+
+  parcels_proj <- sf::st_transform(parcels_sf, sf::st_crs(r))
+
+  vals <- exactextractr::exact_extract(
+    r,
+    parcels_proj,
+    "mean",
+    append_cols = "parcel_id"
+  )
+
+  date0 <- as.Date(paste0(year, "-01-01"))
+  vals_df <- dplyr::bind_cols(vals) |>
+    tibble::as_tibble() |>
+    tidyr::pivot_longer(
+      -c("parcel_id"),
+      names_to = "yday",
+      names_pattern = ".*\\.days_p05_(\\d+)$",
+      names_transform = as.integer,
+      values_to = "precip_mm_day"
+    ) |>
+    dplyr::mutate(
+      date = date0 + .data$yday,
+      .keep = "unused"
+    ) |>
+    dplyr::relocate("date", .after = "parcel_id") |>
+    dplyr::arrange(.data$parcel_id, .data$date)
+
+  arrow::write_parquet(vals_df, outfile)
+  invisible(outfile)
+}
+
+options(
+  clustermq.scheduler = "sge",
+  clustermq.template  = ".clustermq_sge.tmpl"
+)
+
+clustermq::Q(
+  fun = extract_chirps,
+  fname = chirpsfiles,
+  const = list(parcel_file = parcel_file),
+  n_jobs = length(chirpsfiles),
+  template = list(cores = 1, walltime = "06:00:00")
+)
diff --git a/workflows/irrigation-statewide/preprocessing/cimis-01-weights.R b/workflows/irrigation-statewide/preprocessing/cimis-01-weights.R
new file mode 100644
index 00000000000..b652062a4c4
--- /dev/null
+++ b/workflows/irrigation-statewide/preprocessing/cimis-01-weights.R
@@ -0,0 +1,79 @@
+#!/usr/bin/env Rscript
+
+library(sf)
+library(terra)
+library(dplyr)
+library(Matrix)
+library(exactextractr)
+library(arrow)
+
+parcel_file <- "/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1/parcels.gpkg"
+cimis_dir <- "/projectnb/dietzelab/ccmmf/data_raw/cimis/cimis"
+
+outdir <- "_results_v2"
+dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
+
+parcels_sf <- read_sf(parcel_file, use_stream = TRUE)
+
+cimis_f0 <- file.path(cimis_dir, "2019", "01", "01", "ETo.asc.gz")
+stopifnot(file.exists(cimis_f0))
+
+# A subset of CIMIS files, like this one, have slightly smaller dimensions for
+# some reason. Compute their weights separately.
+cimis_falt <- file.path(cimis_dir, "2015", "03", "01", "ETo.asc.gz")
+stopifnot(file.exists(cimis_falt))
+
+n_parcels <- nrow(parcels_sf)
+
+calc_weights <- function(fname, rdsfile, parqfile, overwrite = FALSE) {
+  if (!overwrite && file.exists(rdsfile) && file.exists(parqfile)) {
+    message("Files exist")
+    return(NULL)
+  }
+
+  r <- rast(file.path("/vsigzip", fname))
+  crs(r) <- "EPSG:3310"
+
+  n_cells <- ncell(r)
+
+  raw_weights <- exact_extract(
+    r,
+    parcels_sf,
+    fun = NULL,
+    include_cell = TRUE,
+    include_cols = "parcel_id",
+    progress = TRUE
+  )
+
+  weights_df <- raw_weights |>
+    bind_rows(.id = "parcel_idx") |>
+    mutate(parcel_idx = as.integer(.data$parcel_idx)) |>
+    as_tibble() |>
+    mutate(
+      w = .data$coverage_fraction / sum(.data$coverage_fraction),
+      .by = "parcel_idx"
+    ) |>
+    select(all_of(c("parcel_idx", "parcel_id", "cell", "w")))
+
+  W <- sparseMatrix(
+    i = weights_df[["parcel_idx"]],
+    j = weights_df[["cell"]],
+    x = weights_df[["w"]],
+    dims = c(n_parcels, n_cells),
+    repr = "C"
+  )
+  rownames(W) <- parcels_sf[["parcel_id"]]
+
+  saveRDS(W, rdsfile)
+  write_parquet(weights_df, parqfile)
+  TRUE
+}
+
+rdsfile_0 <- file.path(outdir, "spatial_weights.rds")
+parqfile_0 <- file.path(outdir, "weights_df.parquet")
+
+calc_weights(cimis_f0, rdsfile_0, parqfile_0)
+
+rdsfile_alt <- file.path(outdir, "spatial_weights_alt.rds")
+parqfile_alt <- file.path(outdir, "weights_df_alt.parquet")
+calc_weights(cimis_falt, rdsfile_alt, parqfle_alt)
diff --git a/workflows/irrigation-statewide/preprocessing/cimis-02-extract.R b/workflows/irrigation-statewide/preprocessing/cimis-02-extract.R
new file mode 100644
index 00000000000..b0fcd2c87cc
--- /dev/null
+++ b/workflows/irrigation-statewide/preprocessing/cimis-02-extract.R
@@ -0,0 +1,98 @@
+#!/usr/bin/env Rscript
+
+options(
+  clustermq.scheduler = "sge",
+  clustermq.template  = ".clustermq_sge.tmpl"
+)
+
+library(terra)
+library(progress)
+library(arrow)
+library(clustermq)
+
+n_workers <- 20
+walltime <- "02:00:00"
+
+outdir <- "_results_v2/daily-raw"
+dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
+
+W <- readRDS("_results_v2/spatial_weights.rds")
+
+cimis_manifest <- "cimis_files.txt"
+years <- seq(2015, 2024)
+if (!file.exists(cimis_manifest)) {
+  get_cimis_files <- function(year) {
+    ydir <- file.path(cimis_root, year)
+    stopifnot(dir.exists(ydir))
+    list.files(
+      ydir,
+      pattern = "ETo\\.asc\\.gz$",
+      full.names = TRUE,
+      recursive = TRUE
+    )
+  }
+  ylist <- map(years, get_cimis_files, .progress = TRUE)
+  cimis_files <- sort(unlist(ylist))
+  writeLines(cimis_files, cimis_manifest)
+} else {
+  cimis_files <- readLines(cimis_manifest)
+}
+
+# Extract
+process_file <- function(fname, W, outdir) {
+  day <- basename(dirname(fname))
+  month <- basename(dirname(dirname(fname)))
+  year <- basename(dirname(dirname(dirname(fname))))
+  datestr <- paste(year, month, day, sep = "-")
+  outfile <- file.path(outdir, paste0(datestr, ".parquet"))
+  if (file.exists(outfile)) {
+    return(outfile)
+  }
+  r <- tryCatch({
+    terra::rast(file.path("/vsigzip", fname))
+  }, error = function(e) {
+    # Some files aren't actually zipped. Try them this way.
+    message("Error reading zipped. Trying unzipped. --- ", fname)
+    terra::rast(fname)
+  })
+  rsize <- terra::size(r)
+  if (rsize == 285600) {
+    # Alternate size files -- use alternate weights
+    W <- readRDS("_results_v2/spatial_weights_alt.rds")
+  } else if (rsize != 276000) {
+    stop("File ", fname, " has unexpected size ", rsize)
+  }
+  terra::crs(r) <- "EPSG:3310"
+  v <- terra::values(r, mat = FALSE)
+  date <- as.Date(datestr)
+  na_mask <- is.na(v)
+  v[na_mask] <- 0
+  valid_mask <- as.numeric(!na_mask)
+  weight_sums <- as.numeric(W %*% valid_mask)
+  et_vals <- as.numeric(W %*% v)
+  et_vals[weight_sums == 0] <- NA_real_   # All values are NA
+  result <- tibble::tibble(
+    parcel_id = as.numeric(rownames(W)),
+    date = date,
+    etref_mm_day = et_vals
+  )
+  arrow::write_parquet(result, outfile)
+  invisible(outfile)
+}
+
+# outfiles_raw <- purrr::map(
+#   cimis_files,
+#   process_file,
+#   W = W,
+#   outdir = outdir,
+#   .progress = TRUE
+# )
+
+cimis_long <- Q(
+  fun     = process_file,
+  fname = cimis_files,
+  const   = list(W = W, outdir = outdir),
+  n_jobs  = n_workers,         # SGE array size — persistent worker processes
+  template = list(cores = 1, walltime = walltime),
+  fail_on_error = FALSE
+)
diff --git a/workflows/irrigation-statewide/preprocessing/cimis-03-combine.sql b/workflows/irrigation-statewide/preprocessing/cimis-03-combine.sql
new file mode 100644
index 00000000000..95805898c05
--- /dev/null
+++ b/workflows/irrigation-statewide/preprocessing/cimis-03-combine.sql
@@ -0,0 +1,27 @@
+-- DuckDB SQL script
+-- Run with:
+--    duckdb < 03-combine.sql
+--
+-- Or, interactively:
+-- 1. Launch duckdb -- `duckdb`
+-- 2. From duckdb -- `.read 03-combine.sql`
+
+SET memory_limit = '32GB';
+
+COPY (
+  SELECT
+    parcel_id,
+    date,
+    etref_mm_day,
+    year(date) AS year
+  FROM read_parquet('_results_v2/daily-raw/*.parquet')
+  ORDER BY parcel_id ASC
+)
+TO '_results_v2/cimis-extracted'
+(
+  FORMAT PARQUET,
+  PARTITION_BY (year),
+  OVERWRITE_OR_IGNORE,
+  COMPRESSION 'ZSTD'
+);
+
diff --git a/workflows/irrigation-statewide/preprocessing/ssurgo-01-spatial-weights.R b/workflows/irrigation-statewide/preprocessing/ssurgo-01-spatial-weights.R
new file mode 100644
index 00000000000..e96ade37340
--- /dev/null
+++ b/workflows/irrigation-statewide/preprocessing/ssurgo-01-spatial-weights.R
@@ -0,0 +1,130 @@
+#!/usr/bin/env Rscript
+
+options(
+  clustermq.scheduler = "sge",
+  clustermq.template  = ".clustermq_sge.tmpl"
+)
+
+library(sf)
+library(dplyr)
+library(arrow)  # for efficient weight storage
+library(clustermq)
+
+outdir <- "_results"
+dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
+
+# Number of parcels to process at a time
+# Higher numbers mean a more intensive intersection.
+# Lower numbers waste more cycles re-reading the (big) mupolygons data frame.
+parcel_chunk_size <- 2000
+
+# Transform ssurgo mupolygon column to parquet for faster reading
+gdb_path   <- "/projectnb/dietzelab/ccmmf/data_raw/ssurgo/gSSURGO_CA.gdb"
+mupoly_path <- "./ssurgo_mupolygons.parquet"
+if (!file.exists(mupoly_path)) {
+  message("Creating parquet version of mupolygons for faster reads")
+  message("01 - reading mupoly")
+  mupoly_raw <- sf::read_sf(
+    gdb_path,
+    layer = "MUPOLYGON",
+    use_stream = TRUE,
+    as_tibble = TRUE
+  )
+  message("02 - validating and transforming")
+  mupoly <- mupoly_raw |>
+    sf::st_make_valid() |>
+    sf::st_transform(crs = "EPSG:3310")
+  message("03 - writing parquet")
+  sf::write_sf(mupoly, mupoly_path, driver = "Parquet")
+  rm(mupoly_raw, mupoly)
+}
+
+parcels_path <- "/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1/parcels.gpkg" #nolint
+
+parcel_ids <- st_read(
+  parcels_path,
+  query = "SELECT DISTINCT parcel_id FROM parcels",
+  geometry_column = NULL,
+  use_stream = TRUE
+)[["parcel_id"]]
+
+parcel_chunks <- split(parcel_ids, parcel_ids %/% parcel_chunk_size)
+parcel_mins <- lapply(parcel_chunks, min)
+parcel_maxs <- lapply(parcel_chunks, max)
+
+get_weights <- function(
+  parcel_min,
+  parcel_max,
+  parcels_path,
+  mupoly_path,
+  outdir
+) {
+  # parcel_min <- parcel_mins[[1]]
+  # parcel_max <- parcel_maxs[[1]]
+  stopifnot(
+    file.exists(mupoly_path),
+    file.exists(parcels_path)
+  )
+  outfile <- file.path(
+    outdir,
+    sprintf("%d-%d.parquet", parcel_min, parcel_max)
+  )
+  if (file.exists(outfile)) {
+    message("File exists: ", outfile)
+    return(invisible(outfile))
+  }
+  parcels <- sf::read_sf(
+    parcels_path,
+    query = sprintf(
+      paste(
+        "SELECT parcel_id, geom FROM parcels",
+        "WHERE parcel_id >= %d AND parcel_id <= %d"
+      ),
+      parcel_min, parcel_max
+    ),
+    use_stream = TRUE
+  )
+  mupolygon <- sf::read_sf(
+    mupoly_path,
+    # use_stream = TRUE,
+    as_tibble = TRUE
+  )["MUKEY"]
+  intersect <- sf::st_intersection(parcels, mupolygon)
+  weights <- intersect |>
+    dplyr::mutate(
+      area_m2 = as.numeric(sf::st_area(.data$geom)),
+      mukey   = as.character(.data$MUKEY)
+    ) |>
+    sf::st_drop_geometry() |>
+    dplyr::mutate(
+      weight = .data$area_m2 / sum(.data$area_m2),
+      .by = "parcel_id"
+    ) |>
+    dplyr::select("parcel_id", "mukey", "area_m2", "weight")
+  arrow::write_parquet(weights, outfile)
+  invisible(outfile)
+}
+
+# for (i in seq_along(parcel_mins)) {
+#   message("Trying ", i)
+#   get_weights(
+#     parcel_mins[[i]],
+#     parcel_maxs[[i]],
+#     parcels_path,
+#     mupoly_path,
+#     outdir
+#   )
+# }
+
+Q(
+  get_weights,
+  parcel_min = parcel_mins,
+  parcel_max = parcel_maxs,
+  const = list(
+    parcels_path = parcels_path,
+    mupoly_path = mupoly_path,
+    outdir = outdir
+  ),
+  n_jobs = 30,
+  template = list(cores = 1, walltime = "05:00:00")
+)
diff --git a/workflows/irrigation-statewide/preprocessing/ssurgo-02-combine.R b/workflows/irrigation-statewide/preprocessing/ssurgo-02-combine.R
new file mode 100644
index 00000000000..36c57630740
--- /dev/null
+++ b/workflows/irrigation-statewide/preprocessing/ssurgo-02-combine.R
@@ -0,0 +1,47 @@
+#!/usr/bin/env Rscript
+
+library(arrow)
+library(duckdb)
+
+conn <- dbConnect(duckdb(), dbdir = ":memory:")
+
+dbExecute(conn, "SET memory_limit = '32GB'")
+
+dbExecute(conn, "
+  COPY (
+    SELECT
+    *
+    FROM read_parquet('_results/*.parquet')
+    ORDER BY parcel_id ASC
+  )
+  TO 'ssurgo-weights.parquet'
+  (
+    FORMAT PARQUET,
+    OVERWRITE_OR_IGNORE,
+    COMPRESSION 'ZSTD'
+  );
+  "
+)
+
+dbDisconnect(conn, shutdown = TRUE)
+
+# Test to confirm we can open
+message(
+  "Testing to confirm we can open the data ",
+  "and it produces valid weights."
+)
+dat <- open_dataset("ssurgo-weights.parquet")
+
+dsub <- dat |>
+  dplyr::filter(parcel_id %in% c(1, 100, 1000, 10000, 100000)) |>
+  dplyr::collect()
+print(dsub)
+
+dat |>
+  dplyr::summarize(
+    wt = sum(weight),
+    delta = abs(wt - 1),
+    .by = "parcel_id"
+  ) |>
+  dplyr::arrange(dplyr::desc(delta)) |>
+  dplyr::collect()

From d0001ee6bff3028b719604656bc76c5a898f4ac1 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 16:46:38 -0400
Subject: [PATCH 63/67] update generate dependencies

---
 docker/depends/pecan_package_dependencies.csv | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv
index e0ffb6791c7..2db8b506916 100644
--- a/docker/depends/pecan_package_dependencies.csv
+++ b/docker/depends/pecan_package_dependencies.csv
@@ -49,7 +49,6 @@
 "doSNOW","*","modules/data.atmosphere","Suggests",FALSE
 "doSNOW","*","modules/data.land","Suggests",FALSE
 "doSNOW","*","modules/data.remote","Suggests",FALSE
-"dplR","*","modules/data.land","Imports",FALSE
 "dplR","*","modules/data.land","Suggests",FALSE
 "dplyr","*","base/qaqc","Imports",FALSE
 "dplyr","*","base/remote","Imports",FALSE
@@ -133,7 +132,6 @@
 "here","*","base/db","Suggests",FALSE
 "httr","*","base/remote","Imports",FALSE
 "httr","*","modules/data.atmosphere","Imports",FALSE
-"httr","*","modules/data.land","Imports",FALSE
 "httr","*","modules/data.land","Suggests",FALSE
 "httr","*","modules/data.remote","Suggests",FALSE
 "httr2","*","modules/data.land","Imports",FALSE

From db592f33087b6310c76362b85151c32690416324 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Tue, 21 Apr 2026 17:14:29 -0400
Subject: [PATCH 64/67] update water balance docs

---
 modules/data.land/R/water_balance.R           | 21 +++++++------------
 modules/data.land/man/calc_water_balance.Rd   | 10 ++++-----
 .../data.land/man/calc_water_balance_rice.Rd  | 11 ++++------
 3 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/modules/data.land/R/water_balance.R b/modules/data.land/R/water_balance.R
index 1421bbcf1bc..12a266e7c51 100644
--- a/modules/data.land/R/water_balance.R
+++ b/modules/data.land/R/water_balance.R
@@ -11,12 +11,10 @@
 #' - `w = whc` represents field capacity (maximum plant-available water)
 #' - `whc = (field_capacity - wilting_point) * rooting_depth` (the plant-available range)
 #'
-#' The handling of rice here is crude and primitive: setting `whc_min_frac` =
-#' 1.0 (as set in `crop_whc` for rice) means a near-constant need for
-#' irrigation to balance ET + seepage, which roughly mimics the behavior of
-#' maintaining a standing flood. However, proper treatment of rice requires
-#' maintaining a field *above* field capacity, and has other complications.
-#' These will be implemented in the future.
+#' Although this function can be used as a crude approximation of rice
+#' irrigation (by setting `whc_min_frac = 1.0`), we recommend using
+#' [calc_water_balance_rice()] instead, which explicitly tracks rice pond
+#' depth, implements seepage, etc.
 #'
 #' @param et Vector of evapotranspiration values (distance / time)
 #' @param precip Vector of precipitation values (distance / time)
@@ -148,12 +146,9 @@ calc_water_balance <- function(
 #' Models the water balance of a flooded rice system with a two-layer
 #' structure: a ponded water layer above a saturated soil profile. This is
 #' physically distinct from the upland soil water balance in
-#' calc_water_balance(). Water is managed to maintain a target flood depth,
+#' [calc_water_balance()]. Water is managed to maintain a target flood depth,
 #' with support for mid-season drainage events.
 #'
-#' State variable:
-#'   - pond_depth: depth of standing water above the soil surface
-#'
 #' The soil profile is assumed to be continuously saturated during flooded
 #' periods, so plant-available soil water is not tracked separately. ET is
 #' applied directly to the pond layer (open-water ET during flooded periods).
@@ -162,9 +157,9 @@ calc_water_balance <- function(
 #' refill to flood_target. Runoff (bund overflow) occurs when pond_depth
 #' exceeds flood_max.
 #'
-#' Mid-season drainage is specified as a logical vector (drain[t] = TRUE means
-#' the field is intentionally drained on day t). During drainage days, the
-#' pond is drawn down to pond_depth = 0 and irrigation is suppressed. This
+#' Mid-season drainage is specified as a logical vector (`drain[t] = TRUE`
+#' means the field is intentionally drained on day t). During drainage days,
+#' the pond is drawn down to pond_depth = 0 and irrigation is suppressed. This
 #' represents practices such as weed control or pre-harvest drainage.
 #'
 #' @param et        Numeric vector. Daily reference ET. During flooded
diff --git a/modules/data.land/man/calc_water_balance.Rd b/modules/data.land/man/calc_water_balance.Rd
index 35c1be843c3..0a6eb20d64e 100644
--- a/modules/data.land/man/calc_water_balance.Rd
+++ b/modules/data.land/man/calc_water_balance.Rd
@@ -54,12 +54,10 @@ This function operates in *relative* WHC space, where:
 - `w = whc` represents field capacity (maximum plant-available water)
 - `whc = (field_capacity - wilting_point) * rooting_depth` (the plant-available range)
 
-The handling of rice here is crude and primitive: setting `whc_min_frac` =
-1.0 (as set in `crop_whc` for rice) means a near-constant need for
-irrigation to balance ET + seepage, which roughly mimics the behavior of
-maintaining a standing flood. However, proper treatment of rice requires
-maintaining a field *above* field capacity, and has other complications.
-These will be implemented in the future.
+Although this function can be used as a crude approximation of rice
+irrigation (by setting `whc_min_frac = 1.0`), we recommend using
+[calc_water_balance_rice()] instead, which explicitly tracks rice pond
+depth, implements seepage, etc.
 }
 \examples{
 # Calculate WHC from field capacity, wilting point, and rooting depth
diff --git a/modules/data.land/man/calc_water_balance_rice.Rd b/modules/data.land/man/calc_water_balance_rice.Rd
index e50a00c21a2..249c874f017 100644
--- a/modules/data.land/man/calc_water_balance_rice.Rd
+++ b/modules/data.land/man/calc_water_balance_rice.Rd
@@ -55,13 +55,10 @@ A list with numeric vectors of length n:
 Models the water balance of a flooded rice system with a two-layer
 structure: a ponded water layer above a saturated soil profile. This is
 physically distinct from the upland soil water balance in
-calc_water_balance(). Water is managed to maintain a target flood depth,
+[calc_water_balance()]. Water is managed to maintain a target flood depth,
 with support for mid-season drainage events.
 }
 \details{
-State variable:
-  - pond_depth: depth of standing water above the soil surface
-
 The soil profile is assumed to be continuously saturated during flooded
 periods, so plant-available soil water is not tracked separately. ET is
 applied directly to the pond layer (open-water ET during flooded periods).
@@ -70,8 +67,8 @@ Irrigation is triggered when pond_depth falls below flood_min. Farmers
 refill to flood_target. Runoff (bund overflow) occurs when pond_depth
 exceeds flood_max.
 
-Mid-season drainage is specified as a logical vector (drain[t] = TRUE means
-the field is intentionally drained on day t). During drainage days, the
-pond is drawn down to pond_depth = 0 and irrigation is suppressed. This
+Mid-season drainage is specified as a logical vector (`drain[t] = TRUE`
+means the field is intentionally drained on day t). During drainage days,
+the pond is drawn down to pond_depth = 0 and irrigation is suppressed. This
 represents practices such as weed control or pre-harvest drainage.
 }

From e02263240743ab56217cd23f7fd0176ff995898e Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Wed, 22 Apr 2026 14:35:47 -0400
Subject: [PATCH 65/67] data.land: increase rcheck ref log package number

---
 modules/data.land/tests/Rcheck_reference.log | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/data.land/tests/Rcheck_reference.log b/modules/data.land/tests/Rcheck_reference.log
index 5452a7e02c0..84df7871a38 100644
--- a/modules/data.land/tests/Rcheck_reference.log
+++ b/modules/data.land/tests/Rcheck_reference.log
@@ -13,7 +13,7 @@
 * package encoding: UTF-8
 * checking package namespace information ... OK
 * checking package dependencies ... NOTE
-Imports includes 22 non-default packages.
+Imports includes 25 non-default packages.
 Importing from so many packages makes the package vulnerable to any of
 them becoming unavailable.  Move as many as possible to Suggests and
 use conditionally.

From 68170ee6ce4a7b525a25d8808f6f6a1851e6410c Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 24 Apr 2026 10:41:49 -0400
Subject: [PATCH 66/67] versioned httr2 dependency

So that `req_perform_parallel` is included in the package.
---
 docker/depends/pecan_package_dependencies.csv | 2 +-
 modules/data.land/DESCRIPTION                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv
index 2db8b506916..298a3161b6c 100644
--- a/docker/depends/pecan_package_dependencies.csv
+++ b/docker/depends/pecan_package_dependencies.csv
@@ -134,7 +134,7 @@
 "httr","*","modules/data.atmosphere","Imports",FALSE
 "httr","*","modules/data.land","Suggests",FALSE
 "httr","*","modules/data.remote","Suggests",FALSE
-"httr2","*","modules/data.land","Imports",FALSE
+"httr2",">= 1.1.0","modules/data.land","Imports",FALSE
 "IDPmisc","*","modules/assim.batch","Imports",FALSE
 "imager","*","models/peprmt","Suggests",FALSE
 "itertools","*","modules/assim.sequential","Suggests",FALSE
diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION
index 2422e87eb6e..7f52937bb05 100644
--- a/modules/data.land/DESCRIPTION
+++ b/modules/data.land/DESCRIPTION
@@ -33,7 +33,7 @@ Imports:
     fs,
     future,
     furrr,
-    httr2,
+    httr2 (>= 1.1.0),
     lubridate,
     magrittr,
     ncdf4 (>= 1.15),

From 7594127e8bc4f9cfdf8a4f9908cd857695dde9cd Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <alexey.shiklomanov@gmail.com>
Date: Fri, 24 Apr 2026 10:42:28 -0400
Subject: [PATCH 67/67] typo in config.yml

---
 workflows/irrigation-statewide/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/irrigation-statewide/config.yml b/workflows/irrigation-statewide/config.yml
index 66443d207a4..4483c4a5bfd 100644
--- a/workflows/irrigation-statewide/config.yml
+++ b/workflows/irrigation-statewide/config.yml
@@ -4,7 +4,7 @@ default:
   n_remote_workers: 1
   exec_type: "local"
   event_filename: "irrigation_1000"
-  n_irr_ensembe: 20
+  n_irr_ensemble: 20
 
 small: