Add testthat coverage for trait meta-analysis pipeline functions

omkarrr2533 · omkarrr2533 · commit 6783775a08d3 · 2026-03-19T18:55:10.000+05:30
Add comprehensive test coverage for the core meta-analysis pipeline
functions as preparation for modularity improvements planned in the
GSoC 'Refactoring the PEcAn trait meta-analysis workflow' project.

New test files:
- helper-test-data.R: Shared fixtures
- test-check_consistent.R: Unit tests for check_consistent()
- test-run.meta.analysis.pft.R: Tests for workflow wrapper
- test-meta_analysis_standalone.R: Tests for core analysis function
- test-get.parameter.samples.R: Documents invisible(NULL) return problem

These tests lock in current behavior before any refactoring begins.
No functional changes to existing code.
diff --git a/modules/meta.analysis/tests/testthat/helper-test-data.R b/modules/meta.analysis/tests/testthat/helper-test-data.R
@@ -0,0 +1,107 @@
+# helper-test-data.R
+# Shared test fixtures for meta-analysis pipeline tests
+#
+# These helpers create minimal but realistic test data structures
+# that mirror what the actual pipeline produces and consumes.
+
+# Create minimal trait data matching the structure expected by
+# meta_analysis_standalone() and run.meta.analysis.pft()
+#
+# The columns here match the @param documentation in run.meta.analysis.pft.R:
+#   name, mean, statname, stat, greenhouse, n,
+#   site_id, specie_id, citation_id, cultivar_id,
+#   date, time, control
+create_test_trait_data <- function(n_obs = 10, trait_mean = 20, trait_sd = 2, seed = 42) {
+  set.seed(seed)
+  test_trait <- data.frame(
+    citation_id  = rep(1L, n_obs),
+    site_id      = rep(1:2, length.out = n_obs),
+    name         = rep(paste0("species_", 1:2), length.out = n_obs),
+    trt_id       = rep("control", n_obs),
+    control      = rep(1L, n_obs),
+    greenhouse   = rep(0L, n_obs),
+    date         = rep(1, n_obs),
+    time         = rep(NA, n_obs),
+    cultivar_id  = rep(1L, n_obs),
+    specie_id    = rep(1L, n_obs),
+    n            = rep(5L, n_obs),
+    mean = rnorm(n_obs, mean = trait_mean, sd = trait_sd),
+    stat         = rep(trait_sd / sqrt(5), n_obs),
+    statname     = rep("SE", n_obs),
+    treatment_id = seq_len(n_obs),
+    stringsAsFactors = FALSE
+  )
+  return(test_trait)
+}
+
+# Create a minimal prior distributions data.frame
+# matching the structure returned by PEcAn.DB::query.priors()
+#
+# Uses normal distribution so that check_consistent() and
+# p.point.in.prior() work without exotic distribution functions.
+create_test_priors <- function(trait_names = "SLA",
+                               distn = "norm",
+                               parama = 20,
+                               paramb = 5) {
+  prior.distns <- data.frame(
+    distn  = rep(distn, length(trait_names)),
+    parama = rep(parama, length(trait_names)),
+    paramb = rep(paramb, length(trait_names)),
+    n      = rep(NA_real_, length(trait_names)),
+    stringsAsFactors = FALSE
+  )
+  rownames(prior.distns) <- trait_names
+  return(prior.distns)
+}
+
+# Create a minimal pft list matching what run.meta.analysis.pft() expects
+#
+# This mirrors the structure from settings$pfts[[i]] after get.trait.data.pft()
+# has been run, which adds posteriorid to the pft object.
+create_test_pft <- function(outdir = file.path(tempdir(), "test-pft"),
+                            pft_name = "temperate.deciduous",
+                            posteriorid = 9999L) {
+  dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
+  list(
+    name        = pft_name,
+    outdir      = outdir,
+    posteriorid = posteriorid
+  )
+}
+
+# Write trait.data.Rdata and prior.distns.Rdata files to a directory
+# so that run.meta.analysis.pft() can load them
+#
+# This simulates what get.trait.data.pft() produces as output.
+setup_trait_files <- function(outdir,
+                              trait_names = "SLA",
+                              trait_mean = 20,
+                              trait_sd = 2,
+                              n_obs = 10,
+                              prior_parama = 20,
+                              prior_paramb = 5) {
+  dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
+
+  # Build trait.data as a named list of data.frames (one per trait)
+  trait.data <- stats::setNames(
+    lapply(trait_names, function(tn) {
+      create_test_trait_data(
+        n_obs = n_obs,
+        trait_mean = trait_mean,
+        trait_sd = trait_sd
+      )
+    }),
+    trait_names
+  )
+  save(trait.data, file = file.path(outdir, "trait.data.Rdata"))
+
+  # Build prior.distns data.frame
+  prior.distns <- create_test_priors(
+    trait_names = trait_names,
+    parama = prior_parama,
+    paramb = prior_paramb
+  )
+  save(prior.distns, file = file.path(outdir, "prior.distns.Rdata"))
+
+  invisible(list(trait.data = trait.data, prior.distns = prior.distns))
+}
diff --git a/modules/meta.analysis/tests/testthat/test-check_consistent.R b/modules/meta.analysis/tests/testthat/test-check_consistent.R
@@ -0,0 +1,94 @@
+# test-check_consistent.R
+# Tests for check_consistent() — the prior-data consistency checker
+#
+# check_consistent() is a pure function with no side effects or DB dependency,
+# making it ideal for unit testing. It is called twice inside
+# meta_analysis_standalone(): once to check data vs prior, and again to check
+# posterior vs prior.
+
+# Note: We use ::: to access internal (non-exported) functions.
+# This is standard practice for testing internal functions in R packages.
+# See: https://r-pkgs.org/testing-basics.html
+
+test_that("check_consistent returns no_error=TRUE, no_warning=TRUE when point is well within prior", {
+  # A normal(0, 10) prior — point at the mean should be perfectly consistent
+
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 10,
+                      stringsAsFactors = FALSE)
+  result <- PEcAn.MA:::check_consistent(point = 0, prior = prior)
+
+  expect_type(result, "logical")
+  expect_named(result, c("no_error", "no_warning"))
+  expect_true(result[["no_error"]])
+  expect_true(result[["no_warning"]])
+})
+
+test_that("check_consistent returns warning (but not error) for moderately extreme points", {
+
+  # normal(0, 1): p(2.1) ≈ 0.982 which is > 0.975 (warning threshold)
+
+  # but < 0.9995 (error threshold)
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 1,
+                      stringsAsFactors = FALSE)
+  result <- PEcAn.MA:::check_consistent(point = 2.1, prior = prior)
+
+  expect_true(result[["no_error"]])
+  expect_false(result[["no_warning"]])
+})
+
+test_that("check_consistent returns error for extremely inconsistent points", {
+  # normal(0, 1): p(5) ≈ 1.0 which exceeds both thresholds
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 1,
+                      stringsAsFactors = FALSE)
+  result <- PEcAn.MA:::check_consistent(point = 5, prior = prior)
+
+  expect_false(result[["no_error"]])
+  expect_false(result[["no_warning"]])
+})
+
+test_that("check_consistent works symmetrically for low-tail extremes", {
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 1,
+                      stringsAsFactors = FALSE)
+  result <- PEcAn.MA:::check_consistent(point = -5, prior = prior)
+
+  expect_false(result[["no_error"]])
+  expect_false(result[["no_warning"]])
+})
+
+test_that("check_consistent respects custom p_error and p_warning thresholds", {
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 1,
+                      stringsAsFactors = FALSE)
+
+  # With very permissive thresholds, even extreme points pass
+  result <- PEcAn.MA:::check_consistent(
+    point = 3, prior = prior,
+    p_error = 1e-10, p_warning = 1e-5
+  )
+  expect_true(result[["no_error"]])
+  expect_true(result[["no_warning"]])
+})
+
+test_that("check_consistent validates that p_warning >= p_error", {
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 1,
+                      stringsAsFactors = FALSE)
+  expect_error(
+    PEcAn.MA:::check_consistent(point = 0, prior = prior,
+                                p_error = 0.05, p_warning = 0.01)
+  )
+})
+
+test_that("check_consistent works with non-normal prior distributions", {
+  # Gamma distribution: dgamma with shape=2, rate=1
+  # Median of gamma(2,1) ≈ 1.678
+  prior_gamma <- data.frame(distn = "gamma", parama = 2, paramb = 1,
+                            stringsAsFactors = FALSE)
+
+  # Point near the mode should be consistent
+  result <- PEcAn.MA:::check_consistent(point = 1.5, prior = prior_gamma)
+  expect_true(result[["no_error"]])
+  expect_true(result[["no_warning"]])
+
+  # Point far in the tail should trigger error
+  result_extreme <- PEcAn.MA:::check_consistent(point = 50, prior = prior_gamma)
+  expect_false(result_extreme[["no_error"]])
+})
diff --git a/modules/meta.analysis/tests/testthat/test-get.parameter.samples.R b/modules/meta.analysis/tests/testthat/test-get.parameter.samples.R
@@ -0,0 +1,54 @@
+# test-get.parameter.samples.R
+# Documents the invisible(NULL) return value problem in get.parameter.samples()
+#
+# get.parameter.samples() ends with save() and has no return() statement,
+# so it returns invisible(NULL). This means callers cannot programmatically
+# access results without loading the saved file.
+# The GSoC project "Refactoring the PEcAn trait meta-analysis workflow"
+# aims to fix this by returning a named list instead.
+
+
+
+test_that("get.parameter.samples returns invisible(NULL) — documenting the problem", {
+  skip(paste0(
+    "Requires full PEcAn settings + database connection. ",
+    "Current function ends with save() and no return(), ",
+    "so it returns invisible(NULL). ",
+    "GSoC refactoring will make it return a named list."
+  ))
+})
+
+# ---------------------------------------------------------------------------
+# p.point.in.prior (helper used throughout the pipeline)
+# ---------------------------------------------------------------------------
+
+
+
+test_that("p.point.in.prior returns correct quantile for normal distribution", {
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 1)
+  result <- PEcAn.MA:::p.point.in.prior(point = 0, prior = prior)
+  expect_equal(result, 0.5)
+})
+
+test_that("p.point.in.prior returns correct quantile for extreme values", {
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 1)
+  result_low <- PEcAn.MA:::p.point.in.prior(point = -5, prior = prior)
+  expect_true(result_low < 0.001)
+  result_high <- PEcAn.MA:::p.point.in.prior(point = 5, prior = prior)
+  expect_true(result_high > 0.999)
+})
+
+test_that("p.point.in.prior works with gamma distribution", {
+  prior <- data.frame(distn = "gamma", parama = 2, paramb = 1)
+  result <- PEcAn.MA:::p.point.in.prior(point = 2, prior = prior)
+  expected <- pgamma(2, shape = 2, rate = 1)
+  expect_equal(result, expected)
+})
+
+test_that("p.point.in.prior returns numeric of length 1", {
+  prior <- data.frame(distn = "norm", parama = 0, paramb = 1)
+  result <- PEcAn.MA:::p.point.in.prior(point = 1.5, prior = prior)
+  expect_type(result, "double")
+  expect_length(result, 1)
+  expect_true(result >= 0 && result <= 1)
+})
diff --git a/modules/meta.analysis/tests/testthat/test-meta_analysis_standalone.R b/modules/meta.analysis/tests/testthat/test-meta_analysis_standalone.R
@@ -0,0 +1,111 @@
+# test-meta_analysis_standalone.R
+# Tests for meta_analysis_standalone() — the pure analysis function
+#
+# meta_analysis_standalone() is the core analysis function that:
+#   1. Calls jagify() to prepare data for JAGS
+#   2. Checks data-prior consistency via check_consistent()
+#   3. Runs pecan.ma() (JAGS MCMC)
+#   4. Summarizes results via pecan.ma.summary()
+#   5. Fits approximate posteriors via approx.posterior()
+#   6. Returns list(trait.mcmc, post.distns, jagged.data)
+
+# ---------------------------------------------------------------------------
+# Greenhouse data filtering
+# ---------------------------------------------------------------------------
+
+test_that("meta_analysis_standalone filters greenhouse data when use_ghs=FALSE", {
+  skip_if_not_installed("rjags")
+  skip_on_cran()
+
+  jags_available <- tryCatch({
+    rjags::jags.model(
+      textConnection("model { x ~ dnorm(0, 1) }"),
+      data = list(), n.chains = 1, quiet = TRUE
+    )
+    TRUE
+  }, error = function(e) FALSE)
+  skip_if(!jags_available, "JAGS not available")
+
+  outdir <- file.path(tempdir(), "test-ma-greenhouse")
+  dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
+  on.exit(unlink(outdir, recursive = TRUE), add = TRUE)
+
+  # Create data that is ALL greenhouse
+  gh_data <- create_test_trait_data(n_obs = 10, trait_mean = 20)
+  gh_data$greenhouse <- 1L
+  trait_data <- list(SLA = gh_data)
+  priors <- create_test_priors("SLA")
+
+  # When use_ghs=FALSE, all greenhouse data is excluded.
+  # With no data left, function should handle this gracefully.
+  result <- tryCatch(
+    PEcAn.MA::meta_analysis_standalone(
+      trait_data = trait_data,
+      priors = priors,
+      iterations = 1000,
+      outdir = outdir,
+      use_ghs = FALSE
+    ),
+    error = function(e) e
+  )
+
+  expect_true(
+    is.list(result),
+    info = "Should return list or error object, not crash"
+  )
+})
+
+# ---------------------------------------------------------------------------
+# Full integration test (requires JAGS)
+# ---------------------------------------------------------------------------
+
+test_that("meta_analysis_standalone returns correct structure with valid inputs", {
+  skip_if_not_installed("rjags")
+  skip_on_cran()
+
+  jags_available <- tryCatch({
+    rjags::jags.model(
+      textConnection("model { x ~ dnorm(0, 1) }"),
+      data = list(), n.chains = 1, quiet = TRUE
+    )
+    TRUE
+  }, error = function(e) FALSE)
+  skip_if(!jags_available, "JAGS not available")
+
+  outdir <- file.path(tempdir(), "test-ma-standalone-integration")
+  dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
+  on.exit(unlink(outdir, recursive = TRUE), add = TRUE)
+
+  set.seed(42)
+  trait_data <- list(
+    SLA = create_test_trait_data(n_obs = 15, trait_mean = 20, trait_sd = 2)
+  )
+  priors <- create_test_priors("SLA", parama = 20, paramb = 5)
+
+  result <- PEcAn.MA::meta_analysis_standalone(
+    trait_data = trait_data,
+    priors = priors,
+    iterations = 3000,
+    outdir = outdir,
+    pft_name = "test.integration",
+    random = TRUE,
+    threshold = 1.2,
+    use_ghs = TRUE
+  )
+
+  # Verify return structure
+  expect_type(result, "list")
+  expect_named(result, c("trait.mcmc", "post.distns", "jagged.data"),
+               ignore.order = FALSE)
+
+  # post.distns should be a data.frame
+  expect_s3_class(result$post.distns, "data.frame")
+  expect_true(all(c("distn", "parama", "paramb") %in% colnames(result$post.distns)))
+  expect_true("SLA" %in% rownames(result$post.distns))
+
+  # jagged.data should be a named list
+  expect_type(result$jagged.data, "list")
+  expect_true("SLA" %in% names(result$jagged.data))
+  expect_s3_class(result$jagged.data[["SLA"]], "data.frame")
+  expect_true("Y" %in% colnames(result$jagged.data[["SLA"]]))
+})
diff --git a/modules/meta.analysis/tests/testthat/test-run.meta.analysis.pft.R b/modules/meta.analysis/tests/testthat/test-run.meta.analysis.pft.R