Merge branch 'master' into fix/kde-correction-empty-vector

utkarshpawade · web-flow · commit 9b2859fd3dca · 2026-03-27T00:16:38.000+05:30
diff --git a/.gitignore b/.gitignore
@@ -16,3 +16,4 @@ release-prep.R
 
 # vscode/positron/etc settings
 .vscode/*
+Rplots.pdf
diff --git a/NEWS.md b/NEWS.md
@@ -1,10 +1,11 @@
 # bayesplot (development version)
 
 * Validate user-provided `pit` values in `ppc_loo_pit_data()` using `validate_pit()` to reject invalid inputs (non-numeric, out of range, NAs) at the entry point instead of in internal helpers.
+* Added unit tests for `ppc_error_data()` and `ppc_loo_pit_data()` covering output structure, argument handling, and edge cases.
+* Added vignette sections demonstrating `*_data()` companion functions for building custom ggplot2 visualizations (#435).
+* Extract `drop_singleton_values()` helper in `mcmc_nuts_treedepth()` to remove duplicated filtering logic.
 * Eliminate redundant data processing in `mcmc_areas_data()` by reusing the prepared MCMC array for both interval and density computation.
-* Validate equal chain lengths in `validate_df_with_chain()`, reject missing
-  chain labels, and renumber data-frame chain labels internally when converting
-  to arrays.
+* Validate equal chain lengths in `validate_df_with_chain()`, reject missing chain labels, and renumber data-frame chain labels internally when converting to arrays.
 * Added unit tests for previously untested edge cases in `param_range()`, `param_glue()`, and `tidyselect_parameters()` (no-match, partial-match, and negation behavior).
 * Bumped minimum version for `rstantools` from `>= 1.5.0` to `>= 2.0.0`.
 * Use `rlang::warn()` and `rlang::inform()` for selected PPC user messages instead of base `warning()` and `message()`.
diff --git a/R/mcmc-diagnostics-nuts.R b/R/mcmc-diagnostics-nuts.R
@@ -369,11 +369,7 @@ mcmc_nuts_treedepth <- function(x, lp, chain = NULL, ...) {
     yaxis_ticks(FALSE)
 
   violin_lp_data <- data.frame(treedepth, lp = lp$Value)
-
-  # Only keep treedepth values that occur more than once for violin plot
-  value_counts <- table(violin_lp_data$Value)
-  keep_values <- names(value_counts[value_counts > 1])
-  violin_lp_data <- violin_lp_data[violin_lp_data$Value %in% keep_values, ]
+  violin_lp_data <- drop_singleton_values(violin_lp_data, "Value")
 
   violin_lp <-
     ggplot(violin_lp_data, aes(x = factor(.data$Value), y = .data$lp)) +
@@ -382,11 +378,7 @@ mcmc_nuts_treedepth <- function(x, lp, chain = NULL, ...) {
     bayesplot_theme_get()
 
   violin_accept_stat_data <- data.frame(treedepth, as = accept_stat$Value)
-
-  # Only keep treedepth values that occur more than once for violin plot
-  value_counts <- table(violin_accept_stat_data$Value)
-  keep_values <- names(value_counts[value_counts > 1])
-  violin_accept_stat_data <- violin_accept_stat_data[violin_accept_stat_data$Value %in% keep_values, ]
+  violin_accept_stat_data <- drop_singleton_values(violin_accept_stat_data, "Value")
 
   violin_accept_stat <-
     ggplot(violin_accept_stat_data, aes(x = factor(.data$Value), y = .data$as)) +
@@ -572,3 +564,11 @@ chain_violin <-
       alpha = alpha
     )
   }
+
+# Drop rows of `df` whose value in column `col` appears only once
+# (singletons cannot produce a violin density estimate).
+#
+# `df` is a data frame and `col` is the name (a string) of the column whose
+# values are counted. Returns `df` restricted to the rows whose `col` value
+# occurs more than once. Rows where `col` is NA are dropped as well, because
+# `table()` does not count NA by default and so NA never ends up in `keep`.
+drop_singleton_values <- function(df, col) {
+  counts <- table(df[[col]])
+  keep <- names(counts[counts > 1])
+  # `drop = FALSE` keeps the result a data frame even when `df` has a single
+  # column; without it the row subset would collapse to a bare vector.
+  df[df[[col]] %in% keep, , drop = FALSE]
+}
diff --git a/tests/testthat/test-ppc-errors.R b/tests/testthat/test-ppc-errors.R
@@ -1,7 +1,7 @@
+skip_if_not_installed("rstantools")
 source(test_path("data-for-ppc-tests.R"))
 
 test_that("ppc_error_hist and ppc_error_scatter return ggplot object", {
-  skip_if_not_installed("rstantools")
   expect_gg(ppc_error_hist(y, yrep[1:5, ], binwidth = 0.1))
   expect_gg(ppc_error_scatter(y, yrep[1:5, ]))
 
@@ -13,14 +13,12 @@ test_that("ppc_error_hist and ppc_error_scatter return ggplot object", {
 })
 
 test_that("ppc_error_hist_grouped returns ggplot object", {
-  skip_if_not_installed("rstantools")
   expect_gg(ppc_error_hist_grouped(y, yrep[1:5, ], group, binwidth = 0.1))
   expect_gg(ppc_error_hist_grouped(y, yrep[1,, drop = FALSE], group,
                                    freq = FALSE, binwidth = 1))
 })
 
 test_that("ppc_error_scatter_avg returns ggplot2 object", {
-  skip_if_not_installed("rstantools")
   expect_gg(ppc_error_scatter_avg(y, yrep))
   expect_gg(ppc_error_scatter_avg(y, yrep[1:5, ]))
 
@@ -30,7 +28,6 @@ test_that("ppc_error_scatter_avg returns ggplot2 object", {
 })
 
 test_that("ppc_error_scatter_avg same as ppc_error_scatter if nrow(yrep) = 1", {
-  skip_if_not_installed("rstantools")
   p1 <- ppc_error_scatter_avg(y2, yrep2)
   p2 <- ppc_error_scatter(y2, yrep2)
   d1 <- p1$data
@@ -42,8 +39,6 @@ test_that("ppc_error_scatter_avg same as ppc_error_scatter if nrow(yrep) = 1", {
 })
 
 test_that("ppc_error_scatter_avg_vs_x returns ggplot2 object", {
-  skip_if_not_installed("rstantools")
-
   # expect warning
   expect_warning(expect_gg(ppc_error_scatter_avg_vs_x(y, yrep, x = rnorm(length(y)))),
                  "'ppc_error_scatter_avg_vs_x' is deprecated.")
@@ -52,7 +47,6 @@ test_that("ppc_error_scatter_avg_vs_x returns ggplot2 object", {
 })
 
 test_that("ppc_error_binned returns ggplot object", {
-  skip_if_not_installed("rstantools")
   load(test_path("data-for-binomial.rda"))
   expect_gg(ppc_error_binned(y, Ey))
   expect_gg(ppc_error_binned(y[1:5], Ey[, 1:5]))
@@ -73,6 +67,24 @@ test_that("bin_errors works for edge cases", {
   expect_equal(ans, val)
 })
 
+# ppc_error_data tests -----------------------------------------------------
+
+test_that("ppc_error_data returns exact structure and computed errors", {
+  # Column layout expected when no grouping variable is supplied.
+  expected_cols <- c("y_id", "y_name", "y_obs", "rep_id", "rep_label", "value")
+  err_data <- ppc_error_data(y, yrep)
+  expect_named(err_data, expected_cols)
+
+  # Rows for replicate 3 should hold y - yrep[3, ] next to the observed y.
+  rep3 <- err_data[err_data$rep_id == 3, ]
+  expect_equal(rep3$value, y - yrep[3, ])
+  expect_equal(rep3$y_obs, y)
+})
+
+test_that("ppc_error_data with group returns exact structure", {
+  # Grouped output is expected to lead with a `group` column.
+  grouped_cols <- c(
+    "group", "y_id", "y_name", "y_obs", "rep_id", "rep_label", "value"
+  )
+  err_data <- ppc_error_data(y, yrep, group = group)
+  expect_named(err_data, grouped_cols)
+
+  # Factor levels must be preserved, and the rows of the first replicate
+  # must reproduce the grouping vector that was passed in.
+  expect_identical(levels(err_data$group), levels(group))
+  first_rep_groups <- err_data$group[err_data$rep_id == 1]
+  expect_equal(first_rep_groups, group)
+})
+
 
 # Visual tests -----------------------------------------------------------------
 
diff --git a/tests/testthat/test-ppc-loo.R b/tests/testthat/test-ppc-loo.R
@@ -358,3 +358,50 @@ test_that("ppc_loo_pit_ecdf renders correctly", {
   )
   vdiffr::expect_doppelganger("ppc_loo_pit_ecdf (ecdf difference)", p_custom)
 })
+
+
+# ppc_loo_pit_data tests ---------------------------------------------------
+
+test_that("ppc_loo_pit_data returns the expected structure for both boundary modes", {
+  set.seed(123)
+  pit_vals <- runif(50)
+  n_samples <- 10
+  n_pit <- length(pit_vals)
+  # Columns shared by both modes; the boundary-corrected output adds "x".
+  base_cols <- c(
+    "y_id", "y_name", "rep_id", "rep_label", "is_y", "is_y_label", "value"
+  )
+
+  # Mode 1: no boundary correction. A message matching "pit" is expected.
+  expect_message(
+    d_raw <- ppc_loo_pit_data(
+      pit = pit_vals,
+      boundary_correction = FALSE,
+      samples = n_samples
+    ),
+    "pit"
+  )
+  expect_s3_class(d_raw, "data.frame")
+  expect_named(d_raw, base_cols)
+  raw_obs <- d_raw[d_raw$is_y, ]
+  raw_sim <- d_raw[!d_raw$is_y, ]
+  # One observed row per PIT value, and `n_samples` simulated rows per value.
+  expect_equal(nrow(raw_obs), n_pit)
+  expect_equal(nrow(raw_sim), n_pit * n_samples)
+  expect_equal(raw_obs$value, pit_vals)
+
+  # Mode 2: boundary correction. Row counts follow the grid length instead
+  # of the number of PIT values.
+  n_grid <- 128
+  expect_message(
+    d_bc <- ppc_loo_pit_data(
+      pit = pit_vals,
+      boundary_correction = TRUE,
+      samples = n_samples,
+      grid_len = n_grid
+    ),
+    "pit"
+  )
+  expect_named(d_bc, c(base_cols, "x"))
+  bc_obs <- d_bc[d_bc$is_y, ]
+  bc_sim <- d_bc[!d_bc$is_y, ]
+  expect_equal(nrow(bc_obs), n_grid)
+  expect_equal(nrow(bc_sim), n_grid * n_samples)
+  expect_false(anyNA(d_bc$x))
+})
diff --git a/vignettes/graphical-ppcs.Rmd b/vignettes/graphical-ppcs.Rmd
@@ -314,6 +314,45 @@ See Figure 8 in [Gabry et al. (2019)](#gabry2019) for another example of using
 
 <br>
 
+## Using `*_data()` functions for custom plots
+
+Many bayesplot plotting functions have a companion `*_data()` function that
+returns the pre-processed data as a tidy data frame instead of a plot. This is
+useful when you want to build a fully custom ggplot2 visualization using the
+same summary statistics that bayesplot computes internally.
+
+For example, `ppc_intervals_data()` returns the quantile summaries that
+`ppc_intervals()` uses:
+
+```{r data_intervals, eval=params$EVAL}
+d <- ppc_intervals_data(y, yrep_nb, prob = 0.5, prob_outer = 0.9)
+head(d)
+```
+
+You can then use this data to create your own plot:
+
+```{r data_intervals_custom, eval=params$EVAL}
+ggplot(d, aes(x = x, y = m)) +
+  geom_linerange(aes(ymin = ll, ymax = hh), color = "skyblue", linewidth = 0.6) +
+  geom_linerange(aes(ymin = l, ymax = h), color = "steelblue", linewidth = 1.2) +
+  geom_point(aes(y = y_obs), shape = 21, fill = "red", size = 1.5) +
+  labs(title = "Custom interval plot from ppc_intervals_data()",
+       x = "Observation", y = "Value") +
+  theme_minimal()
+```
+
+Similarly, `ppc_stat_data()` returns the computed test statistics:
+
+```{r data_stat, eval=params$EVAL, message=FALSE}
+stat_d <- ppc_stat_data(y, yrep_nb, stat = "median")
+head(stat_d)
+```
+
+See `available_ppc(plots_only = FALSE)` and `available_mcmc(plots_only = FALSE)`
+for a full list of functions that includes the `*_data()` companions.
+
+<br>
+
 ## Providing an interface to bayesplot PPCs from another package
 
 The **bayesplot** package provides the S3 generic function `pp_check`. Authors of
diff --git a/vignettes/plotting-mcmc-draws.Rmd b/vignettes/plotting-mcmc-draws.Rmd
@@ -367,6 +367,23 @@ mcmc_trace_highlight(posterior, pars = "sigma", highlight = 3)
 ```
 
 
+<br>
+
+## Using `*_data()` functions for custom plots
+
+As with PPC functions, many MCMC plotting functions have `*_data()` companions
+that return the underlying data instead of a plot. For example,
+`mcmc_intervals_data()` returns the quantiles used by `mcmc_intervals()`:
+
+```{r data_intervals_mcmc}
+d <- mcmc_intervals_data(posterior, pars = c("(Intercept)", "sigma"))
+d
+```
+
+This can be used to build fully custom ggplot2 visualizations using the same
+summary statistics that bayesplot computes internally. See
+`available_mcmc(plots_only = FALSE)` for a full list of functions that
+includes the `*_data()` companions.
+
 <br>
 
 ## References

Original file line number	Diff line number	Diff line change
`@@ -16,3 +16,4 @@ release-prep.R`
`16`	`16`
`17`	`17`	`# vscode/positron/etc settings`
`18`	`18`	`.vscode/*`
	`19`	`+Rplots.pdf`