diff --git a/DESCRIPTION b/DESCRIPTION index ebbc16b1..0426059a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ggRandomForests Type: Package Title: Visually Exploring Random Forests -Version: 2.7.3.9004 -Date: 2026-05-20 +Version: 2.7.3.9005 +Date: 2026-05-21 Authors@R: person("John", "Ehrlinger", role = c("aut", "cre"), email = "john.ehrlinger@gmail.com") diff --git a/NEWS.md b/NEWS.md index 5d0a97a4..6271461c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ Package: ggRandomForests -Version: 2.7.3.9004 +Version: 2.7.3.9005 ggRandomForests v2.8.0 (development) — continued ================================================= @@ -31,6 +31,19 @@ ggRandomForests v2.8.0 (development) — continued ggRandomForests v2.8.0 (development) ==================================== +* **`gg_variable.randomForest` classification fix (#87).** + - `gg_variable.randomForest()` for classification forests now stores + per-class OOB vote fractions as `yhat.` columns (from + `object$votes`), matching the `rfsrc` path. Previously a single + `yhat` factor column (class labels from `object$predicted`) was + stored, which prevented the multi-class pivot in `plot.gg_variable` + from firing. Vote fractions are row-normalised to `[0, 1]` even + when the forest was fit with `norm.votes = FALSE`. + - `plot.gg_variable` binary classification: `smooth = TRUE` now + correctly maps x/y aesthetics onto the smooth layer. + - `plot.gg_variable` multi-class numeric path: `smooth = TRUE` now + adds a smooth layer (was silently skipped). + - Closes stale issues #81 (fixed in PR #83) and #82. * **varPro partial dependence: `gg_partial_varpro()` (#84).** - `gg_partial_varpro()` replaces `gg_partialpro()` as the primary entry point for varPro partial dependence plots. The new extractor accepts diff --git a/R/gg_variable.R b/R/gg_variable.R index 6ddcd8dc..f6cc447c 100644 --- a/R/gg_variable.R +++ b/R/gg_variable.R @@ -271,10 +271,16 @@ gg_variable.randomForest <- function(object, ...) { arg_list <- list(...) - # randomForest objects do not store OOB predictions in a way that maps back - # to the predictor space, so we always use in-bag (full-forest) predictions. - if (!is.null(arg_list$oob)) { - arg_list$oob <- FALSE + # randomForest uses object$votes (OOB vote matrix) unconditionally — it is the + # only honest per-class probability estimate. In-bag class probabilities are + # not exposed through a consistent randomForest API, so oob=FALSE is not + # supported. Warn the caller rather than silently ignoring the argument. + if (!is.null(arg_list$oob) && identical(arg_list$oob, FALSE)) { + warning( + "oob = FALSE is not supported for randomForest objects: ", + "in-bag class probabilities are unavailable. ", + "OOB vote fractions (object$votes) will be used instead." + ) } if (!inherits(object, "randomForest")) { @@ -307,10 +313,20 @@ gg_variable.randomForest <- function(object, } gg_dta <- predictors - # Append the forest's in-bag predicted values. - gg_dta$yhat <- as.vector(object$predicted) + # For classification forests use per-class OOB vote fractions (object$votes), + # stored as yhat. columns — the same shape gg_variable.rfsrc + # produces. For regression a single numeric yhat column suffices. if (object$type == "classification") { - gg_dta$yvar <- response + preds <- object$votes # n × n_classes matrix; may be raw counts or fractions + rs <- rowSums(preds) + if (any(rs > 1 + 1e-8, na.rm = TRUE)) { + preds <- preds / rs # normalise raw vote counts to [0, 1] + } + colnames(preds) <- paste0("yhat.", colnames(preds)) + gg_dta <- cbind(gg_dta, preds) + gg_dta$yvar <- response + } else { + gg_dta$yhat <- as.vector(object$predicted) } # randomForest uses object$type ("classification" / "regression"); the diff --git a/R/plot.gg_variable.R b/R/plot.gg_variable.R index e8a7f273..e6d7ae38 100644 --- a/R/plot.gg_variable.R +++ b/R/plot.gg_variable.R @@ -166,10 +166,14 @@ plot.gg_variable <- function(x, # nolint: cyclocomp_linter gg_dta_y <- gg_dta[, grep("yhat.", colnames(gg_dta))] lng <- ncol(gg_dta_y) gg2 <- parallel::mclapply(seq_len(ncol(gg_dta_y)), function(ind) { - cbind(gg_dta_x, yhat = gg_dta_y[, ind], outcome = ind) + cbind(gg_dta_x, yhat = gg_dta_y[, ind], + outcome = sub("^yhat\\.", "", colnames(gg_dta_y)[ind])) }) gg3 <- do.call(rbind, gg2) - gg3$outcome <- factor(gg3$outcome) + # Use column order from gg_dta_y (not alphabetical) so facet panels + # appear in the same order as the model's class levels. + outcome_levels <- sub("^yhat\\.", "", colnames(gg_dta_y)) + gg3$outcome <- factor(gg3$outcome, levels = outcome_levels) gg_dta <- gg3 } } @@ -516,7 +520,10 @@ plot.gg_variable <- function(x, # nolint: cyclocomp_linter } if (smooth) { gg_plt[[ind]] <- gg_plt[[ind]] + - ggplot2::geom_smooth(...) + ggplot2::geom_smooth( + ggplot2::aes(x = .data$var, y = .data$yhat), + ... + ) } } else { # Factor predictor: jitter + boxplot coloured by observed class @@ -550,6 +557,13 @@ plot.gg_variable <- function(x, # nolint: cyclocomp_linter ), ... ) + if (smooth) { + gg_plt[[ind]] <- gg_plt[[ind]] + + ggplot2::geom_smooth( + ggplot2::aes(x = .data$var, y = .data$yhat), + ... + ) + } } else { gg_plt[[ind]] <- gg_plt[[ind]] + ggplot2::geom_boxplot( diff --git a/tests/testthat/test_gg_variable.R b/tests/testthat/test_gg_variable.R index 4c6633d1..e4bd9e40 100644 --- a/tests/testthat/test_gg_variable.R +++ b/tests/testthat/test_gg_variable.R @@ -368,3 +368,127 @@ test_that("gg_variable.randomForest classification: class attr uses 'class' not expect_false("classification" %in% class(gg_dta)) expect_s3_class(gg_dta, "gg_variable") }) + +## ── randomForest classification (PR #87) ───────────────────────────────────── + +test_that("gg_variable.randomForest classification: produces yhat.* columns not yhat", { + skip_if_not_installed("randomForest") + set.seed(42L) + rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L) + gg <- gg_variable(rf) + # Must have one column per class + expect_true(all(c("yhat.setosa", "yhat.versicolor", "yhat.virginica") + %in% names(gg))) + # Must NOT have a bare yhat column for multi-class + expect_false("yhat" %in% names(gg)) + # Observed-class column must be present + expect_true("yvar" %in% names(gg)) + # Vote fractions must be in [0, 1] and row-sum to ~1 + vote_cols <- c("yhat.setosa", "yhat.versicolor", "yhat.virginica") + expect_true(all(gg[, vote_cols] >= 0)) + expect_true(all(gg[, vote_cols] <= 1)) + expect_true(all(abs(rowSums(gg[, vote_cols]) - 1) < 1e-6)) +}) + +test_that("gg_variable.randomForest classification: plot returns patchwork for all xvar", { + skip_if_not_installed("randomForest") + set.seed(42L) + rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L) + gg <- gg_variable(rf) + p <- plot(gg) + # iris has 4 predictors so the no-xvar default assembles a multi-panel + # patchwork; assert patchwork specifically to catch regressions to a bare + # list (#80). + expect_s3_class(p, "patchwork") +}) + +test_that("gg_variable.randomForest classification: layer_data works on single-xvar plot", { + skip_if_not_installed("randomForest") + set.seed(42L) + rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L) + gg <- gg_variable(rf) + p <- plot(gg, xvar = "Sepal.Length") + expect_no_error(ggplot2::layer_data(p, 1L)) +}) + +test_that("gg_variable.randomForest classification: norm.votes=FALSE still gives [0,1] fractions", { + skip_if_not_installed("randomForest") + set.seed(42L) + rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L, + norm.votes = FALSE) + gg <- gg_variable(rf) + vote_cols <- c("yhat.setosa", "yhat.versicolor", "yhat.virginica") + expect_true(all(c("yhat.setosa", "yhat.versicolor", "yhat.virginica") %in% names(gg))) + expect_true(all(gg[, vote_cols] >= 0)) + expect_true(all(gg[, vote_cols] <= 1)) + expect_true(all(abs(rowSums(gg[, vote_cols]) - 1) < 1e-6)) +}) + +test_that("plot.gg_variable RF classification: smooth=TRUE layer_data smokeable (binary smooth aes bug)", { + skip_if_not_installed("randomForest") + # Two-class subset to exercise the *binary* classification path + set.seed(42L) + bin_data <- iris[iris$Species != "virginica", ] + bin_data$Species <- droplevels(bin_data$Species) + rf <- randomForest::randomForest(Species ~ ., data = bin_data, ntree = 50L) + gg <- gg_variable(rf) + p <- plot(gg, xvar = "Sepal.Length", smooth = TRUE) + # Before the fix, geom_smooth(...) has no aes and layer_data errors with + # "stat_smooth() requires the following missing aesthetics: x and y" + expect_no_error(ggplot2::layer_data(p, 2L)) +}) + +test_that("plot.gg_variable RF classification: smooth=TRUE works for multi-class (missing block)", { + skip_if_not_installed("randomForest") + set.seed(42L) + rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L) + gg <- gg_variable(rf) + # Before the fix the multi-class numeric path silently skips smooth=TRUE + # but does not error; after the fix a smooth layer is present (layer 2). + p <- plot(gg, xvar = "Sepal.Length", smooth = TRUE) + expect_s3_class(p, "ggplot") + ld <- ggplot2::layer_data(p, 2L) # layer 2 = geom_smooth + expect_gt(nrow(ld), 0L) +}) + +test_that("plot.gg_variable RF classification multi-class: outcome column is class names not integers", { + skip_if_not_installed("randomForest") + set.seed(42L) + rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L) + gg <- gg_variable(rf) + p <- plot(gg, xvar = "Sepal.Length") + expect_s3_class(p, "ggplot") + # The 'outcome' column in the plot data drives facet labels. + # It must contain class names, not integer indices. + # ggplot2 >= 3.5 uses S7 slots; fall back to $ accessor for older versions. + pd <- tryCatch(p@data, error = function(e) p$data) + expect_false(is.numeric(pd$outcome)) + expect_true(all(c("setosa", "versicolor", "virginica") %in% as.character(pd$outcome))) +}) + +test_that("plot.gg_variable RF classification multi-class: outcome factor levels match column order", { + skip_if_not_installed("randomForest") + set.seed(42L) + rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L) + gg <- gg_variable(rf) + p <- plot(gg, xvar = "Sepal.Length") + pd <- tryCatch(p@data, error = function(e) p$data) + # Levels must follow the yhat.* column order in gg_variable output, + # not alphabetical order (which factor() would impose by default). + expected_levels <- sub("^yhat\\.", "", grep("^yhat\\.", names(gg), value = TRUE)) + expect_equal(levels(pd$outcome), expected_levels) +}) + +test_that("gg_variable.randomForest: oob=FALSE triggers a warning", { + skip_if_not_installed("randomForest") + set.seed(42L) + rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L) + # oob=FALSE is not supported for randomForest; a warning must be emitted + # and OOB vote fractions are still returned. + expect_warning( + gg <- gg_variable(rf, oob = FALSE), + regexp = "oob = FALSE is not supported" + ) + expect_s3_class(gg, "gg_variable") + expect_true("yhat.setosa" %in% names(gg)) +}) diff --git a/tests/testthat/test_snapshots.R b/tests/testthat/test_snapshots.R index 7894ea9b..9c0b7738 100644 --- a/tests/testthat/test_snapshots.R +++ b/tests/testthat/test_snapshots.R @@ -274,4 +274,27 @@ local({ }) } +## ── randomForest classification snapshots (PR #87) ─────────────────────────── +if (requireNamespace("randomForest", quietly = TRUE)) { + local({ + set.seed(42L) + rf_iris <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L) + gg_iris <- gg_variable(rf_iris) + + test_that("snapshot: gg-variable-rf-classification-default", { + vdiffr::expect_doppelganger( + "gg-variable-rf-classification-default", + plot(gg_iris) + ) + }) + + test_that("snapshot: gg-variable-rf-classification-smooth", { + vdiffr::expect_doppelganger( + "gg-variable-rf-classification-smooth", + plot(gg_iris, xvar = "Sepal.Length", smooth = TRUE) + ) + }) + }) +} + } # end CI guard