Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: ggRandomForests
Type: Package
Title: Visually Exploring Random Forests
Version: 2.7.3.9004
Date: 2026-05-20
Version: 2.7.3.9005
Date: 2026-05-21
Authors@R: person("John", "Ehrlinger",
role = c("aut", "cre"),
email = "john.ehrlinger@gmail.com")
Expand Down
15 changes: 14 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: ggRandomForests
Version: 2.7.3.9004
Version: 2.7.3.9005

ggRandomForests v2.8.0 (development) — continued
=================================================
Expand Down Expand Up @@ -31,6 +31,19 @@ ggRandomForests v2.8.0 (development) — continued

ggRandomForests v2.8.0 (development)
====================================
* **`gg_variable.randomForest` classification fix (#87).**
- `gg_variable.randomForest()` for classification forests now stores
per-class OOB vote fractions as `yhat.<classname>` columns (from
`object$votes`), matching the `rfsrc` path. Previously a single
`yhat` factor column (class labels from `object$predicted`) was
stored, which prevented the multi-class pivot in `plot.gg_variable`
from firing. Vote fractions are row-normalised to `[0, 1]` even
when the forest was fit with `norm.votes = FALSE`.
- `plot.gg_variable` binary classification: `smooth = TRUE` now
correctly maps x/y aesthetics onto the smooth layer.
- `plot.gg_variable` multi-class numeric path: `smooth = TRUE` now
adds a smooth layer (was silently skipped).
- Closes stale issues #81 (fixed in PR #83) and #82.
* **varPro partial dependence: `gg_partial_varpro()` (#84).**
- `gg_partial_varpro()` replaces `gg_partialpro()` as the primary entry
point for varPro partial dependence plots. The new extractor accepts
Expand Down
30 changes: 23 additions & 7 deletions R/gg_variable.R
Original file line number Diff line number Diff line change
Expand Up @@ -271,10 +271,16 @@ gg_variable.randomForest <- function(object,
...) {
arg_list <- list(...)

# randomForest objects do not store OOB predictions in a way that maps back
# to the predictor space, so we always use in-bag (full-forest) predictions.
if (!is.null(arg_list$oob)) {
arg_list$oob <- FALSE
# randomForest uses object$votes (OOB vote matrix) unconditionally — it is the
# only honest per-class probability estimate. In-bag class probabilities are
# not exposed through a consistent randomForest API, so oob=FALSE is not
# supported. Warn the caller rather than silently ignoring the argument.
if (!is.null(arg_list$oob) && identical(arg_list$oob, FALSE)) {
warning(
"oob = FALSE is not supported for randomForest objects: ",
"in-bag class probabilities are unavailable. ",
"OOB vote fractions (object$votes) will be used instead."
)
}

if (!inherits(object, "randomForest")) {
Expand Down Expand Up @@ -307,10 +313,20 @@ gg_variable.randomForest <- function(object,
}

gg_dta <- predictors
# Append the forest's in-bag predicted values.
gg_dta$yhat <- as.vector(object$predicted)
# For classification forests use per-class OOB vote fractions (object$votes),
# stored as yhat.<classname> columns — the same shape gg_variable.rfsrc
# produces. For regression a single numeric yhat column suffices.
if (object$type == "classification") {
gg_dta$yvar <- response
preds <- object$votes # n × n_classes matrix; may be raw counts or fractions
rs <- rowSums(preds)
if (any(rs > 1 + 1e-8, na.rm = TRUE)) {
preds <- preds / rs # normalise raw vote counts to [0, 1]
}
colnames(preds) <- paste0("yhat.", colnames(preds))
gg_dta <- cbind(gg_dta, preds)
gg_dta$yvar <- response
} else {
gg_dta$yhat <- as.vector(object$predicted)
}

# randomForest uses object$type ("classification" / "regression"); the
Expand Down
20 changes: 17 additions & 3 deletions R/plot.gg_variable.R
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,14 @@ plot.gg_variable <- function(x, # nolint: cyclocomp_linter
gg_dta_y <- gg_dta[, grep("yhat.", colnames(gg_dta))]
lng <- ncol(gg_dta_y)
gg2 <- parallel::mclapply(seq_len(ncol(gg_dta_y)), function(ind) {
cbind(gg_dta_x, yhat = gg_dta_y[, ind], outcome = ind)
cbind(gg_dta_x, yhat = gg_dta_y[, ind],
outcome = sub("^yhat\\.", "", colnames(gg_dta_y)[ind]))
})
gg3 <- do.call(rbind, gg2)
gg3$outcome <- factor(gg3$outcome)
# Use column order from gg_dta_y (not alphabetical) so facet panels
# appear in the same order as the model's class levels.
outcome_levels <- sub("^yhat\\.", "", colnames(gg_dta_y))
gg3$outcome <- factor(gg3$outcome, levels = outcome_levels)
gg_dta <- gg3
}
}
Expand Down Expand Up @@ -516,7 +520,10 @@ plot.gg_variable <- function(x, # nolint: cyclocomp_linter
}
if (smooth) {
gg_plt[[ind]] <- gg_plt[[ind]] +
ggplot2::geom_smooth(...)
ggplot2::geom_smooth(
ggplot2::aes(x = .data$var, y = .data$yhat),
...
)
}
} else {
# Factor predictor: jitter + boxplot coloured by observed class
Expand Down Expand Up @@ -550,6 +557,13 @@ plot.gg_variable <- function(x, # nolint: cyclocomp_linter
),
...
)
if (smooth) {
gg_plt[[ind]] <- gg_plt[[ind]] +
ggplot2::geom_smooth(
ggplot2::aes(x = .data$var, y = .data$yhat),
...
)
}
} else {
gg_plt[[ind]] <- gg_plt[[ind]] +
ggplot2::geom_boxplot(
Expand Down
124 changes: 124 additions & 0 deletions tests/testthat/test_gg_variable.R
Original file line number Diff line number Diff line change
Expand Up @@ -368,3 +368,127 @@ test_that("gg_variable.randomForest classification: class attr uses 'class' not
expect_false("classification" %in% class(gg_dta))
expect_s3_class(gg_dta, "gg_variable")
})

## ── randomForest classification (PR #87) ─────────────────────────────────────

test_that("gg_variable.randomForest classification: produces yhat.* columns not yhat", {
skip_if_not_installed("randomForest")
set.seed(42L)
rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L)
gg <- gg_variable(rf)
# Must have one column per class
expect_true(all(c("yhat.setosa", "yhat.versicolor", "yhat.virginica")
%in% names(gg)))
# Must NOT have a bare yhat column for multi-class
expect_false("yhat" %in% names(gg))
# Observed-class column must be present
expect_true("yvar" %in% names(gg))
# Vote fractions must be in [0, 1] and row-sum to ~1
vote_cols <- c("yhat.setosa", "yhat.versicolor", "yhat.virginica")
expect_true(all(gg[, vote_cols] >= 0))
expect_true(all(gg[, vote_cols] <= 1))
expect_true(all(abs(rowSums(gg[, vote_cols]) - 1) < 1e-6))
})

test_that("gg_variable.randomForest classification: plot returns patchwork for all xvar", {
skip_if_not_installed("randomForest")
set.seed(42L)
rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L)
gg <- gg_variable(rf)
p <- plot(gg)
# iris has 4 predictors so the no-xvar default assembles a multi-panel
# patchwork; assert patchwork specifically to catch regressions to a bare
# list (#80).
expect_s3_class(p, "patchwork")
})

test_that("gg_variable.randomForest classification: layer_data works on single-xvar plot", {
skip_if_not_installed("randomForest")
set.seed(42L)
rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L)
gg <- gg_variable(rf)
p <- plot(gg, xvar = "Sepal.Length")
expect_no_error(ggplot2::layer_data(p, 1L))
})

test_that("gg_variable.randomForest classification: norm.votes=FALSE still gives [0,1] fractions", {
skip_if_not_installed("randomForest")
set.seed(42L)
rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L,
norm.votes = FALSE)
gg <- gg_variable(rf)
vote_cols <- c("yhat.setosa", "yhat.versicolor", "yhat.virginica")
expect_true(all(c("yhat.setosa", "yhat.versicolor", "yhat.virginica") %in% names(gg)))
expect_true(all(gg[, vote_cols] >= 0))
expect_true(all(gg[, vote_cols] <= 1))
expect_true(all(abs(rowSums(gg[, vote_cols]) - 1) < 1e-6))
})

test_that("plot.gg_variable RF classification: smooth=TRUE layer_data smokeable (binary smooth aes bug)", {
skip_if_not_installed("randomForest")
# Two-class subset to exercise the *binary* classification path
set.seed(42L)
bin_data <- iris[iris$Species != "virginica", ]
bin_data$Species <- droplevels(bin_data$Species)
rf <- randomForest::randomForest(Species ~ ., data = bin_data, ntree = 50L)
gg <- gg_variable(rf)
p <- plot(gg, xvar = "Sepal.Length", smooth = TRUE)
# Before the fix, geom_smooth(...) has no aes and layer_data errors with
# "stat_smooth() requires the following missing aesthetics: x and y"
expect_no_error(ggplot2::layer_data(p, 2L))
})

test_that("plot.gg_variable RF classification: smooth=TRUE works for multi-class (missing block)", {
skip_if_not_installed("randomForest")
set.seed(42L)
rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L)
gg <- gg_variable(rf)
# Before the fix the multi-class numeric path silently skips smooth=TRUE
# but does not error; after the fix a smooth layer is present (layer 2).
p <- plot(gg, xvar = "Sepal.Length", smooth = TRUE)
expect_s3_class(p, "ggplot")
ld <- ggplot2::layer_data(p, 2L) # layer 2 = geom_smooth
expect_gt(nrow(ld), 0L)
})

test_that("plot.gg_variable RF classification multi-class: outcome column is class names not integers", {
skip_if_not_installed("randomForest")
set.seed(42L)
rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L)
gg <- gg_variable(rf)
p <- plot(gg, xvar = "Sepal.Length")
expect_s3_class(p, "ggplot")
# The 'outcome' column in the plot data drives facet labels.
# It must contain class names, not integer indices.
# ggplot2 >= 3.5 uses S7 slots; fall back to $ accessor for older versions.
pd <- tryCatch(p@data, error = function(e) p$data)
expect_false(is.numeric(pd$outcome))
expect_true(all(c("setosa", "versicolor", "virginica") %in% as.character(pd$outcome)))
})

test_that("plot.gg_variable RF classification multi-class: outcome factor levels match column order", {
skip_if_not_installed("randomForest")
set.seed(42L)
rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L)
gg <- gg_variable(rf)
p <- plot(gg, xvar = "Sepal.Length")
pd <- tryCatch(p@data, error = function(e) p$data)
# Levels must follow the yhat.* column order in gg_variable output,
# not alphabetical order (which factor() would impose by default).
expected_levels <- sub("^yhat\\.", "", grep("^yhat\\.", names(gg), value = TRUE))
expect_equal(levels(pd$outcome), expected_levels)
})

test_that("gg_variable.randomForest: oob=FALSE triggers a warning", {
skip_if_not_installed("randomForest")
set.seed(42L)
rf <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L)
# oob=FALSE is not supported for randomForest; a warning must be emitted
# and OOB vote fractions are still returned.
expect_warning(
gg <- gg_variable(rf, oob = FALSE),
regexp = "oob = FALSE is not supported"
)
expect_s3_class(gg, "gg_variable")
expect_true("yhat.setosa" %in% names(gg))
})
23 changes: 23 additions & 0 deletions tests/testthat/test_snapshots.R
Original file line number Diff line number Diff line change
Expand Up @@ -274,4 +274,27 @@ local({
})
}

## ── randomForest classification snapshots (PR #87) ───────────────────────────
if (requireNamespace("randomForest", quietly = TRUE)) {
local({
set.seed(42L)
rf_iris <- randomForest::randomForest(Species ~ ., data = iris, ntree = 50L)
gg_iris <- gg_variable(rf_iris)

test_that("snapshot: gg-variable-rf-classification-default", {
vdiffr::expect_doppelganger(
"gg-variable-rf-classification-default",
plot(gg_iris)
)
})

test_that("snapshot: gg-variable-rf-classification-smooth", {
vdiffr::expect_doppelganger(
"gg-variable-rf-classification-smooth",
plot(gg_iris, xvar = "Sepal.Length", smooth = TRUE)
)
})
})
}

} # end CI guard
Loading