Skip to content

Commit a9f6a81

Browse files
authored
Merge pull request #3 from openpharma/fix-tests
Changed lasso implementation of sim_glmnet so that s='lambda.min' ins…
2 parents 1e0a73f + ff1e972 commit a9f6a81

22 files changed

Lines changed: 190 additions & 54 deletions

.Rbuildignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
^knockofftools\.Rproj$
66
^LICENSE$
77
^README\.md$
8+
^\.Rproj\.user$

DESCRIPTION

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ Imports:
2626
survival (>= 2.44.1.1),
2727
randomForestSRC (>= 2.9.1),
2828
grf (>= 2.3.1),
29-
glasso (>= 1.11),
3029
CVglasso (>= 1.0),
31-
nnet (>= 7.3-18)
32-
RoxygenNote: 7.2.3
30+
nnet (>= 7.3-18),
31+
magrittr (>= 1.5)
32+
RoxygenNote: 7.3.3
3333
Suggests:
3434
knitr,
3535
rmarkdown,

NAMESPACE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,24 @@ export(stat_predictive_causal_forest)
3535
export(stat_predictive_glmnet)
3636
export(stat_random_forest)
3737
export(variable.selections)
38+
importFrom(magrittr,"%>%")
39+
importFrom(stats,as.formula)
40+
importFrom(stats,coef)
41+
importFrom(stats,contrasts)
42+
importFrom(stats,cutree)
43+
importFrom(stats,dist)
44+
importFrom(stats,dnbinom)
45+
importFrom(stats,hclust)
46+
importFrom(stats,ks.test)
47+
importFrom(stats,lm)
48+
importFrom(stats,model.matrix)
49+
importFrom(stats,pnbinom)
50+
importFrom(stats,predict)
51+
importFrom(stats,qqnorm)
52+
importFrom(stats,rbinom)
53+
importFrom(stats,rexp)
54+
importFrom(stats,rmultinom)
55+
importFrom(stats,rnorm)
56+
importFrom(stats,runif)
57+
importFrom(stats,sd)
58+
importFrom(stats,toeplitz)

R/globals.R

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
utils::globalVariables(c(
2+
"selected",
3+
"variable",
4+
"draw",
5+
"drawclass"
6+
))

R/imports.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#' @keywords internal
2+
#' @importFrom magrittr %>%
3+
#' @importFrom stats as.formula coef contrasts cutree dist dnbinom hclust ks.test lm model.matrix pnbinom predict qqnorm rbinom rexp rmultinom rnorm runif sd toeplitz
4+
NULL

R/internal.R

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ find_single_optimal_variable_set <- function(S, p, trim=0.5) {
133133
#' Do not call this function on its own. Fits cross-validated glmnet model with fixed effect.
134134
#'
135135
#'
136-
#' @param X.fixed a data.frame (or tibble) with "numeric" and "factor" columns corresponding to covariates or terms that should be treated as fixed effects in the model.
136+
#' @param X_fixed a data.frame (or tibble) with "numeric" and "factor" columns corresponding to covariates or terms that should be treated as fixed effects in the model.
137137
#' @param X original data.frame (or tibble) with "numeric" and "factor" columns only. The number of columns, ncol(X) needs to be > 2.
138138
#' @param y response vector with \code{length(y) = nrow(X)}. Accepts "numeric" (family="gaussian") or binary "factor" (family="binomial"). Can also be a survival object of class Surv
139139
#' as obtained from y = survival::Surv(time, status).
@@ -158,7 +158,7 @@ cv_coeffs_glmnet_with_fixed_effect <- function(X_fixed, X, y, family, nlambda=50
158158
X = scale(X)
159159
}
160160

161-
if (!methods::hasArg(lambda) ) {
161+
if (!hasArg(lambda) ) {
162162
if( identical(family, "gaussian") ) {
163163
if(!is.numeric(y)) {
164164
stop('Input y must be numeric.')
@@ -195,13 +195,12 @@ cv_coeffs_glmnet_with_fixed_effect <- function(X_fixed, X, y, family, nlambda=50
195195
#' @param y response vector with \code{length(y) = nrow(X)}. Accepts "numeric" (family="gaussian") or binary "factor" (family="binomial"). Can also be a survival object of class Surv
196196
#' as obtained from y = survival::Surv(time, status).
197197
#' @param type should be "regression" if y is numeric, "classification" if y is a binary factor variable or "survival" if y is a survival object.
198-
#' @param ...
199198
#'
200199
#' @return importance scores
201200
#' @export
202201
#'
203202
#' @keywords internal
204-
random_forest_importance_scores <- function(X, y, trt, type = "regression", ...){
203+
random_forest_importance_scores <- function(X, y, trt, type = "regression"){
205204
# make the column names unique
206205
colnames(X) = make.unique(colnames(X))
207206

@@ -323,16 +322,15 @@ ns.transform <- function(y) {
323322

324323
}
325324

326-
#' Heuristic check for whether a variable can be reasonably treated as continuous
325+
#' Heuristic check for whether numeric variables can be reasonably treated as continuous
327326
#'
328-
#' @param x a numeric variable vector
327+
#' @param X the design matrix of interest with columns either "numeric" or "factor"
329328
#'
330329
#' @return a logical TRUE or FALSE depending on whether n_distinct(x) > 30
331330
#' @export
332331
#'
333332
#' @keywords internal
334333
check_if_continuous <- function(X) {
335-
`%>%` <- magrittr::`%>%`
336334
X_numeric <- dplyr::select_if(X, is.numeric)
337335
is.continuous <- sum(X_numeric %>% lapply(dplyr::n_distinct) %>% unlist() <= 30) > 0
338336
if (is.continuous) warning("Some of the numeric columns of X have suspiciously few distinct values: n_distinct <= 30. Those columns should perhaps not be treated as continuous variables. Please review carefully and read the documentation about the gcm parameter of the knockoff.statistics function.")

R/knockoff_filters.R

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,6 @@ knockoff.statistics <- function(y, X, type="regression",
157157
#' @param y response vector with \code{length(y) = nrow(X)}. Accepts "numeric", binary "factor", or survival ("Surv") object.
158158
#' @param X data.frame (or tibble) with "numeric" and "factor" columns only. The number of columns, ncol(X) needs to be > 2.
159159
#' @param type should be "regression" if y is numeric, "classification" if y is a binary factor variable or "survival" if y is a survival object.
160-
#' @param M the number of independent knockoff feature statistics that should be calculated.
161160
#' @param knockoff.method what type of knockoffs to calculate. Defaults to sequential knockoffs, knockoff.method="seq", but other options are "sparseseq" and "mx". The "mx" option only works if all columns of X are continuous.
162161
#' @param statistic knockoff feature statistic function, defaults to glmnet coefficient difference (statistic="stat_glmnet"; see ?stat_glmnet). Other options include statistic="stat_random_forest" (see ?stat_random_forest), statistic="stat_predictive_glmnet" (see ?stat_predictive_glmnet) or statistic="stat_predictive_causal_forest" (see ?stat_predictive_causal_forest).
163162
#' @param trt binary treatment (factor) variable required if statistic involves a predictive knockoff filter (i.e. if statistic="stat_predictive_glmnet" or statistic="stat_predictive_causal_forest")
@@ -315,7 +314,7 @@ stat_glmnet <- function(y, X, X_k, type = "regression", X.fixed=NULL, penalty.fi
315314
#' @param y response vector with \code{length(y) = nrow(X)}. Accepts "numeric" (family="gaussian") or binary "factor" (family="binomial"). Can also be a survival object of class "Surv" (type="survival")
316315
#' as obtained from y = survival::Surv(time, status).
317316
#' @param type should be "regression" if y is numeric, "classification" if y is a binary factor variable or "survival" if y is a survival object.
318-
#' @param ...
317+
#' @param ... other parameters passed to \code{random_forest_importance_scores}.
319318
#'
320319
#' @return data.frame with knockoff statistics W as column. The number of rows matches the number of columns (variables) of the data.frame X and the variable names are recorded in rownames(W).
321320
#' @export

R/performance.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ eval_fdp <- function(selected, negatives) {
2121
#' True positive proportion (tpp) as function of selection and known positives:
2222
#'
2323
#' @param selected vector of indices of selected variables
24-
#' @param negatives vector of indices of known non-null variables (that influence response)
24+
#' @param positives vector of indices of known non-null variables (that influence response)
2525
#'
2626
#' @return true positive rate
2727
#' @export

R/plot.R

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,25 @@
1+
12
#' Heatmap of multiple variable selections ordered by importance
23
#'
3-
#' @param S data.frame of variable selections from multiple knockoffs (each entry is either 1 if variable is selected and 0 otherwise). Columns correspond to different knockoffs and rows correspond to the underlying variables. row.names(S) records the variable names.
4+
#' @param x data.frame of variable selections from multiple knockoffs
5+
#' (each entry is either 1 if variable is selected and 0 otherwise).
6+
#' Columns correspond to different knockoffs and rows correspond to the
7+
#' underlying variables. row.names(x) records the variable names.
8+
#'
9+
#' @param ... Additional arguments passed to other plot methods (currently ignored).
10+
#'
411
#' @param nbcocluster bivariate vector c(number of variable clusters, number of selection clusters).
5-
#' The former number must be specified less than nrow(S) and the latter must be less than ncol(S).
12+
#' The former number must be specified less than nrow(x) and the latter must be less than ncol(x).
613
#'
714
#' @details To help visualize most important variables we perform clustering both selections and variables.
815
#'
916
#' @return plot of heatmap
17+
#'
18+
#' @method plot variable.selections
1019
#' @export
1120
#'
1221
#' @examples
1322
#' library(knockofftools)
14-
#'
1523
#' set.seed(1)
1624
#'
1725
#' # Simulate 8 Gaussian covariate predictors and 2 binary factors:
@@ -20,19 +28,19 @@
2028
#' # create linear predictor with first 5 beta-coefficients = 1 (all other zero)
2129
#' lp <- generate_lp(X, p_nn = 5, a=1)
2230
#'
23-
#' # Gaussian
24-
#'
25-
#' # Simulate response from a linear model y = lp + epsilon, where epsilon ~ N(0,1):
31+
#' # Simulate response:
2632
#' y <- lp + rnorm(100)
2733
#'
28-
#' # Calculate M independent knockoff feature statistics:
34+
#' # Calculate knockoff statistics:
2935
#' W <- knockoff.statistics(y=y, X=X, type="regression", M=5)
3036
#'
31-
#' S = variable.selections(W, error.type = "pfer", level = 1)
37+
#' S <- variable.selections(W, error.type = "pfer", level = 1)
3238
#'
3339
#' # plot heatmap of knockoff selections:
3440
#' plot(S)
35-
plot.variable.selections <- function(S, nbcocluster=c(7,7)) {
41+
plot.variable.selections <- function(x, ..., nbcocluster=c(7,7)) {
42+
43+
S <- x
3644

3745
if (class(S)[1]!="variable.selections") {
3846
stop("Input S must be of class \'variable.selections\'. Please see ?variable.selections.")
@@ -49,8 +57,6 @@ plot.variable.selections <- function(S, nbcocluster=c(7,7)) {
4957
variable = factor(rownames(S)),
5058
selected = as.numeric(as.matrix(S)))
5159

52-
`%>%` <- dplyr::`%>%`
53-
5460
sel.mat <- matrix(selections$selected,nrow=nrow(S))
5561
hclust.row <- hclust(dist(sel.mat, method="binary"), method="ward.D")
5662
hclust.col <- hclust(dist(t(sel.mat), method="binary"), method="ward.D")

R/simdata-data.R

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#' Simulated dataset for knockofftools
2+
#'
3+
#' @description
4+
#' A synthetic dataset generated by the function \code{generate_simdata()}.
5+
#' It contains simulated Gaussian, binary, and survival outcomes along with covariates.
6+
#'
7+
#' @format A data frame with 2000 rows and 33 variables:
8+
#' \describe{
9+
#' \item{Yg}{Continuous outcome}
10+
#' \item{Yb}{Binary outcome}
11+
#' \item{Tc}{Treatment indicator}
12+
#' \item{X1}{Covariate 1}
13+
#' \item{X2}{Covariate 2}
14+
#' \item{X3}{Covariate 3}
15+
#' \item{X4}{Covariate 4}
16+
#' \item{X5}{Covariate 5}
17+
#' \item{X6}{Covariate 6}
18+
#' \item{X7}{Covariate 7}
19+
#' \item{X8}{Covariate 8}
20+
#' \item{X9}{Covariate 9}
21+
#' \item{X10}{Covariate 10}
22+
#' \item{X11}{Covariate 11}
23+
#' \item{X12}{Covariate 12}
24+
#' \item{X13}{Covariate 13}
25+
#' \item{X14}{Covariate 14}
26+
#' \item{X15}{Covariate 15}
27+
#' \item{X16}{Covariate 16}
28+
#' \item{X17}{Covariate 17}
29+
#' \item{X18}{Covariate 18}
30+
#' \item{X19}{Covariate 19}
31+
#' \item{X20}{Covariate 20}
32+
#' \item{X21}{Covariate 21}
33+
#' \item{X22}{Covariate 22}
34+
#' \item{X23}{Covariate 23}
35+
#' \item{X24}{Covariate 24}
36+
#' \item{X25}{Covariate 25}
37+
#' \item{X26}{Covariate 26}
38+
#' \item{X27}{Covariate 27}
39+
#' \item{X28}{Covariate 28}
40+
#' \item{X29}{Covariate 29}
41+
#' \item{X30}{Covariate 30}
42+
#' }
43+
#'
44+
#' @source Simulated using \code{generate_simdata()}
45+
#'
46+
#' @examples
47+
#' data(simdata)
48+
#' head(simdata)
49+
"simdata"

0 commit comments

Comments
 (0)