-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfn-prepare_model_input.R
More file actions
116 lines (98 loc) · 3.29 KB
/
fn-prepare_model_input.R
File metadata and controls
116 lines (98 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
prepare_model_input <- function(name) {
# Load active analyses ---------------------------------------------------------
print("Load active analyses")
active_analyses <- readr::read_rds("lib/active_analyses.rds")
# Filter active_analyses to model inputs to be prepared ------------------------
print("Filter active_analyses to model inputs to be prepared")
active_analyses <- active_analyses[active_analyses$name == name, ]
if (nrow(active_analyses) == 0) {
stop(paste0("Input: ", name, " does not match any analyses"))
}
# Load data ------------------------------------------------------------------
print(paste0("Load data for ", active_analyses$name))
input <- readr::read_rds(paste0(
"output/dataset_clean/input_",
active_analyses$cohort,
"_clean.rds"
))
# Restrict to required variables for dataset preparation ---------------------
print("Restrict to required variables for dataset preparation")
reqvars <- unique(c(
"patient_id",
"index_date",
"end_date_exposure",
"end_date_outcome",
active_analyses$exposure,
active_analyses$outcome,
active_analyses$strata,
active_analyses$covariate_age,
"cov_cat_sex",
"cov_cat_ethnicity",
"cov_cat_smoking",
unlist(strsplit(active_analyses$covariate_other, split = ";")),
c(grep("sub_", colnames(input), value = TRUE)), #sub_cat_covidhospital, sub_cat_covidhistory, and other subgroups
"sup_bin_preex"
))
input <- input[, intersect(reqvars, colnames(input))]
if (length(setdiff(reqvars, colnames(input))) > 0) {
message(
"Variables (",
setdiff(reqvars, colnames(input)),
") not present in dataset"
)
}
# Identify final list of variables to keep -----------------------------------
print("Identify final list of variables to keep")
keep <- c(
"patient_id",
"index_date",
"end_date_exposure",
"end_date_outcome",
"exp_date",
"out_date"
)
varlists <- c("strata", "covariate_age", "covariate_sex", "covariate_other")
for (j in varlists) {
if (active_analyses[, j] != "NULL") {
keep <- c(
keep,
stringr::str_split(as.vector(active_analyses[, j]), ";")[[1]]
)
}
}
# Update end date for outcome and exposure by definition ---------------------
input <- dplyr::rename(
input,
"out_date" = active_analyses$outcome,
"exp_date" = active_analyses$exposure
)
# Remove outcomes outside of follow-up time ----------------------------------
print("Remove outcomes outside of follow-up time")
input <- input %>%
dplyr::mutate(
out_date = replace(
out_date,
which(out_date > end_date_outcome | out_date < index_date),
NA
),
exp_date = replace(
exp_date,
which(exp_date > end_date_exposure | exp_date < index_date),
NA
),
sub_cat_covidhospital = replace(
sub_cat_covidhospital,
which(is.na(exp_date)),
"no_infection"
)
)
# Update end date to be outcome date where applicable ------------------------
print("Update end date to be outcome date where applicable")
input <- input %>%
dplyr::rowwise() %>%
dplyr::mutate(
end_date_outcome = min(end_date_outcome, out_date, na.rm = TRUE)
) %>%
dplyr::ungroup()
return(list(input = input, keep = keep))
}