Skip to content

Commit 187a3d1

Browse files
authored
Merge pull request #9 from InsightRX/feat/categorical-mapping
feat: add categorical_mapping arg to reformat_data_* functions
2 parents 3b03fac + de67ee9 commit 187a3d1

17 files changed

Lines changed: 661 additions & 17 deletions

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: irxforge
22
Title: Forging data for pharmacometric analyses
3-
Version: 0.0.0.9006
3+
Version: 0.0.0.9009
44
Authors@R: c(
55
person("Ron", "Keizer", email = "ron@insight-rx.com", role = c("cre", "aut")),
66
person("Michael", "McCarthy", email = "michael.mccarthy@insight-rx.com", role = "ctb"),

R/apply_categorical_mapping.R

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
#' Apply categorical-to-numeric encoding for specified columns
2+
#'
3+
#' Internal helper used by `reformat_data_*` functions to convert character or
4+
#' factor columns to numeric values according to a user-supplied mapping or
5+
#' automatic frequency-based encoding.
6+
#'
7+
#' @param data A data.frame.
8+
#' @param categorical_mapping Either:
9+
#' - A character vector of column names to auto-encode (most common value
10+
#' gets 0, next most common gets 1, etc.).
11+
#' - A data.frame with columns `column`, `original_value`, `encoded_value`
12+
#' (case-insensitive) specifying explicit mappings. Values in the data not
13+
#' covered by the mapping receive continuation integers starting from
14+
#' `max(encoded_value) + 1`.
15+
#' - `NULL` (default) to skip encoding.
16+
#'
17+
#' NA values are always encoded as -99.
18+
#'
19+
#' @returns The input `data` with specified columns converted to numeric. A
20+
#' `"categorical_mapping"` attribute is attached: a data.frame with columns
21+
#' `column`, `original_value`, `encoded_value` describing the full mapping
22+
#' used.
23+
#'
24+
#' @keywords internal
25+
apply_categorical_mapping <- function(data, categorical_mapping = NULL) {
26+
if (is.null(categorical_mapping)) {
27+
return(data)
28+
}
29+
30+
if (is.character(categorical_mapping)) {
31+
mapping <- apply_categorical_mapping_auto(data, categorical_mapping)
32+
} else if (is.data.frame(categorical_mapping)) {
33+
mapping <- apply_categorical_mapping_manual(data, categorical_mapping)
34+
} else {
35+
stop(
36+
"`categorical_mapping` must be a character vector or a data.frame, ",
37+
"got ", class(categorical_mapping)[1], "."
38+
)
39+
}
40+
41+
# Apply the mapping to data
42+
data <- apply_mapping_to_data(data, mapping)
43+
44+
attr(data, "categorical_mapping") <- mapping
45+
data
46+
}
47+
48+
#' Build an automatic frequency-based mapping
49+
#' @param data A data.frame
50+
#' @param columns Character vector of column names
51+
#' @returns A data.frame with columns `column`, `original_value`, `encoded_value`
52+
#' @keywords internal
53+
apply_categorical_mapping_auto <- function(data, columns) {
54+
mappings <- list()
55+
56+
for (col in columns) {
57+
if (!col %in% names(data)) {
58+
warning("Column '", col, "' not found in data, skipping.")
59+
next
60+
}
61+
62+
vals <- data[[col]]
63+
if (is.factor(vals)) vals <- as.character(vals)
64+
65+
# Frequency table excluding NAs, sorted descending
66+
freq <- sort(table(vals, useNA = "no"), decreasing = TRUE)
67+
class_names <- names(freq)
68+
69+
mappings[[length(mappings) + 1]] <- data.frame(
70+
column = col,
71+
original_value = class_names,
72+
encoded_value = seq_along(class_names) - 1L,
73+
stringsAsFactors = FALSE
74+
)
75+
}
76+
77+
if (length(mappings) == 0) {
78+
return(data.frame(
79+
column = character(0),
80+
original_value = character(0),
81+
encoded_value = integer(0),
82+
stringsAsFactors = FALSE
83+
))
84+
}
85+
86+
do.call(rbind, mappings)
87+
}
88+
89+
#' Build a mapping from a user-supplied data.frame, filling in unmapped classes
90+
#' @param data A data.frame
91+
#' @param mapping_df A data.frame with columns column/original_value/encoded_value
92+
#' @returns A data.frame with columns `column`, `original_value`, `encoded_value`
93+
#' @keywords internal
94+
apply_categorical_mapping_manual <- function(data, mapping_df) {
95+
# Normalize column names to lowercase for matching
96+
names(mapping_df) <- tolower(names(mapping_df))
97+
98+
required_cols <- c("column", "original_value", "encoded_value")
99+
missing_cols <- setdiff(required_cols, names(mapping_df))
100+
if (length(missing_cols) > 0) {
101+
stop(
102+
"`categorical_mapping` data.frame must contain columns: ",
103+
paste(required_cols, collapse = ", "),
104+
". Missing: ", paste(missing_cols, collapse = ", "), "."
105+
)
106+
}
107+
108+
# Ensure consistent types
109+
mapping_df$column <- as.character(mapping_df$column)
110+
mapping_df$original_value <- as.character(mapping_df$original_value)
111+
mapping_df$encoded_value <- as.numeric(mapping_df$encoded_value)
112+
113+
mappings <- list()
114+
115+
for (col in unique(mapping_df$column)) {
116+
if (!col %in% names(data)) {
117+
warning("Column '", col, "' not found in data, skipping.")
118+
next
119+
}
120+
121+
col_mapping <- mapping_df[mapping_df$column == col, , drop = FALSE]
122+
123+
vals <- data[[col]]
124+
if (is.factor(vals)) vals <- as.character(vals)
125+
126+
# Find unmapped non-NA values
127+
mapped_values <- col_mapping$original_value
128+
unique_vals <- unique(vals[!is.na(vals)])
129+
unmapped <- setdiff(unique_vals, mapped_values)
130+
131+
if (length(unmapped) > 0) {
132+
# Sort unmapped by descending frequency
133+
freq <- sort(table(vals[vals %in% unmapped], useNA = "no"), decreasing = TRUE)
134+
unmapped_sorted <- names(freq)
135+
136+
next_value <- max(col_mapping$encoded_value) + 1L
137+
138+
extra_mapping <- data.frame(
139+
column = col,
140+
original_value = unmapped_sorted,
141+
encoded_value = seq(next_value, length.out = length(unmapped_sorted)),
142+
stringsAsFactors = FALSE
143+
)
144+
145+
col_mapping <- rbind(
146+
col_mapping[, required_cols, drop = FALSE],
147+
extra_mapping
148+
)
149+
} else {
150+
col_mapping <- col_mapping[, required_cols, drop = FALSE]
151+
}
152+
153+
mappings[[length(mappings) + 1]] <- col_mapping
154+
}
155+
156+
if (length(mappings) == 0) {
157+
return(data.frame(
158+
column = character(0),
159+
original_value = character(0),
160+
encoded_value = integer(0),
161+
stringsAsFactors = FALSE
162+
))
163+
}
164+
165+
result <- do.call(rbind, mappings)
166+
rownames(result) <- NULL
167+
result
168+
}
169+
170+
#' Apply a mapping data.frame to the data columns
171+
#' @param data A data.frame
172+
#' @param mapping A data.frame with columns column/original_value/encoded_value
173+
#' @returns The modified data.frame
174+
#' @keywords internal
175+
apply_mapping_to_data <- function(data, mapping) {
176+
for (col in unique(mapping$column)) {
177+
if (!col %in% names(data)) next
178+
179+
col_map <- mapping[mapping$column == col, ]
180+
lookup <- stats::setNames(col_map$encoded_value, col_map$original_value)
181+
182+
vals <- as.character(data[[col]])
183+
184+
# Map values: use lookup for known values, -99 for NA or unmapped
185+
new_vals <- numeric(length(vals))
186+
for (i in seq_along(vals)) {
187+
if (is.na(vals[i])) {
188+
new_vals[i] <- -99
189+
} else if (vals[i] %in% names(lookup)) {
190+
new_vals[i] <- lookup[vals[i]]
191+
} else {
192+
# Should not happen if mapping was built correctly, but be safe
193+
new_vals[i] <- -99
194+
}
195+
}
196+
197+
data[[col]] <- new_vals
198+
}
199+
200+
data
201+
}

R/reformat_data.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717
#' `DS`, following the SDTM structure and nomenclature.
1818
#' @param output_type type of output dataset. Can be either `"nca"` or
1919
#' `"modeling"`.
20-
#' @param ... passed onto specific reformatting functions:
20+
#' @param ... passed onto specific reformatting functions. All child functions
21+
#' accept `categorical_mapping` (character vector or data.frame) for converting
22+
#' categorical columns to numeric. See individual function docs for details.
23+
#' Additional arguments passed to specific reformatting functions:
2124
#' - `input_type = "nca"` and `output_type = "modeling"`:
2225
#' [reformat_data_nca_to_modeling()]
2326
#' - `input_type = "sdtm"` and `output_type = "modeling"`:

R/reformat_data_modeling_to_modeling.R

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,14 @@
22
#' dataset.
33
#'
44
#' @param data dataset formatted as modeling-ready dataset
5-
#' @param dictionary a data dictionary that maps expected variable names to
5+
#' @param dictionary a data dictionary that maps expected variable names to
66
#' variables in the data.
7+
#' @param categorical_mapping Either a character vector of column names to
8+
#' auto-encode (most common value gets 0, next gets 1, etc.), or a data.frame
9+
#' with columns `column`, `original_value`, `encoded_value` for explicit
10+
#' mappings. NA values are encoded as -99. The final mapping is attached as a
11+
#' `"categorical_mapping"` attribute on the returned data.frame. Default `NULL`
12+
#' skips encoding.
713
#' @param na what to set NA values to. E.g. ".", (default) or NA (keep NA),
814
#' or NULL (do nothing).
915
#'
@@ -14,6 +20,7 @@
1420
reformat_data_modeling_to_modeling <- function(
1521
data,
1622
dictionary = NULL,
23+
categorical_mapping = NULL,
1724
na = "."
1825
) {
1926

@@ -38,11 +45,18 @@ reformat_data_modeling_to_modeling <- function(
3845
}
3946
}
4047

48+
## Apply categorical encoding
49+
data <- apply_categorical_mapping(data, categorical_mapping)
50+
cat_map <- attr(data, "categorical_mapping")
51+
4152
## Convert NA's to dots (or something else)
4253
if(!is.null(na)) {
4354
data <- data |>
4455
dplyr::mutate(dplyr::across(dplyr::everything(), ~ifelse(is.na(.) | . == "NA", na, .)))
4556
}
46-
57+
58+
## Preserve categorical mapping attribute (dplyr may strip it)
59+
if (!is.null(cat_map)) attr(data, "categorical_mapping") <- cat_map
60+
4761
data
4862
}

R/reformat_data_nca_to_modeling.R

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@
1616
#' as `ceiling(max(observation_time) / interval)`. Only applies to column-wise
1717
#' dose data. Default `NULL` preserves existing behavior (no ADDL/II columns).
1818
#' Examples: `list(interval = 12)` or `list(n = 5, interval = 12)`.
19+
#' @param categorical_mapping Either a character vector of column names to
20+
#' auto-encode (most common value gets 0, next gets 1, etc.), or a data.frame
21+
#' with columns `column`, `original_value`, `encoded_value` for explicit
22+
#' mappings. NA values are encoded as -99. The final mapping is attached as a
23+
#' `"categorical_mapping"` attribute on the returned data.frame. Default `NULL`
24+
#' skips explicit encoding (existing blanket conversion still applies).
1925
#'
2026
#' @returns data.frame with population PK input data in NONMEM-style
2127
#' format.
@@ -34,6 +40,7 @@ reformat_data_nca_to_modeling <- function(
3440
obs_compartment = 1,
3541
covariates = NULL,
3642
repeat_doses = NULL,
43+
categorical_mapping = NULL,
3744
na = "."
3845
) {
3946

@@ -131,25 +138,34 @@ reformat_data_nca_to_modeling <- function(
131138
dplyr::select("ID", "TIME", "CMT", "EVID", "MDV", "DV", "AMT", dplyr::any_of(c("ADDL", "II")), "GROUP", "ORIGID", !!covariates) |>
132139
dplyr::arrange(.data$GROUP, .data$ID, .data$TIME, -.data$EVID)
133140

134-
## Convert all character columns to categorical (but numeric)
141+
## Apply user-specified categorical encoding
142+
comb <- apply_categorical_mapping(comb, categorical_mapping)
143+
cat_map <- attr(comb, "categorical_mapping")
144+
already_encoded <- if (!is.null(cat_map)) unique(cat_map$column) else character(0)
145+
146+
## Convert remaining character columns to categorical (but numeric)
135147
for(key in names(comb)) {
148+
if (key %in% already_encoded) next
136149
if(! inherits(comb[[key]], "numeric")) {
137150
suppressWarnings(
138151
comb[[key]] <- match(comb[[key]], unique(comb[[key]]))
139152
)
140153
}
141154
}
142-
155+
143156
## Remove any observations with DV = -99
144157
comb <- comb |>
145158
dplyr::filter(.data$DV != -99)
146-
159+
147160
## Convert NA's to dots or something else
148161
if(!is.null(na)) {
149162
comb <- comb |>
150163
dplyr::mutate(dplyr::across(dplyr::everything(), ~ifelse(is.na(.) | . == "NA", na, .)))
151164
}
152-
165+
166+
## Preserve categorical mapping attribute (dplyr may strip it)
167+
if (!is.null(cat_map)) attr(comb, "categorical_mapping") <- cat_map
168+
153169
## Return
154170
comb
155171

R/reformat_data_sdtm_to_modeling.R

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,27 @@
11
#' Reformat SDTM datasets into NONMEM-style modeling dataset
22
#'
33
#' @param data list containing data.frames with SDTM domains
4-
#' @param dictionary a data dictionary that maps expected variable names to
4+
#' @param dictionary a data dictionary that maps expected variable names to
55
#' variables in the data.
6+
#' @param categorical_mapping Either a character vector of column names to
7+
#' auto-encode (most common value gets 0, next gets 1, etc.), or a data.frame
8+
#' with columns `column`, `original_value`, `encoded_value` for explicit
9+
#' mappings. NA values are encoded as -99. The final mapping is attached as a
10+
#' `"categorical_mapping"` attribute on the returned data.frame. Default `NULL`
11+
#' skips encoding.
612
#' @param na what to set NA values to. E.g. ".", (default) or NA (keep NA),
713
#' or NULL (do nothing).
814
#'
915
#' @returns data.frame with population PK input data in NONMEM-style
10-
#' format. It will also add the non-standard columns ROUTE ("oral", "iv") and
11-
#' FORM (formulation: "tablet", "suspension", "patch", "infusion", etc.) with
16+
#' format. It will also add the non-standard columns ROUTE ("oral", "iv") and
17+
#' FORM (formulation: "tablet", "suspension", "patch", "infusion", etc.) with
1218
#' values for each dose and NA for observations.
13-
#'
19+
#'
1420
#' @export
1521
reformat_data_sdtm_to_modeling <- function(
16-
data,
22+
data,
1723
dictionary,
24+
categorical_mapping = NULL,
1825
na = "."
1926
) {
2027

@@ -504,12 +511,19 @@ reformat_data_sdtm_to_modeling <- function(
504511
) %>%
505512
dplyr::filter(!(is.na(.data$DV) & .data$EVID == 0)) # filter out observation rows with missing DV
506513

514+
## Apply categorical encoding
515+
poppk_data <- apply_categorical_mapping(poppk_data, categorical_mapping)
516+
cat_map <- attr(poppk_data, "categorical_mapping")
517+
507518
## Convert NA's to dots (or something else)
508519
if(!is.null(na)) {
509520
poppk_data <- poppk_data |>
510521
dplyr::mutate(dplyr::across(dplyr::everything(), ~ifelse(is.na(.) | . == "NA", na, .)))
511522
}
512-
523+
524+
## Preserve categorical mapping attribute (dplyr may strip it)
525+
if (!is.null(cat_map)) attr(poppk_data, "categorical_mapping") <- cat_map
526+
513527
poppk_data
514528
}
515529

0 commit comments

Comments
 (0)