InsightRX
diff --git a/‎DESCRIPTION‎
Lines changed: 1 addition & 1 deletion b/‎DESCRIPTION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎R/apply_categorical_mapping.R‎
Lines changed: 201 additions & 0 deletions b/‎R/apply_categorical_mapping.R‎
Lines changed: 201 additions & 0 deletions
diff --git a/‎R/reformat_data.R‎
Lines changed: 4 additions & 1 deletion b/‎R/reformat_data.R‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎R/reformat_data_modeling_to_modeling.R‎
Lines changed: 16 additions & 2 deletions b/‎R/reformat_data_modeling_to_modeling.R‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎R/reformat_data_nca_to_modeling.R‎
Lines changed: 20 additions & 4 deletions b/‎R/reformat_data_nca_to_modeling.R‎
Lines changed: 20 additions & 4 deletions
diff --git a/‎R/reformat_data_sdtm_to_modeling.R‎
Lines changed: 20 additions & 6 deletions b/‎R/reformat_data_sdtm_to_modeling.R‎
Lines changed: 20 additions & 6 deletions
@@ -1,6 +1,6 @@
 Package: irxforge
 Title: Forging data for pharmacometric analyses
-Version: 0.0.0.9006
+Version: 0.0.0.9009
 Authors@R: c(
     person("Ron", "Keizer", email = "ron@insight-rx.com", role = c("cre", "aut")),
     person("Michael", "McCarthy", email = "michael.mccarthy@insight-rx.com", role = "ctb"),
 
@@ -0,0 +1,201 @@
+#' Encode categorical columns as numbers
+#'
+#' Internal helper for the `reformat_data_*` functions. Builds a mapping from
+#' original values to numeric codes (automatically or from a user-supplied
+#' table), recodes the affected columns, and records the mapping on the result.
+#'
+#' @param data A data.frame.
+#' @param categorical_mapping One of:
+#'   - `NULL` (default): no encoding; `data` is returned as-is, without a
+#'     `"categorical_mapping"` attribute.
+#'   - A character vector of column names to encode automatically by
+#'     frequency (most common value gets 0, the next most common 1, etc.).
+#'   - A data.frame with columns `column`, `original_value`, `encoded_value`
+#'     (names matched case-insensitively) giving explicit codes. Values present
+#'     in the data but absent from the table get continuation codes starting
+#'     at `max(encoded_value) + 1`.
+#'
+#'   Any other type raises an error. `NA` values are encoded as -99.
+#'
+#' @returns `data` with the selected columns replaced by numeric codes, plus a
+#'   `"categorical_mapping"` attribute: a data.frame with columns `column`,
+#'   `original_value`, `encoded_value` listing every code that was used.
+#'
+#' @keywords internal
+apply_categorical_mapping <- function(data, categorical_mapping = NULL) {
+  # Nothing requested: hand the data back untouched (no attribute is added).
+  if (is.null(categorical_mapping)) {
+    return(data)
+  }
+
+  # Validate the argument type up front so the dispatch below is two-way.
+  use_auto <- is.character(categorical_mapping)
+  use_manual <- is.data.frame(categorical_mapping)
+  if (!use_auto && !use_manual) {
+    stop(
+      "`categorical_mapping` must be a character vector or a data.frame, ",
+      "got ", class(categorical_mapping)[1], "."
+    )
+  }
+
+  mapping <- if (use_auto) {
+    apply_categorical_mapping_auto(data, categorical_mapping)
+  } else {
+    apply_categorical_mapping_manual(data, categorical_mapping)
+  }
+
+  # Recode the columns, then record the mapping that produced the codes.
+  encoded <- apply_mapping_to_data(data, mapping)
+  attr(encoded, "categorical_mapping") <- mapping
+  encoded
+}
+
+#' Build an automatic frequency-based mapping
+#'
+#' For each requested column, the distinct non-NA values are ranked by
+#' descending frequency and numbered 0, 1, 2, ... in that order. Factor columns
+#' are converted to character first, so only levels that actually occur are
+#' encoded.
+#'
+#' @param data A data.frame
+#' @param columns Character vector of column names. Names that are not in
+#'   `data` are skipped with a warning; repeated names are processed once.
+#' @returns A data.frame with columns `column`, `original_value`,
+#'   `encoded_value`. A column without any non-NA value contributes no rows.
+#'   If nothing could be mapped, a zero-row data.frame with the same columns
+#'   is returned.
+#' @keywords internal
+apply_categorical_mapping_auto <- function(data, columns) {
+  mappings <- list()
+
+  # De-duplicate so a column listed twice does not yield duplicated rows.
+  for (col in unique(columns)) {
+    if (!col %in% names(data)) {
+      warning("Column '", col, "' not found in data, skipping.")
+      next
+    }
+
+    vals <- data[[col]]
+    if (is.factor(vals)) vals <- as.character(vals)
+
+    # Frequency table excluding NAs, sorted descending
+    freq <- sort(table(vals, useNA = "no"), decreasing = TRUE)
+    # as.character() keeps this a (possibly empty) character vector even when
+    # the table has no entries.
+    class_names <- as.character(names(freq))
+
+    # `column` is repeated explicitly: a length-1 value next to zero-length
+    # columns makes data.frame() fail for columns that contain only NA.
+    mappings[[length(mappings) + 1]] <- data.frame(
+      column = rep(col, length(class_names)),
+      original_value = class_names,
+      encoded_value = seq_along(class_names) - 1L,
+      stringsAsFactors = FALSE
+    )
+  }
+
+  if (length(mappings) == 0) {
+    return(data.frame(
+      column = character(0),
+      original_value = character(0),
+      encoded_value = integer(0),
+      stringsAsFactors = FALSE
+    ))
+  }
+
+  do.call(rbind, mappings)
+}
+
+#' Build a mapping from a user-supplied data.frame, filling in unmapped classes
+#'
+#' Column names of `mapping_df` are lower-cased before use, and its three
+#' required columns are coerced to character / character / numeric. For every
+#' mapped column that exists in `data`, values found in the data but not listed
+#' in the mapping are appended in descending order of frequency, with codes
+#' continuing one above the largest non-missing supplied code (or from 0 when
+#' no supplied code is usable).
+#'
+#' @param data A data.frame
+#' @param mapping_df A data.frame with columns column/original_value/encoded_value
+#' @returns A data.frame with columns `column`, `original_value`,
+#'   `encoded_value`. Mapped columns that are absent from `data` are skipped
+#'   with a warning; if none remain, a zero-row data.frame is returned.
+#' @keywords internal
+apply_categorical_mapping_manual <- function(data, mapping_df) {
+  # Normalize column names to lowercase for matching
+  names(mapping_df) <- tolower(names(mapping_df))
+
+  required_cols <- c("column", "original_value", "encoded_value")
+  missing_cols <- setdiff(required_cols, names(mapping_df))
+  if (length(missing_cols) > 0) {
+    stop(
+      "`categorical_mapping` data.frame must contain columns: ",
+      paste(required_cols, collapse = ", "),
+      ". Missing: ", paste(missing_cols, collapse = ", "), "."
+    )
+  }
+
+  # Ensure consistent types
+  mapping_df$column <- as.character(mapping_df$column)
+  mapping_df$original_value <- as.character(mapping_df$original_value)
+  mapping_df$encoded_value <- as.numeric(mapping_df$encoded_value)
+
+  mappings <- list()
+
+  for (col in unique(mapping_df$column)) {
+    if (!col %in% names(data)) {
+      warning("Column '", col, "' not found in data, skipping.")
+      next
+    }
+
+    # `%in%` never returns NA, so rows whose `column` is missing cannot be
+    # pulled in as all-NA rows (which `==` indexing would do).
+    col_mapping <- mapping_df[
+      mapping_df$column %in% col, required_cols, drop = FALSE
+    ]
+
+    vals <- data[[col]]
+    if (is.factor(vals)) vals <- as.character(vals)
+
+    # Find unmapped non-NA values
+    unmapped <- setdiff(unique(vals[!is.na(vals)]), col_mapping$original_value)
+
+    if (length(unmapped) > 0) {
+      # Sort unmapped by descending frequency
+      freq <- sort(
+        table(vals[vals %in% unmapped], useNA = "no"),
+        decreasing = TRUE
+      )
+      unmapped_sorted <- names(freq)
+
+      # Ignore missing codes when looking for the continuation point; a plain
+      # max() would return NA and make seq() fail.
+      known_codes <- col_mapping$encoded_value[!is.na(col_mapping$encoded_value)]
+      next_value <- if (length(known_codes) > 0) max(known_codes) + 1 else 0
+
+      extra_mapping <- data.frame(
+        column = col,
+        original_value = unmapped_sorted,
+        encoded_value = seq(next_value, length.out = length(unmapped_sorted)),
+        stringsAsFactors = FALSE
+      )
+
+      col_mapping <- rbind(col_mapping, extra_mapping)
+    }
+
+    mappings[[length(mappings) + 1]] <- col_mapping
+  }
+
+  if (length(mappings) == 0) {
+    return(data.frame(
+      column = character(0),
+      original_value = character(0),
+      encoded_value = integer(0),
+      stringsAsFactors = FALSE
+    ))
+  }
+
+  result <- do.call(rbind, mappings)
+  rownames(result) <- NULL
+  result
+}
+
+#' Apply a mapping data.frame to the data columns
+#'
+#' Each column named in `mapping` that exists in `data` is converted to
+#' character, looked up against `original_value`, and replaced by the matching
+#' `encoded_value` as a numeric vector. `NA` values and values without a
+#' mapping entry become -99. Columns named in `mapping` but absent from `data`
+#' are ignored.
+#'
+#' @param data A data.frame
+#' @param mapping A data.frame with columns column/original_value/encoded_value
+#' @returns The modified data.frame
+#' @keywords internal
+apply_mapping_to_data <- function(data, mapping) {
+  for (col in unique(mapping$column)) {
+    if (!col %in% names(data)) next
+
+    col_map <- mapping[mapping$column %in% col, , drop = FALSE]
+    vals <- as.character(data[[col]])
+
+    # match() is used instead of indexing a named vector because a name of ""
+    # is never matched by `[`, which would encode empty-string categories as
+    # NA. As with name lookup, the first matching row wins.
+    idx <- match(vals, col_map$original_value)
+    new_vals <- as.numeric(col_map$encoded_value[idx])
+
+    # -99 for NA input (even if the mapping lists an NA original value) and
+    # for values the mapping does not cover.
+    new_vals[is.na(vals) | is.na(idx)] <- -99
+
+    data[[col]] <- new_vals
+  }
+
+  data
+}
@@ -17,7 +17,10 @@
 #'   `DS`,  following the SDTM structure and nomenclature.
 #' @param output_type type of output dataset. Can be either `"nca"` or
 #'   `"modeling"`.
-#' @param ... passed onto specific reformatting functions:
+#' @param ... passed on to specific reformatting functions. All child functions
+#' accept `categorical_mapping` (character vector or data.frame) for converting
+#' categorical columns to numeric. See individual function docs for details.
+#' Additional arguments are passed to the specific reformatting functions:
 #' - `input_type = "nca"` and `output_type = "modeling"`:
 #'   [reformat_data_nca_to_modeling()]
 #' - `input_type = "sdtm"` and `output_type = "modeling"`:
 
@@ -2,8 +2,14 @@
 #' dataset.
 #' 
 #' @param data dataset formatted as modeling-ready dataset
-#' @param dictionary a data dictionary that maps expected variable names to 
+#' @param dictionary a data dictionary that maps expected variable names to
 #' variables in the data.
+#' @param categorical_mapping Either a character vector of column names to
+#' auto-encode (most common value gets 0, next gets 1, etc.), or a data.frame
+#' with columns `column`, `original_value`, `encoded_value` for explicit
+#' mappings. NA values are encoded as -99. The final mapping is attached as a
+#' `"categorical_mapping"` attribute on the returned data.frame. Default `NULL`
+#' skips encoding.
 #' @param na what to set NA values to. E.g. ".", (default) or NA (keep NA),
 #' or NULL (do nothing).
 #' 
@@ -14,6 +20,7 @@
 reformat_data_modeling_to_modeling <- function(
   data,
   dictionary = NULL,
+  categorical_mapping = NULL,
   na = "."
 ) {
 
@@ -38,11 +45,18 @@ reformat_data_modeling_to_modeling <- function(
     }
   }
 
+  ## Apply categorical encoding
+  data <- apply_categorical_mapping(data, categorical_mapping)
+  cat_map <- attr(data, "categorical_mapping")
+
   ## Convert NA's to dots (or something else)
   if(!is.null(na)) {
     data <- data |>
       dplyr::mutate(dplyr::across(dplyr::everything(), ~ifelse(is.na(.) | . == "NA", na, .)))
   }
-  
+
+  ## Preserve categorical mapping attribute (dplyr may strip it)
+  if (!is.null(cat_map)) attr(data, "categorical_mapping") <- cat_map
+
   data
 } 
@@ -16,6 +16,12 @@
 #' as `ceiling(max(observation_time) / interval)`. Only applies to column-wise
 #' dose data. Default `NULL` preserves existing behavior (no ADDL/II columns).
 #' Examples: `list(interval = 12)` or `list(n = 5, interval = 12)`.
+#' @param categorical_mapping Either a character vector of column names to
+#' auto-encode (most common value gets 0, next gets 1, etc.), or a data.frame
+#' with columns `column`, `original_value`, `encoded_value` for explicit
+#' mappings. NA values are encoded as -99. The final mapping is attached as a
+#' `"categorical_mapping"` attribute on the returned data.frame. Default `NULL`
+#' skips explicit encoding (existing blanket conversion still applies).
 #'
 #' @returns data.frame with population PK input data in NONMEM-style
 #' format.
@@ -34,6 +40,7 @@ reformat_data_nca_to_modeling <- function(
   obs_compartment = 1,
   covariates = NULL,
   repeat_doses = NULL,
+  categorical_mapping = NULL,
   na = "."
 ) {
 
@@ -131,25 +138,34 @@ reformat_data_nca_to_modeling <- function(
     dplyr::select("ID", "TIME", "CMT", "EVID", "MDV", "DV", "AMT", dplyr::any_of(c("ADDL", "II")), "GROUP", "ORIGID", !!covariates) |>
     dplyr::arrange(.data$GROUP, .data$ID, .data$TIME, -.data$EVID)
 
-  ## Convert all character columns to categorical (but numeric)
+  ## Apply user-specified categorical encoding
+  comb <- apply_categorical_mapping(comb, categorical_mapping)
+  cat_map <- attr(comb, "categorical_mapping")
+  already_encoded <- if (!is.null(cat_map)) unique(cat_map$column) else character(0)
+
+  ## Convert remaining character columns to categorical (but numeric)
   for(key in names(comb)) {
+    if (key %in% already_encoded) next
     if(! inherits(comb[[key]], "numeric")) {
       suppressWarnings(
         comb[[key]] <- match(comb[[key]], unique(comb[[key]]))
       )
     }
   }
-  
+
   ## Remove any observations with DV = -99
   comb <- comb |>
     dplyr::filter(.data$DV != -99)
-  
+
   ## Convert NA's to dots or something else
   if(!is.null(na)) {
     comb <- comb |>
       dplyr::mutate(dplyr::across(dplyr::everything(), ~ifelse(is.na(.) | . == "NA", na, .)))
   }
-  
+
+  ## Preserve categorical mapping attribute (dplyr may strip it)
+  if (!is.null(cat_map)) attr(comb, "categorical_mapping") <- cat_map
+
   ## Return
   comb
 
 
@@ -1,20 +1,27 @@
 #' Reformat SDTM datasets into NONMEM-style modeling dataset
 #' 
 #' @param data list containing data.frames with SDTM domains
-#' @param dictionary a data dictionary that maps expected variable names to 
+#' @param dictionary a data dictionary that maps expected variable names to
 #' variables in the data.
+#' @param categorical_mapping Either a character vector of column names to
+#' auto-encode (most common value gets 0, next gets 1, etc.), or a data.frame
+#' with columns `column`, `original_value`, `encoded_value` for explicit
+#' mappings. NA values are encoded as -99. The final mapping is attached as a
+#' `"categorical_mapping"` attribute on the returned data.frame. Default `NULL`
+#' skips encoding.
 #' @param na what to set NA values to. E.g. ".", (default) or NA (keep NA),
 #' or NULL (do nothing).
 #'
 #' @returns data.frame with population PK input data in NONMEM-style
-#' format. It will also add the non-standard columns ROUTE ("oral", "iv") and 
-#' FORM (formulation: "tablet", "suspension", "patch", "infusion", etc.) with 
+#' format. It will also add the non-standard columns ROUTE ("oral", "iv") and
+#' FORM (formulation: "tablet", "suspension", "patch", "infusion", etc.) with
 #' values for each dose and NA for observations.
-#' 
+#'
 #' @export
 reformat_data_sdtm_to_modeling <- function(
-  data, 
+  data,
   dictionary,
+  categorical_mapping = NULL,
   na = "."
 ) {
 
@@ -504,12 +511,19 @@ reformat_data_sdtm_to_modeling <- function(
     ) %>%
     dplyr::filter(!(is.na(.data$DV) & .data$EVID == 0)) # filter out observation rows with missing DV
 
+  ## Apply categorical encoding
+  poppk_data <- apply_categorical_mapping(poppk_data, categorical_mapping)
+  cat_map <- attr(poppk_data, "categorical_mapping")
+
   ## Convert NA's to dots (or something else)
   if(!is.null(na)) {
     poppk_data <- poppk_data |>
       dplyr::mutate(dplyr::across(dplyr::everything(), ~ifelse(is.na(.) | . == "NA", na, .)))
   }
-  
+
+  ## Preserve categorical mapping attribute (dplyr may strip it)
+  if (!is.null(cat_map)) attr(poppk_data, "categorical_mapping") <- cat_map
+
   poppk_data
 }