|
| 1 | +#' Apply categorical-to-numeric encoding for specified columns |
| 2 | +#' |
| 3 | +#' Internal helper used by `reformat_data_*` functions to convert character or |
| 4 | +#' factor columns to numeric values according to a user-supplied mapping or |
| 5 | +#' automatic frequency-based encoding. |
| 6 | +#' |
| 7 | +#' @param data A data.frame. |
| 8 | +#' @param categorical_mapping Either: |
| 9 | +#' - A character vector of column names to auto-encode (most common value |
| 10 | +#' gets 0, next most common gets 1, etc.). |
| 11 | +#' - A data.frame with columns `column`, `original_value`, `encoded_value` |
| 12 | +#' (case-insensitive) specifying explicit mappings. Values in the data not |
| 13 | +#' covered by the mapping receive continuation integers starting from |
| 14 | +#' `max(encoded_value) + 1`. |
| 15 | +#' - `NULL` (default) to skip encoding. |
| 16 | +#' |
| 17 | +#' NA values are always encoded as -99. |
| 18 | +#' |
| 19 | +#' @returns The input `data` with specified columns converted to numeric. A |
| 20 | +#' `"categorical_mapping"` attribute is attached: a data.frame with columns |
| 21 | +#' `column`, `original_value`, `encoded_value` describing the full mapping |
| 22 | +#' used. |
| 23 | +#' |
| 24 | +#' @keywords internal |
| 25 | +apply_categorical_mapping <- function(data, categorical_mapping = NULL) { |
| 26 | + if (is.null(categorical_mapping)) { |
| 27 | + return(data) |
| 28 | + } |
| 29 | + |
| 30 | + if (is.character(categorical_mapping)) { |
| 31 | + mapping <- apply_categorical_mapping_auto(data, categorical_mapping) |
| 32 | + } else if (is.data.frame(categorical_mapping)) { |
| 33 | + mapping <- apply_categorical_mapping_manual(data, categorical_mapping) |
| 34 | + } else { |
| 35 | + stop( |
| 36 | + "`categorical_mapping` must be a character vector or a data.frame, ", |
| 37 | + "got ", class(categorical_mapping)[1], "." |
| 38 | + ) |
| 39 | + } |
| 40 | + |
| 41 | + # Apply the mapping to data |
| 42 | + data <- apply_mapping_to_data(data, mapping) |
| 43 | + |
| 44 | + attr(data, "categorical_mapping") <- mapping |
| 45 | + data |
| 46 | +} |
| 47 | + |
| 48 | +#' Build an automatic frequency-based mapping |
| 49 | +#' @param data A data.frame |
| 50 | +#' @param columns Character vector of column names |
| 51 | +#' @returns A data.frame with columns `column`, `original_value`, `encoded_value` |
| 52 | +#' @keywords internal |
| 53 | +apply_categorical_mapping_auto <- function(data, columns) { |
| 54 | + mappings <- list() |
| 55 | + |
| 56 | + for (col in columns) { |
| 57 | + if (!col %in% names(data)) { |
| 58 | + warning("Column '", col, "' not found in data, skipping.") |
| 59 | + next |
| 60 | + } |
| 61 | + |
| 62 | + vals <- data[[col]] |
| 63 | + if (is.factor(vals)) vals <- as.character(vals) |
| 64 | + |
| 65 | + # Frequency table excluding NAs, sorted descending |
| 66 | + freq <- sort(table(vals, useNA = "no"), decreasing = TRUE) |
| 67 | + class_names <- names(freq) |
| 68 | + |
| 69 | + mappings[[length(mappings) + 1]] <- data.frame( |
| 70 | + column = col, |
| 71 | + original_value = class_names, |
| 72 | + encoded_value = seq_along(class_names) - 1L, |
| 73 | + stringsAsFactors = FALSE |
| 74 | + ) |
| 75 | + } |
| 76 | + |
| 77 | + if (length(mappings) == 0) { |
| 78 | + return(data.frame( |
| 79 | + column = character(0), |
| 80 | + original_value = character(0), |
| 81 | + encoded_value = integer(0), |
| 82 | + stringsAsFactors = FALSE |
| 83 | + )) |
| 84 | + } |
| 85 | + |
| 86 | + do.call(rbind, mappings) |
| 87 | +} |
| 88 | + |
| 89 | +#' Build a mapping from a user-supplied data.frame, filling in unmapped classes |
| 90 | +#' @param data A data.frame |
| 91 | +#' @param mapping_df A data.frame with columns column/original_value/encoded_value |
| 92 | +#' @returns A data.frame with columns `column`, `original_value`, `encoded_value` |
| 93 | +#' @keywords internal |
| 94 | +apply_categorical_mapping_manual <- function(data, mapping_df) { |
| 95 | + # Normalize column names to lowercase for matching |
| 96 | + names(mapping_df) <- tolower(names(mapping_df)) |
| 97 | + |
| 98 | + required_cols <- c("column", "original_value", "encoded_value") |
| 99 | + missing_cols <- setdiff(required_cols, names(mapping_df)) |
| 100 | + if (length(missing_cols) > 0) { |
| 101 | + stop( |
| 102 | + "`categorical_mapping` data.frame must contain columns: ", |
| 103 | + paste(required_cols, collapse = ", "), |
| 104 | + ". Missing: ", paste(missing_cols, collapse = ", "), "." |
| 105 | + ) |
| 106 | + } |
| 107 | + |
| 108 | + # Ensure consistent types |
| 109 | + mapping_df$column <- as.character(mapping_df$column) |
| 110 | + mapping_df$original_value <- as.character(mapping_df$original_value) |
| 111 | + mapping_df$encoded_value <- as.numeric(mapping_df$encoded_value) |
| 112 | + |
| 113 | + mappings <- list() |
| 114 | + |
| 115 | + for (col in unique(mapping_df$column)) { |
| 116 | + if (!col %in% names(data)) { |
| 117 | + warning("Column '", col, "' not found in data, skipping.") |
| 118 | + next |
| 119 | + } |
| 120 | + |
| 121 | + col_mapping <- mapping_df[mapping_df$column == col, , drop = FALSE] |
| 122 | + |
| 123 | + vals <- data[[col]] |
| 124 | + if (is.factor(vals)) vals <- as.character(vals) |
| 125 | + |
| 126 | + # Find unmapped non-NA values |
| 127 | + mapped_values <- col_mapping$original_value |
| 128 | + unique_vals <- unique(vals[!is.na(vals)]) |
| 129 | + unmapped <- setdiff(unique_vals, mapped_values) |
| 130 | + |
| 131 | + if (length(unmapped) > 0) { |
| 132 | + # Sort unmapped by descending frequency |
| 133 | + freq <- sort(table(vals[vals %in% unmapped], useNA = "no"), decreasing = TRUE) |
| 134 | + unmapped_sorted <- names(freq) |
| 135 | + |
| 136 | + next_value <- max(col_mapping$encoded_value) + 1L |
| 137 | + |
| 138 | + extra_mapping <- data.frame( |
| 139 | + column = col, |
| 140 | + original_value = unmapped_sorted, |
| 141 | + encoded_value = seq(next_value, length.out = length(unmapped_sorted)), |
| 142 | + stringsAsFactors = FALSE |
| 143 | + ) |
| 144 | + |
| 145 | + col_mapping <- rbind( |
| 146 | + col_mapping[, required_cols, drop = FALSE], |
| 147 | + extra_mapping |
| 148 | + ) |
| 149 | + } else { |
| 150 | + col_mapping <- col_mapping[, required_cols, drop = FALSE] |
| 151 | + } |
| 152 | + |
| 153 | + mappings[[length(mappings) + 1]] <- col_mapping |
| 154 | + } |
| 155 | + |
| 156 | + if (length(mappings) == 0) { |
| 157 | + return(data.frame( |
| 158 | + column = character(0), |
| 159 | + original_value = character(0), |
| 160 | + encoded_value = integer(0), |
| 161 | + stringsAsFactors = FALSE |
| 162 | + )) |
| 163 | + } |
| 164 | + |
| 165 | + result <- do.call(rbind, mappings) |
| 166 | + rownames(result) <- NULL |
| 167 | + result |
| 168 | +} |
| 169 | + |
| 170 | +#' Apply a mapping data.frame to the data columns |
| 171 | +#' @param data A data.frame |
| 172 | +#' @param mapping A data.frame with columns column/original_value/encoded_value |
| 173 | +#' @returns The modified data.frame |
| 174 | +#' @keywords internal |
| 175 | +apply_mapping_to_data <- function(data, mapping) { |
| 176 | + for (col in unique(mapping$column)) { |
| 177 | + if (!col %in% names(data)) next |
| 178 | + |
| 179 | + col_map <- mapping[mapping$column == col, ] |
| 180 | + lookup <- stats::setNames(col_map$encoded_value, col_map$original_value) |
| 181 | + |
| 182 | + vals <- as.character(data[[col]]) |
| 183 | + |
| 184 | + # Map values: use lookup for known values, -99 for NA or unmapped |
| 185 | + new_vals <- numeric(length(vals)) |
| 186 | + for (i in seq_along(vals)) { |
| 187 | + if (is.na(vals[i])) { |
| 188 | + new_vals[i] <- -99 |
| 189 | + } else if (vals[i] %in% names(lookup)) { |
| 190 | + new_vals[i] <- lookup[vals[i]] |
| 191 | + } else { |
| 192 | + # Should not happen if mapping was built correctly, but be safe |
| 193 | + new_vals[i] <- -99 |
| 194 | + } |
| 195 | + } |
| 196 | + |
| 197 | + data[[col]] <- new_vals |
| 198 | + } |
| 199 | + |
| 200 | + data |
| 201 | +} |
0 commit comments