diff --git a/DESCRIPTION b/DESCRIPTION index 96e2edc..9e8c3b4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,8 @@ Description: Ingests instructions to clean datasets containing results from License: AGPL-3 Encoding: UTF-8 Imports: - data.table + data.table, + logger Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/R/check_metadata.R b/R/check_metadata.R index 984e47a..5d760dc 100644 --- a/R/check_metadata.R +++ b/R/check_metadata.R @@ -58,9 +58,9 @@ check_dataset_model <- function(dataset) { # num_nonnumeric <- sum(invalid_) # if (num_nonnumeric > 0) { # stop(paste("The file 'dataset' has", num_nonnumeric,"record(s) where the variable ", varname," contains a non-numeric value.")) - # } + # } # } - message("[CleanLabValues] Dataset model check passed successfully.") + logger::log_info("[CleanLabValues] Dataset model check passed successfully.") } ####################################################### @@ -83,11 +83,10 @@ check_lab_target_units <- function(lab_target_units) { if (!all(c("concept_id", "unit_target") %in% names(dt))) { stop(paste("The file", lab_target_units, "should be a csv file with data model concept_id,unit_target")) } - message("[CleanLabValues] LAB_target_units check passed successfully.") + logger::log_info("[CleanLabValues] LAB_target_units check passed successfully.") invisible(dt) } - ####################################################### # lab_unit_conversion @@ -114,24 +113,24 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal stop(paste("The file", lab_unit_conversion, "should include the variable", varname, "in its data model")) } } - - # check that whenever multiplication_factor_from_origin_to_target is missing in at least one row, then the variable conversion_not_multiplication exists, and that there is no row where both variables are missing + + # check that whenever multiplication_factor_from_origin_to_target is missing in at least one row, then the variable conversion_not_multiplication exists, and that there is no row where both variables are missing mult_raw <- dt[["multiplication_factor_from_origin_to_target"]] mult_chr <- trimws(as.character(mult_raw)) is_missing_mult <- is.na(mult_raw) | mult_chr == "" nummissing_mult <- sum(is_missing_mult) - + mult_num <- suppressWarnings(as.numeric(mult_chr)) invalid_mult <- !is_missing_mult & is.na(mult_num) num_nonnumeric <- sum(invalid_mult) if (num_nonnumeric > 0) { - stop(paste("The file", lab_unit_conversion, "has", num_nonnumeric,"record(s) where the conversion method multiplication_factor_from_origin_to_target contains a non-numeric value.")) - } - + stop(paste("The file", lab_unit_conversion, "has", num_nonnumeric, "record(s) where the conversion method multiplication_factor_from_origin_to_target contains a non-numeric value.")) + } + varconv <- "conversion_not_multiplication" exists_conv <- varconv %in% names(dt) - + if (exists_conv) { conv_raw <- dt[[varconv]] conv_chr <- trimws(as.character(conv_raw)) @@ -139,23 +138,22 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal # count rows where both conversion methods are missing both_present_mult_and_conv <- !is_missing_mult & !is_missing_conv numboth_present_mult_and_conv <- sum(both_present_mult_and_conv) - #count rows where both conversion methods are included + # count rows where both conversion methods are included both_missing_mult_and_conv <- is_missing_mult & is_missing_conv nummissing_mult_and_conv <- sum(both_missing_mult_and_conv) - }else{ + } else { # conv_raw <- dt[["conversion_not_multiplication"]] # conv_chr <- trimws(as.character(conv_raw)) # is_missing_conv <- is.na(conv_raw) | conv_chr == "" } - + if (nummissing_mult > 0) { if (!exists_conv) { - stop(paste("The file", lab_unit_conversion, "has", nummissing_mult,"record(s) where the concversion method multiplication_factor_from_origin_to_target is missing. This can only happen if there is an alternative conversion method stored in a variable named", varconv, ".")) - }else{ - + stop(paste("The file", lab_unit_conversion, "has", nummissing_mult, "record(s) where the concversion method multiplication_factor_from_origin_to_target is missing. This can only happen if there is an alternative conversion method stored in a variable named", varconv, ".")) + } else { # Check 1: no row should have both conversion methods missing. - + if (nummissing_mult_and_conv > 0) { stop( "There are ", @@ -164,9 +162,9 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal "and conversion_not_multiplication are missing." ) } - + # Check 2: no row should have both conversion methods non-missing. - + if (numboth_present_mult_and_conv > 0) { stop( "In the file ", lab_unit_conversion, " there are ", @@ -177,9 +175,9 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal ) } } - }else{ + } else { # if multiplication_factor_from_origin_to_target does not have missing values, we still need to check that there is no conflicting information on the conversion - if (exists_conv) { + if (exists_conv) { if (numboth_present_mult_and_conv > 0) { stop( "In the file", lab_unit_conversion, "there are ", @@ -191,52 +189,49 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal } } } - + # If the alternative conversion method exists or the condition_on_value exists, check that it contains valid R expressions using dataset$value. for (varconv in c("conversion_not_multiplication", "condition_on_value")) { if (varconv %in% names(dt)) { - conv_raw <- dt[[varconv]] - + if (is.character(conv_raw)) { conv_chr <- trimws(conv_raw) } else { conv_chr <- trimws(as.character(conv_raw)) } - + is_missing_conv <- is.na(conv_raw) | conv_chr == "" - + has_nonempty_conv <- any(!is_missing_conv) - + if (has_nonempty_conv) { - if (!is.character(conv_raw)) { stop( - "The column ", varconv ," of the file ", lab_unit_conversion, " must be character." + "The column ", varconv, " of the file ", lab_unit_conversion, " must be character." ) } - + rows_with_conv <- which(!is_missing_conv) - + bad_parse_conv <- rep(FALSE, nrow(dt)) - bad_eval_conv <- rep(FALSE, nrow(dt)) + bad_eval_conv <- rep(FALSE, nrow(dt)) eval_error_conv <- rep(NA_character_, nrow(dt)) - + for (i in rows_with_conv) { - expr_txt <- conv_chr[i] - + parsed_expr <- tryCatch( parse(text = expr_txt), error = function(e) e ) - + if (inherits(parsed_expr, "error") || length(parsed_expr) != 1L) { bad_parse_conv[i] <- TRUE eval_error_conv[i] <- conditionMessage(parsed_expr) next } - + eval_result <- tryCatch( eval( parse(text = expr_txt), @@ -245,27 +240,27 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal ), error = function(e) e ) - + if (inherits(eval_result, "error")) { bad_eval_conv[i] <- TRUE eval_error_conv[i] <- conditionMessage(eval_result) } } - + numbad_parse_conv <- sum(bad_parse_conv) - numbad_eval_conv <- sum(bad_eval_conv) - + numbad_eval_conv <- sum(bad_eval_conv) + if (numbad_parse_conv > 0) { stop( - "The column ", varconv ," of the file ", lab_unit_conversion, " contains ", + "The column ", varconv, " of the file ", lab_unit_conversion, " contains ", numbad_parse_conv, " expression(s) that cannot be parsed as valid R expressions." ) } - + if (numbad_eval_conv > 0) { stop( - "The column ", varconv ," of the file ", lab_unit_conversion, " contains ", + "The column ", varconv, " of the file ", lab_unit_conversion, " contains ", numbad_eval_conv, " expression(s) that cannot be evaluated as R expressions involving a numeric variable named value." ) @@ -273,23 +268,26 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal } } } - + # if the non-mandatory argument 'datasource' is used, check that it corresponds to a variable in lab_unit_conversion if (datasource != "" & !("datasource" %in% names(dt))) { stop(paste("You specified the argument 'datasource' but in this case the file", lab_unit_conversion, "should include the variable 'datasource' in its data model, while it does not")) } for (variable in unique(c(list_analyses))) { if (nrow(dt[concept_id == variable, .(unit_target)]) > 0) { - if (trimws(unique(dt[concept_id == variable, .(unit_target)])) != target_unit[[variable]]) { - stop(paste("In the lab_unit_conversion, the concept_id", variable, "is described with target units that are inconsistent with the target unit assigned in the tab LAB_target_units, which is", target_unit[[variable]])) + target_unit_in_data <- trimws(unique(dt[concept_id == variable, .(unit_target)])) + target_unit_in_meta <- target_unit[[variable]] + if (!is.na(target_unit_in_data) & !is.na(target_unit_in_meta)) { + if (target_unit_in_data != target_unit_in_meta) { + stop(paste("In the lab_unit_conversion, the concept_id", variable, "is described with target units that are inconsistent with the target unit assigned in the tab LAB_target_units, which is", target_unit_in_meta)) + } } } } - message("[CleanLabValues] LAB_unit_conversion check passed successfully.") + logger::log_info("[CleanLabValues] LAB_unit_conversion check passed successfully.") invisible(dt) } - ##################################### # lab_thresholds @@ -313,20 +311,20 @@ check_lab_thresholds <- function(lab_thresholds, dataset) { stop(paste("The file", lab_thresholds, "should include the variable", varname, "in its data model")) } } - + for (varname in c("Min", "Max")) { vec_ <- dt[[varname]] is_num_ <- suppressWarnings(as.numeric(vec_)) invalid_ <- is.na(is_num_) num_nonnumeric <- sum(invalid_) - + if (num_nonnumeric > 0) { - stop(paste("The file", lab_thresholds, "has", num_nonnumeric,"record(s) where the variable ", varname," contains a non-numeric value.")) - } + stop(paste("The file", lab_thresholds, "has", num_nonnumeric, "record(s) where the variable ", varname, " contains a non-numeric value.")) + } } - - - message("[CleanLabValues] LAB_thresholds check passed successfully.") + + + logger::log_info("[CleanLabValues] LAB_thresholds check passed successfully.") invisible(dt) } diff --git a/R/clean_lab_main.R b/R/clean_lab_main.R index 86c8f51..9551bad 100644 --- a/R/clean_lab_main.R +++ b/R/clean_lab_main.R @@ -56,30 +56,42 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u # Step 1: Fill missing units dt <- fill_missing_unit(dt, meta_unit_conv, target_unit) + dt_cols <- names(dt) + dt_by_cid <- split(dt, by = "concept_id", keep.by = TRUE, sorted = FALSE) + meta_by_cid <- split(meta_unit_conv, by = "concept_id", keep.by = TRUE, sorted = FALSE) + thresholds_by_cid <- split(meta_thresholds, by = "concept_id", keep.by = TRUE, sorted = FALSE) + empty_meta_cid <- meta_unit_conv[0] + empty_thresholds_cid <- meta_thresholds[0] + input_out_cols <- input_cols + if ("value" %in% input_out_cols) input_out_cols[input_out_cols == "value"] <- "value_origin" + if ("unit" %in% input_out_cols) input_out_cols[input_out_cols == "unit"] <- "unit_origin" + # Prepare result list result_list <- list() - for (cid in unique(dt$concept_id)) { + for (cid in names(dt_by_cid)) { # Preserve all columns from the original input for thresholding (e.g., age) - dt_cid <- dt[concept_id == cid, .SD, .SDcols = names(dt)] - meta_cid <- meta_unit_conv[concept_id == cid] + dt_cid <- dt_by_cid[[cid]] + meta_cid <- meta_by_cid[[cid]] + if (is.null(meta_cid)) meta_cid <- empty_meta_cid target_unit_cid <- target_unit[[cid]] if (nrow(dt_cid) == 0) next + logger::log_info(paste0("[CleanLabValues] Processing concept_id ", cid, " with ", nrow(dt_cid), " row(s).")) # Step 2: Prepare unit_matched (unit_filled if present, else unit) dt_cid[, unit_matched := unit_filled] dt_cid[is.na(unit_matched) | unit_matched == "", unit_matched := target_unit_cid] dt_cid[, unit_target := target_unit_cid] # Mark rows with missing original unit dt_cid[, unit_missing := is.na(unit_origin) | unit_origin == ""] - # For missing unit, set unit_matched to assumed_unit_if_missing from metadata (per-row, not just first) + # For missing unit, set unit_matched to assumed_unit_if_missing when metadata provides one. if ("assumed_unit_if_missing" %in% names(meta_cid)) { - idx_missing <- which(dt_cid$unit_missing) - if (length(idx_missing) > 0) { - for (j in idx_missing) { - assumed_unit <- meta_cid[unit_target == dt_cid$unit_target[j] & !is.na(assumed_unit_if_missing) & assumed_unit_if_missing != "", assumed_unit_if_missing][1] - if (!is.na(assumed_unit) && assumed_unit != "") { - dt_cid$unit_matched[j] <- assumed_unit - } - } + assumed_unit <- meta_cid[ + unit_target == target_unit_cid & + !is.na(assumed_unit_if_missing) & + assumed_unit_if_missing != "", + assumed_unit_if_missing[1] + ] + if (length(assumed_unit) > 0 && !is.na(assumed_unit) && assumed_unit != "") { + dt_cid[unit_missing == TRUE, unit_matched := assumed_unit] } } @@ -97,8 +109,9 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u # Always ensure direct match row (unit_matched == unit_target, factor 1, next_attempt from existing rows) is present and first # Use the first available next_attempt for this concept_id and unit_target, or 0 if none direct_next_attempt <- 0 - if (nrow(meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)]) > 0) { - direct_next_attempt <- meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)][[1, "next_attempt"]] + next_attempt_rows <- meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)] + if (nrow(next_attempt_rows) > 0) { + direct_next_attempt <- next_attempt_rows[[1, "next_attempt"]] } direct_row <- data.table::data.table( concept_id = cid, @@ -109,7 +122,8 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u ) meta_cid_full <- rbind(direct_row, meta_cid_full, fill = TRUE) # Join thresholds on concept_id and unit_target only - meta_thresholds_cid <- meta_thresholds[concept_id == cid] + meta_thresholds_cid <- thresholds_by_cid[[cid]] + if (is.null(meta_thresholds_cid)) meta_thresholds_cid <- empty_thresholds_cid meta_cid_full <- merge(meta_cid_full, meta_thresholds_cid, by = c("concept_id", "unit_target"), all.x = TRUE) # For compatibility with mo_convert, rename unit_origin to unit_matched setnames(meta_cid_full, "unit_origin", "unit_matched", skip_absent = TRUE) @@ -125,13 +139,10 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u # but expose the original measurement as `value_origin`/`unit_origin` (these are created above), # then append the cleaning result columns. result_cols <- c("included", "value", "unit_target", "conversion", "rule_applied") - # map input column names to output names: original `value` -> `value_origin`, `unit` -> `unit_origin` - input_out_cols <- input_cols - if ("value" %in% input_out_cols) input_out_cols[input_out_cols == "value"] <- "value_origin" - if ("unit" %in% input_out_cols) input_out_cols[input_out_cols == "unit"] <- "unit_origin" out_cols <- c(intersect(input_out_cols, names(dt_cid)), result_cols, ".order_id") out <- dt_cid[, ..out_cols] result_list[[cid]] <- out + logger::log_info(paste0("[CleanLabValues] Completed concept_id ", cid, ".")) } result <- data.table::rbindlist(result_list, fill = TRUE) # restore original ordering and remove temporary order column (drop explicitly) diff --git a/R/fill_missing_unit.R b/R/fill_missing_unit.R index b13cb8c..e18c9c9 100644 --- a/R/fill_missing_unit.R +++ b/R/fill_missing_unit.R @@ -18,9 +18,14 @@ fill_missing_unit <- function(dt, meta_unit_conv, target_unit, concept_id_col = for (cid in unique(dt[[concept_id_col]])) { assumed_unit <- meta_unit_conv[concept_id == cid & unit_origin == "MISSING" & is.na(condition_on_value), assumed_unit_if_missing] idx <- which(dt[[concept_id_col]] == cid & (dt[[unit_col]] == "" | is.na(dt[[unit_col]]))) + if (length(idx) == 0) { + next + } if (length(assumed_unit) > 0 && !is.na(assumed_unit[1])) { + logger::log_info(paste0("[CleanLabValues] Filling missing units for concept_id ", cid, " using assumed unit '", assumed_unit[1], "' on ", length(idx), " row(s).")) dt[idx, unit_filled := assumed_unit[1]] } else { + logger::log_info(paste0("[CleanLabValues] Filling missing units for concept_id ", cid, " using target unit '", target_unit[[cid]], "' on ", length(idx), " row(s).")) dt[idx, unit_filled := target_unit[[cid]]] } } diff --git a/R/mo_convert.R b/R/mo_convert.R index 8c33636..6105117 100644 --- a/R/mo_convert.R +++ b/R/mo_convert.R @@ -9,51 +9,138 @@ NULL .mo_norm <- function(x) tolower(trimws(as.character(x))) -.mo_get_var_value <- function(dat, row_idx, varname) { +.mo_get_cached_var_value <- function(var_cache, row_idx, varname) { if (is.null(varname) || is.na(varname) || varname == "") { return(NA) } - if (varname %in% names(dat)) { - return(dat[[varname]][row_idx]) + values <- var_cache[[varname]] + if (is.null(values)) { + return(NA) + } + values[row_idx] +} + +.mo_parse_optional_expr <- function(expr_text, default_expr = quote(NA_real_)) { + if (is.null(expr_text) || is.na(expr_text) || expr_text == "") { + return(default_expr) + } + parse(text = expr_text)[[1]] +} + +.mo_prepare_attempt_bundle <- function(attempts) { + n_attempts <- nrow(attempts) + if (n_attempts == 0) { + return(list( + n = 0L, + unit_matched = character(0), + next_attempt = integer(0), + factor_num = numeric(0), + expr_parsed = list(), + minv = numeric(0), + maxv = numeric(0), + condition_on_value = character(0), + cond_value_parsed = list(), + condition_on_variable = character(0), + cond_var_parsed = list(), + variable = character(0), + assumed_unit_if_missing = character(0) + )) } - NA + + list( + n = n_attempts, + unit_matched = attempts$unit_matched, + next_attempt = attempts$next_attempt, + factor_num = suppressWarnings(as.numeric(attempts$multiplication_factor_from_origin_to_target)), + expr_parsed = lapply(attempts$conversion_not_multiplication, .mo_parse_optional_expr), + minv = suppressWarnings(as.numeric(attempts$Min)), + maxv = suppressWarnings(as.numeric(attempts$Max)), + condition_on_value = attempts$condition_on_value, + cond_value_parsed = lapply(attempts$condition_on_value, .mo_parse_optional_expr, default_expr = quote(TRUE)), + condition_on_variable = attempts$condition_on_variable, + cond_var_parsed = lapply(attempts$condition_on_variable, .mo_parse_optional_expr, default_expr = quote(TRUE)), + variable = if ("variable" %in% names(attempts)) as.character(attempts$variable) else rep(NA_character_, n_attempts), + assumed_unit_if_missing = if ("assumed_unit_if_missing" %in% names(attempts)) as.character(attempts$assumed_unit_if_missing) else rep(NA_character_, n_attempts) + ) } -.mo_eval_attempt <- function(attempt_row, val_raw, dat, row_idx) { - # Returns list(success=TRUE/FALSE, val_conv=?, attempted=TRUE/FALSE) - # factor <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) - # if (is.na(factor) || is.na(val_raw)) { - # return(list(success = FALSE, attempted = FALSE)) - # } - # val_conv <- val_raw * factor - factor <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) - if (!is.na(factor) ) { +.mo_finalize_attempt_bundle <- function(attempt_bundle) { + if (attempt_bundle$n == 0L) { + attempt_bundle$next_positive_idx <- integer(0) + attempt_bundle$missing_idx <- integer(0) + attempt_bundle$fallback_idx_by_code <- list() + attempt_bundle$assumed_units <- character(0) + return(attempt_bundle) + } + + next_pos <- which(!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0) + attempt_bundle$next_positive_idx <- if (length(next_pos) > 0) { + next_pos[order(attempt_bundle$next_attempt[next_pos])] + } else { + integer(0) + } + attempt_bundle$missing_idx <- which(attempt_bundle$unit_matched == "missing") + fallback_order <- sort(unique(attempt_bundle$next_attempt[next_pos])) + attempt_bundle$fallback_idx_by_code <- lapply(fallback_order, function(code) { + which(attempt_bundle$next_attempt == code) + }) + names(attempt_bundle$fallback_idx_by_code) <- as.character(fallback_order) + + assumed_idx <- which( + attempt_bundle$unit_matched == "missing" & + (is.na(attempt_bundle$condition_on_value) | attempt_bundle$condition_on_value == "") + ) + attempt_bundle$assumed_units <- unique(na.omit(.mo_norm(attempt_bundle$assumed_unit_if_missing[assumed_idx]))) + attempt_bundle +} + +.mo_eval_vectorized_values <- function(values, meta_rows, meta_pos = NULL) { + out <- rep(NA_real_, length(values)) + factor_num <- meta_rows$factor_num + idx_factor <- which(!is.na(factor_num)) + if (length(idx_factor) > 0) { + out[idx_factor] <- values[idx_factor] * factor_num[idx_factor] + } + idx_expr <- which(is.na(factor_num)) + if (length(idx_expr) > 0) { + if (is.null(meta_pos)) { + meta_pos <- seq_len(nrow(meta_rows)) + } + expr_groups <- split(idx_expr, meta_pos[idx_expr]) + for (group_idx in expr_groups) { + expr_parsed <- meta_rows$expr_parsed[[group_idx[1]]] + out[group_idx] <- eval(expr_parsed, envir = list(value = values[group_idx])) + } + } + out +} + +.mo_eval_attempt_prepared <- function(attempt_bundle, attempt_idx, val_raw, row_idx, var_cache) { + factor <- attempt_bundle$factor_num[attempt_idx] + if (!is.na(factor)) { val_conv <- val_raw * factor - }else{ - expression_tp_eval <- attempt_row$conversion_not_multiplication - expression_tp_eval <- gsub("value", as.character(val_raw), expression_tp_eval) -# print(expression_tp_eval) - val_conv <- eval( parse( text = expression_tp_eval)) + } else { + val_conv <- eval(attempt_bundle$expr_parsed[[attempt_idx]], envir = list(value = val_raw)) } - minv <- suppressWarnings(as.numeric(attempt_row$Min)) - maxv <- suppressWarnings(as.numeric(attempt_row$Max)) - cond_val_expr <- attempt_row$condition_on_value + minv <- attempt_bundle$minv[attempt_idx] + maxv <- attempt_bundle$maxv[attempt_idx] + cond_val_expr <- attempt_bundle$condition_on_value[attempt_idx] if (!is.null(cond_val_expr) && !is.na(cond_val_expr) && cond_val_expr != "") { - expr <- gsub("value", as.character(val_raw), cond_val_expr) - cond_ok <- tryCatch(isTRUE(eval(parse(text = expr))), error = function(e) FALSE) + cond_ok <- tryCatch(isTRUE(eval(attempt_bundle$cond_value_parsed[[attempt_idx]], envir = list(value = val_raw))), error = function(e) FALSE) if (!isTRUE(cond_ok)) { return(list(success = FALSE, attempted = FALSE)) } } else { - cond_var <- attempt_row$condition_on_variable + cond_var <- attempt_bundle$condition_on_variable[attempt_idx] if (!is.null(cond_var) && !is.na(cond_var) && cond_var != "") { - varname <- if (!is.null(attempt_row$variable) && !is.na(attempt_row$variable) && attempt_row$variable != "") attempt_row$variable else NULL - cond_val <- .mo_get_var_value(dat, row_idx, varname) + varname <- attempt_bundle$variable[attempt_idx] + if (!is.null(varname) && !is.na(varname) && varname == "") varname <- NULL + cond_val <- .mo_get_cached_var_value(var_cache, row_idx, varname) if (is.na(cond_val)) { return(list(success = FALSE, attempted = FALSE)) } - expr <- gsub(varname, as.character(cond_val), cond_var) - cond_ok <- tryCatch(isTRUE(eval(parse(text = expr))), error = function(e) FALSE) + cond_env <- setNames(list(cond_val), varname) + cond_ok <- tryCatch(isTRUE(eval(attempt_bundle$cond_var_parsed[[attempt_idx]], envir = cond_env)), error = function(e) FALSE) if (!isTRUE(cond_ok)) { return(list(success = FALSE, attempted = FALSE)) } @@ -67,146 +154,24 @@ NULL list(success = FALSE, attempted = attempted) } -# Small, top-level helpers to keep `mo_convert()` concise. They operate by -# modifying `dat` by reference (using `i`) and return success flags and -# attempt counts where appropriate. -.mo_try_direct_matches <- function(dat, i, attempts, unit_origin, target, val_raw) { - if (nrow(attempts) > 0 && !is.na(unit_origin)) { - direct <- attempts[unit_matched == unit_origin] - if (nrow(direct) > 0) { - for (r in seq_len(nrow(direct))) { - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - res <- .mo_eval_attempt(direct[r], val_raw, dat, i) - if (isTRUE(res$success)) { - dat[i, `:=`( - included = 1L, - value_converted = res$val_conv, - conversion = ifelse(is.na(unit_origin) || unit_origin == target, 0L, ifelse(unit_origin == target, 0L, 1L)), - rule_applied = ifelse(is.na(unit_origin) || unit_origin == target, 0L, 1L) - )] - return(TRUE) - } - } - } - } - FALSE -} - -.mo_try_prefilled_assumed <- function(dat, i, attempts, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) { - # Returns list(success=TRUE/FALSE, tried=0/1) - if (is.na(row_unit_matched) || nrow(attempts) == 0) { - return(list(success = FALSE, tried = 0L)) - } - assumed_list <- unique(na.omit(tolower(trimws(as.character(attempts[unit_matched == "missing" & (is.na(condition_on_value) | condition_on_value == "")]$assumed_unit_if_missing))))) - if (!(row_unit_matched %in% assumed_list)) { - return(list(success = FALSE, tried = 0L)) +.mo_success_codes <- function(attempt_bundle, attempt_idx, conv_success, other_flow = FALSE, fallback_attempts_made = NA_integer_) { + factor_try <- attempt_bundle$factor_num[attempt_idx] + conv_code <- conv_success + if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) { + conv_code <- 2L } - assumed_rows <- attempts[unit_matched == row_unit_matched] - if (nrow(assumed_rows) == 0) { - return(list(success = FALSE, tried = 0L)) - } - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - res <- .mo_eval_attempt(assumed_rows[1], val_raw, dat, i) - if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(assumed_rows[1]$multiplication_factor_from_origin_to_target)) - if (!is.na(factor_try) && factor_try == 1) { - rp <- 0L + if (!is.na(fallback_attempts_made)) { + rp <- ifelse(fallback_attempts_made == 1L, 1L, 2L) + } else if (!is.na(factor_try) && factor_try == 1) { + rp <- 0L + } else { + rp <- if (!is.na(attempt_bundle$next_attempt[attempt_idx]) && attempt_bundle$next_attempt[attempt_idx] > 0) { + as.integer(attempt_bundle$next_attempt[attempt_idx]) } else { - rp <- if (!is.na(assumed_rows[1]$next_attempt) && assumed_rows[1]$next_attempt > 0) as.integer(assumed_rows[1]$next_attempt) else 1L - } - conv_code <- conv_success - if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L - dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = rp)] - return(list(success = TRUE, tried = 1L)) - } - list(success = FALSE, tried = 1L) -} - -.mo_try_missing_chain <- function(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = NA_character_) { - # Returns list(success=TRUE/FALSE, tried=number_of_attempts_tried) - tried <- 0L - # explicit next_attempt chain (>0) - miss_attempts <- attempts[!is.na(next_attempt) & next_attempt > 0][order(next_attempt)] - if (nrow(miss_attempts) > 0) { - for (r in seq_len(nrow(miss_attempts))) { - attempt_row <- miss_attempts[r] - res <- .mo_eval_attempt(attempt_row, val_raw, dat, i) - if (isTRUE(res$attempted)) { - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - tried <- tried + 1L - } - if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) - if (!is.na(factor_try) && factor_try == 1) { - rp <- 0L - } else { - rp <- if (!is.na(attempt_row$next_attempt) && attempt_row$next_attempt > 0) as.integer(attempt_row$next_attempt) else 1L - } - conv_code <- conv_success - if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L - dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = rp)] - return(list(success = TRUE, tried = tried)) - } - } - } - # rows labelled as missing - missing_rows <- attempts[unit_matched == "missing"] - if (!is.na(skip_assumed_unit) && nrow(missing_rows) > 0 && "assumed_unit_if_missing" %in% names(missing_rows)) { - missing_rows <- missing_rows[ - is.na(next_attempt) | - next_attempt != 0 | - .mo_norm(assumed_unit_if_missing) != skip_assumed_unit - ] - } - if (nrow(missing_rows) > 0) { - for (r in seq_len(nrow(missing_rows))) { - attempt_row <- missing_rows[r] - res <- .mo_eval_attempt(attempt_row, val_raw, dat, i) - if (isTRUE(res$attempted)) { - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - tried <- tried + 1L - } - if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) - if (!is.na(factor_try) && factor_try == 1) { - rp <- 0L - } else { - rp <- if (!is.na(attempt_row$next_attempt) && attempt_row$next_attempt > 0) as.integer(attempt_row$next_attempt) else 1L - } - conv_code <- conv_success - if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L - dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = rp)] - return(list(success = TRUE, tried = tried)) - } + 1L } } - list(success = FALSE, tried = tried) -} - -.mo_try_fallbacks <- function(dat, i, attempts, val_raw, conv_success = 1L, other_flow = FALSE) { - if (nrow(attempts) <= 0) { - return(FALSE) - } - fallback_order <- unique(attempts$next_attempt[!is.na(attempts$next_attempt) & attempts$next_attempt > 0]) - fallback_order <- sort(fallback_order) - attempts_made <- 0L - for (code in fallback_order) { - rows_idx <- which(attempts$next_attempt == code) - for (j in rows_idx) { - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - attempt_row <- attempts[j] - res <- .mo_eval_attempt(attempt_row, val_raw, dat, i) - attempts_made <- attempts_made + 1L - if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) - conv_code <- conv_success - if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L - dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = ifelse(attempts_made == 1L, 1L, 2L))] - return(TRUE) - } - } - } - FALSE + list(conversion = as.integer(conv_code), rule_applied = as.integer(rp)) } ##' Convert values using conversion metadata @@ -234,69 +199,349 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { if (!"next_attempt" %in% names(meta)) meta[, next_attempt := NA_integer_] if (!"Min" %in% names(meta)) meta[, Min := NA_real_] if (!"Max" %in% names(meta)) meta[, Max := NA_real_] + if (!"conversion_not_multiplication" %in% names(meta)) meta[, conversion_not_multiplication := NA_character_] + if (!"condition_on_value" %in% names(meta)) meta[, condition_on_value := NA_character_] + if (!"condition_on_variable" %in% names(meta)) meta[, condition_on_variable := NA_character_] + meta[, unit_matched := ifelse(is.na(unit_matched) | unit_matched == "", NA_character_, .mo_norm(unit_matched))] + meta[, .meta_key := paste(concept_id, unit_target, sep = "\r")] + meta_by_key <- split(meta, by = ".meta_key", keep.by = FALSE, sorted = FALSE) + meta_bundle_by_key <- lapply(meta_by_key, function(x) .mo_finalize_attempt_bundle(.mo_prepare_attempt_bundle(x))) + empty_attempts <- meta[0] + empty_attempt_bundle <- .mo_finalize_attempt_bundle(.mo_prepare_attempt_bundle(empty_attempts)) + simple_direct_meta <- meta[ + !is.na(unit_matched) & + unit_matched != "missing" & + (is.na(conversion_not_multiplication) | conversion_not_multiplication == "") & + (is.na(condition_on_value) | condition_on_value == "") & + (is.na(condition_on_variable) | condition_on_variable == "") + ] + if (nrow(simple_direct_meta) > 0) { + simple_direct_meta[, factor_num := suppressWarnings(as.numeric(multiplication_factor_from_origin_to_target))] + simple_direct_meta[, has_expr := !is.na(conversion_not_multiplication) & conversion_not_multiplication != ""] + simple_direct_meta[, expr_parsed := lapply(conversion_not_multiplication, function(expr_text) { + if (is.na(expr_text) || expr_text == "") { + quote(NA_real_) + } else { + parse(text = expr_text)[[1]] + } + })] + simple_direct_meta <- simple_direct_meta[ + (!is.na(factor_num) | has_expr) & + !is.na(Min) & + !is.na(Max) + ] + if (nrow(simple_direct_meta) > 0) { + simple_direct_meta[, .direct_key := paste(.meta_key, unit_matched, sep = "\r")] + simple_direct_meta <- simple_direct_meta[, if (.N == 1L) .SD, by = .direct_key] + } + } - # Use internal helpers - - dat[, `:=`( - included = as.integer(NA), - value_converted = as.numeric(NA), - conversion = as.integer(NA), - rule_applied = as.integer(NA), - n_conversion_attempts = 0L - )] + assumed_prefill_keys <- character(0) + if (nrow(meta) > 0) { + assumed_prefill_rows <- meta[ + unit_matched == "missing" & + (is.na(condition_on_value) | condition_on_value == "") & + !is.na(assumed_unit_if_missing) & + assumed_unit_if_missing != "" + ] + if (nrow(assumed_prefill_rows) > 0) { + assumed_prefill_keys <- unique(paste( + assumed_prefill_rows$.meta_key, + .mo_norm(assumed_prefill_rows$assumed_unit_if_missing), + sep = "\r" + )) + } + } # local alias helpers for readability norm <- .mo_norm + n_dat <- nrow(dat) + cid_vec <- dat$concept_id + target_vec <- dat$unit_target + val_raw_vec <- suppressWarnings(as.numeric(dat$value)) + unit_origin_vec <- ifelse(is.na(dat$unit_origin) | dat$unit_origin == "", NA_character_, norm(dat$unit_origin)) + unit_missing_vec <- if ("unit_missing" %in% names(dat)) { + !is.na(dat$unit_missing) & dat$unit_missing + } else { + is.na(unit_origin_vec) + } + row_unit_matched_vec <- if ("unit_matched" %in% names(dat)) { + ifelse(is.na(dat$unit_matched) | dat$unit_matched == "", NA_character_, norm(dat$unit_matched)) + } else { + rep(NA_character_, n_dat) + } + meta_key_vec <- paste(cid_vec, target_vec, sep = "\r") + direct_attempt_done <- rep(FALSE, n_dat) + prefilled_attempt_done <- rep(FALSE, n_dat) + prefilled_attempt_tried <- integer(n_dat) + n_conversion_attempts <- integer(n_dat) + included <- rep(NA_integer_, n_dat) + value_converted <- rep(NA_real_, n_dat) + conversion <- rep(NA_integer_, n_dat) + rule_applied <- rep(NA_integer_, n_dat) - for (i in seq_len(nrow(dat))) { - row <- dat[i] - cid <- row$concept_id - target <- row$unit_target - val_raw <- suppressWarnings(as.numeric(row$value)) - unit_origin <- ifelse(is.na(row$unit_origin) || row$unit_origin == "", NA_character_, norm(row$unit_origin)) - unit_missing_flag <- if ("unit_missing" %in% names(dat)) isTRUE(row$unit_missing) else is.na(unit_origin) + condition_variables <- unique(unlist(lapply(meta_bundle_by_key, function(bundle) bundle$variable), use.names = FALSE)) + condition_variables <- condition_variables[!is.na(condition_variables) & condition_variables != "" & condition_variables %in% names(dat)] + var_cache <- if (length(condition_variables) > 0) { + dat[, ..condition_variables] + } else { + list() + } - attempts <- meta[concept_id == cid & unit_target == target] - if (nrow(attempts) > 0) attempts[, unit_matched := norm(ifelse(is.na(unit_matched) | unit_matched == "", unit_origin, unit_matched))] + if (exists("simple_direct_meta") && nrow(simple_direct_meta) > 0) { + direct_key_vec <- paste(meta_key_vec, unit_origin_vec, sep = "\r") + direct_meta_pos <- match(direct_key_vec, simple_direct_meta$.direct_key) + fast_direct_idx <- which(!is.na(direct_meta_pos) & !is.na(unit_origin_vec)) + if (length(fast_direct_idx) > 0) { + direct_attempt_done[fast_direct_idx] <- TRUE + n_conversion_attempts[fast_direct_idx] <- n_conversion_attempts[fast_direct_idx] + 1L + meta_rows <- simple_direct_meta[direct_meta_pos[fast_direct_idx]] + val_conv_fast <- .mo_eval_vectorized_values( + values = val_raw_vec[fast_direct_idx], + meta_rows = meta_rows, + meta_pos = direct_meta_pos[fast_direct_idx] + ) + ok_fast <- !is.na(val_conv_fast) & + val_conv_fast >= meta_rows$Min & + val_conv_fast <= meta_rows$Max + success_idx <- fast_direct_idx[ok_fast] + if (length(success_idx) > 0) { + success_conv <- val_conv_fast[ok_fast] + success_is_same_unit <- unit_origin_vec[success_idx] == target_vec[success_idx] + included[success_idx] <- 1L + value_converted[success_idx] <- success_conv + conversion[success_idx] <- ifelse(success_is_same_unit, 0L, 1L) + rule_applied[success_idx] <- ifelse(success_is_same_unit, 0L, 1L) + } + } + + prefill_key_vec <- paste(meta_key_vec, row_unit_matched_vec, sep = "\r") + prefill_meta_pos <- match(prefill_key_vec, simple_direct_meta$.direct_key) + fast_prefill_idx <- which( + is.na(included) & + unit_missing_vec & + !is.na(row_unit_matched_vec) & + !is.na(prefill_meta_pos) & + prefill_key_vec %in% assumed_prefill_keys + ) + if (length(fast_prefill_idx) > 0) { + prefilled_attempt_done[fast_prefill_idx] <- TRUE + prefilled_attempt_tried[fast_prefill_idx] <- 1L + n_conversion_attempts[fast_prefill_idx] <- n_conversion_attempts[fast_prefill_idx] + 1L + meta_rows <- simple_direct_meta[prefill_meta_pos[fast_prefill_idx]] + val_conv_fast <- .mo_eval_vectorized_values( + values = val_raw_vec[fast_prefill_idx], + meta_rows = meta_rows, + meta_pos = prefill_meta_pos[fast_prefill_idx] + ) + ok_fast <- !is.na(val_conv_fast) & + val_conv_fast >= meta_rows$Min & + val_conv_fast <= meta_rows$Max + success_idx <- fast_prefill_idx[ok_fast] + if (length(success_idx) > 0) { + success_meta_rows <- meta_rows[ok_fast] + success_conv <- val_conv_fast[ok_fast] + factor_try <- success_meta_rows$factor_num + rp <- ifelse( + !is.na(factor_try) & factor_try == 1, + 0L, + ifelse(!is.na(success_meta_rows$next_attempt) & success_meta_rows$next_attempt > 0, as.integer(success_meta_rows$next_attempt), 1L) + ) + included[success_idx] <- 1L + value_converted[success_idx] <- success_conv + conversion[success_idx] <- 3L + rule_applied[success_idx] <- rp + } + } + } + + pending_idx <- which(is.na(included)) + for (i in pending_idx) { + target <- target_vec[i] + val_raw <- val_raw_vec[i] + unit_origin <- unit_origin_vec[i] + unit_missing_flag <- unit_missing_vec[i] + + attempt_bundle <- meta_bundle_by_key[[meta_key_vec[i]]] + if (is.null(attempt_bundle)) attempt_bundle <- empty_attempt_bundle + attempt_unit_matched <- if (attempt_bundle$n > 0) ifelse(is.na(attempt_bundle$unit_matched), unit_origin, attempt_bundle$unit_matched) else character(0) # Identify 'OTHER' origin: unit present but not listed among attempts - origin_other <- !is.na(unit_origin) && nrow(attempts) > 0 && !(unit_origin %in% attempts$unit_matched) + origin_other <- !is.na(unit_origin) && attempt_bundle$n > 0 && !(unit_origin %in% attempt_unit_matched) # 1) Try direct matches (unit_origin equals unit_matched) - if (.mo_try_direct_matches(dat, i, attempts, unit_origin, target, val_raw)) next + if (!direct_attempt_done[i] && attempt_bundle$n > 0 && !is.na(unit_origin)) { + direct_idx <- which(attempt_unit_matched == unit_origin) + if (length(direct_idx) > 0) { + direct_success <- FALSE + for (r in direct_idx) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$success)) { + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- ifelse(is.na(unit_origin) || unit_origin == target, 0L, ifelse(unit_origin == target, 0L, 1L)) + rule_applied[i] <- ifelse(is.na(unit_origin) || unit_origin == target, 0L, 1L) + direct_success <- TRUE + break + } + } + if (direct_success) next + } + } # If origin is 'OTHER' (present but not listed), treat it like MISSING for # conversion attempts (but use conversion=1 on success). Otherwise, for # genuinely missing units use conversion=3 on success. - missing_attempts_tried <- 0L + missing_attempts_tried <- prefilled_attempt_tried[i] if (origin_other) { - row_unit_matched <- NA_character_ - pref_res <- list(success = FALSE, tried = 0L) - if ("unit_matched" %in% names(dat)) row_unit_matched <- ifelse(is.na(dat$unit_matched[i]) || dat$unit_matched[i] == "", NA_character_, norm(dat$unit_matched[i])) - if (!is.na(row_unit_matched) && nrow(attempts) > 0) { - pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, row_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE) + row_unit_matched <- row_unit_matched_vec[i] + pref_res <- list(success = FALSE, tried = prefilled_attempt_tried[i]) + if (!prefilled_attempt_done[i] && !is.na(row_unit_matched) && attempt_bundle$n > 0) { + if (row_unit_matched %in% attempt_bundle$assumed_units) { + assumed_rows_idx <- which(attempt_unit_matched == row_unit_matched) + if (length(assumed_rows_idx) > 0) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + assumed_idx_first <- assumed_rows_idx[1] + res <- .mo_eval_attempt_prepared(attempt_bundle, assumed_idx_first, val_raw, i, var_cache) + pref_res <- list(success = isTRUE(res$success), tried = 1L) + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, assumed_idx_first, conv_success = 1L, other_flow = TRUE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + } + } + } missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } - skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_ - miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 1L, other_flow = TRUE, skip_assumed_unit = skip_assumed_unit) + skip_assumed_unit <- if (prefilled_attempt_done[i] || pref_res$tried > 0L) row_unit_matched else NA_character_ + miss_res <- list(success = FALSE, tried = 0L) + if (length(attempt_bundle$next_positive_idx) > 0) { + for (r in attempt_bundle$next_positive_idx) { + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$attempted)) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + miss_res$tried <- miss_res$tried + 1L + } + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, r, conv_success = 1L, other_flow = TRUE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + miss_res$success <- TRUE + break + } + } + } + if (!isTRUE(miss_res$success)) { + missing_idx <- attempt_bundle$missing_idx + if (!is.na(skip_assumed_unit) && length(missing_idx) > 0) { + missing_idx <- missing_idx[ + is.na(attempt_bundle$next_attempt[missing_idx]) | + attempt_bundle$next_attempt[missing_idx] != 0 | + .mo_norm(attempt_bundle$assumed_unit_if_missing[missing_idx]) != skip_assumed_unit + ] + } + if (length(missing_idx) > 0) { + for (r in missing_idx) { + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$attempted)) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + miss_res$tried <- miss_res$tried + 1L + } + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, r, conv_success = 1L, other_flow = TRUE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + miss_res$success <- TRUE + break + } + } + } + } missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } else { if (unit_missing_flag) { - row_unit_matched <- NA_character_ - if ("unit_matched" %in% names(dat)) row_unit_matched <- ifelse(is.na(dat$unit_matched[i]) || dat$unit_matched[i] == "", NA_character_, norm(dat$unit_matched[i])) - pref_res <- list(success = FALSE, tried = 0L) - if (!is.na(row_unit_matched) && nrow(attempts) > 0) { - pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) + row_unit_matched <- row_unit_matched_vec[i] + pref_res <- list(success = FALSE, tried = prefilled_attempt_tried[i]) + if (!prefilled_attempt_done[i] && !is.na(row_unit_matched) && attempt_bundle$n > 0) { + if (row_unit_matched %in% attempt_bundle$assumed_units) { + assumed_rows_idx <- which(attempt_unit_matched == row_unit_matched) + if (length(assumed_rows_idx) > 0) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + assumed_idx_first <- assumed_rows_idx[1] + res <- .mo_eval_attempt_prepared(attempt_bundle, assumed_idx_first, val_raw, i, var_cache) + pref_res <- list(success = isTRUE(res$success), tried = 1L) + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, assumed_idx_first, conv_success = 3L, other_flow = FALSE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + } + } + } missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } # Only call the missing chain if prefill did not already cover the single attempt # (i.e., prefill did not run, or there are chained attempts with next_attempt > 0 to pursue). - if (nrow(attempts) > 0 && (pref_res$tried == 0L || nrow(attempts[!is.na(next_attempt) & next_attempt > 0L]) > 0L)) { - skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_ - miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = skip_assumed_unit) + if (attempt_bundle$n > 0 && (pref_res$tried == 0L || any(!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0L))) { + skip_assumed_unit <- if (prefilled_attempt_done[i] || pref_res$tried > 0L) row_unit_matched else NA_character_ + miss_res <- list(success = FALSE, tried = 0L) + if (length(attempt_bundle$next_positive_idx) > 0) { + for (r in attempt_bundle$next_positive_idx) { + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$attempted)) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + miss_res$tried <- miss_res$tried + 1L + } + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, r, conv_success = 3L, other_flow = FALSE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + miss_res$success <- TRUE + break + } + } + } + if (!isTRUE(miss_res$success)) { + missing_idx <- attempt_bundle$missing_idx + if (!is.na(skip_assumed_unit) && length(missing_idx) > 0) { + missing_idx <- missing_idx[ + is.na(attempt_bundle$next_attempt[missing_idx]) | + attempt_bundle$next_attempt[missing_idx] != 0 | + .mo_norm(attempt_bundle$assumed_unit_if_missing[missing_idx]) != skip_assumed_unit + ] + } + if (length(missing_idx) > 0) { + for (r in missing_idx) { + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$attempted)) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + miss_res$tried <- miss_res$tried + 1L + } + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, r, conv_success = 3L, other_flow = FALSE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + miss_res$success <- TRUE + break + } + } + } + } missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } @@ -306,37 +551,86 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { # 3) Fallback: try all attempts ordered by next_attempt (1,2,...) as fallbacks for conversion conv_success_current <- if (origin_other) 1L else if (unit_missing_flag) 3L else 1L other_flow_current <- origin_other - if (.mo_try_fallbacks(dat, i, attempts, val_raw, conv_success_current, other_flow_current)) next + fallback_success <- FALSE + if (attempt_bundle$n > 0 && length(attempt_bundle$fallback_idx_by_code) > 0) { + attempts_made <- 0L + for (rows_idx in attempt_bundle$fallback_idx_by_code) { + for (j in rows_idx) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + res <- .mo_eval_attempt_prepared(attempt_bundle, j, val_raw, i, var_cache) + attempts_made <- attempts_made + 1L + if (isTRUE(res$success)) { + codes <- .mo_success_codes( + attempt_bundle, + j, + conv_success = conv_success_current, + other_flow = other_flow_current, + fallback_attempts_made = attempts_made + ) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + fallback_success <- TRUE + break + } + } + if (fallback_success) break + } + } + if (fallback_success) next # 4) If we get here, conversion failed or no applicable attempts # Decide conversion & rule codes per README semantics if (is.na(val_raw) || !is.numeric(val_raw)) { - dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = 3L, rule_applied = 99L)] + included[i] <- 0L + value_converted[i] <- NA_real_ + conversion[i] <- 3L + rule_applied[i] <- 99L } else if (unit_missing_flag) { # Use only the missing-unit attempts count to decide 90/91/92 attempts_made_final <- missing_attempts_tried rp_fail <- ifelse(is.na(attempts_made_final) || attempts_made_final == 0L, 90L, ifelse(attempts_made_final == 1L, 91L, 92L)) - dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = 3L, rule_applied = rp_fail)] - } else if (!is.na(unit_origin) && nrow(attempts) == 0) { + included[i] <- 0L + value_converted[i] <- NA_real_ + conversion[i] <- 3L + rule_applied[i] <- rp_fail + } else if (!is.na(unit_origin) && attempt_bundle$n == 0) { # unit present but no conversion metadata -> treat as OTHER - dat[i, `:=`(included = 1L, value_converted = val_raw, conversion = 2L, rule_applied = 0L)] - } else if (!is.na(unit_origin) && unit_origin != target) { + included[i] <- 1L + value_converted[i] <- val_raw + conversion[i] <- 2L + rule_applied[i] <- 0L + } else if (!is.na(unit_origin) && !is.na(target) && unit_origin != target) { # Tried conversions but none accepted # If no conversion attempts were actually made for this row, treat as OTHER accepted-as-is - attempts_made <- dat$n_conversion_attempts[i] + attempts_made <- n_conversion_attempts[i] if (is.na(attempts_made) || attempts_made == 0L) { - dat[i, `:=`(included = 1L, value_converted = val_raw, conversion = 2L, rule_applied = 0L)] + included[i] <- 1L + value_converted[i] <- val_raw + conversion[i] <- 2L + rule_applied[i] <- 0L } else { # Use actual number of attempts made on this row to determine rule_applied (91 if one try, 92 if multiple) final_conv <- if (origin_other) 2L else 1L - dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = final_conv, rule_applied = ifelse(attempts_made <= 1L, 91L, 92L))] + included[i] <- 0L + value_converted[i] <- NA_real_ + conversion[i] <- final_conv + rule_applied[i] <- ifelse(attempts_made <= 1L, 91L, 92L) } } else { - dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = 0L, rule_applied = 90L)] + included[i] <- 0L + value_converted[i] <- NA_real_ + conversion[i] <- 0L + rule_applied[i] <- 90L } } - # Clean helper columns - dat[, n_conversion_attempts := NULL] + dat[, `:=`( + included = included, + value_converted = value_converted, + conversion = conversion, + rule_applied = rule_applied + )] return(dat) } diff --git a/tests/testthat/data/Example 6/i_ground_truth/dataset_cleaned_lab_values.csv b/tests/testthat/data/Example 6/i_ground_truth/dataset_cleaned_lab_values.csv new file mode 100644 index 0000000..134023c --- /dev/null +++ b/tests/testthat/data/Example 6/i_ground_truth/dataset_cleaned_lab_values.csv @@ -0,0 +1,3 @@ +person_id,concept_id,value_origin,unit_origin,included,value,unit_target,conversion,rule_applied,age +P01,LAB_TEST,10,foo,1,10,mg,1,1,20 +P02,LAB_TEST,10,foo,0,,mg,1,91,10 diff --git a/tests/testthat/data/Example 6/i_input/LAB_target_units.csv b/tests/testthat/data/Example 6/i_input/LAB_target_units.csv new file mode 100644 index 0000000..5538ea4 --- /dev/null +++ b/tests/testthat/data/Example 6/i_input/LAB_target_units.csv @@ -0,0 +1,2 @@ +concept_id,unit_target +LAB_TEST,mg diff --git a/tests/testthat/data/Example 6/i_input/LAB_threshold.csv b/tests/testthat/data/Example 6/i_input/LAB_threshold.csv new file mode 100644 index 0000000..81be290 --- /dev/null +++ b/tests/testthat/data/Example 6/i_input/LAB_threshold.csv @@ -0,0 +1,2 @@ +concept_id,Min,Max,unit_target,condition_on_variable,variable +LAB_TEST,0,100,mg,age >= 18,age diff --git a/tests/testthat/data/Example 6/i_input/LAB_unit_conversion.csv b/tests/testthat/data/Example 6/i_input/LAB_unit_conversion.csv new file mode 100644 index 0000000..bcdbc9c --- /dev/null +++ b/tests/testthat/data/Example 6/i_input/LAB_unit_conversion.csv @@ -0,0 +1,2 @@ +concept_id,datasource,unit_origin,unit_target,multiplication_factor_from_origin_to_target,conversion_rate,conversion_not_multiplication,condition_on_value,assumed_unit_if_missing,next_attempt +LAB_TEST,,foo,mg,1,,,,,0 diff --git a/tests/testthat/data/Example 6/i_input/dataset_lab_values.csv b/tests/testthat/data/Example 6/i_input/dataset_lab_values.csv new file mode 100644 index 0000000..c24bea3 --- /dev/null +++ b/tests/testthat/data/Example 6/i_input/dataset_lab_values.csv @@ -0,0 +1,3 @@ +person_id,concept_id,value,unit,age +P01,LAB_TEST,10,foo,20 +P02,LAB_TEST,10,foo,10 diff --git a/tests/testthat/test-examples.R b/tests/testthat/test-examples.R index 5a7f231..bca99d3 100644 --- a/tests/testthat/test-examples.R +++ b/tests/testthat/test-examples.R @@ -4,7 +4,7 @@ library(testthat) base_path <- function(...) test_path("data", ...) -run_example <- function(ex) { +run_example <- function(ex, datasource = "") { input_dir <- base_path(ex, "i_input") gt_dir <- base_path(ex, "i_ground_truth") @@ -17,7 +17,8 @@ run_example <- function(ex) { dataset = dataset, lab_target_units = path_target_units, lab_unit_conversion = path_unit_conversion, - lab_thresholds = path_thresholds + lab_thresholds = path_thresholds, + datasource = datasource ) gt <- fread(file.path(gt_dir, "dataset_cleaned_lab_values.csv")) @@ -27,8 +28,14 @@ run_example <- function(ex) { all.equal(cleaned, gt, check.attributes = FALSE) } -for (i in 1:4) { - test_that(paste("Example", i, "matches ground truth"), { - expect_true(isTRUE(run_example(paste("Example", i)))) + +example_cases <- data.table( + example = paste("Example", 1:6), + datasource = c("", "", "", "", "DS_A", "") +) + +for (case_idx in seq_len(nrow(example_cases))) { + test_that(paste(example_cases$example[case_idx], "matches ground truth"), { + expect_true(isTRUE(run_example(example_cases$example[case_idx], example_cases$datasource[case_idx]))) }) } diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R index ddb9d09..c937357 100644 --- a/tests/testthat/test-validation.R +++ b/tests/testthat/test-validation.R @@ -28,28 +28,3 @@ test_that("Example 4 with wrong unit conversion does not match ground truth", { expect_false(isTRUE(all.equal(cleaned, gt, check.attributes = FALSE))) }) - -test_that("Example 5 with specified datasource matches ground truth", { - input_dir <- test_path("data", "Example 5", "i_input") - gt_dir <- test_path("data", "Example 5", "i_ground_truth") - - dataset <- fread(file.path(input_dir, "dataset_lab_values.csv")) - path_target_units <- file.path(input_dir, "LAB_target_units.csv") - path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion.csv") - path_thresholds <- file.path(input_dir, "LAB_threshold.csv") - - cleaned <- CleanLabValuesDataset( - dataset = dataset, - lab_target_units = path_target_units, - lab_unit_conversion = path_unit_conversion, - lab_thresholds = path_thresholds, - datasource = "DS_A" - ) - - gt <- fread(file.path(gt_dir, "dataset_cleaned_lab_values.csv")) - - setorder(cleaned, person_id, concept_id) - setorder(gt, person_id, concept_id) - - expect_true(isTRUE(all.equal(cleaned, gt, check.attributes = FALSE))) -})