From 5d328a9fd1aca9f81ce8d1121fafdd7bec7701e7 Mon Sep 17 00:00:00 2001 From: YMao-UMCU <159454837+YMao-UMCU@users.noreply.github.com> Date: Thu, 11 Jun 2026 18:35:50 +0200 Subject: [PATCH 1/5] fix NA unit case --- R/check_metadata.R | 104 ++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 53 deletions(-) diff --git a/R/check_metadata.R b/R/check_metadata.R index 984e47a..dfb7061 100644 --- a/R/check_metadata.R +++ b/R/check_metadata.R @@ -58,7 +58,7 @@ check_dataset_model <- function(dataset) { # num_nonnumeric <- sum(invalid_) # if (num_nonnumeric > 0) { # stop(paste("The file 'dataset' has", num_nonnumeric,"record(s) where the variable ", varname," contains a non-numeric value.")) - # } + # } # } message("[CleanLabValues] Dataset model check passed successfully.") } @@ -87,7 +87,6 @@ check_lab_target_units <- function(lab_target_units) { invisible(dt) } - ####################################################### # lab_unit_conversion @@ -114,24 +113,24 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal stop(paste("The file", lab_unit_conversion, "should include the variable", varname, "in its data model")) } } - - # check that whenever multiplication_factor_from_origin_to_target is missing in at least one row, then the variable conversion_not_multiplication exists, and that there is no row where both variables are missing + + # check that whenever multiplication_factor_from_origin_to_target is missing in at least one row, then the variable conversion_not_multiplication exists, and that there is no row where both variables are missing mult_raw <- dt[["multiplication_factor_from_origin_to_target"]] mult_chr <- trimws(as.character(mult_raw)) is_missing_mult <- is.na(mult_raw) | mult_chr == "" nummissing_mult <- sum(is_missing_mult) - + mult_num <- suppressWarnings(as.numeric(mult_chr)) invalid_mult <- !is_missing_mult & is.na(mult_num) num_nonnumeric <- sum(invalid_mult) if (num_nonnumeric > 0) { - stop(paste("The file", lab_unit_conversion, "has", num_nonnumeric,"record(s) where the conversion method multiplication_factor_from_origin_to_target contains a non-numeric value.")) - } - + stop(paste("The file", lab_unit_conversion, "has", num_nonnumeric, "record(s) where the conversion method multiplication_factor_from_origin_to_target contains a non-numeric value.")) + } + varconv <- "conversion_not_multiplication" exists_conv <- varconv %in% names(dt) - + if (exists_conv) { conv_raw <- dt[[varconv]] conv_chr <- trimws(as.character(conv_raw)) @@ -139,23 +138,22 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal # count rows where both conversion methods are missing both_present_mult_and_conv <- !is_missing_mult & !is_missing_conv numboth_present_mult_and_conv <- sum(both_present_mult_and_conv) - #count rows where both conversion methods are included + # count rows where both conversion methods are included both_missing_mult_and_conv <- is_missing_mult & is_missing_conv nummissing_mult_and_conv <- sum(both_missing_mult_and_conv) - }else{ + } else { # conv_raw <- dt[["conversion_not_multiplication"]] # conv_chr <- trimws(as.character(conv_raw)) # is_missing_conv <- is.na(conv_raw) | conv_chr == "" } - + if (nummissing_mult > 0) { if (!exists_conv) { - stop(paste("The file", lab_unit_conversion, "has", nummissing_mult,"record(s) where the concversion method multiplication_factor_from_origin_to_target is missing. This can only happen if there is an alternative conversion method stored in a variable named", varconv, ".")) - }else{ - + stop(paste("The file", lab_unit_conversion, "has", nummissing_mult, "record(s) where the concversion method multiplication_factor_from_origin_to_target is missing. This can only happen if there is an alternative conversion method stored in a variable named", varconv, ".")) + } else { # Check 1: no row should have both conversion methods missing. - + if (nummissing_mult_and_conv > 0) { stop( "There are ", @@ -164,9 +162,9 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal "and conversion_not_multiplication are missing." ) } - + # Check 2: no row should have both conversion methods non-missing. - + if (numboth_present_mult_and_conv > 0) { stop( "In the file ", lab_unit_conversion, " there are ", @@ -177,9 +175,9 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal ) } } - }else{ + } else { # if multiplication_factor_from_origin_to_target does not have missing values, we still need to check that there is no conflicting information on the conversion - if (exists_conv) { + if (exists_conv) { if (numboth_present_mult_and_conv > 0) { stop( "In the file", lab_unit_conversion, "there are ", @@ -191,52 +189,49 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal } } } - + # If the alternative conversion method exists or the condition_on_value exists, check that it contains valid R expressions using dataset$value. for (varconv in c("conversion_not_multiplication", "condition_on_value")) { if (varconv %in% names(dt)) { - conv_raw <- dt[[varconv]] - + if (is.character(conv_raw)) { conv_chr <- trimws(conv_raw) } else { conv_chr <- trimws(as.character(conv_raw)) } - + is_missing_conv <- is.na(conv_raw) | conv_chr == "" - + has_nonempty_conv <- any(!is_missing_conv) - + if (has_nonempty_conv) { - if (!is.character(conv_raw)) { stop( - "The column ", varconv ," of the file ", lab_unit_conversion, " must be character." + "The column ", varconv, " of the file ", lab_unit_conversion, " must be character." ) } - + rows_with_conv <- which(!is_missing_conv) - + bad_parse_conv <- rep(FALSE, nrow(dt)) - bad_eval_conv <- rep(FALSE, nrow(dt)) + bad_eval_conv <- rep(FALSE, nrow(dt)) eval_error_conv <- rep(NA_character_, nrow(dt)) - + for (i in rows_with_conv) { - expr_txt <- conv_chr[i] - + parsed_expr <- tryCatch( parse(text = expr_txt), error = function(e) e ) - + if (inherits(parsed_expr, "error") || length(parsed_expr) != 1L) { bad_parse_conv[i] <- TRUE eval_error_conv[i] <- conditionMessage(parsed_expr) next } - + eval_result <- tryCatch( eval( parse(text = expr_txt), @@ -245,27 +240,27 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal ), error = function(e) e ) - + if (inherits(eval_result, "error")) { bad_eval_conv[i] <- TRUE eval_error_conv[i] <- conditionMessage(eval_result) } } - + numbad_parse_conv <- sum(bad_parse_conv) - numbad_eval_conv <- sum(bad_eval_conv) - + numbad_eval_conv <- sum(bad_eval_conv) + if (numbad_parse_conv > 0) { stop( - "The column ", varconv ," of the file ", lab_unit_conversion, " contains ", + "The column ", varconv, " of the file ", lab_unit_conversion, " contains ", numbad_parse_conv, " expression(s) that cannot be parsed as valid R expressions." ) } - + if (numbad_eval_conv > 0) { stop( - "The column ", varconv ," of the file ", lab_unit_conversion, " contains ", + "The column ", varconv, " of the file ", lab_unit_conversion, " contains ", numbad_eval_conv, " expression(s) that cannot be evaluated as R expressions involving a numeric variable named value." ) @@ -273,15 +268,19 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal } } } - + # if the non-mandatory argument 'datasource' is used, check that it corresponds to a variable in lab_unit_conversion if (datasource != "" & !("datasource" %in% names(dt))) { stop(paste("You specified the argument 'datasource' but in this case the file", lab_unit_conversion, "should include the variable 'datasource' in its data model, while it does not")) } for (variable in unique(c(list_analyses))) { if (nrow(dt[concept_id == variable, .(unit_target)]) > 0) { - if (trimws(unique(dt[concept_id == variable, .(unit_target)])) != target_unit[[variable]]) { - stop(paste("In the lab_unit_conversion, the concept_id", variable, "is described with target units that are inconsistent with the target unit assigned in the tab LAB_target_units, which is", target_unit[[variable]])) + target_unit_in_data <- trimws(unique(dt[concept_id == variable, .(unit_target)])) + target_unit_in_meta <- target_unit[[variable]] + if (!is.na(target_unit_in_data) & !is.na(target_unit_in_meta)) { + if (target_unit_in_data != target_unit_in_meta) { + stop(paste("In the lab_unit_conversion, the concept_id", variable, "is described with target units that are inconsistent with the target unit assigned in the tab LAB_target_units, which is", target_unit_in_meta)) + } } } } @@ -289,7 +288,6 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal invisible(dt) } - ##################################### # lab_thresholds @@ -313,20 +311,20 @@ check_lab_thresholds <- function(lab_thresholds, dataset) { stop(paste("The file", lab_thresholds, "should include the variable", varname, "in its data model")) } } - + for (varname in c("Min", "Max")) { vec_ <- dt[[varname]] is_num_ <- suppressWarnings(as.numeric(vec_)) invalid_ <- is.na(is_num_) num_nonnumeric <- sum(invalid_) - + if (num_nonnumeric > 0) { - stop(paste("The file", lab_thresholds, "has", num_nonnumeric,"record(s) where the variable ", varname," contains a non-numeric value.")) - } + stop(paste("The file", lab_thresholds, "has", num_nonnumeric, "record(s) where the variable ", varname, " contains a non-numeric value.")) + } } - - + + message("[CleanLabValues] LAB_thresholds check passed successfully.") invisible(dt) } From 7a4f0e4a4ac91e38026b9d9d683716d08ebdddd9 Mon Sep 17 00:00:00 2001 From: YMao-UMCU <159454837+YMao-UMCU@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:38:15 +0200 Subject: [PATCH 2/5] use logger --- DESCRIPTION | 3 ++- R/check_metadata.R | 8 ++++---- R/clean_lab_main.R | 2 ++ R/fill_missing_unit.R | 5 +++++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 96e2edc..9e8c3b4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,8 @@ Description: Ingests instructions to clean datasets containing results from License: AGPL-3 Encoding: UTF-8 Imports: - data.table + data.table, + logger Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/R/check_metadata.R b/R/check_metadata.R index dfb7061..5d760dc 100644 --- a/R/check_metadata.R +++ b/R/check_metadata.R @@ -60,7 +60,7 @@ check_dataset_model <- function(dataset) { # stop(paste("The file 'dataset' has", num_nonnumeric,"record(s) where the variable ", varname," contains a non-numeric value.")) # } # } - message("[CleanLabValues] Dataset model check passed successfully.") + logger::log_info("[CleanLabValues] Dataset model check passed successfully.") } ####################################################### @@ -83,7 +83,7 @@ check_lab_target_units <- function(lab_target_units) { if (!all(c("concept_id", "unit_target") %in% names(dt))) { stop(paste("The file", lab_target_units, "should be a csv file with data model concept_id,unit_target")) } - message("[CleanLabValues] LAB_target_units check passed successfully.") + logger::log_info("[CleanLabValues] LAB_target_units check passed successfully.") invisible(dt) } @@ -284,7 +284,7 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal } } } - message("[CleanLabValues] LAB_unit_conversion check passed successfully.") + logger::log_info("[CleanLabValues] LAB_unit_conversion check passed successfully.") invisible(dt) } @@ -325,6 +325,6 @@ check_lab_thresholds <- function(lab_thresholds, dataset) { - message("[CleanLabValues] LAB_thresholds check passed successfully.") + logger::log_info("[CleanLabValues] LAB_thresholds check passed successfully.") invisible(dt) } diff --git a/R/clean_lab_main.R b/R/clean_lab_main.R index 86c8f51..c9181fe 100644 --- a/R/clean_lab_main.R +++ b/R/clean_lab_main.R @@ -64,6 +64,7 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u meta_cid <- meta_unit_conv[concept_id == cid] target_unit_cid <- target_unit[[cid]] if (nrow(dt_cid) == 0) next + logger::log_info(paste0("[CleanLabValues] Processing concept_id ", cid, " with ", nrow(dt_cid), " row(s).")) # Step 2: Prepare unit_matched (unit_filled if present, else unit) dt_cid[, unit_matched := unit_filled] dt_cid[is.na(unit_matched) | unit_matched == "", unit_matched := target_unit_cid] @@ -132,6 +133,7 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u out_cols <- c(intersect(input_out_cols, names(dt_cid)), result_cols, ".order_id") out <- dt_cid[, ..out_cols] result_list[[cid]] <- out + logger::log_info(paste0("[CleanLabValues] Completed concept_id ", cid, ".")) } result <- data.table::rbindlist(result_list, fill = TRUE) # restore original ordering and remove temporary order column (drop explicitly) diff --git a/R/fill_missing_unit.R b/R/fill_missing_unit.R index b13cb8c..e18c9c9 100644 --- a/R/fill_missing_unit.R +++ b/R/fill_missing_unit.R @@ -18,9 +18,14 @@ fill_missing_unit <- function(dt, meta_unit_conv, target_unit, concept_id_col = for (cid in unique(dt[[concept_id_col]])) { assumed_unit <- meta_unit_conv[concept_id == cid & unit_origin == "MISSING" & is.na(condition_on_value), assumed_unit_if_missing] idx <- which(dt[[concept_id_col]] == cid & (dt[[unit_col]] == "" | is.na(dt[[unit_col]]))) + if (length(idx) == 0) { + next + } if (length(assumed_unit) > 0 && !is.na(assumed_unit[1])) { + logger::log_info(paste0("[CleanLabValues] Filling missing units for concept_id ", cid, " using assumed unit '", assumed_unit[1], "' on ", length(idx), " row(s).")) dt[idx, unit_filled := assumed_unit[1]] } else { + logger::log_info(paste0("[CleanLabValues] Filling missing units for concept_id ", cid, " using target unit '", target_unit[[cid]], "' on ", length(idx), " row(s).")) dt[idx, unit_filled := target_unit[[cid]]] } } From adabd5f157f81058e5e3be10bc935a14c7a43255 Mon Sep 17 00:00:00 2001 From: YMao-UMCU <159454837+YMao-UMCU@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:08:18 +0200 Subject: [PATCH 3/5] enhance for loop speed --- R/clean_lab_main.R | 47 ++++++++------ R/mo_convert.R | 159 ++++++++++++++++++++++++++++++++------------- 2 files changed, 143 insertions(+), 63 deletions(-) diff --git a/R/clean_lab_main.R b/R/clean_lab_main.R index c9181fe..9551bad 100644 --- a/R/clean_lab_main.R +++ b/R/clean_lab_main.R @@ -56,12 +56,23 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u # Step 1: Fill missing units dt <- fill_missing_unit(dt, meta_unit_conv, target_unit) + dt_cols <- names(dt) + dt_by_cid <- split(dt, by = "concept_id", keep.by = TRUE, sorted = FALSE) + meta_by_cid <- split(meta_unit_conv, by = "concept_id", keep.by = TRUE, sorted = FALSE) + thresholds_by_cid <- split(meta_thresholds, by = "concept_id", keep.by = TRUE, sorted = FALSE) + empty_meta_cid <- meta_unit_conv[0] + empty_thresholds_cid <- meta_thresholds[0] + input_out_cols <- input_cols + if ("value" %in% input_out_cols) input_out_cols[input_out_cols == "value"] <- "value_origin" + if ("unit" %in% input_out_cols) input_out_cols[input_out_cols == "unit"] <- "unit_origin" + # Prepare result list result_list <- list() - for (cid in unique(dt$concept_id)) { + for (cid in names(dt_by_cid)) { # Preserve all columns from the original input for thresholding (e.g., age) - dt_cid <- dt[concept_id == cid, .SD, .SDcols = names(dt)] - meta_cid <- meta_unit_conv[concept_id == cid] + dt_cid <- dt_by_cid[[cid]] + meta_cid <- meta_by_cid[[cid]] + if (is.null(meta_cid)) meta_cid <- empty_meta_cid target_unit_cid <- target_unit[[cid]] if (nrow(dt_cid) == 0) next logger::log_info(paste0("[CleanLabValues] Processing concept_id ", cid, " with ", nrow(dt_cid), " row(s).")) @@ -71,16 +82,16 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u dt_cid[, unit_target := target_unit_cid] # Mark rows with missing original unit dt_cid[, unit_missing := is.na(unit_origin) | unit_origin == ""] - # For missing unit, set unit_matched to assumed_unit_if_missing from metadata (per-row, not just first) + # For missing unit, set unit_matched to assumed_unit_if_missing when metadata provides one. if ("assumed_unit_if_missing" %in% names(meta_cid)) { - idx_missing <- which(dt_cid$unit_missing) - if (length(idx_missing) > 0) { - for (j in idx_missing) { - assumed_unit <- meta_cid[unit_target == dt_cid$unit_target[j] & !is.na(assumed_unit_if_missing) & assumed_unit_if_missing != "", assumed_unit_if_missing][1] - if (!is.na(assumed_unit) && assumed_unit != "") { - dt_cid$unit_matched[j] <- assumed_unit - } - } + assumed_unit <- meta_cid[ + unit_target == target_unit_cid & + !is.na(assumed_unit_if_missing) & + assumed_unit_if_missing != "", + assumed_unit_if_missing[1] + ] + if (length(assumed_unit) > 0 && !is.na(assumed_unit) && assumed_unit != "") { + dt_cid[unit_missing == TRUE, unit_matched := assumed_unit] } } @@ -98,8 +109,9 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u # Always ensure direct match row (unit_matched == unit_target, factor 1, next_attempt from existing rows) is present and first # Use the first available next_attempt for this concept_id and unit_target, or 0 if none direct_next_attempt <- 0 - if (nrow(meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)]) > 0) { - direct_next_attempt <- meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)][[1, "next_attempt"]] + next_attempt_rows <- meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)] + if (nrow(next_attempt_rows) > 0) { + direct_next_attempt <- next_attempt_rows[[1, "next_attempt"]] } direct_row <- data.table::data.table( concept_id = cid, @@ -110,7 +122,8 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u ) meta_cid_full <- rbind(direct_row, meta_cid_full, fill = TRUE) # Join thresholds on concept_id and unit_target only - meta_thresholds_cid <- meta_thresholds[concept_id == cid] + meta_thresholds_cid <- thresholds_by_cid[[cid]] + if (is.null(meta_thresholds_cid)) meta_thresholds_cid <- empty_thresholds_cid meta_cid_full <- merge(meta_cid_full, meta_thresholds_cid, by = c("concept_id", "unit_target"), all.x = TRUE) # For compatibility with mo_convert, rename unit_origin to unit_matched setnames(meta_cid_full, "unit_origin", "unit_matched", skip_absent = TRUE) @@ -126,10 +139,6 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u # but expose the original measurement as `value_origin`/`unit_origin` (these are created above), # then append the cleaning result columns. result_cols <- c("included", "value", "unit_target", "conversion", "rule_applied") - # map input column names to output names: original `value` -> `value_origin`, `unit` -> `unit_origin` - input_out_cols <- input_cols - if ("value" %in% input_out_cols) input_out_cols[input_out_cols == "value"] <- "value_origin" - if ("unit" %in% input_out_cols) input_out_cols[input_out_cols == "unit"] <- "unit_origin" out_cols <- c(intersect(input_out_cols, names(dt_cid)), result_cols, ".order_id") out <- dt_cid[, ..out_cols] result_list[[cid]] <- out diff --git a/R/mo_convert.R b/R/mo_convert.R index 8c33636..963e85a 100644 --- a/R/mo_convert.R +++ b/R/mo_convert.R @@ -70,13 +70,13 @@ NULL # Small, top-level helpers to keep `mo_convert()` concise. They operate by # modifying `dat` by reference (using `i`) and return success flags and # attempt counts where appropriate. -.mo_try_direct_matches <- function(dat, i, attempts, unit_origin, target, val_raw) { +.mo_try_direct_matches <- function(dat, i, attempts, attempt_unit_matched, unit_origin, target, val_raw) { if (nrow(attempts) > 0 && !is.na(unit_origin)) { - direct <- attempts[unit_matched == unit_origin] - if (nrow(direct) > 0) { - for (r in seq_len(nrow(direct))) { + direct_idx <- which(attempt_unit_matched == unit_origin) + if (length(direct_idx) > 0) { + for (r in direct_idx) { dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - res <- .mo_eval_attempt(direct[r], val_raw, dat, i) + res <- .mo_eval_attempt(attempts[r], val_raw, dat, i) if (isTRUE(res$success)) { dat[i, `:=`( included = 1L, @@ -92,27 +92,29 @@ NULL FALSE } -.mo_try_prefilled_assumed <- function(dat, i, attempts, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) { +.mo_try_prefilled_assumed <- function(dat, i, attempts, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) { # Returns list(success=TRUE/FALSE, tried=0/1) if (is.na(row_unit_matched) || nrow(attempts) == 0) { return(list(success = FALSE, tried = 0L)) } - assumed_list <- unique(na.omit(tolower(trimws(as.character(attempts[unit_matched == "missing" & (is.na(condition_on_value) | condition_on_value == "")]$assumed_unit_if_missing))))) + assumed_idx <- which(attempt_unit_matched == "missing" & (is.na(attempts$condition_on_value) | attempts$condition_on_value == "")) + assumed_list <- unique(na.omit(tolower(trimws(as.character(attempts$assumed_unit_if_missing[assumed_idx]))))) if (!(row_unit_matched %in% assumed_list)) { return(list(success = FALSE, tried = 0L)) } - assumed_rows <- attempts[unit_matched == row_unit_matched] - if (nrow(assumed_rows) == 0) { + assumed_rows_idx <- which(attempt_unit_matched == row_unit_matched) + if (length(assumed_rows_idx) == 0) { return(list(success = FALSE, tried = 0L)) } dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - res <- .mo_eval_attempt(assumed_rows[1], val_raw, dat, i) + assumed_row <- attempts[assumed_rows_idx[1]] + res <- .mo_eval_attempt(assumed_row, val_raw, dat, i) if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(assumed_rows[1]$multiplication_factor_from_origin_to_target)) + factor_try <- suppressWarnings(as.numeric(assumed_row$multiplication_factor_from_origin_to_target)) if (!is.na(factor_try) && factor_try == 1) { rp <- 0L } else { - rp <- if (!is.na(assumed_rows[1]$next_attempt) && assumed_rows[1]$next_attempt > 0) as.integer(assumed_rows[1]$next_attempt) else 1L + rp <- if (!is.na(assumed_row$next_attempt) && assumed_row$next_attempt > 0) as.integer(assumed_row$next_attempt) else 1L } conv_code <- conv_success if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L @@ -122,14 +124,15 @@ NULL list(success = FALSE, tried = 1L) } -.mo_try_missing_chain <- function(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = NA_character_) { +.mo_try_missing_chain <- function(dat, i, attempts, attempt_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = NA_character_) { # Returns list(success=TRUE/FALSE, tried=number_of_attempts_tried) tried <- 0L # explicit next_attempt chain (>0) - miss_attempts <- attempts[!is.na(next_attempt) & next_attempt > 0][order(next_attempt)] - if (nrow(miss_attempts) > 0) { - for (r in seq_len(nrow(miss_attempts))) { - attempt_row <- miss_attempts[r] + miss_attempts_idx <- which(!is.na(attempts$next_attempt) & attempts$next_attempt > 0) + if (length(miss_attempts_idx) > 0) { + miss_attempts_idx <- miss_attempts_idx[order(attempts$next_attempt[miss_attempts_idx])] + for (r in miss_attempts_idx) { + attempt_row <- attempts[r] res <- .mo_eval_attempt(attempt_row, val_raw, dat, i) if (isTRUE(res$attempted)) { dat[i, n_conversion_attempts := n_conversion_attempts + 1L] @@ -150,17 +153,17 @@ NULL } } # rows labelled as missing - missing_rows <- attempts[unit_matched == "missing"] - if (!is.na(skip_assumed_unit) && nrow(missing_rows) > 0 && "assumed_unit_if_missing" %in% names(missing_rows)) { - missing_rows <- missing_rows[ - is.na(next_attempt) | - next_attempt != 0 | - .mo_norm(assumed_unit_if_missing) != skip_assumed_unit + missing_idx <- which(attempt_unit_matched == "missing") + if (!is.na(skip_assumed_unit) && length(missing_idx) > 0 && "assumed_unit_if_missing" %in% names(attempts)) { + missing_idx <- missing_idx[ + is.na(attempts$next_attempt[missing_idx]) | + attempts$next_attempt[missing_idx] != 0 | + .mo_norm(attempts$assumed_unit_if_missing[missing_idx]) != skip_assumed_unit ] } - if (nrow(missing_rows) > 0) { - for (r in seq_len(nrow(missing_rows))) { - attempt_row <- missing_rows[r] + if (length(missing_idx) > 0) { + for (r in missing_idx) { + attempt_row <- attempts[r] res <- .mo_eval_attempt(attempt_row, val_raw, dat, i) if (isTRUE(res$attempted)) { dat[i, n_conversion_attempts := n_conversion_attempts + 1L] @@ -234,6 +237,32 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { if (!"next_attempt" %in% names(meta)) meta[, next_attempt := NA_integer_] if (!"Min" %in% names(meta)) meta[, Min := NA_real_] if (!"Max" %in% names(meta)) meta[, Max := NA_real_] + if (!"conversion_not_multiplication" %in% names(meta)) meta[, conversion_not_multiplication := NA_character_] + if (!"condition_on_value" %in% names(meta)) meta[, condition_on_value := NA_character_] + if (!"condition_on_variable" %in% names(meta)) meta[, condition_on_variable := NA_character_] + meta[, unit_matched := ifelse(is.na(unit_matched) | unit_matched == "", NA_character_, .mo_norm(unit_matched))] + meta[, .meta_key := paste(concept_id, unit_target, sep = "\r")] + meta_by_key <- split(meta, by = ".meta_key", keep.by = FALSE, sorted = FALSE) + empty_attempts <- meta[0] + simple_direct_meta <- meta[ + !is.na(unit_matched) & + unit_matched != "missing" & + (is.na(conversion_not_multiplication) | conversion_not_multiplication == "") & + (is.na(condition_on_value) | condition_on_value == "") & + (is.na(condition_on_variable) | condition_on_variable == "") + ] + if (nrow(simple_direct_meta) > 0) { + simple_direct_meta[, factor_num := suppressWarnings(as.numeric(multiplication_factor_from_origin_to_target))] + simple_direct_meta <- simple_direct_meta[ + !is.na(factor_num) & + !is.na(Min) & + !is.na(Max) + ] + if (nrow(simple_direct_meta) > 0) { + simple_direct_meta[, .direct_key := paste(.meta_key, unit_matched, sep = "\r")] + simple_direct_meta <- simple_direct_meta[, if (.N == 1L) .SD, by = .direct_key] + } + } # Use internal helpers @@ -247,48 +276,90 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { # local alias helpers for readability norm <- .mo_norm + cid_vec <- dat$concept_id + target_vec <- dat$unit_target + val_raw_vec <- suppressWarnings(as.numeric(dat$value)) + unit_origin_vec <- ifelse(is.na(dat$unit_origin) | dat$unit_origin == "", NA_character_, norm(dat$unit_origin)) + unit_missing_vec <- if ("unit_missing" %in% names(dat)) { + !is.na(dat$unit_missing) & dat$unit_missing + } else { + is.na(unit_origin_vec) + } + row_unit_matched_vec <- if ("unit_matched" %in% names(dat)) { + ifelse(is.na(dat$unit_matched) | dat$unit_matched == "", NA_character_, norm(dat$unit_matched)) + } else { + rep(NA_character_, nrow(dat)) + } + meta_key_vec <- paste(cid_vec, target_vec, sep = "\r") + direct_attempt_done <- rep(FALSE, nrow(dat)) + + if (exists("simple_direct_meta") && nrow(simple_direct_meta) > 0) { + direct_key_vec <- paste(meta_key_vec, unit_origin_vec, sep = "\r") + direct_meta_pos <- match(direct_key_vec, simple_direct_meta$.direct_key) + fast_direct_idx <- which(!is.na(direct_meta_pos) & !is.na(unit_origin_vec)) + if (length(fast_direct_idx) > 0) { + direct_attempt_done[fast_direct_idx] <- TRUE + dat[fast_direct_idx, n_conversion_attempts := n_conversion_attempts + 1L] + meta_rows <- simple_direct_meta[direct_meta_pos[fast_direct_idx]] + val_conv_fast <- val_raw_vec[fast_direct_idx] * meta_rows$factor_num + ok_fast <- !is.na(val_conv_fast) & + val_conv_fast >= meta_rows$Min & + val_conv_fast <= meta_rows$Max + success_idx <- fast_direct_idx[ok_fast] + if (length(success_idx) > 0) { + success_conv <- val_conv_fast[ok_fast] + success_is_same_unit <- unit_origin_vec[success_idx] == target_vec[success_idx] + dat[success_idx, `:=`( + included = 1L, + value_converted = success_conv, + conversion = ifelse(success_is_same_unit, 0L, 1L), + rule_applied = ifelse(success_is_same_unit, 0L, 1L) + )] + } + } + } for (i in seq_len(nrow(dat))) { - row <- dat[i] - cid <- row$concept_id - target <- row$unit_target - val_raw <- suppressWarnings(as.numeric(row$value)) - unit_origin <- ifelse(is.na(row$unit_origin) || row$unit_origin == "", NA_character_, norm(row$unit_origin)) - unit_missing_flag <- if ("unit_missing" %in% names(dat)) isTRUE(row$unit_missing) else is.na(unit_origin) + if (!is.na(dat$included[i])) next + + cid <- cid_vec[i] + target <- target_vec[i] + val_raw <- val_raw_vec[i] + unit_origin <- unit_origin_vec[i] + unit_missing_flag <- unit_missing_vec[i] - attempts <- meta[concept_id == cid & unit_target == target] - if (nrow(attempts) > 0) attempts[, unit_matched := norm(ifelse(is.na(unit_matched) | unit_matched == "", unit_origin, unit_matched))] + attempts <- meta_by_key[[meta_key_vec[i]]] + if (is.null(attempts)) attempts <- empty_attempts + attempt_unit_matched <- if (nrow(attempts) > 0) ifelse(is.na(attempts$unit_matched), unit_origin, attempts$unit_matched) else character(0) # Identify 'OTHER' origin: unit present but not listed among attempts - origin_other <- !is.na(unit_origin) && nrow(attempts) > 0 && !(unit_origin %in% attempts$unit_matched) + origin_other <- !is.na(unit_origin) && nrow(attempts) > 0 && !(unit_origin %in% attempt_unit_matched) # 1) Try direct matches (unit_origin equals unit_matched) - if (.mo_try_direct_matches(dat, i, attempts, unit_origin, target, val_raw)) next + if (!direct_attempt_done[i] && .mo_try_direct_matches(dat, i, attempts, attempt_unit_matched, unit_origin, target, val_raw)) next # If origin is 'OTHER' (present but not listed), treat it like MISSING for # conversion attempts (but use conversion=1 on success). Otherwise, for # genuinely missing units use conversion=3 on success. missing_attempts_tried <- 0L if (origin_other) { - row_unit_matched <- NA_character_ + row_unit_matched <- row_unit_matched_vec[i] pref_res <- list(success = FALSE, tried = 0L) - if ("unit_matched" %in% names(dat)) row_unit_matched <- ifelse(is.na(dat$unit_matched[i]) || dat$unit_matched[i] == "", NA_character_, norm(dat$unit_matched[i])) if (!is.na(row_unit_matched) && nrow(attempts) > 0) { - pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, row_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE) + pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE) missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_ - miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 1L, other_flow = TRUE, skip_assumed_unit = skip_assumed_unit) + miss_res <- .mo_try_missing_chain(dat, i, attempts, attempt_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE, skip_assumed_unit = skip_assumed_unit) missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } else { if (unit_missing_flag) { - row_unit_matched <- NA_character_ - if ("unit_matched" %in% names(dat)) row_unit_matched <- ifelse(is.na(dat$unit_matched[i]) || dat$unit_matched[i] == "", NA_character_, norm(dat$unit_matched[i])) + row_unit_matched <- row_unit_matched_vec[i] pref_res <- list(success = FALSE, tried = 0L) if (!is.na(row_unit_matched) && nrow(attempts) > 0) { - pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) + pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } @@ -296,7 +367,7 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { # (i.e., prefill did not run, or there are chained attempts with next_attempt > 0 to pursue). if (nrow(attempts) > 0 && (pref_res$tried == 0L || nrow(attempts[!is.na(next_attempt) & next_attempt > 0L]) > 0L)) { skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_ - miss_res <- .mo_try_missing_chain(dat, i, attempts, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = skip_assumed_unit) + miss_res <- .mo_try_missing_chain(dat, i, attempts, attempt_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = skip_assumed_unit) missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } From 7a76825590c1fe78348b9c1b57a44b3fb0072236 Mon Sep 17 00:00:00 2001 From: YMao-UMCU <159454837+YMao-UMCU@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:39:03 +0200 Subject: [PATCH 4/5] enhance loop --- R/mo_convert.R | 287 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 207 insertions(+), 80 deletions(-) diff --git a/R/mo_convert.R b/R/mo_convert.R index 963e85a..c559328 100644 --- a/R/mo_convert.R +++ b/R/mo_convert.R @@ -19,41 +19,98 @@ NULL NA } -.mo_eval_attempt <- function(attempt_row, val_raw, dat, row_idx) { +.mo_parse_optional_expr <- function(expr_text, default_expr = quote(NA_real_)) { + if (is.null(expr_text) || is.na(expr_text) || expr_text == "") { + return(default_expr) + } + parse(text = expr_text)[[1]] +} + +.mo_prepare_attempt_bundle <- function(attempts) { + n_attempts <- nrow(attempts) + if (n_attempts == 0) { + return(list( + n = 0L, + unit_matched = character(0), + next_attempt = integer(0), + factor_num = numeric(0), + expr_parsed = list(), + minv = numeric(0), + maxv = numeric(0), + condition_on_value = character(0), + cond_value_parsed = list(), + condition_on_variable = character(0), + cond_var_parsed = list(), + variable = character(0), + assumed_unit_if_missing = character(0) + )) + } + + list( + n = n_attempts, + unit_matched = attempts$unit_matched, + next_attempt = attempts$next_attempt, + factor_num = suppressWarnings(as.numeric(attempts$multiplication_factor_from_origin_to_target)), + expr_parsed = lapply(attempts$conversion_not_multiplication, .mo_parse_optional_expr), + minv = suppressWarnings(as.numeric(attempts$Min)), + maxv = suppressWarnings(as.numeric(attempts$Max)), + condition_on_value = attempts$condition_on_value, + cond_value_parsed = lapply(attempts$condition_on_value, .mo_parse_optional_expr, default_expr = quote(TRUE)), + condition_on_variable = attempts$condition_on_variable, + cond_var_parsed = lapply(attempts$condition_on_variable, .mo_parse_optional_expr, default_expr = quote(TRUE)), + variable = if ("variable" %in% names(attempts)) as.character(attempts$variable) else rep(NA_character_, n_attempts), + assumed_unit_if_missing = if ("assumed_unit_if_missing" %in% names(attempts)) as.character(attempts$assumed_unit_if_missing) else rep(NA_character_, n_attempts) + ) +} + +.mo_eval_vectorized_values <- function(values, meta_rows, meta_pos = NULL) { + out <- rep(NA_real_, length(values)) + factor_num <- meta_rows$factor_num + idx_factor <- which(!is.na(factor_num)) + if (length(idx_factor) > 0) { + out[idx_factor] <- values[idx_factor] * factor_num[idx_factor] + } + idx_expr <- which(is.na(factor_num)) + if (length(idx_expr) > 0) { + if (is.null(meta_pos)) { + meta_pos <- seq_len(nrow(meta_rows)) + } + expr_groups <- split(idx_expr, meta_pos[idx_expr]) + for (group_idx in expr_groups) { + expr_parsed <- meta_rows$expr_parsed[[group_idx[1]]] + out[group_idx] <- eval(expr_parsed, envir = list(value = values[group_idx])) + } + } + out +} + +.mo_eval_attempt <- function(attempt_bundle, attempt_idx, val_raw, dat, row_idx) { # Returns list(success=TRUE/FALSE, val_conv=?, attempted=TRUE/FALSE) - # factor <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) - # if (is.na(factor) || is.na(val_raw)) { - # return(list(success = FALSE, attempted = FALSE)) - # } - # val_conv <- val_raw * factor - factor <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) - if (!is.na(factor) ) { + factor <- attempt_bundle$factor_num[attempt_idx] + if (!is.na(factor)) { val_conv <- val_raw * factor - }else{ - expression_tp_eval <- attempt_row$conversion_not_multiplication - expression_tp_eval <- gsub("value", as.character(val_raw), expression_tp_eval) -# print(expression_tp_eval) - val_conv <- eval( parse( text = expression_tp_eval)) + } else { + val_conv <- eval(attempt_bundle$expr_parsed[[attempt_idx]], envir = list(value = val_raw)) } - minv <- suppressWarnings(as.numeric(attempt_row$Min)) - maxv <- suppressWarnings(as.numeric(attempt_row$Max)) - cond_val_expr <- attempt_row$condition_on_value + minv <- attempt_bundle$minv[attempt_idx] + maxv <- attempt_bundle$maxv[attempt_idx] + cond_val_expr <- attempt_bundle$condition_on_value[attempt_idx] if (!is.null(cond_val_expr) && !is.na(cond_val_expr) && cond_val_expr != "") { - expr <- gsub("value", as.character(val_raw), cond_val_expr) - cond_ok <- tryCatch(isTRUE(eval(parse(text = expr))), error = function(e) FALSE) + cond_ok <- tryCatch(isTRUE(eval(attempt_bundle$cond_value_parsed[[attempt_idx]], envir = list(value = val_raw))), error = function(e) FALSE) if (!isTRUE(cond_ok)) { return(list(success = FALSE, attempted = FALSE)) } } else { - cond_var <- attempt_row$condition_on_variable + cond_var <- attempt_bundle$condition_on_variable[attempt_idx] if (!is.null(cond_var) && !is.na(cond_var) && cond_var != "") { - varname <- if (!is.null(attempt_row$variable) && !is.na(attempt_row$variable) && attempt_row$variable != "") attempt_row$variable else NULL + varname <- attempt_bundle$variable[attempt_idx] + if (!is.null(varname) && !is.na(varname) && varname == "") varname <- NULL cond_val <- .mo_get_var_value(dat, row_idx, varname) if (is.na(cond_val)) { return(list(success = FALSE, attempted = FALSE)) } - expr <- gsub(varname, as.character(cond_val), cond_var) - cond_ok <- tryCatch(isTRUE(eval(parse(text = expr))), error = function(e) FALSE) + cond_env <- setNames(list(cond_val), varname) + cond_ok <- tryCatch(isTRUE(eval(attempt_bundle$cond_var_parsed[[attempt_idx]], envir = cond_env)), error = function(e) FALSE) if (!isTRUE(cond_ok)) { return(list(success = FALSE, attempted = FALSE)) } @@ -70,13 +127,13 @@ NULL # Small, top-level helpers to keep `mo_convert()` concise. They operate by # modifying `dat` by reference (using `i`) and return success flags and # attempt counts where appropriate. -.mo_try_direct_matches <- function(dat, i, attempts, attempt_unit_matched, unit_origin, target, val_raw) { - if (nrow(attempts) > 0 && !is.na(unit_origin)) { +.mo_try_direct_matches <- function(dat, i, attempt_bundle, attempt_unit_matched, unit_origin, target, val_raw) { + if (attempt_bundle$n > 0 && !is.na(unit_origin)) { direct_idx <- which(attempt_unit_matched == unit_origin) if (length(direct_idx) > 0) { for (r in direct_idx) { dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - res <- .mo_eval_attempt(attempts[r], val_raw, dat, i) + res <- .mo_eval_attempt(attempt_bundle, r, val_raw, dat, i) if (isTRUE(res$success)) { dat[i, `:=`( included = 1L, @@ -92,13 +149,13 @@ NULL FALSE } -.mo_try_prefilled_assumed <- function(dat, i, attempts, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) { +.mo_try_prefilled_assumed <- function(dat, i, attempt_bundle, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) { # Returns list(success=TRUE/FALSE, tried=0/1) - if (is.na(row_unit_matched) || nrow(attempts) == 0) { + if (is.na(row_unit_matched) || attempt_bundle$n == 0) { return(list(success = FALSE, tried = 0L)) } - assumed_idx <- which(attempt_unit_matched == "missing" & (is.na(attempts$condition_on_value) | attempts$condition_on_value == "")) - assumed_list <- unique(na.omit(tolower(trimws(as.character(attempts$assumed_unit_if_missing[assumed_idx]))))) + assumed_idx <- which(attempt_unit_matched == "missing" & (is.na(attempt_bundle$condition_on_value) | attempt_bundle$condition_on_value == "")) + assumed_list <- unique(na.omit(tolower(trimws(as.character(attempt_bundle$assumed_unit_if_missing[assumed_idx]))))) if (!(row_unit_matched %in% assumed_list)) { return(list(success = FALSE, tried = 0L)) } @@ -107,14 +164,14 @@ NULL return(list(success = FALSE, tried = 0L)) } dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - assumed_row <- attempts[assumed_rows_idx[1]] - res <- .mo_eval_attempt(assumed_row, val_raw, dat, i) + assumed_idx_first <- assumed_rows_idx[1] + res <- .mo_eval_attempt(attempt_bundle, assumed_idx_first, val_raw, dat, i) if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(assumed_row$multiplication_factor_from_origin_to_target)) + factor_try <- attempt_bundle$factor_num[assumed_idx_first] if (!is.na(factor_try) && factor_try == 1) { rp <- 0L } else { - rp <- if (!is.na(assumed_row$next_attempt) && assumed_row$next_attempt > 0) as.integer(assumed_row$next_attempt) else 1L + rp <- if (!is.na(attempt_bundle$next_attempt[assumed_idx_first]) && attempt_bundle$next_attempt[assumed_idx_first] > 0) as.integer(attempt_bundle$next_attempt[assumed_idx_first]) else 1L } conv_code <- conv_success if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L @@ -124,26 +181,25 @@ NULL list(success = FALSE, tried = 1L) } -.mo_try_missing_chain <- function(dat, i, attempts, attempt_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = NA_character_) { +.mo_try_missing_chain <- function(dat, i, attempt_bundle, attempt_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = NA_character_) { # Returns list(success=TRUE/FALSE, tried=number_of_attempts_tried) tried <- 0L # explicit next_attempt chain (>0) - miss_attempts_idx <- which(!is.na(attempts$next_attempt) & attempts$next_attempt > 0) + miss_attempts_idx <- which(!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0) if (length(miss_attempts_idx) > 0) { - miss_attempts_idx <- miss_attempts_idx[order(attempts$next_attempt[miss_attempts_idx])] + miss_attempts_idx <- miss_attempts_idx[order(attempt_bundle$next_attempt[miss_attempts_idx])] for (r in miss_attempts_idx) { - attempt_row <- attempts[r] - res <- .mo_eval_attempt(attempt_row, val_raw, dat, i) + res <- .mo_eval_attempt(attempt_bundle, r, val_raw, dat, i) if (isTRUE(res$attempted)) { dat[i, n_conversion_attempts := n_conversion_attempts + 1L] tried <- tried + 1L } if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) + factor_try <- attempt_bundle$factor_num[r] if (!is.na(factor_try) && factor_try == 1) { rp <- 0L } else { - rp <- if (!is.na(attempt_row$next_attempt) && attempt_row$next_attempt > 0) as.integer(attempt_row$next_attempt) else 1L + rp <- if (!is.na(attempt_bundle$next_attempt[r]) && attempt_bundle$next_attempt[r] > 0) as.integer(attempt_bundle$next_attempt[r]) else 1L } conv_code <- conv_success if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L @@ -154,27 +210,26 @@ NULL } # rows labelled as missing missing_idx <- which(attempt_unit_matched == "missing") - if (!is.na(skip_assumed_unit) && length(missing_idx) > 0 && "assumed_unit_if_missing" %in% names(attempts)) { + if (!is.na(skip_assumed_unit) && length(missing_idx) > 0) { missing_idx <- missing_idx[ - is.na(attempts$next_attempt[missing_idx]) | - attempts$next_attempt[missing_idx] != 0 | - .mo_norm(attempts$assumed_unit_if_missing[missing_idx]) != skip_assumed_unit + is.na(attempt_bundle$next_attempt[missing_idx]) | + attempt_bundle$next_attempt[missing_idx] != 0 | + .mo_norm(attempt_bundle$assumed_unit_if_missing[missing_idx]) != skip_assumed_unit ] } if (length(missing_idx) > 0) { for (r in missing_idx) { - attempt_row <- attempts[r] - res <- .mo_eval_attempt(attempt_row, val_raw, dat, i) + res <- .mo_eval_attempt(attempt_bundle, r, val_raw, dat, i) if (isTRUE(res$attempted)) { dat[i, n_conversion_attempts := n_conversion_attempts + 1L] tried <- tried + 1L } if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) + factor_try <- attempt_bundle$factor_num[r] if (!is.na(factor_try) && factor_try == 1) { rp <- 0L } else { - rp <- if (!is.na(attempt_row$next_attempt) && attempt_row$next_attempt > 0) as.integer(attempt_row$next_attempt) else 1L + rp <- if (!is.na(attempt_bundle$next_attempt[r]) && attempt_bundle$next_attempt[r] > 0) as.integer(attempt_bundle$next_attempt[r]) else 1L } conv_code <- conv_success if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L @@ -186,22 +241,21 @@ NULL list(success = FALSE, tried = tried) } -.mo_try_fallbacks <- function(dat, i, attempts, val_raw, conv_success = 1L, other_flow = FALSE) { - if (nrow(attempts) <= 0) { +.mo_try_fallbacks <- function(dat, i, attempt_bundle, val_raw, conv_success = 1L, other_flow = FALSE) { + if (attempt_bundle$n <= 0) { return(FALSE) } - fallback_order <- unique(attempts$next_attempt[!is.na(attempts$next_attempt) & attempts$next_attempt > 0]) + fallback_order <- unique(attempt_bundle$next_attempt[!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0]) fallback_order <- sort(fallback_order) attempts_made <- 0L for (code in fallback_order) { - rows_idx <- which(attempts$next_attempt == code) + rows_idx <- which(attempt_bundle$next_attempt == code) for (j in rows_idx) { dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - attempt_row <- attempts[j] - res <- .mo_eval_attempt(attempt_row, val_raw, dat, i) + res <- .mo_eval_attempt(attempt_bundle, j, val_raw, dat, i) attempts_made <- attempts_made + 1L if (isTRUE(res$success)) { - factor_try <- suppressWarnings(as.numeric(attempt_row$multiplication_factor_from_origin_to_target)) + factor_try <- attempt_bundle$factor_num[j] conv_code <- conv_success if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = ifelse(attempts_made == 1L, 1L, 2L))] @@ -243,7 +297,9 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { meta[, unit_matched := ifelse(is.na(unit_matched) | unit_matched == "", NA_character_, .mo_norm(unit_matched))] meta[, .meta_key := paste(concept_id, unit_target, sep = "\r")] meta_by_key <- split(meta, by = ".meta_key", keep.by = FALSE, sorted = FALSE) + meta_bundle_by_key <- lapply(meta_by_key, .mo_prepare_attempt_bundle) empty_attempts <- meta[0] + empty_attempt_bundle <- .mo_prepare_attempt_bundle(empty_attempts) simple_direct_meta <- meta[ !is.na(unit_matched) & unit_matched != "missing" & @@ -253,8 +309,16 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { ] if (nrow(simple_direct_meta) > 0) { simple_direct_meta[, factor_num := suppressWarnings(as.numeric(multiplication_factor_from_origin_to_target))] + simple_direct_meta[, has_expr := !is.na(conversion_not_multiplication) & conversion_not_multiplication != ""] + simple_direct_meta[, expr_parsed := lapply(conversion_not_multiplication, function(expr_text) { + if (is.na(expr_text) || expr_text == "") { + quote(NA_real_) + } else { + parse(text = expr_text)[[1]] + } + })] simple_direct_meta <- simple_direct_meta[ - !is.na(factor_num) & + (!is.na(factor_num) | has_expr) & !is.na(Min) & !is.na(Max) ] @@ -264,6 +328,23 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { } } + assumed_prefill_keys <- character(0) + if (nrow(meta) > 0) { + assumed_prefill_rows <- meta[ + unit_matched == "missing" & + (is.na(condition_on_value) | condition_on_value == "") & + !is.na(assumed_unit_if_missing) & + assumed_unit_if_missing != "" + ] + if (nrow(assumed_prefill_rows) > 0) { + assumed_prefill_keys <- unique(paste( + assumed_prefill_rows$.meta_key, + .mo_norm(assumed_prefill_rows$assumed_unit_if_missing), + sep = "\r" + )) + } + } + # Use internal helpers dat[, `:=`( @@ -292,6 +373,8 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { } meta_key_vec <- paste(cid_vec, target_vec, sep = "\r") direct_attempt_done <- rep(FALSE, nrow(dat)) + prefilled_attempt_done <- rep(FALSE, nrow(dat)) + prefilled_attempt_tried <- integer(nrow(dat)) if (exists("simple_direct_meta") && nrow(simple_direct_meta) > 0) { direct_key_vec <- paste(meta_key_vec, unit_origin_vec, sep = "\r") @@ -301,7 +384,11 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { direct_attempt_done[fast_direct_idx] <- TRUE dat[fast_direct_idx, n_conversion_attempts := n_conversion_attempts + 1L] meta_rows <- simple_direct_meta[direct_meta_pos[fast_direct_idx]] - val_conv_fast <- val_raw_vec[fast_direct_idx] * meta_rows$factor_num + val_conv_fast <- .mo_eval_vectorized_values( + values = val_raw_vec[fast_direct_idx], + meta_rows = meta_rows, + meta_pos = direct_meta_pos[fast_direct_idx] + ) ok_fast <- !is.na(val_conv_fast) & val_conv_fast >= meta_rows$Min & val_conv_fast <= meta_rows$Max @@ -317,57 +404,97 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { )] } } - } - for (i in seq_len(nrow(dat))) { - if (!is.na(dat$included[i])) next + prefill_key_vec <- paste(meta_key_vec, row_unit_matched_vec, sep = "\r") + prefill_meta_pos <- match(prefill_key_vec, simple_direct_meta$.direct_key) + fast_prefill_idx <- which( + is.na(dat$included) & + unit_missing_vec & + !is.na(row_unit_matched_vec) & + !is.na(prefill_meta_pos) & + prefill_key_vec %in% assumed_prefill_keys + ) + if (length(fast_prefill_idx) > 0) { + prefilled_attempt_done[fast_prefill_idx] <- TRUE + prefilled_attempt_tried[fast_prefill_idx] <- 1L + dat[fast_prefill_idx, n_conversion_attempts := n_conversion_attempts + 1L] + meta_rows <- simple_direct_meta[prefill_meta_pos[fast_prefill_idx]] + val_conv_fast <- .mo_eval_vectorized_values( + values = val_raw_vec[fast_prefill_idx], + meta_rows = meta_rows, + meta_pos = prefill_meta_pos[fast_prefill_idx] + ) + ok_fast <- !is.na(val_conv_fast) & + val_conv_fast >= meta_rows$Min & + val_conv_fast <= meta_rows$Max + success_idx <- fast_prefill_idx[ok_fast] + if (length(success_idx) > 0) { + success_meta_rows <- meta_rows[ok_fast] + success_conv <- val_conv_fast[ok_fast] + factor_try <- success_meta_rows$factor_num + rp <- ifelse( + !is.na(factor_try) & factor_try == 1, + 0L, + ifelse(!is.na(success_meta_rows$next_attempt) & success_meta_rows$next_attempt > 0, as.integer(success_meta_rows$next_attempt), 1L) + ) + dat[success_idx, `:=`( + included = 1L, + value_converted = success_conv, + conversion = 3L, + rule_applied = rp + )] + } + } + } + pending_idx <- which(is.na(dat$included)) + for (i in pending_idx) { cid <- cid_vec[i] target <- target_vec[i] val_raw <- val_raw_vec[i] unit_origin <- unit_origin_vec[i] unit_missing_flag <- unit_missing_vec[i] - attempts <- meta_by_key[[meta_key_vec[i]]] - if (is.null(attempts)) attempts <- empty_attempts - attempt_unit_matched <- if (nrow(attempts) > 0) ifelse(is.na(attempts$unit_matched), unit_origin, attempts$unit_matched) else character(0) + attempt_bundle <- meta_bundle_by_key[[meta_key_vec[i]]] + if (is.null(attempt_bundle)) attempt_bundle <- empty_attempt_bundle + attempt_unit_matched <- if (attempt_bundle$n > 0) ifelse(is.na(attempt_bundle$unit_matched), unit_origin, attempt_bundle$unit_matched) else character(0) # Identify 'OTHER' origin: unit present but not listed among attempts - origin_other <- !is.na(unit_origin) && nrow(attempts) > 0 && !(unit_origin %in% attempt_unit_matched) + origin_other <- !is.na(unit_origin) && attempt_bundle$n > 0 && !(unit_origin %in% attempt_unit_matched) # 1) Try direct matches (unit_origin equals unit_matched) - if (!direct_attempt_done[i] && .mo_try_direct_matches(dat, i, attempts, attempt_unit_matched, unit_origin, target, val_raw)) next + if (!direct_attempt_done[i] && .mo_try_direct_matches(dat, i, attempt_bundle, attempt_unit_matched, unit_origin, target, val_raw)) next # If origin is 'OTHER' (present but not listed), treat it like MISSING for # conversion attempts (but use conversion=1 on success). Otherwise, for # genuinely missing units use conversion=3 on success. - missing_attempts_tried <- 0L + missing_attempts_tried <- prefilled_attempt_tried[i] if (origin_other) { row_unit_matched <- row_unit_matched_vec[i] - pref_res <- list(success = FALSE, tried = 0L) - if (!is.na(row_unit_matched) && nrow(attempts) > 0) { - pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE) + pref_res <- list(success = FALSE, tried = prefilled_attempt_tried[i]) + if (!prefilled_attempt_done[i] && !is.na(row_unit_matched) && attempt_bundle$n > 0) { + pref_res <- .mo_try_prefilled_assumed(dat, i, attempt_bundle, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE) missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } - skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_ - miss_res <- .mo_try_missing_chain(dat, i, attempts, attempt_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE, skip_assumed_unit = skip_assumed_unit) + skip_assumed_unit <- if (prefilled_attempt_done[i] || pref_res$tried > 0L) row_unit_matched else NA_character_ + miss_res <- .mo_try_missing_chain(dat, i, attempt_bundle, attempt_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE, skip_assumed_unit = skip_assumed_unit) missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } else { if (unit_missing_flag) { row_unit_matched <- row_unit_matched_vec[i] - pref_res <- list(success = FALSE, tried = 0L) - if (!is.na(row_unit_matched) && nrow(attempts) > 0) { - pref_res <- .mo_try_prefilled_assumed(dat, i, attempts, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) + pref_res <- list(success = FALSE, tried = prefilled_attempt_tried[i]) + if (!prefilled_attempt_done[i] && !is.na(row_unit_matched) && attempt_bundle$n > 0) { + pref_res <- .mo_try_prefilled_assumed(dat, i, attempt_bundle, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } # Only call the missing chain if prefill did not already cover the single attempt # (i.e., prefill did not run, or there are chained attempts with next_attempt > 0 to pursue). - if (nrow(attempts) > 0 && (pref_res$tried == 0L || nrow(attempts[!is.na(next_attempt) & next_attempt > 0L]) > 0L)) { - skip_assumed_unit <- if (pref_res$tried > 0L) row_unit_matched else NA_character_ - miss_res <- .mo_try_missing_chain(dat, i, attempts, attempt_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = skip_assumed_unit) + if (attempt_bundle$n > 0 && (pref_res$tried == 0L || any(!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0L))) { + skip_assumed_unit <- if (prefilled_attempt_done[i] || pref_res$tried > 0L) row_unit_matched else NA_character_ + miss_res <- .mo_try_missing_chain(dat, i, attempt_bundle, attempt_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = skip_assumed_unit) missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } @@ -377,7 +504,7 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { # 3) Fallback: try all attempts ordered by next_attempt (1,2,...) as fallbacks for conversion conv_success_current <- if (origin_other) 1L else if (unit_missing_flag) 3L else 1L other_flow_current <- origin_other - if (.mo_try_fallbacks(dat, i, attempts, val_raw, conv_success_current, other_flow_current)) next + if (.mo_try_fallbacks(dat, i, attempt_bundle, val_raw, conv_success_current, other_flow_current)) next # 4) If we get here, conversion failed or no applicable attempts # Decide conversion & rule codes per README semantics @@ -388,10 +515,10 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { attempts_made_final <- missing_attempts_tried rp_fail <- ifelse(is.na(attempts_made_final) || attempts_made_final == 0L, 90L, ifelse(attempts_made_final == 1L, 91L, 92L)) dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = 3L, rule_applied = rp_fail)] - } else if (!is.na(unit_origin) && nrow(attempts) == 0) { + } else if (!is.na(unit_origin) && attempt_bundle$n == 0) { # unit present but no conversion metadata -> treat as OTHER dat[i, `:=`(included = 1L, value_converted = val_raw, conversion = 2L, rule_applied = 0L)] - } else if (!is.na(unit_origin) && unit_origin != target) { + } else if (!is.na(unit_origin) && !is.na(target) && unit_origin != target) { # Tried conversions but none accepted # If no conversion attempts were actually made for this row, treat as OTHER accepted-as-is attempts_made <- dat$n_conversion_attempts[i] From 5dc425b4e62d1e35f0f19eabf99fa2d7c61ed350 Mon Sep 17 00:00:00 2001 From: YMao-UMCU <159454837+YMao-UMCU@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:27:55 +0200 Subject: [PATCH 5/5] refactor for speed --- R/mo_convert.R | 478 +++++++++++------- .../dataset_cleaned_lab_values.csv | 3 + .../Example 6/i_input/LAB_target_units.csv | 2 + .../data/Example 6/i_input/LAB_threshold.csv | 2 + .../Example 6/i_input/LAB_unit_conversion.csv | 2 + .../Example 6/i_input/dataset_lab_values.csv | 3 + tests/testthat/test-examples.R | 17 +- tests/testthat/test-validation.R | 25 - 8 files changed, 311 insertions(+), 221 deletions(-) create mode 100644 tests/testthat/data/Example 6/i_ground_truth/dataset_cleaned_lab_values.csv create mode 100644 tests/testthat/data/Example 6/i_input/LAB_target_units.csv create mode 100644 tests/testthat/data/Example 6/i_input/LAB_threshold.csv create mode 100644 tests/testthat/data/Example 6/i_input/LAB_unit_conversion.csv create mode 100644 tests/testthat/data/Example 6/i_input/dataset_lab_values.csv diff --git a/R/mo_convert.R b/R/mo_convert.R index c559328..6105117 100644 --- a/R/mo_convert.R +++ b/R/mo_convert.R @@ -9,14 +9,15 @@ NULL .mo_norm <- function(x) tolower(trimws(as.character(x))) -.mo_get_var_value <- function(dat, row_idx, varname) { +.mo_get_cached_var_value <- function(var_cache, row_idx, varname) { if (is.null(varname) || is.na(varname) || varname == "") { return(NA) } - if (varname %in% names(dat)) { - return(dat[[varname]][row_idx]) + values <- var_cache[[varname]] + if (is.null(values)) { + return(NA) } - NA + values[row_idx] } .mo_parse_optional_expr <- function(expr_text, default_expr = quote(NA_real_)) { @@ -63,6 +64,36 @@ NULL ) } +.mo_finalize_attempt_bundle <- function(attempt_bundle) { + if (attempt_bundle$n == 0L) { + attempt_bundle$next_positive_idx <- integer(0) + attempt_bundle$missing_idx <- integer(0) + attempt_bundle$fallback_idx_by_code <- list() + attempt_bundle$assumed_units <- character(0) + return(attempt_bundle) + } + + next_pos <- which(!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0) + attempt_bundle$next_positive_idx <- if (length(next_pos) > 0) { + next_pos[order(attempt_bundle$next_attempt[next_pos])] + } else { + integer(0) + } + attempt_bundle$missing_idx <- which(attempt_bundle$unit_matched == "missing") + fallback_order <- sort(unique(attempt_bundle$next_attempt[next_pos])) + attempt_bundle$fallback_idx_by_code <- lapply(fallback_order, function(code) { + which(attempt_bundle$next_attempt == code) + }) + names(attempt_bundle$fallback_idx_by_code) <- as.character(fallback_order) + + assumed_idx <- which( + attempt_bundle$unit_matched == "missing" & + (is.na(attempt_bundle$condition_on_value) | attempt_bundle$condition_on_value == "") + ) + attempt_bundle$assumed_units <- unique(na.omit(.mo_norm(attempt_bundle$assumed_unit_if_missing[assumed_idx]))) + attempt_bundle +} + .mo_eval_vectorized_values <- function(values, meta_rows, meta_pos = NULL) { out <- rep(NA_real_, length(values)) factor_num <- meta_rows$factor_num @@ -84,8 +115,7 @@ NULL out } -.mo_eval_attempt <- function(attempt_bundle, attempt_idx, val_raw, dat, row_idx) { - # Returns list(success=TRUE/FALSE, val_conv=?, attempted=TRUE/FALSE) +.mo_eval_attempt_prepared <- function(attempt_bundle, attempt_idx, val_raw, row_idx, var_cache) { factor <- attempt_bundle$factor_num[attempt_idx] if (!is.na(factor)) { val_conv <- val_raw * factor @@ -105,7 +135,7 @@ NULL if (!is.null(cond_var) && !is.na(cond_var) && cond_var != "") { varname <- attempt_bundle$variable[attempt_idx] if (!is.null(varname) && !is.na(varname) && varname == "") varname <- NULL - cond_val <- .mo_get_var_value(dat, row_idx, varname) + cond_val <- .mo_get_cached_var_value(var_cache, row_idx, varname) if (is.na(cond_val)) { return(list(success = FALSE, attempted = FALSE)) } @@ -124,146 +154,24 @@ NULL list(success = FALSE, attempted = attempted) } -# Small, top-level helpers to keep `mo_convert()` concise. They operate by -# modifying `dat` by reference (using `i`) and return success flags and -# attempt counts where appropriate. -.mo_try_direct_matches <- function(dat, i, attempt_bundle, attempt_unit_matched, unit_origin, target, val_raw) { - if (attempt_bundle$n > 0 && !is.na(unit_origin)) { - direct_idx <- which(attempt_unit_matched == unit_origin) - if (length(direct_idx) > 0) { - for (r in direct_idx) { - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - res <- .mo_eval_attempt(attempt_bundle, r, val_raw, dat, i) - if (isTRUE(res$success)) { - dat[i, `:=`( - included = 1L, - value_converted = res$val_conv, - conversion = ifelse(is.na(unit_origin) || unit_origin == target, 0L, ifelse(unit_origin == target, 0L, 1L)), - rule_applied = ifelse(is.na(unit_origin) || unit_origin == target, 0L, 1L) - )] - return(TRUE) - } - } - } - } - FALSE -} - -.mo_try_prefilled_assumed <- function(dat, i, attempt_bundle, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) { - # Returns list(success=TRUE/FALSE, tried=0/1) - if (is.na(row_unit_matched) || attempt_bundle$n == 0) { - return(list(success = FALSE, tried = 0L)) +.mo_success_codes <- function(attempt_bundle, attempt_idx, conv_success, other_flow = FALSE, fallback_attempts_made = NA_integer_) { + factor_try <- attempt_bundle$factor_num[attempt_idx] + conv_code <- conv_success + if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) { + conv_code <- 2L } - assumed_idx <- which(attempt_unit_matched == "missing" & (is.na(attempt_bundle$condition_on_value) | attempt_bundle$condition_on_value == "")) - assumed_list <- unique(na.omit(tolower(trimws(as.character(attempt_bundle$assumed_unit_if_missing[assumed_idx]))))) - if (!(row_unit_matched %in% assumed_list)) { - return(list(success = FALSE, tried = 0L)) - } - assumed_rows_idx <- which(attempt_unit_matched == row_unit_matched) - if (length(assumed_rows_idx) == 0) { - return(list(success = FALSE, tried = 0L)) - } - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - assumed_idx_first <- assumed_rows_idx[1] - res <- .mo_eval_attempt(attempt_bundle, assumed_idx_first, val_raw, dat, i) - if (isTRUE(res$success)) { - factor_try <- attempt_bundle$factor_num[assumed_idx_first] - if (!is.na(factor_try) && factor_try == 1) { - rp <- 0L + if (!is.na(fallback_attempts_made)) { + rp <- ifelse(fallback_attempts_made == 1L, 1L, 2L) + } else if (!is.na(factor_try) && factor_try == 1) { + rp <- 0L + } else { + rp <- if (!is.na(attempt_bundle$next_attempt[attempt_idx]) && attempt_bundle$next_attempt[attempt_idx] > 0) { + as.integer(attempt_bundle$next_attempt[attempt_idx]) } else { - rp <- if (!is.na(attempt_bundle$next_attempt[assumed_idx_first]) && attempt_bundle$next_attempt[assumed_idx_first] > 0) as.integer(attempt_bundle$next_attempt[assumed_idx_first]) else 1L + 1L } - conv_code <- conv_success - if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L - dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = rp)] - return(list(success = TRUE, tried = 1L)) } - list(success = FALSE, tried = 1L) -} - -.mo_try_missing_chain <- function(dat, i, attempt_bundle, attempt_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = NA_character_) { - # Returns list(success=TRUE/FALSE, tried=number_of_attempts_tried) - tried <- 0L - # explicit next_attempt chain (>0) - miss_attempts_idx <- which(!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0) - if (length(miss_attempts_idx) > 0) { - miss_attempts_idx <- miss_attempts_idx[order(attempt_bundle$next_attempt[miss_attempts_idx])] - for (r in miss_attempts_idx) { - res <- .mo_eval_attempt(attempt_bundle, r, val_raw, dat, i) - if (isTRUE(res$attempted)) { - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - tried <- tried + 1L - } - if (isTRUE(res$success)) { - factor_try <- attempt_bundle$factor_num[r] - if (!is.na(factor_try) && factor_try == 1) { - rp <- 0L - } else { - rp <- if (!is.na(attempt_bundle$next_attempt[r]) && attempt_bundle$next_attempt[r] > 0) as.integer(attempt_bundle$next_attempt[r]) else 1L - } - conv_code <- conv_success - if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L - dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = rp)] - return(list(success = TRUE, tried = tried)) - } - } - } - # rows labelled as missing - missing_idx <- which(attempt_unit_matched == "missing") - if (!is.na(skip_assumed_unit) && length(missing_idx) > 0) { - missing_idx <- missing_idx[ - is.na(attempt_bundle$next_attempt[missing_idx]) | - attempt_bundle$next_attempt[missing_idx] != 0 | - .mo_norm(attempt_bundle$assumed_unit_if_missing[missing_idx]) != skip_assumed_unit - ] - } - if (length(missing_idx) > 0) { - for (r in missing_idx) { - res <- .mo_eval_attempt(attempt_bundle, r, val_raw, dat, i) - if (isTRUE(res$attempted)) { - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - tried <- tried + 1L - } - if (isTRUE(res$success)) { - factor_try <- attempt_bundle$factor_num[r] - if (!is.na(factor_try) && factor_try == 1) { - rp <- 0L - } else { - rp <- if (!is.na(attempt_bundle$next_attempt[r]) && attempt_bundle$next_attempt[r] > 0) as.integer(attempt_bundle$next_attempt[r]) else 1L - } - conv_code <- conv_success - if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L - dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = rp)] - return(list(success = TRUE, tried = tried)) - } - } - } - list(success = FALSE, tried = tried) -} - -.mo_try_fallbacks <- function(dat, i, attempt_bundle, val_raw, conv_success = 1L, other_flow = FALSE) { - if (attempt_bundle$n <= 0) { - return(FALSE) - } - fallback_order <- unique(attempt_bundle$next_attempt[!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0]) - fallback_order <- sort(fallback_order) - attempts_made <- 0L - for (code in fallback_order) { - rows_idx <- which(attempt_bundle$next_attempt == code) - for (j in rows_idx) { - dat[i, n_conversion_attempts := n_conversion_attempts + 1L] - res <- .mo_eval_attempt(attempt_bundle, j, val_raw, dat, i) - attempts_made <- attempts_made + 1L - if (isTRUE(res$success)) { - factor_try <- attempt_bundle$factor_num[j] - conv_code <- conv_success - if (!is.na(factor_try) && factor_try == 1 && conv_success == 1L && isTRUE(other_flow)) conv_code <- 2L - dat[i, `:=`(included = 1L, value_converted = res$val_conv, conversion = conv_code, rule_applied = ifelse(attempts_made == 1L, 1L, 2L))] - return(TRUE) - } - } - } - FALSE + list(conversion = as.integer(conv_code), rule_applied = as.integer(rp)) } ##' Convert values using conversion metadata @@ -297,9 +205,9 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { meta[, unit_matched := ifelse(is.na(unit_matched) | unit_matched == "", NA_character_, .mo_norm(unit_matched))] meta[, .meta_key := paste(concept_id, unit_target, sep = "\r")] meta_by_key <- split(meta, by = ".meta_key", keep.by = FALSE, sorted = FALSE) - meta_bundle_by_key <- lapply(meta_by_key, .mo_prepare_attempt_bundle) + meta_bundle_by_key <- lapply(meta_by_key, function(x) .mo_finalize_attempt_bundle(.mo_prepare_attempt_bundle(x))) empty_attempts <- meta[0] - empty_attempt_bundle <- .mo_prepare_attempt_bundle(empty_attempts) + empty_attempt_bundle <- .mo_finalize_attempt_bundle(.mo_prepare_attempt_bundle(empty_attempts)) simple_direct_meta <- meta[ !is.na(unit_matched) & unit_matched != "missing" & @@ -345,18 +253,9 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { } } - # Use internal helpers - - dat[, `:=`( - included = as.integer(NA), - value_converted = as.numeric(NA), - conversion = as.integer(NA), - rule_applied = as.integer(NA), - n_conversion_attempts = 0L - )] - # local alias helpers for readability norm <- .mo_norm + n_dat <- nrow(dat) cid_vec <- dat$concept_id target_vec <- dat$unit_target val_raw_vec <- suppressWarnings(as.numeric(dat$value)) @@ -369,12 +268,25 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { row_unit_matched_vec <- if ("unit_matched" %in% names(dat)) { ifelse(is.na(dat$unit_matched) | dat$unit_matched == "", NA_character_, norm(dat$unit_matched)) } else { - rep(NA_character_, nrow(dat)) + rep(NA_character_, n_dat) } meta_key_vec <- paste(cid_vec, target_vec, sep = "\r") - direct_attempt_done <- rep(FALSE, nrow(dat)) - prefilled_attempt_done <- rep(FALSE, nrow(dat)) - prefilled_attempt_tried <- integer(nrow(dat)) + direct_attempt_done <- rep(FALSE, n_dat) + prefilled_attempt_done <- rep(FALSE, n_dat) + prefilled_attempt_tried <- integer(n_dat) + n_conversion_attempts <- integer(n_dat) + included <- rep(NA_integer_, n_dat) + value_converted <- rep(NA_real_, n_dat) + conversion <- rep(NA_integer_, n_dat) + rule_applied <- rep(NA_integer_, n_dat) + + condition_variables <- unique(unlist(lapply(meta_bundle_by_key, function(bundle) bundle$variable), use.names = FALSE)) + condition_variables <- condition_variables[!is.na(condition_variables) & condition_variables != "" & condition_variables %in% names(dat)] + var_cache <- if (length(condition_variables) > 0) { + dat[, ..condition_variables] + } else { + list() + } if (exists("simple_direct_meta") && nrow(simple_direct_meta) > 0) { direct_key_vec <- paste(meta_key_vec, unit_origin_vec, sep = "\r") @@ -382,7 +294,7 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { fast_direct_idx <- which(!is.na(direct_meta_pos) & !is.na(unit_origin_vec)) if (length(fast_direct_idx) > 0) { direct_attempt_done[fast_direct_idx] <- TRUE - dat[fast_direct_idx, n_conversion_attempts := n_conversion_attempts + 1L] + n_conversion_attempts[fast_direct_idx] <- n_conversion_attempts[fast_direct_idx] + 1L meta_rows <- simple_direct_meta[direct_meta_pos[fast_direct_idx]] val_conv_fast <- .mo_eval_vectorized_values( values = val_raw_vec[fast_direct_idx], @@ -396,19 +308,17 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { if (length(success_idx) > 0) { success_conv <- val_conv_fast[ok_fast] success_is_same_unit <- unit_origin_vec[success_idx] == target_vec[success_idx] - dat[success_idx, `:=`( - included = 1L, - value_converted = success_conv, - conversion = ifelse(success_is_same_unit, 0L, 1L), - rule_applied = ifelse(success_is_same_unit, 0L, 1L) - )] + included[success_idx] <- 1L + value_converted[success_idx] <- success_conv + conversion[success_idx] <- ifelse(success_is_same_unit, 0L, 1L) + rule_applied[success_idx] <- ifelse(success_is_same_unit, 0L, 1L) } } prefill_key_vec <- paste(meta_key_vec, row_unit_matched_vec, sep = "\r") prefill_meta_pos <- match(prefill_key_vec, simple_direct_meta$.direct_key) fast_prefill_idx <- which( - is.na(dat$included) & + is.na(included) & unit_missing_vec & !is.na(row_unit_matched_vec) & !is.na(prefill_meta_pos) & @@ -417,7 +327,7 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { if (length(fast_prefill_idx) > 0) { prefilled_attempt_done[fast_prefill_idx] <- TRUE prefilled_attempt_tried[fast_prefill_idx] <- 1L - dat[fast_prefill_idx, n_conversion_attempts := n_conversion_attempts + 1L] + n_conversion_attempts[fast_prefill_idx] <- n_conversion_attempts[fast_prefill_idx] + 1L meta_rows <- simple_direct_meta[prefill_meta_pos[fast_prefill_idx]] val_conv_fast <- .mo_eval_vectorized_values( values = val_raw_vec[fast_prefill_idx], @@ -437,19 +347,16 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { 0L, ifelse(!is.na(success_meta_rows$next_attempt) & success_meta_rows$next_attempt > 0, as.integer(success_meta_rows$next_attempt), 1L) ) - dat[success_idx, `:=`( - included = 1L, - value_converted = success_conv, - conversion = 3L, - rule_applied = rp - )] + included[success_idx] <- 1L + value_converted[success_idx] <- success_conv + conversion[success_idx] <- 3L + rule_applied[success_idx] <- rp } } } - pending_idx <- which(is.na(dat$included)) + pending_idx <- which(is.na(included)) for (i in pending_idx) { - cid <- cid_vec[i] target <- target_vec[i] val_raw <- val_raw_vec[i] unit_origin <- unit_origin_vec[i] @@ -463,7 +370,25 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { origin_other <- !is.na(unit_origin) && attempt_bundle$n > 0 && !(unit_origin %in% attempt_unit_matched) # 1) Try direct matches (unit_origin equals unit_matched) - if (!direct_attempt_done[i] && .mo_try_direct_matches(dat, i, attempt_bundle, attempt_unit_matched, unit_origin, target, val_raw)) next + if (!direct_attempt_done[i] && attempt_bundle$n > 0 && !is.na(unit_origin)) { + direct_idx <- which(attempt_unit_matched == unit_origin) + if (length(direct_idx) > 0) { + direct_success <- FALSE + for (r in direct_idx) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$success)) { + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- ifelse(is.na(unit_origin) || unit_origin == target, 0L, ifelse(unit_origin == target, 0L, 1L)) + rule_applied[i] <- ifelse(is.na(unit_origin) || unit_origin == target, 0L, 1L) + direct_success <- TRUE + break + } + } + if (direct_success) next + } + } # If origin is 'OTHER' (present but not listed), treat it like MISSING for # conversion attempts (but use conversion=1 on success). Otherwise, for @@ -473,12 +398,73 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { row_unit_matched <- row_unit_matched_vec[i] pref_res <- list(success = FALSE, tried = prefilled_attempt_tried[i]) if (!prefilled_attempt_done[i] && !is.na(row_unit_matched) && attempt_bundle$n > 0) { - pref_res <- .mo_try_prefilled_assumed(dat, i, attempt_bundle, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE) + if (row_unit_matched %in% attempt_bundle$assumed_units) { + assumed_rows_idx <- which(attempt_unit_matched == row_unit_matched) + if (length(assumed_rows_idx) > 0) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + assumed_idx_first <- assumed_rows_idx[1] + res <- .mo_eval_attempt_prepared(attempt_bundle, assumed_idx_first, val_raw, i, var_cache) + pref_res <- list(success = isTRUE(res$success), tried = 1L) + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, assumed_idx_first, conv_success = 1L, other_flow = TRUE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + } + } + } missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } skip_assumed_unit <- if (prefilled_attempt_done[i] || pref_res$tried > 0L) row_unit_matched else NA_character_ - miss_res <- .mo_try_missing_chain(dat, i, attempt_bundle, attempt_unit_matched, val_raw, conv_success = 1L, other_flow = TRUE, skip_assumed_unit = skip_assumed_unit) + miss_res <- list(success = FALSE, tried = 0L) + if (length(attempt_bundle$next_positive_idx) > 0) { + for (r in attempt_bundle$next_positive_idx) { + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$attempted)) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + miss_res$tried <- miss_res$tried + 1L + } + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, r, conv_success = 1L, other_flow = TRUE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + miss_res$success <- TRUE + break + } + } + } + if (!isTRUE(miss_res$success)) { + missing_idx <- attempt_bundle$missing_idx + if (!is.na(skip_assumed_unit) && length(missing_idx) > 0) { + missing_idx <- missing_idx[ + is.na(attempt_bundle$next_attempt[missing_idx]) | + attempt_bundle$next_attempt[missing_idx] != 0 | + .mo_norm(attempt_bundle$assumed_unit_if_missing[missing_idx]) != skip_assumed_unit + ] + } + if (length(missing_idx) > 0) { + for (r in missing_idx) { + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$attempted)) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + miss_res$tried <- miss_res$tried + 1L + } + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, r, conv_success = 1L, other_flow = TRUE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + miss_res$success <- TRUE + break + } + } + } + } missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } else { @@ -486,7 +472,22 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { row_unit_matched <- row_unit_matched_vec[i] pref_res <- list(success = FALSE, tried = prefilled_attempt_tried[i]) if (!prefilled_attempt_done[i] && !is.na(row_unit_matched) && attempt_bundle$n > 0) { - pref_res <- .mo_try_prefilled_assumed(dat, i, attempt_bundle, attempt_unit_matched, row_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE) + if (row_unit_matched %in% attempt_bundle$assumed_units) { + assumed_rows_idx <- which(attempt_unit_matched == row_unit_matched) + if (length(assumed_rows_idx) > 0) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + assumed_idx_first <- assumed_rows_idx[1] + res <- .mo_eval_attempt_prepared(attempt_bundle, assumed_idx_first, val_raw, i, var_cache) + pref_res <- list(success = isTRUE(res$success), tried = 1L) + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, assumed_idx_first, conv_success = 3L, other_flow = FALSE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + } + } + } missing_attempts_tried <- missing_attempts_tried + pref_res$tried if (isTRUE(pref_res$success)) next } @@ -494,7 +495,53 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { # (i.e., prefill did not run, or there are chained attempts with next_attempt > 0 to pursue). if (attempt_bundle$n > 0 && (pref_res$tried == 0L || any(!is.na(attempt_bundle$next_attempt) & attempt_bundle$next_attempt > 0L))) { skip_assumed_unit <- if (prefilled_attempt_done[i] || pref_res$tried > 0L) row_unit_matched else NA_character_ - miss_res <- .mo_try_missing_chain(dat, i, attempt_bundle, attempt_unit_matched, val_raw, conv_success = 3L, other_flow = FALSE, skip_assumed_unit = skip_assumed_unit) + miss_res <- list(success = FALSE, tried = 0L) + if (length(attempt_bundle$next_positive_idx) > 0) { + for (r in attempt_bundle$next_positive_idx) { + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$attempted)) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + miss_res$tried <- miss_res$tried + 1L + } + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, r, conv_success = 3L, other_flow = FALSE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + miss_res$success <- TRUE + break + } + } + } + if (!isTRUE(miss_res$success)) { + missing_idx <- attempt_bundle$missing_idx + if (!is.na(skip_assumed_unit) && length(missing_idx) > 0) { + missing_idx <- missing_idx[ + is.na(attempt_bundle$next_attempt[missing_idx]) | + attempt_bundle$next_attempt[missing_idx] != 0 | + .mo_norm(attempt_bundle$assumed_unit_if_missing[missing_idx]) != skip_assumed_unit + ] + } + if (length(missing_idx) > 0) { + for (r in missing_idx) { + res <- .mo_eval_attempt_prepared(attempt_bundle, r, val_raw, i, var_cache) + if (isTRUE(res$attempted)) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + miss_res$tried <- miss_res$tried + 1L + } + if (isTRUE(res$success)) { + codes <- .mo_success_codes(attempt_bundle, r, conv_success = 3L, other_flow = FALSE) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + miss_res$success <- TRUE + break + } + } + } + } missing_attempts_tried <- missing_attempts_tried + miss_res$tried if (isTRUE(miss_res$success)) next } @@ -504,37 +551,86 @@ mo_convert <- function(dat_unit_matched, metadata_convert) { # 3) Fallback: try all attempts ordered by next_attempt (1,2,...) as fallbacks for conversion conv_success_current <- if (origin_other) 1L else if (unit_missing_flag) 3L else 1L other_flow_current <- origin_other - if (.mo_try_fallbacks(dat, i, attempt_bundle, val_raw, conv_success_current, other_flow_current)) next + fallback_success <- FALSE + if (attempt_bundle$n > 0 && length(attempt_bundle$fallback_idx_by_code) > 0) { + attempts_made <- 0L + for (rows_idx in attempt_bundle$fallback_idx_by_code) { + for (j in rows_idx) { + n_conversion_attempts[i] <- n_conversion_attempts[i] + 1L + res <- .mo_eval_attempt_prepared(attempt_bundle, j, val_raw, i, var_cache) + attempts_made <- attempts_made + 1L + if (isTRUE(res$success)) { + codes <- .mo_success_codes( + attempt_bundle, + j, + conv_success = conv_success_current, + other_flow = other_flow_current, + fallback_attempts_made = attempts_made + ) + included[i] <- 1L + value_converted[i] <- res$val_conv + conversion[i] <- codes$conversion + rule_applied[i] <- codes$rule_applied + fallback_success <- TRUE + break + } + } + if (fallback_success) break + } + } + if (fallback_success) next # 4) If we get here, conversion failed or no applicable attempts # Decide conversion & rule codes per README semantics if (is.na(val_raw) || !is.numeric(val_raw)) { - dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = 3L, rule_applied = 99L)] + included[i] <- 0L + value_converted[i] <- NA_real_ + conversion[i] <- 3L + rule_applied[i] <- 99L } else if (unit_missing_flag) { # Use only the missing-unit attempts count to decide 90/91/92 attempts_made_final <- missing_attempts_tried rp_fail <- ifelse(is.na(attempts_made_final) || attempts_made_final == 0L, 90L, ifelse(attempts_made_final == 1L, 91L, 92L)) - dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = 3L, rule_applied = rp_fail)] + included[i] <- 0L + value_converted[i] <- NA_real_ + conversion[i] <- 3L + rule_applied[i] <- rp_fail } else if (!is.na(unit_origin) && attempt_bundle$n == 0) { # unit present but no conversion metadata -> treat as OTHER - dat[i, `:=`(included = 1L, value_converted = val_raw, conversion = 2L, rule_applied = 0L)] + included[i] <- 1L + value_converted[i] <- val_raw + conversion[i] <- 2L + rule_applied[i] <- 0L } else if (!is.na(unit_origin) && !is.na(target) && unit_origin != target) { # Tried conversions but none accepted # If no conversion attempts were actually made for this row, treat as OTHER accepted-as-is - attempts_made <- dat$n_conversion_attempts[i] + attempts_made <- n_conversion_attempts[i] if (is.na(attempts_made) || attempts_made == 0L) { - dat[i, `:=`(included = 1L, value_converted = val_raw, conversion = 2L, rule_applied = 0L)] + included[i] <- 1L + value_converted[i] <- val_raw + conversion[i] <- 2L + rule_applied[i] <- 0L } else { # Use actual number of attempts made on this row to determine rule_applied (91 if one try, 92 if multiple) final_conv <- if (origin_other) 2L else 1L - dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = final_conv, rule_applied = ifelse(attempts_made <= 1L, 91L, 92L))] + included[i] <- 0L + value_converted[i] <- NA_real_ + conversion[i] <- final_conv + rule_applied[i] <- ifelse(attempts_made <= 1L, 91L, 92L) } } else { - dat[i, `:=`(included = 0L, value_converted = NA_real_, conversion = 0L, rule_applied = 90L)] + included[i] <- 0L + value_converted[i] <- NA_real_ + conversion[i] <- 0L + rule_applied[i] <- 90L } } - # Clean helper columns - dat[, n_conversion_attempts := NULL] + dat[, `:=`( + included = included, + value_converted = value_converted, + conversion = conversion, + rule_applied = rule_applied + )] return(dat) } diff --git a/tests/testthat/data/Example 6/i_ground_truth/dataset_cleaned_lab_values.csv b/tests/testthat/data/Example 6/i_ground_truth/dataset_cleaned_lab_values.csv new file mode 100644 index 0000000..134023c --- /dev/null +++ b/tests/testthat/data/Example 6/i_ground_truth/dataset_cleaned_lab_values.csv @@ -0,0 +1,3 @@ +person_id,concept_id,value_origin,unit_origin,included,value,unit_target,conversion,rule_applied,age +P01,LAB_TEST,10,foo,1,10,mg,1,1,20 +P02,LAB_TEST,10,foo,0,,mg,1,91,10 diff --git a/tests/testthat/data/Example 6/i_input/LAB_target_units.csv b/tests/testthat/data/Example 6/i_input/LAB_target_units.csv new file mode 100644 index 0000000..5538ea4 --- /dev/null +++ b/tests/testthat/data/Example 6/i_input/LAB_target_units.csv @@ -0,0 +1,2 @@ +concept_id,unit_target +LAB_TEST,mg diff --git a/tests/testthat/data/Example 6/i_input/LAB_threshold.csv b/tests/testthat/data/Example 6/i_input/LAB_threshold.csv new file mode 100644 index 0000000..81be290 --- /dev/null +++ b/tests/testthat/data/Example 6/i_input/LAB_threshold.csv @@ -0,0 +1,2 @@ +concept_id,Min,Max,unit_target,condition_on_variable,variable +LAB_TEST,0,100,mg,age >= 18,age diff --git a/tests/testthat/data/Example 6/i_input/LAB_unit_conversion.csv b/tests/testthat/data/Example 6/i_input/LAB_unit_conversion.csv new file mode 100644 index 0000000..bcdbc9c --- /dev/null +++ b/tests/testthat/data/Example 6/i_input/LAB_unit_conversion.csv @@ -0,0 +1,2 @@ +concept_id,datasource,unit_origin,unit_target,multiplication_factor_from_origin_to_target,conversion_rate,conversion_not_multiplication,condition_on_value,assumed_unit_if_missing,next_attempt +LAB_TEST,,foo,mg,1,,,,,0 diff --git a/tests/testthat/data/Example 6/i_input/dataset_lab_values.csv b/tests/testthat/data/Example 6/i_input/dataset_lab_values.csv new file mode 100644 index 0000000..c24bea3 --- /dev/null +++ b/tests/testthat/data/Example 6/i_input/dataset_lab_values.csv @@ -0,0 +1,3 @@ +person_id,concept_id,value,unit,age +P01,LAB_TEST,10,foo,20 +P02,LAB_TEST,10,foo,10 diff --git a/tests/testthat/test-examples.R b/tests/testthat/test-examples.R index 5a7f231..bca99d3 100644 --- a/tests/testthat/test-examples.R +++ b/tests/testthat/test-examples.R @@ -4,7 +4,7 @@ library(testthat) base_path <- function(...) test_path("data", ...) -run_example <- function(ex) { +run_example <- function(ex, datasource = "") { input_dir <- base_path(ex, "i_input") gt_dir <- base_path(ex, "i_ground_truth") @@ -17,7 +17,8 @@ run_example <- function(ex) { dataset = dataset, lab_target_units = path_target_units, lab_unit_conversion = path_unit_conversion, - lab_thresholds = path_thresholds + lab_thresholds = path_thresholds, + datasource = datasource ) gt <- fread(file.path(gt_dir, "dataset_cleaned_lab_values.csv")) @@ -27,8 +28,14 @@ run_example <- function(ex) { all.equal(cleaned, gt, check.attributes = FALSE) } -for (i in 1:4) { - test_that(paste("Example", i, "matches ground truth"), { - expect_true(isTRUE(run_example(paste("Example", i)))) + +example_cases <- data.table( + example = paste("Example", 1:6), + datasource = c("", "", "", "", "DS_A", "") +) + +for (case_idx in seq_len(nrow(example_cases))) { + test_that(paste(example_cases$example[case_idx], "matches ground truth"), { + expect_true(isTRUE(run_example(example_cases$example[case_idx], example_cases$datasource[case_idx]))) }) } diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R index ddb9d09..c937357 100644 --- a/tests/testthat/test-validation.R +++ b/tests/testthat/test-validation.R @@ -28,28 +28,3 @@ test_that("Example 4 with wrong unit conversion does not match ground truth", { expect_false(isTRUE(all.equal(cleaned, gt, check.attributes = FALSE))) }) - -test_that("Example 5 with specified datasource matches ground truth", { - input_dir <- test_path("data", "Example 5", "i_input") - gt_dir <- test_path("data", "Example 5", "i_ground_truth") - - dataset <- fread(file.path(input_dir, "dataset_lab_values.csv")) - path_target_units <- file.path(input_dir, "LAB_target_units.csv") - path_unit_conversion <- file.path(input_dir, "LAB_unit_conversion.csv") - path_thresholds <- file.path(input_dir, "LAB_threshold.csv") - - cleaned <- CleanLabValuesDataset( - dataset = dataset, - lab_target_units = path_target_units, - lab_unit_conversion = path_unit_conversion, - lab_thresholds = path_thresholds, - datasource = "DS_A" - ) - - gt <- fread(file.path(gt_dir, "dataset_cleaned_lab_values.csv")) - - setorder(cleaned, person_id, concept_id) - setorder(gt, person_id, concept_id) - - expect_true(isTRUE(all.equal(cleaned, gt, check.attributes = FALSE))) -})