Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ Description: Ingests instructions to clean datasets containing results from
License: AGPL-3
Encoding: UTF-8
Imports:
data.table
data.table,
logger
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
Expand Down
112 changes: 55 additions & 57 deletions R/check_metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ check_dataset_model <- function(dataset) {
# num_nonnumeric <- sum(invalid_)
# if (num_nonnumeric > 0) {
# stop(paste("The file 'dataset' has", num_nonnumeric,"record(s) where the variable ", varname," contains a non-numeric value."))
# }
# }
# }
message("[CleanLabValues] Dataset model check passed successfully.")
logger::log_info("[CleanLabValues] Dataset model check passed successfully.")
}

#######################################################
Expand All @@ -83,11 +83,10 @@ check_lab_target_units <- function(lab_target_units) {
if (!all(c("concept_id", "unit_target") %in% names(dt))) {
stop(paste("The file", lab_target_units, "should be a csv file with data model concept_id,unit_target"))
}
message("[CleanLabValues] LAB_target_units check passed successfully.")
logger::log_info("[CleanLabValues] LAB_target_units check passed successfully.")
invisible(dt)
}


#######################################################
# lab_unit_conversion

Expand All @@ -114,48 +113,47 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal
stop(paste("The file", lab_unit_conversion, "should include the variable", varname, "in its data model"))
}
}
# check that whenever multiplication_factor_from_origin_to_target is missing in at least one row, then the variable conversion_not_multiplication exists, and that there is no row where both variables are missing

# check that whenever multiplication_factor_from_origin_to_target is missing in at least one row, then the variable conversion_not_multiplication exists, and that there is no row where both variables are missing
mult_raw <- dt[["multiplication_factor_from_origin_to_target"]]
mult_chr <- trimws(as.character(mult_raw))
is_missing_mult <- is.na(mult_raw) | mult_chr == ""
nummissing_mult <- sum(is_missing_mult)

mult_num <- suppressWarnings(as.numeric(mult_chr))
invalid_mult <- !is_missing_mult & is.na(mult_num)
num_nonnumeric <- sum(invalid_mult)

if (num_nonnumeric > 0) {
stop(paste("The file", lab_unit_conversion, "has", num_nonnumeric,"record(s) where the conversion method multiplication_factor_from_origin_to_target contains a non-numeric value."))
}
stop(paste("The file", lab_unit_conversion, "has", num_nonnumeric, "record(s) where the conversion method multiplication_factor_from_origin_to_target contains a non-numeric value."))
}

varconv <- "conversion_not_multiplication"
exists_conv <- varconv %in% names(dt)

if (exists_conv) {
conv_raw <- dt[[varconv]]
conv_chr <- trimws(as.character(conv_raw))
is_missing_conv <- is.na(conv_raw) | conv_chr == ""
# count rows where both conversion methods are missing
both_present_mult_and_conv <- !is_missing_mult & !is_missing_conv
numboth_present_mult_and_conv <- sum(both_present_mult_and_conv)
#count rows where both conversion methods are included
# count rows where both conversion methods are included
both_missing_mult_and_conv <- is_missing_mult & is_missing_conv
nummissing_mult_and_conv <- sum(both_missing_mult_and_conv)
}else{
} else {
# conv_raw <- dt[["conversion_not_multiplication"]]
# conv_chr <- trimws(as.character(conv_raw))
# is_missing_conv <- is.na(conv_raw) | conv_chr == ""
}

if (nummissing_mult > 0) {
if (!exists_conv) {
stop(paste("The file", lab_unit_conversion, "has", nummissing_mult,"record(s) where the concversion method multiplication_factor_from_origin_to_target is missing. This can only happen if there is an alternative conversion method stored in a variable named", varconv, "."))
}else{

stop(paste("The file", lab_unit_conversion, "has", nummissing_mult, "record(s) where the concversion method multiplication_factor_from_origin_to_target is missing. This can only happen if there is an alternative conversion method stored in a variable named", varconv, "."))
} else {
# Check 1: no row should have both conversion methods missing.


if (nummissing_mult_and_conv > 0) {
stop(
"There are ",
Expand All @@ -164,9 +162,9 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal
"and conversion_not_multiplication are missing."
)
}

# Check 2: no row should have both conversion methods non-missing.

if (numboth_present_mult_and_conv > 0) {
stop(
"In the file ", lab_unit_conversion, " there are ",
Expand All @@ -177,9 +175,9 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal
)
}
}
}else{
} else {
# if multiplication_factor_from_origin_to_target does not have missing values, we still need to check that there is no conflicting information on the conversion
if (exists_conv) {
if (exists_conv) {
if (numboth_present_mult_and_conv > 0) {
stop(
"In the file", lab_unit_conversion, "there are ",
Expand All @@ -191,52 +189,49 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal
}
}
}

# If the alternative conversion method exists or the condition_on_value exists, check that it contains valid R expressions using dataset$value.
for (varconv in c("conversion_not_multiplication", "condition_on_value")) {
if (varconv %in% names(dt)) {

conv_raw <- dt[[varconv]]

if (is.character(conv_raw)) {
conv_chr <- trimws(conv_raw)
} else {
conv_chr <- trimws(as.character(conv_raw))
}

is_missing_conv <- is.na(conv_raw) | conv_chr == ""

has_nonempty_conv <- any(!is_missing_conv)

if (has_nonempty_conv) {

if (!is.character(conv_raw)) {
stop(
"The column ", varconv ," of the file ", lab_unit_conversion, " must be character."
"The column ", varconv, " of the file ", lab_unit_conversion, " must be character."
)
}

rows_with_conv <- which(!is_missing_conv)

bad_parse_conv <- rep(FALSE, nrow(dt))
bad_eval_conv <- rep(FALSE, nrow(dt))
bad_eval_conv <- rep(FALSE, nrow(dt))
eval_error_conv <- rep(NA_character_, nrow(dt))

for (i in rows_with_conv) {

expr_txt <- conv_chr[i]

parsed_expr <- tryCatch(
parse(text = expr_txt),
error = function(e) e
)

if (inherits(parsed_expr, "error") || length(parsed_expr) != 1L) {
bad_parse_conv[i] <- TRUE
eval_error_conv[i] <- conditionMessage(parsed_expr)
next
}

eval_result <- tryCatch(
eval(
parse(text = expr_txt),
Expand All @@ -245,51 +240,54 @@ check_lab_unit_conversion <- function(lab_unit_conversion, datasource, list_anal
),
error = function(e) e
)

if (inherits(eval_result, "error")) {
bad_eval_conv[i] <- TRUE
eval_error_conv[i] <- conditionMessage(eval_result)
}
}

numbad_parse_conv <- sum(bad_parse_conv)
numbad_eval_conv <- sum(bad_eval_conv)
numbad_eval_conv <- sum(bad_eval_conv)

if (numbad_parse_conv > 0) {
stop(
"The column ", varconv ," of the file ", lab_unit_conversion, " contains ",
"The column ", varconv, " of the file ", lab_unit_conversion, " contains ",
numbad_parse_conv,
" expression(s) that cannot be parsed as valid R expressions."
)
}

if (numbad_eval_conv > 0) {
stop(
"The column ", varconv ," of the file ", lab_unit_conversion, " contains ",
"The column ", varconv, " of the file ", lab_unit_conversion, " contains ",
numbad_eval_conv,
" expression(s) that cannot be evaluated as R expressions involving a numeric variable named value."
)
}
}
}
}

# if the non-mandatory argument 'datasource' is used, check that it corresponds to a variable in lab_unit_conversion
if (datasource != "" & !("datasource" %in% names(dt))) {
stop(paste("You specified the argument 'datasource' but in this case the file", lab_unit_conversion, "should include the variable 'datasource' in its data model, while it does not"))
}
for (variable in unique(c(list_analyses))) {
if (nrow(dt[concept_id == variable, .(unit_target)]) > 0) {
if (trimws(unique(dt[concept_id == variable, .(unit_target)])) != target_unit[[variable]]) {
stop(paste("In the lab_unit_conversion, the concept_id", variable, "is described with target units that are inconsistent with the target unit assigned in the tab LAB_target_units, which is", target_unit[[variable]]))
target_unit_in_data <- trimws(unique(dt[concept_id == variable, .(unit_target)]))
target_unit_in_meta <- target_unit[[variable]]
if (!is.na(target_unit_in_data) & !is.na(target_unit_in_meta)) {
if (target_unit_in_data != target_unit_in_meta) {
stop(paste("In the lab_unit_conversion, the concept_id", variable, "is described with target units that are inconsistent with the target unit assigned in the tab LAB_target_units, which is", target_unit_in_meta))
}
}
}
}
message("[CleanLabValues] LAB_unit_conversion check passed successfully.")
logger::log_info("[CleanLabValues] LAB_unit_conversion check passed successfully.")
invisible(dt)
}


#####################################
# lab_thresholds

Expand All @@ -313,20 +311,20 @@ check_lab_thresholds <- function(lab_thresholds, dataset) {
stop(paste("The file", lab_thresholds, "should include the variable", varname, "in its data model"))
}
}

for (varname in c("Min", "Max")) {
vec_ <- dt[[varname]]
is_num_ <- suppressWarnings(as.numeric(vec_))
invalid_ <- is.na(is_num_)
num_nonnumeric <- sum(invalid_)

if (num_nonnumeric > 0) {
stop(paste("The file", lab_thresholds, "has", num_nonnumeric,"record(s) where the variable ", varname," contains a non-numeric value."))
}
stop(paste("The file", lab_thresholds, "has", num_nonnumeric, "record(s) where the variable ", varname, " contains a non-numeric value."))
}
}

message("[CleanLabValues] LAB_thresholds check passed successfully.")


logger::log_info("[CleanLabValues] LAB_thresholds check passed successfully.")
invisible(dt)
}
49 changes: 30 additions & 19 deletions R/clean_lab_main.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,30 +56,42 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u
# Step 1: Fill missing units
dt <- fill_missing_unit(dt, meta_unit_conv, target_unit)

dt_cols <- names(dt)
dt_by_cid <- split(dt, by = "concept_id", keep.by = TRUE, sorted = FALSE)
meta_by_cid <- split(meta_unit_conv, by = "concept_id", keep.by = TRUE, sorted = FALSE)
thresholds_by_cid <- split(meta_thresholds, by = "concept_id", keep.by = TRUE, sorted = FALSE)
empty_meta_cid <- meta_unit_conv[0]
empty_thresholds_cid <- meta_thresholds[0]
input_out_cols <- input_cols
if ("value" %in% input_out_cols) input_out_cols[input_out_cols == "value"] <- "value_origin"
if ("unit" %in% input_out_cols) input_out_cols[input_out_cols == "unit"] <- "unit_origin"

# Prepare result list
result_list <- list()
for (cid in unique(dt$concept_id)) {
for (cid in names(dt_by_cid)) {
# Preserve all columns from the original input for thresholding (e.g., age)
dt_cid <- dt[concept_id == cid, .SD, .SDcols = names(dt)]
meta_cid <- meta_unit_conv[concept_id == cid]
dt_cid <- dt_by_cid[[cid]]
meta_cid <- meta_by_cid[[cid]]
if (is.null(meta_cid)) meta_cid <- empty_meta_cid
target_unit_cid <- target_unit[[cid]]
if (nrow(dt_cid) == 0) next
logger::log_info(paste0("[CleanLabValues] Processing concept_id ", cid, " with ", nrow(dt_cid), " row(s)."))
# Step 2: Prepare unit_matched (unit_filled if present, else unit)
dt_cid[, unit_matched := unit_filled]
dt_cid[is.na(unit_matched) | unit_matched == "", unit_matched := target_unit_cid]
dt_cid[, unit_target := target_unit_cid]
# Mark rows with missing original unit
dt_cid[, unit_missing := is.na(unit_origin) | unit_origin == ""]
# For missing unit, set unit_matched to assumed_unit_if_missing from metadata (per-row, not just first)
# For missing unit, set unit_matched to assumed_unit_if_missing when metadata provides one.
if ("assumed_unit_if_missing" %in% names(meta_cid)) {
idx_missing <- which(dt_cid$unit_missing)
if (length(idx_missing) > 0) {
for (j in idx_missing) {
assumed_unit <- meta_cid[unit_target == dt_cid$unit_target[j] & !is.na(assumed_unit_if_missing) & assumed_unit_if_missing != "", assumed_unit_if_missing][1]
if (!is.na(assumed_unit) && assumed_unit != "") {
dt_cid$unit_matched[j] <- assumed_unit
}
}
assumed_unit <- meta_cid[
unit_target == target_unit_cid &
!is.na(assumed_unit_if_missing) &
assumed_unit_if_missing != "",
assumed_unit_if_missing[1]
]
if (length(assumed_unit) > 0 && !is.na(assumed_unit) && assumed_unit != "") {
dt_cid[unit_missing == TRUE, unit_matched := assumed_unit]
}
}

Expand All @@ -97,8 +109,9 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u
# Always ensure direct match row (unit_matched == unit_target, factor 1, next_attempt from existing rows) is present and first
# Use the first available next_attempt for this concept_id and unit_target, or 0 if none
direct_next_attempt <- 0
if (nrow(meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)]) > 0) {
direct_next_attempt <- meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)][[1, "next_attempt"]]
next_attempt_rows <- meta_cid[unit_target == target_unit_cid & !is.na(next_attempt)]
if (nrow(next_attempt_rows) > 0) {
direct_next_attempt <- next_attempt_rows[[1, "next_attempt"]]
}
direct_row <- data.table::data.table(
concept_id = cid,
Expand All @@ -109,7 +122,8 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u
)
meta_cid_full <- rbind(direct_row, meta_cid_full, fill = TRUE)
# Join thresholds on concept_id and unit_target only
meta_thresholds_cid <- meta_thresholds[concept_id == cid]
meta_thresholds_cid <- thresholds_by_cid[[cid]]
if (is.null(meta_thresholds_cid)) meta_thresholds_cid <- empty_thresholds_cid
meta_cid_full <- merge(meta_cid_full, meta_thresholds_cid, by = c("concept_id", "unit_target"), all.x = TRUE)
# For compatibility with mo_convert, rename unit_origin to unit_matched
setnames(meta_cid_full, "unit_origin", "unit_matched", skip_absent = TRUE)
Expand All @@ -125,13 +139,10 @@ clean_lab_main <- function(dataset, list_analyses = c(), lab_target_units, lab_u
# but expose the original measurement as `value_origin`/`unit_origin` (these are created above),
# then append the cleaning result columns.
result_cols <- c("included", "value", "unit_target", "conversion", "rule_applied")
# map input column names to output names: original `value` -> `value_origin`, `unit` -> `unit_origin`
input_out_cols <- input_cols
if ("value" %in% input_out_cols) input_out_cols[input_out_cols == "value"] <- "value_origin"
if ("unit" %in% input_out_cols) input_out_cols[input_out_cols == "unit"] <- "unit_origin"
out_cols <- c(intersect(input_out_cols, names(dt_cid)), result_cols, ".order_id")
out <- dt_cid[, ..out_cols]
result_list[[cid]] <- out
logger::log_info(paste0("[CleanLabValues] Completed concept_id ", cid, "."))
}
result <- data.table::rbindlist(result_list, fill = TRUE)
# restore original ordering and remove temporary order column (drop explicitly)
Expand Down
5 changes: 5 additions & 0 deletions R/fill_missing_unit.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,14 @@ fill_missing_unit <- function(dt, meta_unit_conv, target_unit, concept_id_col =
for (cid in unique(dt[[concept_id_col]])) {
assumed_unit <- meta_unit_conv[concept_id == cid & unit_origin == "MISSING" & is.na(condition_on_value), assumed_unit_if_missing]
idx <- which(dt[[concept_id_col]] == cid & (dt[[unit_col]] == "" | is.na(dt[[unit_col]])))
if (length(idx) == 0) {
next
}
if (length(assumed_unit) > 0 && !is.na(assumed_unit[1])) {
logger::log_info(paste0("[CleanLabValues] Filling missing units for concept_id ", cid, " using assumed unit '", assumed_unit[1], "' on ", length(idx), " row(s)."))
dt[idx, unit_filled := assumed_unit[1]]
} else {
logger::log_info(paste0("[CleanLabValues] Filling missing units for concept_id ", cid, " using target unit '", target_unit[[cid]], "' on ", length(idx), " row(s)."))
dt[idx, unit_filled := target_unit[[cid]]]
}
}
Expand Down
Loading