|
#' Clean raw MZMine files
#'
#' Operates on the column names produced by MZMine after MSstatsConvert's
#' internal column-name standardization (spaces collapsed and dots removed):
#' "row ID" becomes `rowID`, "row m/z" becomes `rowmz`, "row retention time"
#' becomes `rowretentiontime`, and each "<sample> Peak area" becomes
#' `<standardized-sample>Peakarea`.
#'
#' Neither the table stored in `msstats_object` nor `mzmine_annotations` is
#' modified: all by-reference data.table operations are applied to copies.
#'
#' @param msstats_object an object of class `MSstatsMZMineFiles`.
#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library
#' annotations with columns `id`, `compound_name`, `score`. When supplied,
#' the highest-scoring `compound_name` per feature is used as `ProteinName`;
#' rows with a missing `score` are ranked last, and ties keep the input row
#' order. Features without a matching annotation row (or whose selected
#' `compound_name` is `NA`) fall back to an mz_rt string
#' `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature
#' uses the mz_rt fallback.
#' @return data.table in long format with columns `ProteinName`,
#' `PeptideSequence` (the feature's row ID as character), `PrecursorCharge`,
#' `FragmentIon`, `ProductCharge`, `IsotopeLabelType`, `Run` (the peak-area
#' column name with the `Peakarea` suffix removed) and `Intensity`.
#' @keywords internal
.cleanRawMZMine <- function(msstats_object, mzmine_annotations = NULL) {
    # Placeholders for names used via data.table non-standard evaluation.
    ProteinName = PeptideSequence = Intensity = Run = NULL
    PrecursorCharge = FragmentIon = ProductCharge = IsotopeLabelType = NULL
    sample_col = NULL

    mz_input <- getInputFile(msstats_object, "input")
    mz_input <- data.table::as.data.table(mz_input)

    peak_area_suffix <- "Peakarea"
    peak_area_pattern <- paste0(peak_area_suffix, "$")
    peak_area_cols <- grep(peak_area_pattern, colnames(mz_input),
                           value = TRUE)
    if (length(peak_area_cols) == 0) {
        stop("No 'Peak area' columns found in the input. Expected per-sample ",
             "columns named '<run> Peak area' (e.g. 'sampleA.mzML Peak area').")
    }

    id_col <- "rowID"
    mz_col <- "rowmz"
    rt_col <- "rowretentiontime"
    required_meta <- c(id_col, mz_col, rt_col)
    missing_meta <- setdiff(required_meta, colnames(mz_input))
    if (length(missing_meta) > 0) {
        stop("Missing required MZMine metadata column(s) (expected 'row ID', ",
             "'row m/z', 'row retention time'). After standardization, ",
             "looked for: ", paste(missing_meta, collapse = ", "), ".")
    }

    feature_ids <- mz_input[[id_col]]
    mz_rt_fallback <- paste0(round(mz_input[[mz_col]], 4), "_",
                             round(mz_input[[rt_col]], 2))

    compound <- mz_rt_fallback
    if (!is.null(mzmine_annotations)) {
        ann <- data.table::as.data.table(mzmine_annotations)
        required_ann <- c("id", "compound_name", "score")
        missing_ann <- setdiff(required_ann, colnames(ann))
        if (length(missing_ann) > 0) {
            stop("mzmine_annotations is missing required column(s): ",
                 paste(missing_ann, collapse = ", "), ".")
        }
        # Column subsetting yields a fresh table, so the by-reference sort
        # below cannot reorder a data.table owned by the caller.
        ann <- ann[, required_ann, with = FALSE]
        # na.last = TRUE: setorder()/setorderv() default to placing NA first,
        # which would let an annotation with a missing score outrank every
        # scored hit for the same feature.
        data.table::setorderv(ann, c("id", "score"), order = c(1L, -1L),
                              na.last = TRUE)
        ann_top <- unique(ann, by = "id")
        hit <- match(feature_ids, ann_top[["id"]])
        # as.character() guards against factor columns, whose integer codes
        # would otherwise end up as the compound names.
        matched <- as.character(ann_top[["compound_name"]])[hit]
        has_name <- !is.na(matched)
        compound[has_name] <- matched[has_name]
    }

    # Build the wide table from a column subset (a copy) so that adding
    # columns with `:=` does not alter the table held by `msstats_object`.
    wide <- mz_input[, peak_area_cols, with = FALSE]
    wide[, ProteinName := compound]
    wide[, PeptideSequence := as.character(feature_ids)]

    long <- data.table::melt(
        wide,
        id.vars = c("ProteinName", "PeptideSequence"),
        measure.vars = peak_area_cols,
        variable.name = "sample_col",
        value.name = "Intensity",
        variable.factor = FALSE)

    # MZMine feature tables carry no charge / fragment information here, so
    # these MSstats columns are filled with typed NA placeholders.
    long[, PrecursorCharge := NA_integer_]
    long[, FragmentIon := NA_character_]
    long[, ProductCharge := NA_integer_]
    long[, IsotopeLabelType := "Light"]
    long[, Run := sub(peak_area_pattern, "", sample_col)]

    final_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
                    "FragmentIon", "ProductCharge", "IsotopeLabelType",
                    "Run", "Intensity")
    long <- long[, final_cols, with = FALSE]

    .logSuccess("MZMine", "clean")
    long
}
0 commit comments