Skip to content

Commit 589ebb5

Browse files
committed
Add MZMinetoMSstatsFormat converter
Brings metabolomics into the MSstats family by adding an MZMine converter that mirrors the structure of DIANNtoMSstatsFormat. Phase 1 of a two-phase task; Phase 2 (MSstatsShiny BIO=Metabolomics) will be a separate PR.
1 parent b9564f2 commit 589ebb5

16 files changed

Lines changed: 513 additions & 3 deletions

.Rbuildignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@
66
^_pkgdown\.yml$
77
^docs$
88
^pkgdown$
9+
^\.positai$
10+
^\.claude$
11+
^doc$
12+
^Meta$

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,7 @@ inst/doc
1010
*.so
1111
*.dll
1212
.lintr
13-
.vscode
13+
.vscode
14+
.positai
15+
/doc/
16+
/Meta/

DESCRIPTION

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Suggests:
3535
rmarkdown
3636
LinkingTo: Rcpp
3737
Collate:
38+
'clean_MZMine.R'
3839
'clean_ProteinProspector.R'
3940
'clean_Metamorpheus.R'
4041
'clean_DIANN.R'
@@ -53,6 +54,7 @@ Collate:
5354
'converters_DIANNtoMSstatsFormat.R'
5455
'converters_DIAUmpiretoMSstatsFormat.R'
5556
'converters_FragPipetoMSstatsFormat.R'
57+
'converters_MZMinetoMSstatsFormat.R'
5658
'converters_MaxQtoMSstatsFormat.R'
5759
'converters_MaxQtoMSstatsTMTFormat.R'
5860
'converters_MetamorpheusToMSstatsFormat.R'
@@ -81,4 +83,4 @@ Collate:
8183
'utils_fractions.R'
8284
'utils_logging.R'
8385
'utils_shared_peptides.R'
84-
VignetteBuilder: knitr
86+
VignetteBuilder: knitr

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ export(MSstatsLogsSettings)
1414
export(MSstatsMakeAnnotation)
1515
export(MSstatsPreprocess)
1616
export(MSstatsSaveSessionInfo)
17+
export(MZMinetoMSstatsFormat)
1718
export(MaxQtoMSstatsFormat)
1819
export(MaxQtoMSstatsTMTFormat)
1920
export(MetamorpheusToMSstatsFormat)

R/MSstatsConvert_core_functions.R

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ setClass("MSstatsMetamorpheusFiles", contains = "MSstatsInputFiles")
7171
#' @rdname MSstatsInputFiles
7272
#' @keywords internal
7373
setClass("MSstatsProteinProspectorFiles", contains = "MSstatsInputFiles")
74+
#' MSstatsMZMineFiles: class for MZMine files.
#' @rdname MSstatsInputFiles
#' @keywords internal
setClass(Class = "MSstatsMZMineFiles",
         contains = "MSstatsInputFiles")
7478

7579

7680
#' Get one of files contained in an instance of `MSstatsInputFiles` class.
@@ -291,8 +295,15 @@ setMethod("MSstatsClean", signature = "MSstatsMetamorpheusFiles",
291295
#' @rdname MSstatsClean
292296
#' @inheritParams .cleanRawProteinProspector
293297
#' @return data.table
294-
setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles",
298+
setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles",
295299
.cleanRawProteinProspector)
300+
#' Clean MZMine files
#' @include clean_MZMine.R
#' @rdname MSstatsClean
#' @inheritParams .cleanRawMZMine
#' @return data.table
setMethod(f = "MSstatsClean",
          signature = "MSstatsMZMineFiles",
          definition = .cleanRawMZMine)
296307

297308

298309
#' Preprocess outputs from MS signal processing tools for analysis with MSstats

R/clean_MZMine.R

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#' Clean raw MZMine files
#'
#' Operates on the column names produced by MZMine after MSstatsConvert's
#' internal column-name standardization (spaces collapsed and dots removed):
#' "row ID" becomes `rowID`, "row m/z" becomes `rowmz`, "row retention time"
#' becomes `rowretentiontime`, and each "<sample> Peak area" becomes
#' `<standardized-sample>Peakarea`.
#'
#' The wide feature table is reshaped to long format with one row per
#' feature/run pair. `PeptideSequence` holds the MZMine row ID as character,
#' `Run` is the peak-area column name with the `Peakarea` suffix removed,
#' `IsotopeLabelType` is set to `"Light"`, and `PrecursorCharge`,
#' `FragmentIon` and `ProductCharge` are filled with `NA`.
#'
#' @param msstats_object an object of class `MSstatsMZMineFiles`.
#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library
#' annotations with columns `id`, `compound_name`, `score`. When supplied,
#' the highest-scoring `compound_name` per feature is used as `ProteinName`;
#' annotation rows with a missing `score` rank below every scored row of the
#' same feature. Features without a matching annotation row (or whose best
#' match has a missing `compound_name`) fall back to an mz_rt string
#' `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature
#' uses the mz_rt fallback.
#' @return data.table with columns `ProteinName`, `PeptideSequence`,
#' `PrecursorCharge`, `FragmentIon`, `ProductCharge`, `IsotopeLabelType`,
#' `Run` and `Intensity`.
#' @keywords internal
.cleanRawMZMine <- function(msstats_object, mzmine_annotations = NULL) {
    # NULL bindings for column names used in data.table non-standard
    # evaluation, so that R CMD check does not report undefined globals.
    ProteinName = PeptideSequence = Intensity = Run = NULL
    PrecursorCharge = FragmentIon = ProductCharge = IsotopeLabelType = NULL
    sample_col = id = score = compound_name = NULL

    mz_input <- getInputFile(msstats_object, "input")
    mz_input <- data.table::as.data.table(mz_input)

    # Per-sample quantification columns are recognised by their suffix.
    peak_area_suffix <- "Peakarea"
    peak_area_cols <- grep(paste0(peak_area_suffix, "$"),
                           colnames(mz_input), value = TRUE)
    if (length(peak_area_cols) == 0) {
        stop("No 'Peak area' columns found in the input. Expected per-sample ",
             "columns named '<run> Peak area' (e.g. 'sampleA.mzML Peak area').")
    }

    id_col <- "rowID"
    mz_col <- "rowmz"
    rt_col <- "rowretentiontime"
    required_meta <- c(id_col, mz_col, rt_col)
    missing_meta <- setdiff(required_meta, colnames(mz_input))
    if (length(missing_meta) > 0) {
        stop("Missing required MZMine metadata column(s) (expected 'row ID', ",
             "'row m/z', 'row retention time'). After standardization, ",
             "looked for: ", paste(missing_meta, collapse = ", "), ".")
    }

    # Name used for every feature that has no usable library annotation.
    mz_rt_fallback <- paste0(round(mz_input[[mz_col]], 4), "_",
                             round(mz_input[[rt_col]], 2))

    if (!is.null(mzmine_annotations)) {
        ann <- data.table::as.data.table(mzmine_annotations)
        required_ann <- c("id", "compound_name", "score")
        missing_ann <- setdiff(required_ann, colnames(ann))
        if (length(missing_ann) > 0) {
            stop("mzmine_annotations is missing required column(s): ",
                 paste(missing_ann, collapse = ", "), ".")
        }
        # setorder() places NA first by default, which would let an unscored
        # annotation outrank every scored one; na.last = TRUE prevents that.
        data.table::setorder(ann, id, -score, na.last = TRUE)
        # After sorting, the first row per id is the best-scoring annotation.
        ann_top <- unique(ann, by = "id")
        # as.character() guards against factor compound names, whose integer
        # codes would otherwise end up in ProteinName via ifelse().
        matched <- as.character(
            ann_top[match(mz_input[[id_col]], ann_top[["id"]]),
                    compound_name])
        compound <- ifelse(is.na(matched), mz_rt_fallback, matched)
    } else {
        compound <- mz_rt_fallback
    }

    mz_input[, ProteinName := compound]
    mz_input[, PeptideSequence := as.character(get(id_col))]

    # Wide -> long: one row per feature and peak-area column.
    long <- data.table::melt(
        mz_input,
        id.vars = c("ProteinName", "PeptideSequence"),
        measure.vars = peak_area_cols,
        variable.name = "sample_col",
        value.name = "Intensity",
        variable.factor = FALSE)

    # Columns MSstats expects but MZMine feature tables do not provide.
    long[, PrecursorCharge := NA_integer_]
    long[, FragmentIon := NA_character_]
    long[, ProductCharge := NA_integer_]
    long[, IsotopeLabelType := "Light"]
    long[, Run := sub(paste0(peak_area_suffix, "$"), "", sample_col)]

    final_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
                    "FragmentIon", "ProductCharge", "IsotopeLabelType",
                    "Run", "Intensity")
    long <- long[, final_cols, with = FALSE]

    .logSuccess("MZMine", "clean")
    long
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#' Import MZMine files
#'
#' Converts a wide MZMine feature-quantification table into the long format
#' required by MSstats. The input is imported and cleaned, merged with the
#' run annotation, preprocessed, and finally expanded to a balanced design.
#' Intensities equal to zero are replaced by `NA` before the design is
#' balanced.
#'
#' @inheritParams .sharedParametersAmongConverters
#' @param input MZMine feature-quantification table (wide format; one row per
#' feature). Must include the metadata columns `row ID`, `row m/z`,
#' `row retention time`, and per-sample peak-area columns named
#' `"<run> Peak area"` (e.g. `"sampleA.mzML Peak area"`).
#' @param annotation `data.frame` with columns `Run`, `Condition`,
#' `BioReplicate`. `Run` values must match the sample column names with the
#' trailing `" Peak area"` stripped.
#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library
#' annotations with columns `id`, `compound_name`, `score`. When supplied,
#' the highest-scoring `compound_name` per feature is used as `ProteinName`;
#' features without a matching annotation row fall back to an mz_rt string
#' `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature
#' uses the mz_rt fallback.
#' @param removeProtein_with1Feature `TRUE` will remove proteins (compounds)
#' represented by a single feature. Default `FALSE`.
#' @param summaryforMultipleRows `max` (default) or `sum`, used when multiple
#' rows map to the same feature/run combination.
#'
#' @return data.table in the MSstats required format.
#'
#' @export
#'
#' @examples
#' input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
#'                          package = "MSstatsConvert")
#' annot_path = system.file("tinytest/raw_data/MZMine/annotation.csv",
#'                          package = "MSstatsConvert")
#' lib_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv",
#'                        package = "MSstatsConvert")
#' input = data.table::fread(input_path)
#' annot = data.table::fread(annot_path)
#' lib = data.table::fread(lib_path)
#' output = MZMinetoMSstatsFormat(input, annotation = annot,
#'                                mzmine_annotations = lib,
#'                                use_log_file = FALSE)
#' head(output)
MZMinetoMSstatsFormat = function(
    input,
    annotation = NULL,
    mzmine_annotations = NULL,
    removeProtein_with1Feature = FALSE,
    summaryforMultipleRows = max,
    use_log_file = TRUE,
    append = FALSE,
    verbose = TRUE,
    log_file_path = NULL,
    ...) {
    # NULL binding for the column used in data.table non-standard evaluation
    # below, so that R CMD check does not report an undefined global.
    Intensity = NULL

    MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
                                        log_file_path)

    input = MSstatsConvert::MSstatsImport(list(input = input),
                                          "MSstats", "MZMine")
    input = MSstatsConvert::MSstatsClean(
        input, mzmine_annotations = mzmine_annotations)
    annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)

    feature_columns = c("PeptideSequence", "PrecursorCharge",
                        "FragmentIon", "ProductCharge")
    # Only request a fill value when the cleaned table lacks the column.
    fill_isotope_label_type = if ("IsotopeLabelType" %in% colnames(input))
        list() else list("IsotopeLabelType" = "Light")

    input = MSstatsConvert::MSstatsPreprocess(
        input,
        annotation,
        feature_columns,
        remove_shared_peptides = FALSE,
        remove_single_feature_proteins = removeProtein_with1Feature,
        exact_filtering = NULL,
        pattern_filtering = NULL,
        aggregate_isotopic = FALSE,
        feature_cleaning = list(
            remove_features_with_few_measurements = FALSE,
            summarize_multiple_psms = summaryforMultipleRows),
        columns_to_fill = c(list(Fraction = 1), fill_isotope_label_type))
    # A zero peak area is recorded as a missing value, not a measurement.
    input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]

    input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns,
                                                  fill_incomplete = TRUE,
                                                  handle_fractions = FALSE,
                                                  remove_few = FALSE)

    msg_final = paste("** Finished preprocessing. The dataset is ready",
                      "to be processed by the dataProcess function.")
    getOption("MSstatsLog")("INFO", msg_final)
    getOption("MSstatsMsg")("INFO", msg_final)
    getOption("MSstatsLog")("INFO", "\n")
    input
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Run,Condition,BioReplicate
2+
sampleA.mzML,Control,1
3+
sampleB.mzML,Control,2
4+
sampleC.mzML,Treatment,3
5+
sampleD.mzML,Treatment,4
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
id,compound_name,score,adduct
2+
1,Caffeine,0.95,[M+H]+
3+
2,GlucoseLow,0.72,[M+H]+
4+
2,GlucoseHigh,0.91,[M-H]-
5+
3,Lactate,0.88,[M+H]+
6+
6,Caffeine,0.80,[M+Na]+
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
row ID,row m/z,row retention time,sampleA.mzML Peak area,sampleB.mzML Peak area,sampleC.mzML Peak area,sampleD.mzML Peak area
2+
1,123.0560,1.23,1000,1100,1200,1300
3+
2,245.1290,3.45,5000,4800,5200,4900
4+
3,367.2010,5.67,800,0,750,820
5+
4,489.3340,7.89,2000,2100,1900,2050
6+
5,555.4470,9.10,100,0,0,0
7+
6,123.0560,1.45,600,650,700,680

0 commit comments

Comments
 (0)