Skip to content

Commit 8e38072

Browse files
authored
feat(diann): Add support for DIANN 2.0 (#109)
* feat(diann): Add support for DIANN 2.0 * add unit tests - pending test files * optimize code for large datasets * use parquet file for input instead of CSV for examples * update documentation for quantificationColumn * edit docs for clean diann function * refactor DIANN code to be more readable
1 parent ee9286c commit 8e38072

9 files changed

Lines changed: 241 additions & 62 deletions

DESCRIPTION

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ Suggests:
2929
tinytest,
3030
covr,
3131
knitr,
32-
rmarkdown
32+
rmarkdown,
33+
arrow
3334
Collate:
3435
'clean_ProteinProspector.R'
3536
'clean_Metamorpheus.R'

R/clean_DIANN.R

Lines changed: 176 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,183 @@
11
#' Clean raw Diann files
22
#' @param msstats_object an object of class `MSstatsDIANNFiles`.
33
#' @param MBR `TRUE` if the analysis was done with match between runs (MBR).
4-
#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead.
4+
#' @param quantificationColumn Use 'FragmentQuantCorrected' (default) as the column of quantified intensities for DIANN 1.8.x.
5+
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
6+
#' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.
57
#' @return data.table
68
#' @importFrom stats na.omit
79
#' @keywords internal
8-
.cleanRawDIANN = function(msstats_object, MBR = TRUE,
9-
quantificationColumn = "FragmentQuantCorrected") {
10-
dn_input = getInputFile(msstats_object, "input")
11-
dn_input = data.table::as.data.table(dn_input)
12-
13-
if (!is.element("PrecursorMz", colnames(dn_input))) {
14-
dn_input[, PrecursorMz := NA]
15-
}
16-
if (!is.element('FragmentInfo', colnames(dn_input))) {
17-
dn_input[, FragmentInfo := NA]
18-
}
19-
req_cols = c('ProteinNames', 'StrippedSequence',
20-
'ModifiedSequence', 'PrecursorCharge',
21-
quantificationColumn, 'QValue',
22-
'PrecursorMz', 'FragmentInfo', 'Run')
23-
if (MBR) {
24-
req_cols = c(req_cols, c('LibQValue', 'LibPGQValue'))
25-
} else{
26-
req_cols = c(req_cols, c('GlobalQValue', 'GlobalPGQValue'))
27-
}
28-
dn_input = dn_input[, req_cols, with = FALSE]
29-
dn_input = dn_input[, lapply(.SD, function(x) unlist(tstrsplit(x, ";"))),
30-
.SDcols = c(quantificationColumn, "FragmentInfo"),
31-
by = setdiff(colnames(dn_input), c("FragmentInfo", quantificationColumn))]
32-
if (all(is.na(dn_input[["FragmentInfo"]]))) {
33-
dn_input[, FragmentInfo := paste0("Frag", 1:.N),
34-
by = c("ProteinNames", "ModifiedSequence", "PrecursorCharge", "Run")]
35-
}
36-
dn_input[, (quantificationColumn) := lapply(.SD, as.numeric), .SDcols = quantificationColumn]
37-
dn_input[, FragmentIon := sub('\\^\\.\\*', '', FragmentInfo)]
38-
if (any(grepl("/", dn_input$FragmentInfo))) {
39-
dn_input[, ProductCharge := unlist(strsplit(FragmentInfo, split = "/"))[[1]], by = FragmentInfo]
40-
dn_input[, ProductCharge := strtoi(sub("\\.\\*\\^", "", ProductCharge))]
41-
} else {
42-
dn_input[, ProductCharge := 1]
43-
}
44-
dn_input = dn_input[!grepl("NH3", FragmentIon), ]
45-
dn_input = dn_input[!grepl("H2O", FragmentIon), ]
46-
dn_input = na.omit(dn_input, cols = quantificationColumn)
47-
data.table::setnames(dn_input, old = c('ProteinNames', 'StrippedSequence',
48-
'ModifiedSequence','PrecursorCharge',
49-
quantificationColumn, 'QValue',
50-
'PrecursorMz', 'FragmentIon','Run',
51-
'ProductCharge'),
52-
new = c('ProteinName', 'PeptideSequence',
53-
'PeptideModifiedSequence','PrecursorCharge',
54-
'Intensity', 'DetectionQValue',
55-
'PrecursorMz', 'FragmentIon','Run',
56-
'ProductCharge'),
57-
skip_absent = TRUE)
58-
dn_input[, PeptideSequence := NULL]
59-
setnames(dn_input, "PeptideModifiedSequence", "PeptideSequence")
60-
.logSuccess("DIANN", "clean")
61-
dn_input
10+
.cleanRawDIANN <- function(msstats_object, MBR = TRUE,
                           quantificationColumn = "FragmentQuantCorrected") {
  # Fetch the raw DIANN report and make sure it is a data.table.
  raw <- data.table::as.data.table(getInputFile(msstats_object, "input"))

  # Resolve the intensity column first: with 'auto' this step also adds the
  # combined fragment-intensity column that the later steps select and split.
  quant_col <- .cleanDIANNProcessQuantificationColumns(raw, quantificationColumn)

  # Each step takes the table produced by the previous one.
  with_placeholders <- .cleanDIANNAddMissingColumns(raw)
  selected <- .cleanDIANNSelectRequiredColumns(with_placeholders, quant_col, MBR)
  per_fragment <- .cleanDIANNSplitConcatenatedValues(selected, quant_col)
  annotated <- .cleanDIANNProcessFragmentInfo(per_fragment, quant_col)
  filtered <- .cleanDIANNCleanAndFilterData(annotated, quant_col)
  cleaned <- .cleanDIANNRenameColumns(filtered, quant_col)

  .logSuccess("DIANN", "clean")
  cleaned
}
39+
40+
#' Resolve the quantification column, collapsing per-fragment columns if asked
#'
#' When `quantificationColumn` is `"auto"`, every column whose name matches
#' `Fr<digits>Quantity` is pasted row-wise into a single `;`-separated
#' `FragmentQuantCorrected` column, which is added to `dn_input` by reference.
#' Any other value is returned untouched and `dn_input` is not modified.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name, or `"auto"`
#' @return name of the column holding the `;`-separated fragment intensities
#' @noRd
.cleanDIANNProcessQuantificationColumns <- function(dn_input, quantificationColumn) {
  # Validate up front: a missing or non-scalar value would otherwise make the
  # `if` below fail with an unrelated-looking base R error.
  if (length(quantificationColumn) != 1L || is.na(quantificationColumn)) {
    stop("`quantificationColumn` must be a single non-missing column name or 'auto'.")
  }

  if (quantificationColumn != "auto") {
    return(quantificationColumn)
  }

  fragment_columns <- grep("^Fr[0-9]+Quantity$", names(dn_input), value = TRUE)
  if (length(fragment_columns) == 0) {
    stop("No fragment quantification columns found. Please check your input.")
  }

  # Combine the per-fragment columns into the ';'-separated layout that the
  # rest of the cleaning pipeline splits again.
  dn_input[, FragmentQuantCorrected := do.call(paste, c(.SD, sep = ";")),
           .SDcols = fragment_columns]
  "FragmentQuantCorrected"
}
57+
58+
#' Ensure the optional `PrecursorMz` and `FragmentInfo` columns exist
#'
#' Each of the two columns that is absent from `dn_input` is added by
#' reference and filled with `NA`; columns already present are left alone.
#'
#' @param dn_input data.table input
#' @return `dn_input`, guaranteed to contain both columns
#' @noRd
.cleanDIANNAddMissingColumns <- function(dn_input) {
  for (optional_col in c("PrecursorMz", "FragmentInfo")) {
    if (!(optional_col %in% colnames(dn_input))) {
      dn_input[, (optional_col) := NA]
    }
  }
  dn_input
}
71+
72+
#' Keep only the columns the DIANN cleaning pipeline needs
#'
#' The fixed identification/quantification columns are always kept; the two
#' extra q-value columns depend on `MBR` (`Lib*` when `TRUE`, `Global*`
#' otherwise).
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @param MBR logical indicating if match between runs was used
#' @return data.table restricted to the required columns
#' @noRd
.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) {
  if (MBR) {
    qvalue_cols <- c("LibQValue", "LibPGQValue")
  } else {
    qvalue_cols <- c("GlobalQValue", "GlobalPGQValue")
  }

  selected_cols <- c(
    "ProteinNames", "StrippedSequence", "ModifiedSequence",
    "PrecursorCharge", quantificationColumn, "QValue",
    "PrecursorMz", "FragmentInfo", "Run",
    qvalue_cols
  )
  dn_input[, selected_cols, with = FALSE]
}
92+
93+
#' Expand `;`-separated fragment values into one row per fragment
#'
#' The quantification column and `FragmentInfo` are split on `;`; all other
#' columns act as grouping keys and are repeated for each resulting row.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return data.table with one row per split value
#' @noRd
.cleanDIANNSplitConcatenatedValues <- function(dn_input, quantificationColumn) {
  cols_to_split <- c(quantificationColumn, "FragmentInfo")
  grouping_cols <- setdiff(colnames(dn_input), cols_to_split)

  # Split every value on ';' and flatten the pieces into one vector.
  split_on_semicolon <- function(values) {
    unlist(tstrsplit(values, ";"))
  }

  dn_input[, lapply(.SD, split_on_semicolon),
           .SDcols = cols_to_split,
           by = grouping_cols]
}
107+
108+
#' Derive fragment-level columns from `FragmentInfo`
#'
#' Works on `dn_input` by reference: fills in placeholder fragment labels when
#' `FragmentInfo` is entirely `NA`, converts the quantification column to
#' numeric, and adds the `FragmentIon` and `ProductCharge` columns.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return `dn_input` with `FragmentIon` and `ProductCharge` added
#' @noRd
.cleanDIANNProcessFragmentInfo <- function(dn_input, quantificationColumn) {
  # No fragment annotation at all: label fragments Frag1, Frag2, ... within
  # each protein / modified sequence / precursor charge / run group.
  if (all(is.na(dn_input[["FragmentInfo"]]))) {
    dn_input[, FragmentInfo := paste0("Frag", seq_len(.N)),
             by = c("ProteinNames", "ModifiedSequence", "PrecursorCharge", "Run")]
  }

  # Convert the quantification column to numeric in place.
  dn_input[, (quantificationColumn) := lapply(.SD, as.numeric),
           .SDcols = quantificationColumn]

  # NOTE(review): this pattern only matches the literal text "^.*", so
  # FragmentIon is normally an unchanged copy of FragmentInfo. It looks like
  # "\\^.*" (strip the caret and everything after it) may have been intended;
  # left as is because changing it would alter feature names downstream.
  dn_input[, FragmentIon := sub('\\^\\.\\*', '', FragmentInfo)]

  # A "/" in FragmentInfo means the part before it can be parsed for a charge;
  # otherwise every fragment is assigned charge 1.
  if (any(grepl("/", dn_input[["FragmentInfo"]]))) {
    dn_input[, ProductCharge := .cleanDIANNExtractProductCharge(FragmentInfo), by = FragmentInfo]
  } else {
    dn_input[, ProductCharge := 1]
  }

  dn_input
}
136+
137+
#' Extract product charge from fragment info
#'
#' Takes the text before the first "/" of the first element of
#' `fragment_info`, drops everything up to and including the last "^", and
#' parses what remains as a base-10 integer.
#' NOTE(review): assumes entries look like `<ion>^<charge>/<m/z>` - confirm
#' against real DIANN output.
#'
#' @param fragment_info fragment information string; only the first element
#'   is used (the caller groups by `FragmentInfo`, so all elements are equal)
#' @return integer product charge, or `NA` when the remaining text is not an
#'   integer
#' @noRd
.cleanDIANNExtractProductCharge <- function(fragment_info) {
  ion_part <- unlist(strsplit(fragment_info, split = "/"))[[1]]
  # ".*\\^" strips the ion label through the caret. The previous pattern
  # "\\.\\*\\^" matched only the literal characters ".*^", so the label was
  # never removed and strtoi() returned NA for inputs such as "y7^2".
  strtoi(sub(".*\\^", "", ion_part))
}
145+
146+
#' Drop neutral-loss fragments and rows without an intensity
#'
#' Rows whose `FragmentIon` contains "NH3" or "H2O" are removed, followed by
#' rows where the quantification column is `NA`.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return filtered data.table
#' @noRd
.cleanDIANNCleanAndFilterData <- function(dn_input, quantificationColumn) {
  # One alternation pattern covers both loss types.
  without_losses <- dn_input[!grepl("NH3|H2O", FragmentIon)]
  na.omit(without_losses, cols = quantificationColumn)
}
161+
162+
#' Rename DIANN columns to the standardized names
#'
#' Columns are renamed by reference; names absent from `dn_input` are skipped.
#' The stripped sequence is then dropped and the modified sequence takes over
#' the `PeptideSequence` name.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return `dn_input` with renamed columns
#' @noRd
.cleanDIANNRenameColumns <- function(dn_input, quantificationColumn) {
  # One row per column: current name, standardized name.
  rename_map <- rbind(
    c("ProteinNames", "ProteinName"),
    c("StrippedSequence", "PeptideSequence"),
    c("ModifiedSequence", "PeptideModifiedSequence"),
    c("PrecursorCharge", "PrecursorCharge"),
    c(quantificationColumn, "Intensity"),
    c("QValue", "DetectionQValue"),
    c("PrecursorMz", "PrecursorMz"),
    c("FragmentIon", "FragmentIon"),
    c("Run", "Run"),
    c("ProductCharge", "ProductCharge")
  )
  data.table::setnames(dn_input,
                       old = rename_map[, 1],
                       new = rename_map[, 2],
                       skip_absent = TRUE)

  # Discard the stripped sequence; the modified sequence becomes PeptideSequence.
  dn_input[, PeptideSequence := NULL]
  data.table::setnames(dn_input, "PeptideModifiedSequence", "PeptideSequence")

  dn_input
}

R/converters_DIANNtoMSstatsFormat.R

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
#' @param removeFewMeasurements should proteins with few measurements be removed
2121
#' @param removeOxidationMpeptides should peptides with oxidation be removed
2222
#' @param removeProtein_with1Feature should proteins with a single feature be removed
23-
#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities. 'FragmentQuantRaw' can be used instead.
23+
#' @param quantificationColumn Use 'FragmentQuantCorrected' (default) as the column of quantified intensities for DIANN 1.8.x.
24+
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
25+
#' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.
2426
#' @param ... additional parameters to `data.table::fread`.
2527
#'
2628
#' @return data.frame in the MSstats required format.
@@ -30,7 +32,6 @@
3032
#' @export
3133
#'
3234
#' @examples
33-
#' # See https://github.com/vdemichev/DiaNN/discussions/1525 for workaround for DIANN 2.0
3435
#' input_file_path = system.file("tinytest/raw_data/DIANN/diann_input.tsv",
3536
#' package="MSstatsConvert")
3637
#' annotation_file_path = system.file("tinytest/raw_data/DIANN/annotation.csv",
@@ -40,6 +41,17 @@
4041
#' output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE,
4142
#' use_log_file = FALSE)
4243
#' head(output)
44+
#'
45+
#' # For DIANN 2.0, set quantificationColumn = 'auto'
46+
#' input_file_path_2_0 = system.file("tinytest/raw_data/DIANN/diann_2.0.parquet",
47+
#' package="MSstatsConvert")
48+
#' annotation_file_path_2_0 = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0.csv",
49+
#' package = "MSstatsConvert")
50+
#' input_2_0 = arrow::read_parquet(input_file_path_2_0)
51+
#' annot_2_0 = data.table::fread(annotation_file_path_2_0)
52+
#' output_2_0 = DIANNtoMSstatsFormat(input_2_0, annotation = annot_2_0, MBR = FALSE,
53+
#' use_log_file = FALSE, quantificationColumn = 'auto')
54+
#' head(output_2_0)
4355
DIANNtoMSstatsFormat = function(input, annotation = NULL,
4456
global_qvalue_cutoff = 0.01,
4557
qvalue_cutoff = 0.01,
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Run,BioReplicate,Condition
2+
Run1,1,Control
3+
Run2,2,Control
4+
Run3,3,Control
5+
Run4,4,Control
6+
Run5,5,Treatment
7+
Run6,6,Treatment
8+
Run7,7,Treatment
9+
Run8,8,Treatment
49.8 KB
Binary file not shown.

inst/tinytest/test_converters_DIANNtoMSstatsFormat.R

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,24 @@ expect_true("ProductCharge" %in% colnames(output))
1616
expect_true("IsotopeLabelType" %in% colnames(output))
1717
expect_true("Condition" %in% colnames(output))
1818
expect_true("BioReplicate" %in% colnames(output))
19+
expect_true("Fraction" %in% colnames(output))
20+
21+
# Test DIANNtoMSstatsFormat DIANN 2.0 ------------------------
22+
input_file_path = system.file("tinytest/raw_data/DIANN/diann_2.0.parquet", package="MSstatsConvert")
23+
annotation_file_path = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0.csv", package = "MSstatsConvert")
24+
input = arrow::read_parquet(input_file_path)
25+
annot = data.table::fread(annotation_file_path)
26+
output = DIANNtoMSstatsFormat(input, annotation = annot, MBR = FALSE, use_log_file = FALSE, quantificationColumn = 'auto')
27+
expect_equal(ncol(output), 11)
28+
expect_equal(nrow(output), 180)
29+
expect_true("Run" %in% colnames(output))
30+
expect_true("ProteinName" %in% colnames(output))
31+
expect_true("PeptideSequence" %in% colnames(output))
32+
expect_true("PrecursorCharge" %in% colnames(output))
33+
expect_true("Intensity" %in% colnames(output))
34+
expect_true("FragmentIon" %in% colnames(output))
35+
expect_true("ProductCharge" %in% colnames(output))
36+
expect_true("IsotopeLabelType" %in% colnames(output))
37+
expect_true("Condition" %in% colnames(output))
38+
expect_true("BioReplicate" %in% colnames(output))
1939
expect_true("Fraction" %in% colnames(output))

man/DIANNtoMSstatsFormat.Rd

Lines changed: 14 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/MSstatsClean.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/dot-cleanRawDIANN.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)