#' Clean raw Diann files
#'
#' Reads the DIANN report stored in `msstats_object`, converts it to a
#' `data.table` and runs the cleaning steps in a fixed order: resolve the
#' quantification column, add placeholder columns, keep the required columns,
#' split the `;`-concatenated fragment values into one row per fragment,
#' derive fragment ion / product charge columns, filter, and rename.
#'
#' @param msstats_object an object of class `MSstatsDIANNFiles`.
#' @param MBR True if analysis was done with match between runs
#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x.
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
#' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.
#' @return data.table
#' @importFrom stats na.omit
#' @keywords internal
.cleanRawDIANN <- function(msstats_object, MBR = TRUE,
                           quantificationColumn = "FragmentQuantCorrected") {
  dn_input <- getInputFile(msstats_object, "input")
  dn_input <- data.table::as.data.table(dn_input)

  # Must run first: for "auto" this adds the combined quantification column
  # to `dn_input` by reference and returns the name of that column.
  quantificationColumn <- .cleanDIANNProcessQuantificationColumns(
    dn_input, quantificationColumn
  )

  # Placeholders for PrecursorMz / FragmentInfo so the selection below works.
  dn_input <- .cleanDIANNAddMissingColumns(dn_input)

  # Keep only the columns the later steps rely on (depends on MBR).
  dn_input <- .cleanDIANNSelectRequiredColumns(
    dn_input, quantificationColumn, MBR
  )

  # One row per fragment instead of ";"-joined strings.
  dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn)

  # Numeric intensities plus FragmentIon and ProductCharge columns.
  dn_input <- .cleanDIANNProcessFragmentInfo(dn_input, quantificationColumn)

  # Drop NH3 / H2O fragments and rows without an intensity.
  dn_input <- .cleanDIANNCleanAndFilterData(dn_input, quantificationColumn)

  # Switch to the standardized column names.
  dn_input <- .cleanDIANNRenameColumns(dn_input, quantificationColumn)

  .logSuccess("DIANN", "clean")
  dn_input
}
39+
#' Process quantification columns for DIANN 2.0 format
#'
#' When `quantificationColumn` is `"auto"`, every column whose name matches
#' `Fr<digits>Quantity` is pasted row-wise into one `;`-separated column named
#' `FragmentQuantCorrected`. That column is added to `dn_input` by reference
#' (an existing column of that name is overwritten), and its name is returned.
#' Any other value is returned unchanged and `dn_input` is not touched.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name, or `"auto"`
#' @return updated quantification column name
#' @noRd
.cleanDIANNProcessQuantificationColumns <- function(dn_input, quantificationColumn) {
  # Fail early with a clear message instead of an obscure error inside `if`.
  stopifnot(
    is.character(quantificationColumn),
    length(quantificationColumn) == 1L,
    !is.na(quantificationColumn)
  )

  if (quantificationColumn != "auto") {
    return(quantificationColumn)
  }

  # Columns are used in the order they appear in the input.
  fragment_columns <- grep("^Fr[0-9]+Quantity$", names(dn_input), value = TRUE)
  if (length(fragment_columns) == 0) {
    stop("No fragment quantification columns found. Please check your input.",
         call. = FALSE)
  }

  # Same ";"-joined layout as the single-column formats, so the later
  # splitting step can treat every format the same way.
  dn_input[, FragmentQuantCorrected := do.call(paste, c(.SD, sep = ";")),
           .SDcols = fragment_columns]
  "FragmentQuantCorrected"
}
57+
#' Add missing required columns
#'
#' Adds `PrecursorMz` and/or `FragmentInfo` filled with `NA` when the input
#' does not have them. Columns are added by reference, so the table passed in
#' is modified as well. An input that already has both columns is returned
#' untouched.
#'
#' @param dn_input data.table input
#' @return data.table with missing columns added
#' @noRd
.cleanDIANNAddMissingColumns <- function(dn_input) {
  present <- colnames(dn_input)
  if (!("PrecursorMz" %in% present)) {
    dn_input[, PrecursorMz := NA]
  }
  if (!("FragmentInfo" %in% present)) {
    dn_input[, FragmentInfo := NA]
  }
  dn_input
}
71+
#' Select required columns based on MBR setting
#'
#' Keeps the identification / quantification columns plus the pair of
#' q-value columns chosen by `MBR`: `LibQValue` and `LibPGQValue` when `MBR`
#' is TRUE, `GlobalQValue` and `GlobalPGQValue` otherwise. Stops with a
#' message listing the absent columns when any required column is missing.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @param MBR logical indicating if match between runs was used
#' @return data.table with selected columns
#' @noRd
.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR) {
  base_cols <- c("ProteinNames", "StrippedSequence", "ModifiedSequence",
                 "PrecursorCharge", quantificationColumn, "QValue",
                 "PrecursorMz", "FragmentInfo", "Run")

  mbr_cols <- if (MBR) {
    c("LibQValue", "LibPGQValue")
  } else {
    c("GlobalQValue", "GlobalPGQValue")
  }

  req_cols <- c(base_cols, mbr_cols)

  # Name the missing columns explicitly; a wrong MBR setting or a wrong
  # quantification column name is the usual way to end up here.
  missing_cols <- setdiff(req_cols, colnames(dn_input))
  if (length(missing_cols) > 0) {
    stop("Required DIANN column(s) not found in the input: ",
         paste(missing_cols, collapse = ", "),
         call. = FALSE)
  }

  dn_input[, req_cols, with = FALSE]
}
92+
#' Split concatenated values in quantification and fragment info columns
#'
#' The quantification column and `FragmentInfo` hold `;`-separated values.
#' Both are split on `;` and expanded to one row per value, grouping by every
#' other column so those columns are repeated for each fragment.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return data.table with split values
#' @noRd
.cleanDIANNSplitConcatenatedValues <- function(dn_input, quantificationColumn) {
  split_cols <- c(quantificationColumn, "FragmentInfo")
  by_cols <- setdiff(colnames(dn_input), split_cols)

  # ";" is a plain separator, so fixed matching gives the same pieces as the
  # regex form without the regex overhead.
  # NOTE(review): rows that agree on every `by_cols` value fall into one group
  # and are split together - presumably such duplicates do not occur; confirm.
  split_values <- function(x) unlist(tstrsplit(x, ";", fixed = TRUE))

  dn_input[, lapply(.SD, split_values),
           .SDcols = split_cols,
           by = by_cols]
}
107+
#' Process fragment information and add derived columns
#'
#' Works on `dn_input` by reference and:
#' \itemize{
#'   \item fills `FragmentInfo` with `Frag1`, `Frag2`, ... (numbered within
#'         protein / modified sequence / precursor charge / run) when every
#'         value is `NA`;
#'   \item converts the quantification column to numeric;
#'   \item adds `FragmentIon`, derived from `FragmentInfo`;
#'   \item adds `ProductCharge`, taken from `FragmentInfo` when any value
#'         contains a `/`, otherwise set to 1.
#' }
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return data.table with processed fragment info
#' @noRd
.cleanDIANNProcessFragmentInfo <- function(dn_input, quantificationColumn) {
  # No fragment annotation at all: number the fragments within each precursor
  # and run. seq_len() stays correct for an empty group, unlike 1:.N.
  if (all(is.na(dn_input[["FragmentInfo"]]))) {
    dn_input[, FragmentInfo := paste0("Frag", seq_len(.N)),
             by = c("ProteinNames", "ModifiedSequence", "PrecursorCharge", "Run")]
  }

  # Values are character after splitting; intensities must be numeric.
  dn_input[, (quantificationColumn) := lapply(.SD, as.numeric),
           .SDcols = quantificationColumn]

  # NOTE(review): this pattern matches the literal text "^.*" (dot and star are
  # escaped), so values without that exact text pass through unchanged.
  # Presumably "\\^.*" was intended - confirm before changing, because
  # FragmentIon is part of the returned data.
  dn_input[, FragmentIon := sub("\\^\\.\\*", "", FragmentInfo)]

  if (any(grepl("/", dn_input[["FragmentInfo"]], fixed = TRUE))) {
    # Grouping by FragmentInfo parses each distinct value only once.
    dn_input[, ProductCharge := .cleanDIANNExtractProductCharge(FragmentInfo),
             by = FragmentInfo]
  } else {
    dn_input[, ProductCharge := 1]
  }

  dn_input
}
136+
#' Extract product charge from fragment info
#'
#' Takes the part of each string before the first `/` and returns the integer
#' that follows the last `^` in that part, e.g. `"b4^1/455.25"` gives `1`.
#' Strings without a parsable charge after `^` give `NA`.
#'
#' The previous pattern escaped the dot and star, so it only matched the
#' literal text `.*^`; nothing was removed and `strtoi()` returned `NA` for
#' every annotated fragment.
#'
#' @param fragment_info fragment information string (character vector)
#' @return integer product charge, one value per element of `fragment_info`
#' @noRd
.cleanDIANNExtractProductCharge <- function(fragment_info) {
  # Drop everything from the first "/" onwards.
  ion_part <- sub("/.*$", "", fragment_info)
  # Greedy match up to the last "^" leaves only the charge digits.
  strtoi(sub("^.*\\^", "", ion_part))
}
145+
#' Clean and filter data by removing unwanted fragments and NA values
#'
#' Drops rows whose `FragmentIon` contains `NH3` or `H2O`, then drops rows
#' with `NA` in the quantification column.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return cleaned data.table
#' @noRd
.cleanDIANNCleanAndFilterData <- function(dn_input, quantificationColumn) {
  # One pass with an alternation keeps exactly the rows the two separate
  # NH3 / H2O filters kept.
  dn_input <- dn_input[!grepl("NH3|H2O", FragmentIon)]

  # Only the quantification column is checked for NA.
  na.omit(dn_input, cols = quantificationColumn)
}
161+
#' Rename columns to standardized names
#'
#' Renames the DIANN columns by reference: `ProteinNames` to `ProteinName`,
#' the quantification column to `Intensity`, and `QValue` to
#' `DetectionQValue`. The stripped sequence is dropped and the modified
#' sequence is exposed as `PeptideSequence`.
#'
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
#' @return data.table with renamed columns
#' @noRd
.cleanDIANNRenameColumns <- function(dn_input, quantificationColumn) {
  # Only names that actually change are listed; PrecursorCharge, PrecursorMz,
  # FragmentIon, Run and ProductCharge keep their names.
  old_names <- c("ProteinNames", "StrippedSequence", "ModifiedSequence",
                 quantificationColumn, "QValue")
  new_names <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence",
                 "Intensity", "DetectionQValue")

  data.table::setnames(dn_input, old = old_names, new = new_names,
                       skip_absent = TRUE)

  # Replace the stripped sequence with the modified one under the name
  # PeptideSequence.
  dn_input[, PeptideSequence := NULL]
  data.table::setnames(dn_input, "PeptideModifiedSequence", "PeptideSequence")

  dn_input
}
0 commit comments