@@ -62,7 +62,7 @@ MSstatsPreprocessBig <- function(input_file,
6262 calculateAnomalyScores ,
6363 anomalyModelFeatures )
6464 } else if (backend == " sparklyr" ) {
65- MSstatsPreprocessBigSparklyr(connection , input , output_file_name ,
65+ MSstatsPreprocessBigSparklyr(connection , input_file , output_file_name ,
6666 max_feature_count , filter_unique_peptides ,
6767 aggregate_psms , filter_few_obs ,
6868 remove_annotation )
@@ -96,10 +96,16 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name,
9696 filter_few_obs = FALSE ,
9797 remove_annotation = FALSE ,
9898 connection = NULL ) {
99- MSstatsPreprocessBig(input_file , output_file_name ,
100- backend , max_feature_count , filter_unique_peptides ,
101- aggregate_psms , filter_few_obs , remove_annotation ,
102- connection = connection )
99+ MSstatsPreprocessBig(
100+ input_file = input_file ,
101+ output_file_name = output_file_name ,
102+ backend = backend ,
103+ max_feature_count = max_feature_count ,
104+ filter_unique_peptides = filter_unique_peptides ,
105+ aggregate_psms = aggregate_psms ,
106+ filter_few_obs = filter_few_obs ,
107+ remove_annotation = remove_annotation ,
108+ connection = connection )
103109}
104110
105111
@@ -140,15 +146,23 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
140146 calculateAnomalyScores = FALSE ,
141147 anomalyModelFeatures = c(),
142148 connection = NULL ) {
143- reduceBigSpectronaut(input_file , paste0(" reduce_output_" , output_file_name ),
149+ reduced_file <- .prefixedPath(" reduce_output_" , output_file_name )
150+ reduceBigSpectronaut(input_file , reduced_file ,
144151 intensity , filter_by_excluded , filter_by_identified ,
145152 filter_by_qvalue , qvalue_cutoff ,
146153 calculateAnomalyScores , anomalyModelFeatures )
147154 msstats_data <- MSstatsPreprocessBig(
148- paste0(" reduce_output_" , output_file_name ),
149- output_file_name , backend , max_feature_count ,
150- aggregate_psms , filter_few_obs , remove_annotation , calculateAnomalyScores ,
151- anomalyModelFeatures , connection )
155+ input_file = reduced_file ,
156+ output_file_name = output_file_name ,
157+ backend = backend ,
158+ max_feature_count = max_feature_count ,
159+ filter_unique_peptides = filter_unique_peptides ,
160+ aggregate_psms = aggregate_psms ,
161+ filter_few_obs = filter_few_obs ,
162+ remove_annotation = remove_annotation ,
163+ calculateAnomalyScores = calculateAnomalyScores ,
164+ anomalyModelFeatures = anomalyModelFeatures ,
165+ connection = connection )
152166
153167 return (msstats_data )
154168
@@ -184,22 +198,59 @@ bigDIANNtoMSstatsFormat <- function(input_file,
184198 connection = NULL ) {
185199
186200 # Reduce and clean the DIANN report file in chunks
187- reduceBigDIANN(input_file ,
188- paste0(" reduce_output_" , output_file_name ),
201+ reduced_file <- .prefixedPath(" reduce_output_" , output_file_name )
202+ reduceBigDIANN(input_file ,
203+ reduced_file ,
189204 MBR ,
190205 quantificationColumn ,
191- global_qvalue_cutoff , qvalue_cutoff , pg_qvalue_cutoff ,
206+ global_qvalue_cutoff , qvalue_cutoff , pg_qvalue_cutoff ,
192207 calculateAnomalyScores , anomalyModelFeatures ,
193208 annotation )
194-
209+
210+ reduced <- arrow :: open_dataset(reduced_file , format = " csv" )
211+
212+ # Identify columns where Arrow inferred 'null' type (all values NA)
213+ null_cols <- names(reduced $ schema )[
214+ vapply(reduced $ schema $ fields , function (f ) f $ type $ ToString() == " null" , logical (1 ))
215+ ]
216+
217+ if (length(null_cols ) > 0 ) {
218+ # Drop null-typed columns using a lazy select (no data loaded into memory)
219+ reduced <- dplyr :: select(reduced , - dplyr :: all_of(null_cols ))
220+
221+ # Write back using Arrow's streaming writer — stays out-of-memory.
222+ # write_dataset creates a directory, but open_dataset can read
223+ # directories just as easily as single files.
224+ cleaned_file <- .prefixedPath(" cleaned_" , output_file_name )
225+ arrow :: write_dataset(reduced , cleaned_file , format = " csv" )
226+ reduced_file <- cleaned_file
227+ }
228+
195229 # Preprocess the cleaned data (feature selection, etc.)
196230 msstats_data <- MSstatsPreprocessBig(
197- paste0(" reduce_output_" , output_file_name ),
198- output_file_name , backend , max_feature_count ,
199- filter_unique_peptides , aggregate_psms , filter_few_obs ,
200- remove_annotation , calculateAnomalyScores ,
201- anomalyModelFeatures , connection )
202-
231+ input_file = reduced_file ,
232+ output_file_name = output_file_name ,
233+ backend = backend ,
234+ max_feature_count = max_feature_count ,
235+ filter_unique_peptides = filter_unique_peptides ,
236+ aggregate_psms = aggregate_psms ,
237+ filter_few_obs = filter_few_obs ,
238+ remove_annotation = remove_annotation ,
239+ calculateAnomalyScores = calculateAnomalyScores ,
240+ anomalyModelFeatures = anomalyModelFeatures ,
241+ connection = connection )
242+
243+ # Merge annotation with the preprocessed data and persist the merge so
244+ # callers reopening output_file_name see Condition/BioReplicate. The arrow
245+ # rewrite stays lazy — the underlying source is reduced_file, not
246+ # output_file_name, so we can safely overwrite the directory we just wrote.
247+ if (! is.null(annotation )) {
248+ msstats_data <- MSstatsAddAnnotationBig(msstats_data , annotation )
249+ if (backend == " arrow" ) {
250+ unlink(output_file_name , recursive = TRUE , force = TRUE )
251+ arrow :: write_dataset(msstats_data , output_file_name , format = " csv" )
252+ }
253+ }
203254 return (msstats_data )
204255}
205256
@@ -232,5 +283,19 @@ bigDIANNtoMSstatsFormat <- function(input_file,
232283# ' @return table of `input` and `annotation` merged by Run column.
233284# '
MSstatsAddAnnotationBig <- function(input, annotation) {
  # Merge `annotation` into `input`, matching rows on the Run column only.
  run_key <- "Run"

  # Columns present in both tables. dplyr::tbl_vars() is used to list the
  # input's columns (intended to cover Arrow datasets, arrow_dplyr_query
  # objects, and data frames alike).
  shared_cols <- intersect(dplyr::tbl_vars(input), colnames(annotation))

  # Every shared column other than the join key is removed from `input`
  # before joining, so the joined result carries the annotation's copy of
  # those columns and no suffixed duplicates.
  redundant_cols <- setdiff(shared_cols, run_key)
  if (length(redundant_cols) > 0) {
    input <- dplyr::select(input, -dplyr::all_of(redundant_cols))
  }

  dplyr::inner_join(input, annotation, by = run_key)
}
0 commit comments