44# ' @param path the destination of the output files
55# ' (gtf, transcript counts, and gene counts)
66# ' @param prefix the prefix of the output files
7- # ' @details The function will write the output from Bambu to files. The
8- # ' annotations will be written to a .gtf file, transcript counts (total counts,
7+ # ' @param outputAnnFormat the file format in which to output annotations, must
8+ # ' be one of \code{"gtf"} or \code{"gff3"}. \code{"gtf"} is specified by default.
9+ # ' @details The function will write the output from Bambu to files. The
10+ # ' annotations will be written to a .gtf or .gff3 file, transcript counts (total counts,
911# ' CPM, full-length counts, partial-length counts, and unique counts) and gene counts
10- # ' will be written to .txt files.
12+ # ' will be written to .txt files.
1113# ' @export
1214# ' @examples
1315# ' se <- readRDS(system.file("extdata",
1719# ' path <- tempdir()
1820# ' writeBambuOutput(se, path)
1921writeBambuOutput <- function (se , path , prefix = " " , outputExtendedAnno = TRUE ,
20- outputAll = TRUE , outputBambuModels = TRUE , outputNovelOnly = TRUE , seperateSamples = FALSE ) {
21- if (missing(se ) | missing(path )) {
22- stop(" Both summarizedExperiment object from bambu and
22+ outputAll = TRUE , outputBambuModels = TRUE , outputNovelOnly = TRUE , seperateSamples = FALSE ,
23+ outputAnnFormat = " gtf" ) {
24+
25+ if (missing(se ) | missing(path )) {
26+ stop(" Both summarizedExperiment object from bambu and
2327 the path for the output files are required." )
24- } else {
25- outdir <- paste0(path , " /" )
26- if (! dir.exists(outdir ))
27- dir.create(outdir , recursive = TRUE )
28-
29- transcript_grList <- rowRanges(se )
30- prefix <- ifelse(prefix != " " , paste0(prefix , " _" ), " " )
31- transcript_gtffn <- paste(outdir , prefix , sep = " " )
32- gtf <- writeAnnotationsToGTF(annotation = transcript_grList ,
33- file = transcript_gtffn , outputExtendedAnno = outputExtendedAnno ,
34- outputAll = outputAll , outputBambuModels = outputBambuModels , outputNovelOnly = outputNovelOnly )
28+ } else if (! outputAnnFormat %in% c(" gtf" , " gff3" )) {
29+ stop(" Please specify a valid annotation outputAnnFormat: 'gtf' or 'gff3', or leave blank to output to GTF by default." )
30+ } else {
31+ outdir <- paste0(path , " /" )
32+ if (! dir.exists(outdir ))
33+ dir.create(outdir , recursive = TRUE )
34+
35+ transcript_grList <- rowRanges(se )
36+ prefix <- ifelse(prefix != " " , paste0(prefix , " _" ), " " )
37+ transcript_annfn <- paste(outdir , prefix , sep = " " )
38+
39+ if (outputAnnFormat == " gtf" ) {
40+ gtf <- writeAnnotationsToGTF(annotation = transcript_grList ,
41+ file = transcript_annfn , outputExtendedAnno = outputExtendedAnno ,
42+ outputAll = outputAll , outputBambuModels = outputBambuModels , outputNovelOnly = outputNovelOnly )
43+ }
44+
45+ if (outputAnnFormat == " gff3" ) {
46+ gff3 <- writeAnnotationsToGFF3(annotation = transcript_grList ,
47+ file = transcript_annfn , outputExtendedAnno = outputExtendedAnno ,
48+ outputAll = outputAll , outputBambuModels = outputBambuModels , outputNovelOnly = outputNovelOnly )
49+ }
3550
36- utils :: write.table(colData(se ), file = paste0(transcript_gtffn , " sampleData.tsv" ),
51+ utils :: write.table(colData(se ), file = paste0(transcript_annfn , " sampleData.tsv" ),
3752 sep = " \t " , quote = FALSE , row.names = FALSE , col.names = TRUE )
3853 for (d in names(assays(se ))){
3954 writeCountsOutput(se , varname = d ,
@@ -43,17 +58,17 @@ writeBambuOutput <- function(se, path, prefix = "", outputExtendedAnno = TRUE,
4358 # write incompatible counts
4459 if (! is.null(metadata(se )$ incompatibleCounts )){
4560 estimates <- metadata(se )$ incompatibleCounts
46- estimatesfn <- paste(transcript_gtffn , " incompatibleCounts.mtx" , sep = " " )
61+ estimatesfn <- paste(transcript_annfn , " incompatibleCounts.mtx" , sep = " " )
4762 Matrix :: writeMM(estimates , estimatesfn )
4863 }
4964 seGene <- transcriptToGeneExpression(se )
5065 writeCountsOutput(seGene , varname = ' counts' , feature = ' gene' ,outdir , prefix )
5166 # utils::write.table(paste0(colnames(se), "-1"), file = paste0(outdir, "barcodes.tsv"), quote = FALSE, row.names = FALSE, col.names = FALSE)
5267 # R.utils::gzip(paste0(outdir, "barcodes.tsv"))
5368 txANDGenes <- data.table(as.data.frame(rowData(se ))[,c(" TXNAME" ," GENEID" )])
54- utils :: write.table(txANDGenes , file = paste0(transcript_gtffn , " txANDgenes.tsv" ),
69+ utils :: write.table(txANDGenes , file = paste0(transcript_annfn , " txANDgenes.tsv" ),
5570 sep = " \t " , quote = FALSE , row.names = FALSE , col.names = FALSE )
56- utils :: write.table(names(seGene ), file = paste0(transcript_gtffn , " genes.tsv" ),
71+ utils :: write.table(names(seGene ), file = paste0(transcript_annfn , " genes.tsv" ),
5772 sep = " \t " , quote = FALSE , row.names = FALSE , col.names = FALSE )
5873
5974 # R.utils::gzip(paste0(outdir, "txANDgenes.tsv"))
@@ -247,6 +262,135 @@ writeAnnotationsToGTF <- function(annotation, file, geneIDs = NULL, outputExtend
247262 }
248263}
249264
265+ # Write to GFF3 file
266+ writeToGFF3 <- function (annotation , file , geneIDs = NULL ) {
267+
268+ if (missing(annotation ) | missing(file )) {
269+ stop(" Both GRangesList and the name of the output file are required." )
270+ } else if (! is(annotation , " CompressedGRangesList" )) { # Check filename
271+ stop(" The inputted GRangesList is of the wrong class." )
272+ }
273+
274+ NDR = NULL
275+ txScore = NULL
276+ txScore.noFit = NULL
277+ novelGene = NULL
278+ novelTranscript = NULL
279+ txClassDescription = NULL
280+
281+ df <- as_tibble(annotation )
282+
283+ # Build exon number in GTF format
284+ df $ exon_rank <- paste(" rank=" , df $ exon_rank , " ;" , sep = " " )
285+
286+ # if there is an NDR column
287+ if (! is.null(mcols(annotation )$ NDR )){
288+
289+ NDR = rep(mcols(annotation )$ NDR , unname(elementNROWS(annotation )))
290+ txScore = rep(mcols(annotation )$ maxTxScore , unname(elementNROWS(annotation )))
291+ txScore.noFit = rep(mcols(annotation )$ maxTxScore.noFit , unname(elementNROWS(annotation )))
292+ novelGene = rep(mcols(annotation )$ novelGene , unname(elementNROWS(annotation )))
293+ novelTranscript = rep(mcols(annotation )$ novelTranscript , unname(elementNROWS(annotation )))
294+ txClassDescription = rep(mcols(annotation )$ txClassDescription , unname(elementNROWS(annotation )))
295+
296+ df $ NDR <- paste(" NDR=" , as.character(NDR ), " ;" , sep = " " )
297+ df $ txScore <- paste(" maxTxScore=" , as.character(txScore ), " ;" , sep = " " )
298+ df $ txScore.noFit <- paste(" maxTxScore.noFit=" , as.character(txScore.noFit ), " ;" , sep = " " )
299+ df $ novelGene <- paste(" novelGene=" , as.character(novelGene ), " ;" , sep = " " )
300+ df $ novelTranscript <- paste(" novelTranscript=" , as.character(novelTranscript ), " ;" , sep = " " )
301+ df $ txClassDescription <- paste(" txClassDescription=" , as.character(txClassDescription ), " ;" , sep = " " )
302+ }
303+
304+ # Join gene IDs
305+ if (is.null(geneIDs )) {
306+ if (! is.null(mcols(annotation , use.names = FALSE )$ GENEID )) {
307+ geneIDs <- as_tibble(mcols(annotation , use.names = FALSE )[,
308+ c(" TXNAME" , " GENEID" )])
309+ geneIDs $ seqnames <- unlist(unique(seqnames(annotation )))
310+ geneIDs $ strand <- unlist(unique(strand(annotation )))
311+ }
312+ }
313+
314+ df <- left_join(df , geneIDs [, c(" TXNAME" , " GENEID" )],
315+ by = c(" group_name" = " TXNAME" ))
316+
317+ # Convert group_name and gene ID into GFF3 format
318+ df $ group_name <- paste(" ID=" , df $ group_name , " ;" , sep = " " )
319+ df $ GENEID <- paste(" Parent=" , df $ GENEID , " ;" , sep = " " )
320+
321+ # Create GFF3 formatted dataframes for exons and transcripts separately
322+
323+ # Exons: no need gene IDs but set Parent = transcript ID
324+ dfExon <- mutate(df , source = " Bambu" , feature = " exon" , score = " ." ,
325+ frame = " ." , attributes = paste0(group_name , exon_rank , NDR , txScore , txScore.noFit , novelGene , novelTranscript , txClassDescription )) %> %
326+ select(seqnames , source , feature , start , end , score ,
327+ strand , frame , attributes , group_name ) %> %
328+ mutate(attributes = sub(" ID=" , " Parent=" , attributes ))
329+
330+ # Dataframe for transcripts
331+ dfTx <- as_tibble(as.data.frame(range(ranges(annotation ))))
332+ dfTx <- left_join(dfTx , geneIDs , by = c(" group_name" = " TXNAME" ))
333+ dfTx $ group_name <- paste(" ID=" , dfTx $ group_name , " ;" , sep = " " )
334+ dfTx $ GENEID <- paste(" Parent=" , dfTx $ GENEID , " ;" , sep = " " )
335+
336+
337+ if (! is.null(mcols(annotation )$ NDR )) {
338+ dfTx $ NDR <- paste(" NDR=" , as.character(mcols(annotation )$ NDR ), " ;" , sep = " " )
339+ dfTx $ txScore <- paste(" txScore=" , as.character(mcols(annotation )$ txScore ), " ;" , sep = " " )
340+ dfTx $ txScore.noFit <- paste(" txScore.noFit=" , as.character(mcols(annotation )$ txScore.noFit ), " ;" , sep = " " )
341+ dfTx $ novelGene <- paste(" novelGene=" , as.character(mcols(annotation )$ novelGene ), " ;" , sep = " " )
342+ dfTx $ novelTranscript <- paste(" novelTranscript=" , as.character(mcols(annotation )$ novelTranscript ), " ;" , sep = " " )
343+ dfTx $ txClassDescription <- paste(" txClassDescription=" , as.character(mcols(annotation )$ txClassDescription ), " ;" , sep = " " )
344+ }
345+
346+ # Parse to gff3 format
347+ dfTx <- mutate(dfTx , source = " Bambu" , feature = " mRNA" , score = " ." ,
348+ frame = " ." , attributes = paste0(GENEID , group_name , NDR , txScore , txScore.noFit , novelGene , novelTranscript , txClassDescription )) %> %
349+ select(seqnames , source , feature , start , end , score ,
350+ strand , frame , attributes , group_name )
351+
352+ # Concat tx and exon, then drop the group_name
353+ gff3 <- rbind(dfTx , dfExon ) %> % group_by(group_name ) %> %
354+ arrange(as.character(seqnames ), start ) %> % ungroup() %> %
355+ select(seqnames , source , feature , start , end , score ,
356+ strand , frame , attributes )
357+
358+ # replace * with . and remove trailing ;
359+ gff3 <- mutate(gff3 , strand = recode_factor(strand , `*` = " ." ),
360+ attributes = sub(" ;$" , " " , attributes ))
361+
362+ # Export
363+ writeLines(" ##gff-version 3" , file )
364+ utils :: write.table(gff3 , file = file , append = TRUE , quote = FALSE , row.names = FALSE ,
365+ col.names = FALSE , sep = " \t " )
366+ }
367+
368+
369+ writeAnnotationsToGFF3 <- function (annotation , file , geneIDs = NULL , outputExtendedAnno = TRUE ,
370+ outputAll = TRUE , outputBambuModels = TRUE , outputNovelOnly = TRUE ){
371+ if (outputExtendedAnno ){
372+ writeToGFF3(annotation , paste0(file , " extendedAnnotations.gff3" ), geneIDs )
373+ }
374+ if (outputAll ){
375+ annotationAll = setNDR(annotation , 1 )
376+ if (length(annotationAll ) == length(annotation ))
377+ message(" The current NDR threshold already outputs all transcript models. This may result in reduced precision for th extendedAnnotations and supportedTranscriptModels gtfs" )
378+ writeToGFF3(annotationAll , paste0(file , " allTranscriptModels.gff3" ), geneIDs )
379+ }
380+
381+ # todo - have this write bambu start and ends for annotated transcripts
382+ if (outputBambuModels ){
383+ annotationBambu = annotation [! is.na(mcols(annotation )$ readCount )]
384+ writeToGFF3(annotationBambu , paste0(file , " supportedTranscriptModels.gff3" ), geneIDs )
385+ }
386+
387+ if (outputNovelOnly ){
388+ annotationNovel = annotation [mcols(annotation )$ novelTranscript ]
389+ writeToGFF3(annotationNovel , paste0(file , " novelTranscripts.gff3" ), geneIDs )
390+ }
391+ }
392+
393+
250394
251395# ' Outputs GRangesList object from reading a GTF file
252396# ' @title convert a GTF file into a GRangesList
0 commit comments