Skip to content

Commit 67f184d

Browse files
committed
implement gff3 functionality
1 parent fdf8db5 commit 67f184d

2 files changed

Lines changed: 167 additions & 23 deletions

File tree

R/readWrite.R

Lines changed: 165 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
#' @param path the destination of the output files
55
#' (gtf, transcript counts, and gene counts)
66
#' @param prefix the prefix of the output files
7-
#' @details The function will write the output from Bambu to files. The
8-
#' annotations will be written to a .gtf file, transcript counts (total counts,
7+
#' @param outputAnnFormat the file format in which to output annotations, must
8+
#' be one of \code{"gtf"} or \code{"gff3"}. \code{"gtf"} is specified by default.
9+
#' @details The function will write the output from Bambu to files. The
10+
#' annotations will be written to a .gtf or .gff3 file, transcript counts (total counts,
911
#' CPM, full-length counts, partial-length counts, and unique counts) and gene counts
10-
#' will be written to .txt files.
12+
#' will be written to .txt files.
1113
#' @export
1214
#' @examples
1315
#' se <- readRDS(system.file("extdata",
@@ -17,23 +19,36 @@
1719
#' path <- tempdir()
1820
#' writeBambuOutput(se, path)
1921
writeBambuOutput <- function(se, path, prefix = "", outputExtendedAnno = TRUE,
20-
outputAll = TRUE, outputBambuModels = TRUE, outputNovelOnly = TRUE, seperateSamples = FALSE) {
21-
if (missing(se) | missing(path)) {
22-
stop("Both summarizedExperiment object from bambu and
22+
outputAll = TRUE, outputBambuModels = TRUE, outputNovelOnly = TRUE, seperateSamples = FALSE,
23+
outputAnnFormat = "gtf") {
24+
25+
if (missing(se) | missing(path)) {
26+
stop("Both summarizedExperiment object from bambu and
2327
the path for the output files are required.")
24-
} else {
25-
outdir <- paste0(path, "/")
26-
if (!dir.exists(outdir))
27-
dir.create(outdir, recursive = TRUE)
28-
29-
transcript_grList <- rowRanges(se)
30-
prefix <- ifelse(prefix != "", paste0(prefix, "_"), "")
31-
transcript_gtffn <- paste(outdir, prefix, sep = "")
32-
gtf <- writeAnnotationsToGTF(annotation = transcript_grList,
33-
file = transcript_gtffn, outputExtendedAnno = outputExtendedAnno,
34-
outputAll = outputAll, outputBambuModels = outputBambuModels, outputNovelOnly = outputNovelOnly)
28+
} else if (!outputAnnFormat %in% c("gtf", "gff3")) {
29+
stop("Please specify a valid annotation outputAnnFormat: 'gtf' or 'gff3', or leave blank to output to GTF by default.")
30+
} else {
31+
outdir <- paste0(path, "/")
32+
if (!dir.exists(outdir))
33+
dir.create(outdir, recursive = TRUE)
34+
35+
transcript_grList <- rowRanges(se)
36+
prefix <- ifelse(prefix != "", paste0(prefix, "_"), "")
37+
transcript_annfn <- paste(outdir, prefix, sep = "")
38+
39+
if (outputAnnFormat == "gtf") {
40+
gtf <- writeAnnotationsToGTF(annotation = transcript_grList,
41+
file = transcript_annfn, outputExtendedAnno = outputExtendedAnno,
42+
outputAll = outputAll, outputBambuModels = outputBambuModels, outputNovelOnly = outputNovelOnly)
43+
}
44+
45+
if (outputAnnFormat == "gff3") {
46+
gff3 <- writeAnnotationsToGFF3(annotation = transcript_grList,
47+
file = transcript_annfn, outputExtendedAnno = outputExtendedAnno,
48+
outputAll = outputAll, outputBambuModels = outputBambuModels, outputNovelOnly = outputNovelOnly)
49+
}
3550

36-
utils::write.table(colData(se), file = paste0(transcript_gtffn, "sampleData.tsv"),
51+
utils::write.table(colData(se), file = paste0(transcript_annfn, "sampleData.tsv"),
3752
sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
3853
for(d in names(assays(se))){
3954
writeCountsOutput(se, varname=d,
@@ -43,17 +58,17 @@ writeBambuOutput <- function(se, path, prefix = "", outputExtendedAnno = TRUE,
4358
#write incompatible counts
4459
if(!is.null(metadata(se)$incompatibleCounts)){
4560
estimates <- metadata(se)$incompatibleCounts
46-
estimatesfn <- paste(transcript_gtffn, "incompatibleCounts.mtx", sep = "")
61+
estimatesfn <- paste(transcript_annfn, "incompatibleCounts.mtx", sep = "")
4762
Matrix::writeMM(estimates, estimatesfn)
4863
}
4964
seGene <- transcriptToGeneExpression(se)
5065
writeCountsOutput(seGene, varname='counts', feature='gene',outdir, prefix)
5166
#utils::write.table(paste0(colnames(se), "-1"), file = paste0(outdir, "barcodes.tsv"), quote = FALSE, row.names = FALSE, col.names = FALSE)
5267
#R.utils::gzip(paste0(outdir, "barcodes.tsv"))
5368
txANDGenes <- data.table(as.data.frame(rowData(se))[,c("TXNAME","GENEID")])
54-
utils::write.table(txANDGenes, file = paste0(transcript_gtffn, "txANDgenes.tsv"),
69+
utils::write.table(txANDGenes, file = paste0(transcript_annfn, "txANDgenes.tsv"),
5570
sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
56-
utils::write.table(names(seGene), file = paste0(transcript_gtffn, "genes.tsv"),
71+
utils::write.table(names(seGene), file = paste0(transcript_annfn, "genes.tsv"),
5772
sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
5873

5974
#R.utils::gzip(paste0(outdir, "txANDgenes.tsv"))
@@ -247,6 +262,135 @@ writeAnnotationsToGTF <- function(annotation, file, geneIDs = NULL, outputExtend
247262
}
248263
}
249264

265+
# Write to GFF3 file
266+
writeToGFF3 <- function(annotation, file, geneIDs = NULL) {
267+
268+
if (missing(annotation) | missing(file)) {
269+
stop("Both GRangesList and the name of the output file are required.")
270+
} else if (!is(annotation, "CompressedGRangesList")) { # Check filename
271+
stop("The inputted GRangesList is of the wrong class.")
272+
}
273+
274+
NDR = NULL
275+
txScore = NULL
276+
txScore.noFit = NULL
277+
novelGene = NULL
278+
novelTranscript = NULL
279+
txClassDescription = NULL
280+
281+
df <- as_tibble(annotation)
282+
283+
# Build exon number in GTF format
284+
df$exon_rank <- paste("rank=", df$exon_rank, ";", sep = "")
285+
286+
# if there is an NDR column
287+
if(!is.null(mcols(annotation)$NDR)){
288+
289+
NDR = rep(mcols(annotation)$NDR, unname(elementNROWS(annotation)))
290+
txScore = rep(mcols(annotation)$maxTxScore, unname(elementNROWS(annotation)))
291+
txScore.noFit = rep(mcols(annotation)$maxTxScore.noFit, unname(elementNROWS(annotation)))
292+
novelGene = rep(mcols(annotation)$novelGene, unname(elementNROWS(annotation)))
293+
novelTranscript = rep(mcols(annotation)$novelTranscript, unname(elementNROWS(annotation)))
294+
txClassDescription = rep(mcols(annotation)$txClassDescription, unname(elementNROWS(annotation)))
295+
296+
df$NDR <- paste("NDR=", as.character(NDR), ";", sep = "")
297+
df$txScore <- paste("maxTxScore=", as.character(txScore), ";", sep = "")
298+
df$txScore.noFit <- paste("maxTxScore.noFit=", as.character(txScore.noFit), ";", sep = "")
299+
df$novelGene <- paste("novelGene=", as.character(novelGene), ";", sep = "")
300+
df$novelTranscript <- paste("novelTranscript=", as.character(novelTranscript), ";", sep = "")
301+
df$txClassDescription <- paste("txClassDescription=", as.character(txClassDescription), ";", sep = "")
302+
}
303+
304+
# Join gene IDs
305+
if (is.null(geneIDs)) {
306+
if (!is.null(mcols(annotation, use.names = FALSE)$GENEID)) {
307+
geneIDs <- as_tibble(mcols(annotation, use.names = FALSE)[,
308+
c("TXNAME", "GENEID")])
309+
geneIDs$seqnames <- unlist(unique(seqnames(annotation)))
310+
geneIDs$strand <- unlist(unique(strand(annotation)))
311+
}
312+
}
313+
314+
df <- left_join(df, geneIDs[, c("TXNAME", "GENEID")],
315+
by = c("group_name" = "TXNAME"))
316+
317+
# Convert group_name and gene ID into GFF3 format
318+
df$group_name <- paste("ID=", df$group_name, ";", sep = "")
319+
df$GENEID <- paste("Parent=", df$GENEID, ";", sep = "")
320+
321+
# Create GFF3 formatted dataframes for exons and transcripts separately
322+
323+
# Exons: no need gene IDs but set Parent = transcript ID
324+
dfExon <- mutate(df, source = "Bambu", feature = "exon", score = ".",
325+
frame = ".", attributes = paste0(group_name, exon_rank, NDR, txScore, txScore.noFit, novelGene, novelTranscript, txClassDescription )) %>%
326+
select(seqnames, source, feature, start, end, score,
327+
strand, frame, attributes, group_name) %>%
328+
mutate(attributes = sub("ID=", "Parent=", attributes))
329+
330+
# Dataframe for transcripts
331+
dfTx <- as_tibble(as.data.frame(range(ranges(annotation))))
332+
dfTx <- left_join(dfTx, geneIDs, by = c("group_name" = "TXNAME"))
333+
dfTx$group_name <- paste("ID=", dfTx$group_name, ";", sep = "")
334+
dfTx$GENEID <- paste("Parent=", dfTx$GENEID, ";", sep = "")
335+
336+
337+
if(!is.null(mcols(annotation)$NDR)) {
338+
dfTx$NDR <- paste("NDR=", as.character(mcols(annotation)$NDR), ";", sep = "")
339+
dfTx$txScore <- paste("txScore=", as.character(mcols(annotation)$txScore), ";", sep = "")
340+
dfTx$txScore.noFit <- paste("txScore.noFit=", as.character(mcols(annotation)$txScore.noFit), ";", sep = "")
341+
dfTx$novelGene <- paste("novelGene=", as.character(mcols(annotation)$novelGene), ";", sep = "")
342+
dfTx$novelTranscript <- paste("novelTranscript=", as.character(mcols(annotation)$novelTranscript), ";", sep = "")
343+
dfTx$txClassDescription <- paste("txClassDescription=", as.character(mcols(annotation)$txClassDescription), ";", sep = "")
344+
}
345+
346+
# Parse to gff3 format
347+
dfTx <- mutate(dfTx, source = "Bambu", feature = "mRNA", score = ".",
348+
frame = ".", attributes = paste0(GENEID, group_name, NDR, txScore, txScore.noFit, novelGene, novelTranscript, txClassDescription )) %>%
349+
select(seqnames, source, feature, start, end, score,
350+
strand, frame, attributes, group_name)
351+
352+
# Concat tx and exon, then drop the group_name
353+
gff3 <- rbind(dfTx, dfExon) %>% group_by(group_name) %>%
354+
arrange(as.character(seqnames), start) %>% ungroup() %>%
355+
select(seqnames, source, feature, start, end, score,
356+
strand, frame, attributes)
357+
358+
# replace * with . and remove trailing ;
359+
gff3 <- mutate(gff3, strand = recode_factor(strand, `*` = "."),
360+
attributes = sub(";$", "", attributes))
361+
362+
# Export
363+
writeLines("##gff-version 3", file)
364+
utils::write.table(gff3, file = file, append = TRUE, quote = FALSE, row.names = FALSE,
365+
col.names = FALSE, sep = "\t")
366+
}
367+
368+
369+
writeAnnotationsToGFF3 <- function(annotation, file, geneIDs = NULL, outputExtendedAnno = TRUE,
370+
outputAll = TRUE, outputBambuModels = TRUE, outputNovelOnly = TRUE){
371+
if(outputExtendedAnno){
372+
writeToGFF3(annotation, paste0(file, "extendedAnnotations.gff3"), geneIDs)
373+
}
374+
if(outputAll){
375+
annotationAll = setNDR(annotation, 1)
376+
if(length(annotationAll) == length(annotation))
377+
message("The current NDR threshold already outputs all transcript models. This may result in reduced precision for th extendedAnnotations and supportedTranscriptModels gtfs")
378+
writeToGFF3(annotationAll, paste0(file, "allTranscriptModels.gff3"), geneIDs)
379+
}
380+
381+
#todo - have this write bambu start and ends for annotated transcripts
382+
if(outputBambuModels){
383+
annotationBambu = annotation[!is.na(mcols(annotation)$readCount)]
384+
writeToGFF3(annotationBambu, paste0(file, "supportedTranscriptModels.gff3"), geneIDs)
385+
}
386+
387+
if(outputNovelOnly){
388+
annotationNovel = annotation[mcols(annotation)$novelTranscript]
389+
writeToGFF3(annotationNovel, paste0(file, "novelTranscripts.gff3"), geneIDs)
390+
}
391+
}
392+
393+
250394

251395
#' Outputs GRangesList object from reading a GTF file
252396
#' @title convert a GTF file into a GRangesList

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,9 +205,9 @@ Access transcript expression estimates by extracting a variable (such as counts
205205

206206
For a full description of the other outputs see [Output Description](#Output-Description)
207207

208-
The full output can be written to a file using writeBambuOutput(). Using this function will generate six files, including 4four .gtf files(detailed below), and two .txt files for the expression counts at transcript and gene levels.
208+
The full output can be written to a file using writeBambuOutput(). Using this function will generate six files, including four .gtf/.gff3 files(detailed below), and two .txt files for the expression counts at transcript and gene levels.
209209

210-
By default bambu will write four .gtf files
210+
By default bambu will write four .gtf files. Functionality to write .gff3 files can be specified with the parameter `outputAnnFormat = "gff3"`
211211
- **extendedAnnotations.gtf** - Contains all transcript models from the reference annotations and any novel high confidence transcript models (below NDR threshold) from Bambu
212212
- **allTranscriptModels** - Contains all transcript models from the reference annotations and all novel transcript models, irrespective of their NDR score. This is useful for reloading into Bambu with prepareAnnotations() to redo the analysis or reoutput the annotations at different NDR thresholds.
213213
- **supportedTranscriptModels** - Contains only transcript models that are fully supported by at least one read across the samples provided. Note that if multiple reference annotations share the same intron junctions, an abitrary one will selected to be be included in this output.

0 commit comments

Comments
 (0)