diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index 03d64e75..7b5cb122 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -10,6 +10,13 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, rcAssignmen readClassList <- splitReadClassFiles(readClassList) readClassDt <- genEquiRCs(metadata(readClassList)$readClassDist, annotations, verbose) readClassDt$eqClass.match = match(readClassDt$eqClassById,metadata(readClassList)$eqClassById) + + # Add columnIds and columnCounts columns + readClassDt[, `:=`(columnIds = vector("list", .N), columnCounts = vector("list", .N))] + observedIdx <- which(!is.na(readClassDt$eqClass.match)) + readClassDt$columnIds[observedIdx] <- metadata(readClassList)$columnIds[readClassDt$eqClass.match[observedIdx]] + readClassDt$columnCounts[observedIdx] <- metadata(readClassList)$columnCounts[readClassDt$eqClass.match[observedIdx]] + readClassDt <- simplifyNames(readClassDt) readClassDt <- readClassDt %>% group_by(eqClassId, gene_sid) %>% mutate(multi_align = length(unique(txid))>1) %>% @@ -18,55 +25,33 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, rcAssignmen data.table() #return non-em counts ColData <- generateColData(readClassList, sampleMetadata, extractBarcodeUMI) - quantData <- SummarizedExperiment(assays = SimpleList( - counts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)), - rowRanges = annotations, - colData = ColData) - colnames(quantData) <- ColData$id - if(sum(metadata(readClassList)$incompatibleCountMatrix)==0){ - metadata(quantData)$incompatibleCounts <- NULL - }else{ - metadata(quantData)$incompatibleCounts <- generateIncompatibleCounts(metadata(readClassList)$incompatibleCountMatrix, annotations) - } - metadata(quantData)$nonuniqueCounts <- generateNonUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations) - metadata(quantData)$readClassDt <- readClassDt - metadata(quantData)$countMatrix <- metadata(readClassList)$countMatrix - metadata(quantData)$incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix - metadata(quantData)$sampleName <- metadata(readClassList)$sampleData$sampleName - if(returnDistTable) - metadata(quantData)$distTable <- metadata(metadata(readClassList)$readClassDist)$distTableOld - - if(trackReads) - metadata(quantData)$readToTranscriptMap <- - generateReadToTranscriptMap(readClassList, - metadata(readClassList)$readClassDist, - annotations) + + incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix + incompatibleCounts <- generateIncompatibleCounts(incompatibleCountMatrix, annotations) - return(quantData) + distTable <- if(returnDistTable) { + metadata(metadata(readClassList)$readClassDist)$distTableOld + } else{ + NULL + } -} + readToTranscriptMap <- if(trackReads) { + generateReadToTranscriptMap(readClassList, metadata(readClassList)$readClassDist,annotations) + } else{ + NULL + } -#' Generate unique counts -#' @noRd -generateUniqueCounts <- function(readClassDt, countMatrix, annotations){ - x <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) - uniqueCounts <- countMatrix[x$eqClass.match,] - uniqueCounts.tx <- sparse.model.matrix(~ factor(x$txid) - 1) - uniqueCounts <- t(uniqueCounts.tx) %*% uniqueCounts - rownames(uniqueCounts) <- names(annotations)[match(as.numeric(levels(factor(x$txid))),mcols(annotations)$txid)] - counts <- sparseMatrix(length(annotations), ncol(uniqueCounts), x = 0) - rownames(counts) <- names(annotations) - counts[rownames(uniqueCounts),] <- uniqueCounts - return(counts) - - # these three lines appear after return, so it's not used, is this used for debug only? - # counts.total = colSums(countMatrix) + colSums(incompatibleCountMatrix) - # counts.total[counts.total==0] = 1 - # counts.CPM = counts/counts.total * 10^6 + quantData <- constructQuantData( + sampleData = data.frame(ColData), + readClassDt = readClassDt, + incompatibleCounts = incompatibleCounts, + distTable = distTable, + readToTranscriptMap = readToTranscriptMap + ) + return(quantData) } - #' Generate incompatible counts #' @noRd generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ @@ -74,37 +59,7 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ rownames(incompatibleCountMatrix) <- genes[as.numeric(rownames(incompatibleCountMatrix))] geneMat <- sparseMatrix(length(genes), ncol(incompatibleCountMatrix), x = 0) rownames(geneMat) <- genes - geneMat[rownames(incompatibleCountMatrix),] <- incompatibleCountMatrix - return(geneMat) -} - - -#' Generate non-unique counts -#' @noRd -generateNonUniqueCounts <- function(readClassDt, countMatrix, annotations){ - #fuse multi align RCs by gene - x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) - x <- x %>% distinct(eqClassId, .keep_all = TRUE) - nonuniqueCounts <- countMatrix[x$eqClass.match,, drop = FALSE] - if(nrow(x)>1 & length(unique(x$gene_sid))>1){ - nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } else{ - warning("The factor variable 'gene_sid' has only one level. Adjusting output.") - nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } - #covert ids into gene ids - geneids <- as.numeric(levels(factor(x$gene_sid))) - geneids <- x$txid[match(geneids, x$gene_sid)] - geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] - rownames(nonuniqueCounts) <- geneids - #create matrix for all annotated genes - genes <- levels(factor(unique(mcols(annotations)$GENEID))) - geneMat <- sparseMatrix(length(genes), ncol(nonuniqueCounts), x = 0) - rownames(geneMat) <- genes - if(!is.null(rownames(nonuniqueCounts))){ - geneMat[rownames(nonuniqueCounts),] <- nonuniqueCounts - } - return(geneMat) + colnames(geneMat) <- colnames(incompatibleCountMatrix) + geneMat[match(rownames(incompatibleCountMatrix), rownames(geneMat)), ] <- incompatibleCountMatrix + return(geneMat[unique(mcols(annotations)$GENEID), , drop = FALSE]) } diff --git a/R/bambu-classes.R b/R/bambu-classes.R new file mode 100644 index 00000000..e9ccaf43 --- /dev/null +++ b/R/bambu-classes.R @@ -0,0 +1,78 @@ +#' @import methods +#' @importFrom Matrix Matrix +#' @importFrom data.table data.table +#' @importClassesFrom Matrix Matrix +#' @importClassesFrom data.table data.table + +# Valid values for metadata(se)$seType, which identifies the SE variant: +# EMCounts — transcript-level SE with EM-estimated counts (main bambu output) +# geneCounts — gene-level SE derived by collapsing transcript counts +# uniqueCounts — transcript-level SE with uniquely-assigned counts only, no EM +SE_TYPES <- c( + EMCounts = "EMCounts", + geneCounts = "geneCounts", + uniqueCounts = "uniqueCounts" +) + +# quantData holds per-sample intermediate results from the assignDist step. +# distTable and readToTranscriptMap are optional: populated only when +# returnDistTable=TRUE or trackReads=TRUE respectively, and lifted into +# metadata(countsSe) at the end of bambu(). +setClass("quantData", + slots = c( + sampleData = "data.frame", # per-sample metadata (id, sampleName) + readClassDt = "data.table", # read-class-level count and assignment data + incompatibleCounts = "sparseMatrix", # counts of reads incompatible with any annotation + distTable = "ANY", # read-class-to-transcript compatibility table (DataFrame or NULL) + readToTranscriptMap = "ANY" # per-read assignment to transcripts (tibble or NULL) + ), + prototype = list( + sampleData = data.frame(id = integer(), sampleName = character()), + readClassDt = data.table::data.table() + ), + validity = function(object) { + errs <- character() + if (!all(c("id", "sampleName") %in% names(object@sampleData))) + errs <- c(errs, "sampleData must have columns 'id' and 'sampleName'") + if (!is.null(object@distTable) && !is(object@distTable, "DataFrame")) + errs <- c(errs, "distTable must be a DataFrame or NULL") + if (!is.null(object@readToTranscriptMap) && !is(object@readToTranscriptMap, "tbl_df")) + errs <- c(errs, "readToTranscriptMap must be a tibble or NULL") + if (length(errs)) errs else TRUE + }) + +#' Construct a quantData object +#' @noRd +constructQuantData <- function(sampleData, readClassDt, + incompatibleCounts, + distTable = NULL, + readToTranscriptMap = NULL) { + new("quantData", + sampleData = sampleData, + readClassDt = readClassDt, + incompatibleCounts = incompatibleCounts, + distTable = distTable, + readToTranscriptMap = readToTranscriptMap) +} + +#' @noRd +setGeneric("getSampleData", function(x) standardGeneric("getSampleData")) +#' @noRd +setGeneric("getReadClassDt", function(x) standardGeneric("getReadClassDt")) +#' @noRd +setGeneric("getIncompatibleCounts", function(x) standardGeneric("getIncompatibleCounts")) +#' @noRd +setGeneric("getDistTable", function(x) standardGeneric("getDistTable")) +#' @noRd +setGeneric("getReadToTranscriptMap", function(x) standardGeneric("getReadToTranscriptMap")) + +#' @noRd +setMethod("getSampleData", "quantData", function(x) x@sampleData) +#' @noRd +setMethod("getReadClassDt", "quantData", function(x) x@readClassDt) +#' @noRd +setMethod("getIncompatibleCounts", "quantData", function(x) x@incompatibleCounts) +#' @noRd +setMethod("getDistTable", "quantData", function(x) x@distTable) +#' @noRd +setMethod("getReadToTranscriptMap", "quantData", function(x) x@readToTranscriptMap) diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R index e3629caf..477db52e 100644 --- a/R/bambu-processReads.R +++ b/R/bambu-processReads.R @@ -56,6 +56,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence, processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode, extractBarcodeUMI = extractBarcodeUMI, dedupUMI = dedupUMI, index = 1)}, BPPARAM = bpParameters) + names(readClassList) <- names(reads) } else { readGrgList <- bplapply(seq_along(reads), function(i) { bambu.readsByFile(bam.file = reads[i], @@ -162,22 +163,23 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, mcols(readGrgList)$id <- seq_along(readGrgList) - if(extractBarcodeUMI){ - mcols(readGrgList)$sampleID <- as.numeric(mcols(readGrgList)$CB) + if(extractBarcodeUMI){ + mcols(readGrgList)$columnID <- as.numeric(mcols(readGrgList)$CB) } else { - mcols(readGrgList)$sampleID <- index + mcols(readGrgList)$columnID <- index } + runName <- names(bam.file)[1] # construct read classes for each chromosome seperately if(processByChromosome){ se <- lowMemoryConstructReadClasses(readGrgList, genomeSequence, - annotations, stranded, verbose,bam.file) + annotations, stranded, verbose, bam.file) } else{ unlisted_junctions <- unlistIntrons(readGrgList, use.ids = TRUE) uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions, annotations,genomeSequence, stranded = stranded, verbose = verbose) se <- isore.constructReadClasses(readGrgList, - unlisted_junctions, uniqueJunctions, runName = "TODO", + unlisted_junctions, uniqueJunctions, runName = runName, annotations, stranded, verbose) } @@ -297,18 +299,18 @@ bambu.readsByFile <- function(bam.file, genomeSequence, annotations, constructReadClasses <- function(readGrgList, genomeSequence, annotations, stranded = FALSE, min.readCount = 2, fitReadClassModel = TRUE, min.exonOverlap = 10, defaultModels = NULL, returnModel = FALSE, - verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE){ + verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE, runName = "sample"){ if(processByChromosome){ # construct read classes for each chromosome seperately se <- lowMemoryConstructReadClasses(readGrgList, genomeSequence, - annotations, stranded, verbose,"TODO", fusionMode) + annotations, stranded, verbose, runName, fusionMode) } else{ unlisted_junctions <- unlistIntrons(readGrgList, use.ids = TRUE) uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions, annotations,genomeSequence, stranded = stranded, verbose = verbose) se <- isore.constructReadClasses(readGrgList, - unlisted_junctions, uniqueJunctions, runName = "TODO", + unlisted_junctions, uniqueJunctions, runName = runName, annotations, stranded, verbose) } @@ -336,13 +338,14 @@ constructReadClasses <- function(readGrgList, genomeSequence, annotations, #' Low memory mode for construct read classes (processByChromosome) #' @noRd lowMemoryConstructReadClasses <- function(readGrgList, genomeSequence, - annotations, stranded, verbose,bam.file, fusionMode = FALSE){ + annotations, stranded, verbose, bam.file, fusionMode = FALSE){ if(fusionMode){ readGrgList <- list(readGrgList) names(readGrgList) <- c("fusion") } else{ readGrgList <- split(readGrgList, getChrFromGrList(readGrgList)) } + runName <- names(bam.file)[1] se <- lapply(names(readGrgList),FUN = function(i){ if(length(readGrgList[[i]]) == 0) return(NULL) # create error and strand corrected junction tables @@ -350,7 +353,7 @@ lowMemoryConstructReadClasses <- function(readGrgList, genomeSequence, uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions, annotations,genomeSequence, stranded = stranded, verbose = verbose) se.temp <- isore.constructReadClasses(readGrgList[[i]], - unlisted_junctions, uniqueJunctions, runName = "TODO", + unlisted_junctions, uniqueJunctions, runName = runName, annotations, stranded, verbose) return(se.temp) }) @@ -385,10 +388,12 @@ splitReadClassFiles = function(readClassFile){ distTable <- metadata(metadata(readClassFile)$readClassDist)$distTable eqClasses <- distTable %>% group_by(eqClassById) %>% distinct(eqClassById, readCount,GENEID, totalWidth, firstExonWidth, .keep_all = TRUE) - eqClasses$sampleIDs <- rowData(readClassFile)$sampleIDs[match(eqClasses$readClassId, rownames(readClassFile))] + eqClasses$columnIds <- rowData(readClassFile)$columnIds[match(eqClasses$readClassId, rownames(readClassFile))] eqClasses <- eqClasses %>% summarise(nobs = sum(readCount), - sampleIDs = list(unlist(sampleIDs))) - counts.table <- tableFunction(eqClasses$sampleIDs) + columnIds = list(unlist(columnIds))) + counts.table <- tableFunction(eqClasses$columnIds) + metadata(readClassFile)$columnIds <- lapply(counts.table, function(x) as.numeric(names(x))) + metadata(readClassFile)$columnCounts <- lapply(counts.table, function(x) as.numeric(x)) counts <- sparseMatrix( i = rep(seq_along(counts.table), lengths(counts.table)), j = as.numeric(names(unlist(counts.table))), @@ -397,14 +402,14 @@ splitReadClassFiles = function(readClassFile){ #incompatible counts distTable <- metadata(metadata(readClassFile)$readClassDist)$distTable.incompatible if(nrow(distTable)==0) { - counts.incompatible <- sparseMatrix(i= 1, j = 1, x = 0, - dims = c(1, length(metadata(readClassFile)$sampleData$id))) - rownames(counts.incompatible) <- "TODO" + counts.incompatible <- sparseMatrix(i= integer(0), j = integer(0), x = numeric(0), + dims = c(0, length(metadata(readClassFile)$sampleData$id))) + rownames(counts.incompatible) <- character(0) } else{ - distTable$sampleIDs <- rowData(readClassFile)$sampleIDs[match(distTable$readClassId, rownames(readClassFile))] + distTable$columnIds <- rowData(readClassFile)$columnIds[match(distTable$readClassId, rownames(readClassFile))] distTable <- distTable %>% group_by(GENEID.i) %>% summarise(counts = sum(readCount), - sampleIDs = list(unlist(sampleIDs))) - counts.table <- lapply(distTable$sampleIDs, FUN = function(x){table(x)}) + columnIds = list(unlist(columnIds))) + counts.table <- lapply(distTable$columnIds, FUN = function(x){table(x)}) counts.incompatible <- sparseMatrix( i = rep(seq_along(counts.table), lengths(counts.table)), j = as.numeric(names(unlist(counts.table))), @@ -416,7 +421,6 @@ splitReadClassFiles = function(readClassFile){ colnames(counts) <- metadata(readClassFile)$sampleData$id metadata(readClassFile)$eqClassById <- eqClasses$eqClassById #rownames(counts) = eqClasses$eqClassById - metadata(readClassFile)$countMatrix <- counts metadata(readClassFile)$incompatibleCountMatrix <- counts.incompatible return(readClassFile) } @@ -426,7 +430,7 @@ splitReadClassFiles = function(readClassFile){ #' @importFrom Matrix #' @noRd splitReadClassFilesByRC <- function(readClassFile){ - counts.table <- tableFunction(rowData(readClassFile)$sampleIDs) + counts.table <- tableFunction(rowData(readClassFile)$columnIds) counts <- sparseMatrix( i = rep(seq_along(counts.table), lengths(counts.table)), j = as.numeric(names(unlist(counts.table))), diff --git a/R/bambu-processReads_utilityConstructReadClasses.R b/R/bambu-processReads_utilityConstructReadClasses.R index a9610045..6b05202d 100644 --- a/R/bambu-processReads_utilityConstructReadClasses.R +++ b/R/bambu-processReads_utilityConstructReadClasses.R @@ -9,15 +9,15 @@ #' @inheritParams bambu #' @noRd isore.constructReadClasses <- function(readGrgList, unlisted_junctions, - uniqueJunctions, runName = "sample1", + uniqueJunctions, runName = "sample", annotations, stranded = FALSE, verbose = FALSE) { #split reads into single exon and multi exon reads reads.singleExon <- unlist(readGrgList[elementNROWS(readGrgList) == 1], use.names = FALSE) mcols(reads.singleExon)$id <- mcols(readGrgList[ elementNROWS(readGrgList) == 1])$id - mcols(reads.singleExon)$sampleID <- mcols(readGrgList[ - elementNROWS(readGrgList) == 1])$sampleID + mcols(reads.singleExon)$columnID <- mcols(readGrgList[ + elementNROWS(readGrgList) == 1])$columnID #only keep multi exons reads in readGrgList readGrgList <- readGrgList[elementNROWS(readGrgList) > 1] if (!identical(mcols(readGrgList)$id,unique(mcols(unlisted_junctions)$id))) @@ -45,7 +45,7 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions, annotations, exonsByRC.spliced, stranded, verbose) } exonsByRC <- c(exonsByRC.spliced, exonsByRC.unspliced) - colDataDf <- DataFrame(name = runName, row.names = runName) + colDataDf <- DataFrame(sampleName = runName, row.names = runName) counts <- matrix(mcols(exonsByRC)$readCount, dimnames = list(names(exonsByRC), runName)) @@ -100,7 +100,7 @@ constructSplicedReadClasses <- function(uniqueJunctions, unlisted_junctions, readTable <- readTable %>% dplyr::select(chr.rc = chr, strand.rc = strand, startSD = startSD, endSD = endSD, readCount.posStrand = readCount.posStrand, intronStarts, intronEnds, - confidenceType, readCount, readIds, sampleIDs) + confidenceType, readCount, readIds, columnIds) mcols(exonsByReadClass) <- readTable options(scipen = 0) return(exonsByReadClass) @@ -183,7 +183,7 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, strand = readStrand, confidenceType = readConfidence, alignmentStrand = as.character(getStrandFromGrList(readGrgList))=='+', readId = mcols(readGrgList)$id, - sampleID = mcols(readGrgList)$sampleID) + columnID = mcols(readGrgList)$columnID) rm(readRanges, readStrand, unlisted_junctions_start, unlisted_junctions_end, unlisted_junctions_id, readConfidence, intronStartCoordinatesInt, intronEndCoordinatesInt) @@ -194,7 +194,7 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, start = nth(x = start, n = ceiling(readCount / 5), order_by = start), end = nth(x = end, n = ceiling(readCount / 1.25), order_by = end), readCount.posStrand = sum(alignmentStrand, na.rm = TRUE), - readIds = list(readId), sampleIDs = list(sampleID), + readIds = list(readId), columnIds = list(columnID), .groups = 'drop') %>% arrange(chr, start, end) %>% mutate(readClassId = paste("rc", row_number(), sep = ".")) @@ -252,11 +252,11 @@ constructUnsplicedReadClasses <- function(reads.singleExon, annotations, # by their minimum read class coordinates #remove duplicate ranges counts = as.data.frame(reads.singleExon) %>% - mutate(id = mcols(reads.singleExon)$id, - sampleID = mcols(reads.singleExon)$sampleID) %>% + mutate(id = mcols(reads.singleExon)$id, + columnID = mcols(reads.singleExon)$columnID) %>% group_by(seqnames,start,end,strand) %>% - mutate(counts=n(), id = list(id), sampleID = list(sampleID)) %>% - ungroup() %>% + mutate(counts=n(), id = list(id), columnID = list(columnID)) %>% + ungroup() %>% as.data.frame() reads.singleExon = GRanges(counts) reads.singleExon = unique(reads.singleExon) @@ -274,7 +274,7 @@ constructUnsplicedReadClasses <- function(reads.singleExon, annotations, exonsByReadClass <- rcUnsplicedAnnotation }else{ referenceExons <- reduce(reads.singleExon, ignore.strand = !stranded) - #(2) reads do not fall within a annotated exon/high confidence read class + #(2) reads do not fall within a annotated exon/high confidence read class # exon are summarised based on the union of overlapping unspliced reads rcUnsplicedReduced <- getUnsplicedReadClassByReference( granges = reads.singleExon, grangesReference = referenceExons, @@ -286,17 +286,17 @@ constructUnsplicedReadClasses <- function(reads.singleExon, annotations, if (verbose) message("Finished create single exon transcript models ", "(read classes) in ", round((end.ptm - start.ptm)[3] / 60, 1), " mins.") return(exonsByReadClass) -} + } -#' reconstruct read classes using unspliced reads that fall -#' within exons from annotations -#' @importFrom GenomicRanges GRanges relist -#' @importFrom dplyr %>% select group_by summarise .groups mutate cut_group_id -#' ungroup distinct -#' @noRd -getUnsplicedReadClassByReference <- function(granges, grangesReference, + #' reconstruct read classes using unspliced reads that fall + #' within exons from annotations + #' @importFrom GenomicRanges GRanges relist + #' @importFrom dplyr %>% select group_by summarise .groups mutate cut_group_id + #' ungroup distinct + #' @noRd + getUnsplicedReadClassByReference <- function(granges, grangesReference, confidenceType = "unspliced", stranded = TRUE) { if (is.null(mcols(granges)$id)) stop("ID column is missing from mcols(granges)") @@ -319,17 +319,17 @@ getUnsplicedReadClassByReference <- function(granges, grangesReference, readEnd = end(granges)[queryHits], counts = mcols(granges)$counts[queryHits], readId = mcols(granges[queryHits])$id, - sampleID = mcols(granges[queryHits])$sampleID) + columnID = mcols(granges[queryHits])$columnID) hitsDF <- hitsDF %>% dplyr::select(chr, start, end, readStart, readEnd, strand, readClassId, alignmentStrand, - counts, readId, sampleID) %>% + counts, readId, columnID) %>% group_by(readClassId) %>% summarise(start = start[1], end = end[1], strand = strand[1], chr = chr[1], readCount = sum(counts), startSD = sd(rep(readStart,counts)), endSD = sd(rep(readEnd,counts)), readCount.posStrand = sum(rep(alignmentStrand,counts)), - readIds = list(unlist(readId)), sampleIDs = list(unlist(sampleID))) %>% + readIds = list(unlist(readId)), columnIds = list(unlist(columnID))) %>% mutate(confidenceType = confidenceType, intronStarts = NA, intronEnds = NA) if(nrow(hitsDF)==0){ @@ -348,11 +348,10 @@ getUnsplicedReadClassByReference <- function(granges, grangesReference, hitsDF <- dplyr::select(hitsDF, chr.rc = chr, strand.rc = strand, intronStarts, intronEnds, confidenceType, readCount, startSD, endSD, - readCount.posStrand, readIds, sampleIDs) + readCount.posStrand, readIds, columnIds) mcols(exByReadClassUnspliced) <- hitsDF return(exByReadClassUnspliced) -} - + } #' initiate the hits dataframe #' @param hitsWithin hitsWithin #' @param grangesReference grangesReference diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index 9dff19a2..99f9eaaa 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -2,14 +2,29 @@ #' @inheritParams bambu #' @import data.table #' @noRd -bambu.quantify <- function(readClassDt, countMatrix, incompatibleCountMatrix, txid.index, GENEIDs, emParameters, +bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, txid.index, GENEIDs, emParameters, trackReads = FALSE, returnDistTable = FALSE, verbose = FALSE) { start.ptm <- proc.time() - readClassDt$nobs = countMatrix[readClassDt$eqClass.match] - readClassDt$nobs[is.na(readClassDt$nobs)] = 0 + + # Calculate nobs for sample(s) columnIdx + # Use data.table syntax for in-place creation of nobs column for the scope of this function + readClassDt[, nobs := { + ids <- columnIds[[1]] + if (is.null(ids) || length(ids) == 0 || is.na(ids[1])) { + 0L + } else if (length(columnIdx) == 1) { + # single-cell: look up the single barcode's read count directly + match_idx <- match(columnIdx, ids) + if (is.na(match_idx)) 0L else columnCounts[[1]][match_idx] + } else { + # cluster: sum read counts across all barcodes in the cluster + sum(columnCounts[[1]][ids %in% columnIdx]) + } + }, by = eqClassId] + compatibleCounts <- bambu.quantDT(readClassDt, emParameters = emParameters,verbose = verbose) - incompatibleCounts <- incompatibleCountMatrix[data.table(GENEID.i = GENEIDs), on = "GENEID.i"] + incompatibleCounts <- incompatibleCounts[data.table(GENEID.i = GENEIDs), on = "GENEID.i"] incompatibleCounts[is.na(counts), counts := 0] compatibleCounts <- calculateCPM(compatibleCounts, incompatibleCounts) counts <- compatibleCounts[match(txid.index, txid)] diff --git a/R/bambu-quantify_utilityFunctions.R b/R/bambu-quantify_utilityFunctions.R index 936763f1..5122577d 100644 --- a/R/bambu-quantify_utilityFunctions.R +++ b/R/bambu-quantify_utilityFunctions.R @@ -106,10 +106,9 @@ getUniCountPerEquiRC <- function(distTable){ mutate(anyEqual = any(equal)) %>% select(eqClassById, firstExonWidth,totalWidth, readCount,GENEID,anyEqual) %>% #eqClassByIdTemp, distinct() %>% - mutate(nobs = sum(readCount), - rcWidth = ifelse(anyEqual, max(totalWidth), - max(firstExonWidth))) %>% - select(eqClassById,GENEID,nobs,rcWidth) %>% #eqClassByIdTemp, + mutate(rcWidth = ifelse(anyEqual, max(totalWidth), + max(firstExonWidth))) %>% + select(eqClassById,GENEID,rcWidth) %>% #eqClassByIdTemp, ungroup() %>% distinct() return(eqClassCount) @@ -124,11 +123,10 @@ addEmptyRC <- function(eqClassCount, annotations){ eqClassCount <- createEqClassToTxMapping(eqClassCount) eqClassCountJoin <- full_join(eqClassCount, minEquiRC, by = c("eqClassById","GENEID","txid","equal")) eqClassCountJoin[is.na(eqClassCountJoin)] <- 0 - eqClassCount_final <- eqClassCountJoin %>% + eqClassCount_final <- eqClassCountJoin %>% group_by(eqClassById) %>% - mutate(nobs = max(nobs), - rcWidth = max(rcWidth), - minRC = max(minRC)) %>% + mutate(rcWidth = max(rcWidth, na.rm = TRUE), + minRC = max(minRC, na.rm = TRUE)) %>% ungroup() %>% distinct() return(eqClassCount_final) diff --git a/R/bambu.R b/R/bambu.R index 856f69bc..759707b6 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -275,6 +275,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, trackReads = trackReads ) }, BPPARAM = bpParameters) + names(quantData) <- names(readClassList) if (!quant) return(quantData) } } @@ -289,7 +290,6 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if(length(annotations)==0) stop("No valid annotations, if running de novo please try less stringent parameters") if(is.null(quantData)) stop("quantData must be provided or assignDist = TRUE") - GENEIDs.i <- as.numeric(factor(unique(mcols(annotations)$GENEID))) start.ptm <- proc.time() countsSeCompressed.all <- NULL ColNames <- c() @@ -297,17 +297,18 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, for(i in seq_along(quantData)){ quantData_i <- quantData[[i]] #load in the barcode clustering from file if provided - iter <- seq_len(ncol(metadata(quantData_i)$countMatrix)) # iter is integer + # single-cell mode: iter is an integer vector of column indices, one per barcode + iter <- seq_len(nrow(getSampleData(quantData_i))) if(!is.null(clusters)){ if(class(clusters[[i]])!="CompressedCharacterList"){ # !is.list(clusters) is FALSE for CompressedCharacterList clusterMaps <- NULL - for(j in seq_along(metadata(quantData_i)$sampleNames)){ #load in a file per sample name provided - clusterMap <- fread(clusters[[j]], header = FALSE, + for(j in seq_along(getSampleData(quantData_i)$sampleName)){ #load in a file per sample name provided + clusterMap <- fread(clusters[[j]], header = FALSE, data.table = FALSE) # read.table(clusters[[j]], # sep = ifelse(grepl(".tsv$",clusters[[j]]), "\t", ","), # header = FALSE) - clusterMap[,1] <- paste0(metadata(quantData_i)$sampleNames[j], + clusterMap[,1] <- paste0(getSampleData(quantData_i)$sampleName[j], "_",clusterMap[,1]) clusterMaps <- rbind(clusterMaps, clusterMap) } @@ -319,19 +320,18 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } else{ #if clusters is a list iter <- clusters[[i]] } + # cluster mode: convert barcode strings to integer column indices; + # iter becomes a named list of integer vectors, one per cluster + iter <- lapply(iter, match, getSampleData(quantData_i)$id) } - countsSeCompressed <- bplapply(iter, FUN = function(j){ # previous i changed to j to avoid duplicated assignment - #i = iter[i %in% colnames(metadata(quantData_i)$countMatrix)] #bug, after assignment, i become emptyprint(i) - countMatrix <- unname(metadata(quantData_i)$countMatrix[,j]) # same here - incompatibleCountMatrix <- unname(metadata(quantData_i)$incompatibleCountMatrix[,j]) # same here - if(!is.null(dim(countMatrix))){ - countMatrix <- rowSums(countMatrix) - incompatibleCountMatrix <- rowSums(metadata(quantData_i)$incompatibleCountMatrix[,j]) # same here - } - return(bambu.quantify(readClassDt = metadata(quantData_i)$readClassDt, countMatrix = countMatrix, - incompatibleCountMatrix = data.table(GENEID.i = as.numeric(rownames(metadata(quantData_i)$incompatibleCountMatrix)), counts = incompatibleCountMatrix), - txid.index = mcols(annotations)$txid, GENEIDs = GENEIDs.i, - emParameters = opt.em, trackReads = trackReads, + countsSeCompressed <- bplapply(iter, FUN = function(columnIdx){ # previous i changed to j to avoid duplicated assignment + + incompatibleCounts_i <- getIncompatibleCounts(quantData_i)[, columnIdx, drop = FALSE] + + return(bambu.quantify(readClassDt = getReadClassDt(quantData_i), columnIdx = columnIdx, + incompatibleCounts = data.table(GENEID.i = rownames(incompatibleCounts_i), counts = as.vector(incompatibleCounts_i)), + txid.index = mcols(annotations)$txid, GENEIDs = rownames(incompatibleCounts_i), + emParameters = opt.em, trackReads = trackReads, verbose = verbose))}, BPPARAM = bpParameters) end.ptm <- proc.time() @@ -344,19 +344,28 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, row.names = names(countsSeCompressed) ) } else{ - ColNames <- c(ColNames, colnames(quantData_i)) - colData.all[[i]] <- data.frame(colData(quantData_i)) + ColNames <- c(ColNames, rownames(getSampleData(quantData_i))) + colData.all[[i]] <- data.frame(getSampleData(quantData_i)) } countsSeCompressed.all <- c(countsSeCompressed.all, countsSeCompressed) } names(countsSeCompressed.all) <- ColNames countsSe <- combineCountSes(countsSeCompressed.all, colData.all, annotations) + if(trackReads){ + readToTranscriptMaps = list() + for(i in seq_along(quantData)){ + readToTranscriptMaps[[i]] <- getReadToTranscriptMap(quantData[[i]]) + } + names(readToTranscriptMaps) <- names(quantData) + metadata(countsSe)$readToTranscriptMaps <- readToTranscriptMaps + } if(returnDistTable){ distTables = list() for(i in seq_along(quantData)){ - distTables[[i]] <- metadata(quantData[[i]])$distTable + distTables[[i]] <- getDistTable(quantData[[i]]) } + names(distTables) <- names(quantData) metadata(countsSe)$distTables <- distTables } return(countsSe) diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index 6af1e86f..c5056d20 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -290,6 +290,7 @@ combineCountSes <- function(countsSe, colDataList, annotations){ fullLengthCounts = countsDataMat$fullLengthCounts, uniqueCounts = countsDataMat$uniqueCounts)) metadata(combinedCountsSe)$incompatibleCounts <- countsDataMat$incompatibleCounts + metadata(combinedCountsSe)$seType <- SE_TYPES[["EMCounts"]] rowRanges(combinedCountsSe) <- annotations colData(combinedCountsSe) <- DataFrame(bind_rows(colDataList)) @@ -333,7 +334,7 @@ generateColData <- function(readClassList, sampleMetadata, extractBarcodeUMI) { } # Quick wrapper function (https://stackoverflow.com/questions/13273833/merging-multiple-data-tables) -#' @noRd +#' @noRd merge_wrapper <- function(x,y){ merge.data.table(x,y,by = "GENEID",all=TRUE) } diff --git a/R/transcriptToGeneExpression.R b/R/transcriptToGeneExpression.R index 963a9b00..a76aa7d0 100644 --- a/R/transcriptToGeneExpression.R +++ b/R/transcriptToGeneExpression.R @@ -2,7 +2,7 @@ #' @title transcript to gene expression #' @param se a summarizedExperiment object from \code{\link{bambu}} #' @return A SummarizedExperiment object -#' @import data.table +#' @import data.table #' @export #' @examples #' se <- readRDS(system.file("extdata", @@ -12,17 +12,15 @@ #' transcriptToGeneExpression(se) transcriptToGeneExpression <- function(se) { counts <- assays(se)$counts - runnames <- colnames(counts)[-1] rowDataSe <- as.data.table(rowData(se)) - - counts = fac2sparse(factor(rowData(se)$GENEID, levels = unique(rowData(se)$GENEID))) %*% counts - if(!is.null(metadata(se)$incompatibleCounts)){ + + counts = fac2sparse(factor(rowData(se)$GENEID, levels = unique(rowData(se)$GENEID))) %*% counts + if (!is.null(metadata(se)$incompatibleCounts)) { incompatibleCounts <- metadata(se)$incompatibleCounts - if("nonuniqueCounts" %in% names(metadata(se))){ - incompatibleCounts = incompatibleCounts + metadata(se)$nonuniqueCounts - } - incompatibleCounts = Matrix(incompatibleCounts[match(rownames(counts), rownames(incompatibleCounts)),], sparse = TRUE) - counts = counts + incompatibleCounts + if ("nonuniqueCounts" %in% names(metadata(se))) + incompatibleCounts <- incompatibleCounts + metadata(se)$nonuniqueCounts + incompatibleCounts <- Matrix(incompatibleCounts[match(rownames(counts), rownames(incompatibleCounts)), ], sparse = TRUE) + counts <- counts + incompatibleCounts } counts.total = colSums(counts) counts.total[counts.total==0] = 1 @@ -48,6 +46,55 @@ transcriptToGeneExpression <- function(se) { CPM = counts.CPM), rowRanges = exByGene[RowNames], colData = ColData) - + metadata(seOutput)$seType <- SE_TYPES[["geneCounts"]] return(seOutput) -} \ No newline at end of file +} + +#' Generate a SummarizedExperiment of unique counts from quantData +#' @description This function is intended to be used after the transcript +#' discovery and \code{assignDist} steps in \code{\link{bambu}}. It builds a +#' transcript-level SummarizedExperiment containing raw unique counts (reads +#' uniquely assigned to a single transcript) without EM estimation, which can +#' be passed directly to \code{\link{transcriptToGeneExpression}} to obtain +#' gene-level counts as uniqueCounts + nonuniqueCounts + incompatibleCounts. +#' @param quantData a list of quantData objects produced by the assignDist step +#' @param annotations a GRangesList of transcript annotations +#' @return A SummarizedExperiment object with \code{assays$uniqueCounts}, +#' \code{metadata$incompatibleCounts}, and \code{metadata$nonuniqueCounts} +#' @import data.table +#' @noRd +generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { + uniqueCountsList <- lapply(quantData, function(x) { + readClassDt <- getReadClassDt(x) + x_filtered <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) + + uniqueCounts <- if (nrow(x_filtered) == 0) { + sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nrow(getSampleData(x)))) + } else { + txids <- mcols(annotations)$txid + i <- rep(match(x_filtered$txid, txids), lengths(x_filtered$columnIds)) + j <- unlist(x_filtered$columnIds) + x_vals <- unlist(x_filtered$columnCounts) + sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nrow(getSampleData(x)))) + } + rownames(uniqueCounts) <- names(annotations) + colnames(uniqueCounts) <- rownames(getSampleData(x)) + return(uniqueCounts) + }) + uniqueCounts <- do.call(cbind, unname(uniqueCountsList)) + + incompatibleCounts <- do.call(cbind, unname(lapply(quantData, getIncompatibleCounts))) + nonuniqueCounts <- do.call(cbind, unname(lapply(quantData, function(x) { + generateNonUniqueCountMatrix(getReadClassDt(x), annotations, getSampleData(x)$id) + }))) + + colData <- do.call(rbind, unname(lapply(quantData, getSampleData))) + + se <- SummarizedExperiment(assays = SimpleList(counts = uniqueCounts)) + rowRanges(se) <- annotations + colData(se) <- DataFrame(colData) + metadata(se)$incompatibleCounts <- incompatibleCounts + metadata(se)$nonuniqueCounts <- nonuniqueCounts + metadata(se)$seType <- SE_TYPES[["uniqueCounts"]] + return(se) +} diff --git a/R/transcriptToGeneExpression_utilityFunctions.R b/R/transcriptToGeneExpression_utilityFunctions.R index c0e61165..aff2eb50 100644 --- a/R/transcriptToGeneExpression_utilityFunctions.R +++ b/R/transcriptToGeneExpression_utilityFunctions.R @@ -1,4 +1,40 @@ +#' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt +#' @noRd +generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ + genes <- unique(mcols(annotations)$GENEID) + nSamples <- length(sampleIds) + geneMat <- sparseMatrix(length(genes), nSamples, x = 0) + rownames(geneMat) <- genes + colnames(geneMat) <- sampleIds + + x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) + x <- x %>% distinct(eqClassId, .keep_all = TRUE) + if (nrow(x) == 0) return(geneMat) + + i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) + j <- unlist(x$columnIds) + x_vals <- unlist(x$columnCounts) + nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) + + if (nrow(x) > 1 & length(unique(x$gene_sid)) > 1) { + nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } else { + nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } + + geneids <- as.numeric(levels(factor(x$gene_sid))) + geneids <- x$txid[match(geneids, x$gene_sid)] + geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] + rownames(nonuniqueCounts) <- geneids + colnames(nonuniqueCounts) <- sampleIds + + geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts + return(geneMat) +} + #' rename runnames when there are duplicated names #' @title rename_duplicatedNames #' @param runnames sample names