diff --git a/NAMESPACE b/NAMESPACE index 3adb4e37..df261f3a 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(bambu) +export(bambu.singlecell) export(plotBambu) export(prepareAnnotations) export(readFromGTF) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index 750c1e87..03d64e75 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -2,11 +2,11 @@ #' @inheritParams bambu #' @import data.table #' @noRd -assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParameters, - verbose, sampleMetadata, demultiplexed, +assignReadClasstoTranscripts <- function(readClassList, annotations, rcAssignmentParameters, + verbose, sampleMetadata, extractBarcodeUMI, returnDistTable = FALSE, trackReads = TRUE) { if (is.character(readClassList)) readClassList <- readRDS(file = readClassList) - metadata(readClassList)$readClassDist <- calculateDistTable(readClassList, annotations, isoreParameters, verbose, returnDistTable) + metadata(readClassList)$readClassDist <- calculateDistTable(readClassList, annotations, rcAssignmentParameters, verbose, returnDistTable) readClassList <- splitReadClassFiles(readClassList) readClassDt <- genEquiRCs(metadata(readClassList)$readClassDist, annotations, verbose) readClassDt$eqClass.match = match(readClassDt$eqClassById,metadata(readClassList)$eqClassById) @@ -17,7 +17,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame mutate(aval = 1) %>% data.table() #return non-em counts - ColData <- generateColData(readClassList, sampleMetadata, demultiplexed) + ColData <- generateColData(readClassList, sampleMetadata, extractBarcodeUMI) quantData <- SummarizedExperiment(assays = SimpleList( counts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)), rowRanges = annotations, diff --git a/R/bambu-extendAnnotations-utilityExtend.R b/R/bambu-extendAnnotations-utilityExtend.R index d65d70ea..9ef32445 100644 
--- a/R/bambu-extendAnnotations-utilityExtend.R +++ b/R/bambu-extendAnnotations-utilityExtend.R @@ -112,7 +112,7 @@ filterTranscriptsByAnnotation <- function(rowDataCombined, annotationGrangesList } else if(is.null(NDR)) { NDR <- 0.5 } - filterSet <- (rowDataCombined$NDR <= NDR | rowDataCombined$readClassType == "equal:compatible") + filterSet <- ((!is.na(rowDataCombined$NDR) & rowDataCombined$NDR <= NDR) | rowDataCombined$readClassType == "equal:compatible") lowConfidenceTranscripts <- combindRowDataWithRanges( rowDataCombined[!filterSet,], exonRangesCombined[!filterSet]) @@ -224,7 +224,7 @@ calculateNDROnTranscripts <- function(combinedTranscripts, useTxScore = FALSE){ } else { combinedTranscripts$NDR <- calculateNDR(combinedTranscripts$maxTxScore, equal) } - combinedTranscripts$NDR[combinedTranscripts$maxTxScore==-1] <- 1 + combinedTranscripts$NDR[combinedTranscripts$maxTxScore==-1] <- NA return(combinedTranscripts) } @@ -827,7 +827,6 @@ addGeneIdsToReadClassTable <- function(readClassTable, distTable, #' @description This function train a model for use on other data #' @param extendedAnnotations A GRangesList object produced from bambu(quant = FALSE) or rowRanges(se) #' @param NDR The maximum NDR for novel transcripts to be in extendedAnnotations (0-1). If not provided a recommended NDR is calculated. -#' @param includeRef A boolean which if TRUE will also filter out reference annotations based on their NDR #' @param prefix A string which determines which transcripts are considered novel by bambu and will be filtered (by default = 'Bambu') #' @param baselineFDR a value between 0-1. Bambu uses this FDR on the trained model to recommend an equivilent NDR threshold to be used for the sample. By default, a baseline FDR of 0.1 is used. This does not impact the analysis if an NDR is set. 
#' @param defaultModels a bambu trained model object that bambu will use when fitReadClassModel==FALSE or the data is not suitable for training, defaults to the pretrained model in the bambu package @@ -835,7 +834,7 @@ addGeneIdsToReadClassTable <- function(readClassTable, distTable, #' @details #' @return extendedAnnotations with a new NDR threshold #' @export -setNDR <- function(extendedAnnotations, NDR = NULL, includeRef = FALSE, prefix = 'Bambu', baselineFDR = 0.1, defaultModels2 = defaultModels){ +setNDR <- function(extendedAnnotations, NDR = NULL, prefix = 'Bambu', baselineFDR = 0.1, defaultModels2 = defaultModels){ #Check to see if the annotations/gtf are dervived from Bambu if(is.null(mcols(extendedAnnotations)$NDR)){ warning("Annotations were not extended by Bambu (or the wrong prefix was provided). NDR can not be set") @@ -852,17 +851,10 @@ setNDR <- function(extendedAnnotations, NDR = NULL, includeRef = FALSE, prefix = message("Recommending a novel discovery rate (NDR) of: ", NDR) } - #If reference annotations should be filtered too (note that reference annotations with no read support arn't filtered) - if(includeRef){ - toRemove <- (!is.na(mcols(extendedAnnotations)$NDR) & mcols(extendedAnnotations)$NDR > NDR) - toAdd <- !is.na(mcols(metadata(extendedAnnotations)$lowConfidenceTranscripts)$NDR) & - mcols(metadata(extendedAnnotations)$lowConfidenceTranscripts)$NDR <= NDR - } else { - toRemove <- (mcols(extendedAnnotations)$NDR > NDR & - grepl(prefix, mcols(extendedAnnotations)$TXNAME)) - toAdd <- (mcols(metadata(extendedAnnotations)$lowConfidenceTranscripts)$NDR <= NDR & - grepl(prefix, mcols(metadata(extendedAnnotations)$lowConfidenceTranscripts)$TXNAME)) - } + toRemove <- (mcols(extendedAnnotations)$NDR > NDR & + grepl(prefix, mcols(extendedAnnotations)$TXNAME)) + toAdd <- (mcols(metadata(extendedAnnotations)$lowConfidenceTranscripts)$NDR <= NDR & + grepl(prefix, mcols(metadata(extendedAnnotations)$lowConfidenceTranscripts)$TXNAME)) temp <- 
c(metadata(extendedAnnotations)$lowConfidenceTranscripts[!toAdd], extendedAnnotations[toRemove]) extendedAnnotations <- c(extendedAnnotations[!toRemove], metadata(extendedAnnotations)$lowConfidenceTranscripts[toAdd]) @@ -880,7 +872,7 @@ setNDR <- function(extendedAnnotations, NDR = NULL, includeRef = FALSE, prefix = #' Extend annotations by clusters #' @noRd -isore.extendAnnotations.clusters <- function(readClassList, annotations, clusters, NDR, isoreParameters, stranded, bpParameters, fusionMode, verbose = FALSE){ +isore.extendAnnotations.clusters <- function(readClassList, annotations, clusters, NDR, discoveryParameters, stranded, bpParameters, fusionMode, verbose = FALSE){ message("--- Start extending annotations for clusters ---") #if clustering is a csv, create a list with the barcodes for each cluster #csv must have two cols with heading barcode, cluster @@ -898,7 +890,7 @@ isore.extendAnnotations.clusters <- function(readClassList, annotations, cluster for(i in seq_along(clusters)){ print(names(clusters)[i]) ###TODO need to account for the sample name here which is added to the barcode - index <- match(clusters[[i]],gsub('demultiplexed','',metadata(readClassList[[1]])$samples)) + index <- match(clusters[[i]],gsub('demultiplexed','',metadata(readClassList[[1]])$samples)) index <- index[!is.na(index)] print(length(index)) if(length(index)<20) next @@ -912,8 +904,8 @@ isore.extendAnnotations.clusters <- function(readClassList, annotations, cluster rowData(rcf.filt)$startSD <- 0 rowData(rcf.filt)$endSD <- 0 rowData(rcf.filt)$readCount.posStrand <- 0 - thresholdIndex <- which(rowData(rcf.filt)$readCount>=isoreParameters$min.readCount) - model <- trainBambu(rcf.filt, verbose = verbose, min.readCount = isoreParameters$min.readCount) + thresholdIndex <- which(rowData(rcf.filt)$readCount>=discoveryParameters$min.readCount) + model <- trainBambu(rcf.filt, verbose = verbose, min.readCount = discoveryParameters$min.readCount) txScore <- 
getTranscriptScore(rowData(rcf.filt)[thresholdIndex,], model, defaultModels) rowData(rcf.filt)$txScore <- rep(NA,nrow(rcf.filt)) @@ -921,12 +913,12 @@ isore.extendAnnotations.clusters <- function(readClassList, annotations, cluster #txScores = cbind(txScores, rowData(rcf.filt)$txScore) rcfs.clusters[[names(clusters)[i]]] <- rcf.filt annotations.clusters[[names(clusters)[i]]] <- bambu.extendAnnotations(list(rcf.filt), annotations, NDR, - isoreParameters, stranded, bpParameters, fusionMode, verbose) + discoveryParameters, stranded, bpParameters, fusionMode, verbose) } if(length(rcfs.clusters)>0){ print("--- Merging all individual clusters ---") annotations.clusters[["merged"]] <- bambu.extendAnnotations(rcfs.clusters, annotations, NDR, - isoreParameters, stranded, bpParameters, fusionMode, verbose) + discoveryParameters, stranded, bpParameters, fusionMode, verbose) } return(annotations.clusters) diff --git a/R/bambu-extendAnnotations.R b/R/bambu-extendAnnotations.R index 7404fa61..ec9cb3e1 100644 --- a/R/bambu-extendAnnotations.R +++ b/R/bambu-extendAnnotations.R @@ -3,14 +3,14 @@ #' @inheritParams bambu #' @noRd bambu.extendAnnotations <- function(readClassList, annotations, NDR, - isoreParameters, stranded, bpParameters, fusionMode = FALSE, verbose = FALSE) { + discoveryParameters, stranded, bpParameters, fusionMode = FALSE, verbose = FALSE) { start.ptm_all <- proc.time() combinedTxCandidates <- isore.combineTranscriptCandidates(readClassList, stranded, ## stranded used for unspliced reduce - min.readCount = isoreParameters[["min.readCount"]], - min.readFractionByGene = isoreParameters[["min.readFractionByGene"]], - min.txScore.multiExon = isoreParameters[["min.txScore.multiExon"]], - min.txScore.singleExon = isoreParameters[["min.txScore.singleExon"]], + min.readCount = discoveryParameters[["min.readCount"]], + min.readFractionByGene = discoveryParameters[["min.readFractionByGene"]], + min.txScore.multiExon = discoveryParameters[["min.txScore.multiExon"]], + 
min.txScore.singleExon = discoveryParameters[["min.txScore.singleExon"]], bpParameters, verbose) end.ptm_all <- proc.time() @@ -20,21 +20,21 @@ bambu.extendAnnotations <- function(readClassList, annotations, NDR, annotations <- isore.extendAnnotations( combinedTranscripts = combinedTxCandidates, annotationGrangesList = annotations, - remove.subsetTx = isoreParameters[["remove.subsetTx"]], - min.sampleNumber = isoreParameters[["min.sampleNumber"]], + remove.subsetTx = discoveryParameters[["remove.subsetTx"]], + min.sampleNumber = discoveryParameters[["min.sampleNumber"]], NDR = NDR, - min.exonDistance = isoreParameters[["min.exonDistance"]], - min.exonOverlap = isoreParameters[["min.exonOverlap"]], + min.exonDistance = discoveryParameters[["min.exonDistance"]], + min.exonOverlap = discoveryParameters[["min.exonOverlap"]], min.primarySecondaryDist = - isoreParameters[['min.primarySecondaryDist']], + discoveryParameters[['min.primarySecondaryDist']], min.primarySecondaryDistStartEnd = - isoreParameters[['min.primarySecondaryDistStartEnd1']], + discoveryParameters[['min.primarySecondaryDistStartEnd1']], min.readFractionByEqClass = - isoreParameters[['min.readFractionByEqClass']], + discoveryParameters[['min.readFractionByEqClass']], fusionMode = fusionMode, - prefix = isoreParameters[["prefix"]], - baselineFDR = isoreParameters[["baselineFDR"]], - defaultModels = isoreParameters[["defaultModels"]], + prefix = discoveryParameters[["prefix"]], + baselineFDR = discoveryParameters[["baselineFDR"]], + defaultModels = discoveryParameters[["defaultModels"]], verbose = verbose) end.ptm_all <- proc.time() if (verbose) message("extend annotations in ", diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R index dd988584..e3629caf 100644 --- a/R/bambu-processReads.R +++ b/R/bambu-processReads.R @@ -14,9 +14,9 @@ #' @noRd bambu.processReads <- function(reads, annotations, genomeSequence, readClass.outputDir=NULL, yieldSize=1000000, bpParameters, - stranded=FALSE, 
verbose=FALSE, isoreParameters = setIsoreParameters(NULL), + stranded=FALSE, verbose=FALSE, discoveryParameters = setDiscoveryParameters(NULL), processByChromosome = FALSE, processByBam = TRUE, trackReads = trackReads, fusionMode = fusionMode, - demultiplexed = FALSE, cleanReads = FALSE, dedupUMI = FALSE, sampleNames = NULL, barcodesToFilter = NULL) { + extractBarcodeUMI = FALSE, dedupUMI = FALSE) { genomeSequence <- checkInputSequence(genomeSequence) # ===# create BamFileList object from character #===# if (is(reads, "BamFile")) { @@ -40,19 +40,11 @@ bambu.processReads <- function(reads, annotations, genomeSequence, reads <- BamFileList(reads, yieldSize = yieldSize) names(reads) <- tools::file_path_sans_ext(BiocGenerics::basename(reads)) } - if(!is.null(sampleNames)){ - if(length(sampleNames==length(reads))){ - names(reads) <- sampleNames - } else{ - message("Not enough provided sample names. Using them in order of inputted files and the remaining files will use the file names") - names(reads)[seq_along(sampleNames)] <- sampleNames - } - } - min.readCount <- isoreParameters[["min.readCount"]] - fitReadClassModel <- isoreParameters[["fitReadClassModel"]] - defaultModels <- isoreParameters[["defaultModels"]] - returnModel <- isoreParameters[["returnModel"]] - min.exonOverlap <- isoreParameters[["min.exonOverlap"]] + min.readCount <- discoveryParameters[["min.readCount"]] + fitReadClassModel <- discoveryParameters[["fitReadClassModel"]] + defaultModels <- discoveryParameters[["defaultModels"]] + returnModel <- discoveryParameters[["returnModel"]] + min.exonOverlap <- discoveryParameters[["min.exonOverlap"]] if(processByBam){ readClassList <- bplapply(seq_along(reads), function(i) { @@ -62,24 +54,23 @@ bambu.processReads <- function(reads, annotations, genomeSequence, fitReadClassModel = fitReadClassModel, min.exonOverlap = min.exonOverlap, defaultModels = defaultModels, returnModel = returnModel, verbose = verbose, processByChromosome = processByChromosome, 
trackReads = trackReads, fusionMode = fusionMode, - demultiplexed = demultiplexed, cleanReads = cleanReads, dedupUMI = dedupUMI, index = 1, barcodesToFilter = barcodesToFilter)}, + extractBarcodeUMI = extractBarcodeUMI, dedupUMI = dedupUMI, index = 1)}, BPPARAM = bpParameters) } else { readGrgList <- bplapply(seq_along(reads), function(i) { bambu.readsByFile(bam.file = reads[i], genomeSequence = genomeSequence,annotations = annotations, - stranded = stranded, min.readCount = min.readCount, - fitReadClassModel = fitReadClassModel, min.exonOverlap = min.exonOverlap, - defaultModels = defaultModels, returnModel = returnModel, verbose = verbose, - trackReads = trackReads, fusionMode = fusionMode, - demultiplexed = demultiplexed, cleanReads = cleanReads, dedupUMI = dedupUMI, index = i, barcodesToFilter = barcodesToFilter)}, + stranded = stranded, min.readCount = min.readCount, + fitReadClassModel = fitReadClassModel, min.exonOverlap = min.exonOverlap, + defaultModels = defaultModels, returnModel = returnModel, verbose = verbose, + trackReads = trackReads, fusionMode = fusionMode, + extractBarcodeUMI = extractBarcodeUMI, dedupUMI = dedupUMI, index = i)}, BPPARAM = bpParameters) - sampleNames <- as.numeric(as.factor(sampleNames)) for(i in seq_along(readGrgList)){ - if(!isFALSE(demultiplexed)){ + if(extractBarcodeUMI){ mcols(readGrgList[[i]])$CB <- paste0(names(reads)[i], '_', mcols(readGrgList[[i]])$CB) } else{ - mcols(readGrgList[[i]])$CB <- sampleNames[i] + mcols(readGrgList[[i]])$CB <- names(reads)[i] } mcols(readGrgList[[i]])$CB <- as.factor(mcols(readGrgList[[i]])$CB) @@ -87,19 +78,19 @@ bambu.processReads <- function(reads, annotations, genomeSequence, } readGrgList <- do.call(c, readGrgList) mcols(readGrgList)$id <- seq_along(readGrgList) - if(!isFALSE(demultiplexed)){ + if(extractBarcodeUMI){ mcols(readGrgList)$sampleID <- as.numeric(mcols(readGrgList)$CB) } else { mcols(readGrgList)$sampleID <- i } readClassList <- constructReadClasses(readGrgList, genomeSequence 
= genomeSequence,annotations = annotations, - stranded = stranded, min.readCount = min.readCount, - fitReadClassModel = fitReadClassModel, min.exonOverlap = min.exonOverlap, - defaultModels = defaultModels, returnModel = returnModel, verbose = verbose, + stranded = stranded, min.readCount = min.readCount, + fitReadClassModel = fitReadClassModel, min.exonOverlap = min.exonOverlap, + defaultModels = defaultModels, returnModel = returnModel, verbose = verbose, processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode) metadata(readClassList)$samples <- names(reads) metadata(readClassList)$sampleNames <- names(reads) - if(!isFALSE(demultiplexed)) metadata(readClassList)$samples <- levels(mcols(readGrgList)$CB) + if(extractBarcodeUMI) metadata(readClassList)$samples <- levels(mcols(readGrgList)$CB) readClassList <- list(readClassList) } @@ -124,14 +115,12 @@ bambu.processReads <- function(reads, annotations, genomeSequence, bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, yieldSize = NULL, stranded = FALSE, min.readCount = 2, fitReadClassModel = TRUE, min.exonOverlap = 10, defaultModels = NULL, returnModel = FALSE, - verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE, demultiplexed = FALSE, - cleanReads = FALSE, dedupUMI = FALSE, index = 0, barcodesToFilter = NULL) { + verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE, + extractBarcodeUMI = FALSE, dedupUMI = FALSE, index = 0) { if(verbose) message(names(bam.file)[1]) - readGrgList <- prepareDataFromBam(bam.file[[1]], verbose = verbose, yieldSize = yieldSize, use.names = trackReads, demultiplexed = demultiplexed, cleanReads = cleanReads, dedupUMI = dedupUMI) + readGrgList <- prepareDataFromBam(bam.file[[1]], verbose = verbose, yieldSize = yieldSize, use.names = trackReads, extractBarcodeUMI = extractBarcodeUMI, dedupUMI = dedupUMI) if(verbose) message(paste0("Number of alignments/reads: 
",length(readGrgList))) warnings <- c() - if(!is.null(barcodesToFilter) & !isFALSE(demultiplexed)) - readGrgList <- readGrgList[!(mcols(readGrgList)$CB %in% barcodesToFilter)] warnings <- seqlevelCheckReadsAnnotation(readGrgList, annotations) if(verbose & length(warnings) > 0) warning(paste(warnings,collapse = "\n")) #check seqlevels for consistency, drop ranges not present in genomeSequence @@ -173,7 +162,7 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, mcols(readGrgList)$id <- seq_along(readGrgList) - if(!isFALSE(demultiplexed)){ + if(extractBarcodeUMI){ mcols(readGrgList)$sampleID <- as.numeric(mcols(readGrgList)$CB) } else { mcols(readGrgList)$sampleID <- index @@ -210,7 +199,7 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, fusionMode = fusionMode, verbose = verbose) - if (demultiplexed) { + if (extractBarcodeUMI) { barcodes <- levels(mcols(readGrgList)$CB) metadata(se)$sampleData <- tibble( id = paste(names(bam.file)[1], barcodes, sep = '_'), @@ -234,12 +223,10 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, bambu.readsByFile <- function(bam.file, genomeSequence, annotations, yieldSize = NULL, stranded = FALSE, min.readCount = 2, fitReadClassModel = TRUE, min.exonOverlap = 10, defaultModels = NULL, returnModel = FALSE, - verbose = FALSE, trackReads = FALSE, fusionMode = FALSE, demultiplexed = FALSE, - cleanReads = TRUE, dedupUMI = FALSE, index = 0, barcodesToFilter = NULL) { - readGrgList <- prepareDataFromBam(bam.file[[1]], verbose = verbose, yieldSize = yieldSize, use.names = trackReads, demultiplexed = demultiplexed, cleanReads = cleanReads, dedupUMI = dedupUMI) - - if(!is.null(barcodesToFilter) & !isFALSE(demultiplexed)) readGrgList <- readGrgList[!mcols(readGrgList)$CB %in% barcodesToFilter] - + verbose = FALSE, trackReads = FALSE, fusionMode = FALSE, + extractBarcodeUMI = FALSE, dedupUMI = FALSE, index = 0) { + readGrgList <- prepareDataFromBam(bam.file[[1]], 
verbose = verbose, yieldSize = yieldSize, use.names = trackReads, extractBarcodeUMI = extractBarcodeUMI, dedupUMI = dedupUMI) + if(verbose) message("Number of alignments/reads: ",length(readGrgList)) warnings <- c() @@ -290,7 +277,7 @@ bambu.readsByFile <- function(bam.file, genomeSequence, annotations, stop("No reads left after filtering.") ## add ### - #if (isTRUE(demultiplexed)){ + #if (extractBarcodeUMI){ # cellBarcodeAssign <- tibble(index = mcols(readGrgList)$id, CB = mcols(readGrgList)$CB) %>% nest(.by = "CB") # if (!dir.exists("CB")){ diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index 68b41395..9dff19a2 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -2,9 +2,9 @@ #' @inheritParams bambu #' @import data.table #' @noRd -bambu.quantify <- function(readClassDt, countMatrix, incompatibleCountMatrix, txid.index, GENEIDs, emParameters, +bambu.quantify <- function(readClassDt, countMatrix, incompatibleCountMatrix, txid.index, GENEIDs, emParameters, trackReads = FALSE, returnDistTable = FALSE, - verbose = FALSE, isoreParameters = setIsoreParameters(NULL)) { + verbose = FALSE) { start.ptm <- proc.time() readClassDt$nobs = countMatrix[readClassDt$eqClass.match] readClassDt$nobs[is.na(readClassDt$nobs)] = 0 diff --git a/R/bambu.R b/R/bambu.R index cf61a555..856f69bc 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -34,11 +34,8 @@ #' annotation to be assigned to the same gene id, defaults to 10bp} #' \item{min.primarySecondaryDist}{specifying the minimum number of distance #' threshold, defaults to 5bp} -#' \item{min.primarySecondaryDistStartEnd1}{specifying the minimum number +#' \item{min.primarySecondaryDistStartEnd1}{specifying the minimum number #' of distance threshold, used for extending annotation, defaults to 5bp} -#' \item{min.primarySecondaryDistStartEnd2}{specifying the minimum number -#' of distance threshold, used for estimating distance to annotation, -#' defaults to 5bp} #' \item{min.txScore.multiExon}{specifying the minimum transcript level 
#' threshold for multi-exon transcripts during sample combining, #' defaults to 0} @@ -62,6 +59,17 @@ #' \item{prefix}{specifying prefix for new gene Ids (genePrefix.number), #' defaults to "Bambu"} #' } +#' @param opt.rcAssignment A list of controlling parameters for the read class +#' to transcript assignment process: +#' \describe{ +#' \item{min.exonDistance}{specifying minimum distance to known transcript +#' to be considered a valid match, defaults to 35bp} +#' \item{min.primarySecondaryDist}{specifying the minimum distance +#' threshold between primary and secondary assignments, defaults to 5bp} +#' \item{min.primarySecondaryDistStartEnd2}{specifying the minimum +#' distance threshold for start/end positions used for read assignment, +#' defaults to 5bp} +#' } #' @param opt.em A list of controlling parameters for quantification #' algorithm estimation process: #' \describe{ @@ -101,6 +109,15 @@ #' @param fusionMode A logical variable indicating whether run in fusion mode #' @param verbose A logical variable indicating whether processing messages will #' be printed. +#' @param opt.singlecell A list of single-cell specific parameters: +#' \describe{ +#' \item{extractBarcodeUMI}{Logical, whether to extract cell barcodes and +#' UMIs from BAM tags or read names. Defaults to FALSE} +#' \item{dedupUMI}{Logical, whether to perform UMI-based deduplication per +#' barcode. Defaults to FALSE} +#' \item{clusters}{A named list mapping cluster names to barcode vectors, +#' used for cluster-level transcript discovery. Defaults to NULL} +#' } #' @details #' @return \code{bambu} will output different results depending on whether #' \emph{quant} mode is on. 
By default, \emph{quant} is set to TRUE, so @@ -141,11 +158,10 @@ #' genome = fa.file, discovery = TRUE, quant = TRUE) #' @export bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, - mode = NULL, opt.discovery = NULL, opt.em = NULL, rcOutDir = NULL, discovery = TRUE, - assignDist = TRUE, quant = TRUE, stranded = FALSE, ncore = 1, yieldSize = NULL, - trackReads = FALSE, returnDistTable = FALSE, lowMemory = FALSE, sampleData = NULL, - fusionMode = FALSE, verbose = FALSE, demultiplexed = FALSE, quantData = NULL, - sampleNames = NULL, cleanReads = FALSE, dedupUMI = FALSE, barcodesToFilter = NULL, clusters = NULL, + mode = NULL, opt.discovery = NULL, opt.rcAssignment = NULL, opt.em = NULL, opt.singlecell = NULL, + rcOutDir = NULL, discovery = TRUE, assignDist = TRUE, quant = TRUE, stranded = FALSE, + ncore = 1, yieldSize = NULL, trackReads = FALSE, returnDistTable = FALSE, lowMemory = FALSE, + sampleData = NULL, fusionMode = FALSE, verbose = FALSE, quantData = NULL, processByChromosome = FALSE, processByBam = TRUE) { message(paste0("Running Bambu-v", "3.9.0")) if(!is.null(mode)){ @@ -154,8 +170,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, processByBam <- TRUE } if(mode == "multiplexed"){ - demultiplexed <- TRUE - cleanReads <- TRUE + opt.singlecell$extractBarcodeUMI <- TRUE opt.em <- list(degradationBias = FALSE) quant <- FALSE processByChromosome <- TRUE @@ -163,6 +178,10 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if(mode == "fusion"){ NDR <- 1 fusionMode <- TRUE + if(is.null(opt.discovery)) opt.discovery <- list() + opt.discovery$remove.subsetTx <- FALSE + opt.discovery$min.readCount <- 1 + opt.discovery$min.sampleNumber <- 0 } if(mode == "debug"){ verbose <- TRUE @@ -172,20 +191,25 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } if(lowMemory) message("lowMemory has been deprecated and split into processByChromosome and processByBam. 
Please see Documentation") + if("min.primarySecondaryDistStartEnd2" %in% names(opt.discovery)) + message("min.primarySecondaryDistStartEnd2 has been moved to opt.rcAssignment. Please pass this parameter via opt.rcAssignment instead.") if(is.null(annotations)){ annotations <- GRangesList() } else { annotations <- checkInputs(annotations, reads, - readClass.outputDir = rcOutDir, - genomeSequence = genome, discovery = discovery, - sampleNames = sampleNames, sampleData = sampleData, quantData = quantData) + readClass.outputDir = rcOutDir, + genomeSequence = genome, discovery = discovery, + sampleData = sampleData, quantData = quantData) } - isoreParameters <- setIsoreParameters(isoreParameters = opt.discovery) + opt.discovery <- setDiscoveryParameters(discoveryParameters = opt.discovery) #below line is to be compatible with earlier version of running bambu - if(!is.null(isoreParameters$max.txNDR)) NDR = isoreParameters$max.txNDR - - emParameters <- setEmParameters(emParameters = opt.em) - bpParameters <- setBiocParallelParameters(reads, ncore, verbose, demultiplexed) + if(!is.null(opt.discovery$max.txNDR)) NDR = opt.discovery$max.txNDR + opt.rcAssignment <- setRcAssignmentParameters(rcAssignmentParameters = opt.rcAssignment) + opt.em <- setEmParameters(emParameters = opt.em) + extractBarcodeUMI <- isTRUE(opt.singlecell$extractBarcodeUMI) + dedupUMI <- isTRUE(opt.singlecell$dedupUMI) + clusters <- opt.singlecell$clusters + bpParameters <- setBiocParallelParameters(reads, ncore, verbose, extractBarcodeUMI) xgb.set.config(nthread = 1) # only when reads is not NULL, this proceed, otherwise, it will jump to quant step if(!is.null(reads)){ @@ -210,12 +234,11 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, genomeSequence = genome, readClass.outputDir = rcOutDir, yieldSize = yieldSize, bpParameters = bpParameters, stranded = stranded, verbose = verbose, - isoreParameters = isoreParameters, trackReads = trackReads, + discoveryParameters = 
opt.discovery, trackReads = trackReads, fusionMode = fusionMode, processByChromosome = processByChromosome, processByBam = processByBam, - demultiplexed = demultiplexed, - sampleNames = sampleNames, cleanReads = cleanReads, - dedupUMI = dedupUMI,barcodesToFilter = barcodesToFilter) + extractBarcodeUMI = extractBarcodeUMI, + dedupUMI = dedupUMI) } #warnings = handleWarnings(readClassList, verbose) @@ -223,14 +246,14 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if (discovery) { message("--- Start extending annotations ---") extendedAnnotations <- bambu.extendAnnotations(readClassList, annotations, NDR, - isoreParameters, stranded, bpParameters, fusionMode, verbose) + opt.discovery, stranded, bpParameters, fusionMode, verbose) metadata(extendedAnnotations)$warnings = warnings #### cluster based transcript discovery if(!is.null(clusters)){ annotations.clusters <- isore.extendAnnotations.clusters(readClassList, - annotations, clusters, NDR, - isoreParameters, stranded, bpParameters, fusionMode, verbose = FALSE) + annotations, clusters, NDR, + opt.discovery, stranded, bpParameters, fusionMode, verbose = FALSE) metadata(extendedAnnotations)$clusters <- annotations.clusters } annotations <- extendedAnnotations @@ -242,12 +265,12 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, quantData <- bplapply(seq_along(readClassList), function(i){ assignReadClasstoTranscripts( readClassList = readClassList[[i]], - annotations = annotations, - isoreParameters = isoreParameters, - verbose = verbose, + annotations = annotations, + rcAssignmentParameters = opt.rcAssignment, + verbose = verbose, # for bulk data, there is one sampleData (keep sampleData[1]), for single-cell, there is one per sample sampleMetadata = if(length(sampleData) == 1) sampleData[1] else sampleData[i], - demultiplexed = demultiplexed, + extractBarcodeUMI = extractBarcodeUMI, returnDistTable = returnDistTable, trackReads = trackReads ) @@ -259,10 +282,10 @@ 
bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if (quant) { message("--- Start isoform EM quantification ---") if(!is.null(NDR) & !discovery)# this step is used when reset NDR is needed - annotations <- setNDR(annotations, NDR, - prefix = isoreParameters$prefix, - baselineFDR = isoreParameters[["baselineFDR"]], - defaultModels2 = isoreParameters[["defaultModels"]]) + annotations <- setNDR(annotations, NDR, + prefix = opt.discovery$prefix, + baselineFDR = opt.discovery[["baselineFDR"]], + defaultModels2 = opt.discovery[["defaultModels"]]) if(length(annotations)==0) stop("No valid annotations, if running de novo please try less stringent parameters") if(is.null(quantData)) stop("quantData must be provided or assignDist = TRUE") @@ -276,25 +299,25 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, #load in the barcode clustering from file if provided iter <- seq_len(ncol(metadata(quantData_i)$countMatrix)) # iter is integer if(!is.null(clusters)){ - if(class(clusters[[i]])!="CompressedCharacterList"){ # !is.list(clusters) is FALSE for CompressedCharacterList + if(class(clusters[[i]])!="CompressedCharacterList"){ # !is.list(clusters) is FALSE for CompressedCharacterList clusterMaps <- NULL for(j in seq_along(metadata(quantData_i)$sampleNames)){ #load in a file per sample name provided - clusterMap <- fread(clusters[[j]], header = FALSE, + clusterMap <- fread(clusters[[j]], header = FALSE, data.table = FALSE) - # read.table(clusters[[j]], - # sep = ifelse(grepl(".tsv$",clusters[[j]]), "\t", ","), + # read.table(clusters[[j]], + # sep = ifelse(grepl(".tsv$",clusters[[j]]), "\t", ","), # header = FALSE) clusterMap[,1] <- paste0(metadata(quantData_i)$sampleNames[j], "_",clusterMap[,1]) - clusterMaps <- rbind(clusterMaps, clusterMap) + clusterMaps <- rbind(clusterMaps, clusterMap) } - clustering <- splitAsList(clusterMaps[,1], clusterMaps[,2]) + clustering <- splitAsList(clusterMaps[,1], clusterMaps[,2]) rm(clusterMaps) 
rm(clusterMap) iter <- clustering - + } else{ #if clusters is a list - iter <- clusters[[i]] + iter <- clusters[[i]] } } countsSeCompressed <- bplapply(iter, FUN = function(j){ # previous i changed to j to avoid duplicated assignment @@ -307,8 +330,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } return(bambu.quantify(readClassDt = metadata(quantData_i)$readClassDt, countMatrix = countMatrix, incompatibleCountMatrix = data.table(GENEID.i = as.numeric(rownames(metadata(quantData_i)$incompatibleCountMatrix)), counts = incompatibleCountMatrix), - txid.index = mcols(annotations)$txid, GENEIDs = GENEIDs.i, isoreParameters = isoreParameters, - emParameters = emParameters, trackReads = trackReads, + txid.index = mcols(annotations)$txid, GENEIDs = GENEIDs.i, + emParameters = opt.em, trackReads = trackReads, verbose = verbose))}, BPPARAM = bpParameters) end.ptm <- proc.time() @@ -338,4 +361,58 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } return(countsSe) } - } \ No newline at end of file + } + +#' Single-cell isoform reconstruction and quantification +#' @title Single-cell isoform reconstruction and quantification with Bambu +#' @description Analyse single-cell long-read RNA-seq data with Bambu, +#' performing isoform discovery and quantification at single-cell resolution. +#' This function calls the main \code{\link{bambu}} function with cell +#' barcode/UMI extraction and UMI-based deduplication enabled by default, +#' and returns a \emph{SummarizedExperiment} object with per-cell transcript +#' expression estimates. +#' +#' We recommend processing single-cell data using the Bambu Nextflow pipeline, +#' which handles preprocessing, barcode demultiplexing, and alignment prior to +#' running this function. See \url{https://github.com/GoekeLab/bambu-singlecell-spatial}. +#' @param reads A string or vector of strings specifying paths to BAM files. 
+#' BAM files must contain cell barcode and UMI information, either as BAM tags +#' (\code{CB} for cell barcode, \code{UB} for UMI) or encoded in the read name +#' using the format \code{CB_UMI#READNAME} (note: \code{CB} and \code{UMI} +#' must not contain underscores). +#' @param annotations A path to a .gtf file or a \code{TxDb} object for +#' transcript annotations. Defaults to NULL. +#' @param genome A path to a fasta file or a \code{BSGenome} object. +#' Defaults to NULL. +#' @param NDR Numeric specifying the maximum NDR rate for novel transcript +#' discovery. Defaults to NULL. +#' @param discovery Logical, whether transcript discovery is performed. +#' Defaults to TRUE. +#' @param assignDist Logical, whether to assign reads to transcripts. +#' Defaults to TRUE. +#' @param quant Logical, whether quantification is performed. Defaults to TRUE. +#' @param clusters A named list mapping cluster names to barcode vectors, +#' used for cluster-level transcript discovery. Defaults to NULL. +#' @param stranded Logical, whether reads are stranded. Defaults to FALSE. +#' @param ncore Integer specifying the number of cores for parallel processing. +#' Defaults to 1. +#' @param ... Additional arguments passed to \code{\link{bambu}}, such as +#' \code{verbose}, \code{lowMemory}, \code{opt.discovery}, etc. +#' @examples +#' sc.bam <- system.file("extdata", "demultiplexed.bam", package = "bambu") +#' fa.file <- system.file("extdata", +#' "Homo_sapiens.GRCh38.dna_sm.primary_assembly_chr9_1_1000000.fa", +#' package = "bambu") +#' gr <- readRDS(system.file("extdata", +#' "annotationGranges_txdbGrch38_91_chr9_1_1000000.rds", +#' package = "bambu")) +#' se <- bambu.singlecell(reads = sc.bam, annotations = gr, genome = fa.file) +#' @export +bambu.singlecell <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, + clusters = NULL, discovery = TRUE, assignDist = TRUE, quant = TRUE, + stranded = FALSE, ncore = 1, ...) 
{ + bambu(reads = reads, annotations = annotations, genome = genome, NDR = NDR, + opt.singlecell = list(extractBarcodeUMI = TRUE, dedupUMI = TRUE, clusters = clusters), + discovery = discovery, assignDist = assignDist, quant = quant, + stranded = stranded, ncore = ncore, ...) +} diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index 44c2d747..6af1e86f 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -3,30 +3,28 @@ #' setBiocParallelParameters #' @importFrom BiocParallel bpparam #' @noRd -setBiocParallelParameters <- function(reads, ncore, verbose, demultiplexed){ +setBiocParallelParameters <- function(reads, ncore, verbose, extractBarcodeUMI){ bpParameters <- bpparam() #===# set parallel options: otherwise use parallel to distribute samples - # when demultiplexed is FALSE, isFALSE(demultiplexed) is TRUE - bpParameters$workers <- ifelse(length(reads) == 1 & isFALSE(demultiplexed), 1, ncore) + bpParameters$workers <- ifelse(length(reads) == 1 & !extractBarcodeUMI, 1, ncore) bpParameters$progressbar <- ifelse(length(reads) > 1 & !verbose, TRUE, FALSE) return(bpParameters) } -#' setIsoreparameters +#' setDiscoveryParameters #' @noRd -setIsoreParameters <- function(isoreParameters){ +setDiscoveryParameters <- function(discoveryParameters){ # ===# set default controlling parameters for isoform reconstruction #===# - isoreParameters.default <- list( - remove.subsetTx = TRUE, + discoveryParameters.default <- list( + remove.subsetTx = TRUE, min.readCount = 2, min.readFractionByGene = 0.05, min.sampleNumber = 1, min.exonDistance = 35, - min.exonOverlap = 10, + min.exonOverlap = 10, min.primarySecondaryDist = 5, min.primarySecondaryDistStartEnd1 = 5, # for creating new annotations - min.primarySecondaryDistStartEnd2 = 5, # for read assignment min.txScore.multiExon = 0, min.txScore.singleExon = 1, fitReadClassModel = TRUE, @@ -34,10 +32,23 @@ setIsoreParameters <- function(isoreParameters){ returnModel = FALSE, baselineFDR = 0.1, 
min.readFractionByEqClass = 0, - prefix = "Bambu") - isoreParameters <- - updateParameters(isoreParameters, isoreParameters.default) - return(isoreParameters) + prefix = "Bambu") + discoveryParameters <- + updateParameters(discoveryParameters, discoveryParameters.default) + return(discoveryParameters) +} + + +#' setRcAssignmentParameters +#' @noRd +setRcAssignmentParameters <- function(rcAssignmentParameters){ + rcAssignmentParameters.default <- list( + min.exonDistance = 35, + min.primarySecondaryDist = 5, + min.primarySecondaryDistStartEnd2 = 5) + rcAssignmentParameters <- + updateParameters(rcAssignmentParameters, rcAssignmentParameters.default) + return(rcAssignmentParameters) } @@ -72,8 +83,8 @@ updateParameters <- function(Parameters, Parameters.default) { #' @param readClass.outputDir path to readClass output directory #' @importFrom methods is #' @noRd -checkInputs <- function(annotations, reads, readClass.outputDir, genomeSequence, - discovery, sampleNames, sampleData, quantData){ +checkInputs <- function(annotations, reads, readClass.outputDir, genomeSequence, + discovery, sampleData, quantData){ # ===# Check annotation inputs #===# if (!is.null(annotations)) { if (is(annotations, "CompressedGRangesList")) { @@ -148,14 +159,6 @@ checkInputs <- function(annotations, reads, readClass.outputDir, genomeSequence, use of Rsamtools for opening.") } - #check single-cell and spatial inputs match - if(!is.null(sampleNames)){ - if(length(reads)!=length(sampleNames)){ - stop("There are not the same number of sampleNames as input files to reads. 
", - "Make sure these two arguments are vectors of the same length") - } - } - if(!is.null(sampleData)){ if (!all(grepl("\\.(csv|tsv|txt)$", na.omit(sampleData), ignore.case = TRUE))){ stop("Not all paths for sample metadata files are .csv/.tsv/.txt files") @@ -242,11 +245,11 @@ handleWarnings <- function(readClassList, verbose){ } #' Calculate the dist table used for Bambu Quantification -calculateDistTable <- function(readClassList, annotations, isoreParameters, verbose, returnDistTable){ +calculateDistTable <- function(readClassList, annotations, rcAssignmentParameters, verbose, returnDistTable){ readClassDist <- isore.estimateDistanceToAnnotations(readClassList, annotations, - min.exonDistance = isoreParameters[["min.exonDistance"]], - min.primarySecondaryDist = isoreParameters[['min.primarySecondaryDist']], - min.primarySecondaryDistStartEnd = isoreParameters[['min.primarySecondaryDistStartEnd2']], + min.exonDistance = rcAssignmentParameters[["min.exonDistance"]], + min.primarySecondaryDist = rcAssignmentParameters[['min.primarySecondaryDist']], + min.primarySecondaryDistStartEnd = rcAssignmentParameters[['min.primarySecondaryDistStartEnd2']], verbose = verbose) metadata(readClassDist)$distTable <- modifyIncompatibleAssignment(metadata(readClassDist)$distTable) if(returnDistTable) metadata(readClassDist)$distTableOld <- metadata(readClassDist)$distTable @@ -297,25 +300,25 @@ combineCountSes <- function(countsSe, colDataList, annotations){ #' Generate the colData using the external sampleMetadata.csv provided by the user in the sampleMetadata argument #' @param readClassList A list object containingmetadata about read classes. #' @param sampleMetadata A path to a CSV file or NULL/NA if there is no metadata for the sample. -#' @param demultiplexed Logical; indicates if data is demultiplexed. +#' @param extractBarcodeUMI Logical; indicates if barcodes and UMIs are extracted. #' #' @return A DataFrame containing colData for the sample. 
#' @export -generateColData <- function(readClassList, sampleMetadata, demultiplexed) { +generateColData <- function(readClassList, sampleMetadata, extractBarcodeUMI) { sampleMetadataDf <- if (is.null(sampleMetadata) || is.na(sampleMetadata)) { - if (demultiplexed) tibble(barcode = character()) else tibble(sampleName = character()) + if (extractBarcodeUMI) tibble(barcode = character()) else tibble(sampleName = character()) } else { fread(sampleMetadata) } - joinKey <- if (demultiplexed) "barcode" else "sampleName" + joinKey <- if (extractBarcodeUMI) "barcode" else "sampleName" colData <- tibble( - id = metadata(readClassList)$sampleData$id, + id = metadata(readClassList)$sampleData$id, sampleName = metadata(readClassList)$sampleData$sampleName - ) + ) - if (demultiplexed) { + if (extractBarcodeUMI) { colData <- colData %>% mutate(barcode = metadata(readClassList)$sampleData$barcode) } diff --git a/R/prepareDataFromBam.R b/R/prepareDataFromBam.R index 81beb4b2..3c86fdb8 100755 --- a/R/prepareDataFromBam.R +++ b/R/prepareDataFromBam.R @@ -7,8 +7,8 @@ #' @importFrom GenomicAlignments grglist readGAlignments #' @importFrom GenomicRanges width #' @noRd -prepareDataFromBam <- function(bamFile, yieldSize = NULL, verbose = FALSE, - use.names = FALSE, demultiplexed = FALSE, cleanReads = TRUE, dedupUMI = FALSE) { +prepareDataFromBam <- function(bamFile, yieldSize = NULL, verbose = FALSE, + use.names = FALSE, extractBarcodeUMI = FALSE, dedupUMI = FALSE) { if (is(bamFile, "BamFile")) { if (!is.null(yieldSize)) { yieldSize(bamFile) <- yieldSize @@ -26,79 +26,39 @@ prepareDataFromBam <- function(bamFile, yieldSize = NULL, verbose = FALSE, bf <- open(bamFile) readGrgList <- list() counter <- 1 - cells <- c() - umi <- c() use.names.OG <- use.names - if(!isFALSE(demultiplexed) | cleanReads) use.names <- TRUE - # grepl(".[ct]sv$",demultiplexed) - if(!is.logical(demultiplexed)){ # if demultiplexed argument is not logical value - if(file.exists(demultiplexed)){ # check if file path 
exists, it has to be a regular delimited file, can be compressed - readMap <- fread(demultiplexed,header = FALSE, data.table = FALSE) # changed function to more efficiently read in data and also correct error of "no lines available as input" for csv format input - }else{ - stop("Provided barcode to map file does not exists! Please provide the correct path to demultiplex argument!") - } - } + if(extractBarcodeUMI) use.names <- TRUE while (isIncomplete(bf)) { alignmentInfo <- readGAlignments(bf, param = ScanBamParam(tag = c("CB", "UB"), flag = scanBamFlag(isSecondaryAlignment = FALSE)), use.names = use.names) readGrgList[[counter]] <-grglist(alignmentInfo) - if (!isFALSE(demultiplexed)){ # if demultiplexed is TRUE or a string path - if(isTRUE(demultiplexed)){ # if demultiplexed is TRUE - - # a checkpoint to parse CB and UMI from the bam file, either from reads or CB/UMI tags. - # currently read name only accepts the format CB_UMI#READNAME (CB & UMI cannot have '_', otherwise parsing fails) - mcols(readGrgList[[counter]])$CB <- case_when( - !is.na(mcols(alignmentInfo)$CB) ~ mcols(alignmentInfo)$CB, - grepl("^[^_]+_[^#]+#", names(readGrgList[[counter]]), perl = TRUE) ~ sub("_.*", "", names(readGrgList[[counter]])), - TRUE ~ NA - ) + if(extractBarcodeUMI){ + # parse CB and UMI from the bam file, either from CB/UB tags or read names. 
+ # read name format: CB_UMI#READNAME (CB & UMI cannot have '_', otherwise parsing fails) + mcols(readGrgList[[counter]])$CB <- case_when( + !is.na(mcols(alignmentInfo)$CB) ~ mcols(alignmentInfo)$CB, + grepl("^[^_]+_[^#]+#", names(readGrgList[[counter]]), perl = TRUE) ~ sub("_.*", "", names(readGrgList[[counter]])), + TRUE ~ NA + ) - mcols(readGrgList[[counter]])$UMI <- case_when( - !is.na(mcols(alignmentInfo)$UB) ~ mcols(alignmentInfo)$UB, - grepl("^[^_]+_[^#]+#", names(readGrgList[[counter]]), perl = TRUE) ~ sub("^[^_]+_([^#]+)#.*$", "\\1", names(readGrgList[[counter]])), - TRUE ~ NA - ) - - } else{ # if demultiplexed is a string path - mcols(readGrgList[[counter]])$CB <- NA - mcols(readGrgList[[counter]])$UMI <- NA - mcols(readGrgList[[counter]])$CB <- readMap[,2][match(names(readGrgList[[counter]]),readMap[,1])] - if(ncol(readMap)>2){ - mcols(readGrgList[[counter]])$UMI <- readMap[,3][match(names(readGrgList[[counter]]),readMap[,1])] - } - } - } - if(cleanReads){ - softClip5Prime <- clipFunction(cigarData = GenomicAlignments::cigar(alignmentInfo), grep_pattern = '^(\\d*)[S].*', replace_pattern = '\\1') - softClip3Prime <- clipFunction(cigarData = GenomicAlignments::cigar(alignmentInfo), grep_pattern = '.*\\D(\\d*)[S]$', replace_pattern = '\\1') - hardClip5Prime <- clipFunction(cigarData = GenomicAlignments::cigar(alignmentInfo), grep_pattern = '^(\\d*)[H].*', replace_pattern = '\\1') - hardClip3Prime <- clipFunction(cigarData = GenomicAlignments::cigar(alignmentInfo), grep_pattern = '.*\\D(\\d*)[H]$', replace_pattern = '\\1') - # softClip5Prime <-suppressWarnings(pmax(0,as.numeric(gsub('^(\\d*)[S].*','\\1',GenomicAlignments::cigar(alignmentInfo))), na.rm=T)) - # softClip3Prime <-suppressWarnings(pmax(0,as.numeric(gsub('.*\\D(\\d*)[S]$','\\1',GenomicAlignments::cigar(alignmentInfo))), na.rm=T)) - # hardClip5Prime <-suppressWarnings(pmax(0,as.numeric(gsub('^(\\d*)[H].*','\\1',GenomicAlignments::cigar(alignmentInfo))), na.rm=T)) - # hardClip3Prime 
<-suppressWarnings(pmax(0,as.numeric(gsub('.*\\D(\\d*)[H]$','\\1',GenomicAlignments::cigar(alignmentInfo))), na.rm=T)) - mcols(readGrgList[[counter]])$clip5Prime <- pmax(softClip5Prime, hardClip5Prime) - mcols(readGrgList[[counter]])$clip3Prime <- pmax(softClip3Prime, hardClip3Prime) - rev <- as.vector(strand(alignmentInfo) == '-') - rev2 <- grepl("_-.+of", names(alignmentInfo)) - temp <- mcols(readGrgList[[counter]])$clip5Prime - mcols(readGrgList[[counter]])$clip5Prime[rev != rev2] <- mcols(readGrgList[[counter]])$clip3Prime[rev != rev2] - mcols(readGrgList[[counter]])$clip3Prime[rev != rev2] <- temp[rev != rev2] + mcols(readGrgList[[counter]])$UMI <- case_when( + !is.na(mcols(alignmentInfo)$UB) ~ mcols(alignmentInfo)$UB, + grepl("^[^_]+_[^#]+#", names(readGrgList[[counter]]), perl = TRUE) ~ sub("^[^_]+_([^#]+)#.*$", "\\1", names(readGrgList[[counter]])), + TRUE ~ NA + ) } counter <- counter + 1 } on.exit(close(bf)) - rm(cells) - rm(umi) if (length(readGrgList) > 1) { readGrgList <- do.call(c, readGrgList) } else { readGrgList <- readGrgList[[1]] } - if (demultiplexed){ + if (extractBarcodeUMI){ mcols(readGrgList)$CB <- factor(mcols(readGrgList)$CB, levels = sort(unique(mcols(readGrgList)$CB))) } @@ -106,30 +66,9 @@ prepareDataFromBam <- function(bamFile, yieldSize = NULL, verbose = FALSE, readGrgList <- readGrgList <- readGrgList[sum(width(readGrgList)) > 1] numNoCBs <- sum(is.na(mcols(readGrgList)$CB)) if(numNoCBs > 0){ - message("Removing ", numNoCBs, " reads that were not assigned barcodes. If this is unexpected check the barcode map input") + message("Removing ", numNoCBs, " reads with no extracted cell barcode. 
If this is unexpected, check that extractBarcodeUMI is TRUE and that the BAM contains CB/UB tags or read names follow the CB_UMI#READNAME format.") readGrgList <- readGrgList[!is.na(mcols(readGrgList)$CB)] } - if(cleanReads){ - #extract duplicated reads from flexiplex to clean - #leave other reads alone as supplimental alignments maybe fusion transcripts - #commented out because it takes awhile - # dt = data.table(name = substr(readNames2,0,nchar(readNames2[1])-2), - # strand = substr(readNames2,nchar(readNames2[1]),nchar(readNames2[1]))) - # dt[, id := .I] - # dt <- dt[, .(ids = list(id), toFilt = any(strand == "+" & strand == "-")), by = name] - # readGrgList.keep = readGrgList[c(dt$ids[!dt$toFilt])] - # readGrgList.filt = readGrgList[c(dt$ids[dt$toFilt])] - - #select alignments closest to barcode - start.ptm <- proc.time() - df <- data.frame(name = names(readGrgList), - clip5 = mcols(readGrgList)$clip5Prime) - df <- df %>% mutate(id = row_number()) %>% group_by(name) %>% summarise(primary.id = id[which.min(clip5)]) - readGrgList <- readGrgList[df$primary.id] - end.ptm <- proc.time() - message("Primary alignment selection time ", round((end.ptm - start.ptm)[3] / 60, 3), " mins.") - - } if(dedupUMI){ #UMI deduplication by barcode start.ptm <- proc.time() diff --git a/R/readWrite.R b/R/readWrite.R index ee9dce71..566b8b87 100644 --- a/R/readWrite.R +++ b/R/readWrite.R @@ -16,8 +16,8 @@ #' )) #' path <- tempdir() #' writeBambuOutput(se, path) -writeBambuOutput <- function(se, path, prefix = "", outputExtendedAnno = TRUE, - outputAll = TRUE, outputBambuModels = TRUE, outputNovelOnly = TRUE, seperateSamples = FALSE) { +writeBambuOutput <- function(se, path, prefix = "", outputExtendedAnno = TRUE, + outputAll = TRUE, outputBambuModels = TRUE, outputNovelOnly = TRUE) { if (missing(se) | missing(path)) { stop("Both summarizedExperiment object from bambu and the path for the output files are required.") @@ -59,29 +59,6 @@ writeBambuOutput <- function(se, path, prefix = 
"", outputExtendedAnno = TRUE, #R.utils::gzip(paste0(outdir, "txANDgenes.tsv")) #R.utils::gzip(paste0(outdir, "genes.tsv")) - #If there are multiple samples (when demultiplexed), seperate each sample into its own directory - if(seperateSamples){ - fullSe = se - for(sampleName in unique(colData(fullSe)$sampleName)){ - dir.create(file.path(outdir, sampleName), showWarnings = FALSE) - se = fullSe[,colData(fullSe)$sampleName == sampleName] - metadata(se)$incompatibleCounts = metadata(se)$incompatibleCounts[,colData(fullSe)$sampleName == sampleName] - for(d in names(assays(se))){ - writeCountsOutput(se, varname=d, - feature='transcript',outdir=paste0(outdir, sampleName,"/"), prefix) - } - if(!is.null(metadata(se)$incompatibleCounts)){ - estimates = metadata(se)$incompatibleCounts - estimatesfn <- paste(outdir, "/", sampleName,"/", prefix, "incompatibleCounts.mtx", sep = "") - Matrix::writeMM(estimates, estimatesfn) - } - seGene <- transcriptToGeneExpression(se) - writeCountsOutput(seGene, varname='counts', feature='gene',paste0(outdir, sampleName,"/"), prefix) - utils::write.table(colData(se), file = paste0(outdir, "/", sampleName, "/", prefix, "sampleData.tsv"), - sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) - #utils::write.table(paste0(colnames(se), "-1"), file = paste0(outdir, "barcodes.tsv"), quote = FALSE, row.names = FALSE, col.names = FALSE) - } - } } } diff --git a/inst/extdata/demultiplexed.bam b/inst/extdata/demultiplexed.bam new file mode 100644 index 00000000..ae85fd59 Binary files /dev/null and b/inst/extdata/demultiplexed.bam differ