diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index 750c1e87..47b019c3 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -2,11 +2,11 @@ #' @inheritParams bambu #' @import data.table #' @noRd -assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParameters, - verbose, sampleMetadata, demultiplexed, +assignReadClasstoTranscripts <- function(readClassList, annotations, + rcAssignmentParameters, verbose, sampleMetadata, demultiplexed, returnDistTable = FALSE, trackReads = TRUE) { if (is.character(readClassList)) readClassList <- readRDS(file = readClassList) - metadata(readClassList)$readClassDist <- calculateDistTable(readClassList, annotations, isoreParameters, verbose, returnDistTable) + metadata(readClassList)$readClassDist <- calculateDistTable(readClassList, annotations, rcAssignmentParameters, verbose, returnDistTable) readClassList <- splitReadClassFiles(readClassList) readClassDt <- genEquiRCs(metadata(readClassList)$readClassDist, annotations, verbose) readClassDt$eqClass.match = match(readClassDt$eqClassById,metadata(readClassList)$eqClassById) diff --git a/R/bambu-extendAnnotations-utilityExtend.R b/R/bambu-extendAnnotations-utilityExtend.R index dc11677f..e1b2b123 100644 --- a/R/bambu-extendAnnotations-utilityExtend.R +++ b/R/bambu-extendAnnotations-utilityExtend.R @@ -872,7 +872,7 @@ setNDR <- function(extendedAnnotations, NDR = NULL, prefix = 'Bambu', baselineFD #' Extend annotations by clusters #' @noRd -isore.extendAnnotations.clusters <- function(readClassList, annotations, clusters, NDR, isoreParameters, stranded, bpParameters, fusionMode, verbose = FALSE){ +isore.extendAnnotations.clusters <- function(readClassList, annotations, clusters, NDR, discoveryParameters, stranded, bpParameters, fusionMode, verbose = FALSE){ message("--- Start extending annotations for clusters ---") #if clustering is a csv, create a list with the barcodes for each cluster #csv must have two cols with heading barcode, cluster @@ -904,8 +904,8 @@ isore.extendAnnotations.clusters <- function(readClassList, annotations, cluster rowData(rcf.filt)$startSD <- 0 rowData(rcf.filt)$endSD <- 0 rowData(rcf.filt)$readCount.posStrand <- 0 - thresholdIndex <- which(rowData(rcf.filt)$readCount>=isoreParameters$min.readCount) - model <- trainBambu(rcf.filt, verbose = verbose, min.readCount = isoreParameters$min.readCount) + thresholdIndex <- which(rowData(rcf.filt)$readCount>=discoveryParameters$min.readCount) + model <- trainBambu(rcf.filt, verbose = verbose, min.readCount = discoveryParameters$min.readCount) txScore <- getTranscriptScore(rowData(rcf.filt)[thresholdIndex,], model, defaultModels) rowData(rcf.filt)$txScore <- rep(NA,nrow(rcf.filt)) @@ -913,12 +913,12 @@ isore.extendAnnotations.clusters <- function(readClassList, annotations, cluster #txScores = cbind(txScores, rowData(rcf.filt)$txScore) rcfs.clusters[[names(clusters)[i]]] <- rcf.filt annotations.clusters[[names(clusters)[i]]] <- bambu.extendAnnotations(list(rcf.filt), annotations, NDR, - isoreParameters, stranded, bpParameters, fusionMode, verbose) + discoveryParameters, stranded, bpParameters, fusionMode, verbose) } if(length(rcfs.clusters)>0){ print("--- Merging all individual clusters ---") annotations.clusters[["merged"]] <- bambu.extendAnnotations(rcfs.clusters, annotations, NDR, - isoreParameters, stranded, bpParameters, fusionMode, verbose) + discoveryParameters, stranded, bpParameters, fusionMode, verbose) } return(annotations.clusters) diff --git a/R/bambu-extendAnnotations.R b/R/bambu-extendAnnotations.R index 7404fa61..ec9cb3e1 100644 --- a/R/bambu-extendAnnotations.R +++ b/R/bambu-extendAnnotations.R @@ -3,14 +3,14 @@ #' @inheritParams bambu #' @noRd bambu.extendAnnotations <- function(readClassList, annotations, NDR, - isoreParameters, stranded, bpParameters, fusionMode = FALSE, verbose = FALSE) { + discoveryParameters, stranded, bpParameters, fusionMode = FALSE, verbose = FALSE) { start.ptm_all <- proc.time() combinedTxCandidates <- isore.combineTranscriptCandidates(readClassList, stranded, ## stranded used for unspliced reduce - min.readCount = isoreParameters[["min.readCount"]], - min.readFractionByGene = isoreParameters[["min.readFractionByGene"]], - min.txScore.multiExon = isoreParameters[["min.txScore.multiExon"]], - min.txScore.singleExon = isoreParameters[["min.txScore.singleExon"]], + min.readCount = discoveryParameters[["min.readCount"]], + min.readFractionByGene = discoveryParameters[["min.readFractionByGene"]], + min.txScore.multiExon = discoveryParameters[["min.txScore.multiExon"]], + min.txScore.singleExon = discoveryParameters[["min.txScore.singleExon"]], bpParameters, verbose) end.ptm_all <- proc.time() @@ -20,21 +20,21 @@ bambu.extendAnnotations <- function(readClassList, annotations, NDR, annotations <- isore.extendAnnotations( combinedTranscripts = combinedTxCandidates, annotationGrangesList = annotations, - remove.subsetTx = isoreParameters[["remove.subsetTx"]], - min.sampleNumber = isoreParameters[["min.sampleNumber"]], + remove.subsetTx = discoveryParameters[["remove.subsetTx"]], + min.sampleNumber = discoveryParameters[["min.sampleNumber"]], NDR = NDR, - min.exonDistance = isoreParameters[["min.exonDistance"]], - min.exonOverlap = isoreParameters[["min.exonOverlap"]], + min.exonDistance = discoveryParameters[["min.exonDistance"]], + min.exonOverlap = discoveryParameters[["min.exonOverlap"]], min.primarySecondaryDist = - isoreParameters[['min.primarySecondaryDist']], + discoveryParameters[['min.primarySecondaryDist']], min.primarySecondaryDistStartEnd = - isoreParameters[['min.primarySecondaryDistStartEnd1']], + discoveryParameters[['min.primarySecondaryDistStartEnd1']], min.readFractionByEqClass = - isoreParameters[['min.readFractionByEqClass']], + discoveryParameters[['min.readFractionByEqClass']], fusionMode = fusionMode, - prefix = isoreParameters[["prefix"]], - baselineFDR = isoreParameters[["baselineFDR"]], - defaultModels = isoreParameters[["defaultModels"]], + prefix = discoveryParameters[["prefix"]], + baselineFDR = discoveryParameters[["baselineFDR"]], + defaultModels = discoveryParameters[["defaultModels"]], verbose = verbose) end.ptm_all <- proc.time() if (verbose) message("extend annotations in ", diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R index dd988584..041e2c0d 100644 --- a/R/bambu-processReads.R +++ b/R/bambu-processReads.R @@ -14,7 +14,7 @@ #' @noRd bambu.processReads <- function(reads, annotations, genomeSequence, readClass.outputDir=NULL, yieldSize=1000000, bpParameters, - stranded=FALSE, verbose=FALSE, isoreParameters = setIsoreParameters(NULL), + stranded=FALSE, verbose=FALSE, discoveryParameters = setDiscoveryParameters(NULL), processByChromosome = FALSE, processByBam = TRUE, trackReads = trackReads, fusionMode = fusionMode, demultiplexed = FALSE, cleanReads = FALSE, dedupUMI = FALSE, sampleNames = NULL, barcodesToFilter = NULL) { genomeSequence <- checkInputSequence(genomeSequence) @@ -48,11 +48,11 @@ bambu.processReads <- function(reads, annotations, genomeSequence, names(reads)[seq_along(sampleNames)] <- sampleNames } } - min.readCount <- isoreParameters[["min.readCount"]] - fitReadClassModel <- isoreParameters[["fitReadClassModel"]] - defaultModels <- isoreParameters[["defaultModels"]] - returnModel <- isoreParameters[["returnModel"]] - min.exonOverlap <- isoreParameters[["min.exonOverlap"]] + min.readCount <- discoveryParameters[["min.readCount"]] + fitReadClassModel <- discoveryParameters[["fitReadClassModel"]] + defaultModels <- discoveryParameters[["defaultModels"]] + returnModel <- discoveryParameters[["returnModel"]] + min.exonOverlap <- discoveryParameters[["min.exonOverlap"]] if(processByBam){ readClassList <- bplapply(seq_along(reads), function(i) { diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index 68b41395..9dff19a2 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -2,9 +2,9 @@ #' @inheritParams bambu #' @import data.table #' @noRd -bambu.quantify <- function(readClassDt, countMatrix, incompatibleCountMatrix, txid.index, GENEIDs, emParameters, +bambu.quantify <- function(readClassDt, countMatrix, incompatibleCountMatrix, txid.index, GENEIDs, emParameters, trackReads = FALSE, returnDistTable = FALSE, - verbose = FALSE, isoreParameters = setIsoreParameters(NULL)) { + verbose = FALSE) { start.ptm <- proc.time() readClassDt$nobs = countMatrix[readClassDt$eqClass.match] readClassDt$nobs[is.na(readClassDt$nobs)] = 0 diff --git a/R/bambu.R b/R/bambu.R index 031848a4..2663a317 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -34,11 +34,8 @@ #' annotation to be assigned to the same gene id, defaults to 10bp} #' \item{min.primarySecondaryDist}{specifying the minimum number of distance #' threshold, defaults to 5bp} -#' \item{min.primarySecondaryDistStartEnd1}{specifying the minimum number +#' \item{min.primarySecondaryDistStartEnd1}{specifying the minimum number #' of distance threshold, used for extending annotation, defaults to 5bp} -#' \item{min.primarySecondaryDistStartEnd2}{specifying the minimum number -#' of distance threshold, used for estimating distance to annotation, -#' defaults to 5bp} #' \item{min.txScore.multiExon}{specifying the minimum transcript level #' threshold for multi-exon transcripts during sample combining, #' defaults to 0} @@ -62,6 +59,17 @@ #' \item{prefix}{specifying prefix for new gene Ids (genePrefix.number), #' defaults to "Bambu"} #' } +#' @param opt.rcAssignment A list of controlling parameters for the read class +#' to transcript assignment process: +#' \describe{ +#' \item{min.exonDistance}{specifying minimum distance to known transcript +#' to be considered a valid match, defaults to 35bp} +#' \item{min.primarySecondaryDist}{specifying the minimum distance +#' threshold between primary and secondary assignments, defaults to 5bp} +#' \item{min.primarySecondaryDistStartEnd2}{specifying the minimum +#' distance threshold for start/end positions used for read assignment, +#' defaults to 5bp} +#' } #' @param opt.em A list of controlling parameters for quantification #' algorithm estimation process: #' \describe{ @@ -141,8 +149,8 @@ #' genome = fa.file, discovery = TRUE, quant = TRUE) #' @export bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, - mode = NULL, opt.discovery = NULL, opt.em = NULL, rcOutDir = NULL, discovery = TRUE, - assignDist = TRUE, quant = TRUE, stranded = FALSE, ncore = 1, yieldSize = NULL, + mode = NULL, opt.discovery = NULL, opt.rcAssignment = NULL, opt.em = NULL, rcOutDir = NULL, discovery = TRUE, + assignDist = TRUE, quant = TRUE, stranded = FALSE, ncore = 1, yieldSize = NULL, trackReads = FALSE, returnDistTable = FALSE, lowMemory = FALSE, sampleData = NULL, fusionMode = FALSE, verbose = FALSE, demultiplexed = FALSE, quantData = NULL, sampleNames = NULL, cleanReads = FALSE, dedupUMI = FALSE, barcodesToFilter = NULL, clusters = NULL, @@ -176,6 +184,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } if(lowMemory) message("lowMemory has been deprecated and split into processByChromosome and processByBam. Please see Documentation") + if("min.primarySecondaryDistStartEnd2" %in% names(opt.discovery)) + message("min.primarySecondaryDistStartEnd2 has been moved to opt.rcAssignment. Please pass this parameter via opt.rcAssignment instead.") if(is.null(annotations)){ annotations <- GRangesList() } else { @@ -184,11 +194,11 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, genomeSequence = genome, discovery = discovery, sampleNames = sampleNames, sampleData = sampleData, quantData = quantData) } - isoreParameters <- setIsoreParameters(isoreParameters = opt.discovery) + opt.discovery <- setDiscoveryParameters(discoveryParameters = opt.discovery) #below line is to be compatible with earlier version of running bambu - if(!is.null(isoreParameters$max.txNDR)) NDR = isoreParameters$max.txNDR - - emParameters <- setEmParameters(emParameters = opt.em) + if(!is.null(opt.discovery$max.txNDR)) NDR = opt.discovery$max.txNDR + opt.rcAssignment <- setRcAssignmentParameters(rcAssignmentParameters = opt.rcAssignment) + opt.em <- setEmParameters(emParameters = opt.em) bpParameters <- setBiocParallelParameters(reads, ncore, verbose, demultiplexed) xgb.set.config(nthread = 1) # only when reads is not NULL, this proceed, otherwise, it will jump to quant step @@ -214,7 +224,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, genomeSequence = genome, readClass.outputDir = rcOutDir, yieldSize = yieldSize, bpParameters = bpParameters, stranded = stranded, verbose = verbose, - isoreParameters = isoreParameters, trackReads = trackReads, + discoveryParameters = opt.discovery, trackReads = trackReads, fusionMode = fusionMode, processByChromosome = processByChromosome, processByBam = processByBam, demultiplexed = demultiplexed, @@ -227,14 +237,14 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if (discovery) { message("--- Start extending annotations ---") extendedAnnotations <- bambu.extendAnnotations(readClassList, annotations, NDR, - isoreParameters, stranded, bpParameters, fusionMode, verbose) + opt.discovery, stranded, bpParameters, fusionMode, verbose) metadata(extendedAnnotations)$warnings = warnings #### cluster based transcript discovery if(!is.null(clusters)){ annotations.clusters <- isore.extendAnnotations.clusters(readClassList, - annotations, clusters, NDR, - isoreParameters, stranded, bpParameters, fusionMode, verbose = FALSE) + annotations, clusters, NDR, + opt.discovery, stranded, bpParameters, fusionMode, verbose = FALSE) metadata(extendedAnnotations)$clusters <- annotations.clusters } annotations <- extendedAnnotations @@ -246,9 +256,9 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, quantData <- bplapply(seq_along(readClassList), function(i){ assignReadClasstoTranscripts( readClassList = readClassList[[i]], - annotations = annotations, - isoreParameters = isoreParameters, - verbose = verbose, + annotations = annotations, + rcAssignmentParameters = opt.rcAssignment, + verbose = verbose, # for bulk data, there is one sampleData (keep sampleData[1]), for single-cell, there is one per sample sampleMetadata = if(length(sampleData) == 1) sampleData[1] else sampleData[i], demultiplexed = demultiplexed, @@ -263,10 +273,10 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if (quant) { message("--- Start isoform EM quantification ---") if(!is.null(NDR) & !discovery)# this step is used when reset NDR is needed - annotations <- setNDR(annotations, NDR, - prefix = isoreParameters$prefix, - baselineFDR = isoreParameters[["baselineFDR"]], - defaultModels2 = isoreParameters[["defaultModels"]]) + annotations <- setNDR(annotations, NDR, + prefix = opt.discovery$prefix, + baselineFDR = opt.discovery[["baselineFDR"]], + defaultModels2 = opt.discovery[["defaultModels"]]) if(length(annotations)==0) stop("No valid annotations, if running de novo please try less stringent parameters") if(is.null(quantData)) stop("quantData must be provided or assignDist = TRUE") @@ -311,8 +321,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } return(bambu.quantify(readClassDt = metadata(quantData_i)$readClassDt, countMatrix = countMatrix, incompatibleCountMatrix = data.table(GENEID.i = as.numeric(rownames(metadata(quantData_i)$incompatibleCountMatrix)), counts = incompatibleCountMatrix), - txid.index = mcols(annotations)$txid, GENEIDs = GENEIDs.i, isoreParameters = isoreParameters, - emParameters = emParameters, trackReads = trackReads, + txid.index = mcols(annotations)$txid, GENEIDs = GENEIDs.i, + emParameters = opt.em, trackReads = trackReads, verbose = verbose))}, BPPARAM = bpParameters) end.ptm <- proc.time() diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index 44c2d747..90fa4b42 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -13,20 +13,19 @@ setBiocParallelParameters <- function(reads, ncore, verbose, demultiplexed){ } -#' setIsoreparameters +#' setDiscoveryParameters #' @noRd -setIsoreParameters <- function(isoreParameters){ +setDiscoveryParameters <- function(discoveryParameters){ # ===# set default controlling parameters for isoform reconstruction #===# - isoreParameters.default <- list( - remove.subsetTx = TRUE, + discoveryParameters.default <- list( + remove.subsetTx = TRUE, min.readCount = 2, min.readFractionByGene = 0.05, min.sampleNumber = 1, min.exonDistance = 35, - min.exonOverlap = 10, + min.exonOverlap = 10, min.primarySecondaryDist = 5, min.primarySecondaryDistStartEnd1 = 5, # for creating new annotations - min.primarySecondaryDistStartEnd2 = 5, # for read assignment min.txScore.multiExon = 0, min.txScore.singleExon = 1, fitReadClassModel = TRUE, @@ -34,10 +33,23 @@ setIsoreParameters <- function(isoreParameters){ returnModel = FALSE, baselineFDR = 0.1, min.readFractionByEqClass = 0, - prefix = "Bambu") - isoreParameters <- - updateParameters(isoreParameters, isoreParameters.default) - return(isoreParameters) + prefix = "Bambu") + discoveryParameters <- + updateParameters(discoveryParameters, discoveryParameters.default) + return(discoveryParameters) +} + + +#' setRcAssignmentParameters +#' @noRd +setRcAssignmentParameters <- function(rcAssignmentParameters){ + rcAssignmentParameters.default <- list( + min.exonDistance = 35, + min.primarySecondaryDist = 5, + min.primarySecondaryDistStartEnd2 = 5) + rcAssignmentParameters <- + updateParameters(rcAssignmentParameters, rcAssignmentParameters.default) + return(rcAssignmentParameters) } @@ -242,11 +254,11 @@ handleWarnings <- function(readClassList, verbose){ } #' Calculate the dist table used for Bambu Quantification -calculateDistTable <- function(readClassList, annotations, isoreParameters, verbose, returnDistTable){ +calculateDistTable <- function(readClassList, annotations, rcAssignmentParameters, verbose, returnDistTable){ readClassDist <- isore.estimateDistanceToAnnotations(readClassList, annotations, - min.exonDistance = isoreParameters[["min.exonDistance"]], - min.primarySecondaryDist = isoreParameters[['min.primarySecondaryDist']], - min.primarySecondaryDistStartEnd = isoreParameters[['min.primarySecondaryDistStartEnd2']], + min.exonDistance = rcAssignmentParameters[["min.exonDistance"]], + min.primarySecondaryDist = rcAssignmentParameters[['min.primarySecondaryDist']], + min.primarySecondaryDistStartEnd = rcAssignmentParameters[['min.primarySecondaryDistStartEnd2']], verbose = verbose) metadata(readClassDist)$distTable <- modifyIncompatibleAssignment(metadata(readClassDist)$distTable) if(returnDistTable) metadata(readClassDist)$distTableOld <- metadata(readClassDist)$distTable