diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index be73e0a8..750c1e87 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -3,7 +3,7 @@ #' @import data.table #' @noRd assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParameters, - verbose, demultiplexed, spatial, + verbose, sampleMetadata, demultiplexed, returnDistTable = FALSE, trackReads = TRUE) { if (is.character(readClassList)) readClassList <- readRDS(file = readClassList) metadata(readClassList)$readClassDist <- calculateDistTable(readClassList, annotations, isoreParameters, verbose, returnDistTable) @@ -17,7 +17,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame mutate(aval = 1) %>% data.table() #return non-em counts - ColData <- generateColData(colnames(metadata(readClassList)$countMatrix), clusters = NULL, demultiplexed, spatial) + ColData <- generateColData(readClassList, sampleMetadata, demultiplexed) quantData <- SummarizedExperiment(assays = SimpleList( counts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)), rowRanges = annotations, @@ -32,7 +32,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame metadata(quantData)$readClassDt <- readClassDt metadata(quantData)$countMatrix <- metadata(readClassList)$countMatrix metadata(quantData)$incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix - metadata(quantData)$sampleNames <- metadata(readClassList)$sampleNames + metadata(quantData)$sampleName <- metadata(readClassList)$sampleData$sampleName if(returnDistTable) metadata(quantData)$distTable <- metadata(metadata(readClassList)$readClassDist)$distTableOld diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R index f144c036..dd988584 100644 --- a/R/bambu-processReads.R +++ b/R/bambu-processReads.R @@ -54,7 +54,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence, returnModel <- isoreParameters[["returnModel"]] min.exonOverlap <- isoreParameters[["min.exonOverlap"]] - if(processByBam){ # bulk mode + if(processByBam){ readClassList <- bplapply(seq_along(reads), function(i) { bambu.processReadsByFile(bam.file = reads[i], genomeSequence = genomeSequence,annotations = annotations, @@ -64,7 +64,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence, processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode, demultiplexed = demultiplexed, cleanReads = cleanReads, dedupUMI = dedupUMI, index = 1, barcodesToFilter = barcodesToFilter)}, BPPARAM = bpParameters) - } else { # single cell mode + } else { readGrgList <- bplapply(seq_along(reads), function(i) { bambu.readsByFile(bam.file = reads[i], genomeSequence = genomeSequence,annotations = annotations, @@ -173,13 +173,6 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, mcols(readGrgList)$id <- seq_along(readGrgList) - sampleName <- names(bam.file)[1] - if(!isFALSE(demultiplexed)){ - mcols(readGrgList)$CB <- paste0(sampleName, '_', mcols(readGrgList)$CB) - } else{ - mcols(readGrgList)$CB <- sampleName - } - mcols(readGrgList)$CB <- as.factor(mcols(readGrgList)$CB) if(!isFALSE(demultiplexed)){ mcols(readGrgList)$sampleID <- as.numeric(mcols(readGrgList)$CB) } else { @@ -217,9 +210,20 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, fusionMode = fusionMode, verbose = verbose) - metadata(se)$samples <- names(bam.file)[1] - metadata(se)$sampleNames <- names(bam.file)[1] - if(!isFALSE(demultiplexed)) metadata(se)$samples <- levels(mcols(readGrgList)$CB) + if (demultiplexed) { + barcodes <- levels(mcols(readGrgList)$CB) + metadata(se)$sampleData <- tibble( + id = paste(names(bam.file)[1], barcodes, sep = '_'), + sampleName = names(bam.file)[1], + barcode = barcodes + ) + } else{ + metadata(se)$sampleData <- tibble( + id = names(bam.file)[1], + sampleName = names(bam.file)[1] + ) + } + return(se) } @@ -307,7 +311,6 @@ constructReadClasses <- function(readGrgList, genomeSequence, annotations, stranded = FALSE, min.readCount = 2, fitReadClassModel = TRUE, min.exonOverlap = 10, defaultModels = NULL, returnModel = FALSE, verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE){ - warnings <- c() ###TODO if(processByChromosome){ # construct read classes for each chromosome seperately @@ -403,12 +406,12 @@ splitReadClassFiles = function(readClassFile){ i = rep(seq_along(counts.table), lengths(counts.table)), j = as.numeric(names(unlist(counts.table))), x = unlist(counts.table), - dims = c(nrow(eqClasses), length(metadata(readClassFile)$samples))) + dims = c(nrow(eqClasses), nrow(metadata(readClassFile)$sampleData))) #incompatible counts distTable <- metadata(metadata(readClassFile)$readClassDist)$distTable.incompatible if(nrow(distTable)==0) { counts.incompatible <- sparseMatrix(i= 1, j = 1, x = 0, - dims = c(1, length(metadata(readClassFile)$samples))) + dims = c(1, length(metadata(readClassFile)$sampleData$id))) rownames(counts.incompatible) <- "TODO" } else{ distTable$sampleIDs <- rowData(readClassFile)$sampleIDs[match(distTable$readClassId, rownames(readClassFile))] @@ -419,11 +422,11 @@ splitReadClassFiles = function(readClassFile){ i = rep(seq_along(counts.table), lengths(counts.table)), j = as.numeric(names(unlist(counts.table))), x = unlist(counts.table), - dims = c(nrow(distTable), length(metadata(readClassFile)$samples))) - colnames(counts.incompatible) <- metadata(readClassFile)$samples + dims = c(nrow(distTable), length(metadata(readClassFile)$sampleData$id))) + colnames(counts.incompatible) <- metadata(readClassFile)$sampleData$id rownames(counts.incompatible) <- distTable$GENEID.i } - colnames(counts) <- metadata(readClassFile)$samples + colnames(counts) <- metadata(readClassFile)$sampleData$id metadata(readClassFile)$eqClassById <- eqClasses$eqClassById #rownames(counts) = eqClasses$eqClassById metadata(readClassFile)$countMatrix <- counts diff --git a/R/bambu-processReads_utilityConstructReadClasses.R b/R/bambu-processReads_utilityConstructReadClasses.R index fec9adbf..a9610045 100644 --- a/R/bambu-processReads_utilityConstructReadClasses.R +++ b/R/bambu-processReads_utilityConstructReadClasses.R @@ -25,12 +25,14 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions, Please report this") start.ptm <- proc.time() if(!is.null(uniqueJunctions)){ - exonsByRC.spliced <- constructSplicedReadClasses( - uniqueJunctions = uniqueJunctions, - unlisted_junctions = unlisted_junctions, - readGrgList = readGrgList, - stranded = stranded)} - else{exonsByRC.spliced = GRangesList()} + exonsByRC.spliced <- constructSplicedReadClasses( + uniqueJunctions = uniqueJunctions, + unlisted_junctions = unlisted_junctions, + readGrgList = readGrgList, + stranded = stranded) + } else{ + exonsByRC.spliced = GRangesList() + } end.ptm <- proc.time() rm(readGrgList, unlisted_junctions, uniqueJunctions) if (verbose) @@ -38,11 +40,13 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions, "spliced junctions in ", round((end.ptm - start.ptm)[3] / 60, 1)," mins.") if(length(reads.singleExon)==0) { exonsByRC.unspliced <- NULL - } else {exonsByRC.unspliced <- constructUnsplicedReadClasses(reads.singleExon, - annotations, exonsByRC.spliced, stranded, verbose)} + } else { + exonsByRC.unspliced <- constructUnsplicedReadClasses(reads.singleExon, + annotations, exonsByRC.spliced, stranded, verbose) + } exonsByRC <- c(exonsByRC.spliced, exonsByRC.unspliced) colDataDf <- DataFrame(name = runName, row.names = runName) - #TODO later remove assays = SimpleList(counts = counts) + counts <- matrix(mcols(exonsByRC)$readCount, dimnames = list(names(exonsByRC), runName)) se <- SummarizedExperiment(assays = SimpleList(counts = counts), diff --git a/R/bambu.R b/R/bambu.R index edd8a8cc..cf61a555 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -93,6 +93,11 @@ #' distTables. The output is a list with an entry for each sample. #' @param lowMemory Read classes will be processed by chromosomes when lowMemory #' is specified. This option provides an efficient way to process big samples. +#' @param sampleData A character vector of paths to metadata CSV files (or \code{NA} if +#' unavailable for specific samples); defaults to \code{NULL}. Files must contain a +#' "sampleName" column for bulk data or a "barcode" column for single-cell/spatial data. +#' For bulk data, one metadata CSV file for all samples is sufficient, whereas single-cell/spatial +#' data requires one metadata CSV file per sample. #' @param fusionMode A logical variable indicating whether run in fusion mode #' @param verbose A logical variable indicating whether processing messages will #' be printed. @@ -138,8 +143,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, mode = NULL, opt.discovery = NULL, opt.em = NULL, rcOutDir = NULL, discovery = TRUE, assignDist = TRUE, quant = TRUE, stranded = FALSE, ncore = 1, yieldSize = NULL, - trackReads = FALSE, returnDistTable = FALSE, lowMemory = FALSE, - fusionMode = FALSE, verbose = FALSE, demultiplexed = FALSE, spatial = NULL, quantData = NULL, + trackReads = FALSE, returnDistTable = FALSE, lowMemory = FALSE, sampleData = NULL, + fusionMode = FALSE, verbose = FALSE, demultiplexed = FALSE, quantData = NULL, sampleNames = NULL, cleanReads = FALSE, dedupUMI = FALSE, barcodesToFilter = NULL, clusters = NULL, processByChromosome = FALSE, processByBam = TRUE) { message(paste0("Running Bambu-v", "3.9.0")) @@ -173,7 +178,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, annotations <- checkInputs(annotations, reads, readClass.outputDir = rcOutDir, genomeSequence = genome, discovery = discovery, - sampleNames = sampleNames, spatial = spatial,quantData = quantData) + sampleNames = sampleNames, sampleData = sampleData, quantData = quantData) } isoreParameters <- setIsoreParameters(isoreParameters = opt.discovery) #below line is to be compatible with earlier version of running bambu @@ -234,16 +239,19 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } if(assignDist){ message("--- Start calculating equivilance classes ---") - quantData <- bplapply(readClassList, - FUN = assignReadClasstoTranscripts, - annotations = annotations, - isoreParameters = isoreParameters, - verbose = verbose, - demultiplexed = demultiplexed, - spatial = spatial, - returnDistTable = returnDistTable, - trackReads = trackReads, - BPPARAM = bpParameters) + quantData <- bplapply(seq_along(readClassList), function(i){ + assignReadClasstoTranscripts( + readClassList = readClassList[[i]], + annotations = annotations, + isoreParameters = isoreParameters, + verbose = verbose, + # for bulk data, there is one sampleData (keep sampleData[1]), for single-cell, there is one per sample + sampleMetadata = if(length(sampleData) == 1) sampleData[1] else sampleData[i], + demultiplexed = demultiplexed, + returnDistTable = returnDistTable, + trackReads = trackReads + ) + }, BPPARAM = bpParameters) if (!quant) return(quantData) } } @@ -262,6 +270,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, start.ptm <- proc.time() countsSeCompressed.all <- NULL ColNames <- c() + colData.all <- list() for(i in seq_along(quantData)){ quantData_i <- quantData[[i]] #load in the barcode clustering from file if provided @@ -285,11 +294,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, iter <- clustering } else{ #if clusters is a list - if(length(quantData)>1){ - iter <- clusters[[i]] #lowMemory mode - }else{ - iter <- clusters#do.call(c,clusters) - } + iter <- clusters[[i]] } } countsSeCompressed <- bplapply(iter, FUN = function(j){ # previous i changed to j to avoid duplicated assignment @@ -310,13 +315,20 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, message("Total Time ", round((end.ptm - start.ptm)[3] / 60, 3), " mins.") if(!is.null(clusters)){ ColNames <- c(ColNames, names(iter)) + colData.all[[i]] <- data.frame( + id = names(countsSeCompressed), + sampleName = names(countsSeCompressed), + row.names = names(countsSeCompressed) + ) } else{ ColNames <- c(ColNames, colnames(quantData_i)) + colData.all[[i]] <- data.frame(colData(quantData_i)) } countsSeCompressed.all <- c(countsSeCompressed.all, countsSeCompressed) } - countsSeCompressed.all$colnames <- ColNames - countsSe <- combineCountSes(countsSeCompressed.all, annotations) + names(countsSeCompressed.all) <- ColNames + + countsSe <- combineCountSes(countsSeCompressed.all, colData.all, annotations) if(returnDistTable){ distTables = list() for(i in seq_along(quantData)){ @@ -324,11 +336,6 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } metadata(countsSe)$distTables <- distTables } - #metadata(countsSe)$warnings = warnings - - ColData <- generateColData(colnames(countsSe), clusters, demultiplexed, spatial) - colData(countsSe) <- ColData - colnames(countsSe) <- ColData[,1] return(countsSe) } } \ No newline at end of file diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index 5deac8ba..44c2d747 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -73,7 +73,7 @@ updateParameters <- function(Parameters, Parameters.default) { #' @importFrom methods is #' @noRd checkInputs <- function(annotations, reads, readClass.outputDir, genomeSequence, - discovery, sampleNames, spatial, quantData){ + discovery, sampleNames, sampleData, quantData){ # ===# Check annotation inputs #===# if (!is.null(annotations)) { if (is(annotations, "CompressedGRangesList")) { @@ -156,13 +156,18 @@ checkInputs <- function(annotations, reads, readClass.outputDir, genomeSequence, } } - if(!is.null(spatial)){ - #if(!all(grepl(".tsv^", spatial))){stop("Not all paths for spatial are .tsv files")} - if(length(spatial)==1 & length(reads)>1){ - warning("Using the same whitelist and coordinates for all input samples") - } else if(length(reads)!=length(spatial)){ - stop("There are not the same number spatial whitelist paths as input files to reads. ", - "Make sure these two arguments are vectors of the same length") + if(!is.null(sampleData)){ + if (!all(grepl("\\.(csv|tsv|txt)$", na.omit(sampleData), ignore.case = TRUE))){ + stop("Not all paths for sample metadata files are .csv/.tsv/.txt files") + } + if(length(sampleData)==1 & length(reads)>1){ # one sample metadata for all samples + message("Using the same sample metadata file for all input samples") + } else if(length(reads)!=length(sampleData)){ # multiple sample metadatas for multiple samples + stop( + "The number of sample metadata files does not match the number of input read files. ", + "These two arguments (sampleData & reads) must be vectors of the same length. ", + "If a specific sample has no metadata, please use 'NA' as a placeholder in the sampleData vector." + ) } } return(annotations) @@ -255,13 +260,11 @@ calculateDistTable <- function(readClassList, annotations, isoreParameters, verb return(readClassDist) } -#' Combine count se object while preserving the metadata objects +#' Combine combined count se object from multiple samples, cells or spatial locations #' @noRd -combineCountSes <- function(countsSe, annotations){ - countsData <- c("counts", "CPM", "fullLengthCounts", - "uniqueCounts", "incompatibleCounts") - sampleNames <- countsSe$colnames - countsSe$colnames <- NULL +combineCountSes <- function(countsSe, colDataList, annotations){ + countsData <- c("counts", "CPM", "fullLengthCounts", "uniqueCounts", "incompatibleCounts") + sampleNames <- names(countsSe) countsDataMat <- lapply(countsData, FUN = function(k){ countsVecList <- lapply(countsSe, function(j){j[[k]]}) countsMat <- sparseMatrix(i = unlist(lapply(countsVecList, function(j) j@i)), @@ -279,56 +282,51 @@ combineCountSes <- function(countsSe, annotations){ return(countsMat) }) names(countsDataMat) <- countsData - countsSe <- SummarizedExperiment(assays = SimpleList(counts = countsDataMat$counts, + combinedCountsSe <- SummarizedExperiment(assays = SimpleList(counts = countsDataMat$counts, CPM = countsDataMat$CPM, fullLengthCounts = countsDataMat$fullLengthCounts, uniqueCounts = countsDataMat$uniqueCounts)) - metadata(countsSe)$incompatibleCounts <- countsDataMat$incompatibleCounts - rowRanges(countsSe) <- annotations - return(countsSe) + metadata(combinedCountsSe)$incompatibleCounts <- countsDataMat$incompatibleCounts + rowRanges(combinedCountsSe) <- annotations + + colData(combinedCountsSe) <- DataFrame(bind_rows(colDataList)) + + return(combinedCountsSe) } -#' Generate the coldata for se options using colnames, and other option inputs -#' @noRd -generateColData <- function(sampleNames, clusters, demultiplexed, spatial){ - ColData <- DataFrame(id = sampleNames) - if(!isFALSE(demultiplexed) & is.null(clusters)){ - ColData <- DataFrame(id = sampleNames, - sampleName = gsub("_[^_]+$","", sampleNames, perl = TRUE), - Barcode = gsub(".*_(?=[^_]*$)","", sampleNames, perl = TRUE)) - } - if(!is.null(spatial) & is.null(clusters)){ - ColData$x_coordinate <- NA - ColData$y_coordinate <- NA - if(length(spatial)==1){ - # the following line takes a regular delimited file as input - # it can either has header or without header - # it can also be compressed - bc_coords <- fread(spatial, - col.names = c("Barcode", "x_coordinate", "y_coordinate"), - data.table = FALSE) - # DataFrame(read.table(gzfile(spatial), - # col.names = c("Barcode", "x_coordinate", "y_coordinate"))) - bcMatch <- match(ColData$Barcode, bc_coords$Barcode) - ColData$x_coordinate <- bc_coords$x_coordinate[bcMatch] - ColData$y_coordinate <- bc_coords$y_coordinate[bcMatch] - } else{ - spatial.unique <- unique(spatial) - for(whitelist in spatial.unique){ - i <- which(spatial.unique==whitelist) - bc_coords <- fread(whitelist, - col.names = c("Barcode", "x_coordinate", "y_coordinate"), - data.table = FALSE) - # DataFrame(read.table(gzfile(whitelist), - # col.names = c("Barcode", "x_coordinate", "y_coordinate"))) - bcSampleIndex <- ColData$sampleName %in% sampleNames[i] - bcMatch <- match(ColData$Barcode[bcSampleIndex], bc_coords$Barcode) - ColData$x_coordinate[bcSampleIndex] <- bc_coords$x_coordinate[bcMatch] - ColData$y_coordinate[bcSampleIndex] <- bc_coords$y_coordinate[bcMatch] - } - } - } - return(ColData) +#' Generate the colData using the external sampleMetadata.csv provided by the user in the sampleMetadata argument +#' @param readClassList A list object containingmetadata about read classes. +#' @param sampleMetadata A path to a CSV file or NULL/NA if there is no metadata for the sample. +#' @param demultiplexed Logical; indicates if data is demultiplexed. +#' +#' @return A DataFrame containing colData for the sample. +#' @export +generateColData <- function(readClassList, sampleMetadata, demultiplexed) { + sampleMetadataDf <- if (is.null(sampleMetadata) || is.na(sampleMetadata)) { + if (demultiplexed) tibble(barcode = character()) else tibble(sampleName = character()) + } else { + fread(sampleMetadata) + } + + joinKey <- if (demultiplexed) "barcode" else "sampleName" + + colData <- tibble( + id = metadata(readClassList)$sampleData$id, + sampleName = metadata(readClassList)$sampleData$sampleName + ) + + if (demultiplexed) { + colData <- colData %>% + mutate(barcode = metadata(readClassList)$sampleData$barcode) + } + + colData <- colData %>% + left_join(sampleMetadataDf, by = joinKey) %>% + as.data.frame() + + rownames(colData) <- colData$id + + colData } # Quick wrapper function (https://stackoverflow.com/questions/13273833/merging-multiple-data-tables) diff --git a/R/prepareDataFromBam.R b/R/prepareDataFromBam.R index e9634a92..81beb4b2 100755 --- a/R/prepareDataFromBam.R +++ b/R/prepareDataFromBam.R @@ -46,13 +46,20 @@ prepareDataFromBam <- function(bamFile, yieldSize = NULL, verbose = FALSE, if (!isFALSE(demultiplexed)){ # if demultiplexed is TRUE or a string path if(isTRUE(demultiplexed)){ # if demultiplexed is TRUE - mcols(readGrgList[[counter]])$CB <- case_when(grepl("^[^_]+_[^#]+#", names(readGrgList[[counter]]), perl = TRUE) ~ sub("_.*", "", names(readGrgList[[counter]])), # a checkpoint to see whether CB is contained in the name, with specific format CB_UMI#READNAME, - !is.na(mcols(alignmentInfo)$CB) ~ mcols(alignmentInfo)$CB, - TRUE ~ NA) + # a checkpoint to parse CB and UMI from the bam file, either from reads or CB/UMI tags. + # currently read name only accepts the format CB_UMI#READNAME (CB & UMI cannot have '_', otherwise parsing fails) + mcols(readGrgList[[counter]])$CB <- case_when( + !is.na(mcols(alignmentInfo)$CB) ~ mcols(alignmentInfo)$CB, + grepl("^[^_]+_[^#]+#", names(readGrgList[[counter]]), perl = TRUE) ~ sub("_.*", "", names(readGrgList[[counter]])), + TRUE ~ NA + ) - mcols(readGrgList[[counter]])$UMI <- case_when(grepl("^[^_]+_[^#]+#", names(readGrgList[[counter]]), perl = TRUE) ~ sub("^[^_]+_([^#]+)#.*$", "\\1", names(readGrgList[[counter]])), # a checkpoint to see whether UMI is contained in the name, with specific format CB_UMI#READNAME, - !is.na(mcols(alignmentInfo)$UB) ~ mcols(alignmentInfo)$UB, - TRUE ~ NA) + mcols(readGrgList[[counter]])$UMI <- case_when( + !is.na(mcols(alignmentInfo)$UB) ~ mcols(alignmentInfo)$UB, + grepl("^[^_]+_[^#]+#", names(readGrgList[[counter]]), perl = TRUE) ~ sub("^[^_]+_([^#]+)#.*$", "\\1", names(readGrgList[[counter]])), + TRUE ~ NA + ) + } else{ # if demultiplexed is a string path mcols(readGrgList[[counter]])$CB <- NA mcols(readGrgList[[counter]])$UMI <- NA @@ -61,10 +68,6 @@ prepareDataFromBam <- function(bamFile, yieldSize = NULL, verbose = FALSE, mcols(readGrgList[[counter]])$UMI <- readMap[,3][match(names(readGrgList[[counter]]),readMap[,1])] } } - cells <- unique(c(cells, mcols(readGrgList[[counter]])$CB)) - mcols(readGrgList[[counter]])$CB <- factor(mcols(readGrgList[[counter]])$CB, levels = cells) - umi <- unique(c(umi, mcols(readGrgList[[counter]])$UMI)) - mcols(readGrgList[[counter]])$UMI <- factor(mcols(readGrgList[[counter]])$UMI, levels = umi) } if(cleanReads){ softClip5Prime <- clipFunction(cigarData = GenomicAlignments::cigar(alignmentInfo), grep_pattern = '^(\\d*)[S].*', replace_pattern = '\\1') @@ -94,6 +97,11 @@ prepareDataFromBam <- function(bamFile, yieldSize = NULL, verbose = FALSE, } else { readGrgList <- readGrgList[[1]] } + + if (demultiplexed){ + mcols(readGrgList)$CB <- factor(mcols(readGrgList)$CB, levels = sort(unique(mcols(readGrgList)$CB))) + } + # remove microexons of width 1bp from list readGrgList <- readGrgList <- readGrgList[sum(width(readGrgList)) > 1] numNoCBs <- sum(is.na(mcols(readGrgList)$CB))