Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
bf32099
convert quantData into an object storing list of S4 object
lingminhao Apr 15, 2026
abc7411
Refactor quantData: remove countMatrix and replace with list columns …
lingminhao Apr 15, 2026
ce1259e
remove redundant stored data in quantData
lingminhao Apr 16, 2026
9ce39c1
add readToTranscriptMaps to final se when trackReads is TRUE
lingminhao Apr 16, 2026
c7ed6d3
Add gene and unique count matrix helpers from quantData
lingminhao Apr 16, 2026
ce96f7b
add colnames and colData to readClassList
lingminhao Apr 16, 2026
ed08825
name readClassList, quantData, readToTranscriptMaps, and distTables l…
lingminhao Apr 16, 2026
7f56d21
add nonuniqueCounts to quantData & se.
lingminhao Apr 16, 2026
a30a5fd
refactor generateUniqueCountsFromQuantData to return SummarizedExperi…
lingminhao Apr 16, 2026
acf4a45
refactor transcriptToGeneExpression to compute gene counts as uniqueC…
lingminhao Apr 16, 2026
5366795
refactor constructQuantData class using claude
lingminhao Apr 16, 2026
e895bee
not storing nonuniqueCountMatrix in quantData
lingminhao Apr 17, 2026
572ef70
revert transcriptToGeneExpression & remove nonuniqueCounts from final SE
lingminhao Apr 17, 2026
4277d65
add seType tag to SE objects
lingminhao Apr 17, 2026
8a6cb1e
remove summarizeExpression.R
lingminhao Apr 17, 2026
192de76
fix gene index mapping in generateIncompatibleCounts
lingminhao Apr 17, 2026
4368f7a
reorder incompatibleCounts to annotation order
lingminhao Apr 17, 2026
b7a33ba
pass incompatibleCounts as single-column matrix to bambu.quantify
lingminhao Apr 17, 2026
219b2b1
convert clusters barcode string to integer
lingminhao Apr 17, 2026
f0e5658
define SE_TYPES vocabulary and document quantData slots
lingminhao Apr 20, 2026
bf99de5
tidy up code
lingminhao Apr 20, 2026
97018f7
fix duplicated sample name prefix in SE colnames for demultiplexed data
lingminhao Apr 20, 2026
ac7cd4f
move generateNonUniqueCountMatrix to transcriptToGeneExpression_utili…
lingminhao Apr 20, 2026
3012744
Merge branch 'devel_pre_v4' into tidy_readClassFile_quantData
lingminhao Apr 20, 2026
53d3728
fix: correct wrong variable names from merge conflict
lingminhao Apr 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 31 additions & 76 deletions R/bambu-assignDist.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, rcAssignmen
readClassList <- splitReadClassFiles(readClassList)
readClassDt <- genEquiRCs(metadata(readClassList)$readClassDist, annotations, verbose)
readClassDt$eqClass.match = match(readClassDt$eqClassById,metadata(readClassList)$eqClassById)

# Add columnIds and columnCounts columns
readClassDt[, `:=`(columnIds = vector("list", .N), columnCounts = vector("list", .N))]
observedIdx <- which(!is.na(readClassDt$eqClass.match))
readClassDt$columnIds[observedIdx] <- metadata(readClassList)$columnIds[readClassDt$eqClass.match[observedIdx]]
readClassDt$columnCounts[observedIdx] <- metadata(readClassList)$columnCounts[readClassDt$eqClass.match[observedIdx]]

Comment thread
lingminhao marked this conversation as resolved.
readClassDt <- simplifyNames(readClassDt)
readClassDt <- readClassDt %>% group_by(eqClassId, gene_sid) %>%
mutate(multi_align = length(unique(txid))>1) %>%
Expand All @@ -18,93 +25,41 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, rcAssignmen
data.table()
#return non-em counts
ColData <- generateColData(readClassList, sampleMetadata, extractBarcodeUMI)
quantData <- SummarizedExperiment(assays = SimpleList(
counts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)),
rowRanges = annotations,
colData = ColData)
colnames(quantData) <- ColData$id
if(sum(metadata(readClassList)$incompatibleCountMatrix)==0){
metadata(quantData)$incompatibleCounts <- NULL
}else{
metadata(quantData)$incompatibleCounts <- generateIncompatibleCounts(metadata(readClassList)$incompatibleCountMatrix, annotations)
}
metadata(quantData)$nonuniqueCounts <- generateNonUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)
metadata(quantData)$readClassDt <- readClassDt
metadata(quantData)$countMatrix <- metadata(readClassList)$countMatrix
metadata(quantData)$incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix
metadata(quantData)$sampleName <- metadata(readClassList)$sampleData$sampleName
if(returnDistTable)
metadata(quantData)$distTable <- metadata(metadata(readClassList)$readClassDist)$distTableOld

if(trackReads)
metadata(quantData)$readToTranscriptMap <-
generateReadToTranscriptMap(readClassList,
metadata(readClassList)$readClassDist,
annotations)

incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix
incompatibleCounts <- generateIncompatibleCounts(incompatibleCountMatrix, annotations)

return(quantData)
distTable <- if(returnDistTable) {
metadata(metadata(readClassList)$readClassDist)$distTableOld
} else{
NULL
}

}
readToTranscriptMap <- if(trackReads) {
generateReadToTranscriptMap(readClassList, metadata(readClassList)$readClassDist,annotations)
} else{
NULL
}

#' Generate unique counts
#' @noRd
generateUniqueCounts <- function(readClassDt, countMatrix, annotations){
x <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match))
uniqueCounts <- countMatrix[x$eqClass.match,]
uniqueCounts.tx <- sparse.model.matrix(~ factor(x$txid) - 1)
uniqueCounts <- t(uniqueCounts.tx) %*% uniqueCounts
rownames(uniqueCounts) <- names(annotations)[match(as.numeric(levels(factor(x$txid))),mcols(annotations)$txid)]
counts <- sparseMatrix(length(annotations), ncol(uniqueCounts), x = 0)
rownames(counts) <- names(annotations)
counts[rownames(uniqueCounts),] <- uniqueCounts
return(counts)

# these three lines appear after return, so it's not used, is this used for debug only?
# counts.total = colSums(countMatrix) + colSums(incompatibleCountMatrix)
# counts.total[counts.total==0] = 1
# counts.CPM = counts/counts.total * 10^6
quantData <- constructQuantData(
sampleData = data.frame(ColData),
readClassDt = readClassDt,
incompatibleCounts = incompatibleCounts,
distTable = distTable,
readToTranscriptMap = readToTranscriptMap
)

return(quantData)
}


#' Generate incompatible counts
#' @noRd
generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){
Comment thread
lingminhao marked this conversation as resolved.
genes <- levels(factor(unique(mcols(annotations)$GENEID)))
rownames(incompatibleCountMatrix) <- genes[as.numeric(rownames(incompatibleCountMatrix))]
geneMat <- sparseMatrix(length(genes), ncol(incompatibleCountMatrix), x = 0)
rownames(geneMat) <- genes
Comment thread
lingminhao marked this conversation as resolved.
geneMat[rownames(incompatibleCountMatrix),] <- incompatibleCountMatrix
return(geneMat)
}


#' Generate non-unique counts
#' @noRd
generateNonUniqueCounts <- function(readClassDt, countMatrix, annotations){
#fuse multi align RCs by gene
x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match))
x <- x %>% distinct(eqClassId, .keep_all = TRUE)
nonuniqueCounts <- countMatrix[x$eqClass.match,, drop = FALSE]
if(nrow(x)>1 & length(unique(x$gene_sid))>1){
nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1)
nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts
} else{
warning("The factor variable 'gene_sid' has only one level. Adjusting output.")
nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE)
nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts
}
#covert ids into gene ids
geneids <- as.numeric(levels(factor(x$gene_sid)))
geneids <- x$txid[match(geneids, x$gene_sid)]
geneids <- mcols(annotations)$GENEID[as.numeric(geneids)]
rownames(nonuniqueCounts) <- geneids
#create matrix for all annotated genes
genes <- levels(factor(unique(mcols(annotations)$GENEID)))
geneMat <- sparseMatrix(length(genes), ncol(nonuniqueCounts), x = 0)
rownames(geneMat) <- genes
if(!is.null(rownames(nonuniqueCounts))){
geneMat[rownames(nonuniqueCounts),] <- nonuniqueCounts
}
return(geneMat)
colnames(geneMat) <- colnames(incompatibleCountMatrix)
geneMat[match(rownames(incompatibleCountMatrix), rownames(geneMat)), ] <- incompatibleCountMatrix
return(geneMat[unique(mcols(annotations)$GENEID), , drop = FALSE])
}
78 changes: 78 additions & 0 deletions R/bambu-classes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#' @import methods
#' @importFrom Matrix Matrix
#' @importFrom data.table data.table
#' @importClassesFrom Matrix Matrix
#' @importClassesFrom data.table data.table

# Valid values for metadata(se)$seType, which identifies the SE variant:
# EMCounts — transcript-level SE with EM-estimated counts (main bambu output)
# geneCounts — gene-level SE derived by collapsing transcript counts
# uniqueCounts — transcript-level SE with uniquely-assigned counts only, no EM
SE_TYPES <- c(
EMCounts = "EMCounts",
geneCounts = "geneCounts",
uniqueCounts = "uniqueCounts"
)

# quantData holds per-sample intermediate results from the assignDist step.
# distTable and readToTranscriptMap are optional: populated only when
# returnDistTable=TRUE or trackReads=TRUE respectively, and lifted into
# metadata(countsSe) at the end of bambu().
setClass("quantData",
slots = c(
sampleData = "data.frame", # per-sample metadata (id, sampleName)
readClassDt = "data.table", # read-class-level count and assignment data
incompatibleCounts = "sparseMatrix", # counts of reads incompatible with any annotation
distTable = "ANY", # read-class-to-transcript compatibility table (DataFrame or NULL)
readToTranscriptMap = "ANY" # per-read assignment to transcripts (tibble or NULL)
),
prototype = list(
sampleData = data.frame(id = integer(), sampleName = character()),
readClassDt = data.table::data.table()
),
validity = function(object) {
errs <- character()
if (!all(c("id", "sampleName") %in% names(object@sampleData)))
errs <- c(errs, "sampleData must have columns 'id' and 'sampleName'")
if (!is.null(object@distTable) && !is(object@distTable, "DataFrame"))
errs <- c(errs, "distTable must be a DataFrame or NULL")
if (!is.null(object@readToTranscriptMap) && !is(object@readToTranscriptMap, "tbl_df"))
errs <- c(errs, "readToTranscriptMap must be a tibble or NULL")
if (length(errs)) errs else TRUE
})

#' Construct a quantData object
#' @noRd
constructQuantData <- function(sampleData, readClassDt,
incompatibleCounts,
distTable = NULL,
readToTranscriptMap = NULL) {
new("quantData",
sampleData = sampleData,
readClassDt = readClassDt,
incompatibleCounts = incompatibleCounts,
distTable = distTable,
readToTranscriptMap = readToTranscriptMap)
}

#' @noRd
setGeneric("getSampleData", function(x) standardGeneric("getSampleData"))
#' @noRd
setGeneric("getReadClassDt", function(x) standardGeneric("getReadClassDt"))
#' @noRd
setGeneric("getIncompatibleCounts", function(x) standardGeneric("getIncompatibleCounts"))
#' @noRd
setGeneric("getDistTable", function(x) standardGeneric("getDistTable"))
#' @noRd
setGeneric("getReadToTranscriptMap", function(x) standardGeneric("getReadToTranscriptMap"))

#' @noRd
setMethod("getSampleData", "quantData", function(x) x@sampleData)
#' @noRd
setMethod("getReadClassDt", "quantData", function(x) x@readClassDt)
#' @noRd
setMethod("getIncompatibleCounts", "quantData", function(x) x@incompatibleCounts)
#' @noRd
setMethod("getDistTable", "quantData", function(x) x@distTable)
#' @noRd
setMethod("getReadToTranscriptMap", "quantData", function(x) x@readToTranscriptMap)
46 changes: 25 additions & 21 deletions R/bambu-processReads.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence,
processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode,
extractBarcodeUMI = extractBarcodeUMI, dedupUMI = dedupUMI, index = 1)},
BPPARAM = bpParameters)
names(readClassList) <- names(reads)
} else {
readGrgList <- bplapply(seq_along(reads), function(i) {
bambu.readsByFile(bam.file = reads[i],
Expand Down Expand Up @@ -162,22 +163,23 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations,

mcols(readGrgList)$id <- seq_along(readGrgList)

if(extractBarcodeUMI){
mcols(readGrgList)$sampleID <- as.numeric(mcols(readGrgList)$CB)
if(extractBarcodeUMI){
mcols(readGrgList)$columnID <- as.numeric(mcols(readGrgList)$CB)
} else {
mcols(readGrgList)$sampleID <- index
mcols(readGrgList)$columnID <- index
}

runName <- names(bam.file)[1]
# construct read classes for each chromosome seperately
if(processByChromosome){
se <- lowMemoryConstructReadClasses(readGrgList, genomeSequence,
annotations, stranded, verbose,bam.file)
annotations, stranded, verbose, bam.file)
} else{
unlisted_junctions <- unlistIntrons(readGrgList, use.ids = TRUE)
uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions,
annotations,genomeSequence, stranded = stranded, verbose = verbose)
se <- isore.constructReadClasses(readGrgList,
unlisted_junctions, uniqueJunctions, runName = "TODO",
unlisted_junctions, uniqueJunctions, runName = runName,
annotations, stranded, verbose)

}
Expand Down Expand Up @@ -297,18 +299,18 @@ bambu.readsByFile <- function(bam.file, genomeSequence, annotations,
constructReadClasses <- function(readGrgList, genomeSequence, annotations,
stranded = FALSE, min.readCount = 2,
fitReadClassModel = TRUE, min.exonOverlap = 10, defaultModels = NULL, returnModel = FALSE,
verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE){
verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE, runName = "sample"){

if(processByChromosome){
# construct read classes for each chromosome seperately
se <- lowMemoryConstructReadClasses(readGrgList, genomeSequence,
annotations, stranded, verbose,"TODO", fusionMode)
annotations, stranded, verbose, runName, fusionMode)
Comment thread
lingminhao marked this conversation as resolved.
} else{
unlisted_junctions <- unlistIntrons(readGrgList, use.ids = TRUE)
uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions,
annotations,genomeSequence, stranded = stranded, verbose = verbose)
se <- isore.constructReadClasses(readGrgList,
unlisted_junctions, uniqueJunctions, runName = "TODO",
unlisted_junctions, uniqueJunctions, runName = runName,
annotations, stranded, verbose)

}
Expand Down Expand Up @@ -336,21 +338,22 @@ constructReadClasses <- function(readGrgList, genomeSequence, annotations,
#' Low memory mode for construct read classes (processByChromosome)
#' @noRd
lowMemoryConstructReadClasses <- function(readGrgList, genomeSequence,
annotations, stranded, verbose,bam.file, fusionMode = FALSE){
annotations, stranded, verbose, bam.file, fusionMode = FALSE){
if(fusionMode){
readGrgList <- list(readGrgList)
names(readGrgList) <- c("fusion")
} else{
readGrgList <- split(readGrgList, getChrFromGrList(readGrgList))
}
runName <- names(bam.file)[1]
se <- lapply(names(readGrgList),FUN = function(i){
if(length(readGrgList[[i]]) == 0) return(NULL)
# create error and strand corrected junction tables
unlisted_junctions <- unlistIntrons(readGrgList[[i]], use.ids = TRUE)
uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions,
annotations,genomeSequence, stranded = stranded, verbose = verbose)
se.temp <- isore.constructReadClasses(readGrgList[[i]],
unlisted_junctions, uniqueJunctions, runName = "TODO",
unlisted_junctions, uniqueJunctions, runName = runName,
annotations, stranded, verbose)
return(se.temp)
})
Expand Down Expand Up @@ -385,10 +388,12 @@ splitReadClassFiles = function(readClassFile){
distTable <- metadata(metadata(readClassFile)$readClassDist)$distTable
eqClasses <- distTable %>% group_by(eqClassById) %>%
distinct(eqClassById, readCount,GENEID, totalWidth, firstExonWidth, .keep_all = TRUE)
eqClasses$sampleIDs <- rowData(readClassFile)$sampleIDs[match(eqClasses$readClassId, rownames(readClassFile))]
eqClasses$columnIds <- rowData(readClassFile)$columnIds[match(eqClasses$readClassId, rownames(readClassFile))]
eqClasses <- eqClasses %>% summarise(nobs = sum(readCount),
sampleIDs = list(unlist(sampleIDs)))
counts.table <- tableFunction(eqClasses$sampleIDs)
columnIds = list(unlist(columnIds)))
counts.table <- tableFunction(eqClasses$columnIds)
metadata(readClassFile)$columnIds <- lapply(counts.table, function(x) as.numeric(names(x)))
metadata(readClassFile)$columnCounts <- lapply(counts.table, function(x) as.numeric(x))
Comment thread
lingminhao marked this conversation as resolved.
counts <- sparseMatrix(
i = rep(seq_along(counts.table), lengths(counts.table)),
j = as.numeric(names(unlist(counts.table))),
Expand All @@ -397,14 +402,14 @@ splitReadClassFiles = function(readClassFile){
#incompatible counts
distTable <- metadata(metadata(readClassFile)$readClassDist)$distTable.incompatible
if(nrow(distTable)==0) {
counts.incompatible <- sparseMatrix(i= 1, j = 1, x = 0,
dims = c(1, length(metadata(readClassFile)$sampleData$id)))
rownames(counts.incompatible) <- "TODO"
counts.incompatible <- sparseMatrix(i= integer(0), j = integer(0), x = numeric(0),
dims = c(0, length(metadata(readClassFile)$sampleData$id)))
rownames(counts.incompatible) <- character(0)
} else{
distTable$sampleIDs <- rowData(readClassFile)$sampleIDs[match(distTable$readClassId, rownames(readClassFile))]
distTable$columnIds <- rowData(readClassFile)$columnIds[match(distTable$readClassId, rownames(readClassFile))]
distTable <- distTable %>% group_by(GENEID.i) %>% summarise(counts = sum(readCount),
sampleIDs = list(unlist(sampleIDs)))
counts.table <- lapply(distTable$sampleIDs, FUN = function(x){table(x)})
columnIds = list(unlist(columnIds)))
counts.table <- lapply(distTable$columnIds, FUN = function(x){table(x)})
counts.incompatible <- sparseMatrix(
i = rep(seq_along(counts.table), lengths(counts.table)),
j = as.numeric(names(unlist(counts.table))),
Expand All @@ -416,7 +421,6 @@ splitReadClassFiles = function(readClassFile){
colnames(counts) <- metadata(readClassFile)$sampleData$id
metadata(readClassFile)$eqClassById <- eqClasses$eqClassById
#rownames(counts) = eqClasses$eqClassById
metadata(readClassFile)$countMatrix <- counts
metadata(readClassFile)$incompatibleCountMatrix <- counts.incompatible
return(readClassFile)
}
Expand All @@ -426,7 +430,7 @@ splitReadClassFiles = function(readClassFile){
#' @importFrom Matrix
#' @noRd
splitReadClassFilesByRC <- function(readClassFile){
counts.table <- tableFunction(rowData(readClassFile)$sampleIDs)
counts.table <- tableFunction(rowData(readClassFile)$columnIds)
counts <- sparseMatrix(
i = rep(seq_along(counts.table), lengths(counts.table)),
j = as.numeric(names(unlist(counts.table))),
Expand Down
Loading
Loading