Skip to content

Commit fdf8db5

Browse files
authored
Merge pull request #542 from GoekeLab/generateColData
Add sampleData argument to store multi sample's metadata information (bulk, single-cell, spatial) in se object
2 parents 63c1412 + 01f4ff8 commit fdf8db5

File tree

6 files changed

+144
-124
lines changed

6 files changed

+144
-124
lines changed

R/bambu-assignDist.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#' @import data.table
44
#' @noRd
55
assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParameters,
6-
verbose, demultiplexed, spatial,
6+
verbose, sampleMetadata, demultiplexed,
77
returnDistTable = FALSE, trackReads = TRUE) {
88
if (is.character(readClassList)) readClassList <- readRDS(file = readClassList)
99
metadata(readClassList)$readClassDist <- calculateDistTable(readClassList, annotations, isoreParameters, verbose, returnDistTable)
@@ -17,7 +17,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame
1717
mutate(aval = 1) %>%
1818
data.table()
1919
#return non-em counts
20-
ColData <- generateColData(colnames(metadata(readClassList)$countMatrix), clusters = NULL, demultiplexed, spatial)
20+
ColData <- generateColData(readClassList, sampleMetadata, demultiplexed)
2121
quantData <- SummarizedExperiment(assays = SimpleList(
2222
counts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)),
2323
rowRanges = annotations,
@@ -32,7 +32,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame
3232
metadata(quantData)$readClassDt <- readClassDt
3333
metadata(quantData)$countMatrix <- metadata(readClassList)$countMatrix
3434
metadata(quantData)$incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix
35-
metadata(quantData)$sampleNames <- metadata(readClassList)$sampleNames
35+
metadata(quantData)$sampleName <- metadata(readClassList)$sampleData$sampleName
3636
if(returnDistTable)
3737
metadata(quantData)$distTable <- metadata(metadata(readClassList)$readClassDist)$distTableOld
3838

R/bambu-processReads.R

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence,
5454
returnModel <- isoreParameters[["returnModel"]]
5555
min.exonOverlap <- isoreParameters[["min.exonOverlap"]]
5656

57-
if(processByBam){ # bulk mode
57+
if(processByBam){
5858
readClassList <- bplapply(seq_along(reads), function(i) {
5959
bambu.processReadsByFile(bam.file = reads[i],
6060
genomeSequence = genomeSequence,annotations = annotations,
@@ -64,7 +64,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence,
6464
processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode,
6565
demultiplexed = demultiplexed, cleanReads = cleanReads, dedupUMI = dedupUMI, index = 1, barcodesToFilter = barcodesToFilter)},
6666
BPPARAM = bpParameters)
67-
} else { # single cell mode
67+
} else {
6868
readGrgList <- bplapply(seq_along(reads), function(i) {
6969
bambu.readsByFile(bam.file = reads[i],
7070
genomeSequence = genomeSequence,annotations = annotations,
@@ -173,13 +173,6 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations,
173173

174174
mcols(readGrgList)$id <- seq_along(readGrgList)
175175

176-
sampleName <- names(bam.file)[1]
177-
if(!isFALSE(demultiplexed)){
178-
mcols(readGrgList)$CB <- paste0(sampleName, '_', mcols(readGrgList)$CB)
179-
} else{
180-
mcols(readGrgList)$CB <- sampleName
181-
}
182-
mcols(readGrgList)$CB <- as.factor(mcols(readGrgList)$CB)
183176
if(!isFALSE(demultiplexed)){
184177
mcols(readGrgList)$sampleID <- as.numeric(mcols(readGrgList)$CB)
185178
} else {
@@ -217,9 +210,20 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations,
217210
fusionMode = fusionMode,
218211
verbose = verbose)
219212

220-
metadata(se)$samples <- names(bam.file)[1]
221-
metadata(se)$sampleNames <- names(bam.file)[1]
222-
if(!isFALSE(demultiplexed)) metadata(se)$samples <- levels(mcols(readGrgList)$CB)
213+
if (demultiplexed) {
214+
barcodes <- levels(mcols(readGrgList)$CB)
215+
metadata(se)$sampleData <- tibble(
216+
id = paste(names(bam.file)[1], barcodes, sep = '_'),
217+
sampleName = names(bam.file)[1],
218+
barcode = barcodes
219+
)
220+
} else{
221+
metadata(se)$sampleData <- tibble(
222+
id = names(bam.file)[1],
223+
sampleName = names(bam.file)[1]
224+
)
225+
}
226+
223227
return(se)
224228
}
225229

@@ -307,7 +311,6 @@ constructReadClasses <- function(readGrgList, genomeSequence, annotations,
307311
stranded = FALSE, min.readCount = 2,
308312
fitReadClassModel = TRUE, min.exonOverlap = 10, defaultModels = NULL, returnModel = FALSE,
309313
verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE){
310-
warnings <- c() ###TODO
311314

312315
if(processByChromosome){
313316
# construct read classes for each chromosome seperately
@@ -403,12 +406,12 @@ splitReadClassFiles = function(readClassFile){
403406
i = rep(seq_along(counts.table), lengths(counts.table)),
404407
j = as.numeric(names(unlist(counts.table))),
405408
x = unlist(counts.table),
406-
dims = c(nrow(eqClasses), length(metadata(readClassFile)$samples)))
409+
dims = c(nrow(eqClasses), nrow(metadata(readClassFile)$sampleData)))
407410
#incompatible counts
408411
distTable <- metadata(metadata(readClassFile)$readClassDist)$distTable.incompatible
409412
if(nrow(distTable)==0) {
410413
counts.incompatible <- sparseMatrix(i= 1, j = 1, x = 0,
411-
dims = c(1, length(metadata(readClassFile)$samples)))
414+
dims = c(1, length(metadata(readClassFile)$sampleData$id)))
412415
rownames(counts.incompatible) <- "TODO"
413416
} else{
414417
distTable$sampleIDs <- rowData(readClassFile)$sampleIDs[match(distTable$readClassId, rownames(readClassFile))]
@@ -419,11 +422,11 @@ splitReadClassFiles = function(readClassFile){
419422
i = rep(seq_along(counts.table), lengths(counts.table)),
420423
j = as.numeric(names(unlist(counts.table))),
421424
x = unlist(counts.table),
422-
dims = c(nrow(distTable), length(metadata(readClassFile)$samples)))
423-
colnames(counts.incompatible) <- metadata(readClassFile)$samples
425+
dims = c(nrow(distTable), length(metadata(readClassFile)$sampleData$id)))
426+
colnames(counts.incompatible) <- metadata(readClassFile)$sampleData$id
424427
rownames(counts.incompatible) <- distTable$GENEID.i
425428
}
426-
colnames(counts) <- metadata(readClassFile)$samples
429+
colnames(counts) <- metadata(readClassFile)$sampleData$id
427430
metadata(readClassFile)$eqClassById <- eqClasses$eqClassById
428431
#rownames(counts) = eqClasses$eqClassById
429432
metadata(readClassFile)$countMatrix <- counts

R/bambu-processReads_utilityConstructReadClasses.R

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,24 +25,28 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions,
2525
Please report this")
2626
start.ptm <- proc.time()
2727
if(!is.null(uniqueJunctions)){
28-
exonsByRC.spliced <- constructSplicedReadClasses(
29-
uniqueJunctions = uniqueJunctions,
30-
unlisted_junctions = unlisted_junctions,
31-
readGrgList = readGrgList,
32-
stranded = stranded)}
33-
else{exonsByRC.spliced = GRangesList()}
28+
exonsByRC.spliced <- constructSplicedReadClasses(
29+
uniqueJunctions = uniqueJunctions,
30+
unlisted_junctions = unlisted_junctions,
31+
readGrgList = readGrgList,
32+
stranded = stranded)
33+
} else{
34+
exonsByRC.spliced = GRangesList()
35+
}
3436
end.ptm <- proc.time()
3537
rm(readGrgList, unlisted_junctions, uniqueJunctions)
3638
if (verbose)
3739
message("Finished creating transcript models (read classes) for reads with ",
3840
"spliced junctions in ", round((end.ptm - start.ptm)[3] / 60, 1)," mins.")
3941
if(length(reads.singleExon)==0) {
4042
exonsByRC.unspliced <- NULL
41-
} else {exonsByRC.unspliced <- constructUnsplicedReadClasses(reads.singleExon,
42-
annotations, exonsByRC.spliced, stranded, verbose)}
43+
} else {
44+
exonsByRC.unspliced <- constructUnsplicedReadClasses(reads.singleExon,
45+
annotations, exonsByRC.spliced, stranded, verbose)
46+
}
4347
exonsByRC <- c(exonsByRC.spliced, exonsByRC.unspliced)
4448
colDataDf <- DataFrame(name = runName, row.names = runName)
45-
#TODO later remove assays = SimpleList(counts = counts)
49+
4650
counts <- matrix(mcols(exonsByRC)$readCount,
4751
dimnames = list(names(exonsByRC), runName))
4852
se <- SummarizedExperiment(assays = SimpleList(counts = counts),

R/bambu.R

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@
9393
#' distTables. The output is a list with an entry for each sample.
9494
#' @param lowMemory Read classes will be processed by chromosomes when lowMemory
9595
#' is specified. This option provides an efficient way to process big samples.
96+
#' @param sampleData A character vector of paths to metadata CSV files (or \code{NA} if
97+
#' unavailable for specific samples); defaults to \code{NULL}. Files must contain a
98+
#' "sampleName" column for bulk data or a "barcode" column for single-cell/spatial data.
99+
#' For bulk data, one metadata CSV file for all samples is sufficient, whereas single-cell/spatial
100+
#' data requires one metadata CSV file per sample.
96101
#' @param fusionMode A logical variable indicating whether run in fusion mode
97102
#' @param verbose A logical variable indicating whether processing messages will
98103
#' be printed.
@@ -138,8 +143,8 @@
138143
bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL,
139144
mode = NULL, opt.discovery = NULL, opt.em = NULL, rcOutDir = NULL, discovery = TRUE,
140145
assignDist = TRUE, quant = TRUE, stranded = FALSE, ncore = 1, yieldSize = NULL,
141-
trackReads = FALSE, returnDistTable = FALSE, lowMemory = FALSE,
142-
fusionMode = FALSE, verbose = FALSE, demultiplexed = FALSE, spatial = NULL, quantData = NULL,
146+
trackReads = FALSE, returnDistTable = FALSE, lowMemory = FALSE, sampleData = NULL,
147+
fusionMode = FALSE, verbose = FALSE, demultiplexed = FALSE, quantData = NULL,
143148
sampleNames = NULL, cleanReads = FALSE, dedupUMI = FALSE, barcodesToFilter = NULL, clusters = NULL,
144149
processByChromosome = FALSE, processByBam = TRUE) {
145150
message(paste0("Running Bambu-v", "3.9.0"))
@@ -173,7 +178,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL,
173178
annotations <- checkInputs(annotations, reads,
174179
readClass.outputDir = rcOutDir,
175180
genomeSequence = genome, discovery = discovery,
176-
sampleNames = sampleNames, spatial = spatial,quantData = quantData)
181+
sampleNames = sampleNames, sampleData = sampleData, quantData = quantData)
177182
}
178183
isoreParameters <- setIsoreParameters(isoreParameters = opt.discovery)
179184
#below line is to be compatible with earlier version of running bambu
@@ -234,16 +239,19 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL,
234239
}
235240
if(assignDist){
236241
message("--- Start calculating equivilance classes ---")
237-
quantData <- bplapply(readClassList,
238-
FUN = assignReadClasstoTranscripts,
239-
annotations = annotations,
240-
isoreParameters = isoreParameters,
241-
verbose = verbose,
242-
demultiplexed = demultiplexed,
243-
spatial = spatial,
244-
returnDistTable = returnDistTable,
245-
trackReads = trackReads,
246-
BPPARAM = bpParameters)
242+
quantData <- bplapply(seq_along(readClassList), function(i){
243+
assignReadClasstoTranscripts(
244+
readClassList = readClassList[[i]],
245+
annotations = annotations,
246+
isoreParameters = isoreParameters,
247+
verbose = verbose,
248+
# for bulk data, there is one sampleData (keep sampleData[1]), for single-cell, there is one per sample
249+
sampleMetadata = if(length(sampleData) == 1) sampleData[1] else sampleData[i],
250+
demultiplexed = demultiplexed,
251+
returnDistTable = returnDistTable,
252+
trackReads = trackReads
253+
)
254+
}, BPPARAM = bpParameters)
247255
if (!quant) return(quantData)
248256
}
249257
}
@@ -262,6 +270,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL,
262270
start.ptm <- proc.time()
263271
countsSeCompressed.all <- NULL
264272
ColNames <- c()
273+
colData.all <- list()
265274
for(i in seq_along(quantData)){
266275
quantData_i <- quantData[[i]]
267276
#load in the barcode clustering from file if provided
@@ -285,11 +294,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL,
285294
iter <- clustering
286295

287296
} else{ #if clusters is a list
288-
if(length(quantData)>1){
289-
iter <- clusters[[i]] #lowMemory mode
290-
}else{
291-
iter <- clusters#do.call(c,clusters)
292-
}
297+
iter <- clusters[[i]]
293298
}
294299
}
295300
countsSeCompressed <- bplapply(iter, FUN = function(j){ # previous i changed to j to avoid duplicated assignment
@@ -310,25 +315,27 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL,
310315
message("Total Time ", round((end.ptm - start.ptm)[3] / 60, 3), " mins.")
311316
if(!is.null(clusters)){
312317
ColNames <- c(ColNames, names(iter))
318+
colData.all[[i]] <- data.frame(
319+
id = names(countsSeCompressed),
320+
sampleName = names(countsSeCompressed),
321+
row.names = names(countsSeCompressed)
322+
)
313323
} else{
314324
ColNames <- c(ColNames, colnames(quantData_i))
325+
colData.all[[i]] <- data.frame(colData(quantData_i))
315326
}
316327
countsSeCompressed.all <- c(countsSeCompressed.all, countsSeCompressed)
317328
}
318-
countsSeCompressed.all$colnames <- ColNames
319-
countsSe <- combineCountSes(countsSeCompressed.all, annotations)
329+
names(countsSeCompressed.all) <- ColNames
330+
331+
countsSe <- combineCountSes(countsSeCompressed.all, colData.all, annotations)
320332
if(returnDistTable){
321333
distTables = list()
322334
for(i in seq_along(quantData)){
323335
distTables[[i]] <- metadata(quantData[[i]])$distTable
324336
}
325337
metadata(countsSe)$distTables <- distTables
326338
}
327-
#metadata(countsSe)$warnings = warnings
328-
329-
ColData <- generateColData(colnames(countsSe), clusters, demultiplexed, spatial)
330-
colData(countsSe) <- ColData
331-
colnames(countsSe) <- ColData[,1]
332339
return(countsSe)
333340
}
334341
}

0 commit comments

Comments
 (0)