Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions R/summariseExpression.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#' Summarise transcript expression to a specified unit
#' @title summarise by expression
#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
#' @param type character, the unit to summarise by. Currently supports
#' \code{"exon"} (default) and \code{"gene"}.
#' @return A \code{RangedSummarizedExperiment} with expression summarised to
#' the requested unit. Assays \code{counts} and \code{CPM} are always
#' present; \code{fullLengthCounts} and \code{uniqueCounts} are included
#' when present in the input. For \code{type = "exon"}, rows are named
#' \code{EX1, EX2, ...} and \code{rowRanges} mcols include \code{GENEID},
#' \code{txNames}, and \code{exonClass} (\code{"annotated"} or
#' \code{"novel"}). For \code{type = "gene"}, rows are named by GENEID and
#' \code{rowRanges} is a \code{GRangesList} of reduced exon ranges per gene;
#' mcols include \code{GENEID} and \code{newGeneClass}.
#' @details Counts are summed across all transcripts belonging to the same
#' group (e.g. identical exon locus or gene). CPM is recomputed from the
#' aggregated counts. For \code{type = "exon"}, an exon is \code{"novel"}
#' only if all contributing transcripts have \code{novelTranscript = TRUE}.
#' For \code{type = "gene"}, \code{newGeneClass} is derived from
#' \code{txClassDescription}: Ensembl genes (GENEID matching \code{"ENSG"})
#' are labelled \code{"annotation"}, others inherit the transcript class.
#' Incompatible and non-unique counts stored in \code{metadata(se)} are
#' added to the aggregated counts before CPM computation.
#' @examples
#' se <- readRDS(system.file("extdata",
#' "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds",
#' package = "bambu"
#' ))
#' summariseByExpression(se, type = "exon")
#' summariseByExpression(se, type = "gene")
#' @importFrom Matrix sparseMatrix Matrix
#' @importFrom S4Vectors SimpleList metadata
#' @importFrom SummarizedExperiment rowRanges rowData assays colData SummarizedExperiment
#' @importFrom GenomicRanges GRanges seqnames start end strand mcols
#' @importFrom IRanges IRanges
#' @importFrom dplyr left_join mutate group_by summarise cur_group_id ungroup
#' @export
summariseByExpression <- function(se, type = "exon") {
# build the grouping index and output ranges for the requested type
index <- switch(type,
exon = buildExonIndex(se),
gene = buildGeneIndex(se),
stop("Unsupported type: '", type, "'")
)

# sparse aggregation matrix: rows = groups, cols = transcripts
# multiplying by transcript counts gives group-level counts
txNames <- rownames(se)
nGroups <- length(index$outRanges)
groupNames <- names(index$outRanges)

aggregationMat <- sparseMatrix(
i = index$indexDf$group_idx,
j = match(index$indexDf$TXNAME, txNames),
x = 1L,
dims = c(nGroups, length(txNames)),
dimnames = list(groupNames, txNames)
)

# aggregate transcript counts to group level
counts <- aggregationMat %*% assays(se)$counts

# for gene level, add reads that could not be assigned to any transcript
# but were localised to a gene, stored in metadata by bambu
if (type == "gene" && !is.null(metadata(se)$incompatibleCounts)) {
incompat <- metadata(se)$incompatibleCounts
if ("nonuniqueCounts" %in% names(metadata(se)))
incompat <- incompat + metadata(se)$nonuniqueCounts
incompat <- Matrix(
incompat[match(rownames(counts), rownames(incompat)), ],
sparse = TRUE
)
counts <- counts + incompat
}

# compute CPM from aggregated counts
counts.total <- colSums(counts)
counts.total[counts.total == 0] <- 1
cpm <- counts / counts.total * 10^6

# aggregate optional assays if present
outAssays <- SimpleList(counts = counts, CPM = cpm)
for (i in c("fullLengthCounts", "uniqueCounts")) {
if (i %in% names(assays(se))) {
outAssays[[i]] <- aggregationMat %*% assays(se)[[i]]
}
}

SummarizedExperiment(
assays = outAssays,
rowRanges = index$outRanges,
colData = colData(se)
)
}

#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
#' @return a list with:
#' \code{indexDf} — data.frame with columns TXNAME and group_idx
#' (one row per exon-transcript pair);
#' \code{outRanges} — GRanges with one entry per unique exon locus,
#' named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass.
#' @noRd
buildExonIndex <- function(se) {
# one row per exon-transcript pair, joined with transcript metadata
txDf <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "novelTranscript"), drop = FALSE]
exonRanges <- unlist(rowRanges(se), use.names = TRUE)
flatDf <- data.frame(
TXNAME = names(exonRanges),
seqnames = as.character(seqnames(exonRanges)),
start = start(exonRanges),
end = end(exonRanges),
strand = as.character(strand(exonRanges)),
stringsAsFactors = FALSE
) %>%
left_join(txDf, by = "TXNAME") %>%
# assign each unique exon locus a group index
group_by(seqnames, start, end, strand) %>%
mutate(group_idx = cur_group_id()) %>%
ungroup()

# one row per unique exon locus with summarised metadata
groupMeta <- flatDf %>%
group_by(group_idx) %>%
summarise(
seqnames = seqnames[1],
start = start[1],
end = end[1],
strand = strand[1],
GENEID = paste(sort(unique(GENEID)), collapse = ","),
txNames = paste(sort(unique(TXNAME)), collapse = ","),
exonClass = ifelse(all(novelTranscript), "novel", "annotated"),
.groups = "drop"
)

# build output GRanges with exon metadata
outRanges <- GRanges(
seqnames = groupMeta$seqnames,
ranges = IRanges(start = groupMeta$start, end = groupMeta$end),
strand = groupMeta$strand
)
names(outRanges) <- paste0("EX", seq_len(nrow(groupMeta)))
mcols(outRanges)$GENEID <- groupMeta$GENEID
mcols(outRanges)$txNames <- groupMeta$txNames
mcols(outRanges)$exonClass <- groupMeta$exonClass

list(
indexDf = flatDf[, c("TXNAME", "group_idx")],
outRanges = outRanges
)
}

#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
#' @return a list with:
#' \code{indexDf} — data.frame with columns TXNAME and group_idx
#' (one row per transcript);
#' \code{outRanges} — GRangesList with one element per gene, named by
#' GENEID, with mcols GENEID and newGeneClass.
#' @noRd
buildGeneIndex <- function(se) {
# one row per transcript; group_idx preserves first-appearance order of GENEIDs
rd <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "txClassDescription"), drop = FALSE] %>%
mutate(group_idx = match(GENEID, unique(GENEID)))

# one row per gene with class label derived from txClassDescription
groupMeta <- rd %>%
group_by(group_idx) %>%
summarise(
GENEID = GENEID[1],
newGeneClass = ifelse(grepl("ENSG", GENEID[1]), "annotation", unique(txClassDescription)[1]),
.groups = "drop"
)

# build output GRangesList of reduced exon ranges per gene
outRanges <- reducedRangesByGenes(rowRanges(se))[groupMeta$GENEID]
mcols(outRanges)$GENEID <- groupMeta$GENEID
mcols(outRanges)$newGeneClass <- groupMeta$newGeneClass

list(
indexDf = rd[, c("TXNAME", "group_idx")],
outRanges = outRanges
)
}
57 changes: 57 additions & 0 deletions tests/testthat/test_summariseExpression.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
context("Summarise expression")

test_that("summariseByExpression correctly reduces transcript counts to exon and gene counts", {
# TX1: exons at 1-100, 200-300 -> GENE1, count = 10
# TX2: exons at 1-100, 400-500 -> GENE1, shares first exon with TX1, count = 5
# TX3: exon at 600-700 -> GENE2, count = 8
#
# Expected exon counts:
# chr1:1-100 (TX1 + TX2) -> 15
# chr1:200-300 (TX1 only) -> 10
# chr1:400-500 (TX2 only) -> 5
# chr1:600-700 (TX3 only) -> 8
#
# Expected gene counts:
# GENE1 (TX1 + TX2) -> 15
# GENE2 (TX3 only) -> 8

txRanges <- GRangesList(
TX1 = GRanges("chr1", IRanges(c(1, 200), c(100, 300)), strand = "+"),
TX2 = GRanges("chr1", IRanges(c(1, 400), c(100, 500)), strand = "+"),
TX3 = GRanges("chr1", IRanges(600, 700), strand = "+")
)
mcols(txRanges)$TXNAME <- c("TX1", "TX2", "TX3")
mcols(txRanges)$GENEID <- c("GENE1", "GENE1", "GENE2")
mcols(txRanges)$novelTranscript <- c(FALSE, FALSE, FALSE)
mcols(txRanges)$txClassDescription <- c("annotation", "annotation", "annotation")

counts <- matrix(c(10, 5, 8), nrow = 3, ncol = 1,
dimnames = list(c("TX1", "TX2", "TX3"), "sample1"))

se <- SummarizedExperiment(
assays = list(counts = counts),
rowRanges = txRanges
)

# exon level
seExon <- summariseByExpression(se, type = "exon")
exonCounts <- as.matrix(assays(seExon)$counts)
exonRanges <- rowRanges(seExon)

shared <- which(start(exonRanges) == 1 & end(exonRanges) == 100)
tx1only <- which(start(exonRanges) == 200 & end(exonRanges) == 300)
tx2only <- which(start(exonRanges) == 400 & end(exonRanges) == 500)
tx3only <- which(start(exonRanges) == 600 & end(exonRanges) == 700)

expect_equal(exonCounts[shared, 1], 15)
expect_equal(exonCounts[tx1only, 1], 10)
expect_equal(exonCounts[tx2only, 1], 5)
expect_equal(exonCounts[tx3only, 1], 8)

# gene level
seGene <- summariseByExpression(se, type = "gene")
geneCounts <- as.matrix(assays(seGene)$counts)

expect_equal(geneCounts["GENE1", 1], 15)
expect_equal(geneCounts["GENE2", 1], 8)
})
Loading