diff --git a/R/summariseExpression.R b/R/summariseExpression.R new file mode 100644 index 00000000..05597890 --- /dev/null +++ b/R/summariseExpression.R @@ -0,0 +1,182 @@ +#' Summarise transcript expression to a specified unit +#' @title summarise by expression +#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} +#' @param type character, the unit to summarise by. Currently supports +#' \code{"exon"} (default) and \code{"gene"}. +#' @return A \code{RangedSummarizedExperiment} with expression summarised to +#' the requested unit. Assays \code{counts} and \code{CPM} are always +#' present; \code{fullLengthCounts} and \code{uniqueCounts} are included +#' when present in the input. For \code{type = "exon"}, rows are named +#' \code{EX1, EX2, ...} and \code{rowRanges} mcols include \code{GENEID}, +#' \code{txNames}, and \code{exonClass} (\code{"annotated"} or +#' \code{"novel"}). For \code{type = "gene"}, rows are named by GENEID and +#' \code{rowRanges} is a \code{GRangesList} of reduced exon ranges per gene; +#' mcols include \code{GENEID} and \code{newGeneClass}. +#' @details Counts are summed across all transcripts belonging to the same +#' group (e.g. identical exon locus or gene). CPM is recomputed from the +#' aggregated counts. For \code{type = "exon"}, an exon is \code{"novel"} +#' only if all contributing transcripts have \code{novelTranscript = TRUE}. +#' For \code{type = "gene"}, \code{newGeneClass} is derived from +#' \code{txClassDescription}: Ensembl genes (GENEID matching \code{"ENSG"}) +#' are labelled \code{"annotation"}, others inherit the transcript class. +#' Incompatible and non-unique counts stored in \code{metadata(se)} are +#' added to the aggregated counts before CPM computation. +#' @examples +#' se <- readRDS(system.file("extdata", +#' "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds", +#' package = "bambu" +#' )) +#' summariseByExpression(se, type = "exon") +#' summariseByExpression(se, type = "gene") +#' @importFrom Matrix sparseMatrix Matrix +#' @importFrom S4Vectors SimpleList metadata +#' @importFrom SummarizedExperiment rowRanges rowData assays colData SummarizedExperiment +#' @importFrom GenomicRanges GRanges seqnames start end strand mcols +#' @importFrom IRanges IRanges +#' @importFrom dplyr left_join mutate group_by summarise cur_group_id ungroup +#' @export +summariseByExpression <- function(se, type = "exon") { + # build the grouping index and output ranges for the requested type + index <- switch(type, + exon = buildExonIndex(se), + gene = buildGeneIndex(se), + stop("Unsupported type: '", type, "'") + ) + + # sparse aggregation matrix: rows = groups, cols = transcripts + # multiplying by transcript counts gives group-level counts + txNames <- rownames(se) + nGroups <- length(index$outRanges) + groupNames <- names(index$outRanges) + + aggregationMat <- sparseMatrix( + i = index$indexDf$group_idx, + j = match(index$indexDf$TXNAME, txNames), + x = 1L, + dims = c(nGroups, length(txNames)), + dimnames = list(groupNames, txNames) + ) + + # aggregate transcript counts to group level + counts <- aggregationMat %*% assays(se)$counts + + # for gene level, add reads that could not be assigned to any transcript + # but were localised to a gene, stored in metadata by bambu + if (type == "gene" && !is.null(metadata(se)$incompatibleCounts)) { + incompat <- metadata(se)$incompatibleCounts + if ("nonuniqueCounts" %in% names(metadata(se))) + incompat <- incompat + metadata(se)$nonuniqueCounts + incompat <- Matrix( + incompat[match(rownames(counts), rownames(incompat)), ], + sparse = TRUE + ) + counts <- counts + incompat + } + + # compute CPM from aggregated counts + counts.total <- colSums(counts) + counts.total[counts.total == 0] <- 1 + cpm <- counts / counts.total * 10^6 + + # aggregate optional assays if present + outAssays <- SimpleList(counts = counts, CPM = cpm) + for (i in c("fullLengthCounts", "uniqueCounts")) { + if (i %in% names(assays(se))) { + outAssays[[i]] <- aggregationMat %*% assays(se)[[i]] + } + } + + SummarizedExperiment( + assays = outAssays, + rowRanges = index$outRanges, + colData = colData(se) + ) +} + +#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} +#' @return a list with: +#' \code{indexDf} — data.frame with columns TXNAME and group_idx +#' (one row per exon-transcript pair); +#' \code{outRanges} — GRanges with one entry per unique exon locus, +#' named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass. +#' @noRd +buildExonIndex <- function(se) { + # one row per exon-transcript pair, joined with transcript metadata + txDf <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "novelTranscript"), drop = FALSE] + exonRanges <- unlist(rowRanges(se), use.names = TRUE) + flatDf <- data.frame( + TXNAME = names(exonRanges), + seqnames = as.character(seqnames(exonRanges)), + start = start(exonRanges), + end = end(exonRanges), + strand = as.character(strand(exonRanges)), + stringsAsFactors = FALSE + ) %>% + left_join(txDf, by = "TXNAME") %>% + # assign each unique exon locus a group index + group_by(seqnames, start, end, strand) %>% + mutate(group_idx = cur_group_id()) %>% + ungroup() + + # one row per unique exon locus with summarised metadata + groupMeta <- flatDf %>% + group_by(group_idx) %>% + summarise( + seqnames = seqnames[1], + start = start[1], + end = end[1], + strand = strand[1], + GENEID = paste(sort(unique(GENEID)), collapse = ","), + txNames = paste(sort(unique(TXNAME)), collapse = ","), + exonClass = ifelse(all(novelTranscript), "novel", "annotated"), + .groups = "drop" + ) + + # build output GRanges with exon metadata + outRanges <- GRanges( + seqnames = groupMeta$seqnames, + ranges = IRanges(start = groupMeta$start, end = groupMeta$end), + strand = groupMeta$strand + ) + names(outRanges) <- paste0("EX", seq_len(nrow(groupMeta))) + mcols(outRanges)$GENEID <- groupMeta$GENEID + mcols(outRanges)$txNames <- groupMeta$txNames + mcols(outRanges)$exonClass <- groupMeta$exonClass + + list( + indexDf = flatDf[, c("TXNAME", "group_idx")], + outRanges = outRanges + ) +} + +#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} +#' @return a list with: +#' \code{indexDf} — data.frame with columns TXNAME and group_idx +#' (one row per transcript); +#' \code{outRanges} — GRangesList with one element per gene, named by +#' GENEID, with mcols GENEID and newGeneClass. +#' @noRd +buildGeneIndex <- function(se) { + # one row per transcript; group_idx preserves first-appearance order of GENEIDs + rd <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "txClassDescription"), drop = FALSE] %>% + mutate(group_idx = match(GENEID, unique(GENEID))) + + # one row per gene with class label derived from txClassDescription + groupMeta <- rd %>% + group_by(group_idx) %>% + summarise( + GENEID = GENEID[1], + newGeneClass = ifelse(grepl("ENSG", GENEID[1]), "annotation", unique(txClassDescription)[1]), + .groups = "drop" + ) + + # build output GRangesList of reduced exon ranges per gene + outRanges <- reducedRangesByGenes(rowRanges(se))[groupMeta$GENEID] + mcols(outRanges)$GENEID <- groupMeta$GENEID + mcols(outRanges)$newGeneClass <- groupMeta$newGeneClass + + list( + indexDf = rd[, c("TXNAME", "group_idx")], + outRanges = outRanges + ) +} diff --git a/tests/testthat/test_summariseExpression.R b/tests/testthat/test_summariseExpression.R new file mode 100644 index 00000000..e77b22c2 --- /dev/null +++ b/tests/testthat/test_summariseExpression.R @@ -0,0 +1,57 @@ +context("Summarise expression") + +test_that("summariseByExpression correctly reduces transcript counts to exon and gene counts", { + # TX1: exons at 1-100, 200-300 -> GENE1, count = 10 + # TX2: exons at 1-100, 400-500 -> GENE1, shares first exon with TX1, count = 5 + # TX3: exon at 600-700 -> GENE2, count = 8 + # + # Expected exon counts: + # chr1:1-100 (TX1 + TX2) -> 15 + # chr1:200-300 (TX1 only) -> 10 + # chr1:400-500 (TX2 only) -> 5 + # chr1:600-700 (TX3 only) -> 8 + # + # Expected gene counts: + # GENE1 (TX1 + TX2) -> 15 + # GENE2 (TX3 only) -> 8 + + txRanges <- GRangesList( + TX1 = GRanges("chr1", IRanges(c(1, 200), c(100, 300)), strand = "+"), + TX2 = GRanges("chr1", IRanges(c(1, 400), c(100, 500)), strand = "+"), + TX3 = GRanges("chr1", IRanges(600, 700), strand = "+") + ) + mcols(txRanges)$TXNAME <- c("TX1", "TX2", "TX3") + mcols(txRanges)$GENEID <- c("GENE1", "GENE1", "GENE2") + mcols(txRanges)$novelTranscript <- c(FALSE, FALSE, FALSE) + mcols(txRanges)$txClassDescription <- c("annotation", "annotation", "annotation") + + counts <- matrix(c(10, 5, 8), nrow = 3, ncol = 1, + dimnames = list(c("TX1", "TX2", "TX3"), "sample1")) + + se <- SummarizedExperiment( + assays = list(counts = counts), + rowRanges = txRanges + ) + + # exon level + seExon <- summariseByExpression(se, type = "exon") + exonCounts <- as.matrix(assays(seExon)$counts) + exonRanges <- rowRanges(seExon) + + shared <- which(start(exonRanges) == 1 & end(exonRanges) == 100) + tx1only <- which(start(exonRanges) == 200 & end(exonRanges) == 300) + tx2only <- which(start(exonRanges) == 400 & end(exonRanges) == 500) + tx3only <- which(start(exonRanges) == 600 & end(exonRanges) == 700) + + expect_equal(exonCounts[shared, 1], 15) + expect_equal(exonCounts[tx1only, 1], 10) + expect_equal(exonCounts[tx2only, 1], 5) + expect_equal(exonCounts[tx3only, 1], 8) + + # gene level + seGene <- summariseByExpression(se, type = "gene") + geneCounts <- as.matrix(assays(seGene)$counts) + + expect_equal(geneCounts["GENE1", 1], 15) + expect_equal(geneCounts["GENE2", 1], 8) +})