Skip to content

Commit 1664813

Browse files
committed
refactor summarisebyExon
1 parent dbe2af9 commit 1664813

3 files changed

Lines changed: 122 additions & 81 deletions

File tree

R/summariseByExon.R

Lines changed: 0 additions & 81 deletions
This file was deleted.

R/summariseExpression.R

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#' Summarise transcript expression to exon-level expression
2+
#' @title summarise by exon
3+
#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
4+
#' @return A \code{RangedSummarizedExperiment} with exon-level expression.
5+
#' Rows are named \code{EX1, EX2, ...} in order of first appearance.
6+
#' \code{rowRanges} is a \code{GRanges} with one entry per unique exon locus.
7+
#' \code{mcols} columns:
8+
#' \describe{
9+
#' \item{GENEID}{Comma-separated gene IDs for transcripts sharing the exon}
10+
#' \item{txNames}{Comma-separated transcript IDs that contain this exon}
11+
#' \item{exonClass}{\code{"annotated"} if the exon coordinates exist in the
12+
#' reference annotation, \code{"novel"} otherwise}
13+
#' }
14+
#' @details Counts (and other assays) are summed across all transcripts sharing
15+
#' the same exon (identical seqnames, start, end, strand). CPM is recomputed
16+
#' from the aggregated counts. \code{fullLengthCounts} and
17+
#' \code{uniqueCounts} are summed where present.
18+
#' @importFrom Matrix sparseMatrix
19+
#' @importFrom SummarizedExperiment rowRanges rowData
20+
#' @importFrom GenomicRanges GRanges seqnames start end strand
21+
#' @importFrom IRanges IRanges
22+
#' @importFrom dplyr left_join mutate group_by summarise filter distinct
23+
#' @export
24+
summariseByExon <- function(se) {
25+
exonRanges <- unlist(rowRanges(se), use.names = TRUE)
26+
txNames <- rownames(se)
27+
28+
# Build data.frame of exon-transcript pairs
29+
exonDf <- data.frame(
30+
TXNAME = names(exonRanges),
31+
seqnames = as.character(seqnames(exonRanges)),
32+
start = start(exonRanges),
33+
end = end(exonRanges),
34+
strand = as.character(strand(exonRanges)),
35+
stringsAsFactors = FALSE
36+
) |>
37+
mutate(exon_id = paste(seqnames, start, end, strand, sep = ":"))
38+
39+
# Attach GENEID and txClassDescription from rowData
40+
geneDf <- as.data.frame(rowData(se))[, c("GENEID", "txClassDescription"), drop = FALSE]
41+
geneDf$TXNAME <- rownames(se)
42+
exonDf <- left_join(exonDf, geneDf, by = "TXNAME")
43+
44+
# Collapse coordinates and GENEID/txNames per unique exon
45+
exonMeta <- exonDf |>
46+
group_by(exon_id) |>
47+
summarise(
48+
seqnames = seqnames[1],
49+
start = start[1],
50+
end = end[1],
51+
strand = strand[1],
52+
GENEID = paste(sort(unique(GENEID)), collapse = ","),
53+
txNames = paste(sort(unique(TXNAME)), collapse = ","),
54+
.groups = "drop"
55+
)
56+
57+
# exonClass: "annotated" if the exon coordinates (seqnames, start, end, strand)
58+
# exist in the reference annotation, "novel" otherwise.
59+
annotatedCoords <- exonDf |>
60+
filter(txClassDescription == "annotation") |>
61+
distinct(seqnames, start, end, strand) |>
62+
mutate(exonClass = "annotated")
63+
64+
exonMeta <- exonMeta |>
65+
left_join(annotatedCoords, by = c("seqnames", "start", "end", "strand")) |>
66+
mutate(exonClass = ifelse(is.na(exonClass), "novel", "annotated"))
67+
68+
# Sparse binary matrix: unique_exons x transcripts
69+
uniqueExons <- exonMeta$exon_id
70+
exonNames <- paste0("EX", seq_along(uniqueExons))
71+
aggregationMat <- sparseMatrix(
72+
i = match(exonDf$exon_id, uniqueExons),
73+
j = match(exonDf$TXNAME, txNames),
74+
x = 1L,
75+
dims = c(length(uniqueExons), length(txNames)),
76+
dimnames = list(exonNames, txNames)
77+
)
78+
79+
# Build output GRanges
80+
outRanges <- GRanges(
81+
seqnames = exonMeta$seqnames,
82+
ranges = IRanges(start = exonMeta$start, end = exonMeta$end),
83+
strand = exonMeta$strand
84+
)
85+
names(outRanges) <- exonNames
86+
mcols(outRanges)$GENEID <- exonMeta$GENEID
87+
mcols(outRanges)$txNames <- exonMeta$txNames
88+
mcols(outRanges)$exonClass <- exonMeta$exonClass
89+
90+
.aggregateAssays(se, aggregationMat, outRanges)
91+
}

R/summariseExpression_utility.R

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Core aggregation engine shared by summariseByExon (and any future summariseBy* functions).
2+
# aggregationMat : sparse matrix (output_features x input_transcripts)
3+
# outRanges : GRanges / GRangesList for the output rows
4+
#' @importFrom SummarizedExperiment assays colData SummarizedExperiment
5+
#' @noRd
6+
.aggregateAssays <- function(se, aggregationMat, outRanges) {
7+
counts <- aggregationMat %*% assays(se)$counts
8+
9+
counts.total <- colSums(counts)
10+
counts.total[counts.total == 0] <- 1
11+
cpm <- counts / counts.total * 10^6
12+
13+
outAssays <- SimpleList(counts = counts, CPM = cpm)
14+
15+
for (nm in c("fullLengthCounts", "uniqueCounts")) {
16+
if (nm %in% names(assays(se))) {
17+
outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]]
18+
}
19+
}
20+
21+
ColNames <- colnames(counts)
22+
ColData <- colData(se)
23+
ColData@rownames <- ColNames
24+
ColData@listData$name <- ColNames
25+
26+
SummarizedExperiment(
27+
assays = outAssays,
28+
rowRanges = outRanges,
29+
colData = ColData
30+
)
31+
}

0 commit comments

Comments
 (0)