Skip to content

Commit abdf9cf

Browse files
authored
Merge branch 'dev' into dev-nathan-2
2 parents 45fab95 + d3d85fb commit abdf9cf

33 files changed

Lines changed: 807 additions & 424 deletions

DESCRIPTION

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: GenomicDistributions
2-
Version: 1.1.10
3-
Date: 2021-08-28
2+
Version: 1.3.2
3+
Date: 2022-01-01
44
Title: GenomicDistributions: fast analysis of genomic intervals with Bioconductor
55
Description: If you have a set of genomic ranges, this package can help you with
66
visualization and comparison. It produces several kinds of plots, for example:
@@ -38,6 +38,7 @@ Imports:
3838
Biostrings,
3939
plyr,
4040
dplyr,
41+
scales,
4142
GenomeInfoDb
4243
Suggests:
4344
AnnotationFilter,

NAMESPACE

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@ export(calcFeatureDistRefTSS)
1616
export(calcGCContent)
1717
export(calcGCContentRef)
1818
export(calcNearestGenes)
19+
export(calcNearestNeighbors)
1920
export(calcNeighborDist)
20-
export(calcOpenSignal)
2121
export(calcPartitions)
2222
export(calcPartitionsRef)
23+
export(calcSummarySignal)
2324
export(calcWidth)
2425
export(dtToGr)
2526
export(genomePartitionList)
@@ -39,9 +40,9 @@ export(plotExpectedPartitions)
3940
export(plotFeatureDist)
4041
export(plotGCContent)
4142
export(plotNeighborDist)
42-
export(plotOpenSignal)
4343
export(plotPartitions)
4444
export(plotQTHist)
45+
export(plotSummarySignal)
4546
export(retrieveFile)
4647
import(dplyr)
4748
import(ggplot2)

NEWS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
All notable changes to this project will be documented in this file. Here we
33
will document changes to major new releases only (not point releases).
44

5+
## [1.3.2] -- 2022-01-01
6+
- Bioconductor released new version
7+
- Cell specificity plots are now more generic signal summary plots - the calc function is now "calcSummarySignal", plot function "plotSummarySignal"
8+
- Default plotting of neighbor distance is now on linear scale
9+
- Corrected plotting of partition overlap for multiple region sets - now sums to 100 by group
10+
511
## [1.1.2] -- 2020-07-07
612
- Added functions to calculate and plot dinucleotide frequencies
713

R/chrom-plots.R

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,9 @@ calcChromBins = function(query, bins) {
174174
#' @param binCount Number of bins to divide the chromosomes into
175175
#' @return A data.table showing the distribution of regions across bins of the
176176
#' reference genome.
177-
#' @export
178177
#' @examples
179178
#' ChromBins = calcChromBinsRef(vistaEnhancers, "hg19")
180-
calcChromBinsRef = function(query, refAssembly, binCount=10000) {
179+
calcChromBinsRefSlow = function(query, refAssembly, binCount=3000) {
181180
.validateInputs(list(refAssembly="character",
182181
query=c("GRanges","GRangesList")))
183182
# Bin the genome
@@ -189,13 +188,64 @@ calcChromBinsRef = function(query, refAssembly, binCount=10000) {
189188
return(calcChromBins(query, genomeBins))
190189
}
191190

191+
#' Returns the distribution of query over a reference assembly
#'
#' Given a query set of elements (a GRanges object) and a reference assembly
#' (e.g. 'hg38'), this will aggregate and count the distribution of the query
#' elements across bins of the reference genome. This is a helper function to
#' create features for common genomes. It covers the same use case as
#' \code{calcChromBins}, which is more general, but assigns every query
#' region to the single bin that contains the region's midpoint.
#'
#' @param query A GenomicRanges or GenomicRangesList object with query regions
#' @param refAssembly A character vector that will be used to grab chromosome
#'     sizes with \code{getChromSizes}
#' @param binCount Number of bins to divide the chromosomes into
#' @return A data.table showing the distribution of regions across bins of the
#'     reference genome, with columns \code{chr}, \code{start}, \code{end},
#'     \code{regionID}, \code{withinGroupID} and \code{N} (number of query
#'     regions in the bin). For a GRangesList an additional \code{name}
#'     column identifies the region set each row belongs to.
#' @export
#' @examples
#' ChromBins = calcChromBinsRef(vistaEnhancers, "hg19")
calcChromBinsRef = function(query, refAssembly, binCount=3000) {
    .validateInputs(list(refAssembly="character",
                         query=c("GRanges","GRangesList")))
    if (is(query, "GRangesList")) {
        # Recurse over each GRanges object. binCount must be forwarded
        # explicitly, otherwise every region set would be binned with the
        # default instead of the resolution the caller asked for.
        x = lapply(query, calcChromBinsRef,
                   refAssembly=refAssembly, binCount=binCount)
        # To accommodate multiple regions, we'll need to introduce a new 'name'
        # column to distinguish them.
        nameList = names(query)
        if (is.null(nameList)) {
            nameList = seq_along(query) # Fallback to sequential numbers
        }
        # Append names: repeat each set name once per row it contributed
        xb = rbindlist(x)
        xb$name = rep(nameList, vapply(x, nrow, integer(1)))
        return(xb)
    }
    # Bin the genome
    chromSizes = getChromSizes(refAssembly)
    binnedDT = binChroms(binCount, chromSizes)
    queryDT = grToDt(query)
    # Rename the bin table's chromosome column so it can be joined on 'chr'
    setnames(binnedDT, "idCol", "chr")
    # Midpoint of each region; integer division keeps the column type
    # of start/end so the join below compares like with like.
    queryDT[, midpoint:=start + (end-start) %/% 2L]
    # Non-equi join: keep the bin whose [start, end] interval contains the
    # region midpoint, then count regions per bin and order by bin ID.
    res = binnedDT[queryDT,
                   .(chr, regionID=ubinID, withinGroupID=x.binID,
                     start=x.start, end=x.end),
                   on=.(chr, start<=midpoint, end>=midpoint),
                   nomatch=0L][, list(.N),
                               by=list(chr, start, end, regionID,
                                       withinGroupID)][order(regionID),]
    # Fix chromosome order to the order of appearance in the result
    res[, chr:=factor(chr, levels=unique(res$chr))]
    return(res)
}
238+
239+
240+
192241
#' Plot distribution over chromosomes
193242
#'
194243
#' Plots result from \code{genomicDistribution} calculation
195244
#' @param genomeAggregate The output from the genomicDistribution function
196245
#' @param binCount Number of bins (should match the call to
197246
#' \code{genomicDistribution})
198247
#' @param plotTitle Title for plot.
248+
#' @param ylim Limit of y-axes. Default "max" sets limit to N of biggest bin.
199249
#' @return A ggplot object showing the distribution of the query
200250
#' regions over bins of
201251
#' the reference genome.
@@ -206,7 +256,7 @@ calcChromBinsRef = function(query, refAssembly, binCount=10000) {
206256
#' ChromBins = plotChromBins(agg)
207257
#'
208258
plotChromBins = function(genomeAggregate, binCount=10000,
209-
plotTitle="Distribution over chromosomes") {
259+
plotTitle="Distribution over chromosomes", ylim="max") {
210260
.validateInputs(list(genomeAggregate=c("data.table","data.frame")))
211261

212262
if ("name" %in% names(genomeAggregate)){
@@ -231,8 +281,13 @@ plotChromBins = function(genomeAggregate, binCount=10000,
231281
theme(panel.spacing=unit(0, "lines")) + # Reduce whitespace
232282
theme(strip.text.y=element_text(size=12, angle=0)) + # Rotate labels
233283
geom_hline(yintercept=0, color="#EEEEEE") + # Light chrom lines
234-
scale_y_continuous(breaks=c(max(genomeAggregate$N)),
235-
limits=c(0, max(genomeAggregate$N))) +
284+
{if (ylim == "max") {
285+
scale_y_continuous(breaks = c(max(genomeAggregate$N)),
286+
limits = c(0, max(genomeAggregate$N)))
287+
} else {
288+
scale_y_continuous(breaks = ylim,
289+
limits = c(0, ylim))
290+
}} +
236291
scale_x_continuous(breaks=c(0, binCount), labels=c("Start", "End")) +
237292
theme(plot.title=element_text(hjust=0.5)) + # Center title
238293
ggtitle(plotTitle) +

R/feature-plots.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
# @return A vector of genomic distances for each query region relative to its
1818
# closest feature.
1919
calcFeatureDistBioc = function(query, features) {
20-
.validateInputs(list(query=x("GRangesList","GRanges")))
20+
.validateInputs(list(query=c("GRangesList","GRanges")))
2121
if (is(query, "GRangesList")) {
2222
# Recurse over each GRanges object
2323
x = lapply(query, calcFeatureDist, features)
@@ -337,6 +337,7 @@ cutDists = function(dists, divisions=NULL, nbins=50,
337337
labels = labelCuts(sort(divisions), collapse=" to ", infBins=infBins)
338338
cuts = cut(dists, divisions, labels)
339339
df = as.data.frame(table(cuts))
340+
setDT(df)
340341
return(df)
341342
}
342343

R/neighbor-distances.R

Lines changed: 130 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,27 @@
22

33

44
#' Group regions from the same chromosome together and
5-
#' calculate the distances between neighboring regions.
5+
#' calculate the distances of a region to its upstream and
6+
#' downstream neighboring regions.
67
#' Distances are then lumped into a numeric vector.
78
#'
89
#' @param query A GRanges or GRangesList object.
9-
#'
10+
#' @param correctRef A string indicating the reference genome
11+
#' to use if distances are corrected for the number of
12+
#' regions in a regionSet.
13+
#'
1014
#' @return A numeric vector or list with different vectors containing the
11-
#' distances within neighboring regions.
15+
#' distances of regions to their upstream/downstream neighbors.
1216
#' @export
1317
#' @examples
1418
#' dist = calcNeighborDist(vistaEnhancers)
15-
calcNeighborDist = function(query) {
16-
.validateInputs(list(query=c("GRanges","GRangesList")))
19+
calcNeighborDist = function(query, correctRef="None") {
20+
.validateInputs(list(query=c("GRanges","GRangesList"),
21+
correctRef=c("character")))
1722
# lapply if a GRangeslist is provided
1823
if (is(query, "GRangesList")) {
19-
dist = lapply(query, calcNeighborDist)
24+
dist = lapply(query,
25+
function(x){calcNeighborDist(x, correctRef = correctRef)})
2026
namelist = names(query)
2127
if (is.null(namelist)) {
2228
newnames = seq_along(query)
@@ -30,10 +36,20 @@ calcNeighborDist = function(query) {
3036
querydts = splitDataTable(querydt, "chr")
3137
distanceVectors = lapply(querydts, neighbordt)
3238
d = as.vector(unlist(distanceVectors))
33-
# remove overlaps for log10 transformation
34-
dcvec = d[!(d == "0")]
35-
dcvec = log10(dcvec)
36-
return(dcvec)
39+
# remove overlaps
40+
dcvec = d[!(d == "0")]
41+
# Correct for number of regions
42+
if (!correctRef=="None") {
43+
chromSizes = getChromSizes(correctRef)
44+
genomelen = sum(chromSizes)
45+
meanWidth = mean(calcWidth(query))
46+
expectedDist = genomelen/nrow(querydt) - meanWidth
47+
correctedDist = log10(dcvec/expectedDist)
48+
return(correctedDist)
49+
# If we just want to look at the raw neighbor distances
50+
} else {
51+
return(dcvec)
52+
}
3753
}
3854

3955
#' Internal helper function to calculate distance
@@ -46,61 +62,133 @@ neighbordt = function(querydt) {
4662
# there should be at least 2 regions for each chr
4763
if (nrow(querydt) > 1) {
4864
endVect = abs(querydt[, diff(end)])
49-
regionWidth = querydt[, (end-start)]
65+
regionWidth = querydt[, (end-start+1)]
5066
distancesVector = endVect - regionWidth[-1]
5167
# neg values represent overlaps between neighbor regions, set those to 0
5268
distancesVector[which(distancesVector < 0)] = 0
5369
return(distancesVector)
5470
}
5571
}
56-
5772

58-
#' Plot the distances between neighboring regions.The distance in the
59-
#' x axis is log10 transformed for ease of comparison between
60-
#' different regionsets and to account for outliers.
73+
#' Group regions from the same chromosome together and
#' compute the distance of a region to its nearest neighbor.
#' Distances are then lumped into a numeric vector.
#'
#' @param query A GRanges or GRangesList object.
#' @param correctRef A string indicating the reference genome
#'     to use if Nearest neighbor distances are corrected for the
#'     number of regions in a regionSet. The default "None" returns
#'     raw distances.
#'
#' @return A numeric vector or list of vectors containing the
#'     distance of regions to their nearest neighbors. If \code{correctRef}
#'     is given, values are log10(observed/expected) instead of raw
#'     distances. An empty numeric vector is returned when no neighbor
#'     distances could be calculated.
#' @export
#' @examples
#' Nneighbors = calcNearestNeighbors(vistaEnhancers)
calcNearestNeighbors = function(query, correctRef="None") {
    .validateInputs(list(query=c("GRanges","GRangesList"),
                         correctRef=c("character")))
    # lapply if a GRangeslist is provided
    if (is(query, "GRangesList")) {
        dist = lapply(query,
            function(x){calcNearestNeighbors(x, correctRef = correctRef)})
        # Unnamed lists fall back to sequential numbers as names
        if (is.null(names(query))) {
            names(dist) = seq_along(query)
        }
        return(dist)
    }
    # Raw (uncorrected) upstream/downstream neighbor distances
    dist = calcNeighborDist(query)
    # Nothing to pair up (e.g. no chromosome holds two or more regions):
    # return an empty vector rather than NA or a malformed object.
    if (length(dist) == 0) {
        return(numeric(0))
    }
    # Calculate nearest neighbors in a vectorized manner: each interior
    # region takes the smaller of its two flanking distances.
    # NOTE(review): dist is concatenated over chromosomes and has zero
    # (overlap) distances removed by calcNeighborDist, so adjacent entries
    # are not guaranteed to flank the same region - confirm this is intended.
    upstream = dist[-length(dist)]
    downstream = dist[-1]
    pairmins = pmin(upstream, downstream)
    # First and last distances are default nearest neighbors
    nNeighbors = c(dist[1], pairmins, dist[length(dist)])
    # Correct for number of regions
    if (correctRef != "None") {
        chromSizes = getChromSizes(correctRef)
        genomelen = sum(chromSizes)
        meanWidth = mean(calcWidth(query))
        # Expected spacing if the regions were spread evenly over the genome
        expectedDist = genomelen/length(query) - meanWidth
        correctedDist = log10(nNeighbors/expectedDist)
        return(correctedDist)
    }
    return(nNeighbors)
}
124+
125+
#' Plot the distances from regions to their upstream/downstream neighbors
126+
#' or nearest neighbors. Distances can be passed as either raw bp or
127+
#' corrected for the number of regions (log10(obs/exp)), but this has
128+
#' to be specified in the function parameters.
61129
#'
62-
#' @param dcvec A numeric vector or list with vectors containing distances
63-
#' between neighbor regions. Produced by \code{calcNeighborDist}
130+
#' @param dcvec A numeric vector or list of vectors containing distances
131+
#' to upstream/downstream neighboring regions or to nearest neighbors.
132+
#' Produced by \code{calcNeighborDist} or \code{calcNearestNeighbors}
133+
#' @param correctedDist A logical indicating if the plot axis should
134+
#' be adjusted to show distances corrected for the number of regions
135+
#' in a regionset.
136+
#' @param Nneighbors A logical indicating whether legend should be adjusted
137+
#' if Nearest neighbors are being plotted. Default legend shows distances
138+
#' to upstream/downstream neighbors.
64139
#'
65140
#' @return A ggplot density object showing the distribution of
66-
#' log10 transformed distances.
141+
#' raw or corrected distances.
67142
#' @export
68143
#' @examples
69144
#' numVector = rnorm(400, mean=5, sd=0.1)
70145
#' d = plotNeighborDist(numVector)
71-
plotNeighborDist = function(dcvec) {
146+
plotNeighborDist = function(dcvec, correctedDist=FALSE,
147+
Nneighbors=FALSE) {
72148
.validateInputs(list(dcvec=c("numeric","list")))
73-
# if input is list, conver it to a data frame with
149+
# if input is list, convert it to a data frame with
74150
# value and region set name, if input is vector - make a single
75151
# columns data.frame
76-
if (is(dcvec, "list")){
77-
nameList = names(dcvec)
78-
vectorLengths = unlist(lapply(dcvec, length))
79-
distReshaped = data.frame(value = unlist(dcvec),
80-
regionSet = rep(nameList, vectorLengths))
81-
} else {
82-
distReshaped = data.frame(value = dcvec)
83-
}
84-
85152
if (is(dcvec, "list")) {
86-
g = ggplot2::ggplot(distReshaped, aes(x=value,
87-
fill=regionSet,
153+
nameList = names(dcvec)
154+
vectorLengths = unlist(lapply(dcvec, length))
155+
distReshaped = data.frame(value = unlist(dcvec),
156+
regionSet = rep(nameList, vectorLengths))
157+
g = ggplot2::ggplot(distReshaped, aes(x=value,
158+
fill=regionSet,
88159
colour=regionSet)) +
89-
geom_density(alpha=0.5) +
90-
theme_classic() +
91-
theme(legend.position = "bottom")
160+
geom_density(alpha=0.4)
92161
} else {
162+
distReshaped = data.frame(value = dcvec)
93163
g = ggplot2::ggplot(distReshaped, aes(x=value)) +
94-
geom_density(alpha=0.4) +
95-
theme_classic()
164+
geom_density()
165+
}
166+
if (correctedDist==TRUE) {
167+
g = g +
168+
xlab(expression(log[10](over(Obs, Exp)))) +
169+
geom_vline(xintercept = 0, linetype="dashed") +
170+
ggtitle("Corrected neighboring regions distance distribution")
171+
} else {
172+
g = g +
173+
xlab(expression("bp distance")) +
174+
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
175+
labels = scales::trans_format("log10",
176+
scales::math_format(10^.x))) +
177+
ggtitle("Neighboring regions distance distribution")
96178
}
97179
g = g +
98-
xlab(expression(log[10]*("bp distance"))) +
99-
xlim(0, 10) +
100-
theme(aspect.ratio=1) +
101-
theme_blank_facet_label() +
102-
ggtitle("Neighboring regions distance distribution") +
103-
theme(plot.title = element_text(hjust=0.5))
180+
theme_classic() +
181+
theme(aspect.ratio=1,
182+
plot.title = element_text(hjust=0.5),
183+
legend.position = "bottom") +
184+
theme_blank_facet_label()
185+
186+
# Adjust legend if plotting nearest neighbors
187+
if (Nneighbors==TRUE){
188+
g = g +
189+
labs(fill="regionSet Nneighbors",
190+
colour="regionSet Nneighbors")
191+
}
104192
return(g)
105193
}
106194

0 commit comments

Comments
 (0)