22
33
44# ' Group regions from the same chromosome together and
5- # ' calculate the distances between neighboring regions.
5+ # ' calculate the distances of a region to its upstream and
6+ # ' downstream neighboring regions.
67# ' Distances are then lumped into a numeric vector.
78# '
89# ' @param query A GRanges or GRangesList object.
9- # '
10+ # ' @param correctRef A string indicating the reference genome
11+ # ' to use if distances are corrected for the number of
12+ # ' regions in a regionSet.
13+ # '
1014# ' @return A numeric vector or list with different vectors containing the
11- # ' distances within neighboring regions.
15+ # ' distances of regions to their upstream/downstream neighbors .
1216# ' @export
1317# ' @examples
1418# ' dist = calcNeighborDist(vistaEnhancers)
15- calcNeighborDist = function (query ) {
16- .validateInputs(list (query = c(" GRanges" ," GRangesList" )))
19+ calcNeighborDist = function (query , correctRef = " None" ) {
20+ .validateInputs(list (query = c(" GRanges" ," GRangesList" ),
21+ correctRef = c(" character" )))
1722 # lapply if a GRangeslist is provided
1823 if (is(query , " GRangesList" )) {
19- dist = lapply(query , calcNeighborDist )
24+ dist = lapply(query ,
25+ function (x ){calcNeighborDist(x , correctRef = correctRef )})
2026 namelist = names(query )
2127 if (is.null(namelist )) {
2228 newnames = seq_along(query )
@@ -30,10 +36,20 @@ calcNeighborDist = function(query) {
3036 querydts = splitDataTable(querydt , " chr" )
3137 distanceVectors = lapply(querydts , neighbordt )
3238 d = as.vector(unlist(distanceVectors ))
33- # remove overlaps for log10 transformation
34- dcvec = d [! (d == " 0" )]
35- dcvec = log10(dcvec )
36- return (dcvec )
39+ # remove overlaps
40+ dcvec = d [! (d == " 0" )]
41+ # Correct for number of regions
42+ if (! correctRef == " None" ) {
43+ chromSizes = getChromSizes(correctRef )
44+ genomelen = sum(chromSizes )
45+ meanWidth = mean(calcWidth(query ))
46+ expectedDist = genomelen / nrow(querydt ) - meanWidth
47+ correctedDist = log10(dcvec / expectedDist )
48+ return (correctedDist )
49+ # If we just want to look at the raw neighbor distances
50+ } else {
51+ return (dcvec )
52+ }
3753}
3854
3955# ' Internal helper function to calculate distance
@@ -46,61 +62,133 @@ neighbordt = function(querydt) {
4662 # there should be at least 2 regions for each chr
4763 if (nrow(querydt ) > 1 ) {
4864 endVect = abs(querydt [, diff(end )])
49- regionWidth = querydt [, (end - start )]
65+ regionWidth = querydt [, (end - start + 1 )]
5066 distancesVector = endVect - regionWidth [- 1 ]
5167 # neg values represent overlaps between neighbor regions, set those to 0
5268 distancesVector [which(distancesVector < 0 )] = 0
5369 return (distancesVector )
5470 }
5571}
56-
5772
58- # ' Plot the distances between neighboring regions.The distance in the
59- # ' x axis is log10 transformed for ease of comparison between
60- # ' different regionsets and to account for outliers.
73+
74+ # ' Group regions from the same chromosome together and
75+ # ' compute the distance of a region to its nearest neighbor.
76+ # ' Distances are then lumped into a numeric vector.
77+ # '
78+ # ' @param query A GRanges or GRangesList object.
79+ # ' @param correctRef A string indicating the reference genome
80+ # ' to use if Nearest neighbor distances are corrected for the
81+ # ' number of regions in a regionSet.
82+ # '
83+ # ' @return A numeric vector or list of vectors containing the
84+ # ' distance of regions to their nearest neighbors.
85+ # ' @export
86+ # ' @examples
87+ # ' Nneighbors = calcNearestNeighbors(vistaEnhancers)
88+ calcNearestNeighbors = function (query , correctRef = " None" ) {
89+ .validateInputs(list (query = c(" GRanges" ," GRangesList" ),
90+ correctRef = c(" character" )))
91+ # lapply if a GRangeslist is provided
92+ if (is(query , " GRangesList" )) {
93+ dist = lapply(query ,
94+ function (x ){calcNearestNeighbors(x , correctRef = correctRef )})
95+ namelist = names(query )
96+ if (is.null(namelist )) {
97+ newnames = seq_along(query )
98+ namelist = newnames
99+ # Append names
100+ names(dist ) = namelist
101+ }
102+ return (dist )
103+ }
104+ # Calculate nearest neighbors in a vectorized manner
105+ dist = calcNeighborDist(query )
106+ upstream = dist [- length(dist )]
107+ downstream = dist [- 1 ]
108+ dt = data.table(i = upstream , j = downstream )
109+ pairmins = dt [, pmin(i , j )]
110+ # First and last distances are default nearest neighbors
111+ nNeighbors = c(dist [1 ], pairmins , dist [length(dist )])
112+ # Correct for number of regions
113+ if (! correctRef == " None" ) {
114+ chromSizes = getChromSizes(correctRef )
115+ genomelen = sum(chromSizes )
116+ meanWidth = mean(calcWidth(query ))
117+ expectedDist = genomelen / length(query ) - meanWidth
118+ correctedDist = log10(nNeighbors / expectedDist )
119+ return (correctedDist )
120+ } else {
121+ return (nNeighbors )
122+ }
123+ }
124+
125+ # ' Plot the distances from regions to their upstream/downstream neighbors
126+ # ' or nearest neighbors. Distances can be passed as either raw bp or
127+ # ' corrected for the number of regions (log10(obs/exp)), but this has
128+ # ' to be specified in the function parameters.
61129# '
62- # ' @param dcvec A numeric vector or list with vectors containing distances
63- # ' between neighbor regions. Produced by \code{calcNeighborDist}
130+ # ' @param dcvec A numeric vector or list of vectors containing distances
131+ # ' to upstream/downstream neighboring regions or to nearest neighbors.
132+ # ' Produced by \code{calcNeighborDist} or \code{calcNearestNeighbors}
133+ # ' @param correctedDist A logical indicating if the plot axis should
134+ # ' be adjusted to show distances corrected for the number of regions
135+ # ' in a regionset.
136+ # ' @param Nneighbors A logical indicating whether legend should be adjusted
137+ # ' if Nearest neighbors are being plotted. Default legend shows distances
138+ # ' to upstream/downstream neighbors.
64139# '
65140# ' @return A ggplot density object showing the distribution of
66- # ' log10 transformed distances.
141+ # ' raw or corrected distances.
67142# ' @export
68143# ' @examples
69144# ' numVector = rnorm(400, mean=5, sd=0.1)
70145# ' d = plotNeighborDist(numVector)
71- plotNeighborDist = function (dcvec ) {
146+ plotNeighborDist = function (dcvec , correctedDist = FALSE ,
147+ Nneighbors = FALSE ) {
72148 .validateInputs(list (dcvec = c(" numeric" ," list" )))
73- # if input is list, conver it to a data frame with
149+ # if input is list, convert it to a data frame with
74150 # value and region set name, if input is vector - make a single
75151 # columns data.frame
76- if (is(dcvec , " list" )){
77- nameList = names(dcvec )
78- vectorLengths = unlist(lapply(dcvec , length ))
79- distReshaped = data.frame (value = unlist(dcvec ),
80- regionSet = rep(nameList , vectorLengths ))
81- } else {
82- distReshaped = data.frame (value = dcvec )
83- }
84-
85152 if (is(dcvec , " list" )) {
86- g = ggplot2 :: ggplot(distReshaped , aes(x = value ,
87- fill = regionSet ,
153+ nameList = names(dcvec )
154+ vectorLengths = unlist(lapply(dcvec , length ))
155+ distReshaped = data.frame (value = unlist(dcvec ),
156+ regionSet = rep(nameList , vectorLengths ))
157+ g = ggplot2 :: ggplot(distReshaped , aes(x = value ,
158+ fill = regionSet ,
88159 colour = regionSet )) +
89- geom_density(alpha = 0.5 ) +
90- theme_classic() +
91- theme(legend.position = " bottom" )
160+ geom_density(alpha = 0.4 )
92161 } else {
162+ distReshaped = data.frame (value = dcvec )
93163 g = ggplot2 :: ggplot(distReshaped , aes(x = value )) +
94- geom_density(alpha = 0.4 ) +
95- theme_classic()
164+ geom_density()
165+ }
166+ if (correctedDist == TRUE ) {
167+ g = g +
168+ xlab(expression(log [10 ](over(Obs , Exp )))) +
169+ geom_vline(xintercept = 0 , linetype = " dashed" ) +
170+ ggtitle(" Corrected neighboring regions distance distribution" )
171+ } else {
172+ g = g +
173+ xlab(expression(" bp distance" )) +
174+ scale_x_log10(breaks = scales :: trans_breaks(" log10" , function (x ) 10 ^ x ),
175+ labels = scales :: trans_format(" log10" ,
176+ scales :: math_format(10 ^ .x ))) +
177+ ggtitle(" Neighboring regions distance distribution" )
96178 }
97179 g = g +
98- xlab(expression(log [10 ]* (" bp distance" ))) +
99- xlim(0 , 10 ) +
100- theme(aspect.ratio = 1 ) +
101- theme_blank_facet_label() +
102- ggtitle(" Neighboring regions distance distribution" ) +
103- theme(plot.title = element_text(hjust = 0.5 ))
180+ theme_classic() +
181+ theme(aspect.ratio = 1 ,
182+ plot.title = element_text(hjust = 0.5 ),
183+ legend.position = " bottom" ) +
184+ theme_blank_facet_label()
185+
186+ # Adjust legend if plotting nearest neighbors
187+ if (Nneighbors == TRUE ){
188+ g = g +
189+ labs(fill = " regionSet Nneighbors" ,
190+ colour = " regionSet Nneighbors" )
191+ }
104192 return (g )
105193}
106194
0 commit comments