Skip to content

Commit 6bbcaca

Browse files
author
Peter Combs
committed
Merge branch 'Joint'
2 parents fd9d3ec + bb2be99 commit 6bbcaca

12 files changed

Lines changed: 877 additions & 107 deletions

BindUtils.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import pandas as pd
2+
from bisect import bisect
3+
from glob import glob
4+
from collections import defaultdict
5+
6+
bind_dist = 5000
7+
8+
try:
9+
tss_dict = locals()['tss_dict']
10+
except KeyError:
11+
tss_dict = defaultdict(list)
12+
tss_per_gene = defaultdict(list)
13+
for i, row in pd.read_table('Reference/tss').iterrows():
14+
tss_dict[row['chr'].replace('dmel_', '')].append((row['TSS_start'],
15+
row['gene_name']))
16+
tss_per_gene[row['gene_name']].append((row['chr'].replace('dmel_', ''),
17+
row['TSS_start']))
18+
19+
def find_near(chrom, coord, dist, nearest_only=False):
20+
out = set()
21+
center = bisect([i[0] for i in chrom], coord)
22+
for (coord2, gene) in reversed(chrom[:center]):
23+
if coord - coord2 > dist: break
24+
out.add(gene)
25+
if nearest_only:
26+
break
27+
28+
for (coord2, gene) in chrom[center:]:
29+
if coord2 - coord > dist: break
30+
out.add(gene)
31+
if nearest_only:
32+
break
33+
34+
return out
35+
36+
def find_nearest_dist(chrom, coord):
37+
center = bisect([i[0] for i in chrom, coord])
38+
dist = abs(chrom[center][0] - coord)
39+
distp1 = abs(coord - chrom[center+1][0])
40+
distm1 = abs(coord - chrom[center-1][0])
41+
# I should only need one of these, but I'm too lazy to read the
42+
# documentation to see whether it's the +1 or -1
43+
return min(dist, distp1, distm1)
44+
45+
def get_binding_matrix(genes):
46+
binding_matrix = pd.DataFrame(index=genes,
47+
columns=tfs, data=0.0, dtype=float)
48+
for tf in tfs:
49+
for gene in binding_matrix.index:
50+
binding_matrix.ix[gene, tf] = float(gene in has_tfs[tf])
51+
binding_matrix.sort_index(axis=1, inplace=True)
52+
return binding_matrix
53+
54+
tf_files = glob('Reference/*_peaks')
55+
tfs = ([i.split('/')[1].split('_')[0] for i in tf_files])
56+
has_tfs = {}
57+
for tf, tf_file in zip(tfs, tf_files):
58+
has_tf = set()
59+
for coord in pd.read_table(tf_file).NewPeak:
60+
chr, coord = coord.split(':')
61+
has_tf.update(find_near(tss_dict[chr], int(coord), bind_dist))
62+
has_tfs[tf] = has_tf

DistributionDifference.py

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
import numpy as np
2+
from scipy import interpolate
3+
4+
import collections
5+
import functools
6+
import emd
7+
8+
class memoized(object):
9+
'''Decorator. Caches a function's return value each time it is called.
10+
If called later with the same arguments, the cached value is returned
11+
(not reevaluated).
12+
'''
13+
def __init__(self, func):
14+
self.func = func
15+
self.cache = {}
16+
def __call__(self, *args):
17+
if not isinstance(args, collections.Hashable):
18+
# uncacheable. a list, for instance.
19+
# better to not cache than blow up.
20+
return self.func(*args)
21+
if tuple(*args) in self.cache:
22+
return self.cache[tuple(*args)]
23+
else:
24+
value = self.func(*args)
25+
self.cache[tuple(*args)] = value
26+
return value
27+
def __repr__(self):
28+
'''Return the function's docstring.'''
29+
return self.func.__doc__
30+
def __get__(self, obj, objtype):
31+
'''Support instance methods.'''
32+
return functools.partial(self.__call__, obj)
33+
34+
@memoized
35+
def convert_to_distribution(points):
36+
"""Converts the given data points to a smoothed distribution from 0-100%
37+
38+
"""
39+
40+
x = np.linspace(0,1, len(points), endpoint=True)
41+
f = interpolate.interp1d(x, points, kind='cubic')
42+
retval = np.cumsum(f(np.linspace(0, 1, 30, endpoint=True)).clip(0,1e5))
43+
return retval / (sum(retval)+1e-10)
44+
45+
def diff_stat(points1, points2):
46+
47+
dist1 = convert_to_distribution(points1)
48+
dist2 = convert_to_distribution(points2)
49+
50+
51+
normfac = np.log(max(max(points1), max(points2)) + 1)
52+
53+
return np.max(np.abs(dist1 - dist2)) * normfac
54+
55+
divmat = np.zeros([0,0])
56+
57+
58+
def tang_stat(points1, points2):
59+
assert len(points1) == len(points2)
60+
points1 = np.array(points1 / np.mean(points1))
61+
points2 = np.array(points2 / np.mean(points2))
62+
63+
va = np.reshape(np.repeat(points1, len(points2)), (len(points2), -1),
64+
order='C')
65+
vb = np.reshape(np.repeat(points2, len(points1)), (-1, len(points2)),
66+
order='F')
67+
68+
global divmat
69+
if np.shape(divmat) != (len(points1), len(points2)):
70+
x, y = np.mgrid[0:len(points1), 0:len(points2)]
71+
divmat = 1/(np.abs(x - y) + 1)
72+
return np.sqrt(np.sum(np.triu((va - vb)**2 * divmat)))
73+
74+
# stat = 0
75+
# for i in range(len(points1)):
76+
# for j in range(len(points2)):
77+
# stat += (points1[i] - points2[j])**2 / (np.abs(i - j)+1)
78+
# if i == j:
79+
# break
80+
#
81+
# return np.sqrt(stat)
82+
83+
def earth_mover(points1, points2):
84+
xs1 = np.linspace(0,1,len(points1),
85+
endpoint=True)[np.array(np.isfinite(points1))]
86+
xs2 = np.linspace(0,1,len(points2),
87+
endpoint=True)[np.array(np.isfinite(points2))]
88+
points1 = points1[np.isfinite(points1)]
89+
points2 = points2[np.isfinite(points2)]
90+
return emd.emd(xs1, xs2,
91+
points1/np.sum(points1),
92+
points2/np.sum(points2))
93+
94+
startswith = lambda x: lambda y: y.startswith(x)
95+
96+
def earth_mover_multi(points1, points2):
97+
dist = 0.0
98+
embs = {col.split('sl')[0] for col in points1.index}
99+
sums = [[],[]]
100+
for emb in embs:
101+
dist += earth_mover(points1.select(startswith(emb))+1e-5,
102+
points2.select(startswith(emb))+1e-5)**2
103+
sums[0].append(points1.select(startswith(emb)).mean())
104+
sums[1].append(points2.select(startswith(emb)).mean())
105+
dist += earth_mover(np.array(sums[0]), np.array(sums[1]))
106+
return dist**.5
107+
108+
def mp_earth_mover(args):
109+
i, j = args
110+
return earth_mover(i, j)
111+
112+
def mp_earth_mover_multi(args):
113+
i, j = args
114+
return earth_mover_multi(i, j)
115+
116+
import progressbar as pb
117+
def pdist(X, metric, p=2, w=None, V=None, VI=None):
118+
X = np.asarray(X, order='c')
119+
120+
121+
s = X.shape
122+
if len(s) != 2:
123+
raise ValueError('A 2-dimensional array must be passed.')
124+
125+
m, n = s
126+
dm = np.zeros((m * (m - 1) / 2,), dtype=np.double)
127+
128+
129+
130+
k = 0
131+
prog = pb.ProgressBar(widgets=['calculating distances', pb.Bar(),
132+
pb.Percentage(), pb.ETA()])
133+
for i in prog(range(0, m - 1)):
134+
for j in range(i + 1, m):
135+
dm[k] = metric(X[i], X[j])
136+
k = k + 1
137+
return dm
138+
139+
def mp_mapped(args):
140+
manager, X, i, j = args
141+
metric = manager.get_metric()
142+
return metric(X[i], X[j])
143+
144+
def mp_pdist(X, metric, p=2, w=None, V=None, VI=None):
145+
import multiprocessing
146+
from multiprocessing.managers import BaseManager
147+
X = np.asarray(X, order='c')
148+
149+
150+
s = X.shape
151+
if len(s) != 2:
152+
raise ValueError('A 2-dimensional array must be passed.')
153+
154+
m, n = s
155+
dm = np.zeros((m * (m - 1) / 2,), dtype=np.double)
156+
157+
pool = multiprocessing.Pool(10)
158+
func = globals()["mp_"+metric.__name__]
159+
160+
k = 0
161+
prog = pb.ProgressBar(widgets=['calculating distances', pb.Bar(),
162+
pb.Percentage(), pb.ETA()])
163+
for i in prog(range(0, m - 1)):
164+
ks = np.arange(k, k + m - i - 1)
165+
inputs = [(X[i], X[j]) for j in range(i+1, m)]
166+
dm[ks] = pool.map(func, inputs)
167+
k = ks[-1] + 1
168+
return dm
169+
170+
def mp_pandas_pdist(X, metric, p=2, w=None, V=None, VI=None):
171+
import multiprocessing
172+
173+
s = X.shape
174+
if len(s) != 2:
175+
raise ValueError('A 2-dimensional array must be passed.')
176+
177+
m, n = s
178+
dm = np.zeros((m * (m - 1) / 2,), dtype=np.double)
179+
180+
pool = multiprocessing.Pool()
181+
if metric.__name__.endswith('multi'):
182+
func = globals()["mp_"+metric.__name__]
183+
else:
184+
func = globals()["mp_"+metric.__name__+"_multi"]
185+
186+
k = 0
187+
prog = pb.ProgressBar(widgets=['calculating distances', pb.Bar(),
188+
pb.Percentage(), pb.ETA()])
189+
for i in prog(range(0, m - 1)):
190+
ks = np.arange(k, k + m - i - 1)
191+
inputs = [(X.ix[i], X.ix[j]) for j in range(i+1, m)]
192+
dm[ks] = pool.map(func, inputs)
193+
k = ks[-1] + 1
194+
return dm
195+
196+
def pandas_pdist(X, metric, p=2, w=None, V=None, VI=None):
197+
s = X.shape
198+
if len(s) != 2:
199+
raise ValueError('A 2-dimensional array must be passed.')
200+
201+
m, n = s
202+
dm = np.zeros((m * (m - 1) / 2,), dtype=np.double)
203+
204+
if metric.__name__.endswith('multi'):
205+
func = globals()["mp_"+metric.__name__]
206+
else:
207+
func = globals()["mp_"+metric.__name__+"_multi"]
208+
209+
k = 0
210+
prog = pb.ProgressBar(widgets=['calculating distances', pb.Bar(),
211+
pb.Percentage(), pb.ETA()])
212+
for i in prog(range(0, m - 1)):
213+
ks = np.arange(k, k + m - i - 1)
214+
inputs = [(X.ix[i], X.ix[j]) for j in range(i+1, m)]
215+
print(len(inputs))
216+
print(ks)
217+
dm[ks] = map(func, inputs)
218+
k = ks[-1] + 1
219+
return dm
220+
221+
222+
if __name__ == "__main__":
223+
import pandas as pd
224+
import matplotlib.pyplot as mpl
225+
226+
zld_exp = pd.read_table('analysis/summary.tsv', index_col=0).sort_index()
227+
wt_exp = pd.read_table('prereqs/WT5.53_summary.tsv', index_col=0).sort_index()
228+
229+
zld_bind = pd.read_table('journal.pgen.1002266.s005.xls', skiprows=1)
230+
zld_bind.TSS_gene = zld_bind.TSS_gene.apply(str.strip)
231+
by_gene = zld_bind.groupby('TSS_gene')
232+
233+
234+
types = {'Intergenic':'N', 'Intronic':'I', 'Promoter':'P', 'UTR5':'5',
235+
'CDS':'C', 'UTR3':'3'}
236+
237+
zld_comp = zld_exp.select(lambda x: 'cyc14A' in x, axis=1)
238+
wt_comp = wt_exp.select(lambda x: 'cyc14A' in x, axis=1)
239+
240+
diff_col = pd.Series(index=zld_comp.index)
241+
for gene in wt_exp.index:
242+
assert gene in zld_exp.index
243+
diff_col[gene] = diff_stat(zld_comp.ix[gene], wt_comp.ix[gene])
244+
245+
zld_exp['diff_col'] = diff_col
246+
wt_exp['diff_col'] = diff_col
247+
248+
zld_exp.sort(column='diff_col', ascending=False, inplace=True)
249+
wt_exp.sort(column='diff_col', ascending=False, inplace=True)
250+
251+
zld_fig_genes = zld_exp.select(lambda x: '14A' in x or '11' in x, axis=1)
252+
wt_fig_genes = wt_exp.select(lambda x: '14A' in x or '11' in x, axis=1)
253+
254+
zld_fig_genes = zld_fig_genes[wt_fig_genes.max(axis=1) > 10][:120]
255+
wt_fig_genes = wt_fig_genes[wt_fig_genes.max(axis=1) > 10][:120]
256+
257+
assert (zld_fig_genes.index == wt_fig_genes.index).all()
258+
import PlotUtils
259+
260+
PlotUtils.svg_heatmap((wt_fig_genes, zld_fig_genes),
261+
'analysis/results/cyc13diff.svg',
262+
norm_rows_by=wt_fig_genes.max(axis=1),
263+
draw_row_labels=True,
264+
cmap = (mpl.cm.Blues, mpl.cm.Reds),
265+
box_size=15, total_width=150)

0 commit comments

Comments
 (0)