|
| 1 | +import numpy as np |
| 2 | +from scipy import interpolate |
| 3 | + |
| 4 | +import collections |
| 5 | +import functools |
| 6 | +import emd |
| 7 | + |
| 8 | +class memoized(object): |
| 9 | + '''Decorator. Caches a function's return value each time it is called. |
| 10 | + If called later with the same arguments, the cached value is returned |
| 11 | + (not reevaluated). |
| 12 | + ''' |
| 13 | + def __init__(self, func): |
| 14 | + self.func = func |
| 15 | + self.cache = {} |
| 16 | + def __call__(self, *args): |
| 17 | + if not isinstance(args, collections.Hashable): |
| 18 | + # uncacheable. a list, for instance. |
| 19 | + # better to not cache than blow up. |
| 20 | + return self.func(*args) |
| 21 | + if tuple(*args) in self.cache: |
| 22 | + return self.cache[tuple(*args)] |
| 23 | + else: |
| 24 | + value = self.func(*args) |
| 25 | + self.cache[tuple(*args)] = value |
| 26 | + return value |
| 27 | + def __repr__(self): |
| 28 | + '''Return the function's docstring.''' |
| 29 | + return self.func.__doc__ |
| 30 | + def __get__(self, obj, objtype): |
| 31 | + '''Support instance methods.''' |
| 32 | + return functools.partial(self.__call__, obj) |
| 33 | + |
| 34 | +@memoized |
| 35 | +def convert_to_distribution(points): |
| 36 | + """Converts the given data points to a smoothed distribution from 0-100% |
| 37 | +
|
| 38 | + """ |
| 39 | + |
| 40 | + x = np.linspace(0,1, len(points), endpoint=True) |
| 41 | + f = interpolate.interp1d(x, points, kind='cubic') |
| 42 | + retval = np.cumsum(f(np.linspace(0, 1, 30, endpoint=True)).clip(0,1e5)) |
| 43 | + return retval / (sum(retval)+1e-10) |
| 44 | + |
| 45 | +def diff_stat(points1, points2): |
| 46 | + |
| 47 | + dist1 = convert_to_distribution(points1) |
| 48 | + dist2 = convert_to_distribution(points2) |
| 49 | + |
| 50 | + |
| 51 | + normfac = np.log(max(max(points1), max(points2)) + 1) |
| 52 | + |
| 53 | + return np.max(np.abs(dist1 - dist2)) * normfac |
| 54 | + |
| 55 | +divmat = np.zeros([0,0]) |
| 56 | + |
| 57 | + |
| 58 | +def tang_stat(points1, points2): |
| 59 | + assert len(points1) == len(points2) |
| 60 | + points1 = np.array(points1 / np.mean(points1)) |
| 61 | + points2 = np.array(points2 / np.mean(points2)) |
| 62 | + |
| 63 | + va = np.reshape(np.repeat(points1, len(points2)), (len(points2), -1), |
| 64 | + order='C') |
| 65 | + vb = np.reshape(np.repeat(points2, len(points1)), (-1, len(points2)), |
| 66 | + order='F') |
| 67 | + |
| 68 | + global divmat |
| 69 | + if np.shape(divmat) != (len(points1), len(points2)): |
| 70 | + x, y = np.mgrid[0:len(points1), 0:len(points2)] |
| 71 | + divmat = 1/(np.abs(x - y) + 1) |
| 72 | + return np.sqrt(np.sum(np.triu((va - vb)**2 * divmat))) |
| 73 | + |
| 74 | +# stat = 0 |
| 75 | +# for i in range(len(points1)): |
| 76 | +# for j in range(len(points2)): |
| 77 | +# stat += (points1[i] - points2[j])**2 / (np.abs(i - j)+1) |
| 78 | +# if i == j: |
| 79 | +# break |
| 80 | +# |
| 81 | +# return np.sqrt(stat) |
| 82 | + |
| 83 | +def earth_mover(points1, points2): |
| 84 | + xs1 = np.linspace(0,1,len(points1), |
| 85 | + endpoint=True)[np.array(np.isfinite(points1))] |
| 86 | + xs2 = np.linspace(0,1,len(points2), |
| 87 | + endpoint=True)[np.array(np.isfinite(points2))] |
| 88 | + points1 = points1[np.isfinite(points1)] |
| 89 | + points2 = points2[np.isfinite(points2)] |
| 90 | + return emd.emd(xs1, xs2, |
| 91 | + points1/np.sum(points1), |
| 92 | + points2/np.sum(points2)) |
| 93 | + |
| 94 | +startswith = lambda x: lambda y: y.startswith(x) |
| 95 | + |
| 96 | +def earth_mover_multi(points1, points2): |
| 97 | + dist = 0.0 |
| 98 | + embs = {col.split('sl')[0] for col in points1.index} |
| 99 | + sums = [[],[]] |
| 100 | + for emb in embs: |
| 101 | + dist += earth_mover(points1.select(startswith(emb))+1e-5, |
| 102 | + points2.select(startswith(emb))+1e-5)**2 |
| 103 | + sums[0].append(points1.select(startswith(emb)).mean()) |
| 104 | + sums[1].append(points2.select(startswith(emb)).mean()) |
| 105 | + dist += earth_mover(np.array(sums[0]), np.array(sums[1])) |
| 106 | + return dist**.5 |
| 107 | + |
| 108 | +def mp_earth_mover(args): |
| 109 | + i, j = args |
| 110 | + return earth_mover(i, j) |
| 111 | + |
| 112 | +def mp_earth_mover_multi(args): |
| 113 | + i, j = args |
| 114 | + return earth_mover_multi(i, j) |
| 115 | + |
| 116 | +import progressbar as pb |
| 117 | +def pdist(X, metric, p=2, w=None, V=None, VI=None): |
| 118 | + X = np.asarray(X, order='c') |
| 119 | + |
| 120 | + |
| 121 | + s = X.shape |
| 122 | + if len(s) != 2: |
| 123 | + raise ValueError('A 2-dimensional array must be passed.') |
| 124 | + |
| 125 | + m, n = s |
| 126 | + dm = np.zeros((m * (m - 1) / 2,), dtype=np.double) |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | + k = 0 |
| 131 | + prog = pb.ProgressBar(widgets=['calculating distances', pb.Bar(), |
| 132 | + pb.Percentage(), pb.ETA()]) |
| 133 | + for i in prog(range(0, m - 1)): |
| 134 | + for j in range(i + 1, m): |
| 135 | + dm[k] = metric(X[i], X[j]) |
| 136 | + k = k + 1 |
| 137 | + return dm |
| 138 | + |
| 139 | +def mp_mapped(args): |
| 140 | + manager, X, i, j = args |
| 141 | + metric = manager.get_metric() |
| 142 | + return metric(X[i], X[j]) |
| 143 | + |
| 144 | +def mp_pdist(X, metric, p=2, w=None, V=None, VI=None): |
| 145 | + import multiprocessing |
| 146 | + from multiprocessing.managers import BaseManager |
| 147 | + X = np.asarray(X, order='c') |
| 148 | + |
| 149 | + |
| 150 | + s = X.shape |
| 151 | + if len(s) != 2: |
| 152 | + raise ValueError('A 2-dimensional array must be passed.') |
| 153 | + |
| 154 | + m, n = s |
| 155 | + dm = np.zeros((m * (m - 1) / 2,), dtype=np.double) |
| 156 | + |
| 157 | + pool = multiprocessing.Pool(10) |
| 158 | + func = globals()["mp_"+metric.__name__] |
| 159 | + |
| 160 | + k = 0 |
| 161 | + prog = pb.ProgressBar(widgets=['calculating distances', pb.Bar(), |
| 162 | + pb.Percentage(), pb.ETA()]) |
| 163 | + for i in prog(range(0, m - 1)): |
| 164 | + ks = np.arange(k, k + m - i - 1) |
| 165 | + inputs = [(X[i], X[j]) for j in range(i+1, m)] |
| 166 | + dm[ks] = pool.map(func, inputs) |
| 167 | + k = ks[-1] + 1 |
| 168 | + return dm |
| 169 | + |
| 170 | +def mp_pandas_pdist(X, metric, p=2, w=None, V=None, VI=None): |
| 171 | + import multiprocessing |
| 172 | + |
| 173 | + s = X.shape |
| 174 | + if len(s) != 2: |
| 175 | + raise ValueError('A 2-dimensional array must be passed.') |
| 176 | + |
| 177 | + m, n = s |
| 178 | + dm = np.zeros((m * (m - 1) / 2,), dtype=np.double) |
| 179 | + |
| 180 | + pool = multiprocessing.Pool() |
| 181 | + if metric.__name__.endswith('multi'): |
| 182 | + func = globals()["mp_"+metric.__name__] |
| 183 | + else: |
| 184 | + func = globals()["mp_"+metric.__name__+"_multi"] |
| 185 | + |
| 186 | + k = 0 |
| 187 | + prog = pb.ProgressBar(widgets=['calculating distances', pb.Bar(), |
| 188 | + pb.Percentage(), pb.ETA()]) |
| 189 | + for i in prog(range(0, m - 1)): |
| 190 | + ks = np.arange(k, k + m - i - 1) |
| 191 | + inputs = [(X.ix[i], X.ix[j]) for j in range(i+1, m)] |
| 192 | + dm[ks] = pool.map(func, inputs) |
| 193 | + k = ks[-1] + 1 |
| 194 | + return dm |
| 195 | + |
| 196 | +def pandas_pdist(X, metric, p=2, w=None, V=None, VI=None): |
| 197 | + s = X.shape |
| 198 | + if len(s) != 2: |
| 199 | + raise ValueError('A 2-dimensional array must be passed.') |
| 200 | + |
| 201 | + m, n = s |
| 202 | + dm = np.zeros((m * (m - 1) / 2,), dtype=np.double) |
| 203 | + |
| 204 | + if metric.__name__.endswith('multi'): |
| 205 | + func = globals()["mp_"+metric.__name__] |
| 206 | + else: |
| 207 | + func = globals()["mp_"+metric.__name__+"_multi"] |
| 208 | + |
| 209 | + k = 0 |
| 210 | + prog = pb.ProgressBar(widgets=['calculating distances', pb.Bar(), |
| 211 | + pb.Percentage(), pb.ETA()]) |
| 212 | + for i in prog(range(0, m - 1)): |
| 213 | + ks = np.arange(k, k + m - i - 1) |
| 214 | + inputs = [(X.ix[i], X.ix[j]) for j in range(i+1, m)] |
| 215 | + print(len(inputs)) |
| 216 | + print(ks) |
| 217 | + dm[ks] = map(func, inputs) |
| 218 | + k = ks[-1] + 1 |
| 219 | + return dm |
| 220 | + |
| 221 | + |
| 222 | +if __name__ == "__main__": |
| 223 | + import pandas as pd |
| 224 | + import matplotlib.pyplot as mpl |
| 225 | + |
| 226 | + zld_exp = pd.read_table('analysis/summary.tsv', index_col=0).sort_index() |
| 227 | + wt_exp = pd.read_table('prereqs/WT5.53_summary.tsv', index_col=0).sort_index() |
| 228 | + |
| 229 | + zld_bind = pd.read_table('journal.pgen.1002266.s005.xls', skiprows=1) |
| 230 | + zld_bind.TSS_gene = zld_bind.TSS_gene.apply(str.strip) |
| 231 | + by_gene = zld_bind.groupby('TSS_gene') |
| 232 | + |
| 233 | + |
| 234 | + types = {'Intergenic':'N', 'Intronic':'I', 'Promoter':'P', 'UTR5':'5', |
| 235 | + 'CDS':'C', 'UTR3':'3'} |
| 236 | + |
| 237 | + zld_comp = zld_exp.select(lambda x: 'cyc14A' in x, axis=1) |
| 238 | + wt_comp = wt_exp.select(lambda x: 'cyc14A' in x, axis=1) |
| 239 | + |
| 240 | + diff_col = pd.Series(index=zld_comp.index) |
| 241 | + for gene in wt_exp.index: |
| 242 | + assert gene in zld_exp.index |
| 243 | + diff_col[gene] = diff_stat(zld_comp.ix[gene], wt_comp.ix[gene]) |
| 244 | + |
| 245 | + zld_exp['diff_col'] = diff_col |
| 246 | + wt_exp['diff_col'] = diff_col |
| 247 | + |
| 248 | + zld_exp.sort(column='diff_col', ascending=False, inplace=True) |
| 249 | + wt_exp.sort(column='diff_col', ascending=False, inplace=True) |
| 250 | + |
| 251 | + zld_fig_genes = zld_exp.select(lambda x: '14A' in x or '11' in x, axis=1) |
| 252 | + wt_fig_genes = wt_exp.select(lambda x: '14A' in x or '11' in x, axis=1) |
| 253 | + |
| 254 | + zld_fig_genes = zld_fig_genes[wt_fig_genes.max(axis=1) > 10][:120] |
| 255 | + wt_fig_genes = wt_fig_genes[wt_fig_genes.max(axis=1) > 10][:120] |
| 256 | + |
| 257 | + assert (zld_fig_genes.index == wt_fig_genes.index).all() |
| 258 | + import PlotUtils |
| 259 | + |
| 260 | + PlotUtils.svg_heatmap((wt_fig_genes, zld_fig_genes), |
| 261 | + 'analysis/results/cyc13diff.svg', |
| 262 | + norm_rows_by=wt_fig_genes.max(axis=1), |
| 263 | + draw_row_labels=True, |
| 264 | + cmap = (mpl.cm.Blues, mpl.cm.Reds), |
| 265 | + box_size=15, total_width=150) |
0 commit comments