Skip to content

Commit ba2be42

Browse files
Cross score sandbox fixes (#497)
* Make cross_score work via CLI and small style fixes (#474) * Cross-score fix of edge case fit bin-wise annotation of cross-scores * Black and parameter naming issue --------- Co-authored-by: Aleksandr <r1kanat@yandex.ru>
1 parent 41340d4 commit ba2be42

1 file changed

Lines changed: 129 additions & 127 deletions

File tree

cooltools/sandbox/cross_score.py

Lines changed: 129 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -11,58 +11,58 @@
1111

1212
import pandas as pd
1313

14-
1514
parser = argparse.ArgumentParser(
1615
description="""Calculate distance-dependent contact marginals of a Hi-C map.
17-
16+
1817
These contact marginals are analogous to per-bin contact-frequency-vs-distance
1918
curves, but over a small number of distance bins. Such marginals are useful to
2019
reveal genomic sites participating in frequent long-distance contacts, e.g.
2120
anchors of loops and stripes.
22-
""")
23-
24-
parser.add_argument('COOL_URI', metavar='COOL_URI',
25-
type=str,
26-
help='input cooler URI')
21+
"""
22+
)
2723

28-
parser.add_argument('--distbins',
29-
default='1e3,3e4,1e6,3e7',
30-
help='a comma-separated list of distance bins')
24+
parser.add_argument("COOL_URI", metavar="COOL_URI", type=str, help="input cooler URI")
3125

32-
parser.add_argument('--clr-weight-name',
33-
type=str,
34-
default='weight',
35-
help='Name of the column to use for data balancing')
26+
parser.add_argument(
27+
"--dist-bins",
28+
default="1e3,3e4,1e6,3e7",
29+
help="a comma-separated list of distance bins",
30+
)
3631

37-
parser.add_argument('--ignore-diags',
38-
type=int,
39-
default=2,
40-
help='How many diagonals to ignore')
32+
parser.add_argument(
33+
"--clr-weight-name",
34+
type=str,
35+
default="weight",
36+
help="Name of the column to use for data balancing",
37+
)
4138

42-
parser.add_argument('--outfolder',
43-
type=str,
44-
default='./',
45-
help='The output folder')
39+
parser.add_argument(
40+
"--ignore-diags", type=int, default=2, help="How many diagonals to ignore"
41+
)
4642

47-
parser.add_argument('--prefix',
48-
type=str,
49-
default=None,
50-
help='The prefix for output files')
43+
parser.add_argument("--outfolder", type=str, default="./", help="The output folder")
5144

52-
parser.add_argument('--format',
53-
type=str,
54-
default='bigwig',
55-
help='Comma-separated list of the output formats. Possible values: bigwig, bedgraph.',)
45+
parser.add_argument(
46+
"--prefix", type=str, default=None, help="The prefix for output files"
47+
)
5648

57-
parser.add_argument('--nproc',
58-
type=int,
59-
default=None,
60-
help='The number of processes. By default, use all available cores.')
49+
parser.add_argument(
50+
"--format",
51+
type=str,
52+
default="bigwig",
53+
help="Comma-separated list of the output formats. Possible values: bigwig, bedgraph.",
54+
)
6155

62-
parser.add_argument('--chunksize',
63-
default=1e6,
64-
help='The number of pixels per processed chunk.')
56+
parser.add_argument(
57+
"--nproc",
58+
type=int,
59+
default=None,
60+
help="The number of processes. By default, use all available cores.",
61+
)
6562

63+
parser.add_argument(
64+
"--chunksize", default=1e6, help="The number of pixels per processed chunk."
65+
)
6666

6767

6868
def get_dist_margs(clr_path, lo, hi, dist_bins, weight_name, ignore_diags):
@@ -72,125 +72,127 @@ def get_dist_margs(clr_path, lo, hi, dist_bins, weight_name, ignore_diags):
7272
chunk = clr.pixels()[lo:hi]
7373
res = clr.binsize
7474

75-
chunk = chunk[chunk['bin2_id']-chunk['bin1_id']>=ignore_diags]
75+
chunk = chunk[chunk["bin2_id"] - chunk["bin1_id"] >= ignore_diags]
7676

7777
chunk = cooler.annotate(chunk, bins)
78-
chunk['balanced'] = np.nan_to_num(chunk['count'] * chunk[f'{weight_name}1'] * chunk[f'{weight_name}2'])
78+
chunk["balanced"] = np.nan_to_num(
79+
chunk["count"] * chunk[f"{weight_name}1"] * chunk[f"{weight_name}2"]
80+
)
7981
chunk = chunk[chunk.chrom1 == chunk.chrom2]
8082

81-
del(clr)
83+
del clr
8284

8385
return _dist_margs(chunk, dist_bins, res)
8486

8587

86-
def _dist_margs(chunk, dist_bins, res):
87-
min_bin_id = chunk['bin1_id'].values[0]
88+
def _dist_margs(chunk, dist_bins, res):
89+
min_bin_id = chunk["bin1_id"].values[0]
8890

89-
dists = (chunk['bin2_id'] - chunk['bin1_id']) * res
90-
dist_bin_id = np.searchsorted(dist_bins, dists, 'right')
91+
dists = (chunk["bin2_id"] - chunk["bin1_id"]) * res
92+
dist_bin_id = np.searchsorted(dist_bins, dists, "right")
9193
n_dist_bins = len(dist_bins)
9294
margs_down_loc = np.bincount(
93-
(chunk['bin1_id'].values - min_bin_id) * n_dist_bins + dist_bin_id,
94-
weights=np.nan_to_num(chunk['balanced'].values)
95-
)
95+
(chunk["bin1_id"].values - min_bin_id) * n_dist_bins + dist_bin_id,
96+
weights=np.nan_to_num(chunk["balanced"].values),
97+
)
9698

9799
margs_up_loc = np.bincount(
98-
(chunk['bin2_id'].values - min_bin_id) * n_dist_bins + dist_bin_id,
99-
weights=np.nan_to_num(chunk['balanced'].values)
100-
)
100+
(chunk["bin2_id"].values - min_bin_id) * n_dist_bins + dist_bin_id,
101+
weights=np.nan_to_num(chunk["balanced"].values),
102+
)
101103

102104
return min_bin_id, margs_down_loc, margs_up_loc
103105

104106

105-
106107
def drop_resolution(clrname):
107-
name_parts = clrname.split('.')
108-
if name_parts[-1] in ['cool', 'mcool']:
108+
name_parts = clrname.split(".")
109+
if name_parts[-1] in ["cool", "mcool"]:
109110
name_parts = name_parts[:-1]
110111
if name_parts[-1].isnumeric():
111112
name_parts = name_parts[:-1]
112-
return '.'.join(name_parts)
113+
return ".".join(name_parts)
113114

114115

115116
args = parser.parse_args()
116117

117118
logging.basicConfig(level=logging.INFO)
119+
if __name__ == "__main__":
120+
mp.freeze_support()
121+
chunksize = int(float(args.chunksize))
122+
clr = cooler.Cooler(args.COOL_URI)
123+
bins = clr.bins()[:]
124+
n_pixels = clr.pixels().shape[0]
125+
dist_bins = np.array([int(float(i)) for i in args.dist_bins.split(",")])
126+
weight_name = args.clr_weight_name
127+
ignore_diags = args.ignore_diags
128+
formats = args.format.split(",")
118129

119-
chunksize = int(float(args.chunksize))
120-
clr = cooler.Cooler(args.COOL_URI)
121-
bins = clr.bins()[:]
122-
n_pixels = clr.pixels().shape[0]
123-
dist_bins = np.array([int(float(i)) for i in args.distbins.split(',')])
124-
weight_name = args.clr_weight_name
125-
ignore_diags = args.ignore_diags
126-
formats = args.format.split(',')
130+
n_dist_bins = len(dist_bins)
127131

128-
n_dist_bins = len(dist_bins)
132+
chunk_spans = np.r_[np.arange(0, n_pixels, chunksize), n_pixels]
129133

130-
chunk_spans = np.r_[np.arange(0, n_pixels, chunksize), n_pixels]
134+
nproc = mp.cpu_count() if args.nproc is None else args.nproc
131135

132-
nproc = mp.cpu_count() if args.nproc is None else args.nproc
136+
logging.info(f"Calculating marginals for {args.COOL_URI} using {nproc} processes")
137+
with mp.Pool(nproc) as pool:
138+
out = pool.starmap(
139+
get_dist_margs,
140+
[
141+
(args.COOL_URI, lo, hi, dist_bins, weight_name, ignore_diags)
142+
for lo, hi in zip(chunk_spans[:-1], chunk_spans[1:])
143+
],
144+
)
133145

134-
logging.info(f'Calculating marginals for {args.COOL_URI} using {nproc} processes')
135-
with mp.Pool(nproc) as pool:
136-
out = pool.starmap(get_dist_margs,
137-
[
138-
(args.COOL_URI, lo, hi, dist_bins, weight_name, ignore_diags)
139-
for lo, hi in zip(chunk_spans[:-1], chunk_spans[1:])
140-
]
141-
)
146+
margs_up = np.zeros(len(bins) * n_dist_bins + 1)
147+
margs_down = np.zeros(len(bins) * n_dist_bins + 1)
148+
149+
for min_bin_id, margs_down_loc, margs_up_loc in out:
150+
margs_down[
151+
min_bin_id * n_dist_bins : min_bin_id * n_dist_bins + len(margs_down_loc)
152+
] += margs_down_loc
153+
154+
margs_up[
155+
min_bin_id * n_dist_bins : min_bin_id * n_dist_bins + len(margs_up_loc)
156+
] += margs_up_loc
157+
158+
margs_down = margs_down[:-1].reshape((len(bins), n_dist_bins)).T
159+
margs_up = margs_up[:-1].reshape((len(bins), n_dist_bins)).T
160+
161+
out_folder = pathlib.Path(args.outfolder)
162+
clr_name = pathlib.Path(args.COOL_URI.split(":")[0]).name
163+
clr_name = drop_resolution(clr_name)
164+
165+
prefix = clr_name if args.prefix is None else args.prefix
166+
res = clr.binsize
142167

143-
margs_up = np.zeros(len(bins) * n_dist_bins)
144-
margs_down = np.zeros(len(bins) * n_dist_bins)
145-
146-
for min_bin_id, margs_down_loc, margs_up_loc in out:
147-
margs_down[
148-
min_bin_id * n_dist_bins:
149-
min_bin_id * n_dist_bins + len(margs_down_loc)
150-
] += margs_down_loc
151-
152-
margs_up[
153-
min_bin_id * n_dist_bins:
154-
min_bin_id * n_dist_bins + len(margs_up_loc)
155-
] += margs_up_loc
156-
157-
158-
margs_down = margs_down.reshape((len(bins), n_dist_bins)).T
159-
margs_up = margs_up.reshape((len(bins), n_dist_bins)).T
160-
161-
162-
out_folder = pathlib.Path(args.outfolder)
163-
clr_name = pathlib.Path(args.COOL_URI.split(':')[0]).name
164-
clr_name = drop_resolution(clr_name)
165-
166-
prefix = clr_name if args.prefix is None else args.prefix
167-
res = clr.binsize
168-
169-
for dist_bin_id in range(n_dist_bins):
170-
lo = np.r_[0, dist_bins][dist_bin_id]
171-
hi = np.r_[0, dist_bins][dist_bin_id+1]
172-
173-
for dir_str, margs in [('up', margs_up), ('down', margs_down), ('both', margs_up+margs_down)]:
174-
out_df = bins[['chrom', 'start', 'end']].copy()
175-
out_df['marg'] = margs[dist_bin_id]
176-
out_df['marg'] = out_df['marg'].mask(bins[weight_name].isnull(), np.nan)
177-
178-
if 'bigwig' in formats:
179-
file_name = f'{prefix}.{res}.cross.{dir_str}.{lo}-{hi}.bw'
180-
logging.info(f'Write output into {file_name}')
181-
bioframe.io.to_bigwig(
182-
out_df,
183-
chromsizes=clr.chromsizes,
184-
outpath=str(out_folder / file_name))
185-
186-
if 'bedgraph' in formats:
187-
file_name = f'{prefix}.{res}.cross.{dir_str}.{lo}-{hi}.bg.gz'
188-
logging.info(f'Write output into {file_name}')
189-
out_df.to_csv(
190-
str(out_folder / file_name),
191-
sep='\t',
192-
index=False,
193-
header=False,
194-
)
195-
196-
168+
for dist_bin_id in range(n_dist_bins):
169+
lo = np.r_[0, dist_bins][dist_bin_id]
170+
hi = np.r_[0, dist_bins][dist_bin_id + 1]
171+
172+
for dir_str, margs in [
173+
("up", margs_up),
174+
("down", margs_down),
175+
("both", margs_up + margs_down),
176+
]:
177+
out_df = bins[["chrom", "start", "end"]].copy()
178+
out_df["marg"] = margs[dist_bin_id]
179+
out_df["marg"] = out_df["marg"].mask(bins[weight_name].isnull(), np.nan)
180+
181+
if "bigwig" in formats:
182+
file_name = f"{prefix}.{res}.cross.{dir_str}.{lo}-{hi}.bw"
183+
logging.info(f"Write output into {file_name}")
184+
bioframe.io.to_bigwig(
185+
out_df,
186+
chromsizes=clr.chromsizes,
187+
outpath=str(out_folder / file_name),
188+
)
189+
190+
if "bedgraph" in formats:
191+
file_name = f"{prefix}.{res}.cross.{dir_str}.{lo}-{hi}.bg.gz"
192+
logging.info(f"Write output into {file_name}")
193+
out_df.to_csv(
194+
str(out_folder / file_name),
195+
sep="\t",
196+
index=False,
197+
header=False,
198+
)

0 commit comments

Comments
 (0)