1111
1212import pandas as pd
1313
14-
1514parser = argparse .ArgumentParser (
1615 description = """Calculate distance-dependent contact marginals of a Hi-C map.
17-
16+
1817 These contact marginals are analogous to per-bin contact-frequency-vs-distance
1918 curves, but over a small number of distance bins. Such marginals are useful to
2019 reveal genomic sites participating in frequent long-distance contacts, e.g.
2120 anchors of loops and stripes.
22- """ )
23-
24- parser .add_argument ('COOL_URI' , metavar = 'COOL_URI' ,
25- type = str ,
26- help = 'input cooler URI' )
21+ """
22+ )
2723
28- parser .add_argument ('--distbins' ,
29- default = '1e3,3e4,1e6,3e7' ,
30- help = 'a comma-separated list of distance bins' )
24+ parser .add_argument ("COOL_URI" , metavar = "COOL_URI" , type = str , help = "input cooler URI" )
3125
32- parser .add_argument ('--clr-weight-name' ,
33- type = str ,
34- default = 'weight' ,
35- help = 'Name of the column to use for data balancing' )
26+ parser .add_argument (
27+ "--dist-bins" ,
28+ default = "1e3,3e4,1e6,3e7" ,
29+ help = "a comma-separated list of distance bins" ,
30+ )
3631
37- parser .add_argument ('--ignore-diags' ,
38- type = int ,
39- default = 2 ,
40- help = 'How many diagonals to ignore' )
32+ parser .add_argument (
33+ "--clr-weight-name" ,
34+ type = str ,
35+ default = "weight" ,
36+ help = "Name of the column to use for data balancing" ,
37+ )
4138
42- parser .add_argument ('--outfolder' ,
43- type = str ,
44- default = './' ,
45- help = 'The output folder' )
39+ parser .add_argument (
40+ "--ignore-diags" , type = int , default = 2 , help = "How many diagonals to ignore"
41+ )
4642
47- parser .add_argument ('--prefix' ,
48- type = str ,
49- default = None ,
50- help = 'The prefix for output files' )
43+ parser .add_argument ("--outfolder" , type = str , default = "./" , help = "The output folder" )
5144
52- parser .add_argument ('--format' ,
53- type = str ,
54- default = 'bigwig' ,
55- help = 'Comma-separated list of the output formats. Possible values: bigwig, bedgraph.' ,)
45+ parser .add_argument (
46+ "--prefix" , type = str , default = None , help = "The prefix for output files"
47+ )
5648
57- parser .add_argument ('--nproc' ,
58- type = int ,
59- default = None ,
60- help = 'The number of processes. By default, use all available cores.' )
49+ parser .add_argument (
50+ "--format" ,
51+ type = str ,
52+ default = "bigwig" ,
53+ help = "Comma-separated list of the output formats. Possible values: bigwig, bedgraph." ,
54+ )
6155
62- parser .add_argument ('--chunksize' ,
63- default = 1e6 ,
64- help = 'The number of pixels per processed chunk.' )
56+ parser .add_argument (
57+ "--nproc" ,
58+ type = int ,
59+ default = None ,
60+ help = "The number of processes. By default, use all available cores." ,
61+ )
6562
63+ parser .add_argument (
64+ "--chunksize" , default = 1e6 , help = "The number of pixels per processed chunk."
65+ )
6666
6767
6868def get_dist_margs (clr_path , lo , hi , dist_bins , weight_name , ignore_diags ):
@@ -72,125 +72,127 @@ def get_dist_margs(clr_path, lo, hi, dist_bins, weight_name, ignore_diags):
7272 chunk = clr .pixels ()[lo :hi ]
7373 res = clr .binsize
7474
75- chunk = chunk [chunk [' bin2_id' ] - chunk [' bin1_id' ] >= ignore_diags ]
75+ chunk = chunk [chunk [" bin2_id" ] - chunk [" bin1_id" ] >= ignore_diags ]
7676
7777 chunk = cooler .annotate (chunk , bins )
78- chunk ['balanced' ] = np .nan_to_num (chunk ['count' ] * chunk [f'{ weight_name } 1' ] * chunk [f'{ weight_name } 2' ])
78+ chunk ["balanced" ] = np .nan_to_num (
79+ chunk ["count" ] * chunk [f"{ weight_name } 1" ] * chunk [f"{ weight_name } 2" ]
80+ )
7981 chunk = chunk [chunk .chrom1 == chunk .chrom2 ]
8082
81- del ( clr )
83+ del clr
8284
8385 return _dist_margs (chunk , dist_bins , res )
8486
8587
86- def _dist_margs (chunk , dist_bins , res ):
87- min_bin_id = chunk [' bin1_id' ].values [0 ]
88+ def _dist_margs (chunk , dist_bins , res ):
89+ min_bin_id = chunk [" bin1_id" ].values [0 ]
8890
89- dists = (chunk [' bin2_id' ] - chunk [' bin1_id' ]) * res
90- dist_bin_id = np .searchsorted (dist_bins , dists , ' right' )
91+ dists = (chunk [" bin2_id" ] - chunk [" bin1_id" ]) * res
92+ dist_bin_id = np .searchsorted (dist_bins , dists , " right" )
9193 n_dist_bins = len (dist_bins )
9294 margs_down_loc = np .bincount (
93- (chunk [' bin1_id' ].values - min_bin_id ) * n_dist_bins + dist_bin_id ,
94- weights = np .nan_to_num (chunk [' balanced' ].values )
95- )
95+ (chunk [" bin1_id" ].values - min_bin_id ) * n_dist_bins + dist_bin_id ,
96+ weights = np .nan_to_num (chunk [" balanced" ].values ),
97+ )
9698
9799 margs_up_loc = np .bincount (
98- (chunk [' bin2_id' ].values - min_bin_id ) * n_dist_bins + dist_bin_id ,
99- weights = np .nan_to_num (chunk [' balanced' ].values )
100- )
100+ (chunk [" bin2_id" ].values - min_bin_id ) * n_dist_bins + dist_bin_id ,
101+ weights = np .nan_to_num (chunk [" balanced" ].values ),
102+ )
101103
102104 return min_bin_id , margs_down_loc , margs_up_loc
103105
104106
105-
106107def drop_resolution (clrname ):
107- name_parts = clrname .split ('.' )
108- if name_parts [- 1 ] in [' cool' , ' mcool' ]:
108+ name_parts = clrname .split ("." )
109+ if name_parts [- 1 ] in [" cool" , " mcool" ]:
109110 name_parts = name_parts [:- 1 ]
110111 if name_parts [- 1 ].isnumeric ():
111112 name_parts = name_parts [:- 1 ]
112- return '.' .join (name_parts )
113+ return "." .join (name_parts )
113114
114115
115116args = parser .parse_args ()
116117
117118logging .basicConfig (level = logging .INFO )
119+ if __name__ == "__main__" :
120+ mp .freeze_support ()
121+ chunksize = int (float (args .chunksize ))
122+ clr = cooler .Cooler (args .COOL_URI )
123+ bins = clr .bins ()[:]
124+ n_pixels = clr .pixels ().shape [0 ]
125+ dist_bins = np .array ([int (float (i )) for i in args .dist_bins .split ("," )])
126+ weight_name = args .clr_weight_name
127+ ignore_diags = args .ignore_diags
128+ formats = args .format .split ("," )
118129
119- chunksize = int (float (args .chunksize ))
120- clr = cooler .Cooler (args .COOL_URI )
121- bins = clr .bins ()[:]
122- n_pixels = clr .pixels ().shape [0 ]
123- dist_bins = np .array ([int (float (i )) for i in args .distbins .split (',' )])
124- weight_name = args .clr_weight_name
125- ignore_diags = args .ignore_diags
126- formats = args .format .split (',' )
130+ n_dist_bins = len (dist_bins )
127131
128- n_dist_bins = len ( dist_bins )
132+ chunk_spans = np . r_ [ np . arange ( 0 , n_pixels , chunksize ), n_pixels ]
129133
130- chunk_spans = np . r_ [ np . arange ( 0 , n_pixels , chunksize ), n_pixels ]
134+ nproc = mp . cpu_count () if args . nproc is None else args . nproc
131135
132- nproc = mp .cpu_count () if args .nproc is None else args .nproc
136+ logging .info (f"Calculating marginals for { args .COOL_URI } using { nproc } processes" )
137+ with mp .Pool (nproc ) as pool :
138+ out = pool .starmap (
139+ get_dist_margs ,
140+ [
141+ (args .COOL_URI , lo , hi , dist_bins , weight_name , ignore_diags )
142+ for lo , hi in zip (chunk_spans [:- 1 ], chunk_spans [1 :])
143+ ],
144+ )
133145
134- logging .info (f'Calculating marginals for { args .COOL_URI } using { nproc } processes' )
135- with mp .Pool (nproc ) as pool :
136- out = pool .starmap (get_dist_margs ,
137- [
138- (args .COOL_URI , lo , hi , dist_bins , weight_name , ignore_diags )
139- for lo , hi in zip (chunk_spans [:- 1 ], chunk_spans [1 :])
140- ]
141- )
146+ margs_up = np .zeros (len (bins ) * n_dist_bins + 1 )
147+ margs_down = np .zeros (len (bins ) * n_dist_bins + 1 )
148+
149+ for min_bin_id , margs_down_loc , margs_up_loc in out :
150+ margs_down [
151+ min_bin_id * n_dist_bins : min_bin_id * n_dist_bins + len (margs_down_loc )
152+ ] += margs_down_loc
153+
154+ margs_up [
155+ min_bin_id * n_dist_bins : min_bin_id * n_dist_bins + len (margs_up_loc )
156+ ] += margs_up_loc
157+
158+ margs_down = margs_down [:- 1 ].reshape ((len (bins ), n_dist_bins )).T
159+ margs_up = margs_up [:- 1 ].reshape ((len (bins ), n_dist_bins )).T
160+
161+ out_folder = pathlib .Path (args .outfolder )
162+ clr_name = pathlib .Path (args .COOL_URI .split (":" )[0 ]).name
163+ clr_name = drop_resolution (clr_name )
164+
165+ prefix = clr_name if args .prefix is None else args .prefix
166+ res = clr .binsize
142167
143- margs_up = np .zeros (len (bins ) * n_dist_bins )
144- margs_down = np .zeros (len (bins ) * n_dist_bins )
145-
146- for min_bin_id , margs_down_loc , margs_up_loc in out :
147- margs_down [
148- min_bin_id * n_dist_bins :
149- min_bin_id * n_dist_bins + len (margs_down_loc )
150- ] += margs_down_loc
151-
152- margs_up [
153- min_bin_id * n_dist_bins :
154- min_bin_id * n_dist_bins + len (margs_up_loc )
155- ] += margs_up_loc
156-
157-
158- margs_down = margs_down .reshape ((len (bins ), n_dist_bins )).T
159- margs_up = margs_up .reshape ((len (bins ), n_dist_bins )).T
160-
161-
162- out_folder = pathlib .Path (args .outfolder )
163- clr_name = pathlib .Path (args .COOL_URI .split (':' )[0 ]).name
164- clr_name = drop_resolution (clr_name )
165-
166- prefix = clr_name if args .prefix is None else args .prefix
167- res = clr .binsize
168-
169- for dist_bin_id in range (n_dist_bins ):
170- lo = np .r_ [0 , dist_bins ][dist_bin_id ]
171- hi = np .r_ [0 , dist_bins ][dist_bin_id + 1 ]
172-
173- for dir_str , margs in [('up' , margs_up ), ('down' , margs_down ), ('both' , margs_up + margs_down )]:
174- out_df = bins [['chrom' , 'start' , 'end' ]].copy ()
175- out_df ['marg' ] = margs [dist_bin_id ]
176- out_df ['marg' ] = out_df ['marg' ].mask (bins [weight_name ].isnull (), np .nan )
177-
178- if 'bigwig' in formats :
179- file_name = f'{ prefix } .{ res } .cross.{ dir_str } .{ lo } -{ hi } .bw'
180- logging .info (f'Write output into { file_name } ' )
181- bioframe .io .to_bigwig (
182- out_df ,
183- chromsizes = clr .chromsizes ,
184- outpath = str (out_folder / file_name ))
185-
186- if 'bedgraph' in formats :
187- file_name = f'{ prefix } .{ res } .cross.{ dir_str } .{ lo } -{ hi } .bg.gz'
188- logging .info (f'Write output into { file_name } ' )
189- out_df .to_csv (
190- str (out_folder / file_name ),
191- sep = '\t ' ,
192- index = False ,
193- header = False ,
194- )
195-
196-
168+ for dist_bin_id in range (n_dist_bins ):
169+ lo = np .r_ [0 , dist_bins ][dist_bin_id ]
170+ hi = np .r_ [0 , dist_bins ][dist_bin_id + 1 ]
171+
172+ for dir_str , margs in [
173+ ("up" , margs_up ),
174+ ("down" , margs_down ),
175+ ("both" , margs_up + margs_down ),
176+ ]:
177+ out_df = bins [["chrom" , "start" , "end" ]].copy ()
178+ out_df ["marg" ] = margs [dist_bin_id ]
179+ out_df ["marg" ] = out_df ["marg" ].mask (bins [weight_name ].isnull (), np .nan )
180+
181+ if "bigwig" in formats :
182+ file_name = f"{ prefix } .{ res } .cross.{ dir_str } .{ lo } -{ hi } .bw"
183+ logging .info (f"Write output into { file_name } " )
184+ bioframe .io .to_bigwig (
185+ out_df ,
186+ chromsizes = clr .chromsizes ,
187+ outpath = str (out_folder / file_name ),
188+ )
189+
190+ if "bedgraph" in formats :
191+ file_name = f"{ prefix } .{ res } .cross.{ dir_str } .{ lo } -{ hi } .bg.gz"
192+ logging .info (f"Write output into { file_name } " )
193+ out_df .to_csv (
194+ str (out_folder / file_name ),
195+ sep = "\t " ,
196+ index = False ,
197+ header = False ,
198+ )
0 commit comments