55from progressbar import ProgressBar , ETA , Bar , Percentage
66from argparse import Namespace
77
8+
89class my_defaultdict (dict ):
910 def __init__ (self , default_factory , basename , other_args ):
1011 self .default_factory = default_factory
1112 self .basename = basename
1213 self .other_args = other_args
14+
1315 def __missing__ (self , key ):
1416 self [key ] = value = self .default_factory (self .basename % key ,
1517 ** self .other_args )
1618 return value
1719
20+
1821def get (read , tag ):
1922 return {tname .upper (): val for tname , val in read .tags }[tag .upper ()]
2023
24+
2125def get_nh (read ):
22- return {tag .upper (): val for tag ,val in read .tags }['NH' ]
26+ return {tag .upper (): val for tag , val in read .tags }['NH' ]
27+
2328
2429def get_species (read ):
2530 return references [read .rname ].split ('_' )[0 ]
2631
32+
2733def process_read (read ):
2834 if not read .tags :
2935 print read
@@ -32,13 +38,14 @@ def process_read(read):
3238 nh = get_nh (read )
3339 species = get_species (read )
3440 if nh == 1 :
35- #assigned.write(read)
41+ # assigned.write(read)
3642 specific_files [species ].write (read )
3743 species_counts [species ] += 1
3844 return
3945 else :
4046 resolve_multiread (read , nh , species )
4147
48+
4249def resolve_multiread (read , nh , species ):
4350 nm = get (read , 'NM' )
4451 has_multi_frags = bool (0x1 & read .flag )
@@ -70,12 +77,13 @@ def resolve_multiread(read, nh, species):
7077 on_last_multiread (dbs , read )
7178 else :
7279 pass
73- #print "didi we not have multiple frags?"
74- #print has_multi_frags
75- #print read.is_read1, read.is_read2
76- #print read.qname
77- #assert False
78- # WTF are we doign here?
80+ # print "didi we not have multiple frags?"
81+ # print has_multi_frags
82+ # print read.is_read1, read.is_read2
83+ # print read.qname
84+ # assert False
85+ # WTF are we doign here?
86+
7987
8088def on_last_multiread (dbs , read ):
8189 # Sort out the reads
@@ -84,19 +92,19 @@ def on_last_multiread(dbs, read):
8492 # Report the best, or if equal quality, the first (which
8593 # tophat would've given anyways)
8694 species = get_species (read )
87- #assigned.write(read)
95+ # assigned.write(read)
8896 specific_files [species ].write (read )
8997 species_counts [species ] += 1
9098 else :
9199 # Hits from multiple species
92100 vals = sorted ([(val , spec ) for spec , val in
93- dbs .to_be_resolved_vals [read .qname ].iteritems ()])
101+ dbs .to_be_resolved_vals [read .qname ].iteritems ()])
94102 diff_val = vals [1 ][0 ] - vals [0 ][0 ]
95103 ambig_counts [diff_val ] += 1
96104 if diff_val > ambig_threshold :
97105 species = vals [0 ][1 ]
98106 best_read = dbs .to_be_resolved_reads [read .qname ][species ]
99- #assigned.write(best_read)
107+ # assigned.write(best_read)
100108 specific_files [species ].write (best_read )
101109 species_counts [species ] += 1
102110 else :
@@ -110,7 +118,7 @@ def on_last_multiread(dbs, read):
110118 ambig_names .append (spec )
111119
112120 ambig_names = tuple (ambig_names )
113- ambig_types [ambig_names ]+= 1
121+ ambig_types [ambig_names ] += 1
114122 for amb_read in dbs .to_be_resolved_reads [read .qname ].itervalues ():
115123 ambig .write (amb_read )
116124
@@ -126,31 +134,31 @@ def on_last_multiread(dbs, read):
126134 samfile = pysam .Samfile (fname , 'rb' )
127135 references = samfile .references
128136 dir = path .dirname (fname )
129- #assigned = pysam.Samfile(path.join(dir, 'assigned.bam'), 'wb',
130- # template=samfile)
137+ # assigned = pysam.Samfile(path.join(dir, 'assigned.bam'), 'wb',
138+ # template=samfile)
131139 ambig = pysam .Samfile (path .join (dir , 'ambiguous.bam' ), 'wb' ,
132- template = samfile )
140+ template = samfile )
133141 specific_files = my_defaultdict (pysam .Samfile ,
134142 path .join (dir , 'assigned_%s.bam' ),
135143 {'template' : samfile ,
136144 'mode' : 'wb' })
137145
138146 to_be_resolved_reads = defaultdict (dict )
139- to_be_resolved_vals = defaultdict (lambda : defaultdict (lambda : 1000 ))
147+ to_be_resolved_vals = defaultdict (lambda : defaultdict (lambda : 1000 ))
140148 to_be_resolved_counts = Counter ()
141149 to_be_resolved_reads2 = defaultdict (dict )
142- to_be_resolved_vals2 = defaultdict (lambda : defaultdict (lambda : 1000 ))
150+ to_be_resolved_vals2 = defaultdict (lambda : defaultdict (lambda : 1000 ))
143151 to_be_resolved_counts2 = Counter ()
144152 species_counts = Counter ()
145153 ambig_counts = Counter ()
146154 ambig_types = Counter ()
147155
148156 print "Measuring file size"
149157 start = samfile .tell ()
150- maxval = path .getsize (fname ) * 2 ** 16 # I don't know why it's off by 2^16
158+ maxval = path .getsize (fname ) * 2 ** 16 # I don't know why it's off by 2^16
151159 pbar = ProgressBar (maxval = maxval - start + 2 ** 16 ,
152- widgets = [fname , ': ' , Percentage (), ' ' , Bar (), ' ' ,
153- ETA (), ' ' ])
160+ widgets = [fname , ': ' , Percentage (), ' ' , Bar (), ' ' ,
161+ ETA (), ' ' ])
154162 pbar .start ()
155163
156164 for read in samfile :
@@ -161,5 +169,3 @@ def on_last_multiread(dbs, read):
161169 print "Species assignments in %s: %s" % (fname , species_counts )
162170 print "Ambiguity distribution: " , ambig_counts
163171 print "Ambiguity types: " , ambig_types .most_common (50 )
164-
165-
0 commit comments