Skip to content

Commit 75176e2

Browse files
author
Peter Combs
committed
PEP8-ify many files
Lots of whitespace errors, line length issues, etc.
1 parent a99a215 commit 75176e2

13 files changed

Lines changed: 193 additions & 168 deletions

AssignReads2.py

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,31 @@
55
from progressbar import ProgressBar, ETA, Bar, Percentage
66
from argparse import Namespace
77

8+
89
class my_defaultdict(dict):
    """Dictionary that lazily builds the value for a missing key.

    Unlike ``collections.defaultdict``, the factory is given arguments: the
    first time ``key`` is looked up it is called as
    ``default_factory(basename % (key,), **other_args)`` and the result is
    stored under ``key`` so later lookups reuse it.
    """

    def __init__(self, default_factory, basename, other_args):
        """Remember the factory and the pieces used to call it.

        default_factory -- callable invoked once for each missing key
        basename        -- %-format string with one placeholder for the key
        other_args      -- dict of keyword arguments passed to the factory
        """
        super(my_defaultdict, self).__init__()
        self.default_factory = default_factory
        self.basename = basename
        self.other_args = other_args

    def __missing__(self, key):
        """Create, store and return the value for a key seen the first time."""
        # Wrap the key in a 1-tuple so that a tuple-valued key is formatted
        # as one value instead of being unpacked into several % arguments.
        name = self.basename % (key,)
        value = self.default_factory(name, **self.other_args)
        self[key] = value
        return value
1719

20+
1821
def get(read, tag):
    """Return the value of the optional field ``tag`` on ``read``.

    ``read.tags`` is iterated as (name, value) pairs. Names are compared
    case-insensitively, and if a name occurs more than once the last value
    wins. Raises KeyError when no pair carries the requested name.
    """
    by_name = {tname.upper(): val for tname, val in read.tags}
    return by_name[tag.upper()]


def get_nh(read):
    """Return the value of the 'NH' tag of ``read`` (KeyError if absent)."""
    # Delegate to get() instead of repeating the same lookup code.
    return get(read, 'NH')
27+
2328

2429
def get_species(read):
    """Return the species prefix of the reference ``read`` refers to.

    The reference name is looked up in the module-level ``references``
    sequence using ``read.rname`` as the index; the result is the part of
    that name before the first underscore (the whole name if it has none).
    """
    ref_name = references[read.rname]
    # Only the leading field is needed, so split at most once.
    return ref_name.split('_', 1)[0]
2631

32+
2733
def process_read(read):
2834
if not read.tags:
2935
print read
@@ -32,13 +38,14 @@ def process_read(read):
3238
nh = get_nh(read)
3339
species = get_species(read)
3440
if nh == 1:
35-
#assigned.write(read)
41+
# assigned.write(read)
3642
specific_files[species].write(read)
3743
species_counts[species] += 1
3844
return
3945
else:
4046
resolve_multiread(read, nh, species)
4147

48+
4249
def resolve_multiread(read, nh, species):
4350
nm = get(read, 'NM')
4451
has_multi_frags = bool(0x1 & read.flag)
@@ -70,12 +77,13 @@ def resolve_multiread(read, nh, species):
7077
on_last_multiread(dbs, read)
7178
else:
7279
pass
73-
#print "didi we not have multiple frags?"
74-
#print has_multi_frags
75-
#print read.is_read1, read.is_read2
76-
#print read.qname
77-
#assert False
78-
# WTF are we doign here?
80+
# print "didi we not have multiple frags?"
81+
# print has_multi_frags
82+
# print read.is_read1, read.is_read2
83+
# print read.qname
84+
# assert False
85+
# WTF are we doign here?
86+
7987

8088
def on_last_multiread(dbs, read):
8189
# Sort out the reads
@@ -84,19 +92,19 @@ def on_last_multiread(dbs, read):
8492
# Report the best, or if equal quality, the first (which
8593
# tophat would've given anyways)
8694
species = get_species(read)
87-
#assigned.write(read)
95+
# assigned.write(read)
8896
specific_files[species].write(read)
8997
species_counts[species] += 1
9098
else:
9199
# Hits from multiple species
92100
vals = sorted([(val, spec) for spec, val in
93-
dbs.to_be_resolved_vals[read.qname].iteritems()])
101+
dbs.to_be_resolved_vals[read.qname].iteritems()])
94102
diff_val = vals[1][0] - vals[0][0]
95103
ambig_counts[diff_val] += 1
96104
if diff_val > ambig_threshold:
97105
species = vals[0][1]
98106
best_read = dbs.to_be_resolved_reads[read.qname][species]
99-
#assigned.write(best_read)
107+
# assigned.write(best_read)
100108
specific_files[species].write(best_read)
101109
species_counts[species] += 1
102110
else:
@@ -110,7 +118,7 @@ def on_last_multiread(dbs, read):
110118
ambig_names.append(spec)
111119

112120
ambig_names = tuple(ambig_names)
113-
ambig_types[ambig_names]+=1
121+
ambig_types[ambig_names] += 1
114122
for amb_read in dbs.to_be_resolved_reads[read.qname].itervalues():
115123
ambig.write(amb_read)
116124

@@ -126,31 +134,31 @@ def on_last_multiread(dbs, read):
126134
samfile = pysam.Samfile(fname, 'rb')
127135
references = samfile.references
128136
dir = path.dirname(fname)
129-
#assigned = pysam.Samfile(path.join(dir, 'assigned.bam'), 'wb',
130-
#template=samfile)
137+
# assigned = pysam.Samfile(path.join(dir, 'assigned.bam'), 'wb',
138+
# template=samfile)
131139
ambig = pysam.Samfile(path.join(dir, 'ambiguous.bam'), 'wb',
132-
template=samfile)
140+
template=samfile)
133141
specific_files = my_defaultdict(pysam.Samfile,
134142
path.join(dir, 'assigned_%s.bam'),
135143
{'template': samfile,
136144
'mode': 'wb'})
137145

138146
to_be_resolved_reads = defaultdict(dict)
139-
to_be_resolved_vals = defaultdict(lambda : defaultdict(lambda : 1000))
147+
to_be_resolved_vals = defaultdict(lambda: defaultdict(lambda: 1000))
140148
to_be_resolved_counts = Counter()
141149
to_be_resolved_reads2 = defaultdict(dict)
142-
to_be_resolved_vals2 = defaultdict(lambda : defaultdict(lambda : 1000))
150+
to_be_resolved_vals2 = defaultdict(lambda: defaultdict(lambda: 1000))
143151
to_be_resolved_counts2 = Counter()
144152
species_counts = Counter()
145153
ambig_counts = Counter()
146154
ambig_types = Counter()
147155

148156
print "Measuring file size"
149157
start = samfile.tell()
150-
maxval = path.getsize(fname) * 2**16 # I don't know why it's off by 2^16
158+
maxval = path.getsize(fname) * 2**16 # I don't know why it's off by 2^16
151159
pbar = ProgressBar(maxval=maxval - start + 2**16,
152-
widgets = [fname, ': ', Percentage(), ' ', Bar(), ' ',
153-
ETA(), ' '])
160+
widgets=[fname, ': ', Percentage(), ' ', Bar(), ' ',
161+
ETA(), ' '])
154162
pbar.start()
155163

156164
for read in samfile:
@@ -161,5 +169,3 @@ def on_last_multiread(dbs, read):
161169
print "Species assignments in %s: %s" % (fname, species_counts)
162170
print "Ambiguity distribution: ", ambig_counts
163171
print "Ambiguity types: ", ambig_types.most_common(50)
164-
165-

CalculateAbundances.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,36 +19,37 @@
1919
cuffcmp = ('cuffcompare -o {cuffname} -s {fasta} -CG -r {gtf} {gtf}')
2020
cuffname = path.join(path.dirname(gtf_ref), 'cuffcmp')
2121

22-
runstr = cuffcmp.format(cuffname = cuffname,
23-
fasta = fasta_ref,
24-
gtf = gtf_ref)
22+
runstr = cuffcmp.format(cuffname=cuffname,
23+
fasta=fasta_ref,
24+
gtf=gtf_ref)
2525
print runstr
2626
stdout.flush()
2727
call(runstr.split())
2828

29-
gtf_ref = cuffname + '.combined.gtf'
29+
gtf_ref = cuffname + '.combined.gtf'
3030

3131
design_file = pd.read_table(design_fname)
3232
files = defaultdict(list)
3333
for ix, row in design_file.iterrows():
3434
files[row['condition']].append(path.join(analysis_dir,
3535
row['Sample'],
36-
bamfile_base)
37-
)
36+
bamfile_base))
3837
conditions = sorted(files.keys())
3938

40-
cd = cd_base.format(conditions = ','.join(conditions),
41-
outdir = analysis_dir,
42-
fasta = fasta_ref,
43-
gtf = gtf_ref,
44-
bams = ' '.join([','.join(files[key]) for key in conditions]))
39+
cd = cd_base.format(conditions=','.join(conditions),
40+
outdir=analysis_dir,
41+
fasta=fasta_ref,
42+
gtf=gtf_ref,
43+
bams=' '.join([','.join(files[key])
44+
for key in conditions]))
4545

4646
conds_nobcd = [c for c in conditions if 'Bcd' not in c]
47-
cd_nobcd = cd_base.format(conditions = ','.join(conds_nobcd),
48-
outdir = analysis_dir+'-nobcd',
49-
fasta = fasta_ref,
50-
gtf = gtf_ref,
51-
bams = ' '.join([','.join(files[key]) for key in conds_nobcd]))
47+
cd_nobcd = cd_base.format(conditions=','.join(conds_nobcd),
48+
outdir=analysis_dir+'-nobcd',
49+
fasta=fasta_ref,
50+
gtf=gtf_ref,
51+
bams=' '.join([','.join(files[key])
52+
for key in conds_nobcd]))
5253
print cd_nobcd
5354
stdout.flush()
5455
call(cd_nobcd.split())
@@ -57,10 +58,10 @@
5758
call(cd.split())
5859
for condition in files:
5960
for file in files[condition]:
60-
cl = cl_base.format(outdir = path.dirname(file),
61-
fasta = fasta_ref,
62-
gtf = gtf_ref,
63-
bamfile = file)
61+
cl = cl_base.format(outdir=path.dirname(file),
62+
fasta=fasta_ref,
63+
gtf=gtf_ref,
64+
bamfile=file)
6465
print '-'*30
6566
print cl
6667
call(cl.split())

CalculateConcentrations.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
from __future__ import division
22
import pandas
33

4+
45
def main():
    """Print the estimated RNA amount for each row of 'CountConfig.tab'.

    The table is read from the current directory with column 5 as the index.
    Rows containing any missing value are dropped. For every remaining row
    the value ``carrier_conc * mel_reads / carrier_reads`` is printed next to
    the row label. Returns the filtered DataFrame.
    """
    df = pandas.read_table('CountConfig.tab', index_col=5)
    # dropna() returns a new frame rather than modifying df, so the result
    # must be rebound or rows with missing values are silently kept.
    df = df.dropna(how='any')

    for rowname, row in df.iterrows():
        mel_conc = (row['carrier_conc'] * row['mel_reads']
                    / row['carrier_reads'])
        # Single-argument print call: same output under Python 2 and 3.
        print("%s %s ng total RNA" % (rowname, mel_conc))
    return df

CheckCoverage.py

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
print """Usage: python CheckCoverage.py <GTF-File> BAMfile [BAMfile ...]"""
1515
sys.exit(1)
1616

17-
gtf_fname = sys.argv[1] #'Reference/dmel-all-r5.42.gtf'
17+
gtf_fname = sys.argv[1] # 'Reference/dmel-all-r5.42.gtf'
1818
analysis_dir = 'analysis'
1919

2020
starts = set()
@@ -23,11 +23,12 @@
2323

2424
cutoff = 0
2525

26+
2627
def analyze_bamfile(bam_fname):
2728
print bam_fname,
2829
bam_file = pysam.Samfile(bam_fname, 'rb')
2930

30-
coverages = defaultdict(lambda : [0,0, set()])
31+
coverages = defaultdict(lambda: [0, 0, set()])
3132
parent = ''
3233
curr_len = 0
3334
coverage = 0
@@ -36,28 +37,30 @@ def analyze_bamfile(bam_fname):
3637
f = open(gtf_fname)
3738
for line in f:
3839
pass
39-
pb = pbar.ProgressBar(widgets=[bam_fname, pbar.Bar(), pbar.ETA()],maxval=f.tell()).start()
40+
pb = pbar.ProgressBar(widgets=[bam_fname, pbar.Bar(), pbar.ETA()],
41+
maxval=f.tell()).start()
4042
f.seek(0)
4143
for line in f:
4244
pb.update(f.tell())
43-
if line.startswith('#'): continue
44-
if line.startswith('>'): break
45+
if line.startswith('#'):
46+
continue
47+
if line.startswith('>'):
48+
break
4549
data = line.split()
4650
chrom = data[0]
4751
kind = data[2]
4852
start = int(data[3]) - 1
4953
stop = int(data[4])
5054
fbtr_finder = re.compile('FBtr[0-9]*')
51-
#parent = fbtr_finder.findall(line)[0]
52-
53-
55+
# parent = fbtr_finder.findall(line)[0]
5456

5557
if kind == 'exon':
5658
fbtrs = fbtr_finder.findall(line)
57-
if not fbtrs: continue
59+
if not fbtrs:
60+
continue
5861
fbtr = fbtrs[0]
59-
coverages[fbtr][1] += (stop - start )
60-
62+
coverages[fbtr][1] += (stop - start)
63+
6164
starts = set()
6265
coverage = 0
6366
for read in bam_file.fetch(chrom, start, stop):
@@ -66,34 +69,34 @@ def analyze_bamfile(bam_fname):
6669
coverages[fbtr][0] += coverage
6770
coverages[fbtr][2].update(starts)
6871

69-
7072
pb.finish()
7173
curr_lens, rpks, uniques = zip(*coverages.itervalues())
7274
dir, fname = path.split(bam_fname)
7375

74-
7576
return dir, rpks, uniques, curr_lens
7677

7778
if __name__ == "__main__":
7879
import multiprocessing as mp
7980

8081
POOL = mp.Pool(20)
81-
res = POOL.map(analyze_bamfile, [f for f in sys.argv[2:] if f.endswith('.bam')])
82+
res = POOL.map(analyze_bamfile,
83+
[f for f in sys.argv[2:] if f.endswith('.bam')])
8284
all_dirs, all_rpks, all_pct_uniques, all_lens = zip(*res)
8385

8486
import cPickle as pickle
8587
out_fh = open('checkcoverage.pkl', 'w')
86-
pickle.dump({'dirs':all_dirs, 'rpks':all_rpks, 'pct_uniques':all_pct_uniques,
87-
'lens': all_lens}, out_fh)
88+
pickle.dump({'dirs': all_dirs, 'rpks': all_rpks,
89+
'pct_uniques': all_pct_uniques, 'lens': all_lens},
90+
out_fh)
8891
for fname, rpks, uniques, curr_lens in res:
8992
print fname
9093
try:
9194
xs = array(rpks)
92-
ys = array([len(u)/(curr_len + 1)
95+
ys = array([len(u)/(curr_len + 1)
9396
for u, curr_len in zip(uniques, curr_lens)])
94-
cutoff = max(xs[ys<.1])
95-
reg = stats.linregress(log(xs[(xs < cutoff) * (xs > 0) * (ys > 0)]),
96-
log(ys[(xs < cutoff) * (xs > 0) * (ys > 0)]))
97+
cutoff = max(xs[ys < .1])
98+
reg = stats.linregress(log(xs[(xs < cutoff)*(xs > 0)*(ys > 0)]),
99+
log(ys[(xs < cutoff)*(xs > 0)*(ys > 0)]))
97100
print "exp(%f) * x ** %f" % (reg[1], reg[0])
98101
print "Duplicate badness score: ", exp(-reg[1]-.38)
99102
except Exception as exc:

CountAllReads.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,3 @@
1818
print dirname, '\t{:15,}'.format(int(n)), ('*' if n % 1 else '')*10
1919
except:
2020
print dirname, "ERR!"
21-

GetSpeciesFromBlast.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@
2020

2121
blast_recs5 = [r for r in NCBIXML.parse(open('5.blastout.xml'))]
2222
blast_recs6 = [r for r in NCBIXML.parse(open('6.blastout.xml'))]
23-
c5 = cs.Counter([tuple(r.alignments[0].hit_def.split()[:2]) for r in blast_recs5])
24-
c6 = cs.Counter([tuple(r.alignments[0].hit_def.split()[:2]) for r in blast_recs6])
23+
c5 = cs.Counter([tuple(r.alignments[0].hit_def.split()[:2])
24+
for r in blast_recs5])
25+
c6 = cs.Counter([tuple(r.alignments[0].hit_def.split()[:2])
26+
for r in blast_recs6])
2527

2628

2729
print(c5)

0 commit comments

Comments
 (0)