Skip to content

Commit 345b909

Browse files
author
Peter Combs
committed
Merge changes from Steven's branch
1 parent 75176e2 commit 345b909

3 files changed

Lines changed: 55 additions & 25 deletions

File tree

MakeSummaryTable.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,13 @@
66
2) -c -- also include confidence intervals
77
88
"""
9+
from __future__ import division
910
import pandas
1011
from os import path
1112
from glob import glob
1213
from sys import argv
14+
from pysam import Samfile
15+
import gzip
1316

1417

1518
def parse_args():
@@ -38,6 +41,9 @@ def parse_args():
3841
action='store_true',
3942
help='When stripping a sample, replace all data with'
4043
' NaN')
44+
parser.add_argument('--strip-low-map-rate', '-m', default=0, type=float,
45+
help='Remove samples with less than X% of reads '
46+
"mapping (off by default)")
4147
parser.add_argument('--mapped-bamfile', '-b', default='assigned_dmelR.bam',
4248
help='The bam file to look in for mapped reads')
4349
parser.add_argument('--in-subdirectory', default=None,
@@ -69,10 +75,15 @@ def parse_args():
6975
args.key = int(args.key)
7076
except ValueError:
7177
pass
78+
if args.strip_low_map_rate:
79+
args.strip_on_unique = True
80+
args.strip_low_reads = max(args.strip_low_reads, 1)
7281
return args
7382

7483

7584
def get_stagenum(name, series, dir):
85+
if not [c for c in name if c.isdigit()]:
86+
return 0
7687
# Slice until the first digit
7788
name_base = name[:[i for i, c in enumerate(name) if c.isdigit()][0]]
7889
dir = {'+': 1, '?': 1, '-': -1}[dir]
@@ -108,6 +119,7 @@ def get_stagenum(name, series, dir):
108119
.replace('//', '/')
109120
.strip('/'))
110121
basedir, dirname = path.split(alldir)
122+
old_dirname = dirname
111123
table = (table.drop_duplicates(args.key)
112124
.dropna(axis=1, how='all')
113125
.dropna(axis=0, how='any'))
@@ -123,26 +135,39 @@ def get_stagenum(name, series, dir):
123135
print dirname, '=', new_dirname
124136
dirname = new_dirname
125137

138+
skip = False
126139
if args.strip_low_reads:
127-
from pysam import Samfile
128140
sf = Samfile(path.join(alldir, args.mapped_bamfile))
129141
if args.strip_on_unique:
130142
reads = 0
131143
for read in sf:
132144
reads += not read.is_secondary
133-
if reads > args.strip_low_reads:
145+
if reads > args.strip_low_reads and not args.strip_low_map_rate:
134146
break
135147
skip = reads < args.strip_low_reads
136148
else:
137149
skip = sf.mapped < args.strip_low_reads
138-
if skip:
139-
if args.strip_as_nan:
140-
from numpy import nan
141-
print "NaNing", dirname
142-
table.ix[:] = nan
143-
else:
144-
print "Skipping", dirname
145-
continue
150+
if args.strip_low_map_rate and args.has_params and not skip:
151+
rfs = sorted(glob(path.join('sequence',
152+
'*{}*'.format(params.ix[old_dirname]['Index']),
153+
'*_R1_*.fastq.gz'))
154+
)
155+
total_reads = 4e6 * (len(rfs) - 1)
156+
for i, line in enumerate(gzip.open(rfs[-1])):
157+
pass
158+
total_reads += i//4
159+
skip += (reads / total_reads) < (args.strip_low_map_rate / 100)
160+
print(reads, total_reads, reads/total_reads,
161+
args.strip_low_map_rate / 100)
162+
163+
if skip:
164+
if args.strip_as_nan:
165+
from numpy import nan
166+
print "NaNing", dirname
167+
table.ix[:] = nan
168+
else:
169+
print "Skipping", dirname
170+
continue
146171
if df is None:
147172
df = pandas.DataFrame({dirname+"_FPKM": table.ix[:, args.column]})
148173
else:

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,11 @@ $(ANALYSIS_DIR)/summary.tsv : MakeSummaryTable.py $(FPKMS) $(RUNCONFIG) Makefile
5959
@echo '============================='
6060
python MakeSummaryTable.py \
6161
--params $(RUNCONFIG) \
62-
--strip-low-reads 100000 \
62+
--strip-low-reads 1000000 \
6363
--strip-on-unique \
6464
--strip-as-nan \
6565
--mapped-bamfile assigned_dmelR.bam \
66+
--strip-low-map-rate 85 \
6667
$(ANALYSIS_DIR)
6768

6869
%/genes.fpkm_tracking : %/assigned_dmelR.bam $(MELGTF) $(MELFASTA2)

configure

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import pandas as pd
44
from os import path
55
from glob import glob
66
from collections import defaultdict
7-
from sys import argv
87

98
tophat_str = ('$(ANALYSIS_DIR)/{label}/accepted_hits.bam: '
109
' {rfs} '
@@ -17,6 +16,9 @@ tophat_str = ('$(ANALYSIS_DIR)/{label}/accepted_hits.bam: '
1716
'--output-dir $(ANALYSIS_DIR)/{label}/ '
1817
'--transcriptome-index Reference/{genome}/transcriptome '
1918
'--transcriptome-only '
19+
# No need to sort automatically, for compatibility with
20+
# the STAR-based pipeline
21+
'--no-sort-bam '
2022
'--b2-sensitive '
2123
'--num-threads 12 '
2224
'Reference/{genome} '
@@ -36,6 +38,11 @@ star_str = ('$(ANALYSIS_DIR)/{label}/accepted_hits.bam: '
3638

3739
mappers = {'star': star_str, 'tophat' : tophat_str}
3840

41+
glob_specs = ['{seqdir}/*/*{label}*/*_{{read}}_*.fastq*',
42+
'{seqdir}/*index{index}/*_{{read}}*.fastq*',
43+
]
44+
45+
3946
def parse_arguments():
4047
from argparse import ArgumentParser
4148
p = ArgumentParser(description='Configuration script for SliceSeq data processing')
@@ -76,26 +83,23 @@ targets_all = ' '.join(path.join('$(ANALYSIS_DIR)',
7683
'genes.fpkm_tracking')
7784
for label in config_file['Label'])
7885

79-
out.write("FPKMS = {} {} \n".format(targets, targets_all))
86+
out.write("FPKMS = {} \n".format(targets))
8087

8188
reads = defaultdict(lambda : ([], []))
82-
carriers = defaultdict(list)
83-
carrier_species = {}
89+
sample_species = {}
8490
for i, row in config_file.iterrows():
8591
label = row['Label']
8692
index = row['Index']
8793
mbepc = int(row['MBEPC'])
88-
carrier = row['CarrierID']
89-
species = row['CarrierSpecies']
90-
glob_spec = ('{seqdir}/*/*{label}*/*_{{read}}_*.fastq*'
91-
.format(seqdir=args.seqdir, label=label, index=index, id=mbepc))
92-
rf1 = glob(glob_spec.format(read='R1'))
93-
if rf1 == []:
94-
glob_spec = ('{seqdir}/*{id}*{index}*/*_{{read}}*.fastq*'
95-
.format(seqdir=args.seqdir, label=label, index=index,
96-
id=mbepc))
94+
species = row['SampleSpecies']
95+
for glob_spec in glob_specs:
96+
glob_spec = (glob_spec
97+
.format(seqdir=args.seqdir, label=label, index=index, id=mbepc))
9798
rf1 = glob(glob_spec.format(read='R1'))
98-
if rf1 == []:
99+
if rf1 != []:
100+
out.write("GLOBSPEC = {}\n".format(glob_spec))
101+
break
102+
else:
99103
print "Warning: no sequence for ", label, index
100104
print glob_spec.format(read='R1')
101105
rf2 = glob(glob_spec.format(read='R2'))

0 commit comments

Comments
 (0)