66 2) -c -- also include confidence intervals
77
88"""
9+ from __future__ import division
910import pandas
1011from os import path
1112from glob import glob
1213from sys import argv
14+ from pysam import Samfile
15+ import gzip
1316
1417
1518def parse_args ():
@@ -38,6 +41,9 @@ def parse_args():
3841 action = 'store_true' ,
3942 help = 'When stripping a sample, replace all data with'
4043 ' NaN' )
44+ parser .add_argument ('--strip-low-map-rate' , '-m' , default = 0 , type = float ,
45+ help = 'Remove samples with less than X% of reads '
46+ "mapping (off by default)" )
4147 parser .add_argument ('--mapped-bamfile' , '-b' , default = 'assigned_dmelR.bam' ,
4248 help = 'The bam file to look in for mapped reads' )
4349 parser .add_argument ('--in-subdirectory' , default = None ,
@@ -69,10 +75,15 @@ def parse_args():
6975 args .key = int (args .key )
7076 except ValueError :
7177 pass
78+ if args .strip_low_map_rate :
79+ args .strip_on_unique = True
80+ args .strip_low_reads = max (args .strip_low_reads , 1 )
7281 return args
7382
7483
7584def get_stagenum (name , series , dir ):
85+ if not [c for c in name if c .isdigit ()]:
86+ return 0
7687 # Slice until the first digit
7788 name_base = name [:[i for i , c in enumerate (name ) if c .isdigit ()][0 ]]
7889 dir = {'+' : 1 , '?' : 1 , '-' : - 1 }[dir ]
@@ -108,6 +119,7 @@ def get_stagenum(name, series, dir):
108119 .replace ('//' , '/' )
109120 .strip ('/' ))
110121 basedir , dirname = path .split (alldir )
122+ old_dirname = dirname
111123 table = (table .drop_duplicates (args .key )
112124 .dropna (axis = 1 , how = 'all' )
113125 .dropna (axis = 0 , how = 'any' ))
@@ -123,26 +135,39 @@ def get_stagenum(name, series, dir):
123135 print dirname , '=' , new_dirname
124136 dirname = new_dirname
125137
138+ skip = False
126139 if args .strip_low_reads :
127- from pysam import Samfile
128140 sf = Samfile (path .join (alldir , args .mapped_bamfile ))
129141 if args .strip_on_unique :
130142 reads = 0
131143 for read in sf :
132144 reads += not read .is_secondary
133- if reads > args .strip_low_reads :
145+ if reads > args .strip_low_reads and not args . strip_low_map_rate :
134146 break
135147 skip = reads < args .strip_low_reads
136148 else :
137149 skip = sf .mapped < args .strip_low_reads
138- if skip :
139- if args .strip_as_nan :
140- from numpy import nan
141- print "NaNing" , dirname
142- table .ix [:] = nan
143- else :
144- print "Skipping" , dirname
145- continue
if args.strip_low_map_rate and args.has_params and not skip:
    # Mapping-rate filter: compare the number of mapped reads counted above
    # (`reads`) against an estimate of how many reads were sequenced for
    # this sample, and mark the sample for skipping when the percentage
    # falls below --strip-low-map-rate.

    # Gzipped R1 FASTQ files under 'sequence/' whose directory name
    # contains this sample's 'Index' value from the params table.
    rfs = sorted(glob(path.join('sequence',
                                '*{}*'.format(params.ix[old_dirname]['Index']),
                                '*_R1_*.fastq.gz')))

    # Every file except the last is counted as 4e6 reads without being
    # opened.
    # NOTE(review): presumably the sequencer writes fixed 4M-read chunks,
    # so only the final chunk is partial -- confirm against the pipeline.
    total_reads = 4e6 * (len(rfs) - 1)

    # Count the records in the last file only.  A FASTQ record is four
    # lines, so the record count is (number of lines) // 4.  Counting lines
    # directly (instead of reusing the last zero-based enumerate index)
    # avoids undercounting by one record and avoids reading an undefined or
    # stale index when the file is empty.  The `with` block closes the
    # gzip handle.
    # NOTE: rfs[-1] raises IndexError if the glob matched no files.
    with gzip.open(rfs[-1]) as last_fastq:
        n_lines = sum(1 for _ in last_fastq)
    total_reads += n_lines // 4

    map_rate = reads / total_reads
    min_rate = args.strip_low_map_rate / 100
    # `skip` is known to be false here (see the guard above), so a plain
    # assignment keeps it a real boolean.
    skip = map_rate < min_rate
    print (reads, total_reads, map_rate, min_rate)
162+
163+ if skip :
164+ if args .strip_as_nan :
165+ from numpy import nan
166+ print "NaNing" , dirname
167+ table .ix [:] = nan
168+ else :
169+ print "Skipping" , dirname
170+ continue
146171 if df is None :
147172 df = pandas .DataFrame ({dirname + "_FPKM" : table .ix [:, args .column ]})
148173 else :
0 commit comments