Merge changes from Steven's branch

Peter Combs · Peter Combs · commit 345b909c00f3 · 2014-11-10T14:05:06.000-08:00
diff --git a/MakeSummaryTable.py b/MakeSummaryTable.py
@@ -6,10 +6,13 @@
     2) -c -- also include confidence intervals
 
 """
+from __future__ import division
 import pandas
 from os import path
 from glob import glob
 from sys import argv
+from pysam import Samfile
+import gzip
 
 
 def parse_args():
@@ -38,6 +41,9 @@ def parse_args():
                         action='store_true',
                         help='When stripping a sample, replace all data with'
                         ' NaN')
+    parser.add_argument('--strip-low-map-rate', '-m', default=0, type=float,
+                        help='Remove samples with less than X%% of reads '
+                        "mapping (off by default)")  # %% -> literal '%'
     parser.add_argument('--mapped-bamfile', '-b', default='assigned_dmelR.bam',
                         help='The bam file to look in for mapped reads')
     parser.add_argument('--in-subdirectory', default=None,
@@ -69,10 +75,15 @@ def parse_args():
         args.key = int(args.key)
     except ValueError:
         pass
+    if args.strip_low_map_rate:
+        args.strip_on_unique = True
+        args.strip_low_reads = max(args.strip_low_reads, 1)
     return args
 
 
 def get_stagenum(name, series, dir):
+    if not [c for c in name if c.isdigit()]:
+        return 0
     # Slice until the first digit
     name_base = name[:[i for i, c in enumerate(name) if c.isdigit()][0]]
     dir = {'+': 1, '?': 1, '-': -1}[dir]
@@ -108,6 +119,7 @@ def get_stagenum(name, series, dir):
                   .replace('//', '/')
                   .strip('/'))
     basedir, dirname = path.split(alldir)
+    old_dirname = dirname
     table = (table.drop_duplicates(args.key)
              .dropna(axis=1, how='all')
              .dropna(axis=0, how='any'))
@@ -123,26 +135,39 @@ def get_stagenum(name, series, dir):
         print dirname, '=', new_dirname
         dirname = new_dirname
 
+    skip = False
     if args.strip_low_reads:
-        from pysam import Samfile
         sf = Samfile(path.join(alldir, args.mapped_bamfile))
         if args.strip_on_unique:
             reads = 0
             for read in sf:
                 reads += not read.is_secondary
-                if reads > args.strip_low_reads:
+                if reads > args.strip_low_reads and not args.strip_low_map_rate:
                     break
             skip = reads < args.strip_low_reads
         else:
             skip = sf.mapped < args.strip_low_reads
-        if skip:
-            if args.strip_as_nan:
-                from numpy import nan
-                print "NaNing", dirname
-                table.ix[:] = nan
-            else:
-                print "Skipping", dirname
-                continue
+    if args.strip_low_map_rate and args.has_params and not skip:
+        rfs = sorted(glob(path.join('sequence',
+                                    '*{}*'.format(params.ix[old_dirname]['Index']),
+                                    '*_R1_*.fastq.gz'))
+                    )
+        # Every file but the last is counted as 4e6 reads; count the last one.
+        total_reads = 4e6 * (len(rfs) - 1)
+        with gzip.open(rfs[-1]) as last_fastq:
+            total_reads += sum(1 for _ in last_fastq) // 4  # 4 lines per read
+        skip = (reads / total_reads) < (args.strip_low_map_rate / 100)
+        print(reads, total_reads, reads/total_reads,
+              args.strip_low_map_rate / 100)
+
+    if skip:
+        if args.strip_as_nan:
+            from numpy import nan
+            print "NaNing", dirname
+            table.ix[:] = nan
+        else:
+            print "Skipping", dirname
+            continue
     if df is None:
         df = pandas.DataFrame({dirname+"_FPKM": table.ix[:, args.column]})
     else:
diff --git a/Makefile b/Makefile
@@ -59,10 +59,11 @@ $(ANALYSIS_DIR)/summary.tsv : MakeSummaryTable.py $(FPKMS) $(RUNCONFIG) Makefile
 	@echo '============================='
 	python MakeSummaryTable.py \
        --params $(RUNCONFIG) \
-	   --strip-low-reads 100000 \
+	   --strip-low-reads 1000000 \
 	   --strip-on-unique \
 	   --strip-as-nan \
 	   --mapped-bamfile assigned_dmelR.bam \
+	   --strip-low-map-rate 85 \
 		$(ANALYSIS_DIR)
 
 %/genes.fpkm_tracking : %/assigned_dmelR.bam $(MELGTF) $(MELFASTA2)
diff --git a/configure b/configure
@@ -4,7 +4,6 @@ import pandas as pd
 from os import path
 from glob import glob
 from collections import defaultdict
-from sys import argv
 
 tophat_str = ('$(ANALYSIS_DIR)/{label}/accepted_hits.bam: '
               ' {rfs} '
@@ -17,6 +16,9 @@ tophat_str = ('$(ANALYSIS_DIR)/{label}/accepted_hits.bam: '
               '--output-dir $(ANALYSIS_DIR)/{label}/ '
               '--transcriptome-index Reference/{genome}/transcriptome '
               '--transcriptome-only '
+              # No need to sort automatically, for compatibility with
+              # the STAR-based pipeline
+              '--no-sort-bam '
               '--b2-sensitive '
               '--num-threads 12 '
               'Reference/{genome} '
@@ -36,6 +38,11 @@ star_str = ('$(ANALYSIS_DIR)/{label}/accepted_hits.bam: '
 
 mappers = {'star': star_str, 'tophat' : tophat_str}
 
+glob_specs = ['{seqdir}/*/*{label}*/*_{{read}}_*.fastq*',
+              '{seqdir}/*index{index}/*_{{read}}*.fastq*',
+             ]
+
+
 def parse_arguments():
     from argparse import ArgumentParser
     p = ArgumentParser(description='Configuration script for SliceSeq data processing')
@@ -76,26 +83,23 @@ targets_all = ' '.join(path.join('$(ANALYSIS_DIR)',
                              'genes.fpkm_tracking')
                    for label in config_file['Label'])
 
-out.write("FPKMS = {} {} \n".format(targets, targets_all))
+out.write("FPKMS = {}  \n".format(targets))
 
 reads = defaultdict(lambda : ([], []))
-carriers = defaultdict(list)
-carrier_species = {}
+sample_species = {}
 for i, row in config_file.iterrows():
     label = row['Label']
     index = row['Index']
     mbepc = int(row['MBEPC'])
-    carrier = row['CarrierID']
-    species = row['CarrierSpecies']
-    glob_spec = ('{seqdir}/*/*{label}*/*_{{read}}_*.fastq*'
-               .format(seqdir=args.seqdir, label=label, index=index, id=mbepc))
-    rf1 = glob(glob_spec.format(read='R1'))
-    if rf1 == []:
-        glob_spec = ('{seqdir}/*{id}*{index}*/*_{{read}}*.fastq*'
-                    .format(seqdir=args.seqdir, label=label, index=index,
-                            id=mbepc))
+    species = row['SampleSpecies']
+    for glob_spec in glob_specs:
+        glob_spec = (glob_spec
+                     .format(seqdir=args.seqdir, label=label, index=index, id=mbepc))
         rf1 = glob(glob_spec.format(read='R1'))
-        if rf1 == []:
+        if rf1 != []:
+            out.write("GLOBSPEC = {}\n".format(glob_spec))
+            break
+    else:
             print "Warning: no sequence for ", label, index
             print glob_spec.format(read='R1')
     rf2 = glob(glob_spec.format(read='R2'))