|
18 | 18 | # |
19 | 19 |
|
20 | 20 | #LICENSE |
21 | | -# LINKS, RAILS and Cobbler Copyright (c) 2014-2016 Canada's Michael Smith Genome Science Centre. All rights reserved. |
| 21 | +# LINKS, RAILS and Cobbler Copyright (c) 2014-2017 Canada's Michael Smith Genome Science Centre. All rights reserved. |
22 | 22 |
|
23 | 23 | use strict; |
24 | 24 | use Getopt::Std; |
25 | 25 | use Net::SMTP; |
26 | 26 | use vars qw($opt_f $opt_s $opt_d $opt_i $opt_v $opt_b $opt_t $opt_q); |
27 | 27 | getopts('f:s:d:v:b:t:i:q:'); |
28 | | -my ($base_name,$frag_dist,$seqid,$verbose)=("",1000,0.9,0); |
| 28 | +my ($base_name,$frag_dist,$seqid,$verbose)=("",250,0.9,0); |
29 | 29 |
|
30 | | -my $version = "[v0.2]"; |
| 30 | +my $version = "[v0.3]"; |
31 | 31 | my $dev = "rwarren\@bcgsc.ca"; |
| 32 | +my $SAMPATH = "/gsc/btl/linuxbrew/bin/samtools"; |
32 | 33 |
|
33 | 34 | #------------------------------------------------- |
34 | 35 |
|
35 | 36 | if(! $opt_f || ! $opt_s || ! $opt_q){ |
36 | 37 | print "Usage: $0 $version\n"; |
37 | | - print "-f Assembled Sequences to further scaffold (Multi-Fasta format, required)\n"; |
38 | | - print "-q Long Sequences queried (Multi-Fasta format, required)\n"; |
39 | | - print "-s SAM file\n"; |
| 38 | + print "-f Assembled Sequences to further scaffold (Multi-FASTA format NO LINE BREAKS, required)\n"; |
| 39 | + print "-q Long Sequences queried (Multi-FASTA format NO LINE BREAKS, required)\n"; |
| 40 | + print "-s BAM file (use v0.2 for reading SAM files)\n"; |
40 | 41 | print "-d Anchoring bases on contig edges (ie. minimum required alignment size on contigs, default -d $frag_dist, optional)\n"; |
41 | 42 | print "-i Minimum sequence identity, default -i $seqid, optional\n"; |
42 | 43 | print "-t LIST of names/header, long sequences to avoid using for merging/gap-filling scaffolds (optional)\n"; |
|
67 | 68 | ### Naming output files |
68 | 69 | if ($base_name eq ""){ |
69 | 70 |
|
70 | | - $base_name = $file . ".scaff_s-" . $longfile . "_d" . $frag_dist . "_i" . $seqid . "_t" . $listfile; |
| 71 | + $base_name = $file . ".scaff_s-" . $longfile . "_q-" . $queryfile . "_d" . $frag_dist . "_i" . $seqid . "_t" . $listfile; |
71 | 72 |
|
72 | 73 | my $pid_num = getpgrp(0); |
73 | 74 | $base_name .= "_pid" . $pid_num; |
|
113 | 114 | print $patchmsg; |
114 | 115 | print LOG $patchmsg; |
115 | 116 | $assemblyruninfo.=$patchmsg; |
116 | | -my $gsl = &patchGaps($file,$tigpair,$newassemblyfile,$tsvfile); |
| 117 | +my ($gsl,$totalgap) = &patchGaps($file,$tigpair,$newassemblyfile,$tsvfile); |
117 | 118 |
|
118 | 119 | my $date = `date`; |
119 | 120 | chomp($date); |
120 | 121 | my ($avg,$sum,$max,$min) = &average($gsl); |
121 | 122 | my $sd = &stdev($gsl); |
122 | | -my $final_message = "done: $date\n\n--------------- $0 Summary ---------------\nNumber of gaps patched : %i\nAverage length (bp) : %.2f\nLength st.dev +/- : %.2f\nTotal bases added : %i\nLargest gap resolved (bp) : %i\nShortest gap resolved (bp) : %i\n---------------------------------------------\n"; |
| 123 | +my $final_message = "done: $date\n\n--------------- $0 Summary ---------------\nNumber of gaps patched : %i out of %i (%.2f %%)\nAverage length (bp) : %.2f\nLength st.dev +/- : %.2f\nTotal bases added : %i\nLargest gap resolved (bp) : %i\nShortest gap resolved (bp) : %i\n---------------------------------------------\n"; |
123 | 124 | my @arrsg=@$gsl; |
124 | 125 | my $numgaps = $#arrsg+1; |
125 | | -printf $final_message, ($numgaps,$avg,$sd,$sum,$max,$min); |
126 | | -printf LOG $final_message, ($numgaps,$avg,$sd,$sum,$max,$min); |
| 126 | +my $percentclosed = $numgaps / $totalgap *100; |
| 127 | +printf $final_message, ($numgaps,$totalgap,$percentclosed,$avg,$sd,$sum,$max,$min); |
| 128 | +printf LOG $final_message, ($numgaps,$totalgap,$percentclosed,$avg,$sd,$sum,$max,$min); |
127 | 129 |
|
128 | | -$assemblyruninfo .= "done: $date\n\n--------------- $0 Summary ---------------\nNumber of gaps patched : $numgaps\nAverage length (bp) : $avg\nLength st.dev +/- : $sd\nTotal bases added : $sum\nLargest gap resolved (bp) : $max\nShortest gap resolved (bp) : $min\n---------------------------------------------\n"; |
| 130 | +$assemblyruninfo .= "done: $date\n\n--------------- $0 Summary ---------------\nNumber of gaps patched : $numgaps out of $totalgap ($percentclosed %) \nAverage length (bp) : $avg\nLength st.dev +/- : $sd\nTotal bases added : $sum\nLargest gap resolved (bp) : $max\nShortest gap resolved (bp) : $min\n---------------------------------------------\n"; |
129 | 131 |
|
130 | | -exit; |
| 132 | +#exit; |
131 | 133 |
|
132 | 134 | ###for dev. test purposes |
133 | 135 | eval{ |
|
150 | 152 | #----------------- |
151 | 153 | sub readSeqMemory{ |
152 | 154 |
|
153 | | - my $file = shift; |
| 155 | + my $file = shift; |
154 | 156 |
|
155 | 157 | my $fh; |
156 | 158 | my $prev="NA"; |
@@ -252,7 +254,7 @@ sub patchGaps{ |
252 | 254 | print "$endmessage"; |
253 | 255 | $assemblyruninfo .= $endmessage; |
254 | 256 |
|
255 | | - return \@gapspatched; |
| 257 | + return \@gapspatched,$totalgap; |
256 | 258 | } |
257 | 259 |
|
258 | 260 | #--------------- |
@@ -299,7 +301,9 @@ sub readSam{ |
299 | 301 | my %rlength = (); |
300 | 302 | my $min=1; |
301 | 303 |
|
302 | | - open(IN,$samfile) || die "Error reading $samfile -- fatal.\n"; |
| 304 | + my $ERRLOG = $samfile.".bampreprocessor.err.log".$$.time(); |
| 305 | + my $cmd = "$SAMPATH view $samfile 2>$ERRLOG|"; |
| 306 | + open(IN,$cmd) || die "Error reading $samfile -- fatal.\n"; |
303 | 307 | while(<IN>){ |
304 | 308 |
|
305 | 309 | chomp; |
|
0 commit comments