2323use strict;
2424use Getopt::Std;
2525use Net::SMTP;
26- use vars qw( $opt_f $opt_s $opt_d $opt_i $opt_e $opt_l $opt_a $opt_v $opt_b $opt_t $opt_p $opt_q) ;
26+ use vars qw( $opt_f $opt_s $opt_d $opt_i $opt_e $opt_l $opt_a $opt_v $opt_b $opt_t $opt_p $opt_q $opt_g ) ;
2727getopts(' f:s:d:e:l:a:v:b:t:p:i:q:' );
28- my ($base_name ,$anchor ,$seqid ,$insert_stdev ,$min_links ,$max_link_ratio ,$verbose )=(" " ,1000,0.9,1.0,1,0,0 );
28+ my ($base_name ,$anchor ,$seqid ,$insert_stdev ,$min_links ,$max_link_ratio ,$grace , $ verbose )=(" " ,1000,0.9,1.0,1,0.99,1,0, );
2929
30- my $version = " [v1.3 ]" ;
30+ my $version = " [v1.4 ]" ;
3131my $dev = " rwarren\@ bcgsc.ca" ;
3232my $SAMPATH = " /gsc/btl/linuxbrew/bin/samtools" ;
3333# -------------------------------------------------
@@ -41,8 +41,9 @@ if(! $opt_f || ! $opt_s || ! $opt_q){
4141 print " -i Minimum sequence identity fraction (0 to 1), default -i $seqid , optional\n " ;
4242 print " -t LIST of names/header, long sequences to avoid using for merging/gap-filling scaffolds (optional)\n " ;
4343 # print "-e Error (%) allowed on -d distance e.g. -e 0.1 == distance +/- 10% (default -e $insert_stdev, optional)\n";
44- # print "-l Minimum number of links to compute scaffold (default -l $min_links, optional)\n";
45- # print "-a Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a $max_link_ratio, optional)\n";
44+ print " -l Minimum number of links to compute scaffold (default -l $min_links , optional)\n " ;
45+ print " -a Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a $max_link_ratio , optional)\n " ;
46+ print " -g Grace length (bp), default -g $grace , optional\n " ;
4647 print " -b Base name for your output files (optional)\n " ;
4748 print " -v Runs in verbose mode (-v 1 = yes, default = no, optional)\n " ;
4849 die " IMPORTANT: the order of files in -q and -s MUST match!\n " ;
@@ -55,11 +56,12 @@ $anchor = $opt_d if($opt_d);
5556$seqid = $opt_i if ($opt_i );
5657$verbose = $opt_v if ($opt_v );
5758# DO NOT UNCOMMENT THE FOLLOWING LINES, ONLY DEFAULTS TO BE USED WITH RAILS
58- # $min_links = $opt_l if($opt_l);
59- # $max_link_ratio = $opt_a if($opt_a);
59+ $min_links = $opt_l if ($opt_l );
60+ $max_link_ratio = $opt_a if ($opt_a );
6061# $insert_stdev = $opt_e if($opt_e);
6162my $listfile = $opt_t if ($opt_t );
6263$base_name = $opt_b if ($opt_b );
64+ $grace = $opt_g if ($opt_g );
6365
6466my $assemblyruninfo =" " ;
6567
@@ -75,7 +77,7 @@ if(! -e $fof){
7577# ## Naming output files
7678if ($base_name eq " " ){
7779
78- $base_name = $file . " .scaff_s-" . $fof . " _q-" . $queryfof . " _d" . $anchor . " _i" . $seqid . " _e" . $insert_stdev . " _l" . $min_links . " _a" . $max_link_ratio . " _t" . $listfile ;
80+ $base_name = $file . " .scaff_s-" . $fof . " _q-" . $queryfof . " _d" . $anchor . " _i" . $seqid . " _e" . $insert_stdev . " _l" . $min_links . " _a" . $max_link_ratio . " _g " . $grace . " _t" . $listfile ;
7981
8082 my $pid_num = getpgrp (0);
8183 $base_name .= " _pid" . $pid_num ;
@@ -93,7 +95,7 @@ open (LOG, ">$log") || die "Can't write to $log -- fatal\n";
9395
9496my $init_message = " \n Running: $0 $version \n -f $file \n -q $queryfof \n -s $fof \n " ;
9597
96- $init_message .= " -d $anchor \n -i $seqid \n -e $insert_stdev \n -l $min_links \n -a $max_link_ratio \n -t $listfile \n " ;
98+ $init_message .= " -d $anchor \n -i $seqid \n -e $insert_stdev \n -l $min_links \n -a $max_link_ratio \n -g $grace \n - t $listfile \n " ;
9799
98100print $init_message ;
99101print LOG $init_message ;
@@ -130,7 +132,7 @@ while(<FOF>){
130132 my $bamfile = $_ ;
131133 my $rh = &readSeqMemory($qryfilearray [$ctline ]);# ## ONLY READ READ SEQUENCE IN MEMORY FOR THOSE MATCHING BAM (SAME ORDER NEEDED)
132134 print " Parsing alignment file $bamfile ...\n " ;
133- ($matepair ,$track_all ,$tigpair )=&readBam($matepair ,$track_all ,$tigpair ,$bamfile ,$anchor ,$seqid ,$listfile ,$matepair ,$initpos ,$rh );
135+ ($matepair ,$track_all ,$tigpair )=&readBam($matepair ,$track_all ,$tigpair ,$bamfile ,$anchor ,$seqid ,$listfile ,$matepair ,$initpos ,$rh , $grace );
134136 print " done.\n " ;
135137 $ctline ++;
136138}
@@ -269,7 +271,7 @@ sub readContigs{
269271# ---------------
270272sub readBam{
271273
272- my ($matepair ,$track_all ,$tigpair ,$bamfile ,$anchor ,$seqid ,$listfile ,$matepair ,$initpos ,$rh ) = @_ ;
274+ my ($matepair ,$track_all ,$tigpair ,$bamfile ,$anchor ,$seqid ,$listfile ,$matepair ,$initpos ,$rh , $grace ) = @_ ;
273275
274276 my $mem ;
275277 if (-f $listfile ){
@@ -306,7 +308,6 @@ sub readBam{
306308
307309
308310 my %rlength = ();
309- my $min =1;
310311
311312 my $ERRLOG = $bamfile ." .bampreprocessor.err.log" .$$ .time ();
312313 my $cmd = " $SAMPATH view $bamfile 2>$ERRLOG |" ;# ##read BAM
@@ -379,14 +380,14 @@ sub readBam{
379380 $si = ($qalen - $edit_dist ) / $qalen if ($qalen );
380381
381382
382- if ($si >= $seqid && $qalen >= $anchor && (( $rstart <= $min && ($qlen -$qend )<= $min ) || ($qstart <=$min && ($rlen -$rend )<=$min ) )){ # ## this indicates anchoring bases, within $anchor of edges
383+ if ($si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen -$qend )<= $grace ) || ($qstart <=$grace && ($rlen -$rend )<=$grace ) )){ # ## this indicates anchoring bases, within $anchor of edges
383384
384385
385- # print "$si >= $seqid && $qalen >= $anchor && (( $rstart <= $min && ($qlen-$qend)<= $min ) || ($qstart<=$min && ($rlen-$rend)<=$min \n";
386+ # print "$si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace ) || ($qstart<=$grace && ($rlen-$rend)<=$grace \n" if($verbose) ;
386387 my $start ;
387388 my $end ;
388389 # ##Coordinates on the scaffolds
389- if ($rstart <= $min && ($qlen -$qend )<= $min ){
390+ if ($rstart <= $grace && ($qlen -$qend )<= $grace ){
390391 $start = $rend ;
391392 $end = $rstart ;
392393 }else {
@@ -404,7 +405,7 @@ sub readBam{
404405 $orient =" f" ;
405406 }
406407
407- # $t->{$a[0]}{$a[2]}{'order'}=$ct;
408+ $t -> {$a [0]}{$a [2]}{' order' }=$ct ;
408409 $t -> {$a [0]}{$a [2]}{' orient' }=$orient ;
409410 $t -> {$a [0]}{$a [2]}{' real' }=$read ;
410411 $t -> {$a [0]}{$a [2]}{' length' }=$qlen ;
@@ -414,13 +415,12 @@ sub readBam{
414415 $track_all -> {$read }{' start' }=$start ;
415416 $track_all -> {$read }{' end' }=$end ;
416417 $track_all -> {$read }{' multiple' }=1;
417-
418418 $track_all -> {$read }{' sam' }=$line ;
419419 $track_all -> {$read }{' orient' }=$orient ;
420- $track_all -> {$read }{' qalen' }=$qalen ;
420+ $track_all -> {$read }{' qalen' }=$qalen ;# ## tracks anchor size
421421 $track_all -> {$read }{' qstart' }=$qstart ;
422422 $track_all -> {$read }{' qend' }=$qend ;
423- # print "$line\n\n";
423+ $track_all -> { $read }{ ' si ' }= $si ; # ## tracks sequence identity
424424 }
425425 }
426426 }
@@ -431,26 +431,41 @@ sub readBam{
431431 my $scafflist =$t -> {$rd };
432432 my $num = keys (%$scafflist );
433433
434- if ($num ==2){# ##maps on two different scaffolds only
434+ # if($num==2){###maps on two different scaffolds only
435435 my @arr ;
436436 my $prev =" " ;
437437 my $current =" " ;
438438 my $totalreadlength =0;
439- foreach my $scaff (keys %$scafflist ){
440- # foreach my $scaff(sort {$scafflist->{$a}{'order'}<=>$scafflist->{$a}{'order'}} keys %$scafflist){
439+ my $counttig = 0;
440+ # foreach my $scaff(keys %$scafflist){
441+ GETCONTIG:
442+ foreach my $scaff (sort {$scafflist -> {$a }{' order' }<=> $scafflist -> {$b }{' order' }} keys %$scafflist ){# ## best contig alignments listed first
443+ $counttig ++;
441444 $current = $scafflist -> {$scaff }{' real' };
442445 $prev = $current if ($prev eq " " );
443446 $totalreadlength = $scafflist -> {$scaff }{' length' };
447+ last GETCONTIG if ($counttig ==2);
444448 }
445449
446450 my ($one ,$two )=($track_all -> {$current }{' tig' },$track_all -> {$prev }{' tig' });
447451 if ($track_all -> {$prev }{' tig' }<$track_all -> {$current }{' tig' }){
448452 ($one ,$two )=($track_all -> {$prev }{' tig' },$track_all -> {$current }{' tig' })
449453 }
450454
451- if (! defined $mem -> {$rd } && ! defined $bt -> {$one }{$two } && ($track_all -> {$current }{' qstart' } > $track_all -> {$prev }{' qend' } || $track_all -> {$prev }{' qstart' } > $track_all -> {$current }{' qend' }) && $rh -> {$rd } ne " " ){# ## ADDED NO NULL SEQUENCES 30JAN2016
452- # if(! defined $mem->{$rd} && ($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'} || $track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}) && $rh->{$rd} ne ""){### ADDED NO NULL SEQUENCES 30JAN2016
453- $bt -> {$one }{$two }=1;# ## beenthere, bt, tracks that two contigs have been merged by a read (first one it saw here) .. to allow patch to match the chosen read support
455+ # ## this will track the best anchoring long reads for the merge/gapfill
456+ my $m1 = $track_all -> {$current }{' qalen' } * $track_all -> {$current }{' si' };
457+ my $m2 = $track_all -> {$prev }{' qalen' } * $track_all -> {$prev }{' si' };
458+ my $matchbases = $m1 + $m2 ;
459+
460+ if (! defined $mem -> {$rd } && ($track_all -> {$current }{' qstart' } > $track_all -> {$prev }{' qend' } || $track_all -> {$prev }{' qstart' } > $track_all -> {$current }{' qend' }) && $rh -> {$rd } ne " " ){# ## WILL TRACK BEST ANCHORING BASES
461+
462+ # ## This is used to count #support linkages and -a
463+ $matepair -> {$prev }{$current }{' is' } = $totalreadlength ;# - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'} );
464+ $matepair -> {$prev }{$current }{' bt' }=0;
465+
466+ if ($matchbases > $bt -> {$one }{$two }{' bestmatch' }){# ##conditional to track best patch sequence only
467+
468+ $bt -> {$one }{$two }{' bestmatch' } = $matchbases ; # ## beenthere, bt, tracks that two contigs have been merged by a read (first one it saw here) .. to allow patch to match the chosen read support
454469 my $pos =0;
455470 $pos = $track_all -> {$prev }{' qend' } if ($track_all -> {$current }{' qstart' } > $track_all -> {$prev }{' qend' });
456471 $pos = $track_all -> {$current }{' qend' } if ($track_all -> {$prev }{' qstart' } > $track_all -> {$current }{' qend' });
@@ -462,37 +477,29 @@ sub readBam{
462477
463478 print " GAP:$patch \n " if ($verbose );
464479
465- $matepair -> {$prev }{$current }{' is' }= $totalreadlength ;# - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'} );
466- $matepair -> {$prev }{$current }{' bt' }=0;
467-
468480 # ##JUST SOME TEST CODE
469481 if (defined $tigpair -> {$track_all -> {$prev }{' tig' }}{$track_all -> {$current }{' tig' }}{' seq' }){# ## previous patch saved
470482 $occ ++;
483+ $same ++ if ($patch eq $tigpair -> {$track_all -> {$prev }{' tig' }}{$track_all -> {$current }{' tig' }}{' seq' });
471484 # print "$prev ($track_all->{$prev}{'tig'})...$current ($track_all->{$current}{'tig'})\n$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}\nNEW GAP:\n$patch\n";
472- # if($patch ne $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}){print "NOT SAME\n\n";}else{print "SAME\n\n";$same++; }
485+ # if($patch ne $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}){print "NOT SAME\n\n";}else{print "SAME\n\n";}
473486 }
487+
474488 # ## EVEN THOUGH A COUNTER 'distr' TRACKS NUMBER OF SUPPORT FOR TIG A AND B, LAST COMBO OVERRIDES REST THIS COULD BE PROBLEMATIC WHEN MULTIPLE READS SUPPORT SAME CONTIGS, SEE NOTE ABOVE AND REASON TO LIMIT SUPPORT TO FIRST READ SEEN
475489 $tigpair -> {$track_all -> {$prev }{' tig' }}{$track_all -> {$current }{' tig' }}{' distr' }++;
476490 $tigpair -> {$track_all -> {$prev }{' tig' }}{$track_all -> {$current }{' tig' }}{' seq' }=$patch ;
477491 $tigpair -> {$track_all -> {$prev }{' tig' }}{$track_all -> {$current }{' tig' }}{' configuration' }=$track_all -> {$prev }{' orient' } . $track_all -> {$prev }{' tig' } . $track_all -> {$current }{' orient' } . $track_all -> {$current }{' tig' };
478492 $tigpair -> {$track_all -> {$prev }{' tig' }}{$track_all -> {$current }{' tig' }}{' origin' }=$rd ;
479- # $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'oriseq'}=$rh->{$rd};###original long read seq - consider commenting out
480493
481494 print " >>>> $track_all ->{$prev }{'tig'} $track_all ->{$current }{'tig'} $patch \n " if ($verbose );
482495
483- # scaffold4541,124801,f1463171Z123813k1a0m501_r634444z988
484- # scaffold4575,126112,f1463240Z123471k1a0m106_f511792z1266k1a0m273_r473614z1375
485- # scaffold4765,122476,f861408z779k1a0m765_f4120Z121697
486- # if($one == 1463171 || $one == 634444 || $one == 1463240 || $one == 511792 || $one == 473614 || $one == 861408 || $one == 4120){
487- # print "$track_all->{$prev}{'sam'}\n$track_all->{$current}{'sam'}\n x====x ($totalreadlength) ($track_all->{$prev}{'qstart'}-$track_all->{$prev}{'qend'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND ($track_all->{$current}{'qstart'}-$track_all->{$current}{'qend'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'})\n===x x==== sc$track_all->{$prev}{'tig'} ($track_all->{$prev}{'start'}-$track_all->{$prev}{'end'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND sc$track_all->{$current}{'tig'} ($track_all->{$current}{'start'}-$track_all->{$current}{'end'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'}) \n\n";
488- # }
489496 print " $track_all ->{$prev }{'sam'}\n $track_all ->{$current }{'sam'}\n x====x ($totalreadlength ) ($track_all ->{$prev }{'qstart'}-$track_all ->{$prev }{'qend'}:$track_all ->{$prev }{'qalen'} $track_all ->{$prev }{'orient'}) AND ($track_all ->{$current }{'qstart'}-$track_all ->{$current }{'qend'}:$track_all ->{$current }{'qalen'} $track_all ->{$current }{'orient'})\n ===x x==== sc$track_all ->{$prev }{'tig'} ($track_all ->{$prev }{'start'}-$track_all ->{$prev }{'end'}:$track_all ->{$prev }{'qalen'} $track_all ->{$prev }{'orient'}) AND sc$track_all ->{$current }{'tig'} ($track_all ->{$current }{'start'}-$track_all ->{$current }{'end'}:$track_all ->{$current }{'qalen'} $track_all ->{$current }{'orient'}) \n\n " if ($verbose );
497+ }# ##matchbase used to track best patch sequence
490498 }
491- }
499+ # }
492500 }
493501
494502 print " \n Redundant same contig combo linking:$occ \n Same gap sequence fill:$same \n\n " ;
495-
496503 return $matepair ,$track_all ,$tigpair ;
497504}
498505
0 commit comments