Skip to content

Commit 5b9c4bf

Browse files
committed
v1.4 offers better read handling
1. Save in memory gap sequence (patch), from highest-matching read for both tools 2. Track #reads support in cobbler (-l) and RAILS, and allow cutoff when scaffolding (-l and -a), with latter (RAILS) 3. Remove the hardcoded two-hit requirement for a read in RAILS. Instead, process two best hits for each read aligning different sequences 4. Implement grace (-g) option, which effectively simulate read trimming (valuable for ONT mapping) 5. bug fixes (-list.tsv (cobbler) reported some instances of gap-fill regions not fixed in the assembly – very marginal ~2.4% discrepancy). cobbler gap-fill table now lists the number of supporting reads for each gap filled
1 parent 36a692d commit 5b9c4bf

15 files changed

Lines changed: 3066 additions & 80 deletions

File tree

bin/RAILS

Lines changed: 45 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@
2323
use strict;
2424
use Getopt::Std;
2525
use Net::SMTP;
26-
use vars qw($opt_f $opt_s $opt_d $opt_i $opt_e $opt_l $opt_a $opt_v $opt_b $opt_t $opt_p $opt_q);
26+
use vars qw($opt_f $opt_s $opt_d $opt_i $opt_e $opt_l $opt_a $opt_v $opt_b $opt_t $opt_p $opt_q $opt_g);
2727
getopts('f:s:d:e:l:a:v:b:t:p:i:q:');
28-
my ($base_name,$anchor,$seqid,$insert_stdev,$min_links,$max_link_ratio,$verbose)=("",1000,0.9,1.0,1,0,0);
28+
my ($base_name,$anchor,$seqid,$insert_stdev,$min_links,$max_link_ratio,$grace,$verbose)=("",1000,0.9,1.0,1,0.99,1,0,);
2929

30-
my $version = "[v1.3]";
30+
my $version = "[v1.4]";
3131
my $dev = "rwarren\@bcgsc.ca";
3232
my $SAMPATH = "/gsc/btl/linuxbrew/bin/samtools";
3333
#-------------------------------------------------
@@ -41,8 +41,9 @@ if(! $opt_f || ! $opt_s || ! $opt_q){
4141
print "-i Minimum sequence identity fraction (0 to 1), default -i $seqid, optional\n";
4242
print "-t LIST of names/header, long sequences to avoid using for merging/gap-filling scaffolds (optional)\n";
4343
#print "-e Error (%) allowed on -d distance e.g. -e 0.1 == distance +/- 10% (default -e $insert_stdev, optional)\n";
44-
#print "-l Minimum number of links to compute scaffold (default -l $min_links, optional)\n";
45-
#print "-a Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a $max_link_ratio, optional)\n";
44+
print "-l Minimum number of links to compute scaffold (default -l $min_links, optional)\n";
45+
print "-a Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a $max_link_ratio, optional)\n";
46+
print "-g Grace length (bp), default -g $grace, optional\n";
4647
print "-b Base name for your output files (optional)\n";
4748
print "-v Runs in verbose mode (-v 1 = yes, default = no, optional)\n";
4849
die "IMPORTANT: the order of files in -q and -s MUST match!\n";
@@ -55,11 +56,12 @@ $anchor = $opt_d if($opt_d);
5556
$seqid = $opt_i if($opt_i);
5657
$verbose = $opt_v if($opt_v);
5758
#DO NOT UNCOMMENT THE FOLLOWING LINES, ONLY DEFAULTS TO BE USED WITH RAILS
58-
#$min_links = $opt_l if($opt_l);
59-
#$max_link_ratio = $opt_a if($opt_a);
59+
$min_links = $opt_l if($opt_l);
60+
$max_link_ratio = $opt_a if($opt_a);
6061
#$insert_stdev = $opt_e if($opt_e);
6162
my $listfile = $opt_t if($opt_t);
6263
$base_name = $opt_b if($opt_b);
64+
$grace = $opt_g if($opt_g);
6365

6466
my $assemblyruninfo="";
6567

@@ -75,7 +77,7 @@ if(! -e $fof){
7577
### Naming output files
7678
if ($base_name eq ""){
7779

78-
$base_name = $file . ".scaff_s-" . $fof . "_q-" . $queryfof . "_d" . $anchor . "_i" . $seqid . "_e" . $insert_stdev . "_l" . $min_links . "_a" . $max_link_ratio . "_t" . $listfile;
80+
$base_name = $file . ".scaff_s-" . $fof . "_q-" . $queryfof . "_d" . $anchor . "_i" . $seqid . "_e" . $insert_stdev . "_l" . $min_links . "_a" . $max_link_ratio . "_g" . $grace . "_t" . $listfile;
7981

8082
my $pid_num = getpgrp(0);
8183
$base_name .= "_pid" . $pid_num;
@@ -93,7 +95,7 @@ open (LOG, ">$log") || die "Can't write to $log -- fatal\n";
9395

9496
my $init_message = "\nRunning: $0 $version\n-f $file\n-q $queryfof\n-s $fof\n";
9597

96-
$init_message .= "-d $anchor\n-i $seqid\n-e $insert_stdev\n-l $min_links\n-a $max_link_ratio\n-t $listfile\n";
98+
$init_message .= "-d $anchor\n-i $seqid\n-e $insert_stdev\n-l $min_links\n-a $max_link_ratio\n-g $grace\n-t $listfile\n";
9799

98100
print $init_message;
99101
print LOG $init_message;
@@ -130,7 +132,7 @@ while(<FOF>){
130132
my $bamfile = $_;
131133
my $rh = &readSeqMemory($qryfilearray[$ctline]);### ONLY READ READ SEQUENCE IN MEMORY FOR THOSE MATCHING BAM (SAME ORDER NEEDED)
132134
print "Parsing alignment file $bamfile...\n";
133-
($matepair,$track_all,$tigpair)=&readBam($matepair,$track_all,$tigpair,$bamfile,$anchor,$seqid,$listfile,$matepair,$initpos,$rh);
135+
($matepair,$track_all,$tigpair)=&readBam($matepair,$track_all,$tigpair,$bamfile,$anchor,$seqid,$listfile,$matepair,$initpos,$rh,$grace);
134136
print "done.\n";
135137
$ctline++;
136138
}
@@ -269,7 +271,7 @@ sub readContigs{
269271
#---------------
270272
sub readBam{
271273

272-
my ($matepair,$track_all,$tigpair,$bamfile,$anchor,$seqid,$listfile,$matepair,$initpos,$rh) = @_;
274+
my ($matepair,$track_all,$tigpair,$bamfile,$anchor,$seqid,$listfile,$matepair,$initpos,$rh,$grace) = @_;
273275

274276
my $mem;
275277
if(-f $listfile){
@@ -306,7 +308,6 @@ sub readBam{
306308

307309

308310
my %rlength = ();
309-
my $min=1;
310311

311312
my $ERRLOG = $bamfile.".bampreprocessor.err.log".$$.time();
312313
my $cmd = "$SAMPATH view $bamfile 2>$ERRLOG|";###read BAM
@@ -379,14 +380,14 @@ sub readBam{
379380
$si = ($qalen - $edit_dist) / $qalen if($qalen);
380381

381382

382-
if($si >= $seqid && $qalen >= $anchor && (( $rstart <= $min && ($qlen-$qend)<= $min) || ($qstart<=$min && ($rlen-$rend)<=$min ) )){ ### this indicates anchoring bases, within $anchor of edges
383+
if($si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace) || ($qstart<=$grace && ($rlen-$rend)<=$grace ) )){ ### this indicates anchoring bases, within $anchor of edges
383384

384385

385-
# print "$si >= $seqid && $qalen >= $anchor && (( $rstart <= $min && ($qlen-$qend)<= $min) || ($qstart<=$min && ($rlen-$rend)<=$min\n";
386+
# print "$si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace) || ($qstart<=$grace && ($rlen-$rend)<=$grace\n" if($verbose);
386387
my $start;
387388
my $end;
388389
###Coordinates on the scaffolds
389-
if($rstart <= $min && ($qlen-$qend)<= $min){
390+
if($rstart <= $grace && ($qlen-$qend)<= $grace){
390391
$start = $rend;
391392
$end = $rstart;
392393
}else{
@@ -404,7 +405,7 @@ sub readBam{
404405
$orient="f";
405406
}
406407

407-
#$t->{$a[0]}{$a[2]}{'order'}=$ct;
408+
$t->{$a[0]}{$a[2]}{'order'}=$ct;
408409
$t->{$a[0]}{$a[2]}{'orient'}=$orient ;
409410
$t->{$a[0]}{$a[2]}{'real'}=$read;
410411
$t->{$a[0]}{$a[2]}{'length'}=$qlen;
@@ -414,13 +415,12 @@ sub readBam{
414415
$track_all->{$read}{'start'}=$start;
415416
$track_all->{$read}{'end'}=$end;
416417
$track_all->{$read}{'multiple'}=1;
417-
418418
$track_all->{$read}{'sam'}=$line;
419419
$track_all->{$read}{'orient'}=$orient;
420-
$track_all->{$read}{'qalen'}=$qalen;
420+
$track_all->{$read}{'qalen'}=$qalen;### tracks anchor size
421421
$track_all->{$read}{'qstart'}=$qstart;
422422
$track_all->{$read}{'qend'}=$qend;
423-
# print "$line\n\n";
423+
$track_all->{$read}{'si'}=$si; ### tracks sequence identity
424424
}
425425
}
426426
}
@@ -431,26 +431,41 @@ sub readBam{
431431
my $scafflist=$t->{$rd};
432432
my $num = keys(%$scafflist);
433433

434-
if($num==2){###maps on two different scaffolds only
434+
#if($num==2){###maps on two different scaffolds only
435435
my @arr;
436436
my $prev="";
437437
my $current="";
438438
my $totalreadlength=0;
439-
foreach my $scaff(keys %$scafflist){
440-
#foreach my $scaff(sort {$scafflist->{$a}{'order'}<=>$scafflist->{$a}{'order'}} keys %$scafflist){
439+
my $counttig = 0;
440+
#foreach my $scaff(keys %$scafflist){
441+
GETCONTIG:
442+
foreach my $scaff(sort {$scafflist->{$a}{'order'}<=>$scafflist->{$b}{'order'}} keys %$scafflist){### best contig alignments listed first
443+
$counttig++;
441444
$current = $scafflist->{$scaff}{'real'};
442445
$prev = $current if($prev eq "");
443446
$totalreadlength = $scafflist->{$scaff}{'length'};
447+
last GETCONTIG if($counttig==2);
444448
}
445449

446450
my ($one,$two)=($track_all->{$current}{'tig'},$track_all->{$prev}{'tig'});
447451
if($track_all->{$prev}{'tig'}<$track_all->{$current}{'tig'}){
448452
($one,$two)=($track_all->{$prev}{'tig'},$track_all->{$current}{'tig'})
449453
}
450454

451-
if(! defined $mem->{$rd} && ! defined $bt->{$one}{$two} && ($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'} || $track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}) && $rh->{$rd} ne ""){### ADDED NO NULL SEQUENCES 30JAN2016
452-
#if(! defined $mem->{$rd} && ($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'} || $track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}) && $rh->{$rd} ne ""){### ADDED NO NULL SEQUENCES 30JAN2016
453-
$bt->{$one}{$two}=1;### beenthere, bt, tracks that two contigs have been merged by a read (first one it saw here) .. to allow patch to match the chosen read support
455+
### this will track the best anchoring long reads for the merge/gapfill
456+
my $m1 = $track_all->{$current}{'qalen'} * $track_all->{$current}{'si'};
457+
my $m2 = $track_all->{$prev}{'qalen'} * $track_all->{$prev}{'si'};
458+
my $matchbases = $m1 + $m2;
459+
460+
if(! defined $mem->{$rd} && ($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'} || $track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}) && $rh->{$rd} ne ""){### WILL TRACK BEST ANCHORING BASES
461+
462+
### This is used to count #support linkages and -a
463+
$matepair->{$prev}{$current}{'is'} = $totalreadlength;# - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'} );
464+
$matepair->{$prev}{$current}{'bt'}=0;
465+
466+
if($matchbases > $bt->{$one}{$two}{'bestmatch'}){###conditional to track best patch sequence only
467+
468+
$bt->{$one}{$two}{'bestmatch'} = $matchbases; ### beenthere, bt, tracks that two contigs have been merged by a read (first one it saw here) .. to allow patch to match the chosen read support
454469
my $pos=0;
455470
$pos = $track_all->{$prev}{'qend'} if($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'});
456471
$pos = $track_all->{$current}{'qend'} if($track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'});
@@ -462,37 +477,29 @@ sub readBam{
462477

463478
print "GAP:$patch\n" if($verbose);
464479

465-
$matepair->{$prev}{$current}{'is'}= $totalreadlength;# - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'} );
466-
$matepair->{$prev}{$current}{'bt'}=0;
467-
468480
###JUST SOME TEST CODE
469481
if(defined $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}){### previous patch saved
470482
$occ++;
483+
$same++ if($patch eq $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'});
471484
#print "$prev ($track_all->{$prev}{'tig'})...$current ($track_all->{$current}{'tig'})\n$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}\nNEW GAP:\n$patch\n";
472-
#if($patch ne $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}){print "NOT SAME\n\n";}else{print "SAME\n\n";$same++;}
485+
#if($patch ne $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}){print "NOT SAME\n\n";}else{print "SAME\n\n";}
473486
}
487+
474488
### EVEN THOUGH A COUNTER 'distr' TRACKS NUMBER OF SUPPORT FOR TIG A AND B, LAST COMBO OVERRIDES REST THIS COULD BE PROBLEMATIC WHEN MULTIPLE READS SUPPORT SAME CONTIGS, SEE NOTE ABOVE AND REASON TO LIMIT SUPPORT TO FIRST READ SEEN
475489
$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'distr'}++;
476490
$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}=$patch;
477491
$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'configuration'}=$track_all->{$prev}{'orient'} . $track_all->{$prev}{'tig'} . $track_all->{$current}{'orient'} . $track_all->{$current}{'tig'};
478492
$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'origin'}=$rd;
479-
#$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'oriseq'}=$rh->{$rd};###original long read seq - consider commenting out
480493

481494
print ">>>> $track_all->{$prev}{'tig'} $track_all->{$current}{'tig'} $patch\n" if($verbose);
482495

483-
#scaffold4541,124801,f1463171Z123813k1a0m501_r634444z988
484-
#scaffold4575,126112,f1463240Z123471k1a0m106_f511792z1266k1a0m273_r473614z1375
485-
#scaffold4765,122476,f861408z779k1a0m765_f4120Z121697
486-
#if($one == 1463171 || $one == 634444 || $one == 1463240 || $one == 511792 || $one == 473614 || $one == 861408 || $one == 4120){
487-
#print "$track_all->{$prev}{'sam'}\n$track_all->{$current}{'sam'}\n x====x ($totalreadlength) ($track_all->{$prev}{'qstart'}-$track_all->{$prev}{'qend'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND ($track_all->{$current}{'qstart'}-$track_all->{$current}{'qend'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'})\n===x x==== sc$track_all->{$prev}{'tig'} ($track_all->{$prev}{'start'}-$track_all->{$prev}{'end'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND sc$track_all->{$current}{'tig'} ($track_all->{$current}{'start'}-$track_all->{$current}{'end'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'}) \n\n";
488-
#}
489496
print "$track_all->{$prev}{'sam'}\n$track_all->{$current}{'sam'}\n x====x ($totalreadlength) ($track_all->{$prev}{'qstart'}-$track_all->{$prev}{'qend'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND ($track_all->{$current}{'qstart'}-$track_all->{$current}{'qend'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'})\n===x x==== sc$track_all->{$prev}{'tig'} ($track_all->{$prev}{'start'}-$track_all->{$prev}{'end'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND sc$track_all->{$current}{'tig'} ($track_all->{$current}{'start'}-$track_all->{$current}{'end'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'}) \n\n" if($verbose);
497+
}###matchbase used to track best patch sequence
490498
}
491-
}
499+
#}
492500
}
493501

494502
print "\nRedundant same contig combo linking:$occ\nSame gap sequence fill:$same\n\n";
495-
496503
return $matepair,$track_all,$tigpair;
497504
}
498505

0 commit comments

Comments
 (0)