1- //#define CONS_DEBUG
2-
3- /*
4-
5- TODO:
6-
7- - Reevaluate the two STR indel-size adjusting modes.
8- Maybe no longer relevant. (Looks poor to me)
9-
10- - Consider limiting fract to never add more than current depth, so we
11- change cons to Ns but not to another base type entirely.
12-
13- - Consider a separate rfract for lift-over of SNPs than for indels.
14- SNPs is good at replacing bases with N where we're unsure on the
15- data. However ref_ins may cause issues with sizing?
16- rfract*.8 is working better (so far). Trying 0.5 too.
17-
18- - Left-align indels before consensus generation. Eg:
19-
20- /pos being studied
21- AGCTGGGGGGAATCG REF
22- AGCT-GGGGGAATCG Seq type -1
23- ACGTGGGGG-AATGCG Seq type 0
24- ^
25-
26- Type 0 cons shouldn't include the right hand del, but it's outside
27- of "biggest_del" window. Expand this to STR size or left-align.
28-
29- - Long reads cause multiple scans of CIGAR to compute consensus.
30- We need a way of caching CIGAR/seq start coords for pos p=left so at
31- pos P where P>p we can start at p and continue instead of from the
32- start each time.
33-
34- - Improve QUAL scoring to consider AD vs DP.
35- Eg. AD 10,8 looks good if we have 18 sequences. High qual.
36- But AD 10,8 looks poor if we had 50 seqs. Why did we have to discard
37- 32 of them?
38-
39- */
40-
411/* bam2bcf_indel.c -- indel caller.
422
433 Copyright (C) 2010, 2011 Broad Institute.
@@ -63,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
6323FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
6424DEALINGS IN THE SOFTWARE. */
6525
26+ //#define CONS_DEBUG
27+
6628#include <assert.h>
6729#include <ctype.h>
6830#include <string.h>
@@ -126,8 +88,16 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos,
12688 return last_y ;
12789}
12890
129- // FIXME: check if the inserted sequence is consistent with the homopolymer run
130- // l is the relative gap length and l_run is the length of the homopolymer on the reference
91+ // l is the relative gap length and l_run is the length of the homopolymer
92+ // on the reference.
93+ //
94+ // Larger seqQ is good, so increasing tandemQ calls more indels,
95+ // and longer l_run means fewer calls. It is capped later at 255.
96+ // For short l_runs, the qual is simply based on size of indel
97+ // larger ones being considered more likely to be real.
98+ // Longer indels get assigned a score based on the relative indel size
99+ // to homopolymer, where l_run base will have already been verified by
100+ // the caller to ensure it's compatible.
131101static inline int est_seqQ (const bcf_callaux_t * bca , int l , int l_run )
132102{
133103 int q , qh ;
@@ -408,7 +378,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
408378 int left , int right ,
409379 int sample , int type , int biggest_del ,
410380 int * left_shift , int * right_shift ,
411- int * band , int * tcon_len ) {
381+ int * band , int * tcon_len , int * cpos_pos ) {
412382 // Map ASCII ACGTN* to 012345
413383 static uint8_t base6 [256 ] = {
414384 4 ,4 ,4 ,4 ,4 ,4 ,4 ,4 , 4 ,4 ,4 ,4 ,4 ,4 ,4 ,4 , 4 ,4 ,4 ,4 ,4 ,4 ,4 ,4 , 4 ,4 ,4 ,4 ,4 ,4 ,4 ,4 ,
@@ -717,8 +687,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
717687 // Used in cnum==1 to do the opposite of whichever way we did before.
718688 int heti [1024 ] = {0 }, hetd [1024 ] = {0 };
719689
690+ * cpos_pos = -1 ;
720691 for (cnum = 0 ; cnum < 2 ; cnum ++ ) {
721692 for (i = k = 0 ; i < right - left ; i ++ ) {
693+ // Location in consensus matching the indel itself
694+ if (i >= pos - left + 1 && * cpos_pos == -1 )
695+ * cpos_pos = k ;
696+
722697 int max_v = 0 , max_v2 = 0 , max_j = 4 , max_j2 = 4 , tot = 0 ;
723698 for (j = 0 ; j < 6 ; j ++ ) {
724699 // Top 2 consensus calls
@@ -815,7 +790,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
815790 (* right_shift )++ ;
816791 } else {
817792 // Finally the easy case - a non-indel base or an N
818- // FIXME: make cons[] in 0,1,2,3,4,5 terms
819793 if (max_v > CONS_CUTOFF * tot )
820794 cons [cnum ][k ++ ] = max_j ; // "ACGTN*"
821795 else if (max_v > 0 )
@@ -1320,6 +1294,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13201294
13211295 // The length of the homopolymer run around the current position
13221296 l_run = bcf_cgp_l_run (ref , pos );
1297+ int l_run_base = seq_nt16_table [(uint8_t )ref [pos + 1 ]];
1298+ int l_run_ins = 0 ;
13231299
13241300 // construct the consensus sequence (minus indels, which are added later)
13251301 if (max_ins > 0 ) {
@@ -1371,21 +1347,38 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13711347 char * * tcons ;
13721348 int left_shift , right_shift ;
13731349 int tcon_len [2 ];
1350+ int cpos_pos ;
13741351 tcons = bcf_cgp_consensus (n , n_plp , plp , pos , bca , ref ,
13751352 left , right , s , types [t ], biggest_del ,
13761353 & left_shift , & right_shift , & band ,
1377- tcon_len );
1354+ tcon_len , & cpos_pos );
13781355#ifdef CONS_DEBUG
1379- for (j = 0 ; j < 2 ; j ++ ) {
1380- int k ;
1381- fprintf (stderr , "Cons%d @ %d %4d/%3d " ,
1382- pos , types [t ], left_shift );
1383- for (k = 0 ; k < tcon_len [j ]; k ++ )
1384- putc ("ACGTN" [(uint8_t )tcons [j ][k ]], stderr );
1385- putc ('\n' , stderr );
1356+ {
1357+ int j ;
1358+ for (j = 0 ; j < 2 ; j ++ ) {
1359+ int k ;
1360+ fprintf (stderr , "Cons%d @ %d %4d/%4d " ,
1361+ j , pos , types [t ], left_shift );
1362+ for (k = 0 ; k < tcon_len [j ]; k ++ ) {
1363+ if (k == cpos_pos )
1364+ putc ('#' , stderr );
1365+ putc ("ACGTN" [(uint8_t )tcons [j ][k ]], stderr );
1366+ }
1367+ putc ('\n' , stderr );
1368+ }
13861369 }
13871370#endif
13881371
1372+ // Scan for base-runs in the insertion.
1373+ int k = tcons [0 ][cpos_pos ], j ;
1374+ for (j = 0 ; j < types [t ]; j ++ )
1375+ if (tcons [0 ][cpos_pos + j ] != k )
1376+ break ;
1377+ if (j && j == types [t ])
1378+ l_run_ins |= "\x1\x2\x4\x8\xf" [k ]; // ACGTN
1379+ if (types [t ] < 0 )
1380+ l_run_ins |= 0xff ;
1381+
13891382 // align each read to consensus(es)
13901383 for (i = 0 ; i < n_plp [s ]; ++ i , ++ K ) {
13911384 bam_pileup1_t * p = plp [s ] + i ;
@@ -1542,6 +1535,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
15421535 }
15431536
15441537 // compute indelQ
1538+ if (!(l_run_base & l_run_ins ))
1539+ l_run = 1 ; // different base type in ins to flanking region.
15451540 n_alt = bcf_cgp_compute_indelQ (n , n_plp , plp , bca , inscns , l_run , max_ins ,
15461541 ref_type , types , n_types , score );
15471542
0 commit comments