11/* bam2bcf_indel.c -- indel caller.
22
33 Copyright (C) 2010, 2011 Broad Institute.
4- Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd.
4+ Copyright (C) 2012-2014,2016-2017, 2021-2022 Genome Research Ltd.
55
66 Author: Heng Li <lh3@sanger.ac.uk>
77
@@ -283,7 +283,7 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
283283 if (sz == 0
284284 || j - i >= bca -> min_support
285285 // Note, doesn't handle bca->per_sample_flt yet
286- || bca -> per_sample_flt
286+ || bca -> per_sample_flt
287287 || (double )(j - i ) / n_tot >= bca -> min_frac )
288288 types [t ++ ] = sz ;
289289 i = j - 1 ;
@@ -309,7 +309,7 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
309309}
310310
311311// Increment ins["str"] and freq["str"]
312- #define NI 10 // number of alternative insertion sequences
312+ #define NI 100 // number of alternative insertion sequences
313313// Could use a hash table too, but expectation is a tiny number of alternatives
314314typedef struct {
315315 char * str [NI ];
@@ -410,6 +410,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
410410 int (* ref_base )[6 ] = calloc (right - left + 1 , sizeof (* ref_base ));
411411 str_freq * ref_ins = calloc (right - left + 1 , sizeof (* ref_ins ));
412412 int i , j , k , s = sample ;
413+ char * * cons = NULL ;
414+
415+ if (!cons_base || !cons_ins || !ref_base || !ref_ins )
416+ goto err ;
413417
414418 //--------------------------------------------------
415419 // Accumulate sequences into cons_base and cons_ins arrays
@@ -479,9 +483,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
479483 if (p -> indel == type /*&& x == pos+1*/ ) {
480484 // Assume any ins of the same size is the same ins.
481485 // (This rescues misaligned insertions.)
482- bcf_cgp_append_cons (& cons_ins [x - left ], ins , ilen , 1 );
486+ if (bcf_cgp_append_cons (& cons_ins [x - left ], ins ,
487+ ilen , 1 ) < 0 )
488+ goto err ;
483489 } else if (x != pos + 1 ){
484- bcf_cgp_append_cons (& ref_ins [x - left ], ins , ilen , 1 );
490+ if (bcf_cgp_append_cons (& ref_ins [x - left ], ins ,
491+ ilen , 1 ) < 0 )
492+ goto err ;
485493 }
486494 }
487495 break ;
@@ -550,7 +558,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
550558 // When evaluating this particular indel, we don't want to
551559 // penalise alignments by SNP errors elsewhere. This can
552560 // happen when we have low depth for a particular 'type'.
553- //
561+ //
554562 // So add in a little data from ref_base/ref_ins.
555563 double rfract = (r - t * 2 )* .75 / (r + 1 );
556564
@@ -582,9 +590,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
582590 for (j = 0 ; j < NI ; j ++ ) {
583591 if (!ref_ins [i ].str [j ])
584592 break ;
585- bcf_cgp_append_cons (& cons_ins [i ],
586- ref_ins [i ].str [j ], ref_ins [i ].len [j ],
587- rfract * ref_ins [i ].freq [j ]);
593+ if (bcf_cgp_append_cons (& cons_ins [i ],
594+ ref_ins [i ].str [j ], ref_ins [i ].len [j ],
595+ rfract * ref_ins [i ].freq [j ]) < 0 )
596+ goto err ;
588597 }
589598 }
590599
@@ -604,7 +613,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
604613 }
605614 max_len += ins ;
606615 }
607- char * * cons = malloc ((max_len + 1 )* 2 + sizeof (char * )* 2 );
616+ cons = malloc ((max_len + 1 )* 2 + sizeof (char * )* 2 );
617+ if (!cons )
618+ goto err ;
608619 cons [0 ] = (char * )& cons [2 ];
609620 cons [1 ] = cons [0 ] + max_len + 1 ;
610621
@@ -802,7 +813,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
802813 tcon_len [cnum ] = k ;
803814 }
804815
805- // FIXME: replace by string pool for rapid tidying
816+ // TODO: replace by io_lib's string pool for rapid tidying.
817+ // For now this isn't the bottleneck though.
806818 for (i = 0 ; i < right - left ; i ++ ) {
807819 for (j = 0 ; j < NI ; j ++ ) {
808820 if (cons_ins [i ].str [j ])
@@ -812,6 +824,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
812824 }
813825 }
814826
827+ err :
815828 free (cons_base );
816829 free (ref_base );
817830 free (cons_ins );
@@ -841,7 +854,9 @@ static int bcf_cgp_l_run(const char *ref, int pos) {
841854
842855
843856// Compute the insertion consensus for this sample 's' via a basic
844- // majority rule
857+ // majority rule.
858+ //
859+ // TODO: merge this into bcf_cgp_consensus as another return value?
845860static char * bcf_cgp_calc_ins_cons (int n , int * n_plp , bam_pileup1_t * * plp ,
846861 int pos , int * types , int n_types ,
847862 int max_ins , int s ) {
@@ -962,25 +977,18 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
962977 // Trim poly_Ns at ends of ref.
963978 // This helps to keep len(ref) and len(query) similar, to reduce
964979 // band size and reduce the chance of -ve BAQ scores.
965-
966- // FIXME Maybe instead of l>ABS(type) it should be l>query_len/2 ?
967- // TODO: no difference to result, but what difference is there to
968- // speed? Is this worth it?
969- #if 1
970980 for (l = 0 ; l < tend1 - tbeg && l < tend2 - tbeg ; l ++ )
971981 if (ref1 [l + tbeg - left ] != 4 || ref2 [l + tbeg - left ] != 4 )
972982 break ;
973- if (l > ABS (type )) {
983+ if (l > ABS (type ))
974984 tbeg += l - ABS (type );
975- }
976985
977986 for (l = tend1 - tbeg - 1 ; l >= 0 ; l -- )
978987 if (ref1 [l + tbeg - left ] != 4 )
979988 break ;
980989 l = tend1 - tbeg - 1 - l ;
981- if (l > ABS (type )) {
990+ if (l > ABS (type ))
982991 tend1 -= l - ABS (type );
983- }
984992
985993 for (l = tend2 - tbeg - 1 ; l >= 0 ; l -- )
986994 if (ref2 [l + tbeg - left ] != 4 )
@@ -989,7 +997,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
989997 if (l > ABS (type )) {
990998 tend2 -= l - ABS (type );
991999 }
992- #endif
9931000
9941001 // Get segment of quality, either ZQ tag or if absent QUAL.
9951002 if (!(qq = (uint8_t * ) calloc (qend - qbeg , 1 )))
@@ -1003,37 +1010,14 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
10031010 if (qval < 7 )
10041011 qval = 7 ;
10051012 qq [l - qbeg ] = qval ;
1006-
1007- // Skew qq at qpos to be higher than background and qq at
1008- // other regions to be lower. This means the alignment of
1009- // indel we are currently assessing takes precedence over
1010- // alignment of flanking regions.
1011- //
1012- // Ins; type = +ve
1013- // Ref AGCTAG---CTGA
1014- // Qry AGCTAGGGGCTGA (qpos..qpos+type)
1015- //
1016- // Del; type = -ve
1017- // Ref AGCTAGGGGCTGA
1018- // Qry AGCTAG---CTGA (qpos..qpos)
1019-
1020- // // Tests over 1-47MB
1021- // // shift8b FP/GT/FN = 290/296/2310
1022- // // develop = 264/326/2282
1023- // if (l >= qpos-2 && l <= qpos+2+(type>0?type:0))
1024- // //qq[l-qbeg] += 15; //qq2 = 282/312/2334
1025- // qq[l-qbeg] *= 1.5; //qq3 = 284/305/2326
1026- // //qq[l-qbeg] *= 0.75;//qq4 = 287/333/2347
1027- //// else
1028- //// qq[l-qbeg] *= 0.67; // qq = 269/343/2413 (qq3 with else clause)
10291013 }
10301014
10311015 // The bottom 8 bits are length-normalised score while
10321016 // the top bits are unnormalised.
10331017 //
10341018 // Try original cons and new cons and pick best.
1035- // This doesn't removed FN much (infact maybe adds very slightly),
1036- // but it does reduce GT errors and some slight reduction to FP.
1019+ // This doesn't reduce FN much (infact maybe adds very slightly),
1020+ // but it does reduce GT errors and is a slight reduction to FP.
10371021 sc2 = probaln_glocal (ref2 + tbeg - left , tend2 - tbeg ,
10381022 query , qend - qbeg , qq , & apf , 0 , 0 );
10391023
@@ -1083,7 +1067,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
10831067 // used for adjusting indelQ below
10841068 l = (int )((100. * sc2 / (qend - qbeg ) + .499 ) * bca -> indel_bias );
10851069 * score = sc2 <<8 | MIN (255 , l );
1086- //fprintf(stderr, "score = %d, qend-qbeg = %d, => adj score %d\n", sc, qend-qbeg, l);
10871070
10881071 rep_ele * reps , * elt , * tmp ;
10891072 uint8_t * seg = ref2 + tbeg - left ;
@@ -1112,9 +1095,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
11121095 iscore += (elt -> end - elt -> start ) / elt -> rep_len ; // c
11131096 if (elt -> start + tbeg <= r_start ||
11141097 elt -> end + tbeg >= r_end ) {
1115- //iscore += 2*(elt->end-elt->start); //h5 (STR2)
1116- //iscore += 4*(elt->end-elt->start); //h5STR4
1117- iscore += (elt -> end - elt -> start ); //h5STR1
1098+ iscore += (elt -> end - elt -> start );
11181099 }
11191100 }
11201101
@@ -1191,9 +1172,10 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
11911172 if (indelQ > seqQ ) indelQ = seqQ ;
11921173 if (indelQ > 255 ) indelQ = 255 ;
11931174 if (seqQ > 255 ) seqQ = 255 ;
1194- p -> aux = (sc [0 ]& 0x3f )<<16 | seqQ <<8 | indelQ ; // use 22 bits in total
1195- sumq [sc [0 ]& 0x3f ] += indelQ < seqQ ? indelQ : seqQ ; // FIXME: redunctant; always indelQ
1196- // fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
1175+
1176+ // use 22 bits in total
1177+ p -> aux = (sc [0 ]& 0x3f )<<16 | seqQ <<8 | indelQ ;
1178+ sumq [sc [0 ]& 0x3f ] += indelQ ;
11971179 }
11981180 }
11991181 // determine bca->indel_types[] and bca->inscns
@@ -1299,7 +1281,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
12991281
13001282 // construct the consensus sequence (minus indels, which are added later)
13011283 if (max_ins > 0 ) {
1302- // FIXME : replace filling inscns[] with calc_consensus return
1284+ // TODO : replace filling inscns[] with calc_consensus return
13031285 // so the merges of the insertion consensus for type[t] is
13041286 // reported directly. (It may need adjustment to avoid N)
13051287 inscns = bcf_cgp_calc_ins_cons (n , n_plp , plp , pos ,
@@ -1349,7 +1331,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13491331 int tcon_len [2 ];
13501332 int cpos_pos ;
13511333 tcons = bcf_cgp_consensus (n , n_plp , plp , pos , bca , ref ,
1352- left , right , s , types [t ], biggest_del ,
1334+ left , right , s , types [t ], biggest_del ,
13531335 & left_shift , & right_shift , & band ,
13541336 tcon_len , & cpos_pos );
13551337#ifdef CONS_DEBUG
@@ -1370,6 +1352,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13701352#endif
13711353
13721354 // Scan for base-runs in the insertion.
1355+ // We use this to avoid over-correction in est_seqQ when the
1356+ // insertion is not part of the neighbouring homopolymer.
13731357 int k = tcons [0 ][cpos_pos ], j ;
13741358 for (j = 0 ; j < types [t ]; j ++ )
13751359 if (tcons [0 ][cpos_pos + j ] != k )
@@ -1437,16 +1421,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
14371421 {
14381422 rep_ele * reps , * elt , * tmp ;
14391423 reps = find_STR (tcons [0 ], tcon_len [0 ], 0 );
1440- int max_str = 0 , tot_str = 0 ;
1424+ //int max_str = 0;
1425+ int tot_str = 0 ;
14411426 DL_FOREACH_SAFE (reps , elt , tmp ) {
1442- if (max_str < elt -> end - elt -> start )
1443- max_str = elt -> end - elt -> start ;
1427+ // if (max_str < elt->end - elt->start)
1428+ // max_str = elt->end - elt->start;
14441429 tot_str += elt -> end - elt -> start ;
14451430 DL_DELETE (reps , elt );
14461431 free (elt );
14471432 }
14481433
1449- // Max_str should be enough, but it's still not
1434+ // Ideally max_str should be enough, but it's still not
14501435 // sufficient in longer range some repeats.
14511436 //min_win_size += max_str;
14521437 min_win_size += tot_str ;
@@ -1476,11 +1461,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
14761461 // entire left/right region, so this also returns the
14771462 // equivalent genomic coords for qbeg/qend in tbeg/tend.
14781463 qbeg = tpos2qpos (& p -> b -> core , bam_get_cigar (p -> b ),
1479- left2 /*+biggest_ins*/ , 0 , & tbeg );
1464+ left2 , 0 , & tbeg );
14801465 qpos = tpos2qpos (& p -> b -> core , bam_get_cigar (p -> b ), pos ,
14811466 0 , & tend ) - qbeg ;
14821467 qend = tpos2qpos (& p -> b -> core , bam_get_cigar (p -> b ),
1483- right2 /*-biggest_ins*/ , 1 , & tend );
1468+ right2 , 1 , & tend );
14841469
14851470 int old_tend = tend ;
14861471 int old_tbeg = tbeg ;
0 commit comments