Skip to content

Commit 7a3efbf

Browse files
committed
Tidy the indel changes up a bit.
1 parent 727030a commit 7a3efbf

2 files changed

Lines changed: 61 additions & 76 deletions

File tree

bam2bcf_indel.c

Lines changed: 47 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* bam2bcf_indel.c -- indel caller.
22
33
Copyright (C) 2010, 2011 Broad Institute.
4-
Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd.
4+
Copyright (C) 2012-2014,2016-2017, 2021-2022 Genome Research Ltd.
55
66
Author: Heng Li <lh3@sanger.ac.uk>
77
@@ -283,7 +283,7 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
283283
if (sz == 0
284284
|| j-i >= bca->min_support
285285
// Note, doesn't handle bca->per_sample_flt yet
286-
|| bca->per_sample_flt
286+
|| bca->per_sample_flt
287287
|| (double)(j-i) / n_tot >= bca->min_frac)
288288
types[t++] = sz;
289289
i = j-1;
@@ -309,7 +309,7 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
309309
}
310310

311311
// Increment ins["str"] and freq["str"]
312-
#define NI 10 // number of alternative insertion sequences
312+
#define NI 100 // number of alternative insertion sequences
313313
// Could use a hash table too, but expectation is a tiny number of alternatives
314314
typedef struct {
315315
char *str[NI];
@@ -410,6 +410,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
410410
int (*ref_base)[6] = calloc(right - left + 1, sizeof(*ref_base));
411411
str_freq *ref_ins = calloc(right - left + 1, sizeof(*ref_ins));
412412
int i, j, k, s = sample;
413+
char **cons = NULL;
414+
415+
if (!cons_base || !cons_ins || !ref_base || !ref_ins)
416+
goto err;
413417

414418
//--------------------------------------------------
415419
// Accumulate sequences into cons_base and cons_ins arrays
@@ -479,9 +483,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
479483
if (p->indel == type /*&& x == pos+1*/) {
480484
// Assume any ins of the same size is the same ins.
481485
// (This rescues misaligned insertions.)
482-
bcf_cgp_append_cons(&cons_ins[x-left], ins, ilen, 1);
486+
if (bcf_cgp_append_cons(&cons_ins[x-left], ins,
487+
ilen, 1) < 0)
488+
goto err;
483489
} else if (x != pos+1){
484-
bcf_cgp_append_cons(&ref_ins[x-left], ins, ilen, 1);
490+
if (bcf_cgp_append_cons(&ref_ins[x-left], ins,
491+
ilen, 1) < 0)
492+
goto err;
485493
}
486494
}
487495
break;
@@ -550,7 +558,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
550558
// When evaluating this particular indel, we don't want to
551559
// penalise alignments by SNP errors elsewhere. This can
552560
// happen when we have low depth for a particular 'type'.
553-
//
561+
//
554562
// So add in a little data from ref_base/ref_ins.
555563
double rfract = (r - t*2)*.75 / (r+1);
556564

@@ -582,9 +590,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
582590
for (j = 0; j < NI; j++) {
583591
if (!ref_ins[i].str[j])
584592
break;
585-
bcf_cgp_append_cons(&cons_ins[i],
586-
ref_ins[i].str[j], ref_ins[i].len[j],
587-
rfract * ref_ins[i].freq[j]);
593+
if (bcf_cgp_append_cons(&cons_ins[i],
594+
ref_ins[i].str[j], ref_ins[i].len[j],
595+
rfract * ref_ins[i].freq[j]) < 0)
596+
goto err;
588597
}
589598
}
590599

@@ -604,7 +613,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
604613
}
605614
max_len += ins;
606615
}
607-
char **cons = malloc((max_len+1)*2 + sizeof(char *)*2);
616+
cons = malloc((max_len+1)*2 + sizeof(char *)*2);
617+
if (!cons)
618+
goto err;
608619
cons[0] = (char *)&cons[2];
609620
cons[1] = cons[0] + max_len+1;
610621

@@ -802,7 +813,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
802813
tcon_len[cnum] = k;
803814
}
804815

805-
// FIXME: replace by string pool for rapid tidying
816+
// TODO: replace by io_lib's string pool for rapid tidying.
817+
// For now this isn't the bottleneck though.
806818
for (i = 0; i < right-left; i++) {
807819
for (j = 0; j < NI; j++) {
808820
if (cons_ins[i].str[j])
@@ -812,6 +824,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
812824
}
813825
}
814826

827+
err:
815828
free(cons_base);
816829
free(ref_base);
817830
free(cons_ins);
@@ -841,7 +854,9 @@ static int bcf_cgp_l_run(const char *ref, int pos) {
841854

842855

843856
// Compute the insertion consensus for this sample 's' via a basic
844-
// majority rule
857+
// majority rule.
858+
//
859+
// TODO: merge this into bcf_cgp_consensus as another return value?
845860
static char *bcf_cgp_calc_ins_cons(int n, int *n_plp, bam_pileup1_t **plp,
846861
int pos, int *types, int n_types,
847862
int max_ins, int s) {
@@ -962,25 +977,18 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
962977
// Trim poly_Ns at ends of ref.
963978
// This helps to keep len(ref) and len(query) similar, to reduce
964979
// band size and reduce the chance of -ve BAQ scores.
965-
966-
// FIXME Maybe instead of l>ABS(type) it should be l>query_len/2 ?
967-
// TODO: no difference to result, but what difference is there to
968-
// speed? Is this worth it?
969-
#if 1
970980
for (l = 0; l < tend1-tbeg && l < tend2-tbeg; l++)
971981
if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4)
972982
break;
973-
if (l > ABS(type)) {
983+
if (l > ABS(type))
974984
tbeg += l-ABS(type);
975-
}
976985

977986
for (l = tend1-tbeg-1; l >= 0; l--)
978987
if (ref1[l + tbeg-left] != 4)
979988
break;
980989
l = tend1-tbeg-1 - l;
981-
if (l > ABS(type)) {
990+
if (l > ABS(type))
982991
tend1 -= l-ABS(type);
983-
}
984992

985993
for (l = tend2-tbeg-1; l >= 0; l--)
986994
if (ref2[l + tbeg-left] != 4)
@@ -989,7 +997,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
989997
if (l > ABS(type)) {
990998
tend2 -= l-ABS(type);
991999
}
992-
#endif
9931000

9941001
// Get segment of quality, either ZQ tag or if absent QUAL.
9951002
if (!(qq = (uint8_t*) calloc(qend - qbeg, 1)))
@@ -1003,37 +1010,14 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
10031010
if (qval < 7)
10041011
qval = 7;
10051012
qq[l - qbeg] = qval;
1006-
1007-
// Skew qq at qpos to be higher than background and qq at
1008-
// other regions to be lower. This means the alignment of
1009-
// indel we are currently assessing takes precedence over
1010-
// alignment of flanking regions.
1011-
//
1012-
// Ins; type = +ve
1013-
// Ref AGCTAG---CTGA
1014-
// Qry AGCTAGGGGCTGA (qpos..qpos+type)
1015-
//
1016-
// Del; type = -ve
1017-
// Ref AGCTAGGGGCTGA
1018-
// Qry AGCTAG---CTGA (qpos..qpos)
1019-
1020-
// // Tests over 1-47MB
1021-
// // shift8b FP/GT/FN = 290/296/2310
1022-
// // develop = 264/326/2282
1023-
// if (l >= qpos-2 && l <= qpos+2+(type>0?type:0))
1024-
// //qq[l-qbeg] += 15; //qq2 = 282/312/2334
1025-
// qq[l-qbeg] *= 1.5; //qq3 = 284/305/2326
1026-
// //qq[l-qbeg] *= 0.75;//qq4 = 287/333/2347
1027-
//// else
1028-
//// qq[l-qbeg] *= 0.67; // qq = 269/343/2413 (qq3 with else clause)
10291013
}
10301014

10311015
// The bottom 8 bits are length-normalised score while
10321016
// the top bits are unnormalised.
10331017
//
10341018
// Try original cons and new cons and pick best.
1035-
// This doesn't removed FN much (infact maybe adds very slightly),
1036-
// but it does reduce GT errors and some slight reduction to FP.
1019+
// This doesn't reduce FN much (infact maybe adds very slightly),
1020+
// but it does reduce GT errors and is a slight reduction to FP.
10371021
sc2 = probaln_glocal(ref2 + tbeg - left, tend2 - tbeg,
10381022
query, qend - qbeg, qq, &apf, 0, 0);
10391023

@@ -1083,7 +1067,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
10831067
// used for adjusting indelQ below
10841068
l = (int)((100. * sc2 / (qend - qbeg) + .499) * bca->indel_bias);
10851069
*score = sc2<<8 | MIN(255, l);
1086-
//fprintf(stderr, "score = %d, qend-qbeg = %d, => adj score %d\n", sc, qend-qbeg, l);
10871070

10881071
rep_ele *reps, *elt, *tmp;
10891072
uint8_t *seg = ref2 + tbeg - left;
@@ -1112,9 +1095,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
11121095
iscore += (elt->end-elt->start) / elt->rep_len; // c
11131096
if (elt->start+tbeg <= r_start ||
11141097
elt->end+tbeg >= r_end) {
1115-
//iscore += 2*(elt->end-elt->start); //h5 (STR2)
1116-
//iscore += 4*(elt->end-elt->start); //h5STR4
1117-
iscore += (elt->end-elt->start); //h5STR1
1098+
iscore += (elt->end-elt->start);
11181099
}
11191100
}
11201101

@@ -1191,9 +1172,10 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
11911172
if (indelQ > seqQ) indelQ = seqQ;
11921173
if (indelQ > 255) indelQ = 255;
11931174
if (seqQ > 255) seqQ = 255;
1194-
p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
1195-
sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; // FIXME: redunctant; always indelQ
1196-
// fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
1175+
1176+
// use 22 bits in total
1177+
p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ;
1178+
sumq[sc[0]&0x3f] += indelQ;
11971179
}
11981180
}
11991181
// determine bca->indel_types[] and bca->inscns
@@ -1299,7 +1281,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
12991281

13001282
// construct the consensus sequence (minus indels, which are added later)
13011283
if (max_ins > 0) {
1302-
// FIXME: replace filling inscns[] with calc_consensus return
1284+
// TODO: replace filling inscns[] with calc_consensus return
13031285
// so the merges of the insertion consensus for type[t] is
13041286
// reported directly. (It may need adjustment to avoid N)
13051287
inscns = bcf_cgp_calc_ins_cons(n, n_plp, plp, pos,
@@ -1349,7 +1331,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13491331
int tcon_len[2];
13501332
int cpos_pos;
13511333
tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref,
1352-
left, right, s, types[t], biggest_del,
1334+
left, right, s, types[t], biggest_del,
13531335
&left_shift, &right_shift, &band,
13541336
tcon_len, &cpos_pos);
13551337
#ifdef CONS_DEBUG
@@ -1370,6 +1352,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13701352
#endif
13711353

13721354
// Scan for base-runs in the insertion.
1355+
// We use this to avoid over-correction in est_seqQ when the
1356+
// insertion is not part of the neighbouring homopolymer.
13731357
int k = tcons[0][cpos_pos], j;
13741358
for (j = 0; j < types[t]; j++)
13751359
if (tcons[0][cpos_pos+j] != k)
@@ -1437,16 +1421,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
14371421
{
14381422
rep_ele *reps, *elt, *tmp;
14391423
reps = find_STR(tcons[0], tcon_len[0], 0);
1440-
int max_str = 0, tot_str = 0;
1424+
//int max_str = 0;
1425+
int tot_str = 0;
14411426
DL_FOREACH_SAFE(reps, elt, tmp) {
1442-
if (max_str < elt->end - elt->start)
1443-
max_str = elt->end - elt->start;
1427+
// if (max_str < elt->end - elt->start)
1428+
// max_str = elt->end - elt->start;
14441429
tot_str += elt->end - elt->start;
14451430
DL_DELETE(reps, elt);
14461431
free(elt);
14471432
}
14481433

1449-
// Max_str should be enough, but it's still not
1434+
// Ideally max_str should be enough, but it's still not
14501435
// sufficient in longer range some repeats.
14511436
//min_win_size += max_str;
14521437
min_win_size += tot_str;
@@ -1476,11 +1461,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
14761461
// entire left/right region, so this also returns the
14771462
// equivalent genomic coords for qbeg/qend in tbeg/tend.
14781463
qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b),
1479-
left2/*+biggest_ins*/, 0, &tbeg);
1464+
left2, 0, &tbeg);
14801465
qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos,
14811466
0, &tend) - qbeg;
14821467
qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b),
1483-
right2/*-biggest_ins*/, 1, &tend);
1468+
right2, 1, &tend);
14841469

14851470
int old_tend = tend;
14861471
int old_tbeg = tbeg;

str_finder.c

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -184,33 +184,33 @@ rep_ele *find_STR(char *cons, int len, int lower_only) {
184184
w <<= 2;
185185
w |= cons[i];
186186
//printf("%3d %c w=%08x\n", i, cons[i], w);
187-
if ((w&0xfffffff) == ((w>>28)&0xfffffff))
187+
if ((w&0xfffffff) == ((w>>28)&0xfffffff))
188188
add_rep(&reps, cons, len, i, 14, lower_only, w);
189-
else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff))
189+
else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff))
190190
add_rep(&reps, cons, len, i, 13, lower_only, w);
191-
else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff))
191+
else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff))
192192
add_rep(&reps, cons, len, i, 12, lower_only, w);
193-
else if ((w&0x03fffff) == ((w>>22)&0x03fffff))
193+
else if ((w&0x03fffff) == ((w>>22)&0x03fffff))
194194
add_rep(&reps, cons, len, i, 11, lower_only, w);
195-
else if ((w&0x00fffff) == ((w>>20)&0x00fffff))
195+
else if ((w&0x00fffff) == ((w>>20)&0x00fffff))
196196
add_rep(&reps, cons, len, i, 10, lower_only, w);
197-
else if ((w&0x003ffff) == ((w>>18)&0x003ffff))
197+
else if ((w&0x003ffff) == ((w>>18)&0x003ffff))
198198
add_rep(&reps, cons, len, i, 9, lower_only, w);
199-
else if ((w&0xffff) == ((w>>16)&0xffff))
199+
else if ((w&0xffff) == ((w>>16)&0xffff))
200200
add_rep(&reps, cons, len, i, 8, lower_only, w);
201-
else if ((w&0x3fff) == ((w>>14)&0x3fff))
201+
else if ((w&0x3fff) == ((w>>14)&0x3fff))
202202
add_rep(&reps, cons, len, i, 7, lower_only, w);
203-
else if ((w&0x0fff) == ((w>>12)&0x0fff))
203+
else if ((w&0x0fff) == ((w>>12)&0x0fff))
204204
add_rep(&reps, cons, len, i, 6, lower_only, w);
205-
else if ((w&0x03ff) == ((w>>10)&0x03ff))
205+
else if ((w&0x03ff) == ((w>>10)&0x03ff))
206206
add_rep(&reps, cons, len, i, 5, lower_only, w);
207-
else if ((w&0x00ff) == ((w>> 8)&0x00ff))
207+
else if ((w&0x00ff) == ((w>> 8)&0x00ff))
208208
add_rep(&reps, cons, len, i, 4, lower_only, w);
209-
else if ((w&0x003f) == ((w>> 6)&0x003f))
209+
else if ((w&0x003f) == ((w>> 6)&0x003f))
210210
add_rep(&reps, cons, len, i, 3, lower_only, w);
211-
else if ((w&0x000f) == ((w>> 4)&0x000f))
211+
else if ((w&0x000f) == ((w>> 4)&0x000f))
212212
add_rep(&reps, cons, len, i, 2, lower_only, w);
213-
else if ((w&0x0003) == ((w>> 2)&0x0003))
213+
else if ((w&0x0003) == ((w>> 2)&0x0003))
214214
add_rep(&reps, cons, len, i, 1, lower_only, w);
215215
}
216216

0 commit comments

Comments
 (0)