Skip to content

Commit 727030a

Browse files
committed
Improve l_run estQ assignment.
If the sequence being inserted has differing base-calls to the flanking sequence, then est_seqQ's analysis of homopolymer runs and how they impact the score is irrelevant as the insertion isn't ambiguous anyway. This has a small reduction to FN on PacBio CCS with minimal change to FP.
1 parent 051b2bd commit 727030a

2 files changed

Lines changed: 54 additions & 52 deletions

File tree

bam2bcf.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,8 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
240240
seqQ = (3*seqQ + 2*q)/8;
241241
}
242242
if (_n > 20 && seqQ > 40) seqQ = 40;
243+
// Note baseQ changes some output fields such as I16, but has no
244+
// significant affect on "call".
243245
baseQ = p->aux>>8&0xff;
244246

245247
is_diff = (b != 0);
@@ -358,6 +360,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
358360
for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
359361
}
360362

363+
// Else consider downgrading bca->bases[] scores by AD vs AD_ref_missed
364+
// ratios. This is detrimental on Illumina, but beneficial on PacBio CCS.
365+
// It's possibly related to the homopolyer error likelihoods or overall
366+
// Indel accuracy. Maybe tie this in to the -h option?
367+
361368
r->ori_depth = ori_depth;
362369
// glfgen
363370
errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype

bam2bcf_indel.c

Lines changed: 47 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,3 @@
1-
//#define CONS_DEBUG
2-
3-
/*
4-
5-
TODO:
6-
7-
- Reevaluate the two STR indel-size adjusting modes.
8-
Maybe no longer relevant. (Looks poor to me)
9-
10-
- Consider limiting fract to never add more than current depth, so we
11-
change cons to Ns but not to another base type entirely.
12-
13-
- Consider a separate rfract for lift-over of SNPs than for indels.
14-
SNPs is good at replacing bases with N where we're unsure on the
15-
data. However ref_ins may cause issues with sizing?
16-
rfract*.8 is working better (so far). Trying 0.5 too.
17-
18-
- Left-align indels before consensus generation. Eg:
19-
20-
/pos being studied
21-
AGCTGGGGGGAATCG REF
22-
AGCT-GGGGGAATCG Seq type -1
23-
ACGTGGGGG-AATGCG Seq type 0
24-
^
25-
26-
Type 0 cons shouldn't include the right hand del, but it's outside
27-
of "biggest_del" window. Expand this to STR size or left-align.
28-
29-
- Long reads cause multiple scans of CIGAR to compute consensus.
30-
We need a way of caching CIGAR/seq start coords for pos p=left so at
31-
pos P where P>p we can start at p and continue instead of from the
32-
start each time.
33-
34-
- Improve QUAL scoring to consider AD vs DP.
35-
Eg. AD 10,8 looks good if we have 18 sequences. High qual.
36-
But AD 10,8 looks poor if we had 50 seqs. Why did we have to discard
37-
32 of them?
38-
39-
*/
40-
411
/* bam2bcf_indel.c -- indel caller.
422
433
Copyright (C) 2010, 2011 Broad Institute.
@@ -63,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
6323
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
6424
DEALINGS IN THE SOFTWARE. */
6525

26+
//#define CONS_DEBUG
27+
6628
#include <assert.h>
6729
#include <ctype.h>
6830
#include <string.h>
@@ -126,8 +88,16 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos,
12688
return last_y;
12789
}
12890

129-
// FIXME: check if the inserted sequence is consistent with the homopolymer run
130-
// l is the relative gap length and l_run is the length of the homopolymer on the reference
91+
// l is the relative gap length and l_run is the length of the homopolymer
92+
// on the reference.
93+
//
94+
// Larger seqQ is good, so increasing tandemQ calls more indels,
95+
// and longer l_run means fewer calls. It is capped later at 255.
96+
// For short l_runs, the qual is simply based on size of indel
97+
// larger ones being considered more likely to be real.
98+
// Longer indels get assigned a score based on the relative indel size
99+
// to homopolymer, where l_run base will have already been verified by
100+
// the caller to ensure it's compatible.
131101
static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
132102
{
133103
int q, qh;
@@ -408,7 +378,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
408378
int left, int right,
409379
int sample, int type, int biggest_del,
410380
int *left_shift, int *right_shift,
411-
int *band, int *tcon_len) {
381+
int *band, int *tcon_len, int *cpos_pos) {
412382
// Map ASCII ACGTN* to 012345
413383
static uint8_t base6[256] = {
414384
4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
@@ -717,8 +687,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
717687
// Used in cnum==1 to do the opposite of whichever way we did before.
718688
int heti[1024] = {0}, hetd[1024] = {0};
719689

690+
*cpos_pos = -1;
720691
for (cnum = 0; cnum < 2; cnum++) {
721692
for (i = k = 0; i < right-left; i++) {
693+
// Location in consensus matching the indel itself
694+
if (i >= pos-left+1 && *cpos_pos == -1)
695+
*cpos_pos = k;
696+
722697
int max_v = 0, max_v2 = 0, max_j = 4, max_j2 = 4, tot = 0;
723698
for (j = 0; j < 6; j++) {
724699
// Top 2 consensus calls
@@ -815,7 +790,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp,
815790
(*right_shift)++;
816791
} else {
817792
// Finally the easy case - a non-indel base or an N
818-
// FIXME: make cons[] in 0,1,2,3,4,5 terms
819793
if (max_v > CONS_CUTOFF*tot)
820794
cons[cnum][k++] = max_j; // "ACGTN*"
821795
else if (max_v > 0)
@@ -1320,6 +1294,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13201294

13211295
// The length of the homopolymer run around the current position
13221296
l_run = bcf_cgp_l_run(ref, pos);
1297+
int l_run_base = seq_nt16_table[(uint8_t)ref[pos+1]];
1298+
int l_run_ins = 0;
13231299

13241300
// construct the consensus sequence (minus indels, which are added later)
13251301
if (max_ins > 0) {
@@ -1371,21 +1347,38 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
13711347
char **tcons;
13721348
int left_shift, right_shift;
13731349
int tcon_len[2];
1350+
int cpos_pos;
13741351
tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref,
13751352
left, right, s, types[t], biggest_del,
13761353
&left_shift, &right_shift, &band,
1377-
tcon_len);
1354+
tcon_len, &cpos_pos);
13781355
#ifdef CONS_DEBUG
1379-
for (j = 0; j < 2; j++) {
1380-
int k;
1381-
fprintf(stderr, "Cons%d @ %d %4d/%3d ",
1382-
pos, types[t], left_shift);
1383-
for (k = 0; k < tcon_len[j]; k++)
1384-
putc("ACGTN"[(uint8_t)tcons[j][k]], stderr);
1385-
putc('\n', stderr);
1356+
{
1357+
int j;
1358+
for (j = 0; j < 2; j++) {
1359+
int k;
1360+
fprintf(stderr, "Cons%d @ %d %4d/%4d ",
1361+
j, pos, types[t], left_shift);
1362+
for (k = 0; k < tcon_len[j]; k++) {
1363+
if (k == cpos_pos)
1364+
putc('#', stderr);
1365+
putc("ACGTN"[(uint8_t)tcons[j][k]], stderr);
1366+
}
1367+
putc('\n', stderr);
1368+
}
13861369
}
13871370
#endif
13881371

1372+
// Scan for base-runs in the insertion.
1373+
int k = tcons[0][cpos_pos], j;
1374+
for (j = 0; j < types[t]; j++)
1375+
if (tcons[0][cpos_pos+j] != k)
1376+
break;
1377+
if (j && j == types[t])
1378+
l_run_ins |= "\x1\x2\x4\x8\xf"[k]; // ACGTN
1379+
if (types[t] < 0)
1380+
l_run_ins |= 0xff;
1381+
13891382
// align each read to consensus(es)
13901383
for (i = 0; i < n_plp[s]; ++i, ++K) {
13911384
bam_pileup1_t *p = plp[s] + i;
@@ -1542,6 +1535,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
15421535
}
15431536

15441537
// compute indelQ
1538+
if (!(l_run_base & l_run_ins))
1539+
l_run = 1; // different base type in ins to flanking region.
15451540
n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins,
15461541
ref_type, types, n_types, score);
15471542

0 commit comments

Comments
 (0)