Skip to content

Commit 1881d9f

Browse files
committed
Update to the depth filtering for mpileup.
We can now count the depth anywhere along the read rather than only at the left end. It's not buffering reads up so it doesn't know what's about to come, so for any individual column it'll still favour reads that end near a column than start near a column, but by counting at the right hand end we avoid the boom & bust approach to before where the coverage could plumet to zero immediately following a large spike. With iter->depth_pos = 1.0 the coverage ends up being a minimum of maxcnt (assuming the true coverage is higher), rather than a maximum of maxcnt. So we go over-depth, but avoid the drop outs.
1 parent 34c1596 commit 1881d9f

2 files changed

Lines changed: 86 additions & 3 deletions

File tree

htslib/sam.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1650,6 +1650,17 @@ typedef struct __bam_mplp_t *bam_mplp_t;
16501650
HTSLIB_EXPORT
16511651
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
16521652

1653+
/**
1654+
* bam_plp_set_cntpos() - indicate which end of reach to apply maxcnt.
1655+
* @plp: The bam_plp_t initialised using bam_plp_init.
1656+
* @end: A fraction along the read from 0.0 (start) to 1.0 (end).
1657+
* Defaults to 0.0 which makes maxcnt a true upper-bound value.
1658+
* Using 1.0 makes maxcnt a lower-bound, with upper-bound
1659+
* varying based on length and coordinate distribution.
1660+
*/
1661+
HTSLIB_EXPORT
1662+
void bam_plp_set_cntpos(bam_plp_t iter, double end);
1663+
16531664
HTSLIB_EXPORT
16541665
void bam_plp_reset(bam_plp_t iter);
16551666

@@ -1712,6 +1723,18 @@ typedef struct __bam_mplp_t *bam_mplp_t;
17121723
HTSLIB_EXPORT
17131724
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
17141725

1726+
/**
1727+
* bam_mplp_set_cntpos() - indicate which end of reach to apply maxcnt.
1728+
* @mplp: The bam_mplp_t initialised using bam_mplp_init.
1729+
* @end: A fraction along the read from 0.0 (start) to 1.0 (end).
1730+
* Defaults to 0.0 which makes maxcnt a true upper-bound value.
1731+
* Using 1.0 makes maxcnt a lower-bound, with upper-bound
1732+
* varying based on length and coordinate distribution.
1733+
*/
1734+
HTSLIB_EXPORT
1735+
void bam_mplp_set_cntpos(bam_mplp_t iter, double end);
1736+
1737+
17151738
HTSLIB_EXPORT
17161739
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
17171740

sam.c

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4139,6 +4139,8 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
41394139
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
41404140
typedef khash_t(olap_hash) olap_hash_t;
41414141

4142+
#define MAX_AHEAD 4096
4143+
41424144
struct __bam_plp_t {
41434145
mempool_t *mp;
41444146
lbnode_t *head, *tail;
@@ -4153,6 +4155,10 @@ struct __bam_plp_t {
41534155
void *data;
41544156
olap_hash_t *overlaps;
41554157

4158+
uint32_t depth[MAX_AHEAD];
4159+
uint64_t last_depth_pos;
4160+
double depth_pos_fract;
4161+
41564162
// For notification of creation and destruction events
41574163
// and associated client-owned pointer.
41584164
int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
@@ -4172,6 +4178,9 @@ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
41724178
iter->data = data;
41734179
iter->b = bam_init1();
41744180
}
4181+
memset(iter->depth, 0, MAX_AHEAD * sizeof(*iter->depth));
4182+
iter->depth_pos_fract = 0; // fraction between 0 = left and 1 = right
4183+
iter->last_depth_pos = 0;
41754184
return iter;
41764185
}
41774186

@@ -4494,21 +4503,60 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_
44944503

44954504
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
44964505
{
4506+
uint64_t endpos = 0;
4507+
44974508
if (iter->error) return -1;
44984509
if (b) {
44994510
if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
45004511
// Skip only unmapped reads here, any additional filtering must be done in iter->func
45014512
if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
4502-
if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
4503-
{
4513+
if (iter->depth_pos_fract == 0
4514+
&& iter->tid == b->core.tid
4515+
&& iter->pos == b->core.pos
4516+
&& iter->mp->cnt > iter->maxcnt) {
4517+
// Removal from left-end depth; mp->cnt
45044518
overlap_remove(iter, b);
45054519
return 0;
4520+
} else if (iter->depth_pos_fract > 0
4521+
&& iter->tid == b->core.tid
4522+
&& iter->pos == b->core.pos) {
4523+
// Removal from depth somewhere else along read
4524+
endpos = bam_endpos(b);
4525+
uint32_t len = (uint32_t)(endpos - b->core.pos - 1);
4526+
len = len * iter->depth_pos_fract + 0.4999;
4527+
if (b->core.pos + len < iter->last_depth_pos
4528+
&& iter->depth[(b->core.pos+len)%MAX_AHEAD] > iter->maxcnt) {
4529+
// NB this means read longer than MAX_AHEAD (after downsizing
4530+
// by depth_pos fraction) will be retained as by definition
4531+
// it'll have zero coverage that far out.
4532+
overlap_remove(iter, b);
4533+
return 0;
4534+
}
4535+
4536+
// Zero newly observed iter depth elemtns.
4537+
uint64_t end_clipped = endpos, p;
4538+
if (end_clipped > iter->pos + MAX_AHEAD)
4539+
end_clipped = iter->pos + MAX_AHEAD;
4540+
if (iter->last_depth_pos < end_clipped) {
4541+
//iter->last_depth_pos = end_clipped;
4542+
if (iter->last_depth_pos < end_clipped-MAX_AHEAD)
4543+
iter->last_depth_pos = end_clipped-MAX_AHEAD;
4544+
for (p = iter->last_depth_pos; p < end_clipped; p++)
4545+
iter->depth[p % MAX_AHEAD] = 0;
4546+
iter->last_depth_pos = end_clipped;
4547+
}
4548+
4549+
// Increment depth
4550+
for (p = b->core.pos; p < end_clipped; p++)
4551+
iter->depth[p % MAX_AHEAD]++;
45064552
}
45074553
if (bam_copy1(&iter->tail->b, b) == NULL)
45084554
return -1;
45094555
iter->tail->b.id = iter->id++;
45104556
iter->tail->beg = b->core.pos;
4511-
iter->tail->end = bam_endpos(b);
4557+
if (!endpos)
4558+
endpos = bam_endpos(b);
4559+
iter->tail->end = endpos;
45124560
iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
45134561
if (b->core.tid < iter->max_tid) {
45144562
hts_log_error("The input is not sorted (chromosomes out of order)");
@@ -4602,6 +4650,11 @@ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
46024650
iter->maxcnt = maxcnt;
46034651
}
46044652

4653+
void bam_plp_set_cntpos(bam_plp_t iter, double end)
4654+
{
4655+
iter->depth_pos_fract = end;
4656+
}
4657+
46054658
/************************
46064659
*** Mpileup iterator ***
46074660
************************/
@@ -4651,6 +4704,13 @@ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
46514704
iter->iter[i]->maxcnt = maxcnt;
46524705
}
46534706

4707+
void bam_mplp_set_cntpos(bam_mplp_t iter, double end)
4708+
{
4709+
int i;
4710+
for (i = 0; i < iter->n; ++i)
4711+
iter->iter[i]->depth_pos_fract = end;
4712+
}
4713+
46544714
void bam_mplp_destroy(bam_mplp_t iter)
46554715
{
46564716
int i;

0 commit comments

Comments
 (0)