Skip to content

Commit bf93316

Browse files
committed
Merge CRAM explicit tlen and ref_end fixes
2 parents b13f80a + 5bd8e0a commit bf93316

2 files changed

Lines changed: 40 additions & 18 deletions

File tree

cram/cram_encode.c

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,17 +1502,20 @@ int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
15021502

15031503
// Ensure ref and hist are large enough.
15041504
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
1505-
hts_pos_t ref_start, hts_pos_t *ref_end) {
1505+
hts_pos_t ref_start, hts_pos_t *ref_end,
1506+
hts_pos_t *ref_end_alloc) {
1507+
if (*ref_end < pos)
1508+
*ref_end = pos;
15061509
if (pos < ref_start)
15071510
return -1;
1508-
if (pos < *ref_end)
1511+
if (pos < *ref_end_alloc)
15091512
return 0;
15101513

15111514
// realloc
15121515
if (pos - ref_start > UINT_MAX)
15131516
return -2; // protect overflow in new_end calculation
15141517

1515-
hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
1518+
hts_pos_t old_end = *ref_end_alloc ? *ref_end_alloc : ref_start;
15161519
hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
15171520

15181521
// Refuse to work on excessively large blocks.
@@ -1531,7 +1534,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
15311534
if (!tmp5)
15321535
return -1;
15331536
*hist = tmp5;
1534-
*ref_end = new_end;
1537+
*ref_end_alloc = new_end;
15351538

15361539
// initialise
15371540
old_end -= ref_start;
@@ -1546,7 +1549,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
15461549
// Returns 1 on success, <0 on failure
15471550
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
15481551
hts_pos_t ref_start, hts_pos_t *ref_end,
1549-
const uint8_t *MD) {
1552+
hts_pos_t *ref_end_alloc, const uint8_t *MD) {
15501553
uint8_t *seq = bam_get_seq(b);
15511554
uint32_t *cigar = bam_get_cigar(b);
15521555
uint32_t ncigar = b->core.n_cigar;
@@ -1558,7 +1561,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
15581561

15591562
// No sequence means extend based on CIGAR instead
15601563
if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end,
1561-
ref_start, ref_end) < 0)
1564+
ref_start, ref_end, ref_end_alloc) < 0)
15621565
return -1;
15631566

15641567
int iseq = 0, next_op;
@@ -1573,7 +1576,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
15731576
int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
15741577
if (overflow ||
15751578
extend_ref(ref, hist, iref+ref_start + len,
1576-
ref_start, ref_end) < 0)
1579+
ref_start, ref_end, ref_end_alloc) < 0)
15771580
return -1;
15781581
while (iseq < b->core.l_qseq && len) {
15791582
// rewrite to have internal loops?
@@ -1605,7 +1608,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
16051608
MD++;
16061609
while (isalpha(*MD)) {
16071610
if (extend_ref(ref, hist, iref+ref_start, ref_start,
1608-
ref_end) < 0)
1611+
ref_end, ref_end_alloc) < 0)
16091612
return -1;
16101613
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
16111614
&iseq, &cig_ind, &cig_op,
@@ -1621,7 +1624,8 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
16211624
}
16221625
} else {
16231626
// substitution
1624-
if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
1627+
if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end,
1628+
ref_end_alloc) < 0)
16251629
return -1;
16261630
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
16271631
&iseq, &cig_ind, &cig_op,
@@ -1650,12 +1654,14 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
16501654
// Returns >=0 on success,
16511655
// -1 on failure (eg inconsistent data)
16521656
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
1653-
hts_pos_t ref_start, hts_pos_t *ref_end) {
1657+
hts_pos_t ref_start, hts_pos_t *ref_end,
1658+
hts_pos_t *ref_end_alloc) {
16541659
const uint8_t *MD = bam_aux_get(b, "MD");
16551660
int ret = 0;
16561661
if (MD && *MD == 'Z') {
16571662
// We can use MD to directly compute the reference
1658-
int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
1663+
int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end,
1664+
ref_end_alloc, MD+1);
16591665

16601666
if (ret > 0)
16611667
return ret;
@@ -1683,7 +1689,7 @@ static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
16831689
static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
16841690

16851691
if (extend_ref(ref, hist, iref+ref_start + len,
1686-
ref_start, ref_end) < 0)
1692+
ref_start, ref_end, ref_end_alloc) < 0)
16871693
return -1;
16881694
if (iseq + len <= b->core.l_qseq) {
16891695
// Nullify failed MD:Z if appropriate
@@ -1726,15 +1732,16 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
17261732
// user told us to do embed_ref=2.
17271733
char *ref = NULL;
17281734
uint32_t (*hist)[5] = NULL;
1729-
hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
1735+
hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0,
1736+
ref_end_alloc = 0;
17301737
if (ref_start < 0)
17311738
return -1; // cannot build consensus from unmapped data
17321739

17331740
// initial allocation
17341741
if (extend_ref(&ref, &hist,
17351742
c->bams[r1 + s->hdr->num_records-1]->core.pos +
17361743
c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
1737-
ref_start, &ref_end) < 0)
1744+
ref_start, &ref_end, &ref_end_alloc) < 0)
17381745
return -1;
17391746

17401747
// Add each bam file to the reference/consensus arrays
@@ -1746,7 +1753,8 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
17461753
goto err;
17471754
}
17481755
last_pos = c->bams[r1]->core.pos;
1749-
if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
1756+
if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end,
1757+
&ref_end_alloc) < 0)
17501758
goto err;
17511759
}
17521760

@@ -1770,6 +1778,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
17701778
c->ref_start = ref_start+1;
17711779
c->ref_end = ref_end+1;
17721780
c->ref_free = 1;
1781+
17731782
return 0;
17741783

17751784
err:
@@ -2012,8 +2021,10 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
20122021
pthread_mutex_unlock(&fd->ref_lock);
20132022
}
20142023

2015-
if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
2016-
c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
2024+
hts_pos_t rlen = MAX(fd->refs->ref_id[c->ref_id]->LN_length,
2025+
fd->refs->ref_id[c->ref_id]->length);
2026+
if (c->ref_end > rlen && rlen)
2027+
c->ref_end = rlen;
20172028
}
20182029

20192030
// Iterate through records creating the cram blocks for some
@@ -3927,7 +3938,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
39273938
if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
39283939
cram_stats_del(c->stats[DS_NP], p->mate_pos);
39293940
cram_stats_del(c->stats[DS_MF], p->mate_flags);
3930-
if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
3941+
if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN)
3942+
&& !explicit_tlen)
39313943
cram_stats_del(c->stats[DS_TS], p->tlen);
39323944
cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
39333945
}

test/test.pl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,16 @@ sub test_view
894894
testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
895895
testv $opts, "./compare_sam.pl $ersam $ersam2";
896896

897+
# Embed_ref=2 with CRAM v4 uses explicit_len if it has to instead of
898+
# breaking pairs with detached mode.
899+
# Oddly this bug was only triggered when also specifying a reference.
900+
$ersam = "xx#pair.sam";
901+
$ercram = "xx#pair.tmp.cram";
902+
$ersam2 = "${ercram}.sam";
903+
testv $opts, "./test_view $tv_args -o version=4.0 -o embed_ref=2 -t xx.fa -C -p $ercram $ersam";
904+
testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
905+
testv $opts, "./compare_sam.pl $ersam $ersam2";
906+
897907
if ($test_view_failures == 0) {
898908
passed($opts, "embed_ref=2 tests");
899909
} else {

0 commit comments

Comments
 (0)