Skip to content

Commit 2aae307

Browse files
committed
Work around colons in reference names (GRCh38 HLA).
This extends the callback function used by the hts iterator code to the ref:start-end range parser, permitting the parser to validate whether the whole thing matches a reference name. This works around parsing conflicts when given range queries like "HLA-DRB1*12:17", which is either the whole of "HLA-DRB1*12:17" (this exists in GRCh38) or base 17 onwards of ""HLA-DRB1*12" (which does not). Note there are still some undecided questions here. Do we want to handle the special names used in the iterator at this level to? Eg "*" meaning unmapped data and "." meaning whole file?
1 parent a13ac99 commit 2aae307

4 files changed

Lines changed: 70 additions & 21 deletions

File tree

hts.c

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2507,15 +2507,39 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
25072507
return (sign == '+')? n : -n;
25082508
}
25092509

2510-
const char *hts_parse_reg(const char *s, int *beg, int *end)
2510+
/*
2511+
* A variant of hts_parse_reg which is reference-id aware. It uses
2512+
* the iterator name2id callbacks to validate the region tokenisation works.
2513+
*
2514+
* This is necessary due to GRCh38 HLA additions which have reference names
2515+
* like "HLA-DRB1*12:17".
2516+
*
2517+
* On success the end of the reference is returned (colon or end of string).
2518+
* On failure NULL is returned, and if tid/getid are supplied *tid will be -1.
2519+
*/
2520+
const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
2521+
hts_name2id_f getid, void *hdr)
25112522
{
2512-
char *hyphen;
2523+
// FIXME: do we need to permit tid=-1 for reference "*" to indicate unmapped
2524+
// reads, and strictly have NULL as failure test?
2525+
int tid_, s_len = strlen(s); // int is sufficient given beg/end types
2526+
if (!tid) tid = &tid_; // simplifies code below
2527+
25132528
const char *colon = strrchr(s, ':');
25142529
if (colon == NULL) {
25152530
*beg = 0; *end = INT_MAX;
2516-
return s + strlen(s);
2531+
*tid = getid ? getid(hdr, s) : 0;
2532+
return *tid >= 0 ? s + s_len : NULL;
25172533
}
25182534

2535+
// Has a colon, but check whole name first
2536+
if (getid) {
2537+
*beg = 0; *end = INT_MAX;
2538+
if ((*tid = getid(hdr, s)) >= 0)
2539+
return s + s_len;
2540+
}
2541+
2542+
char *hyphen;
25192543
*beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
25202544
if (*beg < 0) *beg = 0;
25212545

@@ -2524,37 +2548,42 @@ const char *hts_parse_reg(const char *s, int *beg, int *end)
25242548
else return NULL;
25252549

25262550
if (*beg >= *end) return NULL;
2551+
if (getid) {
2552+
kstring_t ks = { 0, 0, NULL };
2553+
kputsn(s, colon-s, &ks); // convert to nul terminated string
2554+
if (!ks.s) {
2555+
*tid = -1;
2556+
return NULL;
2557+
}
2558+
*tid = getid(hdr, ks.s);
2559+
free(ks.s);
2560+
} else {
2561+
*tid = 0;
2562+
}
25272563
return colon;
25282564
}
25292565

2566+
const char *hts_parse_reg(const char *s, int *beg, int *end)
2567+
{
2568+
return hts_parse_reg2(s, NULL, beg, end, NULL, NULL);
2569+
}
2570+
25302571
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
25312572
{
25322573
int tid, beg, end;
2533-
const char *q;
25342574

25352575
if (strcmp(reg, ".") == 0)
25362576
return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
25372577
else if (strcmp(reg, "*") == 0)
25382578
return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
25392579

2540-
q = hts_parse_reg(reg, &beg, &end);
2541-
if (q) {
2542-
char tmp_a[1024], *tmp = tmp_a;
2543-
if (q - reg + 1 > 1024)
2544-
if (!(tmp = malloc(q - reg + 1)))
2545-
return NULL;
2546-
strncpy(tmp, reg, q - reg);
2547-
tmp[q - reg] = 0;
2548-
tid = getid(hdr, tmp);
2549-
if (tmp != tmp_a)
2550-
free(tmp);
2551-
}
2552-
else {
2553-
// not parsable as a region, but possibly a sequence named "foo:a"
2554-
tid = getid(hdr, reg);
2580+
if ((tid = getid(hdr, reg)) >= 0) {
25552581
beg = 0; end = INT_MAX;
2582+
return itr_query(idx, tid, beg, end, readrec);
25562583
}
25572584

2585+
hts_parse_reg2(reg, &tid, &beg, &end, getid, hdr);
2586+
25582587
if (tid < 0) return NULL;
25592588
return itr_query(idx, tid, beg, end, readrec);
25602589
}

htslib/hts.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -663,13 +663,27 @@ long long hts_parse_decimal(const char *str, char **strend, int flags);
663663
@return Pointer to the colon or '\0' after the reference sequence name,
664664
or NULL if @a str could not be parsed.
665665
*/
666+
667+
typedef int (*hts_name2id_f)(void*, const char*);
668+
typedef const char *(*hts_id2name_f)(void*, int);
666669
const char *hts_parse_reg(const char *str, int *beg, int *end);
667670

671+
/// Parse a "CHR:START-END"-style region string
672+
/** @param str String to be parsed
673+
@param tid Set on return (if not NULL) to be reference index (-1 if invalid)
674+
@param beg Set on return to the 0-based start of the region
675+
@param end Set on return to the 1-based end of the region
676+
@param getid Function pointer. Called if not NULL to set tid.
677+
@param hdr Caller data passed to getid.
678+
@return Pointer to the colon or '\0' after the reference sequence name,
679+
or NULL if @a str could not be parsed.
680+
*/
681+
const char *hts_parse_reg2(const char *str, int *tid, int *beg, int *end,
682+
hts_name2id_f getid, void *hdr);
683+
668684
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
669685
void hts_itr_destroy(hts_itr_t *iter);
670686

671-
typedef int (*hts_name2id_f)(void*, const char*);
672-
typedef const char *(*hts_id2name_f)(void*, int);
673687
typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
674688

675689
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec);

htslib/sam.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ typedef struct {
275275
int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) HTS_RESULT_USED;
276276
void bam_hdr_destroy(bam_hdr_t *h);
277277
int bam_name2id(bam_hdr_t *h, const char *ref);
278+
const char *sam_parse_reg(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end);
278279
bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0);
279280

280281
bam1_t *bam_init1(void);

sam.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,11 @@ int bam_name2id(bam_hdr_t *h, const char *ref)
285285
return k == kh_end(d)? -1 : kh_val(d, k);
286286
}
287287

288+
const char *sam_parse_reg(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end) {
289+
// FIXME: do we need to also use cram_name2id?
290+
return hts_parse_reg2(s, tid, beg, end, (hts_name2id_f)bam_name2id, h);
291+
}
292+
288293
/*************************
289294
*** BAM alignment I/O ***
290295
*************************/

0 commit comments

Comments
 (0)