Skip to content

Commit 8d7e222

Browse files
committed
Now region parsing fails on ambiguous cases.
Also improved return values so NULL is the primary way of detecting failure rather than tid. This is more in line with the old hts_parse_reg code. See samtools/hts-specs#124 (comment) for heuristic suggestions.
1 parent 2aae307 commit 8d7e222

1 file changed

Lines changed: 32 additions & 11 deletions

File tree

hts.c

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2514,8 +2514,13 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
25142514
* This is necessary due to GRCh38 HLA additions which have reference names
25152515
* like "HLA-DRB1*12:17".
25162516
*
2517-
* On success the end of the reference is returned (colon or end of string).
2518-
* On failure NULL is returned, and if tid/getid are supplied *tid will be -1.
2517+
* getid is optional and may be passed in as NULL. If given it is used to
2518+
* validate the reference name exists and is unambiguously parseable. If not
2519+
* given the best guess will be made but with no guarantees of validity.
2520+
*
2521+
* On success the end of the reference is returned (colon or end of string);
2522+
* beg/end will be set, plus tid if getid has been supplied.
2523+
* On failure NULL is returned.
25192524
*/
25202525
const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25212526
hts_name2id_f getid, void *hdr)
@@ -2525,18 +2530,37 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25252530
int tid_, s_len = strlen(s); // int is sufficient given beg/end types
25262531
if (!tid) tid = &tid_; // simplifies code below
25272532

2533+
// No colon implies entirety of the reference
25282534
const char *colon = strrchr(s, ':');
25292535
if (colon == NULL) {
25302536
*beg = 0; *end = INT_MAX;
25312537
*tid = getid ? getid(hdr, s) : 0;
25322538
return *tid >= 0 ? s + s_len : NULL;
25332539
}
25342540

2535-
// Has a colon, but check whole name first
2541+
// Has a colon, but check whole name first.
25362542
if (getid) {
25372543
*beg = 0; *end = INT_MAX;
2538-
if ((*tid = getid(hdr, s)) >= 0)
2544+
if ((*tid = getid(hdr, s)) >= 0) {
2545+
// Entire name matches, but also check this isn't
2546+
// ambiguous. eg we have ref chr1 and ref chr1:100-200
2547+
// both present.
2548+
kstring_t ks = { 0, 0, NULL };
2549+
kputsn(s, colon-s, &ks); // convert to nul terminated string
2550+
if (!ks.s) {
2551+
*tid = -1;
2552+
return NULL;
2553+
}
2554+
if (getid(hdr, ks.s) >= 0) {
2555+
free(ks.s);
2556+
*tid = -1;
2557+
hts_log_error("Range %s is ambiguous", s);
2558+
return NULL;
2559+
}
2560+
free(ks.s);
2561+
25392562
return s + s_len;
2563+
}
25402564
}
25412565

25422566
char *hyphen;
@@ -2557,6 +2581,8 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25572581
}
25582582
*tid = getid(hdr, ks.s);
25592583
free(ks.s);
2584+
if (*tid < 0)
2585+
return NULL;
25602586
} else {
25612587
*tid = 0;
25622588
}
@@ -2577,14 +2603,9 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g
25772603
else if (strcmp(reg, "*") == 0)
25782604
return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
25792605

2580-
if ((tid = getid(hdr, reg)) >= 0) {
2581-
beg = 0; end = INT_MAX;
2582-
return itr_query(idx, tid, beg, end, readrec);
2583-
}
2584-
2585-
hts_parse_reg2(reg, &tid, &beg, &end, getid, hdr);
2606+
if (!hts_parse_reg2(reg, &tid, &beg, &end, getid, hdr))
2607+
return NULL;
25862608

2587-
if (tid < 0) return NULL;
25882609
return itr_query(idx, tid, beg, end, readrec);
25892610
}
25902611

0 commit comments

Comments
 (0)