@@ -2490,6 +2490,12 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
24902490 if (esign == '-' ) e = - e ;
24912491 }
24922492
2493+ switch (* s ) {
2494+ case 'k' : case 'K' : e += 3 ; s ++ ; break ;
2495+ case 'm' : case 'M' : e += 6 ; s ++ ; break ;
2496+ case 'g' : case 'G' : e += 9 ; s ++ ; break ;
2497+ }
2498+
24932499 e -= decimals ;
24942500 while (e > 0 ) n *= 10 , e -- ;
24952501 while (e < 0 ) lost += n % 10 , n /= 10 , e ++ ;
@@ -2501,45 +2507,90 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
25012507 if (strend ) {
25022508 * strend = (char * )s ;
25032509 } else if (* s ) {
2504- hts_log_warning ("Ignoring unknown characters after %.*s[%s]" , (int )(s - str ), str , s );
2510+ if ((flags & HTS_PARSE_THOUSANDS_SEP ) || (!(flags & HTS_PARSE_THOUSANDS_SEP ) && * s != ',' ))
2511+ hts_log_warning ("Ignoring unknown characters after %.*s[%s]" , (int )(s - str ), str , s );
25052512 }
25062513
25072514 return (sign == '+' )? n : - n ;
25082515}
25092516
2517+ static void * hts_memrchr (const void * s , int c , size_t n ) {
2518+ size_t i ;
2519+ unsigned char * u = (unsigned char * )s ;
2520+ for (i = n ; i > 0 ; i -- ) {
2521+ if (u [i - 1 ] == c )
2522+ return u + i - 1 ;
2523+ }
2524+
2525+ return NULL ;
2526+ }
2527+
25102528/*
25112529 * A variant of hts_parse_reg which is reference-id aware. It uses
25122530 * the iterator name2id callbacks to validate the region tokenisation works.
25132531 *
25142532 * This is necessary due to GRCh38 HLA additions which have reference names
25152533 * like "HLA-DRB1*12:17".
25162534 *
2517- * getid is optional and may be passed in as NULL. If given it is used to
2518- * validate the reference name exists and is unambiguously parseable. If not
2519- * given the best guess will be made but no has guarantees in validity.
2535+ * All parameters are mandatory.
2536+ *
2537+ * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
2538+ * are reference names, we may quote using curly braces.
2539+ * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
2540+ *
2541+ * Flags are used to control how parsing works, and can be one of the below.
2542+ *
2543+ * HTS_PARSE_LIST:
2544+ * If present, the region is assmed to be a comma separated list and
2545+ * position parsing will not contain commas (this implicitly
2546+ * clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
2547+ * On success the return pointer will be the start of the next region, ie
2548+ * the character after the comma. (If *ret != '\0' then the caller can
2549+ * assume another region is present in the list.)
2550+ *
2551+ * If not set then positions may contain commas. In this case the return
2552+ * value should point to the end of the string, or NULL on failure.
25202553 *
2521- * To work around these issues quoting is also permitted via {ref}:start-end.
2522- * In this case, the return value will point to '}' and not the end of the
2523- * reference (but this is a useful indication that it started with '{').
2554+ * HTS_PARSE_ONE_COORD:
2555+ * If present, X:100 is treated as the single base pair region X:100-100.
2556+ * In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
2557+ * (This is the standard bcftools region convention.)
2558+ *
2559+ * When not set X:100 is considered to be X:100-<end> where <end> is
2560+ * the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are
2561+ * invalid.
2562+ * (This is the standard samtools region convention.)
2563+ *
2564+ * Note the supplied string expects 1 based inclusive coordinates, but the
2565+ * returned coordinates start from 0 and are half open, so pos0 is valid
2566+ * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
25242567 *
25252568 * On success the end of the reference is returned (colon or end of string)
25262569 * beg/end will be set, plus tid if getid has been supplied.
25272570 * On failure NULL is returned.
25282571 */
2529- const char * hts_parse_reg2 (const char * s , int * tid , int * beg , int * end ,
2530- hts_name2id_f getid , void * hdr )
2572+ const char * hts_parse_region (const char * s , int * tid , int * beg , int * end ,
2573+ hts_name2id_f getid , void * hdr , int flags )
25312574{
2532- // FIXME: do we need to permit tid=-1 for reference "*" to indicate unmapped
2533- // reads, and strictly have NULL as failure test?
2534- int tid_ , s_len = strlen (s ); // int is sufficient given beg/end types
2535- if (!tid ) tid = & tid_ ; // simplifies code below
2575+ if (!s || !tid || !beg || !end || !getid )
2576+ return NULL ;
25362577
2537- const char * colon = NULL ;
2578+ int s_len = strlen (s ); // int is sufficient given beg/end types
2579+ kstring_t ks = { 0 , 0 , NULL };
2580+
2581+ const char * colon = NULL , * comma = NULL ;
25382582 int quoted = 0 ;
25392583
2584+ if (flags & HTS_PARSE_LIST )
2585+ flags &= ~HTS_PARSE_THOUSANDS_SEP ;
2586+ else
2587+ flags |= HTS_PARSE_THOUSANDS_SEP ;
2588+
2589+ const char * s_end = s + s_len ;
2590+
25402591 // Braced quoting of references is permitted to resolve ambiguities.
25412592 if (* s == '{' ) {
2542- const char * close = strrchr (s , '}' );
2593+ const char * close = memchr (s , '}' , s_len );
25432594 if (!close ) {
25442595 hts_log_error ("Mismatching braces in \"%s\"" , s );
25452596 return NULL ;
@@ -2549,36 +2600,56 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25492600 if (close [1 ] == ':' )
25502601 colon = close + 1 ;
25512602 quoted = 1 ; // number of trailing characters to trim
2603+
2604+ // Truncate to this item only, if appropriate.
2605+ if (flags & HTS_PARSE_LIST ) {
2606+ comma = strchr (close , ',' );
2607+ if (comma ) {
2608+ s_len = comma - s ;
2609+ s_end = comma + 1 ;
2610+ }
2611+ }
25522612 } else {
2553- colon = strrchr (s , ':' );
2613+ // Truncate to this item only, if appropriate.
2614+ if (flags & HTS_PARSE_LIST ) {
2615+ comma = strchr (s , ',' );
2616+ if (comma ) {
2617+ s_len = comma - s ;
2618+ s_end = comma + 1 ;
2619+ }
2620+ }
2621+
2622+ colon = hts_memrchr (s , ':' , s_len );
25542623 }
25552624
2625+ // No colon is simplest case; just check and return.
25562626 if (colon == NULL ) {
25572627 * beg = 0 ; * end = INT_MAX ;
2558- if (getid ) {
2559- kstring_t ks = { 0 , 0 , NULL };
2560- kputsn (s , s_len - quoted , & ks ); // convert to nul terminated string
2561- if (!ks .s ) {
2562- * tid = -1 ;
2563- return NULL ;
2564- }
2565-
2566- * tid = getid (hdr , ks .s );
2567- free (ks .s );
2568- } else {
2569- * tid = 0 ;
2628+ kputsn (s , s_len - quoted , & ks ); // convert to nul terminated string
2629+ if (!ks .s ) {
2630+ * tid = -1 ;
2631+ return NULL ;
25702632 }
2571- return * tid >= 0 ? s + s_len : NULL ;
2633+
2634+ * tid = getid (hdr , ks .s );
2635+ free (ks .s );
2636+
2637+ return * tid >= 0 ? s_end : NULL ;
25722638 }
25732639
25742640 // Has a colon, but check whole name first.
2575- if (!quoted && getid ) {
2641+ if (!quoted ) {
25762642 * beg = 0 ; * end = INT_MAX ;
2577- if ((* tid = getid (hdr , s )) >= 0 ) {
2643+ kputsn (s , s_len , & ks ); // convert to nul terminated string
2644+ if (!ks .s ) {
2645+ * tid = -1 ;
2646+ return NULL ;
2647+ }
2648+ if ((* tid = getid (hdr , ks .s )) >= 0 ) {
25782649 // Entire name matches, but also check this isn't
25792650 // ambiguous. eg we have ref chr1 and ref chr1:100-200
25802651 // both present.
2581- kstring_t ks = { 0 , 0 , NULL } ;
2652+ ks . l = 0 ;
25822653 kputsn (s , colon - s , & ks ); // convert to nul terminated string
25832654 if (!ks .s ) {
25842655 * tid = -1 ;
@@ -2594,39 +2665,81 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25942665 }
25952666 free (ks .s );
25962667
2597- return s + s_len ;
2668+ return s_end ;
25982669 }
25992670 }
26002671
2601- char * hyphen ;
2602- * beg = hts_parse_decimal (colon + 1 , & hyphen , HTS_PARSE_THOUSANDS_SEP ) - 1 ;
2603- if (* beg < 0 ) * beg = 0 ;
2604-
2605- if (* hyphen == '\0' ) * end = INT_MAX ;
2606- else if (* hyphen == '-' ) * end = hts_parse_decimal (hyphen + 1 , NULL , HTS_PARSE_THOUSANDS_SEP );
2607- else return NULL ;
2672+ // Quoted, or unquoted and whole string isn't a name.
2673+ // Check the pre-colon part is valid.
2674+ ks .l = 0 ;
2675+ kputsn (s , colon - s - quoted , & ks ); // convert to nul terminated string
2676+ if (!ks .s ) {
2677+ * tid = -1 ;
2678+ return NULL ;
2679+ }
2680+ * tid = getid (hdr , ks .s );
2681+ free (ks .s );
2682+ if (* tid < 0 )
2683+ return NULL ;
26082684
2609- if (* beg >= * end ) return NULL ;
2610- if (getid ) {
2611- kstring_t ks = { 0 , 0 , NULL };
2612- kputsn (s , colon - s - quoted , & ks ); // convert to nul terminated string
2613- if (!ks .s ) {
2614- * tid = -1 ;
2685+ // Finally parse the post-colon coordinates
2686+ char * hyphen ;
2687+ * beg = hts_parse_decimal (colon + 1 , & hyphen , flags ) - 1 ;
2688+ if (* beg < 0 ) {
2689+ if (isdigit (* hyphen ) || * hyphen == '\0' || * hyphen == ',' ) {
2690+ // interpret chr:-100 as chr:1-100
2691+ * end = * beg == -1 ? INT_MAX : - (* beg + 1 );
2692+ * beg = 0 ;
2693+ return s_end ;
2694+ } else if (* hyphen == '-' ) {
2695+ * beg = 0 ;
2696+ } else {
2697+ hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
26152698 return NULL ;
26162699 }
2617- * tid = getid (hdr , ks .s );
2618- free (ks .s );
2619- if (* tid < 0 )
2700+ }
2701+
2702+ if (* hyphen == '\0' || ((flags & HTS_PARSE_LIST ) && * hyphen == ',' )) {
2703+ * end = flags & HTS_PARSE_ONE_COORD ? * beg + 1 : INT_MAX ;
2704+ } else if (* hyphen == '-' ) {
2705+ * end = hts_parse_decimal (hyphen + 1 , & hyphen , flags );
2706+ if (* hyphen != '\0' && * hyphen != ',' ) {
2707+ hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
26202708 return NULL ;
2709+ }
26212710 } else {
2622- * tid = 0 ;
2711+ hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
2712+ return NULL ;
26232713 }
2624- return colon ;
2714+
2715+ if (* end == 0 )
2716+ * end = INT_MAX ; // interpret chr:100- as chr:100-<end>
2717+
2718+ if (* beg >= * end ) return NULL ;
2719+
2720+ return s_end ;
26252721}
26262722
2723+ // Next release we should mark this as deprecated?
2724+ // Use hts_parse_region above instead.
26272725const char * hts_parse_reg (const char * s , int * beg , int * end )
26282726{
2629- return hts_parse_reg2 (s , NULL , beg , end , NULL , NULL );
2727+ char * hyphen ;
2728+ const char * colon = strrchr (s , ':' );
2729+ if (colon == NULL ) {
2730+ * beg = 0 ; * end = INT_MAX ;
2731+ return s + strlen (s );
2732+ }
2733+
2734+ * beg = hts_parse_decimal (colon + 1 , & hyphen , HTS_PARSE_THOUSANDS_SEP ) - 1 ;
2735+ if (* beg < 0 ) * beg = 0 ;
2736+
2737+ if (* hyphen == '\0' ) * end = INT_MAX ;
2738+ else if (* hyphen == '-' ) * end = hts_parse_decimal (hyphen + 1 , NULL , HTS_PARSE_THOUSANDS_SEP );
2739+ else return NULL ;
2740+
2741+ if (* beg >= * end ) return NULL ;
2742+ return colon ;
26302743}
26312744
26322745hts_itr_t * hts_itr_querys (const hts_idx_t * idx , const char * reg , hts_name2id_f getid , void * hdr , hts_itr_query_func * itr_query , hts_readrec_func * readrec )
@@ -2638,7 +2751,7 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g
26382751 else if (strcmp (reg , "*" ) == 0 )
26392752 return itr_query (idx , HTS_IDX_NOCOOR , 0 , 0 , readrec );
26402753
2641- if (!hts_parse_reg2 (reg , & tid , & beg , & end , getid , hdr ))
2754+ if (!hts_parse_region (reg , & tid , & beg , & end , getid , hdr , HTS_PARSE_THOUSANDS_SEP ))
26422755 return NULL ;
26432756
26442757 return itr_query (idx , tid , beg , end , readrec );
0 commit comments