Skip to content

Commit f971815

Browse files
sirus20x6 and claude committed
Fix literal string matching for external filter values in annotate (fixes #2506)
When bcftools annotate uses a filter expression with external values (e.g., -i 'SOURCE_RECORD={SOURCE_RECORD}') to match annotation records, the string comparison incorrectly split values on commas and performed cross-product matching. This meant that two different INFO field values could falsely match if they shared any comma-separated component.

External values from annotation file columns are single literal strings where commas are part of the value, not VCF multi-value separators. This change makes cmp_vector_strings() perform a direct string comparison when either operand is an external value (iext > 0), instead of using _match_vector_strings() which splits on commas.

Also removes a stray debug fprintf left in the regex comparison path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent db17826 commit f971815

1 file changed

Lines changed: 22 additions & 2 deletions

File tree

filter.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2883,12 +2883,24 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
28832883
return;
28842884
}
28852885
if ( !regex )
2886-
rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
2886+
{
2887+
// When either operand is an external value (set via filter_test_ext),
2888+
// compare as literal strings rather than splitting on commas. External
2889+
// values come from annotation file columns and may contain commas that
2890+
// are part of the value, not VCF multi-value separators. (see gh #2506)
2891+
if ( atok->iext || btok->iext )
2892+
{
2893+
int match = atok->str_value.l==btok->str_value.l && !strncmp(atok->str_value.s,btok->str_value.s,atok->str_value.l) ? 1 : 0;
2894+
if ( logic==TOK_NE ) match = match ? 0 : 1;
2895+
rtok->pass_site = match;
2896+
}
2897+
else
2898+
rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
2899+
}
28872900
else
28882901
{
28892902
token_t *tok = atok->regex ? btok : atok;
28902903
rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
2891-
fprintf(stderr,"pass=%d [%s]\n",rtok->pass_site,tok->str_value.s);
28922904
}
28932905
return;
28942906
}
@@ -2934,6 +2946,14 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
29342946
int match;
29352947
if ( regex )
29362948
match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic);
2949+
else if ( atok->iext || btok->iext )
2950+
{
2951+
char *xstr = xtok->str_value.s + i*xtok->nval1;
2952+
size_t xlen = 0;
2953+
while ( xlen < (size_t)xtok->nval1 && xstr[xlen] ) xlen++;
2954+
match = xlen==ytok->str_value.l && !strncmp(xstr,ytok->str_value.s,xlen) ? 1 : 0;
2955+
if ( logic==TOK_NE ) match = match ? 0 : 1;
2956+
}
29372957
else
29382958
match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic);
29392959
if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }

0 commit comments

Comments
 (0)