Skip to content

Commit 5e5334c

Browse files
committed
FSST LIKE: ILIKE / ASCII case-insensitive matching
Extend the DFA construction to optionally fold ASCII letter case. Adds `FsstMatcher::try_new_with(symbols, lengths, pattern, case_insensitive)`; the `like.rs` kernel now plumbs `options.case_insensitive` through instead of bailing out.

Mechanism:
- `dfa/mod.rs`: add `ascii_to_lower`, `pattern_eq(a, b, ci)`, `pattern_matches_byte(p, b, ci)`, and a `set_advance` helper that, when `ci` is true, sets both case variants of an ASCII letter in the byte table. `kmp_byte_transitions` and `kmp_failure_table` now take `ci`; the fold is done at construction time, so the hot loop stays a single table lookup per byte.
- `dfa/prefix.rs::build_prefix_byte_table`, `dfa/suffix.rs::build_suffix_byte_table`, `dfa/multi_contains.rs::chained_kmp_byte_transitions`: same pattern.
- Each DFA's `new()` takes `case_insensitive: bool`, threaded through `FsstMatcher::try_new_with` from `LikeKernel::like`.
- The escape-only memmem fast path is gated to wildcard-free, case-sensitive needles (the encoded pattern is byte-exact).

Adds 6 ILIKE tests covering prefix, suffix, contains, multi-contains, ILIKE + `_` wildcard, and ILIKE with FSST symbol expansions in mixed case. 169 tests pass.

Signed-off-by: Claude <noreply@anthropic.com>
1 parent b791a00 commit 5e5334c

8 files changed

Lines changed: 245 additions & 75 deletions

File tree

encodings/fsst/src/compute/like.rs

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,8 @@ impl LikeKernel for FSST {
6868
return Ok(None);
6969
};
7070

71-
if options.case_insensitive {
72-
return Ok(None);
73-
}
71+
// `case_insensitive` (SQL `ILIKE`) is plumbed through the
72+
// matcher; only ASCII letter case is folded.
7473

7574
let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
7675
let Some(v) = s.value() else {
@@ -101,8 +100,12 @@ impl LikeKernel for FSST {
101100
let layout_us = phase_us(phase_t);
102101
phase_t = trace.then(std::time::Instant::now);
103102

104-
let Some(matcher) =
105-
FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
103+
let Some(matcher) = FsstMatcher::try_new_with(
104+
symbols.as_slice(),
105+
symbol_lengths.as_slice(),
106+
pattern_bytes,
107+
options.case_insensitive,
108+
)?
106109
else {
107110
return Ok(None);
108111
};
@@ -364,21 +367,22 @@ mod tests {
364367
let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
365368
let mut ctx = SESSION.create_execution_ctx();
366369

367-
// Underscore wildcard -- not handled.
370+
// Anchored exact pattern (no bookend `%`) is still unsupported —
371+
// mixed-anchor work is a separate item.
368372
let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
369373
let fsst_v = fsst.as_view();
370374
let result =
371375
<FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
372-
assert!(result.is_none(), "underscore pattern should fall back");
376+
assert!(result.is_none(), "anchored exact pattern should fall back");
373377

374-
// Case-insensitive -- not handled.
378+
// ILIKE is now handled by the matcher's case-insensitive path.
375379
let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
376380
let opts = LikeOptions {
377381
negated: false,
378382
case_insensitive: true,
379383
};
380384
let result = <FSST as LikeKernel>::like(fsst_v, &pattern, opts, &mut ctx)?;
381-
assert!(result.is_none(), "ilike should fall back");
385+
assert!(result.is_some(), "ilike should now be handled");
382386

383387
Ok(())
384388
}

encodings/fsst/src/dfa/flat_contains.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ impl FlatContainsDfa {
6868
symbols: &[Symbol],
6969
symbol_lengths: &[u8],
7070
needle: &[u8],
71+
case_insensitive: bool,
7172
) -> VortexResult<Self> {
7273
if needle.len() > Self::MAX_NEEDLE_LEN {
7374
vortex_bail!(
@@ -82,7 +83,7 @@ impl FlatContainsDfa {
8283
let n_states = accept_state + 1;
8384
let sentinel = n_states;
8485

85-
let byte_table = kmp_byte_transitions(needle);
86+
let byte_table = kmp_byte_transitions(needle, case_insensitive);
8687
let sym_trans =
8788
build_symbol_transitions(symbols, symbol_lengths, &byte_table, n_states, accept_state);
8889
let transitions = build_fused_table(&sym_trans, symbols.len(), n_states, |_| sentinel, 0);
@@ -103,7 +104,8 @@ impl FlatContainsDfa {
103104
// and `scan_to_bitbuf` can prefilter with a single `memmem` over
104105
// `all_bytes` whose length matches the encoded needle. Disabled
105106
// for L < 2 (no possible win over the existing scan path).
106-
let escape_only_pattern = (needle.len() >= 2
107+
let escape_only_pattern = (!case_insensitive
108+
&& needle.len() >= 2
107109
&& super::needle_is_literal(needle)
108110
&& needle_bytes_absent_from_all_symbols(symbols, symbol_lengths, needle))
109111
.then(|| build_escape_only_encoded_pattern(needle));

encodings/fsst/src/dfa/folded_contains.rs

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ impl FoldedContainsDfa {
219219
symbols: &[Symbol],
220220
symbol_lengths: &[u8],
221221
needle: &[u8],
222+
case_insensitive: bool,
222223
) -> VortexResult<Self> {
223224
if needle.len() > Self::MAX_NEEDLE_LEN {
224225
vortex_bail!(
@@ -235,7 +236,7 @@ impl FoldedContainsDfa {
235236
// Total states: 2N+1 (normal 0..=N, escape N+1..=2N for base 0..=N-1).
236237
let n_states_usize = 2 * usize::from(accept_state) + 1;
237238

238-
let byte_table = kmp_byte_transitions(needle);
239+
let byte_table = kmp_byte_transitions(needle, case_insensitive);
239240
let sym_trans =
240241
build_symbol_transitions(symbols, symbol_lengths, &byte_table, n_normal, accept_state);
241242

@@ -327,13 +328,10 @@ impl FoldedContainsDfa {
327328
if v.is_empty() { None } else { Some(v) }
328329
});
329330

330-
// Escape-only fast path: when no symbol's expansion contains any
331-
// needle byte AND the needle has no `_` wildcards, the only
332-
// DFA-accepting compressed sequence is the 2L-byte pattern
333-
// `[ESCAPE, needle[0], …, ESCAPE, needle[L-1]]`. With wildcards
334-
// present the encoded pattern is no longer unique and the
335-
// memmem prefilter is disabled.
336-
let escape_only_pattern = (needle.len() >= 2
331+
// Escape-only fast path: only when the needle is wildcard-free,
332+
// case-sensitive, and no symbol contains any literal needle byte.
333+
let escape_only_pattern = (!case_insensitive
334+
&& needle.len() >= 2
337335
&& super::needle_is_literal(needle)
338336
&& needle_bytes_absent_from_all_symbols(symbols, symbol_lengths, needle))
339337
.then(|| build_escape_only_encoded_pattern(needle));

encodings/fsst/src/dfa/mod.rs

Lines changed: 73 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,24 @@ impl FsstMatcher {
180180
symbols: &[Symbol],
181181
symbol_lengths: &[u8],
182182
pattern: &[u8],
183+
) -> VortexResult<Option<Self>> {
184+
Self::try_new_with(symbols, symbol_lengths, pattern, false)
185+
}
186+
187+
/// Variant of [`Self::try_new`] that opts in to ASCII case-insensitive
188+
/// matching (SQL `ILIKE`). Letter bytes in the needle then accept
189+
/// either case at every position.
190+
pub fn try_new_with(
191+
symbols: &[Symbol],
192+
symbol_lengths: &[u8],
193+
pattern: &[u8],
194+
case_insensitive: bool,
183195
) -> VortexResult<Option<Self>> {
184196
let Some(like_kind) = LikeKind::parse(pattern) else {
185197
return Ok(None);
186198
};
187199

200+
let ci = case_insensitive;
188201
let inner = match like_kind {
189202
LikeKind::Prefix(b"") | LikeKind::Contains(b"") | LikeKind::Suffix(b"") => {
190203
MatcherInner::MatchAll
@@ -193,23 +206,29 @@ impl FsstMatcher {
193206
if prefix.len() > FlatPrefixDfa::MAX_PREFIX_LEN {
194207
return Ok(None);
195208
}
196-
MatcherInner::Prefix(FlatPrefixDfa::new(symbols, symbol_lengths, prefix)?)
209+
MatcherInner::Prefix(FlatPrefixDfa::new(symbols, symbol_lengths, prefix, ci)?)
197210
}
198211
LikeKind::Suffix(suffix) => {
199212
if suffix.len() > SuffixMatcher::MAX_SUFFIX_LEN {
200213
return Ok(None);
201214
}
202-
MatcherInner::Suffix(SuffixMatcher::new(symbols, symbol_lengths, suffix)?)
215+
MatcherInner::Suffix(SuffixMatcher::new(symbols, symbol_lengths, suffix, ci)?)
203216
}
204217
LikeKind::Contains(needle) => {
205218
if needle.len() <= FoldedContainsDfa::MAX_NEEDLE_LEN {
206219
MatcherInner::FoldedContains(FoldedContainsDfa::new(
207220
symbols,
208221
symbol_lengths,
209222
needle,
223+
ci,
210224
)?)
211225
} else if needle.len() <= FlatContainsDfa::MAX_NEEDLE_LEN {
212-
MatcherInner::Contains(FlatContainsDfa::new(symbols, symbol_lengths, needle)?)
226+
MatcherInner::Contains(FlatContainsDfa::new(
227+
symbols,
228+
symbol_lengths,
229+
needle,
230+
ci,
231+
)?)
213232
} else {
214233
return Ok(None);
215234
}
@@ -223,6 +242,7 @@ impl FsstMatcher {
223242
symbols,
224243
symbol_lengths,
225244
&segments,
245+
ci,
226246
)?))
227247
}
228248
};
@@ -610,20 +630,57 @@ fn build_fused_table(
610630
/// expressed.
611631
pub(super) const WILDCARD: u8 = b'_';
612632

633+
/// Folds an ASCII uppercase letter (`b'A'..=b'Z'`) to its lowercase
/// counterpart. Every other byte — digits, punctuation (including the
/// `_` wildcard byte), already-lowercase letters, and non-ASCII bytes
/// (`>= 0x80`) — is returned unchanged.
///
/// Delegates to [`u8::to_ascii_lowercase`] instead of hand-adding the
/// `0x20` case offset, so the fold carries no magic constant and follows
/// the standard library's definition of ASCII case.
#[inline]
fn ascii_to_lower(b: u8) -> u8 {
    b.to_ascii_lowercase()
}
638+
613639
/// Pattern-position byte equality with wildcard semantics. Returns
614640
/// `true` if `a` or `b` is the [`WILDCARD`] byte, or both bytes are
615-
/// equal.
641+
/// equal. When `ci` is true, ASCII letter case is ignored.
616642
#[inline]
617-
fn pattern_eq(a: u8, b: u8) -> bool {
618-
a == WILDCARD || b == WILDCARD || a == b
643+
fn pattern_eq(a: u8, b: u8, ci: bool) -> bool {
644+
if a == WILDCARD || b == WILDCARD {
645+
return true;
646+
}
647+
if ci {
648+
ascii_to_lower(a) == ascii_to_lower(b)
649+
} else {
650+
a == b
651+
}
619652
}
620653

621654
/// Concrete-input byte match against a needle position. The pattern
622655
/// position `p` is one of the needle bytes (possibly the wildcard);
623-
/// the input byte `b` is always concrete (never a wildcard).
656+
/// the input byte `b` is always concrete (never a wildcard). When `ci`
657+
/// is true, ASCII letter case is ignored.
658+
#[inline]
659+
#[expect(
660+
dead_code,
661+
reason = "Reserved for the future correct contains-wildcard DFA."
662+
)]
663+
fn pattern_matches_byte(p: u8, b: u8, ci: bool) -> bool {
664+
if p == WILDCARD {
665+
return true;
666+
}
667+
if ci {
668+
ascii_to_lower(p) == ascii_to_lower(b)
669+
} else {
670+
p == b
671+
}
672+
}
673+
674+
/// For an advancing transition on byte `needle_byte`, set the table
675+
/// row entry. With `ci` true, also set the entry for the case-flipped
676+
/// byte so either case of the same ASCII letter advances.
624677
#[inline]
625-
fn pattern_matches_byte(p: u8, b: u8) -> bool {
626-
p == WILDCARD || p == b
678+
fn set_advance(table: &mut [u8], row_start: usize, needle_byte: u8, new_state: u8, ci: bool) {
679+
table[row_start + usize::from(needle_byte)] = new_state;
680+
if ci && needle_byte.is_ascii_alphabetic() {
681+
let flipped = needle_byte ^ 0x20;
682+
table[row_start + usize::from(flipped)] = new_state;
683+
}
627684
}
628685

629686
/// Build the `(state × byte) → state` KMP transition table.
@@ -641,11 +698,11 @@ fn pattern_matches_byte(p: u8, b: u8) -> bool {
641698
///
642699
/// This is one 256-byte memcpy + a single override per state, instead
643700
/// of running the KMP fallback loop at every cell.
644-
fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
701+
fn kmp_byte_transitions(needle: &[u8], ci: bool) -> Vec<u8> {
645702
let n_states = u8::try_from(needle.len() + 1)
646703
.vortex_expect("kmp_byte_transitions: must have needle.len() ≤ 255");
647704
let accept = n_states - 1;
648-
let failure = kmp_failure_table(needle);
705+
let failure = kmp_failure_table(needle, ci);
649706

650707
let mut table = vec![0u8; usize::from(n_states) * 256];
651708

@@ -654,7 +711,7 @@ fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
654711
if first == WILDCARD {
655712
table[0..256].fill(1);
656713
} else {
657-
table[usize::from(first)] = 1;
714+
set_advance(&mut table, 0, first, 1, ci);
658715
}
659716
}
660717

@@ -672,7 +729,7 @@ fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
672729
// Wildcard at position s: every byte advances.
673730
table[state_row..state_row + 256].fill(state + 1);
674731
} else {
675-
table[state_row + usize::from(needle[s])] = state + 1;
732+
set_advance(&mut table, state_row, needle[s], state + 1, ci);
676733
}
677734
}
678735

@@ -685,14 +742,14 @@ fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
685742
table
686743
}
687744

688-
fn kmp_failure_table(needle: &[u8]) -> Vec<u8> {
745+
fn kmp_failure_table(needle: &[u8], ci: bool) -> Vec<u8> {
689746
let mut failure = vec![0u8; needle.len()];
690747
let mut k = 0u8;
691748
for i in 1..needle.len() {
692-
while k > 0 && !pattern_eq(needle[usize::from(k)], needle[i]) {
749+
while k > 0 && !pattern_eq(needle[usize::from(k)], needle[i], ci) {
693750
k = failure[usize::from(k) - 1];
694751
}
695-
if pattern_eq(needle[usize::from(k)], needle[i]) {
752+
if pattern_eq(needle[usize::from(k)], needle[i], ci) {
696753
k += 1;
697754
}
698755
failure[i] = k;

encodings/fsst/src/dfa/multi_contains.rs

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ impl MultiContainsDfa {
101101
symbols: &[Symbol],
102102
symbol_lengths: &[u8],
103103
segments: &[&[u8]],
104+
case_insensitive: bool,
104105
) -> VortexResult<Self> {
105106
let total_len: usize = segments.iter().map(|s| s.len()).sum();
106107
if total_len > Self::MAX_TOTAL_LEN {
@@ -116,7 +117,7 @@ impl MultiContainsDfa {
116117
let n_states = accept_state + 1;
117118
let sentinel = n_states;
118119

119-
let byte_table = chained_kmp_byte_transitions(segments, accept_state);
120+
let byte_table = chained_kmp_byte_transitions(segments, accept_state, case_insensitive);
120121
let sym_trans =
121122
build_symbol_transitions(symbols, symbol_lengths, &byte_table, n_states, accept_state);
122123
let transitions = build_fused_table(&sym_trans, symbols.len(), n_states, |_| sentinel, 0);
@@ -160,8 +161,13 @@ impl MultiContainsDfa {
160161
// requires every segment to appear in order, so the longest
161162
// segment's encoded pattern (the most selective single test) is
162163
// a sound row-level prefilter — rows without it can't match.
163-
let escape_only_anchor_pattern =
164-
compute_escape_only_anchor(symbols, symbol_lengths, segments);
164+
// Case-insensitive patterns disable this fast path; the encoded
165+
// pattern is byte-exact and wouldn't match case-flipped bytes.
166+
let escape_only_anchor_pattern = if case_insensitive {
167+
None
168+
} else {
169+
compute_escape_only_anchor(symbols, symbol_lengths, segments)
170+
};
165171

166172
Ok(Self {
167173
transitions,
@@ -351,7 +357,11 @@ impl MultiContainsDfa {
351357
/// - The final phase's accept is the global accept state (sticky)
352358
///
353359
/// Each phase has its own KMP failure function for intra-segment backtracking.
354-
fn chained_kmp_byte_transitions(segments: &[&[u8]], accept_state: u8) -> Vec<u8> {
360+
fn chained_kmp_byte_transitions(
361+
segments: &[&[u8]],
362+
accept_state: u8,
363+
case_insensitive: bool,
364+
) -> Vec<u8> {
355365
let n_states = accept_state + 1;
356366
let mut table = vec![0u8; usize::from(n_states) * 256];
357367

@@ -366,14 +376,21 @@ fn chained_kmp_byte_transitions(segments: &[&[u8]], accept_state: u8) -> Vec<u8>
366376

367377
for (k, segment) in segments.iter().enumerate() {
368378
let base = offsets[k];
369-
let failure = kmp_failure_table(segment);
379+
let failure = kmp_failure_table(segment, case_insensitive);
370380

371381
for local_s in 0..segment.len() {
372382
let global_s = base + local_s;
373383
for byte in 0..256usize {
374384
let mut s = local_s;
375385
loop {
376-
if byte == usize::from(segment[s]) {
386+
let pattern_byte = segment[s];
387+
let matches = if case_insensitive {
388+
super::ascii_to_lower(pattern_byte)
389+
== super::ascii_to_lower(u8::try_from(byte).expect("0..256"))
390+
} else {
391+
byte == usize::from(pattern_byte)
392+
};
393+
if matches {
377394
s += 1;
378395
break;
379396
}

0 commit comments

Comments
 (0)