@@ -327,29 +327,37 @@ enum LikeKind<'a> {
327327
328328impl < ' a > LikeKind < ' a > {
329329 fn parse ( pattern : & ' a [ u8 ] ) -> Option < Self > {
330- // `prefix%` (including just `%` where prefix is empty)
330+ // `prefix%` (including just `%` where prefix is empty).
331+ // `_` in the prefix is the single-byte wildcard (anchored from
332+ // the row start, no KMP fallback ambiguity).
331333 if let Some ( prefix) = pattern. strip_suffix ( b"%" )
332334 && !prefix. contains ( & b'%' )
333- && !prefix. contains ( & b'_' )
334335 {
335336 return Some ( LikeKind :: Prefix ( prefix) ) ;
336337 }
337338
338- // `%suffix` (no trailing %)
339+ // `%suffix` (no trailing %); `_` allowed in suffix (anchored
340+ // from the row end, scanned right-to-left, also wildcard-safe).
339341 if let Some ( suffix) = pattern. strip_prefix ( b"%" )
340342 && !suffix. contains ( & b'%' )
341- && !suffix. contains ( & b'_' )
342343 {
343344 return Some ( LikeKind :: Suffix ( suffix) ) ;
344345 }
345346
346- // `%needle%`
347+ // `%needle%`. We reject `_` in unanchored contains for now —
348+ // the symmetric KMP failure function over-approximates when
349+ // wildcards appear in the matched portion, producing false
350+ // positives. A correct unanchored wildcard matcher needs NFA
351+ // subset construction (or per-position sliding-window match);
352+ // tracked as a follow-up.
347353 let inner = pattern. strip_prefix ( b"%" ) ?. strip_suffix ( b"%" ) ?;
348354 if !inner. contains ( & b'%' ) && !inner. contains ( & b'_' ) {
349355 return Some ( LikeKind :: Contains ( inner) ) ;
350356 }
351357
352- // `%seg1%seg2%...%segN%`
358+ // `%seg1%seg2%...%segN%`. Same wildcard limitation: any
359+ // segment containing `_` falls through to the
360+ // decompression-based fallback.
353361 let segments: Vec < & [ u8 ] > = inner
354362 . split ( |& b| b == b'%' )
355363 . filter ( |s| !s. is_empty ( ) )
@@ -413,23 +421,30 @@ where
413421// DFA construction helpers
414422// ---------------------------------------------------------------------------
415423
416- /// Returns `true` iff no byte of `needle` appears in any symbol's expansion.
424+ /// Returns `true` iff no literal byte of `needle` appears in any symbol's
425+ /// expansion. Wildcard (`_`) positions are skipped — they're allowed to
426+ /// match symbol bytes and don't constrain the prefilter.
417427///
418- /// When this holds, every needle byte in the decompressed stream must come
419- /// from an `ESCAPE` pair, so the only compressed sequence that reaches the
420- /// contains DFA's accept state from state 0 is exactly
428+ /// When this holds AND the needle has no wildcards, every needle byte in
429+ /// the decompressed stream must come from an `ESCAPE` pair, so the only
430+ /// compressed sequence that reaches the contains DFA's accept state
431+ /// from state 0 is exactly
421432/// `[ESCAPE, needle[0], ESCAPE, needle[1], …, ESCAPE, needle[L-1]]`. The
422433/// contains scan can then prefilter with a single `memmem` for that 2L-byte
423- /// pattern, which is dramatically more selective than the 2-byte
424- /// `(ESCAPE, needle[0])` anchor that the bucketed Teddy pair scan would
425- /// otherwise use.
434+ /// pattern. For needles WITH wildcards, the same condition implies each
435+ /// literal byte must come from an escape pair, but wildcard bytes can
436+ /// come from anywhere — the encoded pattern is no longer unique, so
437+ /// the memmem prefilter is disabled.
426438pub ( super ) fn needle_bytes_absent_from_all_symbols (
427439 symbols : & [ Symbol ] ,
428440 symbol_lengths : & [ u8 ] ,
429441 needle : & [ u8 ] ,
430442) -> bool {
431443 let mut needle_byte_present = [ false ; 256 ] ;
432444 for & b in needle {
445+ if b == WILDCARD {
446+ continue ;
447+ }
433448 needle_byte_present[ usize:: from ( b) ] = true ;
434449 }
435450 debug_assert ! ( symbol_lengths. len( ) >= symbols. len( ) ) ;
@@ -447,7 +462,8 @@ pub(super) fn needle_bytes_absent_from_all_symbols(
447462
448463/// Build the compressed pattern `[ESCAPE, needle[0], ESCAPE, needle[1], …,
449464/// ESCAPE, needle[L-1]]`. Only meaningful when
450- /// [`needle_bytes_absent_from_all_symbols`] is true.
465+ /// [`needle_bytes_absent_from_all_symbols`] is true AND the needle is
466+ /// wildcard-free.
451467pub ( super ) fn build_escape_only_encoded_pattern ( needle : & [ u8 ] ) -> Vec < u8 > {
452468 let mut out = Vec :: with_capacity ( needle. len ( ) * 2 ) ;
453469 for & b in needle {
@@ -457,6 +473,12 @@ pub(super) fn build_escape_only_encoded_pattern(needle: &[u8]) -> Vec<u8> {
457473 out
458474}
459475
476+ /// `true` iff the needle has no `_` wildcard bytes.
477+ #[ inline]
478+ pub ( super ) fn needle_is_literal ( needle : & [ u8 ] ) -> bool {
479+ !needle. contains ( & WILDCARD )
480+ }
481+
460482/// Builds the per-symbol transition table for FSST symbols.
461483///
462484/// For each `(state, symbol_code)` pair, simulates feeding the symbol's bytes
@@ -581,20 +603,44 @@ fn build_fused_table(
581603// KMP helpers
582604// ---------------------------------------------------------------------------
583605
606+ /// The wildcard byte in a LIKE needle. SQL `_` (`0x5F`) is interpreted
607+ /// as "match any single byte" by [`kmp_byte_transitions`] and
608+ /// [`kmp_failure_table`]. Without SQL `ESCAPE` support, every `_`
609+ /// in the parsed needle is a wildcard; a literal `_` cannot be
610+ /// expressed.
611+ pub ( super ) const WILDCARD : u8 = b'_' ;
612+
613+ /// Pattern-position byte equality with wildcard semantics. Returns
614+ /// `true` if `a` or `b` is the [`WILDCARD`] byte, or both bytes are
615+ /// equal.
616+ #[ inline]
617+ fn pattern_eq ( a : u8 , b : u8 ) -> bool {
618+ a == WILDCARD || b == WILDCARD || a == b
619+ }
620+
621+ /// Concrete-input byte match against a needle position. The pattern
622+ /// position `p` is one of the needle bytes (possibly the wildcard);
623+ /// the input byte `b` is always concrete (never a wildcard).
624+ #[ inline]
625+ fn pattern_matches_byte ( p : u8 , b : u8 ) -> bool {
626+ p == WILDCARD || p == b
627+ }
628+
584629/// Build the `(state × byte) → state` KMP transition table.
585630///
586631/// ## Construction
587632///
588633/// Uses the standard recurrence — for each non-accept state `s`:
589- /// - On byte == `needle[s]`: transition to `s + 1`.
634+ /// - On byte == `needle[s]` (or `needle[s]` is the wildcard) : transition to `s + 1`.
590635/// - On any other byte: transition to whatever the *failure* row
591636/// would give for the same byte, i.e. `table[failure[s-1] * 256 + b]`
592637/// for `s > 0`, and `0` for `s = 0`.
593638///
639+ /// When `needle[s]` is the [`WILDCARD`] byte (`_`), every input byte
640+ /// advances to `s + 1` regardless of the failure row's content.
641+ ///
594642/// This is one 256-byte memcpy + a single override per state, instead
595- /// of running the KMP fallback loop at every cell. For an N=6 needle
596- /// that's 6 × 256 = 1536 bytes copied + 6 overrides vs ~3500 iterative
597- /// fallback steps previously — about 3× faster.
643+ /// of running the KMP fallback loop at every cell.
598644fn kmp_byte_transitions ( needle : & [ u8 ] ) -> Vec < u8 > {
599645 let n_states = u8:: try_from ( needle. len ( ) + 1 )
600646 . vortex_expect ( "kmp_byte_transitions: must have needle.len() ≤ 255" ) ;
@@ -603,12 +649,16 @@ fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
603649
604650 let mut table = vec ! [ 0u8 ; usize :: from( n_states) * 256 ] ;
605651
606- // State 0: only `needle[0]` advances.
607- if !needle. is_empty ( ) {
608- table[ usize:: from ( needle[ 0 ] ) ] = 1 ;
652+ // State 0: either `needle[0]` (literal) or every byte (wildcard) advances.
653+ if let Some ( & first) = needle. first ( ) {
654+ if first == WILDCARD {
655+ table[ 0 ..256 ] . fill ( 1 ) ;
656+ } else {
657+ table[ usize:: from ( first) ] = 1 ;
658+ }
609659 }
610660
611- // States 1..accept: each row is the failure-row plus one advance entry.
661+ // States 1..accept: each row is the failure-row plus the advance entry.
612662 for state in 1 ..accept {
613663 let s = usize:: from ( state) ;
614664 let fail_row = usize:: from ( failure[ s - 1 ] ) * 256 ;
@@ -617,14 +667,18 @@ fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
617667 // the KMP fallback eventually lands at the same place the
618668 // failure-state would land on that byte.
619669 table. copy_within ( fail_row..fail_row + 256 , state_row) ;
620- // Override the one entry that advances.
621- table[ state_row + usize:: from ( needle[ s] ) ] = state + 1 ;
670+ // Override the advancing entries.
671+ if needle[ s] == WILDCARD {
672+ // Wildcard at position s: every byte advances.
673+ table[ state_row..state_row + 256 ] . fill ( state + 1 ) ;
674+ } else {
675+ table[ state_row + usize:: from ( needle[ s] ) ] = state + 1 ;
676+ }
622677 }
623678
624679 // Accept state: sticky — every byte stays at accept.
625680 if usize:: from ( accept) < usize:: from ( n_states) {
626681 let accept_row = usize:: from ( accept) * 256 ;
627- // SAFETY-ish: in-bounds writes to a pre-zeroed Vec.
628682 table[ accept_row..accept_row + 256 ] . fill ( accept) ;
629683 }
630684
@@ -635,10 +689,10 @@ fn kmp_failure_table(needle: &[u8]) -> Vec<u8> {
635689 let mut failure = vec ! [ 0u8 ; needle. len( ) ] ;
636690 let mut k = 0u8 ;
637691 for i in 1 ..needle. len ( ) {
638- while k > 0 && needle[ usize:: from ( k) ] != needle[ i] {
692+ while k > 0 && ! pattern_eq ( needle[ usize:: from ( k) ] , needle[ i] ) {
639693 k = failure[ usize:: from ( k) - 1 ] ;
640694 }
641- if needle[ usize:: from ( k) ] == needle[ i] {
695+ if pattern_eq ( needle[ usize:: from ( k) ] , needle[ i] ) {
642696 k += 1 ;
643697 }
644698 failure[ i] = k;
0 commit comments