Skip to content

Commit b791a00

Browse files
committed
FSST LIKE: _ single-byte wildcard for prefix/suffix patterns
Extend the LikeKind parser and the KMP, prefix, and suffix byte-table construction to treat `_` (byte 0x5F) as the SQL single-byte wildcard.

Anchored shapes — `prefix%` and `%suffix` — gain wildcard support; each `_` position transitions on every byte instead of on one literal byte.

Unanchored shapes (`%contains%`, `%seg1%seg2%`) are still rejected when any `_` appears: KMP's failure function with wildcards is unsound (it treats `_` as symmetrically compatible with any pattern byte, producing false positives at the DFA level). A correct unanchored wildcard matcher needs NFA subset construction; tracked as a follow-up.

Changes:
- `dfa/mod.rs`: add `WILDCARD = b'_'`, `pattern_eq`, `pattern_matches_byte`. Update `kmp_byte_transitions` to fill the row (any byte advances) at wildcard positions; `kmp_failure_table` uses wildcard-aware pattern equality.
- `dfa/prefix.rs::build_prefix_byte_table`: fill the row at wildcard positions.
- `dfa/suffix.rs::build_suffix_byte_table`: same, for the backward-scanned suffix.
- `dfa/mod.rs::LikeKind::parse`: accept `_` in `Prefix` and `Suffix` variants; still reject in `Contains` / `MultiContains`.
- `needle_bytes_absent_from_all_symbols` skips wildcard positions when computing the literal-byte symbol overlap; the escape-only memmem fast path is gated on `needle_is_literal`.

Adds 6 wildcard tests covering prefix, suffix, multi-wildcard, leading-wildcard, symbol-interaction, and the deliberate contains-rejection. All 163 existing + new tests pass.

Signed-off-by: Claude <noreply@anthropic.com>
1 parent 970c092 commit b791a00

7 files changed

Lines changed: 198 additions & 39 deletions

File tree

encodings/fsst/src/dfa/flat_contains.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ impl FlatContainsDfa {
104104
// `all_bytes` whose length matches the encoded needle. Disabled
105105
// for L < 2 (no possible win over the existing scan path).
106106
let escape_only_pattern = (needle.len() >= 2
107+
&& super::needle_is_literal(needle)
107108
&& needle_bytes_absent_from_all_symbols(symbols, symbol_lengths, needle))
108109
.then(|| build_escape_only_encoded_pattern(needle));
109110

encodings/fsst/src/dfa/folded_contains.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -328,12 +328,13 @@ impl FoldedContainsDfa {
328328
});
329329

330330
// Escape-only fast path: when no symbol's expansion contains any
331-
// needle byte, the only DFA-accepting compressed sequence is the
332-
// 2L-byte pattern `[ESCAPE, needle[0], …, ESCAPE, needle[L-1]]`.
333-
// Only enable for L >= 2 — at L = 1 the encoded pattern is
334-
// identical to the existing escape_pair 2-byte memmem, so taking
335-
// a separate path would just add a redundant branch.
331+
// needle byte AND the needle has no `_` wildcards, the only
332+
// DFA-accepting compressed sequence is the 2L-byte pattern
333+
// `[ESCAPE, needle[0], …, ESCAPE, needle[L-1]]`. With wildcards
334+
// present the encoded pattern is no longer unique and the
335+
// memmem prefilter is disabled.
336336
let escape_only_pattern = (needle.len() >= 2
337+
&& super::needle_is_literal(needle)
337338
&& needle_bytes_absent_from_all_symbols(symbols, symbol_lengths, needle))
338339
.then(|| build_escape_only_encoded_pattern(needle));
339340

encodings/fsst/src/dfa/mod.rs

Lines changed: 81 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -327,29 +327,37 @@ enum LikeKind<'a> {
327327

328328
impl<'a> LikeKind<'a> {
329329
fn parse(pattern: &'a [u8]) -> Option<Self> {
330-
// `prefix%` (including just `%` where prefix is empty)
330+
// `prefix%` (including just `%` where prefix is empty).
331+
// `_` in the prefix is the single-byte wildcard (anchored from
332+
// the row start, no KMP fallback ambiguity).
331333
if let Some(prefix) = pattern.strip_suffix(b"%")
332334
&& !prefix.contains(&b'%')
333-
&& !prefix.contains(&b'_')
334335
{
335336
return Some(LikeKind::Prefix(prefix));
336337
}
337338

338-
// `%suffix` (no trailing %)
339+
// `%suffix` (no trailing %); `_` allowed in suffix (anchored
340+
// from the row end, scanned right-to-left, also wildcard-safe).
339341
if let Some(suffix) = pattern.strip_prefix(b"%")
340342
&& !suffix.contains(&b'%')
341-
&& !suffix.contains(&b'_')
342343
{
343344
return Some(LikeKind::Suffix(suffix));
344345
}
345346

346-
// `%needle%`
347+
// `%needle%`. We reject `_` in unanchored contains for now —
348+
// the symmetric KMP failure function over-approximates when
349+
// wildcards appear in the matched portion, producing false
350+
// positives. A correct unanchored wildcard matcher needs NFA
351+
// subset construction (or per-position sliding-window match);
352+
// tracked as a follow-up.
347353
let inner = pattern.strip_prefix(b"%")?.strip_suffix(b"%")?;
348354
if !inner.contains(&b'%') && !inner.contains(&b'_') {
349355
return Some(LikeKind::Contains(inner));
350356
}
351357

352-
// `%seg1%seg2%...%segN%`
358+
// `%seg1%seg2%...%segN%`. Same wildcard limitation: any
359+
// segment containing `_` falls through to the
360+
// decompression-based fallback.
353361
let segments: Vec<&[u8]> = inner
354362
.split(|&b| b == b'%')
355363
.filter(|s| !s.is_empty())
@@ -413,23 +421,30 @@ where
413421
// DFA construction helpers
414422
// ---------------------------------------------------------------------------
415423

416-
/// Returns `true` iff no byte of `needle` appears in any symbol's expansion.
424+
/// Returns `true` iff no literal byte of `needle` appears in any symbol's
425+
/// expansion. Wildcard (`_`) positions are skipped — they're allowed to
426+
/// match symbol bytes and don't constrain the prefilter.
417427
///
418-
/// When this holds, every needle byte in the decompressed stream must come
419-
/// from an `ESCAPE` pair, so the only compressed sequence that reaches the
420-
/// contains DFA's accept state from state 0 is exactly
428+
/// When this holds AND the needle has no wildcards, every needle byte in
429+
/// the decompressed stream must come from an `ESCAPE` pair, so the only
430+
/// compressed sequence that reaches the contains DFA's accept state
431+
/// from state 0 is exactly
421432
/// `[ESCAPE, needle[0], ESCAPE, needle[1], …, ESCAPE, needle[L-1]]`. The
422433
/// contains scan can then prefilter with a single `memmem` for that 2L-byte
423-
/// pattern, which is dramatically more selective than the 2-byte
424-
/// `(ESCAPE, needle[0])` anchor that the bucketed Teddy pair scan would
425-
/// otherwise use.
434+
/// pattern. For needles WITH wildcards, the same condition implies each
435+
/// literal byte must come from an escape pair, but wildcard bytes can
436+
/// come from anywhere — the encoded pattern is no longer unique, so
437+
/// the memmem prefilter is disabled.
426438
pub(super) fn needle_bytes_absent_from_all_symbols(
427439
symbols: &[Symbol],
428440
symbol_lengths: &[u8],
429441
needle: &[u8],
430442
) -> bool {
431443
let mut needle_byte_present = [false; 256];
432444
for &b in needle {
445+
if b == WILDCARD {
446+
continue;
447+
}
433448
needle_byte_present[usize::from(b)] = true;
434449
}
435450
debug_assert!(symbol_lengths.len() >= symbols.len());
@@ -447,7 +462,8 @@ pub(super) fn needle_bytes_absent_from_all_symbols(
447462

448463
/// Build the compressed pattern `[ESCAPE, needle[0], ESCAPE, needle[1], …,
449464
/// ESCAPE, needle[L-1]]`. Only meaningful when
450-
/// [`needle_bytes_absent_from_all_symbols`] is true.
465+
/// [`needle_bytes_absent_from_all_symbols`] is true AND the needle is
466+
/// wildcard-free.
451467
pub(super) fn build_escape_only_encoded_pattern(needle: &[u8]) -> Vec<u8> {
452468
let mut out = Vec::with_capacity(needle.len() * 2);
453469
for &b in needle {
@@ -457,6 +473,12 @@ pub(super) fn build_escape_only_encoded_pattern(needle: &[u8]) -> Vec<u8> {
457473
out
458474
}
459475

476+
/// `true` iff the needle has no `_` wildcard bytes.
477+
#[inline]
478+
pub(super) fn needle_is_literal(needle: &[u8]) -> bool {
479+
!needle.contains(&WILDCARD)
480+
}
481+
460482
/// Builds the per-symbol transition table for FSST symbols.
461483
///
462484
/// For each `(state, symbol_code)` pair, simulates feeding the symbol's bytes
@@ -581,20 +603,44 @@ fn build_fused_table(
581603
// KMP helpers
582604
// ---------------------------------------------------------------------------
583605

606+
/// The wildcard byte in a LIKE needle. SQL `_` (`0x5F`) is interpreted
607+
/// as "match any single byte" by [`kmp_byte_transitions`] and
608+
/// [`kmp_failure_table`]. Without SQL `ESCAPE` support, every `_`
609+
/// in the parsed needle is a wildcard; a literal `_` cannot be
610+
/// expressed.
611+
pub(super) const WILDCARD: u8 = b'_';
612+
613+
/// Pattern-position byte equality with wildcard semantics. Returns
614+
/// `true` if `a` or `b` is the [`WILDCARD`] byte, or both bytes are
615+
/// equal.
616+
#[inline]
617+
fn pattern_eq(a: u8, b: u8) -> bool {
618+
a == WILDCARD || b == WILDCARD || a == b
619+
}
620+
621+
/// Concrete-input byte match against a needle position. The pattern
622+
/// position `p` is one of the needle bytes (possibly the wildcard);
623+
/// the input byte `b` is always concrete (never a wildcard).
624+
#[inline]
625+
fn pattern_matches_byte(p: u8, b: u8) -> bool {
626+
p == WILDCARD || p == b
627+
}
628+
584629
/// Build the `(state × byte) → state` KMP transition table.
585630
///
586631
/// ## Construction
587632
///
588633
/// Uses the standard recurrence — for each non-accept state `s`:
589-
/// - On byte == `needle[s]`: transition to `s + 1`.
634+
/// - On byte == `needle[s]` (or `needle[s]` is the wildcard): transition to `s + 1`.
590635
/// - On any other byte: transition to whatever the *failure* row
591636
/// would give for the same byte, i.e. `table[failure[s-1] * 256 + b]`
592637
/// for `s > 0`, and `0` for `s = 0`.
593638
///
639+
/// When `needle[s]` is the [`WILDCARD`] byte (`_`), every input byte
640+
/// advances to `s + 1` regardless of the failure row's content.
641+
///
594642
/// This is one 256-byte memcpy + a single override per state, instead
595-
/// of running the KMP fallback loop at every cell. For an N=6 needle
596-
/// that's 6 × 256 = 1536 bytes copied + 6 overrides vs ~3500 iterative
597-
/// fallback steps previously — about 3× faster.
643+
/// of running the KMP fallback loop at every cell.
598644
fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
599645
let n_states = u8::try_from(needle.len() + 1)
600646
.vortex_expect("kmp_byte_transitions: must have needle.len() ≤ 255");
@@ -603,12 +649,16 @@ fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
603649

604650
let mut table = vec![0u8; usize::from(n_states) * 256];
605651

606-
// State 0: only `needle[0]` advances.
607-
if !needle.is_empty() {
608-
table[usize::from(needle[0])] = 1;
652+
// State 0: either `needle[0]` (literal) or every byte (wildcard) advances.
653+
if let Some(&first) = needle.first() {
654+
if first == WILDCARD {
655+
table[0..256].fill(1);
656+
} else {
657+
table[usize::from(first)] = 1;
658+
}
609659
}
610660

611-
// States 1..accept: each row is the failure-row plus one advance entry.
661+
// States 1..accept: each row is the failure-row plus the advance entry.
612662
for state in 1..accept {
613663
let s = usize::from(state);
614664
let fail_row = usize::from(failure[s - 1]) * 256;
@@ -617,14 +667,18 @@ fn kmp_byte_transitions(needle: &[u8]) -> Vec<u8> {
617667
// the KMP fallback eventually lands at the same place the
618668
// failure-state would land on that byte.
619669
table.copy_within(fail_row..fail_row + 256, state_row);
620-
// Override the one entry that advances.
621-
table[state_row + usize::from(needle[s])] = state + 1;
670+
// Override the advancing entries.
671+
if needle[s] == WILDCARD {
672+
// Wildcard at position s: every byte advances.
673+
table[state_row..state_row + 256].fill(state + 1);
674+
} else {
675+
table[state_row + usize::from(needle[s])] = state + 1;
676+
}
622677
}
623678

624679
// Accept state: sticky — every byte stays at accept.
625680
if usize::from(accept) < usize::from(n_states) {
626681
let accept_row = usize::from(accept) * 256;
627-
// SAFETY-ish: in-bounds writes to a pre-zeroed Vec.
628682
table[accept_row..accept_row + 256].fill(accept);
629683
}
630684

@@ -635,10 +689,10 @@ fn kmp_failure_table(needle: &[u8]) -> Vec<u8> {
635689
let mut failure = vec![0u8; needle.len()];
636690
let mut k = 0u8;
637691
for i in 1..needle.len() {
638-
while k > 0 && needle[usize::from(k)] != needle[i] {
692+
while k > 0 && !pattern_eq(needle[usize::from(k)], needle[i]) {
639693
k = failure[usize::from(k) - 1];
640694
}
641-
if needle[usize::from(k)] == needle[i] {
695+
if pattern_eq(needle[usize::from(k)], needle[i]) {
642696
k += 1;
643697
}
644698
failure[i] = k;

encodings/fsst/src/dfa/multi_contains.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,9 @@ fn compute_escape_only_anchor(
419419
return None;
420420
}
421421
let longest = segments.iter().max_by_key(|s| s.len())?;
422-
if longest.len() < 2 {
422+
if longest.len() < 2 || !super::needle_is_literal(longest) {
423+
// The encoded pattern is only well-defined when the longest
424+
// (anchor) segment is wildcard-free.
423425
return None;
424426
}
425427
// Union of every segment's bytes is what must be absent from symbols.

encodings/fsst/src/dfa/prefix.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -237,14 +237,20 @@ fn build_prefix_byte_table(prefix: &[u8], accept_state: u8, fail_state: u8) -> V
237237
table[s * 256 + byte] = accept_state;
238238
}
239239
} else if state != fail_state {
240-
// Only the correct next byte advances; everything else fails.
241-
let next_byte = prefix[s];
242240
let next_state = if s + 1 >= prefix.len() {
243241
accept_state
244242
} else {
245243
state + 1
246244
};
247-
table[s * 256 + usize::from(next_byte)] = next_state;
245+
if prefix[s] == super::WILDCARD {
246+
// Wildcard: every byte advances.
247+
for byte in 0..256 {
248+
table[s * 256 + byte] = next_state;
249+
}
250+
} else {
251+
// Only the literal byte advances; everything else fails.
252+
table[s * 256 + usize::from(prefix[s])] = next_state;
253+
}
248254
}
249255
}
250256
table

encodings/fsst/src/dfa/suffix.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,14 +209,21 @@ fn build_suffix_byte_table(suffix: &[u8], accept_state: u8, fail_state: u8) -> V
209209
}
210210
} else if state != fail_state {
211211
// State s: confirmed s bytes from the right. Next byte must be
212-
// suffix[suf_len - 1 - s] to advance.
212+
// suffix[suf_len - 1 - s] to advance — or any byte if that
213+
// pattern position is the `_` wildcard.
213214
let expected = suffix[suf_len - 1 - s];
214215
let next_state = if s + 1 >= suf_len {
215216
accept_state
216217
} else {
217218
state + 1
218219
};
219-
table[s * 256 + usize::from(expected)] = next_state;
220+
if expected == super::WILDCARD {
221+
for byte in 0..256 {
222+
table[s * 256 + byte] = next_state;
223+
}
224+
} else {
225+
table[s * 256 + usize::from(expected)] = next_state;
226+
}
220227
}
221228
// fail_state stays fail for all bytes (default)
222229
}

0 commit comments

Comments
 (0)