55//! everything else returns `None` so the caller decompresses + runs the
66//! scalar `LIKE` on the canonical bytes.
77//!
8- //! * `'literal'` — token-aware equality. LPM-tokenise the literal once
8+ //! * `'literal'` — token-aware equality ( LPM-tokenise the literal once
99//! and compare the row's `codes[lo..hi]` against the tokenised needle
10- //! as `&[u16]`. Full byte equality is exactly equivalent to full LPM
11- //! token-sequence equality, so this is sound and skips row decode
12- //! entirely.
13- //! * `'prefix%'` — byte-streaming via `DecodeView::for_each_dict_slice`
14- //! with a single length check up front. The naive "tokenise the
15- //! prefix and compare token prefix" trick is **wrong** because the
16- //! LPM of the row's leading bytes may extend its last token past the
17- //! literal prefix's tokenisation boundary. Streaming dict slices and
18- //! comparing prefix-wise is the correct minimum-work option.
19- //! * `'%substring%'` — decode each row into a small reusable scratch
20- //! buffer and run `memchr::memmem::Finder::find`, which is SIMD-
21- //! accelerated (SSE2/AVX2 on x86_64, NEON on aarch64) and Two-Way
22- //! underneath. The `Finder` is built once per kernel call and reused
23- //! across every row.
10+ //! as `&[u16]`). No row decode.
11+ //! * `'prefix%'` — OnPair-style [`PrefixAutomaton`][crate::dfa::PrefixAutomaton]:
12+ //! tokenise the prefix and precompute valid-divergence intervals for
13+ //! each query position. Per-row scan is `≤ q + 1` `u16` comparisons
14+ //! plus one interval check; no decode at all in the hot path.
15+ //! * `'%substring%'` — dict-bloom skip + `memchr::memmem` over the
16+ //! decoded row only when needed.
17+ //! [`ContainsBloom`][crate::dfa::ContainsBloom] precomputes "this
18+ //! dict entry contains the substring" and "some suffix of this entry
19+ //! could start a cross-token match". Most rows resolve via the bloom
20+ //! without touching `dict_bytes`; the rest fall through to a
21+ //! scratch-buffer decode + memmem.
2422//!
2523//! Escapes (`\\`), single-character wildcards (`_`), mid-pattern
2624//! wildcards, and `case_insensitive: true` all bail out with `None`.
@@ -40,6 +38,8 @@ use vortex_error::VortexResult;
4038use crate :: OnPair ;
4139use crate :: decode:: DecodeView ;
4240use crate :: decode:: OwnedDecodeInputs ;
41+ use crate :: dfa:: ContainsBloom ;
42+ use crate :: dfa:: PrefixAutomaton ;
4343use crate :: lpm:: DictIndex ;
4444use crate :: lpm:: tokenize_needle;
4545
@@ -110,28 +110,36 @@ impl LikeKernel for OnPair {
110110 if let Some ( needle_toks) = tokenize_needle ( & dv, & index, needle) {
111111 let codes = dv. codes ;
112112 let codes_offsets = dv. codes_offsets ;
113+ let needle_slice = needle_toks. as_slice ( ) ;
113114 for r in 0 ..n {
114115 let lo = codes_offsets[ r] as usize ;
115116 let hi = codes_offsets[ r + 1 ] as usize ;
116117 // SAFETY: codes_offsets validated at construction.
117118 let row_toks = unsafe { codes. get_unchecked ( lo..hi) } ;
118- if row_toks == needle_toks . as_slice ( ) {
119+ if row_toks == needle_slice {
119120 bytes[ r / 8 ] |= 1u8 << ( r % 8 ) ;
120121 }
121122 }
122123 }
123- // Else: needle has a byte not in the dict, no row matches.
124+ // Else: needle has a byte not in the dict ⇒ no row matches.
124125 }
125126 PatternShape :: StartsWith ( prefix) => {
126127 if prefix. is_empty ( ) {
127128 fill_all ( & mut bytes, n) ;
128- } else {
129+ } else if let Some ( automaton) = PrefixAutomaton :: build ( & dv, prefix) {
130+ let codes = dv. codes ;
131+ let codes_offsets = dv. codes_offsets ;
129132 for r in 0 ..n {
130- if row_starts_with ( & dv, r, prefix) {
133+ let lo = codes_offsets[ r] as usize ;
134+ let hi = codes_offsets[ r + 1 ] as usize ;
135+ // SAFETY: codes_offsets validated at construction.
136+ let row_toks = unsafe { codes. get_unchecked ( lo..hi) } ;
137+ if automaton. matches ( row_toks) {
131138 bytes[ r / 8 ] |= 1u8 << ( r % 8 ) ;
132139 }
133140 }
134141 }
142+ // Else: prefix has a byte not in the dict ⇒ no row matches.
135143 }
136144 PatternShape :: Contains ( sub) => {
137145 if sub. is_empty ( ) {
@@ -154,48 +162,27 @@ impl LikeKernel for OnPair {
154162 }
155163}
156164
157- /// `LIKE 'prefix%'` — byte-stream the row's dict slices, comparing
158- /// against `prefix` and short-circuiting on the first mismatch or once
159- /// the prefix is satisfied.
160- fn row_starts_with ( dv : & DecodeView < ' _ > , r : usize , prefix : & [ u8 ] ) -> bool {
161- let mut pos = 0usize ;
162- let mut matched = false ;
163- let plen = prefix. len ( ) ;
164- let prefix_ptr = prefix. as_ptr ( ) ;
165- dv. for_each_dict_slice ( r, |slice| {
166- let remaining = plen - pos;
167- let take = slice. len ( ) . min ( remaining) ;
168- // SAFETY: `pos + take <= plen` because `take <= remaining`,
169- // and `take <= slice.len()` by construction.
170- let eq = unsafe {
171- let lhs = std:: slice:: from_raw_parts ( prefix_ptr. add ( pos) , take) ;
172- let rhs = slice. get_unchecked ( ..take) ;
173- lhs == rhs
174- } ;
175- if !eq {
176- return false ;
177- }
178- pos += take;
179- if pos == plen {
180- matched = true ;
181- return false ; // short-circuit, prefix satisfied
182- }
183- true
184- } ) ;
185- matched
186- }
187-
188- /// `%substring%` pushdown via SIMD-accelerated `memmem`. The `Finder`
189- /// is built once and reused across every row's decoded bytes; the
190- /// scratch buffer is reused too so each row decode reuses the same
191- /// allocation.
165+ /// `%substring%` pushdown: dict-bloom skip + per-row decode + memmem.
192166fn contains_into_bitmap ( dv : & DecodeView < ' _ > , sub : & [ u8 ] , n : usize , out : & mut [ u8 ] ) {
167+ let bloom = ContainsBloom :: build ( dv, sub) ;
193168 let finder = memmem:: Finder :: new ( sub) ;
194169 let mut scratch: Vec < u8 > = Vec :: with_capacity ( 64 ) ;
170+ let codes = dv. codes ;
171+ let codes_offsets = dv. codes_offsets ;
195172 for r in 0 ..n {
196- scratch. clear ( ) ;
197- dv. decode_row_into ( r, & mut scratch) ;
198- if finder. find ( & scratch) . is_some ( ) {
173+ let lo = codes_offsets[ r] as usize ;
174+ let hi = codes_offsets[ r + 1 ] as usize ;
175+ // SAFETY: codes_offsets validated at construction.
176+ let row_toks = unsafe { codes. get_unchecked ( lo..hi) } ;
177+ let hit = match bloom. classify ( row_toks) {
178+ Some ( b) => b,
179+ None => {
180+ scratch. clear ( ) ;
181+ dv. decode_row_into ( r, & mut scratch) ;
182+ finder. find ( & scratch) . is_some ( )
183+ }
184+ } ;
185+ if hit {
199186 out[ r / 8 ] |= 1u8 << ( r % 8 ) ;
200187 }
201188 }
0 commit comments