@@ -6,12 +6,14 @@ use std::sync::LazyLock;
66use fsst:: ESCAPE_CODE ;
77use fsst:: Symbol ;
88use rstest:: rstest;
9+ use vortex_array:: ArrayRef ;
910use vortex_array:: Canonical ;
1011use vortex_array:: IntoArray ;
1112use vortex_array:: VortexSessionExecute ;
1213use vortex_array:: arrays:: BoolArray ;
1314use vortex_array:: arrays:: ConstantArray ;
1415use vortex_array:: arrays:: VarBinArray ;
16+ use vortex_array:: arrays:: scalar_fn:: ScalarFnArrayExt ;
1517use vortex_array:: assert_arrays_eq;
1618use vortex_array:: dtype:: DType ;
1719use vortex_array:: dtype:: Nullability ;
@@ -51,17 +53,17 @@ fn escaped(bytes: &[u8]) -> Vec<u8> {
/// Checks how `LikeKind::parse` classifies byte-string LIKE patterns.
///
/// * A trailing `%` (`b"http%"`) parses to `LikeKind::Prefix` carrying the
///   bytes before the wildcard.
/// * A pattern wrapped in `%` (`b"%needle%"`) parses to `LikeKind::Contains`
///   carrying the inner bytes.
/// * A bare `b"%"` parses to `LikeKind::Prefix` with an empty prefix.
/// * Suffix patterns and patterns containing `_` yield `None`.
#[test]
fn test_like_kind_parse() {
    // Prefix pattern: literal bytes followed by a single trailing wildcard.
    assert!(matches!(
        LikeKind::parse(b"http%"),
        Some(LikeKind::Prefix(b"http"))
    ));
    // Contains pattern: literal bytes surrounded by wildcards.
    assert!(matches!(
        LikeKind::parse(b"%needle%"),
        Some(LikeKind::Contains(b"needle"))
    ));
    // A lone wildcard is classified as a prefix match on the empty prefix.
    assert!(matches!(LikeKind::parse(b"%"), Some(LikeKind::Prefix(b""))));
    // Suffix and underscore patterns are not supported.
    assert!(LikeKind::parse(b"%suffix").is_none());
    assert!(LikeKind::parse(b"a_c").is_none());
}
6668
6769/// No symbols — all bytes escaped. Simplest case to see the two tables.
@@ -144,7 +146,7 @@ fn test_prefix_dfa_longer() -> VortexResult<()> {
144146
145147#[ test]
146148fn test_prefix_pushdown_len_13_with_escapes ( ) {
147- let matcher = FsstMatcher :: try_new ( & [ ] , & [ ] , "abcdefghijklm%" )
149+ let matcher = FsstMatcher :: try_new ( & [ ] , & [ ] , b "abcdefghijklm%")
148150 . unwrap ( )
149151 . unwrap ( ) ;
150152
@@ -156,7 +158,7 @@ fn test_prefix_pushdown_len_13_with_escapes() {
156158fn test_prefix_pushdown_len_14_now_handled ( ) {
157159 // 14-byte prefix is now handled by FlatPrefixDfa (was rejected by shift-packed).
158160 assert ! (
159- FsstMatcher :: try_new( & [ ] , & [ ] , "abcdefghijklmn%" )
161+ FsstMatcher :: try_new( & [ ] , & [ ] , b "abcdefghijklmn%")
160162 . unwrap( )
161163 . is_some( )
162164 ) ;
@@ -166,7 +168,7 @@ fn test_prefix_pushdown_len_14_now_handled() {
166168fn test_prefix_pushdown_long_prefix ( ) -> VortexResult < ( ) > {
167169 let prefix = "a" . repeat ( FlatPrefixDfa :: MAX_PREFIX_LEN ) ;
168170 let pattern = format ! ( "{prefix}%" ) ;
169- let matcher = FsstMatcher :: try_new ( & [ ] , & [ ] , & pattern) ?. unwrap ( ) ;
171+ let matcher = FsstMatcher :: try_new ( & [ ] , & [ ] , pattern. as_bytes ( ) ) ?. unwrap ( ) ;
170172
171173 assert ! ( matcher. matches( & escaped( prefix. as_bytes( ) ) ) ) ;
172174
@@ -182,14 +184,20 @@ fn test_prefix_pushdown_rejects_len_254() {
182184 debug_assert_eq ! ( FlatPrefixDfa :: MAX_PREFIX_LEN , 253 ) ;
183185 let prefix = "a" . repeat ( 254 ) ;
184186 let pattern = format ! ( "{prefix}%" ) ;
185- assert ! ( FsstMatcher :: try_new( & [ ] , & [ ] , & pattern) . unwrap( ) . is_none( ) ) ;
187+ assert ! (
188+ FsstMatcher :: try_new( & [ ] , & [ ] , pattern. as_bytes( ) )
189+ . unwrap( )
190+ . is_none( )
191+ ) ;
186192}
187193
188194#[ test]
189195fn test_contains_pushdown_len_254_with_escapes ( ) {
190196 let needle = "a" . repeat ( FlatContainsDfa :: MAX_NEEDLE_LEN ) ;
191197 let pattern = format ! ( "%{needle}%" ) ;
192- let matcher = FsstMatcher :: try_new ( & [ ] , & [ ] , & pattern) . unwrap ( ) . unwrap ( ) ;
198+ let matcher = FsstMatcher :: try_new ( & [ ] , & [ ] , pattern. as_bytes ( ) )
199+ . unwrap ( )
200+ . unwrap ( ) ;
193201
194202 assert ! ( matcher. matches( & escaped( needle. as_bytes( ) ) ) ) ;
195203
@@ -202,14 +210,18 @@ fn test_contains_pushdown_len_254_with_escapes() {
202210fn test_contains_pushdown_rejects_len_255 ( ) {
203211 let needle = "a" . repeat ( FlatContainsDfa :: MAX_NEEDLE_LEN + 1 ) ;
204212 let pattern = format ! ( "%{needle}%" ) ;
205- assert ! ( FsstMatcher :: try_new( & [ ] , & [ ] , & pattern) . unwrap( ) . is_none( ) ) ;
213+ assert ! (
214+ FsstMatcher :: try_new( & [ ] , & [ ] , pattern. as_bytes( ) )
215+ . unwrap( )
216+ . is_none( )
217+ ) ;
206218}
207219
208220// ---------------------------------------------------------------------------
209221// End-to-end edge cases: FSST compress → LIKE → compare booleans
210222// ---------------------------------------------------------------------------
211223
212- fn make_fsst ( strings : & [ Option < & str > ] ) -> FSSTArray {
224+ fn make_fsst_str ( strings : & [ Option < & str > ] ) -> FSSTArray {
213225 let varbin = VarBinArray :: from_iter (
214226 strings. iter ( ) . copied ( ) ,
215227 DType :: Utf8 ( Nullability :: NonNullable ) ,
@@ -218,13 +230,9 @@ fn make_fsst(strings: &[Option<&str>]) -> FSSTArray {
218230 fsst_compress ( varbin, & compressor)
219231}
220232
221- fn run_like ( array : FSSTArray , pattern : & str ) -> VortexResult < BoolArray > {
222- use vortex_array:: ArrayRef ;
223- use vortex_array:: arrays:: scalar_fn:: ScalarFnArrayExt ;
224-
233+ fn run_like ( array : FSSTArray , pattern_arr : ArrayRef ) -> VortexResult < BoolArray > {
225234 let len = array. len ( ) ;
226235 let arr: ArrayRef = array. into_array ( ) ;
227- let pattern_arr = ConstantArray :: new ( pattern, len) . into_array ( ) ;
228236 let result = Like
229237 . try_new_array ( len, LikeOptions :: default ( ) , [ arr, pattern_arr] ) ?
230238 . into_array ( )
@@ -267,14 +275,42 @@ fn run_like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
267275// Prefix that shares chars with rest of string
268276#[ case( & [ "abab" , "abba" , "abcd" ] , "ab%" , & [ true , true , true ] ) ]
269277#[ case( & [ "abab" , "abba" , "abcd" , "ba" ] , "ab%" , & [ true , true , true , false ] ) ]
278+ // The string "aabaabaabaab" requires multi-level KMP fallback at the 'a' after "aabaabaab"
279+ #[ case( & [ "aabaabaabaab" , "aabaabaax" , "xaabaabaab" ] , "%aabaabaab%" , & [ true , false , true ] ) ]
280+ #[ case( & [ "café latte" , "naïve approach" , "café noir" ] , "café%" , & [ true , false , true ] ) ]
281+ #[ case( & [ "日本語テスト" , "日本語データ" , "英語テスト" ] , "%日本語%" , & [ true , true , false ] ) ]
282+ // 10-byte needle, contains: match at start, middle, end, exact, and near-miss
283+ #[ case(
284+ & [ "abcdefghijxxx" , "xxxabcdefghij" , "xxabcdefghijxx" , "abcdefghij" , "abcdefghxx" ] ,
285+ "%abcdefghij%" ,
286+ & [ true , true , true , true , false ]
287+ ) ]
288+ // 10-byte prefix: same needle but anchored at the start of the string
289+ #[ case(
290+ & [ "abcdefghijxxx" , "abcdefghij" , "xabcdefghij" , "abcdefghxx" ] ,
291+ "abcdefghij%" ,
292+ & [ true , true , false , false ]
293+ ) ]
294+ // 9-byte needle with KMP-relevant overlap ("abcabcabc"):
295+ // failure table = [0,0,0,1,2,3,4,5,6], so a partial match of "abcabcab"
296+ // followed by a mismatch must fall back to state 5 ("abcab"), not restart.
297+ // This exercises multi-level KMP backtracking across symbol boundaries.
298+ #[ case(
299+ & [ "xxabcabcabcxx" , "abcabcabc" , "abcabcabx" , "abcabcxx" ] ,
300+ "%abcabcabc%" ,
301+ & [ true , true , false , false ]
302+ ) ]
270303fn test_like_edge_cases (
271304 #[ case] strings : & [ & str ] ,
272305 #[ case] pattern : & str ,
273306 #[ case] expected : & [ bool ] ,
274307) -> VortexResult < ( ) > {
275308 let opts: Vec < Option < & str > > = strings. iter ( ) . map ( |s| Some ( * s) ) . collect ( ) ;
276- let fsst = make_fsst ( & opts) ;
277- let result = run_like ( fsst, pattern) ?;
309+ let fsst_arr = make_fsst_str ( & opts) ;
310+ let result = run_like (
311+ fsst_arr,
312+ ConstantArray :: new ( pattern, opts. len ( ) ) . into_array ( ) ,
313+ ) ?;
278314 let expected_arr = BoolArray :: from_iter ( expected. iter ( ) . copied ( ) ) ;
279315 assert_arrays_eq ! ( & result, & expected_arr) ;
280316 Ok ( ( ) )
0 commit comments