|
1 | 1 | // SPDX-License-Identifier: Apache-2.0 |
2 | 2 | // SPDX-FileCopyrightText: Copyright the Vortex contributors |
3 | 3 |
|
| 4 | +use std::sync::LazyLock; |
| 5 | + |
4 | 6 | use fsst::ESCAPE_CODE; |
5 | 7 | use fsst::Symbol; |
| 8 | +use rstest::rstest; |
| 9 | +use vortex_array::Canonical; |
| 10 | +use vortex_array::IntoArray; |
| 11 | +use vortex_array::VortexSessionExecute; |
| 12 | +use vortex_array::arrays::BoolArray; |
| 13 | +use vortex_array::arrays::ConstantArray; |
| 14 | +use vortex_array::arrays::VarBinArray; |
| 15 | +use vortex_array::assert_arrays_eq; |
| 16 | +use vortex_array::dtype::DType; |
| 17 | +use vortex_array::dtype::Nullability; |
| 18 | +use vortex_array::scalar_fn::fns::like::Like; |
| 19 | +use vortex_array::scalar_fn::fns::like::LikeOptions; |
| 20 | +use vortex_array::session::ArraySession; |
6 | 21 | use vortex_error::VortexResult; |
| 22 | +use vortex_session::VortexSession; |
7 | 23 |
|
8 | 24 | use super::FsstMatcher; |
9 | 25 | use super::LikeKind; |
10 | 26 | use super::flat_contains::FlatContainsDfa; |
11 | 27 | use super::prefix::FlatPrefixDfa; |
| 28 | +use crate::FSSTArray; |
| 29 | +use crate::fsst_compress; |
| 30 | +use crate::fsst_train_compressor; |
| 31 | + |
| 32 | +static SESSION: LazyLock<VortexSession> = |
| 33 | + LazyLock::new(|| VortexSession::empty().with::<ArraySession>()); |
12 | 34 |
|
13 | 35 | /// Helper: make a Symbol from a byte string (up to 8 bytes, zero-padded). |
14 | 36 | fn sym(bytes: &[u8]) -> Symbol { |
@@ -182,3 +204,76 @@ fn test_contains_pushdown_rejects_len_255() { |
182 | 204 | let pattern = format!("%{needle}%"); |
183 | 205 | assert!(FsstMatcher::try_new(&[], &[], &pattern).unwrap().is_none()); |
184 | 206 | } |
| 207 | + |
| 208 | +// --------------------------------------------------------------------------- |
| 209 | +// End-to-end edge cases: FSST compress → LIKE → compare booleans |
| 210 | +// --------------------------------------------------------------------------- |
| 211 | + |
| 212 | +fn make_fsst(strings: &[Option<&str>]) -> FSSTArray { |
| 213 | + let varbin = VarBinArray::from_iter( |
| 214 | + strings.iter().copied(), |
| 215 | + DType::Utf8(Nullability::NonNullable), |
| 216 | + ); |
| 217 | + let compressor = fsst_train_compressor(&varbin); |
| 218 | + fsst_compress(varbin, &compressor) |
| 219 | +} |
| 220 | + |
| 221 | +fn run_like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> { |
| 222 | + use vortex_array::ArrayRef; |
| 223 | + use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; |
| 224 | + |
| 225 | + let len = array.len(); |
| 226 | + let arr: ArrayRef = array.into_array(); |
| 227 | + let pattern_arr = ConstantArray::new(pattern, len).into_array(); |
| 228 | + let result = Like |
| 229 | + .try_new_array(len, LikeOptions::default(), [arr, pattern_arr])? |
| 230 | + .into_array() |
| 231 | + .execute::<Canonical>(&mut SESSION.create_execution_ctx())?; |
| 232 | + Ok(result.into_bool()) |
| 233 | +} |
| 234 | + |
| 235 | +#[rstest] |
| 236 | +// Empty strings |
| 237 | +#[case(&[""], "aaaa%", &[false])] |
| 238 | +#[case(&[""], "%aaaa%", &[false])] |
| 239 | +#[case(&[""], "%", &[true])] |
| 240 | +#[case(&["", "", ""], "%", &[true, true, true])] |
| 241 | +// Single-char patterns |
| 242 | +#[case(&["a", "b", ""], "a%", &[true, false, false])] |
| 243 | +#[case(&["a", "b", ""], "%a%", &[true, false, false])] |
| 244 | +// Needle longer than every input string |
| 245 | +#[case(&["ab", "abc", ""], "%abcd%", &[false, false, false])] |
| 246 | +#[case(&["ab", "abc", ""], "abcd%", &[false, false, false])] |
| 247 | +// Exact match (prefix pattern = entire string + %) |
| 248 | +#[case(&["abc", "abcd", "ab"], "abc%", &[true, true, false])] |
| 249 | +#[case(&["abc", "abcd", "ab"], "%abc%", &[true, true, false])] |
| 250 | +// Repeated characters — KMP overlap |
| 251 | +#[case(&["aa", "aaa", "aaaa", "aba"], "%aaa%", &[false, true, true, false])] |
| 252 | +#[case(&["aab", "aaab", "a"], "aaa%", &[false, true, false])] |
| 253 | +// Needle at different positions |
| 254 | +#[case(&["xxabcyy", "abcyy", "xxabc", "abc", "xabx"], "%abc%", &[true, true, true, true, false])] |
| 255 | +// All identical strings |
| 256 | +#[case(&["aaa", "aaa", "aaa"], "%aaa%", &[true, true, true])] |
| 257 | +#[case(&["aaa", "aaa", "aaa"], "bbb%", &[false, false, false])] |
| 258 | +// Single element arrays |
| 259 | +#[case(&["hello"], "hello%", &[true])] |
| 260 | +#[case(&["hello"], "hellx%", &[false])] |
| 261 | +#[case(&["hello"], "%ello%", &[true])] |
| 262 | +#[case(&["hello"], "%ellx%", &[false])] |
| 263 | +// Overlapping KMP pattern "abab" |
| 264 | +#[case(&["ababab", "abab", "aba", "xababx"], "%abab%", &[true, true, false, true])] |
| 265 | +// Prefix that shares chars with rest of string |
| 266 | +#[case(&["abab", "abba", "abcd"], "ab%", &[true, true, true])] |
| 267 | +#[case(&["abab", "abba", "abcd", "ba"], "ab%", &[true, true, true, false])] |
| 268 | +fn test_like_edge_cases( |
| 269 | + #[case] strings: &[&str], |
| 270 | + #[case] pattern: &str, |
| 271 | + #[case] expected: &[bool], |
| 272 | +) -> VortexResult<()> { |
| 273 | + let opts: Vec<Option<&str>> = strings.iter().map(|s| Some(*s)).collect(); |
| 274 | + let fsst = make_fsst(&opts); |
| 275 | + let result = run_like(fsst, pattern)?; |
| 276 | + let expected_arr = BoolArray::from_iter(expected.iter().copied()); |
| 277 | + assert_arrays_eq!(&result, &expected_arr); |
| 278 | + Ok(()) |
| 279 | +} |
0 commit comments