Skip to content

Commit b013dad

Browse files
committed
fixup
Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent ef2445b commit b013dad

5 files changed

Lines changed: 81 additions & 39 deletions

File tree

_typos.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ extend-ignore-re = [
88
]
99

1010
[files]
11-
extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**", "vortex-sqllogictest/slt/**"]
11+
extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**", "vortex-sqllogictest/slt/**", "encodings/fsst/src/dfa/tests.rs"]
1212

1313
[type.py]
1414
extend-ignore-identifiers-re = [

encodings/fsst/src/compute/like.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,25 @@ impl LikeKernel for FSST {
3333
return Ok(None);
3434
}
3535

36-
let Some(pattern_str) = pattern_scalar.as_utf8().value() else {
36+
let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
37+
let Some(v) = s.value() else {
38+
return Ok(None);
39+
};
40+
v.as_ref()
41+
} else if let Some(b) = pattern_scalar.as_binary_opt() {
42+
let Some(v) = b.value() else {
43+
return Ok(None);
44+
};
45+
v
46+
} else {
3747
return Ok(None);
3848
};
3949

4050
let symbols = array.symbols();
4151
let symbol_lengths = array.symbol_lengths();
4252

4353
let Some(matcher) =
44-
FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_str)?
54+
FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
4555
else {
4656
return Ok(None);
4757
};
@@ -73,9 +83,6 @@ impl LikeKernel for FSST {
7383
mod tests {
7484
use std::sync::LazyLock;
7585

76-
use rand::Rng;
77-
use rand::SeedableRng;
78-
use rand::rngs::StdRng;
7986
use vortex_array::Canonical;
8087
use vortex_array::IntoArray;
8188
use vortex_array::VortexSessionExecute;

encodings/fsst/src/dfa/flat_contains.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
//!
1313
//! ### Step 1: KMP (Knuth–Morris–Pratt) byte-level transition table
1414
//!
15-
//! See: https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm
15+
//! See: <https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm>
1616
//!
1717
//! Build a `(state × byte) → state` table using the KMP failure function.
1818
//! States 0..2 track match progress, state 3 is accept (sticky).

encodings/fsst/src/dfa/mod.rs

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -163,23 +163,21 @@ impl FsstMatcher {
163163
pub(crate) fn try_new(
164164
symbols: &[Symbol],
165165
symbol_lengths: &[u8],
166-
pattern: &str,
166+
pattern: &[u8],
167167
) -> VortexResult<Option<Self>> {
168168
let Some(like_kind) = LikeKind::parse(pattern) else {
169169
return Ok(None);
170170
};
171171

172172
let inner = match like_kind {
173-
LikeKind::Prefix("") | LikeKind::Contains("") => MatcherInner::MatchAll,
173+
LikeKind::Prefix(b"") | LikeKind::Contains(b"") => MatcherInner::MatchAll,
174174
LikeKind::Prefix(prefix) => {
175-
let prefix = prefix.as_bytes();
176175
if prefix.len() > FlatPrefixDfa::MAX_PREFIX_LEN {
177176
return Ok(None);
178177
}
179178
MatcherInner::Prefix(FlatPrefixDfa::new(symbols, symbol_lengths, prefix)?)
180179
}
181180
LikeKind::Contains(needle) => {
182-
let needle = needle.as_bytes();
183181
if needle.len() > FlatContainsDfa::MAX_NEEDLE_LEN {
184182
return Ok(None);
185183
}
@@ -203,23 +201,24 @@ impl FsstMatcher {
203201
/// The subset of LIKE patterns we can handle without decompression.
204202
enum LikeKind<'a> {
205203
/// `prefix%`
206-
Prefix(&'a str),
204+
Prefix(&'a [u8]),
207205
/// `%needle%`
208-
Contains(&'a str),
206+
Contains(&'a [u8]),
209207
}
210208

211209
impl<'a> LikeKind<'a> {
212-
fn parse(pattern: &'a str) -> Option<Self> {
210+
fn parse(pattern: &'a [u8]) -> Option<Self> {
213211
// `prefix%` (including just `%` where prefix is empty)
214-
if let Some(prefix) = pattern.strip_suffix('%')
215-
&& !prefix.contains(['%', '_'])
212+
if let Some(prefix) = pattern.strip_suffix(&[b'%'])
213+
&& !prefix.contains(&b'%')
214+
&& !prefix.contains(&b'_')
216215
{
217216
return Some(LikeKind::Prefix(prefix));
218217
}
219218

220219
// `%needle%`
221-
let inner = pattern.strip_prefix('%')?.strip_suffix('%')?;
222-
if !inner.contains(['%', '_']) {
220+
let inner = pattern.strip_prefix(&[b'%'])?.strip_suffix(&[b'%'])?;
221+
if !inner.contains(&b'%') && !inner.contains(&b'_') {
223222
return Some(LikeKind::Contains(inner));
224223
}
225224

encodings/fsst/src/dfa/tests.rs

Lines changed: 57 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@ use std::sync::LazyLock;
66
use fsst::ESCAPE_CODE;
77
use fsst::Symbol;
88
use rstest::rstest;
9+
use vortex_array::ArrayRef;
910
use vortex_array::Canonical;
1011
use vortex_array::IntoArray;
1112
use vortex_array::VortexSessionExecute;
1213
use vortex_array::arrays::BoolArray;
1314
use vortex_array::arrays::ConstantArray;
1415
use vortex_array::arrays::VarBinArray;
16+
use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
1517
use vortex_array::assert_arrays_eq;
1618
use vortex_array::dtype::DType;
1719
use vortex_array::dtype::Nullability;
@@ -51,17 +53,17 @@ fn escaped(bytes: &[u8]) -> Vec<u8> {
5153
#[test]
5254
fn test_like_kind_parse() {
5355
assert!(matches!(
54-
LikeKind::parse("http%"),
55-
Some(LikeKind::Prefix("http"))
56+
LikeKind::parse(b"http%"),
57+
Some(LikeKind::Prefix(b"http"))
5658
));
5759
assert!(matches!(
58-
LikeKind::parse("%needle%"),
59-
Some(LikeKind::Contains("needle"))
60+
LikeKind::parse(b"%needle%"),
61+
Some(LikeKind::Contains(b"needle"))
6062
));
61-
assert!(matches!(LikeKind::parse("%"), Some(LikeKind::Prefix(""))));
63+
assert!(matches!(LikeKind::parse(b"%"), Some(LikeKind::Prefix(b""))));
6264
// Suffix and underscore patterns are not supported.
63-
assert!(LikeKind::parse("%suffix").is_none());
64-
assert!(LikeKind::parse("a_c").is_none());
65+
assert!(LikeKind::parse(b"%suffix").is_none());
66+
assert!(LikeKind::parse(b"a_c").is_none());
6567
}
6668

6769
/// No symbols — all bytes escaped. Simplest case to see the two tables.
@@ -144,7 +146,7 @@ fn test_prefix_dfa_longer() -> VortexResult<()> {
144146

145147
#[test]
146148
fn test_prefix_pushdown_len_13_with_escapes() {
147-
let matcher = FsstMatcher::try_new(&[], &[], "abcdefghijklm%")
149+
let matcher = FsstMatcher::try_new(&[], &[], b"abcdefghijklm%")
148150
.unwrap()
149151
.unwrap();
150152

@@ -156,7 +158,7 @@ fn test_prefix_pushdown_len_13_with_escapes() {
156158
fn test_prefix_pushdown_len_14_now_handled() {
157159
// 14-byte prefix is now handled by FlatPrefixDfa (was rejected by shift-packed).
158160
assert!(
159-
FsstMatcher::try_new(&[], &[], "abcdefghijklmn%")
161+
FsstMatcher::try_new(&[], &[], b"abcdefghijklmn%")
160162
.unwrap()
161163
.is_some()
162164
);
@@ -166,7 +168,7 @@ fn test_prefix_pushdown_len_14_now_handled() {
166168
fn test_prefix_pushdown_long_prefix() -> VortexResult<()> {
167169
let prefix = "a".repeat(FlatPrefixDfa::MAX_PREFIX_LEN);
168170
let pattern = format!("{prefix}%");
169-
let matcher = FsstMatcher::try_new(&[], &[], &pattern)?.unwrap();
171+
let matcher = FsstMatcher::try_new(&[], &[], pattern.as_bytes())?.unwrap();
170172

171173
assert!(matcher.matches(&escaped(prefix.as_bytes())));
172174

@@ -182,14 +184,20 @@ fn test_prefix_pushdown_rejects_len_254() {
182184
debug_assert_eq!(FlatPrefixDfa::MAX_PREFIX_LEN, 253);
183185
let prefix = "a".repeat(254);
184186
let pattern = format!("{prefix}%");
185-
assert!(FsstMatcher::try_new(&[], &[], &pattern).unwrap().is_none());
187+
assert!(
188+
FsstMatcher::try_new(&[], &[], pattern.as_bytes())
189+
.unwrap()
190+
.is_none()
191+
);
186192
}
187193

188194
#[test]
189195
fn test_contains_pushdown_len_254_with_escapes() {
190196
let needle = "a".repeat(FlatContainsDfa::MAX_NEEDLE_LEN);
191197
let pattern = format!("%{needle}%");
192-
let matcher = FsstMatcher::try_new(&[], &[], &pattern).unwrap().unwrap();
198+
let matcher = FsstMatcher::try_new(&[], &[], pattern.as_bytes())
199+
.unwrap()
200+
.unwrap();
193201

194202
assert!(matcher.matches(&escaped(needle.as_bytes())));
195203

@@ -202,14 +210,18 @@ fn test_contains_pushdown_len_254_with_escapes() {
202210
fn test_contains_pushdown_rejects_len_255() {
203211
let needle = "a".repeat(FlatContainsDfa::MAX_NEEDLE_LEN + 1);
204212
let pattern = format!("%{needle}%");
205-
assert!(FsstMatcher::try_new(&[], &[], &pattern).unwrap().is_none());
213+
assert!(
214+
FsstMatcher::try_new(&[], &[], pattern.as_bytes())
215+
.unwrap()
216+
.is_none()
217+
);
206218
}
207219

208220
// ---------------------------------------------------------------------------
209221
// End-to-end edge cases: FSST compress → LIKE → compare booleans
210222
// ---------------------------------------------------------------------------
211223

212-
fn make_fsst(strings: &[Option<&str>]) -> FSSTArray {
224+
fn make_fsst_str(strings: &[Option<&str>]) -> FSSTArray {
213225
let varbin = VarBinArray::from_iter(
214226
strings.iter().copied(),
215227
DType::Utf8(Nullability::NonNullable),
@@ -218,13 +230,9 @@ fn make_fsst(strings: &[Option<&str>]) -> FSSTArray {
218230
fsst_compress(varbin, &compressor)
219231
}
220232

221-
fn run_like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
222-
use vortex_array::ArrayRef;
223-
use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
224-
233+
fn run_like(array: FSSTArray, pattern_arr: ArrayRef) -> VortexResult<BoolArray> {
225234
let len = array.len();
226235
let arr: ArrayRef = array.into_array();
227-
let pattern_arr = ConstantArray::new(pattern, len).into_array();
228236
let result = Like
229237
.try_new_array(len, LikeOptions::default(), [arr, pattern_arr])?
230238
.into_array()
@@ -267,14 +275,42 @@ fn run_like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
267275
// Prefix that shares chars with rest of string
268276
#[case(&["abab", "abba", "abcd"], "ab%", &[true, true, true])]
269277
#[case(&["abab", "abba", "abcd", "ba"], "ab%", &[true, true, true, false])]
278+
// The string "aabaabaabaab" requires multi-level KMP fallback at the 'a' after "aabaabaab"
279+
#[case(&["aabaabaabaab", "aabaabaax", "xaabaabaab"], "%aabaabaab%", &[true, false, true])]
280+
#[case(&["café latte", "naïve approach", "café noir"], "café%", &[true, false, true])]
281+
#[case(&["日本語テスト", "日本語データ", "英語テスト"], "%日本語%", &[true, true, false])]
282+
// 10-byte needle, contains: match at start, middle, end, exact, and near-miss
283+
#[case(
284+
&["abcdefghijxxx", "xxxabcdefghij", "xxabcdefghijxx", "abcdefghij", "abcdefghxx"],
285+
"%abcdefghij%",
286+
&[true, true, true, true, false]
287+
)]
288+
// 10-byte prefix: same needle but anchored at the start of the string
289+
#[case(
290+
&["abcdefghijxxx", "abcdefghij", "xabcdefghij", "abcdefghxx"],
291+
"abcdefghij%",
292+
&[true, true, false, false]
293+
)]
294+
// 9-byte needle with KMP-relevant overlap ("abcabcabc"):
295+
// failure table = [0,0,0,1,2,3,4,5,6], so a partial match of "abcabcab"
296+
// followed by a mismatch must fall back to state 5 ("abcab"), not restart.
297+
// This exercises multi-level KMP backtracking across symbol boundaries.
298+
#[case(
299+
&["xxabcabcabcxx", "abcabcabc", "abcabcabx", "abcabcxx"],
300+
"%abcabcabc%",
301+
&[true, true, false, false]
302+
)]
270303
fn test_like_edge_cases(
271304
#[case] strings: &[&str],
272305
#[case] pattern: &str,
273306
#[case] expected: &[bool],
274307
) -> VortexResult<()> {
275308
let opts: Vec<Option<&str>> = strings.iter().map(|s| Some(*s)).collect();
276-
let fsst = make_fsst(&opts);
277-
let result = run_like(fsst, pattern)?;
309+
let fsst_arr = make_fsst_str(&opts);
310+
let result = run_like(
311+
fsst_arr,
312+
ConstantArray::new(pattern, opts.len()).into_array(),
313+
)?;
278314
let expected_arr = BoolArray::from_iter(expected.iter().copied());
279315
assert_arrays_eq!(&result, &expected_arr);
280316
Ok(())

0 commit comments

Comments
 (0)