Skip to content

Commit 0e99e3a

Browse files
authored
improve LIKE regex (#6145)
1 parent bd1e76b commit 0e99e3a

1 file changed

Lines changed: 57 additions & 26 deletions

File tree

arrow-string/src/predicate.rs

Lines changed: 57 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -140,39 +140,54 @@ fn ends_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool {
140140

141141
/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does:
142142
///
143-
/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.`
144-
/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.`
145-
/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%`
143+
/// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern,
144+
/// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`)
145+
/// 2. Replace `LIKE` single-character wildcards `_` => `.`
146+
/// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.`
147+
/// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. `\\%` => `%`
146148
fn regex_like(pattern: &str, case_insensitive: bool) -> Result<Regex, ArrowError> {
147149
let mut result = String::with_capacity(pattern.len() * 2);
148-
result.push('^');
149150
let mut chars_iter = pattern.chars().peekable();
151+
match chars_iter.peek() {
152+
// if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*`
153+
Some('%') => {
154+
chars_iter.next();
155+
}
156+
_ => result.push('^'),
157+
};
158+
150159
while let Some(c) = chars_iter.next() {
151-
if c == '\\' {
152-
let next = chars_iter.peek();
153-
match next {
154-
Some(next) if is_like_pattern(*next) => {
155-
result.push(*next);
156-
// Skipping the next char as it is already appended
157-
chars_iter.next();
160+
match c {
161+
'\\' => {
162+
match chars_iter.peek() {
163+
Some(next) if is_like_pattern(*next) => {
164+
result.push(*next);
165+
// Skipping the next char as it is already appended
166+
chars_iter.next();
167+
}
168+
_ => {
169+
result.push('\\');
170+
result.push('\\');
171+
}
158172
}
159-
_ => {
160-
result.push('\\');
173+
}
174+
'%' => result.push_str(".*"),
175+
'_' => result.push('.'),
176+
c => {
177+
if regex_syntax::is_meta_character(c) {
161178
result.push('\\');
162179
}
180+
result.push(c);
163181
}
164-
} else if regex_syntax::is_meta_character(c) {
165-
result.push('\\');
166-
result.push(c);
167-
} else if c == '%' {
168-
result.push_str(".*");
169-
} else if c == '_' {
170-
result.push('.');
171-
} else {
172-
result.push(c);
173182
}
174183
}
175-
result.push('$');
184+
// instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex
185+
if result.ends_with(".*") {
186+
result.pop();
187+
result.pop();
188+
} else {
189+
result.push('$');
190+
}
176191
RegexBuilder::new(&result)
177192
.case_insensitive(case_insensitive)
178193
.dot_matches_new_line(true)
@@ -197,9 +212,25 @@ mod tests {
197212
use super::*;
198213

199214
#[test]
200-
fn test_replace_like_wildcards() {
201-
let a_eq = "_%";
202-
let expected = "^..*$";
215+
fn test_replace_start_end_percent() {
216+
let a_eq = "%foobar%";
217+
let expected = "foobar";
218+
let r = regex_like(a_eq, false).unwrap();
219+
assert_eq!(r.to_string(), expected);
220+
}
221+
222+
#[test]
223+
fn test_replace_middle_percent() {
224+
let a_eq = "foo%bar";
225+
let expected = "^foo.*bar$";
226+
let r = regex_like(a_eq, false).unwrap();
227+
assert_eq!(r.to_string(), expected);
228+
}
229+
230+
#[test]
231+
fn test_replace_underscore() {
232+
let a_eq = "foo_bar";
233+
let expected = "^foo.bar$";
203234
let r = regex_like(a_eq, false).unwrap();
204235
assert_eq!(r.to_string(), expected);
205236
}

0 commit comments

Comments
 (0)