Skip to content

Commit c0dc571

Browse files
authored
fix: regex simplification of anchored patterns produces wrong results (apache#22727)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes apache#22726. ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> The regex simplification rule rewrites anchored regex matches (`^literal$`, `^(a|b)$`) into cheaper `=` / `IN` / `LIKE` expressions. Two bugs in that path: 1. The literal was always built as `Utf8` via `lit(...)`, so on a `Utf8View` / `LargeUtf8` column the rewritten comparison failed at execution with `Invalid comparison operation: Utf8View == Utf8`. 2. A `~*` (case-insensitive) anchored literal was rewritten to a case-sensitive `=`, silently dropping rows that differ only in case. ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> - Build the extracted literal with `string_scalar.to_expr(...)` so its type follows the column type (`Utf8` / `LargeUtf8` / `Utf8View`), consistent with the existing `LIKE` branches. - Rewrite `~*` anchored literals to `ILIKE` instead of `=`. The existing `is_safe_for_like` guard ensures the literal has no `%` / `_`, so this is an exact case-insensitive match. (Anchored alternations under `~*` still fall back to regex evaluation.) ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> Yes. `predicates.slt` now covers anchored `~` / `~*`, single literals and alternations, over both `Utf8` and `Utf8View` columns. Existing `regex.rs` unit tests still pass. ## Are there any user-facing changes? Yes, bug fixes only <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 8995ce6 commit c0dc571

2 files changed

Lines changed: 181 additions & 10 deletions

File tree

datafusion/optimizer/src/simplify_expressions/regex.rs

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -283,20 +283,23 @@ fn partial_anchored_literal_to_like(v: &[Hir]) -> Option<String> {
283283

284284
/// Extracts a string literal expression assuming that [`is_anchored_literal`]
285285
/// returned true.
286-
fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
286+
fn anchored_literal_to_expr(v: &[Hir], string_scalar: &StringScalar) -> Option<Expr> {
287287
match v.len() {
288-
2 => Some(lit("")),
288+
2 => Some(string_scalar.to_expr("")),
289289
3 => {
290290
let HirKind::Literal(l) = v[1].kind() else {
291291
return None;
292292
};
293-
like_str_from_literal(l).map(lit)
293+
like_str_from_literal(l).map(|s| string_scalar.to_expr(s))
294294
}
295295
_ => None,
296296
}
297297
}
298298

299-
fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
299+
fn anchored_alternation_to_exprs(
300+
v: &[Hir],
301+
string_scalar: &StringScalar,
302+
) -> Option<Vec<Expr>> {
300303
if 3 != v.len() {
301304
return None;
302305
}
@@ -308,7 +311,8 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
308311
for hir in alters {
309312
let mut is_safe = false;
310313
if let HirKind::Literal(l) = hir.kind()
311-
&& let Some(safe_literal) = str_from_literal(l).map(lit)
314+
&& let Some(safe_literal) =
315+
str_from_literal(l).map(|s| string_scalar.to_expr(s))
312316
{
313317
literals.push(safe_literal);
314318
is_safe = true;
@@ -321,7 +325,9 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
321325

322326
return Some(literals);
323327
} else if let HirKind::Literal(l) = sub.kind() {
324-
if let Some(safe_literal) = str_from_literal(l).map(lit) {
328+
if let Some(safe_literal) =
329+
str_from_literal(l).map(|s| string_scalar.to_expr(s))
330+
{
325331
return Some(vec![safe_literal]);
326332
}
327333
return None;
@@ -351,12 +357,18 @@ fn lower_simple(
351357
));
352358
}
353359
HirKind::Concat(inner) if is_anchored_literal(inner) => {
354-
return anchored_literal_to_expr(inner).map(|right| {
355-
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right))
360+
return anchored_literal_to_expr(inner, string_scalar).map(|right| {
361+
if mode.i {
362+
// Case-insensitive: use ILIKE for exact match (no wildcards)
363+
mode.expr(Box::new(left.clone()), Box::new(right))
364+
} else {
365+
// Case-sensitive: use Eq / NotEq
366+
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right))
367+
}
356368
});
357369
}
358-
HirKind::Concat(inner) if is_anchored_capture(inner) => {
359-
return anchored_alternation_to_exprs(inner)
370+
HirKind::Concat(inner) if !mode.i && is_anchored_capture(inner) => {
371+
return anchored_alternation_to_exprs(inner, string_scalar)
360372
.map(|right| left.clone().in_list(right, mode.not));
361373
}
362374
HirKind::Concat(inner) => {

datafusion/sqllogictest/test_files/predicates.slt

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,171 @@ SELECT * FROM test WHERE column1 ~ 'z'
204204
----
205205
Bazzz
206206

207+
query T
208+
SELECT * FROM test WHERE column1 ~ '^Bazzz$'
209+
----
210+
Bazzz
211+
212+
query T
213+
SELECT * FROM test WHERE column1 ~ '^(foo|Bazzz)$'
214+
----
215+
foo
216+
Bazzz
217+
218+
statement ok
219+
CREATE TABLE test_regex_utf8view(s VARCHAR) AS VALUES ('foo'), ('Bazzz');
220+
221+
statement ok
222+
set datafusion.explain.logical_plan_only = true
223+
224+
# `~` anchored literal -> `= Utf8View(..)`
225+
query TT
226+
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~ '^Bazzz$'
227+
----
228+
logical_plan
229+
01)Filter: test_regex_utf8view.s = Utf8View("Bazzz")
230+
02)--TableScan: test_regex_utf8view projection=[s]
231+
232+
# `~*` anchored literal -> `ILIKE Utf8View(..)`
233+
query TT
234+
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~* '^bazzz$'
235+
----
236+
logical_plan
237+
01)Filter: test_regex_utf8view.s ILIKE Utf8View("bazzz")
238+
02)--TableScan: test_regex_utf8view projection=[s]
239+
240+
# `~` anchored alternation -> OR of `= Utf8View(..)` comparisons.
241+
query TT
242+
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~ '^(foo|Bazzz)$'
243+
----
244+
logical_plan
245+
01)Filter: test_regex_utf8view.s = Utf8View("foo") OR test_regex_utf8view.s = Utf8View("Bazzz")
246+
02)--TableScan: test_regex_utf8view projection=[s]
247+
248+
# `~*` anchored alternation -> NOT simplified: it falls back to a regex match,
249+
# because `IN`/`=` cannot express case-insensitive matching.
250+
query TT
251+
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s ~* '^(foo|bazzz)$'
252+
----
253+
logical_plan
254+
01)Filter: test_regex_utf8view.s ~* Utf8View("^(foo|bazzz)$")
255+
02)--TableScan: test_regex_utf8view projection=[s]
256+
257+
# `!~` -> `!= Utf8View(..)`
258+
query TT
259+
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~ '^Bazzz$'
260+
----
261+
logical_plan
262+
01)Filter: test_regex_utf8view.s != Utf8View("Bazzz")
263+
02)--TableScan: test_regex_utf8view projection=[s]
264+
265+
# `!~*` -> `NOT ILIKE Utf8View(..)`
266+
query TT
267+
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~* '^bazzz$'
268+
----
269+
logical_plan
270+
01)Filter: test_regex_utf8view.s NOT ILIKE Utf8View("bazzz")
271+
02)--TableScan: test_regex_utf8view projection=[s]
272+
273+
# `!~` anchored alternation -> AND of `!= Utf8View(..)` comparisons.
274+
query TT
275+
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~ '^(foo|Bazzz)$'
276+
----
277+
logical_plan
278+
01)Filter: test_regex_utf8view.s != Utf8View("foo") AND test_regex_utf8view.s != Utf8View("Bazzz")
279+
02)--TableScan: test_regex_utf8view projection=[s]
280+
281+
# `!~*` anchored alternation -> NOT simplified: it falls back to a regex match,
282+
# same reason as the `~*` alternation above.
283+
query TT
284+
EXPLAIN SELECT * FROM test_regex_utf8view WHERE s !~* '^(foo|bazzz)$'
285+
----
286+
logical_plan
287+
01)Filter: test_regex_utf8view.s !~* Utf8View("^(foo|bazzz)$")
288+
02)--TableScan: test_regex_utf8view projection=[s]
289+
290+
statement ok
291+
set datafusion.explain.logical_plan_only = false
292+
293+
# Result assertions
294+
query T
295+
SELECT * FROM test_regex_utf8view WHERE s ~ '^Bazzz$'
296+
----
297+
Bazzz
298+
299+
query T
300+
SELECT * FROM test_regex_utf8view WHERE s ~ '^(foo|Bazzz)$'
301+
----
302+
foo
303+
Bazzz
304+
305+
# Case-insensitive anchored match over Utf8View: must be simplified to ILIKE
306+
# (not a case-sensitive Eq) and must keep operand types as Utf8View.
307+
query T
308+
SELECT * FROM test_regex_utf8view WHERE s ~* '^bazzz$'
309+
----
310+
Bazzz
311+
312+
# Case-insensitive anchored alternation over Utf8View
313+
query T rowsort
314+
SELECT * FROM test_regex_utf8view WHERE s ~* '^(foo|bazzz)$'
315+
----
316+
Bazzz
317+
foo
318+
319+
query T rowsort
320+
SELECT * FROM test_regex_utf8view WHERE s !~ '^Bazzz$'
321+
----
322+
foo
323+
324+
query T rowsort
325+
SELECT * FROM test_regex_utf8view WHERE s !~* '^bazzz$'
326+
----
327+
foo
328+
329+
# Both rows match the alternation, so the negated forms return nothing.
330+
query T rowsort
331+
SELECT * FROM test_regex_utf8view WHERE s !~ '^(foo|Bazzz)$'
332+
----
333+
334+
query T rowsort
335+
SELECT * FROM test_regex_utf8view WHERE s !~* '^(foo|bazzz)$'
336+
----
337+
338+
statement ok
339+
DROP TABLE test_regex_utf8view;
340+
207341
query T
208342
SELECT * FROM test WHERE column1 ~* 'z'
209343
----
210344
Bazzz
211345
ZZZZZ
212346

347+
query T
348+
SELECT * FROM test WHERE column1 ~* '^barrr$'
349+
----
350+
Barrr
351+
352+
query T
353+
SELECT * FROM test WHERE column1 ~* '^(barrr|bazzz)$'
354+
----
355+
Barrr
356+
Bazzz
357+
358+
query T rowsort
359+
SELECT * FROM test WHERE column1 !~ '^Bazzz$'
360+
----
361+
Barrr
362+
ZZZZZ
363+
foo
364+
365+
query T rowsort
366+
SELECT * FROM test WHERE column1 !~* '^barrr$'
367+
----
368+
Bazzz
369+
ZZZZZ
370+
foo
371+
213372
query T
214373
SELECT * FROM test WHERE column1 !~ 'z'
215374
----

0 commit comments

Comments
 (0)