Skip to content

Commit 7b4d2e6

Browse files
authored
Fix pruning predicate for LIKE expressions with escape sequences (#22375)
## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

- Closes #22374.

## Rationale for this change

See #22374.

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

## What changes are included in this PR?

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

This change includes a fix for the issue and a test that fails without this fix. The fix is straightforward: when splitting the LIKE pattern at the first unescaped wildcard (`%` or `_`), we also unescape any escape sequences in the prefix.

## Are these changes tested?

<!-- We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code

If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->

I added a test that fails without this fix.

## Are there any user-facing changes?

No

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->

<!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 3d43f56 commit 7b4d2e6

1 file changed

Lines changed: 192 additions & 20 deletions

File tree

datafusion/pruning/src/pruning_predicate.rs

Lines changed: 192 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1822,8 +1822,10 @@ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
18221822
fn build_like_match(
18231823
expr_builder: &mut PruningExpressionBuilder,
18241824
) -> Option<Arc<dyn PhysicalExpr>> {
1825-
// column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
1825+
// column LIKE literal => (min, max) LIKE literal split at unescaped % => min <= split literal && split literal <= max
18261826
// column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
1827+
// column LIKE 'foo\_%' => min <= 'foo_' && 'foo_' <= max (the _ is escaped)
1828+
// column LIKE 'foo\%%' => min <= 'foo%' && 'foo%' <= max (the % is escaped)
18271829
// column LIKE '%foo' => min <= '' && '' <= max => true
18281830
// column LIKE '%foo%' => min <= '' && '' <= max => true
18291831
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
@@ -1836,24 +1838,25 @@ fn build_like_match(
18361838
// check that the scalar is a string literal
18371839
let s = extract_string_literal(scalar_expr)?;
18381840
// ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
1839-
let first_wildcard_index = s.find(['%', '_']);
1840-
if first_wildcard_index == Some(0) {
1841-
// there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook
1841+
let (decoded_prefix, rest) = split_constant_prefix(s);
1842+
let has_wildcard = !rest.is_empty();
1843+
if has_wildcard && decoded_prefix.is_empty() {
1844+
// there's no filtering we could possibly do, return None and have this be handled by the unhandled hook
18421845
return None;
18431846
}
1844-
let (lower_bound, upper_bound) = if let Some(wildcard_index) = first_wildcard_index {
1845-
let prefix = &s[..wildcard_index];
1847+
let (lower_bound, upper_bound) = if has_wildcard {
1848+
let incremented_prefix = increment_utf8(&decoded_prefix)?;
18461849
let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
1847-
prefix.to_string(),
1850+
decoded_prefix,
18481851
))));
18491852
let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
1850-
increment_utf8(prefix)?,
1853+
incremented_prefix,
18511854
))));
18521855
(lower_bound_lit, upper_bound_lit)
18531856
} else {
18541857
// the like expression is a literal and can be converted into a comparison
18551858
let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
1856-
s.to_string(),
1859+
decoded_prefix,
18571860
))));
18581861
(Arc::clone(&bound), bound)
18591862
};
@@ -1934,19 +1937,20 @@ fn build_not_like_match(
19341937
}
19351938

19361939
/// Returns unescaped constant prefix of a LIKE pattern (possibly empty) and the remaining pattern (possibly empty)
1937-
fn split_constant_prefix(pattern: &str) -> (&str, &str) {
1938-
let char_indices = pattern.char_indices().collect::<Vec<_>>();
1939-
for i in 0..char_indices.len() {
1940-
let (idx, char) = char_indices[i];
1941-
if char == '%' || char == '_' {
1942-
if i != 0 && char_indices[i - 1].1 == '\\' {
1943-
// ecsaped by `\`
1944-
continue;
1945-
}
1946-
return (&pattern[..idx], &pattern[idx..]);
1940+
fn split_constant_prefix(pattern: &str) -> (String, &str) {
1941+
let mut prefix = String::with_capacity(pattern.len());
1942+
let mut iter = pattern.char_indices();
1943+
while let Some((idx, c)) = iter.next() {
1944+
match c {
1945+
'%' | '_' => return (prefix, &pattern[idx..]),
1946+
'\\' => match iter.next() {
1947+
Some((_, escaped)) => prefix.push(escaped),
1948+
None => prefix.push('\\'),
1949+
},
1950+
_ => prefix.push(c),
19471951
}
19481952
}
1949-
(pattern, "")
1953+
(prefix, "")
19501954
}
19511955

19521956
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
@@ -4816,6 +4820,174 @@ mod tests {
48164820
prune_with_expr(expr, &schema, &statistics, expected_ret);
48174821
}
48184822

4823+
// `build_like_match()` must honor `\` escapes when scanning the pattern for
4824+
// wildcards.
4825+
#[test]
4826+
fn prune_utf8_like_escaped_chars() {
4827+
let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)]));
4828+
let statistics = TestStatistics::new().with(
4829+
"s1",
4830+
ContainerStats::new_utf8(
4831+
vec![
4832+
Some("foo_aaa"),
4833+
Some(r#"foo\aaa"#),
4834+
Some("foo"),
4835+
Some("bar"),
4836+
Some("foo%aaa"),
4837+
Some("%foo_aaa"),
4838+
], // min
4839+
vec![
4840+
Some("foo_zzz"),
4841+
Some(r#"foo\zzz"#),
4842+
Some("foozzz"),
4843+
Some("baz"),
4844+
Some("foo%zzz"),
4845+
Some("%foo_zzz"),
4846+
], // max
4847+
),
4848+
);
4849+
4850+
let expr = col("s1").like(lit(r#"foo\_%"#));
4851+
#[rustfmt::skip]
4852+
let expected_ret = &[
4853+
// s1 ["foo_aaa", "foo_zzz"] => every value starts with literal
4854+
// "foo_" and matches the pattern; must keep.
4855+
true,
4856+
// s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4857+
false,
4858+
// s1 ["foo", "foozzz"] => stats don't prove "foo_" is or isn't in
4859+
// range; must conservatively keep.
4860+
true,
4861+
// s1 ["bar", "baz"] => no rows can pass (not keep)
4862+
false,
4863+
// s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4864+
false,
4865+
// s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4866+
false,
4867+
];
4868+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4869+
4870+
let expr = col("s1").like(lit(r#"foo\\%"#));
4871+
#[rustfmt::skip]
4872+
let expected_ret = &[
4873+
// s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4874+
false,
4875+
// s1 ["foo\aaa", "foo\zzz"] => every value starts with literal
4876+
// "foo\" and matches the pattern; must keep.
4877+
true,
4878+
// s1 ["foo", "foozzz"] => stats don't prove "foo\" is or isn't in
4879+
// range; must conservatively keep.
4880+
true,
4881+
// s1 ["bar", "baz"] => no rows can pass (not keep)
4882+
false,
4883+
// s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4884+
false,
4885+
// s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4886+
false,
4887+
];
4888+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4889+
4890+
let expr = col("s1").like(lit(r#"foo\%%"#));
4891+
#[rustfmt::skip]
4892+
let expected_ret = &[
4893+
// s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4894+
false,
4895+
// s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4896+
false,
4897+
// s1 ["foo", "foozzz"] => range straddles "foo%"; must keep.
4898+
true,
4899+
// s1 ["bar", "baz"] => no rows can pass (not keep)
4900+
false,
4901+
// s1 ["foo%aaa", "foo%zzz"] => every value starts with literal
4902+
// "foo%" and matches the pattern; must keep.
4903+
true,
4904+
// s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4905+
false,
4906+
];
4907+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4908+
4909+
// No wildcard after escapes: pattern reduces to an equality check on
4910+
// the literal "foo_".
4911+
let expr = col("s1").like(lit(r#"foo\_"#));
4912+
#[rustfmt::skip]
4913+
let expected_ret = &[
4914+
// s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4915+
false,
4916+
// s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4917+
false,
4918+
// s1 ["foo", "foozzz"] => "foo_" is within the range; must keep.
4919+
true,
4920+
// s1 ["bar", "baz"] => no rows can pass (not keep)
4921+
false,
4922+
// s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4923+
false,
4924+
// s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4925+
false,
4926+
];
4927+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4928+
4929+
// Leading escaped `%`: prefix is "%foo" (non-empty), so the guard
4930+
// for "all wildcards" must NOT bail out here.
4931+
let expr = col("s1").like(lit(r#"\%foo%"#));
4932+
#[rustfmt::skip]
4933+
let expected_ret = &[
4934+
// s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4935+
false,
4936+
// s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4937+
false,
4938+
// s1 ["foo", "foozzz"] => no rows can pass (not keep)
4939+
false,
4940+
// s1 ["bar", "baz"] => no rows can pass (not keep)
4941+
false,
4942+
// s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4943+
false,
4944+
// s1 ["%foo_aaa", "%foo_zzz"] => every value starts with literal
4945+
// "%foo" and matches the pattern; must keep.
4946+
true,
4947+
];
4948+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4949+
4950+
// Two escaped wildcards, no real wildcard: equality on "foo%_".
4951+
let expr = col("s1").like(lit(r#"foo\%\_"#));
4952+
#[rustfmt::skip]
4953+
let expected_ret = &[
4954+
// s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4955+
false,
4956+
// s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4957+
false,
4958+
// s1 ["foo", "foozzz"] => "foo%_" is within the range; must keep.
4959+
true,
4960+
// s1 ["bar", "baz"] => no rows can pass (not keep)
4961+
false,
4962+
// s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4963+
false,
4964+
// s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4965+
false,
4966+
];
4967+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4968+
4969+
// Escaped backslash followed by more literal chars before the
4970+
// wildcard: prefix is "foo\bar".
4971+
let expr = col("s1").like(lit(r#"foo\\bar%"#));
4972+
#[rustfmt::skip]
4973+
let expected_ret = &[
4974+
// s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4975+
false,
4976+
// s1 ["foo\aaa", "foo\zzz"] => range straddles "foo\bar"; must
4977+
// keep.
4978+
true,
4979+
// s1 ["foo", "foozzz"] => range straddles "foo\bar"; must keep.
4980+
true,
4981+
// s1 ["bar", "baz"] => no rows can pass (not keep)
4982+
false,
4983+
// s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4984+
false,
4985+
// s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4986+
false,
4987+
];
4988+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4989+
}
4990+
48194991
#[test]
48204992
fn prune_utf8_not_like_one() {
48214993
let (schema, statistics) = utf8_setup();

0 commit comments

Comments
 (0)