@@ -1822,8 +1822,10 @@ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
18221822fn build_like_match (
18231823 expr_builder : & mut PruningExpressionBuilder ,
18241824) -> Option < Arc < dyn PhysicalExpr > > {
1825- // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
1825+ // column LIKE literal => (min, max) LIKE literal split at unescaped % => min <= split literal && split literal <= max
18261826 // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
1827+ // column LIKE 'foo\_%' => min <= 'foo_' && 'foo_' <= max (the _ is escaped)
1828+ // column LIKE 'foo\%%' => min <= 'foo%' && 'foo%' <= max (the % is escaped)
18271829 // column LIKE '%foo' => min <= '' && '' <= max => true
18281830 // column LIKE '%foo%' => min <= '' && '' <= max => true
18291831 // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
@@ -1836,24 +1838,25 @@ fn build_like_match(
18361838 // check that the scalar is a string literal
18371839 let s = extract_string_literal ( scalar_expr) ?;
18381840 // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
1839- let first_wildcard_index = s. find ( [ '%' , '_' ] ) ;
1840- if first_wildcard_index == Some ( 0 ) {
1841- // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook
1841+ let ( decoded_prefix, rest) = split_constant_prefix ( s) ;
1842+ let has_wildcard = !rest. is_empty ( ) ;
1843+ if has_wildcard && decoded_prefix. is_empty ( ) {
1844+ // there's no filtering we could possibly do, return None and have this be handled by the unhandled hook
18421845 return None ;
18431846 }
1844- let ( lower_bound, upper_bound) = if let Some ( wildcard_index ) = first_wildcard_index {
1845- let prefix = & s [ ..wildcard_index ] ;
1847+ let ( lower_bound, upper_bound) = if has_wildcard {
1848+ let incremented_prefix = increment_utf8 ( & decoded_prefix ) ? ;
18461849 let lower_bound_lit = Arc :: new ( phys_expr:: Literal :: new ( ScalarValue :: Utf8 ( Some (
1847- prefix . to_string ( ) ,
1850+ decoded_prefix ,
18481851 ) ) ) ) ;
18491852 let upper_bound_lit = Arc :: new ( phys_expr:: Literal :: new ( ScalarValue :: Utf8 ( Some (
1850- increment_utf8 ( prefix ) ? ,
1853+ incremented_prefix ,
18511854 ) ) ) ) ;
18521855 ( lower_bound_lit, upper_bound_lit)
18531856 } else {
18541857 // the like expression is a literal and can be converted into a comparison
18551858 let bound = Arc :: new ( phys_expr:: Literal :: new ( ScalarValue :: Utf8 ( Some (
1856- s . to_string ( ) ,
1859+ decoded_prefix ,
18571860 ) ) ) ) ;
18581861 ( Arc :: clone ( & bound) , bound)
18591862 } ;
@@ -1934,19 +1937,20 @@ fn build_not_like_match(
19341937}
19351938
/// Splits a LIKE pattern at its first *unescaped* wildcard (`%` or `_`).
///
/// Returns `(prefix, rest)`:
///
/// * `prefix` is the literal text in front of that wildcard with `\` escapes
///   decoded: a backslash makes the character after it literal (`\%` -> `%`,
///   `\_` -> `_`, `\\` -> `\`), and a lone trailing backslash is kept as a
///   literal `\`. The prefix may be empty.
/// * `rest` is the untouched (still escaped) tail of `pattern`, beginning at
///   the wildcard. It is empty when the pattern has no unescaped wildcard.
///
/// For example `foo\_%` yields `("foo_", "%")`, `foo\\%` yields
/// `("foo\", "%")`, `%foo` yields `("", "%foo")` and `foo` yields
/// `("foo", "")`.
fn split_constant_prefix(pattern: &str) -> (String, &str) {
    // Decoding only ever drops characters, so the pattern length is an upper
    // bound for the prefix length.
    let mut prefix = String::with_capacity(pattern.len());
    let mut chars = pattern.char_indices();
    // `while let` instead of `for`: the escape arm consumes the following
    // character from the same iterator.
    while let Some((idx, c)) = chars.next() {
        match c {
            // First unescaped wildcard: everything from here on is `rest`.
            '%' | '_' => return (prefix, &pattern[idx..]),
            // Escape: take the next character literally. Because the escaped
            // character is consumed here, `\\` can never escape a wildcard
            // that follows it. A trailing backslash stands for itself.
            '\\' => prefix.push(chars.next().map_or('\\', |(_, escaped)| escaped)),
            _ => prefix.push(c),
        }
    }
    (prefix, "")
}
19511955
19521956/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
@@ -4816,6 +4820,174 @@ mod tests {
48164820 prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
48174821 }
48184822
4823+ // `build_like_match()` must honor `\` escapes when scanning the pattern for
4824+ // wildcards.
4825+ #[ test]
4826+ fn prune_utf8_like_escaped_chars ( ) {
4827+ let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new( "s1" , DataType :: Utf8 , true ) ] ) ) ;
4828+ let statistics = TestStatistics :: new ( ) . with (
4829+ "s1" ,
4830+ ContainerStats :: new_utf8 (
4831+ vec ! [
4832+ Some ( "foo_aaa" ) ,
4833+ Some ( r#"foo\aaa"# ) ,
4834+ Some ( "foo" ) ,
4835+ Some ( "bar" ) ,
4836+ Some ( "foo%aaa" ) ,
4837+ Some ( "%foo_aaa" ) ,
4838+ ] , // min
4839+ vec ! [
4840+ Some ( "foo_zzz" ) ,
4841+ Some ( r#"foo\zzz"# ) ,
4842+ Some ( "foozzz" ) ,
4843+ Some ( "baz" ) ,
4844+ Some ( "foo%zzz" ) ,
4845+ Some ( "%foo_zzz" ) ,
4846+ ] , // max
4847+ ) ,
4848+ ) ;
4849+
4850+ let expr = col ( "s1" ) . like ( lit ( r#"foo\_%"# ) ) ;
4851+ #[ rustfmt:: skip]
4852+ let expected_ret = & [
4853+ // s1 ["foo_aaa", "foo_zzz"] => every value starts with literal
4854+ // "foo_" and matches the pattern; must keep.
4855+ true ,
4856+ // s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4857+ false ,
4858+ // s1 ["foo", "foozzz"] => stats don't prove "foo_" is or isn't in
4859+ // range; must conservatively keep.
4860+ true ,
4861+ // s1 ["bar", "baz"] => no rows can pass (not keep)
4862+ false ,
4863+ // s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4864+ false ,
4865+ // s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4866+ false ,
4867+ ] ;
4868+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4869+
4870+ let expr = col ( "s1" ) . like ( lit ( r#"foo\\%"# ) ) ;
4871+ #[ rustfmt:: skip]
4872+ let expected_ret = & [
4873+ // s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4874+ false ,
4875+ // s1 ["foo\aaa", "foo\zzz"] => every value starts with literal
4876+ // "foo\" and matches the pattern; must keep.
4877+ true ,
4878+ // s1 ["foo", "foozzz"] => stats don't prove "foo\" is or isn't in
4879+ // range; must conservatively keep.
4880+ true ,
4881+ // s1 ["bar", "baz"] => no rows can pass (not keep)
4882+ false ,
4883+ // s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4884+ false ,
4885+ // s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4886+ false ,
4887+ ] ;
4888+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4889+
4890+ let expr = col ( "s1" ) . like ( lit ( r#"foo\%%"# ) ) ;
4891+ #[ rustfmt:: skip]
4892+ let expected_ret = & [
4893+ // s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4894+ false ,
4895+ // s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4896+ false ,
4897+ // s1 ["foo", "foozzz"] => range straddles "foo%"; must keep.
4898+ true ,
4899+ // s1 ["bar", "baz"] => no rows can pass (not keep)
4900+ false ,
4901+ // s1 ["foo%aaa", "foo%zzz"] => every value starts with literal
4902+ // "foo%" and matches the pattern; must keep.
4903+ true ,
4904+ // s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4905+ false ,
4906+ ] ;
4907+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4908+
4909+ // No wildcard after escapes: pattern reduces to an equality check on
4910+ // the literal "foo_".
4911+ let expr = col ( "s1" ) . like ( lit ( r#"foo\_"# ) ) ;
4912+ #[ rustfmt:: skip]
4913+ let expected_ret = & [
4914+ // s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4915+ false ,
4916+ // s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4917+ false ,
4918+ // s1 ["foo", "foozzz"] => "foo_" is within the range; must keep.
4919+ true ,
4920+ // s1 ["bar", "baz"] => no rows can pass (not keep)
4921+ false ,
4922+ // s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4923+ false ,
4924+ // s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4925+ false ,
4926+ ] ;
4927+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4928+
4929+ // Leading escaped `%`: prefix is "%foo" (non-empty), so the guard
4930+ // for "all wildcards" must NOT bail out here.
4931+ let expr = col ( "s1" ) . like ( lit ( r#"\%foo%"# ) ) ;
4932+ #[ rustfmt:: skip]
4933+ let expected_ret = & [
4934+ // s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4935+ false ,
4936+ // s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4937+ false ,
4938+ // s1 ["foo", "foozzz"] => no rows can pass (not keep)
4939+ false ,
4940+ // s1 ["bar", "baz"] => no rows can pass (not keep)
4941+ false ,
4942+ // s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4943+ false ,
4944+ // s1 ["%foo_aaa", "%foo_zzz"] => every value starts with literal
4945+ // "%foo" and matches the pattern; must keep.
4946+ true ,
4947+ ] ;
4948+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4949+
4950+ // Two escaped wildcards, no real wildcard: equality on "foo%_".
4951+ let expr = col ( "s1" ) . like ( lit ( r#"foo\%\_"# ) ) ;
4952+ #[ rustfmt:: skip]
4953+ let expected_ret = & [
4954+ // s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4955+ false ,
4956+ // s1 ["foo\aaa", "foo\zzz"] => no rows can pass (not keep)
4957+ false ,
4958+ // s1 ["foo", "foozzz"] => "foo%_" is within the range; must keep.
4959+ true ,
4960+ // s1 ["bar", "baz"] => no rows can pass (not keep)
4961+ false ,
4962+ // s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4963+ false ,
4964+ // s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4965+ false ,
4966+ ] ;
4967+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4968+
4969+ // Escaped backslash followed by more literal chars before the
4970+ // wildcard: prefix is "foo\bar".
4971+ let expr = col ( "s1" ) . like ( lit ( r#"foo\\bar%"# ) ) ;
4972+ #[ rustfmt:: skip]
4973+ let expected_ret = & [
4974+ // s1 ["foo_aaa", "foo_zzz"] => no rows can pass (not keep)
4975+ false ,
4976+ // s1 ["foo\aaa", "foo\zzz"] => range straddles "foo\bar"; must
4977+ // keep.
4978+ true ,
4979+ // s1 ["foo", "foozzz"] => range straddles "foo\bar"; must keep.
4980+ true ,
4981+ // s1 ["bar", "baz"] => no rows can pass (not keep)
4982+ false ,
4983+ // s1 ["foo%aaa", "foo%zzz"] => no rows can pass (not keep)
4984+ false ,
4985+ // s1 ["%foo_aaa", "%foo_zzz"] => no rows can pass (not keep)
4986+ false ,
4987+ ] ;
4988+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4989+ }
4990+
48194991 #[ test]
48204992 fn prune_utf8_not_like_one ( ) {
48214993 let ( schema, statistics) = utf8_setup ( ) ;
0 commit comments