Skip to content

Commit 69dcbb5

Browse files
fix: align regexp_instr empty pattern
1 parent 66f82af commit 69dcbb5

2 files changed

Lines changed: 57 additions & 20 deletions

File tree

datafusion/functions/src/regex/regexpinstr.rs

Lines changed: 37 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -329,7 +329,6 @@ where
329329
)
330330
.map(|(value, regex, start, nth, flags, subexp)| match regex {
331331
None => Ok(None),
332-
Some("") => Ok(Some(0)),
333332
Some(regex) => get_index(
334333
value,
335334
regex,
@@ -395,11 +394,8 @@ where
395394
{
396395
let value = match value {
397396
None => return Ok(None),
398-
Some("") => return Ok(Some(0)),
399397
Some(value) => value,
400398
};
401-
let pattern: &Regex = compile_and_cache_regex(pattern, flags, regex_cache)?;
402-
// println!("get_index: value = {}, pattern = {}, start = {}, n = {}, subexpr = {}, flags = {:?}", value, pattern, start, n, subexpr, flags);
403399
if start < 1 {
404400
return Err(ArrowError::ComputeError(
405401
"regexp_instr() requires start to be 1-based".to_string(),
@@ -412,8 +408,22 @@ where
412408
));
413409
}
414410

415-
// --- Simplified byte_start_offset calculation ---
416411
let total_chars = value.chars().count() as i64;
412+
if pattern.is_empty() {
413+
compile_and_cache_regex(pattern, flags, regex_cache)?;
414+
if subexpr > 0 {
415+
return Ok(Some(0));
416+
}
417+
418+
let match_position = start.saturating_add(n).saturating_sub(1);
419+
return Ok(Some(if match_position <= total_chars + 1 {
420+
match_position
421+
} else {
422+
0
423+
}));
424+
}
425+
426+
let pattern: &Regex = compile_and_cache_regex(pattern, flags, regex_cache)?;
417427
let byte_start_offset: usize = if start > total_chars {
418428
// If start is beyond the total characters, it means we start searching
419429
// after the string effectively. No matches possible.
@@ -426,7 +436,6 @@ where
426436
.map(|(idx, _)| idx)
427437
.unwrap_or(0) // Should not happen if start is valid and <= total_chars
428438
};
429-
// --- End simplified calculation ---
430439

431440
let search_slice = &value[byte_start_offset..];
432441

@@ -492,7 +501,7 @@ mod tests {
492501
fn test_case_sensitive_regexp_instr_nulls() {
493502
let v = "";
494503
let r = "";
495-
let expected = 0;
504+
let expected = 1;
496505
let regex_sv = ScalarValue::Utf8(Some(r.to_string()));
497506
let re = regexp_instr_with_scalar_values(&[v.to_string().into(), regex_sv]);
498507
// let res_exp = re.unwrap();
@@ -511,10 +520,11 @@ mod tests {
511520
"no match here",
512521
"abc",
513522
"ДатаФусион数据融合📊🔥",
523+
"abc",
514524
];
515-
let regex = ["o", "d", "123", "z", "gg", "📊"];
525+
let regex = ["o", "d", "123", "z", "gg", "📊", ""];
516526

517-
let expected: Vec<i64> = vec![5, 4, 4, 0, 0, 15];
527+
let expected: Vec<i64> = vec![5, 4, 4, 0, 0, 15, 1];
518528

519529
izip!(values.iter(), regex.iter())
520530
.enumerate()
@@ -772,10 +782,11 @@ mod tests {
772782
"xyz123xyz",
773783
"no match here",
774784
"",
785+
"abc",
775786
]);
776-
let regex = A::from(vec!["o", "d", "123", "z", "gg"]);
787+
let regex = A::from(vec!["o", "d", "123", "z", "gg", ""]);
777788

778-
let expected = Int64Array::from(vec![5, 4, 4, 0, 0]);
789+
let expected = Int64Array::from(vec![5, 4, 4, 0, 0, 1]);
779790
let re = regexp_instr_func(&[Arc::new(values), Arc::new(regex)]).unwrap();
780791
assert_eq!(re.as_ref(), &expected);
781792
}
@@ -784,10 +795,10 @@ mod tests {
784795
where
785796
A: From<Vec<&'static str>> + Array + 'static,
786797
{
787-
let values = A::from(vec!["abcabcabc", "abcabcabc", ""]);
788-
let regex = A::from(vec!["abc", "abc", "gg"]);
789-
let start = Int64Array::from(vec![4, 5, 5]);
790-
let expected = Int64Array::from(vec![4, 7, 0]);
798+
let values = A::from(vec!["abcabcabc", "abcabcabc", "", "abc"]);
799+
let regex = A::from(vec!["abc", "abc", "gg", ""]);
800+
let start = Int64Array::from(vec![4, 5, 5, 2]);
801+
let expected = Int64Array::from(vec![4, 7, 0, 2]);
791802

792803
let re = regexp_instr_func(&[Arc::new(values), Arc::new(regex), Arc::new(start)])
793804
.unwrap();
@@ -798,11 +809,17 @@ mod tests {
798809
where
799810
A: From<Vec<&'static str>> + Array + 'static,
800811
{
801-
let values = A::from(vec!["abcabcabc", "abcabcabc", "abcabcabc", "abcabcabc"]);
802-
let regex = A::from(vec!["abc", "abc", "abc", "abc"]);
803-
let start = Int64Array::from(vec![1, 1, 1, 1]);
804-
let nth = Int64Array::from(vec![1, 2, 3, 4]);
805-
let expected = Int64Array::from(vec![1, 4, 7, 0]);
812+
let values = A::from(vec![
813+
"abcabcabc",
814+
"abcabcabc",
815+
"abcabcabc",
816+
"abcabcabc",
817+
"abc",
818+
]);
819+
let regex = A::from(vec!["abc", "abc", "abc", "abc", ""]);
820+
let start = Int64Array::from(vec![1, 1, 1, 1, 2]);
821+
let nth = Int64Array::from(vec![1, 2, 3, 4, 3]);
822+
let expected = Int64Array::from(vec![1, 4, 7, 0, 4]);
806823

807824
let re = regexp_instr_func(&[
808825
Arc::new(values),

datafusion/sqllogictest/test_files/regexp/regexp_instr.slt

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,26 @@ SELECT regexp_instr('123123123123123', '(12)3');
2323
----
2424
1
2525

26+
query I
27+
SELECT regexp_instr('abc', '');
28+
----
29+
1
30+
31+
query I
32+
SELECT regexp_instr('', '');
33+
----
34+
1
35+
36+
query I
37+
SELECT regexp_instr('abc', '', 2, 3);
38+
----
39+
4
40+
41+
query I
42+
SELECT regexp_instr('abc', '', 5);
43+
----
44+
0
45+
2646
query I
2747
SELECT regexp_instr('123123123123', '123', 1);
2848
----

0 commit comments

Comments
 (0)