Skip to content

Commit f3735c1

Browse files
committed
fix: count empty regexp matches in regexp_count
1 parent 66f82af commit f3735c1

2 files changed

Lines changed: 77 additions & 9 deletions

File tree

datafusion/functions/src/regex/regexpcount.rs

Lines changed: 72 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -268,7 +268,14 @@ where
268268
S: StringArrayType<'a>,
269269
{
270270
let (regex_scalar, is_regex_scalar) = if is_regex_scalar || regex_array.len() == 1 {
271-
(Some(regex_array.value(0)), true)
271+
(
272+
if regex_array.is_null(0) {
273+
None
274+
} else {
275+
Some(regex_array.value(0))
276+
},
277+
true,
278+
)
272279
} else {
273280
(None, false)
274281
};
@@ -300,7 +307,7 @@ where
300307
match (is_regex_scalar, is_start_scalar, is_flags_scalar) {
301308
(true, true, true) => {
302309
let regex = match regex_scalar {
303-
None | Some("") => {
310+
None => {
304311
return Ok(Arc::new(Int64Array::from(vec![0; values.len()])));
305312
}
306313
Some(regex) => regex,
@@ -317,7 +324,7 @@ where
317324
}
318325
(true, true, false) => {
319326
let regex = match regex_scalar {
320-
None | Some("") => {
327+
None => {
321328
return Ok(Arc::new(Int64Array::from(vec![0; values.len()])));
322329
}
323330
Some(regex) => regex,
@@ -346,7 +353,7 @@ where
346353
}
347354
(true, false, true) => {
348355
let regex = match regex_scalar {
349-
None | Some("") => {
356+
None => {
350357
return Ok(Arc::new(Int64Array::from(vec![0; values.len()])));
351358
}
352359
Some(regex) => regex,
@@ -366,7 +373,7 @@ where
366373
}
367374
(true, false, false) => {
368375
let regex = match regex_scalar {
369-
None | Some("") => {
376+
None => {
370377
return Ok(Arc::new(Int64Array::from(vec![0; values.len()])));
371378
}
372379
Some(regex) => regex,
@@ -411,7 +418,7 @@ where
411418
.zip(regex_array.iter())
412419
.map(|(value, regex)| {
413420
let regex = match regex {
414-
None | Some("") => return Ok(0),
421+
None => return Ok(0),
415422
Some(regex) => regex,
416423
};
417424

@@ -447,7 +454,7 @@ where
447454
izip!(values.iter(), regex_array.iter(), flags_array.iter())
448455
.map(|(value, regex, flags)| {
449456
let regex = match regex {
450-
None | Some("") => return Ok(0),
457+
None => return Ok(0),
451458
Some(regex) => regex,
452459
};
453460

@@ -481,7 +488,7 @@ where
481488
izip!(values.iter(), regex_array.iter(), start_array.iter())
482489
.map(|(value, regex, start)| {
483490
let regex = match regex {
484-
None | Some("") => return Ok(0),
491+
None => return Ok(0),
485492
Some(regex) => regex,
486493
};
487494

@@ -531,7 +538,7 @@ where
531538
)
532539
.map(|(value, regex, start, flags)| {
533540
let regex = match regex {
534-
None | Some("") => return Ok(0),
541+
None => return Ok(0),
535542
Some(regex) => regex,
536543
};
537544

@@ -590,6 +597,7 @@ mod tests {
590597
fn test_regexp_count() {
591598
test_case_sensitive_regexp_count_scalar();
592599
test_case_sensitive_regexp_count_scalar_start();
600+
test_case_sensitive_regexp_count_scalar_empty_pattern();
593601
test_case_insensitive_regexp_count_scalar_flags();
594602
test_case_sensitive_regexp_count_start_scalar_complex();
595603

@@ -719,6 +727,61 @@ mod tests {
719727
});
720728
}
721729

730+
fn test_case_sensitive_regexp_count_scalar_empty_pattern() {
731+
let values = ["abc", "abc", ""];
732+
let regex = "";
733+
let start = [1, 4, 1];
734+
let expected: Vec<i64> = vec![4, 1, 0];
735+
736+
izip!(values.iter(), start.iter())
737+
.enumerate()
738+
.for_each(|(pos, (&v, &s))| {
739+
let expected = expected.get(pos).cloned();
740+
741+
let v_sv = ScalarValue::Utf8(Some(v.to_string()));
742+
let regex_sv = ScalarValue::Utf8(Some(regex.to_string()));
743+
let start_sv = ScalarValue::Int64(Some(s));
744+
let re =
745+
regexp_count_with_scalar_values(&[v_sv, regex_sv, start_sv.clone()]);
746+
match re {
747+
Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
748+
assert_eq!(
749+
v, expected,
750+
"regexp_count scalar empty-pattern test failed"
751+
);
752+
}
753+
_ => panic!("Unexpected result"),
754+
}
755+
756+
let v_sv = ScalarValue::LargeUtf8(Some(v.to_string()));
757+
let regex_sv = ScalarValue::LargeUtf8(Some(regex.to_string()));
758+
let re =
759+
regexp_count_with_scalar_values(&[v_sv, regex_sv, start_sv.clone()]);
760+
match re {
761+
Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
762+
assert_eq!(
763+
v, expected,
764+
"regexp_count scalar empty-pattern test failed"
765+
);
766+
}
767+
_ => panic!("Unexpected result"),
768+
}
769+
770+
let v_sv = ScalarValue::Utf8View(Some(v.to_string()));
771+
let regex_sv = ScalarValue::Utf8View(Some(regex.to_string()));
772+
let re = regexp_count_with_scalar_values(&[v_sv, regex_sv, start_sv]);
773+
match re {
774+
Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
775+
assert_eq!(
776+
v, expected,
777+
"regexp_count scalar empty-pattern test failed"
778+
);
779+
}
780+
_ => panic!("Unexpected result"),
781+
}
782+
});
783+
}
784+
722785
fn test_case_insensitive_regexp_count_scalar_flags() {
723786
let values = ["", "aabca", "abcabc", "abcAbcab", "abcabcabc"];
724787
let regex = "abc";

datafusion/sqllogictest/test_files/regexp/regexp_count.slt

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,11 @@ SELECT regexp_count('123123123123123', '(12)3');
2626
----
2727
5
2828

29+
query I
30+
SELECT regexp_count('abc', '');
31+
----
32+
4
33+
2934
query I
3035
SELECT regexp_count('123123123123', '123', 1);
3136
----

0 commit comments

Comments
 (0)