Skip to content

Commit 3ab1301

Browse files
authored
fix: handle empty delimiter in split_part (closes #20503) (#20542)
## Which issue does this PR close? - Closes #20503 ## Rationale for this change `split_part` did not handle empty delimiters in a PostgreSQL-compatible way (`split("")` in Rust creates leading/trailing empty fields). This could return unexpected results for positions like `1` / `-1` and out-of-range values. This PR aligns behavior with Postgres semantics for empty delimiters. ## What changes are included in this PR? Small change in how we treat the 1, -1 ## Are these changes tested? Indeed! ## Are there any user-facing changes? Yes, behavior is now more consistent with PostgreSQL for `split_part(str, '', n)`. No API changes.
1 parent e76f0ee commit 3ab1301

File tree

2 files changed

+148
-2
lines changed

2 files changed

+148
-2
lines changed

datafusion/functions/src/string/split_part.rs

Lines changed: 128 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,15 @@ where
231231
"split_part index {n} exceeds maximum supported value"
232232
)
233233
})?;
234-
string.split(delimiter).nth(idx)
234+
235+
if delimiter.is_empty() {
236+
// Match PostgreSQL split_part behavior for empty delimiter:
237+
// treat the input as a single field ("ab" -> ["ab"]),
238+
// rather than Rust's split("") result (["", "a", "b", ""]).
239+
(n == 1).then_some(string)
240+
} else {
241+
string.split(delimiter).nth(idx)
242+
}
235243
}
236244
std::cmp::Ordering::Less => {
237245
// Negative index: use rsplit().nth() to efficiently get from the end
@@ -241,7 +249,14 @@ where
241249
"split_part index {n} exceeds minimum supported value"
242250
)
243251
})?;
244-
string.rsplit(delimiter).nth(idx)
252+
if delimiter.is_empty() {
253+
// Match PostgreSQL split_part behavior for empty delimiter:
254+
// treat the input as a single field ("ab" -> ["ab"]),
255+
// rather than Rust's split("") result (["", "a", "b", ""]).
256+
(n == -1).then_some(string)
257+
} else {
258+
string.rsplit(delimiter).nth(idx)
259+
}
245260
}
246261
std::cmp::Ordering::Equal => {
247262
return exec_err!("field position must not be zero");
@@ -341,6 +356,117 @@ mod tests {
341356
Utf8,
342357
StringArray
343358
);
359+
// Edge cases with delimiters
360+
test_function!(
361+
SplitPartFunc::new(),
362+
vec![
363+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
364+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
365+
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
366+
],
367+
Ok(Some("a")),
368+
&str,
369+
Utf8,
370+
StringArray
371+
);
372+
test_function!(
373+
SplitPartFunc::new(),
374+
vec![
375+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
376+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
377+
ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
378+
],
379+
Ok(Some("")),
380+
&str,
381+
Utf8,
382+
StringArray
383+
);
384+
test_function!(
385+
SplitPartFunc::new(),
386+
vec![
387+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
388+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
389+
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
390+
],
391+
Ok(Some("a,b")),
392+
&str,
393+
Utf8,
394+
StringArray
395+
);
396+
test_function!(
397+
SplitPartFunc::new(),
398+
vec![
399+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
400+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
401+
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
402+
],
403+
Ok(Some("")),
404+
&str,
405+
Utf8,
406+
StringArray
407+
);
408+
test_function!(
409+
SplitPartFunc::new(),
410+
vec![
411+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
412+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
413+
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
414+
],
415+
Ok(Some("a,b")),
416+
&str,
417+
Utf8,
418+
StringArray
419+
);
420+
test_function!(
421+
SplitPartFunc::new(),
422+
vec![
423+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
424+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
425+
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
426+
],
427+
Ok(Some("")),
428+
&str,
429+
Utf8,
430+
StringArray
431+
);
432+
433+
// Edge cases with delimiters with negative n
434+
test_function!(
435+
SplitPartFunc::new(),
436+
vec![
437+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
438+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
439+
ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
440+
],
441+
Ok(Some("a,b")),
442+
&str,
443+
Utf8,
444+
StringArray
445+
);
446+
test_function!(
447+
SplitPartFunc::new(),
448+
vec![
449+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
450+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
451+
ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
452+
],
453+
Ok(Some("a,b")),
454+
&str,
455+
Utf8,
456+
StringArray
457+
);
458+
test_function!(
459+
SplitPartFunc::new(),
460+
vec![
461+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
462+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
463+
ColumnarValue::Scalar(ScalarValue::Int64(Some(-2))),
464+
],
465+
Ok(Some("")),
466+
&str,
467+
Utf8,
468+
StringArray
469+
);
344470

345471
Ok(())
346472
}

datafusion/sqllogictest/test_files/expr.slt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,26 @@ SELECT split_part('abc~@~def~@~ghi', '~@~', -100)
701701
----
702702
(empty)
703703

704+
query T
705+
SELECT split_part('a,b', '', 1)
706+
----
707+
a,b
708+
709+
query T
710+
SELECT split_part('a,b', '', -1)
711+
----
712+
a,b
713+
714+
query T
715+
SELECT split_part('a,b', '', 2)
716+
----
717+
(empty)
718+
719+
query T
720+
SELECT split_part('a,b', '', -2)
721+
----
722+
(empty)
723+
704724
statement error DataFusion error: Execution error: field position must not be zero
705725
SELECT split_part('abc~@~def~@~ghi', '~@~', 0)
706726

0 commit comments

Comments
 (0)