Skip to content

Commit 6c106ba

Browse files
authored
fix: Use codepoints in lpad, rpad, translate (#21405)
## Which issue does this PR close? - Closes #21060. ## Rationale for this change `lpad`, `rpad`, and `translate` use grapheme segmentation. This is inconsistent with how these functions behave in Postgres and DuckDB, as well as the SQL standard -- segmentation based on Unicode codepoints is used instead. It also happens that grapheme-based segmentation is significantly more expensive than codepoint-based segmentation. In the case of `lpad` and `rpad`, graphemes and codepoints were used inconsistently: the input string was measured in code points but the fill string was measured in graphemes. #3054 switched to using codepoints for most string-related functions in DataFusion but these three functions still need to be changed. Benchmarks (M4 Max): lpad size=1024: - lpad utf8 [str_len=5, target=20]: 12.4 µs → 12.8 µs, +3.0% - lpad stringview [str_len=5, target=20]: 11.5 µs → 11.7 µs, +1.4% - lpad utf8 [str_len=20, target=50]: 11.3 µs → 11.3 µs, +0.1% - lpad stringview [str_len=20, target=50]: 11.8 µs → 12.0 µs, +1.6% - lpad utf8 unicode [target=20]: 98.4 µs → 24.4 µs, -75.1% - lpad stringview unicode [target=20]: 99.8 µs → 26.0 µs, -74.0% - lpad utf8 scalar [str_len=5, target=20, fill='x']: 8.7 µs → 8.8 µs, +1.0% - lpad stringview scalar [str_len=5, target=20, fill='x']: 10.2 µs → 10.1 µs, -0.1% - lpad utf8 scalar unicode [str_len=5, target=20, fill='é']: 44.7 µs → 10.9 µs, -75.7% - lpad utf8 scalar truncate [str_len=20, target=5, fill='é']: 152.5 µs → 11.7 µs, -92.3% lpad size=4096: - lpad utf8 [str_len=5, target=20]: 55.9 µs → 55.1 µs, -1.4% - lpad stringview [str_len=5, target=20]: 49.2 µs → 50.1 µs, +1.8% - lpad utf8 [str_len=20, target=50]: 46.6 µs → 46.4 µs, -0.5% - lpad stringview [str_len=20, target=50]: 47.5 µs → 48.5 µs, +2.1% - lpad utf8 unicode [target=20]: 401.3 µs → 100.1 µs, -75.0% - lpad stringview unicode [target=20]: 397.7 µs → 104.9 µs, -73.6% - lpad utf8 scalar [str_len=5, target=20, fill='x']: 34.2 µs → 35.0 µs, +2.4% - lpad stringview scalar 
[str_len=5, target=20, fill='x']: 40.1 µs → 40.4 µs, +0.6% - lpad utf8 scalar unicode [str_len=5, target=20, fill='é']: 178.3 µs → 42.9 µs, -76.0% - lpad utf8 scalar truncate [str_len=20, target=5, fill='é']: 601.3 µs → 46.2 µs, -92.3% rpad size=1024: - rpad utf8 [str_len=5, target=20]: 15.5 µs → 14.4 µs, -7.1% - rpad stringview [str_len=5, target=20]: 13.8 µs → 14.0 µs, +1.7% - rpad utf8 [str_len=20, target=50]: 12.6 µs → 12.7 µs, +1.3% - rpad stringview [str_len=20, target=50]: 13.0 µs → 13.1 µs, +0.7% - rpad utf8 unicode [target=20]: 103.5 µs → 26.0 µs, -74.8% - rpad stringview unicode [target=20]: 101.2 µs → 27.6 µs, -72.7% - rpad utf8 scalar [str_len=5, target=20, fill='x']: 11.4 µs → 10.9 µs, -3.9% - rpad stringview scalar [str_len=5, target=20, fill='x']: 12.2 µs → 12.6 µs, +2.8% - rpad utf8 scalar unicode [str_len=5, target=20, fill='é']: 46.3 µs → 12.4 µs, -73.1% - rpad utf8 scalar truncate [str_len=20, target=5, fill='é']: 155.6 µs → 11.6 µs, -92.4% rpad size=4096: - rpad utf8 [str_len=5, target=20]: 70.1 µs → 61.6 µs, -12.2% - rpad stringview [str_len=5, target=20]: 60.4 µs → 56.8 µs, -6.0% - rpad utf8 [str_len=20, target=50]: 50.6 µs → 51.2 µs, +1.2% - rpad stringview [str_len=20, target=50]: 53.7 µs → 53.3 µs, -0.8% - rpad utf8 unicode [target=20]: 407.1 µs → 104.0 µs, -74.5% - rpad stringview unicode [target=20]: 404.8 µs → 114.5 µs, -71.7% - rpad utf8 scalar [str_len=5, target=20, fill='x']: 47.5 µs → 45.6 µs, -4.0% - rpad stringview scalar [str_len=5, target=20, fill='x']: 56.4 µs → 58.5 µs, +3.6% - rpad utf8 scalar unicode [str_len=5, target=20, fill='é']: 184.1 µs → 48.1 µs, -73.9% - rpad utf8 scalar truncate [str_len=20, target=5, fill='é']: 606.4 µs → 45.6 µs, -92.5% translate size=1024: - array_from_to [str_len=8]: 140.0 µs → 37.6 µs, -73.2% - scalar_from_to [str_len=8]: 9.0 µs → 8.8 µs, -2.7% - array_from_to [str_len=32]: 371.3 µs → 65.6 µs, -82.3% - scalar_from_to [str_len=32]: 19.9 µs → 19.2 µs, -3.6% - array_from_to [str_len=128]: 1249.6 µs 
→ 188.7 µs, -84.9% - scalar_from_to [str_len=128]: 70.2 µs → 64.7 µs, -7.9% - array_from_to [str_len=1024]: 9349.4 µs → 1378.1 µs, -85.3% - scalar_from_to [str_len=1024]: 506.5 µs → 445.8 µs, -12.0% translate size=4096: - array_from_to [str_len=8]: 548.0 µs → 147.1 µs, -73.2% - scalar_from_to [str_len=8]: 33.9 µs → 32.8 µs, -3.1% - array_from_to [str_len=32]: 1457.2 µs → 266.0 µs, -81.7% - scalar_from_to [str_len=32]: 78.0 µs → 75.5 µs, -3.2% - array_from_to [str_len=128]: 4935.0 µs → 791.1 µs, -84.0% - scalar_from_to [str_len=128]: 278.2 µs → 260.7 µs, -6.3% - array_from_to [str_len=1024]: 37496 µs → 5591 µs, -85.1% - scalar_from_to [str_len=1024]: 2058.0 µs → 1770 µs, -14.0% ## What changes are included in this PR? * Switch from grapheme segmentation to codepoint segmentation for `lpad`, `rpad`, and `translate` * Add SLT tests * Refactor a few helper functions * Remove dependency on `unicode_segmentation` crate as it is no longer used ## Are these changes tested? Yes. The new SLT tests were also run against DuckDB and Postgres to confirm the behavior is consistent. ## Are there any user-facing changes? Yes. This PR changes the behavior of `lpad`, `rpad`, and `translate`, although the new behavior is more consistent with the SQL standard and with other SQL implementations.
1 parent 6cf94c7 commit 6c106ba

File tree

8 files changed

+255
-169
lines changed

8 files changed

+255
-169
lines changed

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/functions/Cargo.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ regex_expressions = ["regex"]
5959
# enable string functions
6060
string_expressions = ["uuid"]
6161
# enable unicode functions
62-
unicode_expressions = ["unicode-segmentation"]
62+
unicode_expressions = []
6363

6464
[lib]
6565
name = "datafusion_functions"
@@ -87,7 +87,6 @@ num-traits = { workspace = true }
8787
rand = { workspace = true }
8888
regex = { workspace = true, optional = true }
8989
sha2 = { workspace = true, optional = true }
90-
unicode-segmentation = { version = "^1.13.2", optional = true }
9190
uuid = { workspace = true, features = ["v4"], optional = true }
9291

9392
[dev-dependencies]

datafusion/functions/src/unicode/common.rs

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,39 @@ impl LeftRightSlicer for RightSlicer {
7878
}
7979
}
8080

81+
/// Returns the byte offset of the `n`th codepoint (0-indexed) in `string`,
82+
/// or `string.len()` if the string has `n` or fewer codepoints.
83+
#[inline]
84+
pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
85+
string
86+
.char_indices()
87+
.nth(n)
88+
.map_or(string.len(), |(i, _)| i)
89+
}
90+
91+
/// If `string` has more than `n` codepoints, returns the byte offset at which
92+
/// the first `n` codepoints end. Otherwise returns the total codepoint count.
93+
#[inline]
94+
pub(crate) fn char_count_or_boundary(string: &str, n: usize) -> StringCharLen {
95+
let mut count = 0;
96+
for (byte_idx, _) in string.char_indices() {
97+
if count == n {
98+
return StringCharLen::ByteOffset(byte_idx);
99+
}
100+
count += 1;
101+
}
102+
StringCharLen::CharCount(count)
103+
}
104+
105+
/// Result of [`char_count_or_boundary`].
106+
pub(crate) enum StringCharLen {
107+
/// The string has more than `n` codepoints; contains the byte offset
108+
/// at the `n`-th codepoint boundary.
109+
ByteOffset(usize),
110+
/// The string has `n` or fewer codepoints; contains the exact count.
111+
CharCount(usize),
112+
}
113+
81114
/// Calculate the byte length of the substring of `n` chars from string `string`
82115
#[inline]
83116
fn left_right_byte_length(string: &str, n: i64) -> usize {
@@ -88,11 +121,9 @@ fn left_right_byte_length(string: &str, n: i64) -> usize {
88121
.map(|(index, _)| index)
89122
.unwrap_or(0),
90123
Ordering::Equal => 0,
91-
Ordering::Greater => string
92-
.char_indices()
93-
.nth(n.unsigned_abs().min(usize::MAX as u64) as usize)
94-
.map(|(index, _)| index)
95-
.unwrap_or(string.len()),
124+
Ordering::Greater => {
125+
byte_offset_of_char(string, n.unsigned_abs().min(usize::MAX as u64) as usize)
126+
}
96127
}
97128
}
98129

datafusion/functions/src/unicode/lpad.rs

Lines changed: 38 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ use arrow::array::{
2424
OffsetSizeTrait, StringArrayType, StringViewArray,
2525
};
2626
use arrow::datatypes::DataType;
27-
use unicode_segmentation::UnicodeSegmentation;
2827

2928
use crate::utils::{make_scalar_function, utf8_to_str_type};
3029
use datafusion_common::cast::as_int64_array;
@@ -178,7 +177,9 @@ impl ScalarUDFImpl for LPadFunc {
178177
}
179178
}
180179

181-
use super::common::{try_as_scalar_i64, try_as_scalar_str};
180+
use super::common::{
181+
StringCharLen, char_count_or_boundary, try_as_scalar_i64, try_as_scalar_str,
182+
};
182183

183184
/// Optimized lpad for constant target_len and fill arguments.
184185
fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
@@ -270,27 +271,22 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
270271
let data_capacity = string_array.len().saturating_mul(target_len * 4);
271272
let mut builder =
272273
GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
273-
let mut graphemes_buf = Vec::new();
274274

275275
for maybe_string in string_array.iter() {
276276
match maybe_string {
277-
Some(string) => {
278-
graphemes_buf.clear();
279-
graphemes_buf.extend(string.graphemes(true));
280-
281-
if target_len < graphemes_buf.len() {
282-
let end: usize =
283-
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
284-
builder.append_value(&string[..end]);
285-
} else if fill_chars.is_empty() {
286-
builder.append_value(string);
287-
} else {
288-
let pad_chars = target_len - graphemes_buf.len();
289-
let pad_bytes = char_byte_offsets[pad_chars];
290-
builder.write_str(&padding_buf[..pad_bytes])?;
277+
Some(string) => match char_count_or_boundary(string, target_len) {
278+
StringCharLen::ByteOffset(offset) => {
279+
builder.append_value(&string[..offset]);
280+
}
281+
StringCharLen::CharCount(char_count) => {
282+
if !fill_chars.is_empty() {
283+
let pad_chars = target_len - char_count;
284+
let pad_bytes = char_byte_offsets[pad_chars];
285+
builder.write_str(&padding_buf[..pad_bytes])?;
286+
}
291287
builder.append_value(string);
292288
}
293-
}
289+
},
294290
None => builder.append_null(),
295291
}
296292
}
@@ -378,7 +374,6 @@ where
378374
{
379375
let array = if let Some(fill_array) = fill_array {
380376
let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
381-
let mut graphemes_buf = Vec::new();
382377
let mut fill_chars_buf = Vec::new();
383378

384379
for ((string, target_len), fill) in string_array
@@ -407,8 +402,7 @@ where
407402
}
408403

409404
if string.is_ascii() && fill.is_ascii() {
410-
// ASCII fast path: byte length == character length,
411-
// so we skip expensive grapheme segmentation.
405+
// ASCII fast path: byte length == character length.
412406
let str_len = string.len();
413407
if target_len < str_len {
414408
builder.append_value(&string[..target_len]);
@@ -428,26 +422,24 @@ where
428422
builder.append_value(string);
429423
}
430424
} else {
431-
// Reuse buffers by clearing and refilling
432-
graphemes_buf.clear();
433-
graphemes_buf.extend(string.graphemes(true));
434-
435425
fill_chars_buf.clear();
436426
fill_chars_buf.extend(fill.chars());
437427

438-
if target_len < graphemes_buf.len() {
439-
let end: usize =
440-
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
441-
builder.append_value(&string[..end]);
442-
} else if fill_chars_buf.is_empty() {
443-
builder.append_value(string);
444-
} else {
445-
for l in 0..target_len - graphemes_buf.len() {
446-
let c =
447-
*fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
448-
builder.write_char(c)?;
428+
match char_count_or_boundary(string, target_len) {
429+
StringCharLen::ByteOffset(offset) => {
430+
builder.append_value(&string[..offset]);
431+
}
432+
StringCharLen::CharCount(char_count) => {
433+
if !fill_chars_buf.is_empty() {
434+
for l in 0..target_len - char_count {
435+
let c = *fill_chars_buf
436+
.get(l % fill_chars_buf.len())
437+
.unwrap();
438+
builder.write_char(c)?;
439+
}
440+
}
441+
builder.append_value(string);
449442
}
450-
builder.append_value(string);
451443
}
452444
}
453445
} else {
@@ -458,7 +450,6 @@ where
458450
builder.finish()
459451
} else {
460452
let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
461-
let mut graphemes_buf = Vec::new();
462453

463454
for (string, target_len) in string_array.iter().zip(length_array.iter()) {
464455
if let (Some(string), Some(target_len)) = (string, target_len) {
@@ -491,19 +482,16 @@ where
491482
builder.append_value(string);
492483
}
493484
} else {
494-
// Reuse buffer by clearing and refilling
495-
graphemes_buf.clear();
496-
graphemes_buf.extend(string.graphemes(true));
497-
498-
if target_len < graphemes_buf.len() {
499-
let end: usize =
500-
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
501-
builder.append_value(&string[..end]);
502-
} else {
503-
for _ in 0..(target_len - graphemes_buf.len()) {
504-
builder.write_str(" ")?;
485+
match char_count_or_boundary(string, target_len) {
486+
StringCharLen::ByteOffset(offset) => {
487+
builder.append_value(&string[..offset]);
488+
}
489+
StringCharLen::CharCount(char_count) => {
490+
for _ in 0..(target_len - char_count) {
491+
builder.write_str(" ")?;
492+
}
493+
builder.append_value(string);
505494
}
506-
builder.append_value(string);
507495
}
508496
}
509497
} else {

datafusion/functions/src/unicode/rpad.rs

Lines changed: 40 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ use arrow::array::{
2424
OffsetSizeTrait, StringArrayType, StringViewArray,
2525
};
2626
use arrow::datatypes::DataType;
27-
use unicode_segmentation::UnicodeSegmentation;
2827

2928
use crate::utils::{make_scalar_function, utf8_to_str_type};
3029
use datafusion_common::cast::as_int64_array;
@@ -178,7 +177,9 @@ impl ScalarUDFImpl for RPadFunc {
178177
}
179178
}
180179

181-
use super::common::{try_as_scalar_i64, try_as_scalar_str};
180+
use super::common::{
181+
StringCharLen, char_count_or_boundary, try_as_scalar_i64, try_as_scalar_str,
182+
};
182183

183184
/// Optimized rpad for constant target_len and fill arguments.
184185
fn rpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
@@ -271,28 +272,23 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
271272
let data_capacity = string_array.len().saturating_mul(target_len * 4);
272273
let mut builder =
273274
GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
274-
let mut graphemes_buf = Vec::new();
275275

276276
for maybe_string in string_array.iter() {
277277
match maybe_string {
278-
Some(string) => {
279-
graphemes_buf.clear();
280-
graphemes_buf.extend(string.graphemes(true));
281-
282-
if target_len < graphemes_buf.len() {
283-
let end: usize =
284-
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
285-
builder.append_value(&string[..end]);
286-
} else if fill_chars.is_empty() {
287-
builder.append_value(string);
288-
} else {
289-
let pad_chars = target_len - graphemes_buf.len();
290-
let pad_bytes = char_byte_offsets[pad_chars];
278+
Some(string) => match char_count_or_boundary(string, target_len) {
279+
StringCharLen::ByteOffset(offset) => {
280+
builder.append_value(&string[..offset]);
281+
}
282+
StringCharLen::CharCount(char_count) => {
291283
builder.write_str(string)?;
292-
builder.write_str(&padding_buf[..pad_bytes])?;
284+
if !fill_chars.is_empty() {
285+
let pad_chars = target_len - char_count;
286+
let pad_bytes = char_byte_offsets[pad_chars];
287+
builder.write_str(&padding_buf[..pad_bytes])?;
288+
}
293289
builder.append_value("");
294290
}
295-
}
291+
},
296292
None => builder.append_null(),
297293
}
298294
}
@@ -377,7 +373,6 @@ where
377373
{
378374
let array = if let Some(fill_array) = fill_array {
379375
let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
380-
let mut graphemes_buf = Vec::new();
381376
let mut fill_chars_buf = Vec::new();
382377

383378
for ((string, target_len), fill) in string_array
@@ -406,8 +401,7 @@ where
406401
}
407402

408403
if string.is_ascii() && fill.is_ascii() {
409-
// ASCII fast path: byte length == character length,
410-
// so we skip expensive grapheme segmentation.
404+
// ASCII fast path: byte length == character length.
411405
let str_len = string.len();
412406
if target_len < str_len {
413407
builder.append_value(&string[..target_len]);
@@ -428,26 +422,25 @@ where
428422
builder.append_value("");
429423
}
430424
} else {
431-
graphemes_buf.clear();
432-
graphemes_buf.extend(string.graphemes(true));
433-
434425
fill_chars_buf.clear();
435426
fill_chars_buf.extend(fill.chars());
436427

437-
if target_len < graphemes_buf.len() {
438-
let end: usize =
439-
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
440-
builder.append_value(&string[..end]);
441-
} else if fill_chars_buf.is_empty() {
442-
builder.append_value(string);
443-
} else {
444-
builder.write_str(string)?;
445-
for l in 0..target_len - graphemes_buf.len() {
446-
let c =
447-
*fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
448-
builder.write_char(c)?;
428+
match char_count_or_boundary(string, target_len) {
429+
StringCharLen::ByteOffset(offset) => {
430+
builder.append_value(&string[..offset]);
431+
}
432+
StringCharLen::CharCount(char_count) => {
433+
builder.write_str(string)?;
434+
if !fill_chars_buf.is_empty() {
435+
for l in 0..target_len - char_count {
436+
let c = *fill_chars_buf
437+
.get(l % fill_chars_buf.len())
438+
.unwrap();
439+
builder.write_char(c)?;
440+
}
441+
}
442+
builder.append_value("");
449443
}
450-
builder.append_value("");
451444
}
452445
}
453446
} else {
@@ -458,7 +451,6 @@ where
458451
builder.finish()
459452
} else {
460453
let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
461-
let mut graphemes_buf = Vec::new();
462454

463455
for (string, target_len) in string_array.iter().zip(length_array.iter()) {
464456
if let (Some(string), Some(target_len)) = (string, target_len) {
@@ -492,19 +484,17 @@ where
492484
builder.append_value("");
493485
}
494486
} else {
495-
graphemes_buf.clear();
496-
graphemes_buf.extend(string.graphemes(true));
497-
498-
if target_len < graphemes_buf.len() {
499-
let end: usize =
500-
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
501-
builder.append_value(&string[..end]);
502-
} else {
503-
builder.write_str(string)?;
504-
for _ in 0..(target_len - graphemes_buf.len()) {
505-
builder.write_str(" ")?;
487+
match char_count_or_boundary(string, target_len) {
488+
StringCharLen::ByteOffset(offset) => {
489+
builder.append_value(&string[..offset]);
490+
}
491+
StringCharLen::CharCount(char_count) => {
492+
builder.write_str(string)?;
493+
for _ in 0..(target_len - char_count) {
494+
builder.write_str(" ")?;
495+
}
496+
builder.append_value("");
506497
}
507-
builder.append_value("");
508498
}
509499
}
510500
} else {

0 commit comments

Comments
 (0)