Skip to content

Commit 4f4e814

Browse files
authored
perf: Optimize concat()/concat_ws() UDFs (#20317)
## Which issue does this PR close? - Closes #20316. ## Rationale for this change Faster is better. ## What changes are included in this PR? This commit implements three optimizations: * In `StringViewArrayBuilder`, we recreated `block` after every call to `append_offset`. It is cheaper to instead clear and re-use `block`. * In `StringViewArrayBuilder::write()`, we re-validated that a string array consists of valid UTF8 characters. This was unnecessary work and can be skipped. * In the concat() UDF implementation, we miscalculated the initial size of the StringViewArrayBuilder buffer. This didn't lead to incorrect behavior but it resulted in unnecessarily needing to reallocate the buffer. ## Are these changes tested? Yes; no additional test cases warranted. ## Are there any user-facing changes? No.
1 parent c699361 commit 4f4e814

File tree

3 files changed

+12
-17
lines changed

3 files changed

+12
-17
lines changed

datafusion/functions/src/string/concat.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,9 @@ impl ScalarUDFImpl for ConcatFunc {
204204
DataType::Utf8View => {
205205
let string_array = as_string_view_array(array)?;
206206

207-
data_size += string_array.len();
207+
// This is an estimate; in particular, it will
208+
// undercount arrays of short strings (<= 12 bytes).
209+
data_size += string_array.total_buffer_bytes_used();
208210
let column = if array.is_nullable() {
209211
ColumnarValueRef::NullableStringViewArray(string_array)
210212
} else {

datafusion/functions/src/string/concat_ws.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,8 @@ impl ScalarUDFImpl for ConcatWsFunc {
247247
DataType::Utf8View => {
248248
let string_array = as_string_view_array(array)?;
249249

250+
// This is an estimate; in particular, it will
251+
// undercount arrays of short strings (<= 12 bytes).
250252
data_size += string_array.total_buffer_bytes_used();
251253
let column = if array.is_nullable() {
252254
ColumnarValueRef::NullableStringViewArray(string_array)

datafusion/functions/src/strings.rs

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -152,43 +152,34 @@ impl StringViewArrayBuilder {
152152
}
153153
ColumnarValueRef::NullableArray(array) => {
154154
if !CHECK_VALID || array.is_valid(i) {
155-
self.block.push_str(
156-
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
157-
);
155+
self.block.push_str(array.value(i));
158156
}
159157
}
160158
ColumnarValueRef::NullableLargeStringArray(array) => {
161159
if !CHECK_VALID || array.is_valid(i) {
162-
self.block.push_str(
163-
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
164-
);
160+
self.block.push_str(array.value(i));
165161
}
166162
}
167163
ColumnarValueRef::NullableStringViewArray(array) => {
168164
if !CHECK_VALID || array.is_valid(i) {
169-
self.block.push_str(
170-
std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
171-
);
165+
self.block.push_str(array.value(i));
172166
}
173167
}
174168
ColumnarValueRef::NonNullableArray(array) => {
175-
self.block
176-
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
169+
self.block.push_str(array.value(i));
177170
}
178171
ColumnarValueRef::NonNullableLargeStringArray(array) => {
179-
self.block
180-
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
172+
self.block.push_str(array.value(i));
181173
}
182174
ColumnarValueRef::NonNullableStringViewArray(array) => {
183-
self.block
184-
.push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
175+
self.block.push_str(array.value(i));
185176
}
186177
}
187178
}
188179

189180
pub fn append_offset(&mut self) {
190181
self.builder.append_value(&self.block);
191-
self.block = String::new();
182+
self.block.clear();
192183
}
193184

194185
pub fn finish(mut self) -> StringViewArray {

0 commit comments

Comments
 (0)