Skip to content

Commit 7fa7fe0

Browse files
authored
perf: Optimize split_part for scalar args (#21238)
## Which issue does this PR close?

- Closes #21204.

## Rationale for this change

In practice, `split_part(string, delimiter, position)` is often invoked with constant values for `delimiter` and `position`. We can take advantage of that to hoist some conditional branches out of the per-row hot loop; more importantly, we can switch from using `str::split` to building a `memchr::memmem::Finder` and using it for each row. Building a `Finder` is relatively expensive, but it's a clear win when we can amortize that one-time cost over an entire input batch.

Benchmarks (M4 Max):

- `scalar_utf8_single_char/pos_first`: 105 µs → 41 µs, -61%
- `scalar_utf8_single_char/pos_middle`: 358 µs → 97 µs, -73%
- `scalar_utf8_single_char/pos_negative`: 110 µs → 46 µs, -58%
- `scalar_utf8_multi_char/pos_middle`: 355 µs → 132 µs, -63%
- `scalar_utf8_long_strings/pos_middle`: 1.97 ms → 1.11 ms, -43%
- `scalar_utf8view_long_parts/pos_middle`: 467 µs → 169 µs, -63%
- `array_utf8_single_char/pos_middle`: 351 µs → 357 µs, no change
- `array_utf8_multi_char/pos_middle`: 366 µs → 357 µs, -2.6%

## What changes are included in this PR?

* Add benchmarks for `split_part` with scalar delimiter and position
* Add a new fast path for `split_part` with scalar delimiter and position
* Add SLT tests for `split_part` with scalar delimiter and position

## Are these changes tested?

Yes.

## Are there any user-facing changes?

No.
1 parent a51971b commit 7fa7fe0

File tree

3 files changed

+373
-63
lines changed

3 files changed

+373
-63
lines changed

datafusion/functions/benches/split_part.rs

Lines changed: 81 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
use arrow::array::{ArrayRef, Int64Array, StringArray, StringViewArray};
1919
use arrow::datatypes::{DataType, Field};
2020
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
21+
use datafusion_common::ScalarValue;
2122
use datafusion_common::config::ConfigOptions;
2223
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDF};
2324
use datafusion_functions::string::split_part;
@@ -29,15 +30,15 @@ use std::sync::Arc;
2930

3031
const N_ROWS: usize = 8192;
3132

32-
/// Creates strings with `num_parts` random alphanumeric segments of `part_len`
33-
/// bytes each, joined by `delimiter`.
34-
fn gen_split_part_data(
33+
/// Creates an array of strings with `num_parts` random alphanumeric segments
34+
/// of `part_len` bytes each, joined by `delimiter`.
35+
fn gen_string_array(
3536
n_rows: usize,
3637
num_parts: usize,
3738
part_len: usize,
3839
delimiter: &str,
3940
use_string_view: bool,
40-
) -> (ColumnarValue, ColumnarValue) {
41+
) -> ColumnarValue {
4142
let mut rng = StdRng::seed_from_u64(42);
4243

4344
let mut strings: Vec<String> = Vec::with_capacity(n_rows);
@@ -54,22 +55,12 @@ fn gen_split_part_data(
5455
strings.push(parts.join(delimiter));
5556
}
5657

57-
let delimiters: Vec<String> = vec![delimiter.to_string(); n_rows];
58-
5958
if use_string_view {
6059
let string_array: StringViewArray = strings.into_iter().map(Some).collect();
61-
let delimiter_array: StringViewArray = delimiters.into_iter().map(Some).collect();
62-
(
63-
ColumnarValue::Array(Arc::new(string_array) as ArrayRef),
64-
ColumnarValue::Array(Arc::new(delimiter_array) as ArrayRef),
65-
)
60+
ColumnarValue::Array(Arc::new(string_array) as ArrayRef)
6661
} else {
6762
let string_array: StringArray = strings.into_iter().map(Some).collect();
68-
let delimiter_array: StringArray = delimiters.into_iter().map(Some).collect();
69-
(
70-
ColumnarValue::Array(Arc::new(string_array) as ArrayRef),
71-
ColumnarValue::Array(Arc::new(delimiter_array) as ArrayRef),
72-
)
63+
ColumnarValue::Array(Arc::new(string_array) as ArrayRef)
7364
}
7465
}
7566

@@ -81,12 +72,10 @@ fn bench_split_part(
8172
name: &str,
8273
tag: &str,
8374
strings: ColumnarValue,
84-
delimiters: ColumnarValue,
85-
position: i64,
75+
delimiter: ColumnarValue,
76+
position: ColumnarValue,
8677
) {
87-
let positions: ColumnarValue =
88-
ColumnarValue::Array(Arc::new(Int64Array::from(vec![position; N_ROWS])));
89-
let args = vec![strings, delimiters, positions];
78+
let args = vec![strings, delimiter, position];
9079
let arg_fields: Vec<_> = args
9180
.iter()
9281
.enumerate()
@@ -119,108 +108,143 @@ fn criterion_benchmark(c: &mut Criterion) {
119108
let config_options = Arc::new(ConfigOptions::default());
120109
let mut group = c.benchmark_group("split_part");
121110

122-
// Utf8, single-char delimiter, first position
111+
// ── Scalar delimiter and position ────────────────
112+
113+
// Utf8, single-char delimiter, scalar args
123114
{
124-
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
115+
let strings = gen_string_array(N_ROWS, 10, 8, ".", false);
116+
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8(Some(".".into())));
117+
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(1)));
125118
bench_split_part(
126119
&mut group,
127120
&split_part_func,
128121
&config_options,
129-
"utf8_single_char",
122+
"scalar_utf8_single_char",
130123
"pos_first",
131124
strings,
132-
delimiters,
133-
1,
125+
delimiter,
126+
position,
134127
);
135128
}
136129

137-
// Utf8, single-char delimiter, middle position
138130
{
139-
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
131+
let strings = gen_string_array(N_ROWS, 10, 8, ".", false);
132+
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8(Some(".".into())));
133+
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(5)));
140134
bench_split_part(
141135
&mut group,
142136
&split_part_func,
143137
&config_options,
144-
"utf8_single_char",
138+
"scalar_utf8_single_char",
145139
"pos_middle",
146140
strings,
147-
delimiters,
148-
5,
141+
delimiter,
142+
position,
149143
);
150144
}
151145

152-
// Utf8, single-char delimiter, negative position
153146
{
154-
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
147+
let strings = gen_string_array(N_ROWS, 10, 8, ".", false);
148+
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8(Some(".".into())));
149+
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(-1)));
155150
bench_split_part(
156151
&mut group,
157152
&split_part_func,
158153
&config_options,
159-
"utf8_single_char",
154+
"scalar_utf8_single_char",
160155
"pos_negative",
161156
strings,
162-
delimiters,
163-
-1,
157+
delimiter,
158+
position,
164159
);
165160
}
166161

167-
// Utf8, multi-char delimiter, middle position
162+
// Utf8, multi-char delimiter, scalar args
168163
{
169-
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, "~@~", false);
164+
let strings = gen_string_array(N_ROWS, 10, 8, "~@~", false);
165+
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8(Some("~@~".into())));
166+
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(5)));
170167
bench_split_part(
171168
&mut group,
172169
&split_part_func,
173170
&config_options,
174-
"utf8_multi_char",
171+
"scalar_utf8_multi_char",
175172
"pos_middle",
176173
strings,
177-
delimiters,
178-
5,
174+
delimiter,
175+
position,
179176
);
180177
}
181178

182-
// Utf8View, single-char delimiter, first position
179+
// Utf8, long strings, scalar args
183180
{
184-
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", true);
181+
let strings = gen_string_array(N_ROWS, 50, 16, ".", false);
182+
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8(Some(".".into())));
183+
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(25)));
185184
bench_split_part(
186185
&mut group,
187186
&split_part_func,
188187
&config_options,
189-
"utf8view_single_char",
190-
"pos_first",
188+
"scalar_utf8_long_strings",
189+
"pos_middle",
191190
strings,
192-
delimiters,
193-
1,
191+
delimiter,
192+
position,
194193
);
195194
}
196195

197-
// Utf8, single-char delimiter, many long parts
196+
// Utf8View, long parts, scalar args
197+
{
198+
let strings = gen_string_array(N_ROWS, 10, 32, ".", true);
199+
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into())));
200+
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(5)));
201+
bench_split_part(
202+
&mut group,
203+
&split_part_func,
204+
&config_options,
205+
"scalar_utf8view_long_parts",
206+
"pos_middle",
207+
strings,
208+
delimiter,
209+
position,
210+
);
211+
}
212+
213+
// ── Array delimiter and position ─────────────────
214+
215+
// Utf8, single-char delimiter, array args
198216
{
199-
let (strings, delimiters) = gen_split_part_data(N_ROWS, 50, 16, ".", false);
217+
let strings = gen_string_array(N_ROWS, 10, 8, ".", false);
218+
let delimiters: StringArray = vec![Some("."); N_ROWS].into_iter().collect();
219+
let delimiter = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef);
220+
let positions = ColumnarValue::Array(Arc::new(Int64Array::from(vec![5; N_ROWS])));
200221
bench_split_part(
201222
&mut group,
202223
&split_part_func,
203224
&config_options,
204-
"utf8_long_strings",
225+
"array_utf8_single_char",
205226
"pos_middle",
206227
strings,
207-
delimiters,
208-
25,
228+
delimiter,
229+
positions,
209230
);
210231
}
211232

212-
// Utf8View, single-char delimiter, middle position, long parts
233+
// Utf8, multi-char delimiter, array args
213234
{
214-
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 32, ".", true);
235+
let strings = gen_string_array(N_ROWS, 10, 8, "~@~", false);
236+
let delimiters: StringArray = vec![Some("~@~"); N_ROWS].into_iter().collect();
237+
let delimiter = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef);
238+
let positions = ColumnarValue::Array(Arc::new(Int64Array::from(vec![5; N_ROWS])));
215239
bench_split_part(
216240
&mut group,
217241
&split_part_func,
218242
&config_options,
219-
"utf8view_long_parts",
243+
"array_utf8_multi_char",
220244
"pos_middle",
221245
strings,
222-
delimiters,
223-
5,
246+
delimiter,
247+
positions,
224248
);
225249
}
226250

0 commit comments

Comments
 (0)