Skip to content

Commit b6d46a6

Browse files
authored
perf: Optimize initcap() (#20352)
## Which issue does this PR close?

- Closes #20351.

## Rationale for this change

When all values in a `Utf8`/`LargeUtf8` array are ASCII, we can skip using `GenericStringBuilder` and instead process the entire input buffer in a single pass using byte-level operations. This also avoids recomputing the offsets and nulls arrays. A similar optimization is already used for `lower()` and `upper()`.

Along the way, optimize `initcap_string()` for ASCII-only inputs. It already had an ASCII-only fast path, but there was room for further optimization by iterating over bytes rather than characters.

## What changes are included in this PR?

* Clean up benchmarks: we ran the scalar benchmark for different array sizes, despite the fact that it is invariant to the array size
* Add benchmark for different string lengths
* Add benchmark for Unicode array input
* Optimize for ASCII-only inputs as described above
* Add test case for ASCII-only input that is a sliced array
* Add test case variants for `LargeStringArray`

## Are these changes tested?

Yes, plus an additional test added.

## Are there any user-facing changes?

No.
1 parent 7602913 commit b6d46a6

2 files changed

Lines changed: 276 additions & 29 deletions

File tree

datafusion/functions/benches/initcap.rs

Lines changed: 90 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow::array::OffsetSizeTrait;
18+
use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray, StringViewBuilder};
1919
use arrow::datatypes::{DataType, Field};
2020
use arrow::util::bench_util::{
2121
create_string_array_with_len, create_string_view_array_with_len,
@@ -47,52 +47,124 @@ fn create_args<O: OffsetSizeTrait>(
4747
}
4848
}
4949

50+
/// Create a Utf8 array where every value contains non-ASCII Unicode text.
///
/// The array holds `size` copies of a single mixed-case sample string that
/// combines accented Latin and Cyrillic letters, so no row is ASCII-only.
/// The array is returned as the sole `ColumnarValue::Array` entry of the
/// argument vector.
51+
fn create_unicode_utf8_args(size: usize) -> Vec<ColumnarValue> {
52+
let array = Arc::new(StringArray::from_iter_values(std::iter::repeat_n(
53+
"ñAnDÚ ÁrBOL ОлЕГ ÍslENsku",
54+
size,
55+
))) as ArrayRef;
56+
vec![ColumnarValue::Array(array)]
57+
}
58+
59+
/// Create a Utf8View array where every value contains non-ASCII Unicode text.
///
/// A `StringViewBuilder` created with capacity `size` is filled with `size`
/// copies of the same mixed-case sample string (accented Latin and Cyrillic
/// letters) used by the Utf8 variant. The finished array is returned as the
/// sole `ColumnarValue::Array` entry of the argument vector.
60+
fn create_unicode_utf8view_args(size: usize) -> Vec<ColumnarValue> {
61+
let mut builder = StringViewBuilder::with_capacity(size);
62+
for _ in 0..size {
63+
builder.append_value("ñAnDÚ ÁrBOL ОлЕГ ÍslENsku");
64+
}
65+
let array = Arc::new(builder.finish()) as ArrayRef;
66+
vec![ColumnarValue::Array(array)]
67+
}
68+
5069
fn criterion_benchmark(c: &mut Criterion) {
5170
let initcap = unicode::initcap();
5271
let config_options = Arc::new(ConfigOptions::default());
5372

54-
// Grouped benchmarks for array sizes - to compare with scalar performance
73+
// Array benchmarks: vary both row count and string length
74+
for size in [1024, 4096, 8192] {
75+
for str_len in [16, 128] {
76+
let mut group =
77+
c.benchmark_group(format!("initcap size={size} str_len={str_len}"));
78+
group.sampling_mode(SamplingMode::Flat);
79+
group.sample_size(10);
80+
group.measurement_time(Duration::from_secs(10));
81+
82+
// Utf8
83+
let array_args = create_args::<i32>(size, str_len, false);
84+
let array_arg_fields = vec![Field::new("arg_0", DataType::Utf8, true).into()];
85+
86+
group.bench_function("array_utf8", |b| {
87+
b.iter(|| {
88+
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
89+
args: array_args.clone(),
90+
arg_fields: array_arg_fields.clone(),
91+
number_rows: size,
92+
return_field: Field::new("f", DataType::Utf8, true).into(),
93+
config_options: Arc::clone(&config_options),
94+
}))
95+
})
96+
});
97+
98+
// Utf8View
99+
let array_view_args = create_args::<i32>(size, str_len, true);
100+
let array_view_arg_fields =
101+
vec![Field::new("arg_0", DataType::Utf8View, true).into()];
102+
103+
group.bench_function("array_utf8view", |b| {
104+
b.iter(|| {
105+
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
106+
args: array_view_args.clone(),
107+
arg_fields: array_view_arg_fields.clone(),
108+
number_rows: size,
109+
return_field: Field::new("f", DataType::Utf8View, true).into(),
110+
config_options: Arc::clone(&config_options),
111+
}))
112+
})
113+
});
114+
115+
group.finish();
116+
}
117+
}
118+
119+
// Unicode array benchmarks
55120
for size in [1024, 4096, 8192] {
56-
let mut group = c.benchmark_group(format!("initcap size={size}"));
121+
let mut group = c.benchmark_group(format!("initcap unicode size={size}"));
57122
group.sampling_mode(SamplingMode::Flat);
58123
group.sample_size(10);
59124
group.measurement_time(Duration::from_secs(10));
60125

61-
// Array benchmark - Utf8
62-
let array_args = create_args::<i32>(size, 16, false);
63-
let array_arg_fields = vec![Field::new("arg_0", DataType::Utf8, true).into()];
64-
let batch_len = size;
126+
let unicode_args = create_unicode_utf8_args(size);
127+
let unicode_arg_fields = vec![Field::new("arg_0", DataType::Utf8, true).into()];
65128

66129
group.bench_function("array_utf8", |b| {
67130
b.iter(|| {
68131
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
69-
args: array_args.clone(),
70-
arg_fields: array_arg_fields.clone(),
71-
number_rows: batch_len,
132+
args: unicode_args.clone(),
133+
arg_fields: unicode_arg_fields.clone(),
134+
number_rows: size,
72135
return_field: Field::new("f", DataType::Utf8, true).into(),
73136
config_options: Arc::clone(&config_options),
74137
}))
75138
})
76139
});
77140

78-
// Array benchmark - Utf8View
79-
let array_view_args = create_args::<i32>(size, 16, true);
80-
let array_view_arg_fields =
141+
let unicode_view_args = create_unicode_utf8view_args(size);
142+
let unicode_view_arg_fields =
81143
vec![Field::new("arg_0", DataType::Utf8View, true).into()];
82144

83145
group.bench_function("array_utf8view", |b| {
84146
b.iter(|| {
85147
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
86-
args: array_view_args.clone(),
87-
arg_fields: array_view_arg_fields.clone(),
88-
number_rows: batch_len,
148+
args: unicode_view_args.clone(),
149+
arg_fields: unicode_view_arg_fields.clone(),
150+
number_rows: size,
89151
return_field: Field::new("f", DataType::Utf8View, true).into(),
90152
config_options: Arc::clone(&config_options),
91153
}))
92154
})
93155
});
94156

95-
// Scalar benchmark - Utf8 (the optimization we added)
157+
group.finish();
158+
}
159+
160+
// Scalar benchmarks: independent of array size, run once
161+
{
162+
let mut group = c.benchmark_group("initcap scalar");
163+
group.sampling_mode(SamplingMode::Flat);
164+
group.sample_size(10);
165+
group.measurement_time(Duration::from_secs(10));
166+
167+
// Utf8
96168
let scalar_args = vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
97169
"hello world test string".to_string(),
98170
)))];
@@ -110,7 +182,7 @@ fn criterion_benchmark(c: &mut Criterion) {
110182
})
111183
});
112184

113-
// Scalar benchmark - Utf8View
185+
// Utf8View
114186
let scalar_view_args = vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
115187
"hello world test string".to_string(),
116188
)))];

0 commit comments

Comments
 (0)