Skip to content

Commit 961c5fc

Browse files
neilconwaymartin-g
andauthored
perf: Optimize NULL handling in StringViewArrayBuilder (#21538)
## Which issue does this PR close? - Closes #21537. ## Rationale for this change `StringViewArrayBuilder` is implemented on top of Arrow's `StringViewBuilder`; the latter tracks NULLs incrementally. However, the `StringViewArrayBuilder` requires callers to pass a NULL buffer to `finish()` anyway, so the NULL bitmap that has been computed by `StringViewBuilder` is discarded. It would be more efficient to stop using `StringViewBuilder` so that we don't do this redundant work; in theory there might be room for inconsistency between the two NULL bitmaps as well. Right now, `StringViewArrayBuilder` is only used by the `concat` and `concat_ws` UDFs, but I'd like to generalize the API and use it more broadly in place of `StringViewBuilder` (#21539). For the time being, here are the results of this PR on the `concat` benchmarks (Arm64): ``` - 1024 rows: 29.6 µs → 28.0 µs, -5.3% - 4096 rows: 134.3 µs → 125.6 µs, -6.5% - 8192 rows: 289.7 µs → 273.5 µs, -5.6% ``` ## What changes are included in this PR? * Stop using `StringViewBuilder` and build the views ourselves * Improve some comments * Return an error instead of panicking on large input strings ## Are these changes tested? Yes. ## Are there any user-facing changes? No. --------- Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
1 parent 776b723 commit 961c5fc

3 files changed

Lines changed: 72 additions & 36 deletions

File tree

datafusion/functions/src/string/concat.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ impl ScalarUDFImpl for ConcatFunc {
247247
columns
248248
.iter()
249249
.for_each(|column| builder.write::<true>(column, i));
250-
builder.append_offset();
250+
builder.append_offset()?;
251251
}
252252

253253
let string_array = builder.finish(None)?;
@@ -271,7 +271,7 @@ impl ScalarUDFImpl for ConcatFunc {
271271
columns
272272
.iter()
273273
.for_each(|column| builder.write::<true>(column, i));
274-
builder.append_offset();
274+
builder.append_offset()?;
275275
}
276276

277277
let string_array = builder.finish(None)?;

datafusion/functions/src/string/concat_ws.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ impl ScalarUDFImpl for ConcatWsFunc {
335335
let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size);
336336
for i in 0..len {
337337
if !sep.is_valid(i) {
338-
builder.append_offset();
338+
builder.append_offset()?;
339339
continue;
340340
}
341341
let mut first = true;
@@ -348,15 +348,15 @@ impl ScalarUDFImpl for ConcatWsFunc {
348348
first = false;
349349
}
350350
}
351-
builder.append_offset();
351+
builder.append_offset()?;
352352
}
353353
Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?)))
354354
}
355355
_ => {
356356
let mut builder = StringArrayBuilder::with_capacity(len, data_size);
357357
for i in 0..len {
358358
if !sep.is_valid(i) {
359-
builder.append_offset();
359+
builder.append_offset()?;
360360
continue;
361361
}
362362
let mut first = true;
@@ -369,7 +369,7 @@ impl ScalarUDFImpl for ConcatWsFunc {
369369
first = false;
370370
}
371371
}
372-
builder.append_offset();
372+
builder.append_offset()?;
373373
}
374374
Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?)))
375375
}

datafusion/functions/src/strings.rs

Lines changed: 66 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ use datafusion_common::{Result, exec_datafusion_err, internal_err};
2121

2222
use arrow::array::{
2323
Array, ArrayAccessor, ArrayDataBuilder, BinaryArray, ByteView, LargeStringArray,
24-
StringArray, StringViewArray, StringViewBuilder, make_view,
24+
StringArray, StringViewArray, make_view,
2525
};
26-
use arrow::buffer::{MutableBuffer, NullBuffer};
26+
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer};
2727
use arrow::datatypes::DataType;
2828

2929
/// Optimized version of the StringBuilder in Arrow that:
@@ -106,13 +106,14 @@ impl StringArrayBuilder {
106106
}
107107
}
108108

109-
pub fn append_offset(&mut self) {
109+
pub fn append_offset(&mut self) -> Result<()> {
110110
let next_offset: i32 = self
111111
.value_buffer
112112
.len()
113113
.try_into()
114-
.expect("byte array offset overflow");
114+
.map_err(|_| exec_datafusion_err!("byte array offset overflow"))?;
115115
self.offsets_buffer.push(next_offset);
116+
Ok(())
116117
}
117118

118119
/// Finalize the builder into a concrete [`StringArray`].
@@ -150,18 +151,25 @@ impl StringArrayBuilder {
150151
}
151152
}
152153

154+
/// Optimized version of Arrow's [`StringViewBuilder`]. Rather than adding NULLs
155+
/// on a row-by-row basis, the caller should provide nulls when calling
156+
/// [`finish`](Self::finish). This allows callers to compute nulls more
157+
/// efficiently (e.g., via bulk bitmap operations).
158+
///
159+
/// [`StringViewBuilder`]: arrow::array::StringViewBuilder
153160
pub struct StringViewArrayBuilder {
154-
builder: StringViewBuilder,
161+
views: Vec<u128>,
162+
data: Vec<u8>,
155163
block: Vec<u8>,
156164
/// If true, a safety check is required during the `append_offset` call
157165
tainted: bool,
158166
}
159167

160168
impl StringViewArrayBuilder {
161-
pub fn with_capacity(item_capacity: usize, _data_capacity: usize) -> Self {
162-
let builder = StringViewBuilder::with_capacity(item_capacity);
169+
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
163170
Self {
164-
builder,
171+
views: Vec::with_capacity(item_capacity),
172+
data: Vec::with_capacity(data_capacity),
165173
block: vec![],
166174
tainted: false,
167175
}
@@ -214,16 +222,29 @@ impl StringViewArrayBuilder {
214222
}
215223
}
216224

225+
/// Finalizes the current row by converting the accumulated data into a
226+
/// StringView and appending it to the views buffer.
217227
pub fn append_offset(&mut self) -> Result<()> {
218-
let block_str = if self.tainted {
228+
if self.tainted {
219229
std::str::from_utf8(&self.block)
220-
.map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary literal"))?
230+
.map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary literal"))?;
231+
}
232+
233+
let v = &self.block;
234+
if v.len() > 12 {
235+
let offset: u32 = self
236+
.data
237+
.len()
238+
.try_into()
239+
.map_err(|_| exec_datafusion_err!("byte array offset overflow"))?;
240+
self.data.extend_from_slice(v);
241+
self.views.push(make_view(v, 0, offset));
221242
} else {
222-
// SAFETY: all data that was appended was valid UTF8
223-
unsafe { std::str::from_utf8_unchecked(&self.block) }
224-
};
225-
self.builder.append_value(block_str);
243+
self.views.push(make_view(v, 0, 0));
244+
}
245+
226246
self.block.clear();
247+
self.tainted = false;
227248
Ok(())
228249
}
229250

@@ -233,21 +254,35 @@ impl StringViewArrayBuilder {
233254
///
234255
/// Returns an error when:
235256
///
236-
/// - the provided `null_buffer` does not match amount of `append_offset` calls.
237-
pub fn finish(mut self, null_buffer: Option<NullBuffer>) -> Result<StringViewArray> {
238-
let array = self.builder.finish();
239-
match null_buffer {
240-
Some(nulls) if nulls.len() != array.len() => {
241-
internal_err!("Null buffer and views buffer must be the same length")
242-
}
243-
Some(nulls) => {
244-
let array_builder = array.into_data().into_builder().nulls(Some(nulls));
245-
// SAFETY: the underlying data is valid; we are only adding a null buffer
246-
let array_data = unsafe { array_builder.build_unchecked() };
247-
Ok(StringViewArray::from(array_data))
248-
}
249-
None => Ok(array),
257+
/// - the provided `null_buffer` length does not match the row count.
258+
pub fn finish(self, null_buffer: Option<NullBuffer>) -> Result<StringViewArray> {
259+
if let Some(ref nulls) = null_buffer
260+
&& nulls.len() != self.views.len()
261+
{
262+
return internal_err!(
263+
"Null buffer length ({}) must match row count ({})",
264+
nulls.len(),
265+
self.views.len()
266+
);
250267
}
268+
269+
let buffers: Vec<Buffer> = if self.data.is_empty() {
270+
vec![]
271+
} else {
272+
vec![Buffer::from(self.data)]
273+
};
274+
275+
// SAFETY: views were constructed with correct lengths, offsets, and
276+
// prefixes. UTF-8 validity was checked in append_offset() for any row
277+
// where tainted data (e.g., binary literals) was appended.
278+
let array = unsafe {
279+
StringViewArray::new_unchecked(
280+
ScalarBuffer::from(self.views),
281+
buffers,
282+
null_buffer,
283+
)
284+
};
285+
Ok(array)
251286
}
252287
}
253288

@@ -328,13 +363,14 @@ impl LargeStringArrayBuilder {
328363
}
329364
}
330365

331-
pub fn append_offset(&mut self) {
366+
pub fn append_offset(&mut self) -> Result<()> {
332367
let next_offset: i64 = self
333368
.value_buffer
334369
.len()
335370
.try_into()
336-
.expect("byte array offset overflow");
371+
.map_err(|_| exec_datafusion_err!("byte array offset overflow"))?;
337372
self.offsets_buffer.push(next_offset);
373+
Ok(())
338374
}
339375

340376
/// Finalize the builder into a concrete [`LargeStringArray`].

0 commit comments

Comments
 (0)