Skip to content

Commit bf65460

Browse files
committed
Optimize left, right using Utf8View
1 parent b1ede7c commit bf65460

4 files changed

Lines changed: 290 additions & 125 deletions

File tree

datafusion/functions/src/unicode/common.rs

Lines changed: 112 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,12 @@
1818
//! Common utilities for implementing unicode functions
1919
2020
use arrow::array::{
21-
Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array,
22-
OffsetSizeTrait, StringViewArray, make_view,
21+
Array, ArrayRef, ByteView, GenericStringArray, Int64Array, OffsetSizeTrait,
22+
StringViewArray, make_view,
2323
};
2424
use arrow::datatypes::DataType;
2525
use arrow_buffer::{NullBuffer, ScalarBuffer};
26+
use datafusion_common::Result;
2627
use datafusion_common::ScalarValue;
2728
use datafusion_common::cast::{
2829
as_generic_string_array, as_int64_array, as_string_view_array,
@@ -99,17 +100,17 @@ fn left_right_byte_length(string: &str, n: i64) -> usize {
99100
/// General implementation for `left` and `right` functions
100101
pub(crate) fn general_left_right<F: LeftRightSlicer>(
101102
args: &[ArrayRef],
102-
) -> datafusion_common::Result<ArrayRef> {
103+
) -> Result<ArrayRef> {
103104
let n_array = as_int64_array(&args[1])?;
104105

105106
match args[0].data_type() {
106107
DataType::Utf8 => {
107108
let string_array = as_generic_string_array::<i32>(&args[0])?;
108-
general_left_right_array::<i32, _, F>(string_array, n_array)
109+
general_left_right_array::<i32, F>(string_array, n_array)
109110
}
110111
DataType::LargeUtf8 => {
111112
let string_array = as_generic_string_array::<i64>(&args[0])?;
112-
general_left_right_array::<i64, _, F>(string_array, n_array)
113+
general_left_right_array::<i64, F>(string_array, n_array)
113114
}
114115
DataType::Utf8View => {
115116
let string_view_array = as_string_view_array(&args[0])?;
@@ -119,83 +120,125 @@ pub(crate) fn general_left_right<F: LeftRightSlicer>(
119120
}
120121
}
121122

122-
/// `general_left_right` implementation for strings
123-
fn general_left_right_array<
124-
'a,
125-
T: OffsetSizeTrait,
126-
V: ArrayAccessor<Item = &'a str>,
127-
F: LeftRightSlicer,
128-
>(
129-
string_array: V,
123+
/// Returns true if all offsets in the array fit in u32, meaning the values
124+
/// buffer can be referenced by StringView's u32 offset field.
125+
fn values_fit_in_u32<T: OffsetSizeTrait>(string_array: &GenericStringArray<T>) -> bool {
126+
string_array
127+
.offsets()
128+
.last()
129+
.map(|offset| offset.as_usize() <= u32::MAX as usize)
130+
.unwrap_or(true)
131+
}
132+
133+
/// `left`/`right` for Utf8/LargeUtf8 input.
134+
///
135+
/// When offsets fit in u32, produces a zero-copy `StringViewArray` with views
136+
/// pointing into the input values buffer. Otherwise falls back to building a
137+
/// `StringViewArray` by copying.
138+
fn general_left_right_array<T: OffsetSizeTrait, F: LeftRightSlicer>(
139+
string_array: &GenericStringArray<T>,
130140
n_array: &Int64Array,
131-
) -> datafusion_common::Result<ArrayRef> {
132-
let iter = ArrayIter::new(string_array);
133-
let result = iter
134-
.zip(n_array.iter())
135-
.map(|(string, n)| match (string, n) {
136-
(Some(string), Some(n)) => {
137-
let range = F::slice(string, n);
138-
// Extract a given range from a byte-indexed slice
139-
Some(&string[range])
140-
}
141-
_ => None,
142-
})
143-
.collect::<GenericStringArray<T>>();
141+
) -> Result<ArrayRef> {
142+
if !values_fit_in_u32(string_array) {
143+
let result = string_array
144+
.iter()
145+
.zip(n_array.iter())
146+
.map(|(string, n)| match (string, n) {
147+
(Some(string), Some(n)) => Some(&string[F::slice(string, n)]),
148+
_ => None,
149+
})
150+
.collect::<StringViewArray>();
151+
return Ok(Arc::new(result) as ArrayRef);
152+
}
153+
154+
let len = string_array.len();
155+
let offsets = string_array.value_offsets();
156+
let nulls = NullBuffer::union(string_array.nulls(), n_array.nulls());
144157

145-
Ok(Arc::new(result) as ArrayRef)
158+
let mut views_buf = Vec::with_capacity(len);
159+
let mut has_out_of_line = false;
160+
161+
for (i, offset) in offsets.iter().enumerate().take(len) {
162+
if nulls.as_ref().is_some_and(|n| !n.is_valid(i)) {
163+
views_buf.push(0);
164+
continue;
165+
}
166+
167+
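// SAFETY: `i < len == string_array.len()` (loop is bounded by `take(len)`), so the index is in bounds; the null check above only skips null rows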
168+
let string = unsafe { string_array.value_unchecked(i) };
169+
let n = n_array.value(i);
170+
let range = F::slice(string, n);
171+
let result_bytes = &string.as_bytes()[range.clone()];
172+
if result_bytes.len() > 12 {
173+
has_out_of_line = true;
174+
}
175+
176+
let buf_offset = offset.as_usize() as u32 + range.start as u32;
177+
views_buf.push(make_view(result_bytes, 0, buf_offset));
178+
}
179+
180+
let views = ScalarBuffer::from(views_buf);
181+
let data_buffers = if has_out_of_line {
182+
vec![string_array.values().clone()]
183+
} else {
184+
vec![]
185+
};
186+
187+
// SAFETY:
188+
// - Each view is produced by `make_view` with correct bytes and offset
189+
// - Out-of-line views reference buffer index 0, which is the original
190+
// values buffer included in data_buffers when has_out_of_line is true
191+
// - values_fit_in_u32 guarantees all offsets fit in u32
192+
unsafe {
193+
let array = StringViewArray::new_unchecked(views, data_buffers, nulls);
194+
Ok(Arc::new(array) as ArrayRef)
195+
}
146196
}
147197

148-
/// `general_left_right` implementation for StringViewArray
198+
/// `general_left_right` for StringViewArray input.
149199
fn general_left_right_view<F: LeftRightSlicer>(
150200
string_view_array: &StringViewArray,
151201
n_array: &Int64Array,
152-
) -> datafusion_common::Result<ArrayRef> {
153-
let len = n_array.len();
154-
202+
) -> Result<ArrayRef> {
155203
let views = string_view_array.views();
156-
// Every string in StringViewArray has one corresponding view in `views`
157-
debug_assert!(views.len() == string_view_array.len());
158-
159-
// Compose null buffer at once
160-
let string_nulls = string_view_array.nulls();
161-
let n_nulls = n_array.nulls();
162-
let new_nulls = NullBuffer::union(string_nulls, n_nulls);
204+
let new_nulls = NullBuffer::union(string_view_array.nulls(), n_array.nulls());
205+
let len = n_array.len();
206+
let mut has_out_of_line = false;
163207

164208
let new_views = (0..len)
165209
.map(|idx| {
166-
let view = views[idx];
167-
168-
let is_valid = match &new_nulls {
169-
Some(nulls_buf) => nulls_buf.is_valid(idx),
170-
None => true,
171-
};
172-
173-
if is_valid {
174-
let string: &str = string_view_array.value(idx);
175-
let n = n_array.value(idx);
176-
177-
// Input string comes from StringViewArray, so it should fit in 32-bit length
178-
let range = F::slice(string, n);
179-
let result_bytes = &string.as_bytes()[range.clone()];
180-
181-
let byte_view = ByteView::from(view);
182-
// New offset starts at 0 for left, and at `range.start` for right,
183-
// which is encoded in the given range
184-
let new_offset = byte_view.offset + (range.start as u32);
185-
// Reuse buffer
186-
make_view(result_bytes, byte_view.buffer_index, new_offset)
187-
} else {
188-
// For nulls, keep the original view
189-
view
210+
if new_nulls.as_ref().is_some_and(|n| !n.is_valid(idx)) {
211+
return 0;
190212
}
213+
214+
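// SAFETY: `idx < n_array.len()`; in bounds only if `string_view_array` has the same length as `n_array` (assumed for function arguments — TODO confirm)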
215+
let string: &str = unsafe { string_view_array.value_unchecked(idx) };
216+
let n = n_array.value(idx);
217+
218+
let range = F::slice(string, n);
219+
let result_bytes = &string.as_bytes()[range.clone()];
220+
if result_bytes.len() > 12 {
221+
has_out_of_line = true;
222+
}
223+
224+
let byte_view = ByteView::from(views[idx]);
225+
let new_offset = byte_view.offset + (range.start as u32);
226+
make_view(result_bytes, byte_view.buffer_index, new_offset)
191227
})
192228
.collect::<Vec<u128>>();
193229

194-
// Buffers are unchanged
195-
let result = StringViewArray::try_new(
196-
ScalarBuffer::from(new_views),
197-
Vec::from(string_view_array.data_buffers()),
198-
new_nulls,
199-
)?;
200-
Ok(Arc::new(result) as ArrayRef)
230+
let views = ScalarBuffer::from(new_views);
231+
let data_buffers = if has_out_of_line {
232+
string_view_array.data_buffers().to_vec()
233+
} else {
234+
vec![]
235+
};
236+
237+
// SAFETY:
238+
// - Each view is produced by `make_view` with correct bytes and offset
239+
// - Out-of-line views reuse the original buffer index and adjusted offset
240+
unsafe {
241+
let array = StringViewArray::new_unchecked(views, data_buffers, new_nulls);
242+
Ok(Arc::new(array) as ArrayRef)
243+
}
201244
}

datafusion/functions/src/unicode/left.rs

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ impl ScalarUDFImpl for LeftFunc {
7979
&self.signature
8080
}
8181

82-
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
83-
Ok(arg_types[0].clone())
82+
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
83+
Ok(DataType::Utf8View)
8484
}
8585

8686
/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters.
@@ -108,8 +108,8 @@ impl ScalarUDFImpl for LeftFunc {
108108

109109
#[cfg(test)]
110110
mod tests {
111-
use arrow::array::{Array, StringArray, StringViewArray};
112-
use arrow::datatypes::DataType::{Utf8, Utf8View};
111+
use arrow::array::{Array, StringViewArray};
112+
use arrow::datatypes::DataType::Utf8View;
113113

114114
use datafusion_common::{Result, ScalarValue};
115115
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -127,8 +127,8 @@ mod tests {
127127
],
128128
Ok(Some("ab")),
129129
&str,
130-
Utf8,
131-
StringArray
130+
Utf8View,
131+
StringViewArray
132132
);
133133
test_function!(
134134
LeftFunc::new(),
@@ -138,8 +138,8 @@ mod tests {
138138
],
139139
Ok(Some("abcde")),
140140
&str,
141-
Utf8,
142-
StringArray
141+
Utf8View,
142+
StringViewArray
143143
);
144144
test_function!(
145145
LeftFunc::new(),
@@ -149,8 +149,8 @@ mod tests {
149149
],
150150
Ok(Some("abc")),
151151
&str,
152-
Utf8,
153-
StringArray
152+
Utf8View,
153+
StringViewArray
154154
);
155155
test_function!(
156156
LeftFunc::new(),
@@ -160,8 +160,8 @@ mod tests {
160160
],
161161
Ok(Some("")),
162162
&str,
163-
Utf8,
164-
StringArray
163+
Utf8View,
164+
StringViewArray
165165
);
166166
test_function!(
167167
LeftFunc::new(),
@@ -171,8 +171,8 @@ mod tests {
171171
],
172172
Ok(Some("")),
173173
&str,
174-
Utf8,
175-
StringArray
174+
Utf8View,
175+
StringViewArray
176176
);
177177
test_function!(
178178
LeftFunc::new(),
@@ -182,8 +182,8 @@ mod tests {
182182
],
183183
Ok(Some("")),
184184
&str,
185-
Utf8,
186-
StringArray
185+
Utf8View,
186+
StringViewArray
187187
);
188188
test_function!(
189189
LeftFunc::new(),
@@ -193,8 +193,8 @@ mod tests {
193193
],
194194
Ok(None),
195195
&str,
196-
Utf8,
197-
StringArray
196+
Utf8View,
197+
StringViewArray
198198
);
199199
test_function!(
200200
LeftFunc::new(),
@@ -204,8 +204,8 @@ mod tests {
204204
],
205205
Ok(None),
206206
&str,
207-
Utf8,
208-
StringArray
207+
Utf8View,
208+
StringViewArray
209209
);
210210
test_function!(
211211
LeftFunc::new(),
@@ -215,8 +215,8 @@ mod tests {
215215
],
216216
Ok(Some("joséé")),
217217
&str,
218-
Utf8,
219-
StringArray
218+
Utf8View,
219+
StringViewArray
220220
);
221221
test_function!(
222222
LeftFunc::new(),
@@ -226,8 +226,8 @@ mod tests {
226226
],
227227
Ok(Some("joséé")),
228228
&str,
229-
Utf8,
230-
StringArray
229+
Utf8View,
230+
StringViewArray
231231
);
232232
#[cfg(not(feature = "unicode_expressions"))]
233233
test_function!(
@@ -240,8 +240,8 @@ mod tests {
240240
"function left requires compilation with feature flag: unicode_expressions."
241241
),
242242
&str,
243-
Utf8,
244-
StringArray
243+
Utf8View,
244+
StringViewArray
245245
);
246246

247247
// StringView cases
@@ -307,8 +307,8 @@ mod tests {
307307
],
308308
Ok(Some(expected.as_str())),
309309
&str,
310-
Utf8,
311-
StringArray
310+
Utf8View,
311+
StringViewArray
312312
);
313313
}
314314

0 commit comments

Comments
 (0)