Skip to content

Commit 711be08

Browse files
authored
feat: add fast paths for substr and string column concat (#19628)
* perf: add fast paths for substr and string column concat
* test(functions): cover substr fast paths
* fix(expression): compact sparse string buffers in concat
1 parent 73c2d6e commit 711be08

5 files changed

Lines changed: 253 additions & 1 deletion

File tree

src/query/expression/src/kernels/concat.rs

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ use crate::types::map::KvColumnBuilder;
4949
use crate::types::nullable::NullableColumn;
5050
use crate::types::number::NumberColumn;
5151
use crate::types::opaque::OpaqueType;
52+
use crate::types::string::StringColumn;
53+
use crate::types::string::StringColumnBuilder;
5254
use crate::types::timestamp_tz::TimestampTzType;
5355
use crate::types::vector::VectorColumnBuilder;
5456
use crate::with_decimal_mapped_type;
@@ -242,11 +244,14 @@ impl Column {
242244
Column::Vector(builder.build())
243245
}
244246
}),
247+
Column::String(_) => Column::String(Self::concat_string_types(
248+
columns.map(|col| col.into_string().unwrap()),
249+
capacity,
250+
)),
245251
Column::Variant(_)
246252
| Column::Geometry(_)
247253
| Column::Geography(_)
248254
| Column::Binary(_)
249-
| Column::String(_)
250255
| Column::Bitmap(_) => {
251256
Self::concat_use_arrow(columns, first_column.data_type(), capacity)
252257
}
@@ -268,6 +273,17 @@ impl Column {
268273
builder.into()
269274
}
270275

276+
/// Concatenates a sequence of string columns into one [`StringColumn`].
///
/// `num_rows` is handed to `StringColumnBuilder::with_capacity` as the
/// builder's initial capacity. Columns are appended in iteration order through
/// `append_column_for_concat`, and the finished builder is returned.
pub fn concat_string_types(
    cols: impl Iterator<Item = StringColumn>,
    num_rows: usize,
) -> StringColumn {
    let mut merged = StringColumnBuilder::with_capacity(num_rows);
    cols.for_each(|column| merged.append_column_for_concat(&column));
    merged.build()
}
286+
271287
fn concat_opaque_column<I, const N: usize>(columns: I, capacity: usize) -> Column
272288
where I: Iterator<Item = Column> + TrustedLen + Clone {
273289
let buffer = Self::concat_primitive_types(
@@ -307,3 +323,53 @@ impl Column {
307323
T::upcast_column_with_type(T::build_column(builder), data_type)
308324
}
309325
}
326+
327+
#[cfg(test)]
mod tests {
    use crate::Column;
    use crate::types::string::StringColumn;

    /// Concatenating two string columns yields every row of the first column
    /// followed by every row of the second, for both short and long values.
    #[test]
    fn test_concat_string_columns_fast_path() {
        let head_rows = [
            "existing short",
            "existing long string that definitely exceeds inline storage!",
        ];
        let tail_rows = [
            "append short",
            "append another extremely long string to force buffer usage!!!",
        ];
        let inputs = vec![
            Column::String(StringColumn::from_iter(head_rows)),
            Column::String(StringColumn::from_iter(tail_rows)),
        ];

        let merged = Column::concat_columns(inputs.into_iter())
            .unwrap()
            .into_string()
            .unwrap();

        let expected = [head_rows, tail_rows].concat();
        assert_eq!(merged.iter().collect::<Vec<_>>(), expected);
    }

    /// Concatenating two one-row slices of an eight-row column must produce a
    /// result whose buffers are smaller than the original column's buffers.
    #[test]
    fn test_concat_string_columns_compacts_sparse_buffers() {
        let filler = "x".repeat(20_000);
        let row = |idx: i32| format!("{idx}-{filler}");

        let dense = StringColumn::from_iter((0..8).map(&row));
        let dense_buffer_len = dense.total_buffer_len();
        // Keep only the first and the last row; each slice still comes from `dense`.
        let inputs = vec![
            Column::String(dense.clone().sliced(0, 1)),
            Column::String(dense.clone().sliced(7, 1)),
        ];

        let merged = Column::concat_columns(inputs.into_iter())
            .unwrap()
            .into_string()
            .unwrap();

        assert_eq!(merged.iter().collect::<Vec<_>>(), vec![row(0), row(7)]);
        assert!(
            merged.total_buffer_len() < dense_buffer_len,
            "concat should compact sparse string buffers instead of retaining them"
        );
    }
}

src/query/expression/src/types/string.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,21 @@ impl StringColumnBuilder {
342342
self.data.extend_values(other.iter());
343343
}
344344

345+
pub fn append_column_for_concat(&mut self, other: &StringColumn) {
346+
debug_assert!(
347+
self.row_buffer.is_empty(),
348+
"append_column_for_concat expects no pending row data"
349+
);
350+
351+
let source = other.clone().maybe_gc();
352+
unsafe {
353+
self.data.append_views_unchecked(
354+
source.views().as_slice().iter(),
355+
source.data_buffers().as_ref(),
356+
);
357+
}
358+
}
359+
345360
pub fn build(self) -> StringColumn {
346361
self.data.into()
347362
}
@@ -397,6 +412,29 @@ impl StringColumnBuilder {
397412
}
398413
}
399414

415+
#[cfg(test)]
mod tests {
    use super::StringColumn;
    use super::StringColumnBuilder;

    /// `append_column` on a one-row slice of a larger column must produce a
    /// column whose buffers are smaller than the slice's own buffers.
    #[test]
    fn test_append_column_copies_sparse_string_buffers() {
        let filler = "x".repeat(20_000);
        let dense = StringColumn::from_iter((0..8).map(|idx| format!("{idx}-{filler}")));
        // Only the last row is kept.
        let last_row_only = dense.sliced(7, 1);

        let mut target = StringColumnBuilder::with_capacity(1);
        target.append_column(&last_row_only);
        let rebuilt = target.build();

        let rows: Vec<_> = rebuilt.iter().collect();
        assert_eq!(rows, vec![format!("7-{filler}")]);
        assert!(
            rebuilt.total_buffer_len() < last_row_only.total_buffer_len(),
            "append_column should not retain the full source buffers"
        );
    }
}
437+
400438
impl NullableColumnBuilder<StringType> {
401439
pub fn take_from_views(
402440
views: &[ColumnView<NullableType<StringType>>],

src/query/functions/src/scalars/string.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1056,6 +1056,16 @@ fn substr(builder: &mut StringColumnBuilder, str: &str, pos: i64, len: u64) {
10561056
return;
10571057
}
10581058

1059+
if str.is_ascii() {
1060+
substr_ascii(builder, str, pos, len);
1061+
return;
1062+
}
1063+
1064+
if len == 1 && pos > 0 {
1065+
substr_one_char(builder, str, pos);
1066+
return;
1067+
}
1068+
10591069
let char_len = str.chars().count();
10601070
let start = if pos > 0 {
10611071
(pos - 1).min(char_len as i64) as usize
@@ -1068,3 +1078,81 @@ fn substr(builder: &mut StringColumnBuilder, str: &str, pos: i64, len: u64) {
10681078
builder.put_char_iter(str.chars().skip(start).take(len as usize));
10691079
builder.commit_row();
10701080
}
1081+
1082+
#[inline]
1083+
fn substr_ascii(builder: &mut StringColumnBuilder, str: &str, pos: i64, len: u64) {
1084+
let byte_len = str.len();
1085+
let start = if pos > 0 {
1086+
(pos - 1).min(byte_len as i64) as usize
1087+
} else {
1088+
byte_len
1089+
.checked_sub(pos.unsigned_abs() as usize)
1090+
.unwrap_or(byte_len)
1091+
};
1092+
1093+
let end = start.saturating_add(len as usize).min(byte_len);
1094+
builder.put_slice(&str.as_bytes()[start..end]);
1095+
builder.commit_row();
1096+
}
1097+
1098+
#[inline]
1099+
fn substr_one_char(builder: &mut StringColumnBuilder, str: &str, pos: i64) {
1100+
let target = (pos - 1) as usize;
1101+
let bytes = str.as_bytes();
1102+
let mut byte_idx = 0;
1103+
let mut char_idx = 0;
1104+
1105+
while byte_idx < bytes.len() {
1106+
if char_idx == target {
1107+
let start = byte_idx;
1108+
byte_idx += 1;
1109+
while byte_idx < bytes.len() && is_utf8_continuation_byte(bytes[byte_idx]) {
1110+
byte_idx += 1;
1111+
}
1112+
builder.put_slice(&bytes[start..byte_idx]);
1113+
builder.commit_row();
1114+
return;
1115+
}
1116+
1117+
byte_idx += 1;
1118+
while byte_idx < bytes.len() && is_utf8_continuation_byte(bytes[byte_idx]) {
1119+
byte_idx += 1;
1120+
}
1121+
char_idx += 1;
1122+
}
1123+
1124+
builder.commit_row();
1125+
}
1126+
1127+
/// Returns `true` when `byte` has the bit pattern `10xx_xxxx`, i.e. it is a
/// UTF-8 continuation byte rather than the first byte of a character.
#[inline]
fn is_utf8_continuation_byte(byte: u8) -> bool {
    matches!(byte, 0b1000_0000..=0b1011_1111)
}
1131+
1132+
#[cfg(test)]
mod tests {
    use databend_common_expression::types::string::StringColumnBuilder;

    use super::substr;

    /// Runs `substr` against a fresh single-row builder and returns the row.
    fn eval_substr(input: &str, pos: i64, len: u64) -> String {
        let mut single_row = StringColumnBuilder::with_capacity(1);
        substr(&mut single_row, input, pos, len);
        single_row.build_scalar()
    }

    /// Checks each `(input, pos, len, expected)` case in order.
    fn assert_cases(cases: &[(&str, i64, u64, &str)]) {
        for &(input, pos, len, expected) in cases {
            assert_eq!(
                eval_substr(input, pos, len),
                expected,
                "substr({input:?}, {pos}, {len})"
            );
        }
    }

    #[test]
    fn test_substr_ascii_fast_path() {
        assert_cases(&[
            ("abcdef", 2, 3, "bcd"),
            ("abcdef", -2, 2, "ef"),
            ("abcdef", 20, 1, ""),
        ]);
    }

    #[test]
    fn test_substr_single_char_fast_path() {
        assert_cases(&[
            ("abcdef", 3, 1, "c"),
            ("你好世界", 3, 1, "世"),
            ("a你b", 2, 1, "你"),
            ("こんにちは", 2, 1, "ん"),
        ]);
    }
}

src/query/functions/tests/it/scalars/string.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,8 +736,14 @@ fn test_substr(file: &mut impl Write) {
736736
run_ast(file, "substr('Sakila' from -4 for 2)", &[]);
737737
run_ast(file, "substr('sakila' FROM -4)", &[]);
738738
run_ast(file, "substr('abc',2)", &[]);
739+
run_ast(file, "substr('abcdef', 2, 3)", &[]);
740+
run_ast(file, "substr('abcdef', -2, 2)", &[]);
741+
run_ast(file, "substr('abcdef', 20, 1)", &[]);
739742
run_ast(file, "substr('你好世界', 3)", &[]);
743+
run_ast(file, "substr('你好世界', 3, 1)", &[]);
744+
run_ast(file, "substr('a你b', 2, 1)", &[]);
740745
run_ast(file, "substr('こんにちは', 2)", &[]);
746+
run_ast(file, "substr('こんにちは', 2, 1)", &[]);
741747
run_ast(file, "substr('abc', pos, len)", &[
742748
(
743749
"pos",

src/query/functions/tests/it/scalars/testdata/string.txt

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3594,6 +3594,33 @@ output domain : {"bc"..="bc"}
35943594
output : 'bc'
35953595

35963596

3597+
ast : substr('abcdef', 2, 3)
3598+
raw expr : substr('abcdef', 2, 3)
3599+
checked expr : substr<String, Int64, UInt64>("abcdef", CAST<UInt8>(2_u8 AS Int64), CAST<UInt8>(3_u8 AS UInt64))
3600+
optimized expr : "bcd"
3601+
output type : String
3602+
output domain : {"bcd"..="bcd"}
3603+
output : 'bcd'
3604+
3605+
3606+
ast : substr('abcdef', -2, 2)
3607+
raw expr : substr('abcdef', -2, 2)
3608+
checked expr : substr<String, Int64, UInt64>("abcdef", CAST<Int8>(-2_i8 AS Int64), CAST<UInt8>(2_u8 AS UInt64))
3609+
optimized expr : "ef"
3610+
output type : String
3611+
output domain : {"ef"..="ef"}
3612+
output : 'ef'
3613+
3614+
3615+
ast : substr('abcdef', 20, 1)
3616+
raw expr : substr('abcdef', 20, 1)
3617+
checked expr : substr<String, Int64, UInt64>("abcdef", CAST<UInt8>(20_u8 AS Int64), CAST<UInt8>(1_u8 AS UInt64))
3618+
optimized expr : ""
3619+
output type : String
3620+
output domain : {""..=""}
3621+
output : ''
3622+
3623+
35973624
ast : substr('你好世界', 3)
35983625
raw expr : substr('你好世界', 3)
35993626
checked expr : substr<String, Int64>("你好世界", CAST<UInt8>(3_u8 AS Int64))
@@ -3603,6 +3630,24 @@ output domain : {"世界"..="世界"}
36033630
output : '世界'
36043631

36053632

3633+
ast : substr('你好世界', 3, 1)
3634+
raw expr : substr('你好世界', 3, 1)
3635+
checked expr : substr<String, Int64, UInt64>("你好世界", CAST<UInt8>(3_u8 AS Int64), CAST<UInt8>(1_u8 AS UInt64))
3636+
optimized expr : "世"
3637+
output type : String
3638+
output domain : {"世"..="世"}
3639+
output : '世'
3640+
3641+
3642+
ast : substr('a你b', 2, 1)
3643+
raw expr : substr('a你b', 2, 1)
3644+
checked expr : substr<String, Int64, UInt64>("a你b", CAST<UInt8>(2_u8 AS Int64), CAST<UInt8>(1_u8 AS UInt64))
3645+
optimized expr : "你"
3646+
output type : String
3647+
output domain : {"你"..="你"}
3648+
output : '你'
3649+
3650+
36063651
ast : substr('こんにちは', 2)
36073652
raw expr : substr('こんにちは', 2)
36083653
checked expr : substr<String, Int64>("こんにちは", CAST<UInt8>(2_u8 AS Int64))
@@ -3612,6 +3657,15 @@ output domain : {"んにちは"..="んにちは"}
36123657
output : 'んにちは'
36133658

36143659

3660+
ast : substr('こんにちは', 2, 1)
3661+
raw expr : substr('こんにちは', 2, 1)
3662+
checked expr : substr<String, Int64, UInt64>("こんにちは", CAST<UInt8>(2_u8 AS Int64), CAST<UInt8>(1_u8 AS UInt64))
3663+
optimized expr : "ん"
3664+
output type : String
3665+
output domain : {"ん"..="ん"}
3666+
output : 'ん'
3667+
3668+
36153669
ast : substr('abc', pos, len)
36163670
raw expr : substr('abc', pos::Int8, len::UInt8)
36173671
checked expr : substr<String, Int64, UInt64>("abc", CAST<Int8>(pos AS Int64), CAST<UInt8>(len AS UInt64))

0 commit comments

Comments
 (0)