1818use arrow:: array:: { ArrayRef , Int64Array , StringArray , StringViewArray } ;
1919use arrow:: datatypes:: { DataType , Field } ;
2020use criterion:: { BenchmarkId , Criterion , criterion_group, criterion_main} ;
21+ use datafusion_common:: ScalarValue ;
2122use datafusion_common:: config:: ConfigOptions ;
2223use datafusion_expr:: { ColumnarValue , ScalarFunctionArgs , ScalarUDF } ;
2324use datafusion_functions:: string:: split_part;
@@ -29,15 +30,15 @@ use std::sync::Arc;
2930
3031const N_ROWS : usize = 8192 ;
3132
32- /// Creates strings with `num_parts` random alphanumeric segments of `part_len`
33- /// bytes each, joined by `delimiter`.
34- fn gen_split_part_data (
33+ /// Creates an array of strings with `num_parts` random alphanumeric segments
34+ /// of `part_len` bytes each, joined by `delimiter`.
35+ fn gen_string_array (
3536 n_rows : usize ,
3637 num_parts : usize ,
3738 part_len : usize ,
3839 delimiter : & str ,
3940 use_string_view : bool ,
40- ) -> ( ColumnarValue , ColumnarValue ) {
41+ ) -> ColumnarValue {
4142 let mut rng = StdRng :: seed_from_u64 ( 42 ) ;
4243
4344 let mut strings: Vec < String > = Vec :: with_capacity ( n_rows) ;
@@ -54,22 +55,12 @@ fn gen_split_part_data(
5455 strings. push ( parts. join ( delimiter) ) ;
5556 }
5657
57- let delimiters: Vec < String > = vec ! [ delimiter. to_string( ) ; n_rows] ;
58-
5958 if use_string_view {
6059 let string_array: StringViewArray = strings. into_iter ( ) . map ( Some ) . collect ( ) ;
61- let delimiter_array: StringViewArray = delimiters. into_iter ( ) . map ( Some ) . collect ( ) ;
62- (
63- ColumnarValue :: Array ( Arc :: new ( string_array) as ArrayRef ) ,
64- ColumnarValue :: Array ( Arc :: new ( delimiter_array) as ArrayRef ) ,
65- )
60+ ColumnarValue :: Array ( Arc :: new ( string_array) as ArrayRef )
6661 } else {
6762 let string_array: StringArray = strings. into_iter ( ) . map ( Some ) . collect ( ) ;
68- let delimiter_array: StringArray = delimiters. into_iter ( ) . map ( Some ) . collect ( ) ;
69- (
70- ColumnarValue :: Array ( Arc :: new ( string_array) as ArrayRef ) ,
71- ColumnarValue :: Array ( Arc :: new ( delimiter_array) as ArrayRef ) ,
72- )
63+ ColumnarValue :: Array ( Arc :: new ( string_array) as ArrayRef )
7364 }
7465}
7566
@@ -81,12 +72,10 @@ fn bench_split_part(
8172 name : & str ,
8273 tag : & str ,
8374 strings : ColumnarValue ,
84- delimiters : ColumnarValue ,
85- position : i64 ,
75+ delimiter : ColumnarValue ,
76+ position : ColumnarValue ,
8677) {
87- let positions: ColumnarValue =
88- ColumnarValue :: Array ( Arc :: new ( Int64Array :: from ( vec ! [ position; N_ROWS ] ) ) ) ;
89- let args = vec ! [ strings, delimiters, positions] ;
78+ let args = vec ! [ strings, delimiter, position] ;
9079 let arg_fields: Vec < _ > = args
9180 . iter ( )
9281 . enumerate ( )
@@ -119,108 +108,143 @@ fn criterion_benchmark(c: &mut Criterion) {
119108 let config_options = Arc :: new ( ConfigOptions :: default ( ) ) ;
120109 let mut group = c. benchmark_group ( "split_part" ) ;
121110
122- // Utf8, single-char delimiter, first position
111+ // ── Scalar delimiter and position ────────────────
112+
113+ // Utf8, single-char delimiter, scalar args
123114 {
124- let ( strings, delimiters) = gen_split_part_data ( N_ROWS , 10 , 8 , "." , false ) ;
115+ let strings = gen_string_array ( N_ROWS , 10 , 8 , "." , false ) ;
116+ let delimiter = ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( "." . into ( ) ) ) ) ;
117+ let position = ColumnarValue :: Scalar ( ScalarValue :: Int64 ( Some ( 1 ) ) ) ;
125118 bench_split_part (
126119 & mut group,
127120 & split_part_func,
128121 & config_options,
129- "utf8_single_char " ,
122+ "scalar_utf8_single_char " ,
130123 "pos_first" ,
131124 strings,
132- delimiters ,
133- 1 ,
125+ delimiter ,
126+ position ,
134127 ) ;
135128 }
136129
137- // Utf8, single-char delimiter, middle position
138130 {
139- let ( strings, delimiters) = gen_split_part_data ( N_ROWS , 10 , 8 , "." , false ) ;
131+ let strings = gen_string_array ( N_ROWS , 10 , 8 , "." , false ) ;
132+ let delimiter = ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( "." . into ( ) ) ) ) ;
133+ let position = ColumnarValue :: Scalar ( ScalarValue :: Int64 ( Some ( 5 ) ) ) ;
140134 bench_split_part (
141135 & mut group,
142136 & split_part_func,
143137 & config_options,
144- "utf8_single_char " ,
138+ "scalar_utf8_single_char " ,
145139 "pos_middle" ,
146140 strings,
147- delimiters ,
148- 5 ,
141+ delimiter ,
142+ position ,
149143 ) ;
150144 }
151145
152- // Utf8, single-char delimiter, negative position
153146 {
154- let ( strings, delimiters) = gen_split_part_data ( N_ROWS , 10 , 8 , "." , false ) ;
147+ let strings = gen_string_array ( N_ROWS , 10 , 8 , "." , false ) ;
148+ let delimiter = ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( "." . into ( ) ) ) ) ;
149+ let position = ColumnarValue :: Scalar ( ScalarValue :: Int64 ( Some ( -1 ) ) ) ;
155150 bench_split_part (
156151 & mut group,
157152 & split_part_func,
158153 & config_options,
159- "utf8_single_char " ,
154+ "scalar_utf8_single_char " ,
160155 "pos_negative" ,
161156 strings,
162- delimiters ,
163- - 1 ,
157+ delimiter ,
158+ position ,
164159 ) ;
165160 }
166161
167- // Utf8, multi-char delimiter, middle position
162+ // Utf8, multi-char delimiter, scalar args
168163 {
169- let ( strings, delimiters) = gen_split_part_data ( N_ROWS , 10 , 8 , "~@~" , false ) ;
164+ let strings = gen_string_array ( N_ROWS , 10 , 8 , "~@~" , false ) ;
165+ let delimiter = ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( "~@~" . into ( ) ) ) ) ;
166+ let position = ColumnarValue :: Scalar ( ScalarValue :: Int64 ( Some ( 5 ) ) ) ;
170167 bench_split_part (
171168 & mut group,
172169 & split_part_func,
173170 & config_options,
174- "utf8_multi_char " ,
171+ "scalar_utf8_multi_char " ,
175172 "pos_middle" ,
176173 strings,
177- delimiters ,
178- 5 ,
174+ delimiter ,
175+ position ,
179176 ) ;
180177 }
181178
182- // Utf8View, single-char delimiter, first position
179+ // Utf8, long strings, scalar args
183180 {
184- let ( strings, delimiters) = gen_split_part_data ( N_ROWS , 10 , 8 , "." , true ) ;
181+ let strings = gen_string_array ( N_ROWS , 50 , 16 , "." , false ) ;
182+ let delimiter = ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some ( "." . into ( ) ) ) ) ;
183+ let position = ColumnarValue :: Scalar ( ScalarValue :: Int64 ( Some ( 25 ) ) ) ;
185184 bench_split_part (
186185 & mut group,
187186 & split_part_func,
188187 & config_options,
189- "utf8view_single_char " ,
190- "pos_first " ,
188+ "scalar_utf8_long_strings " ,
189+ "pos_middle " ,
191190 strings,
192- delimiters ,
193- 1 ,
191+ delimiter ,
192+ position ,
194193 ) ;
195194 }
196195
197- // Utf8, single-char delimiter, many long parts
196+ // Utf8View, long parts, scalar args
197+ {
198+ let strings = gen_string_array ( N_ROWS , 10 , 32 , "." , true ) ;
199+ let delimiter = ColumnarValue :: Scalar ( ScalarValue :: Utf8View ( Some ( "." . into ( ) ) ) ) ;
200+ let position = ColumnarValue :: Scalar ( ScalarValue :: Int64 ( Some ( 5 ) ) ) ;
201+ bench_split_part (
202+ & mut group,
203+ & split_part_func,
204+ & config_options,
205+ "scalar_utf8view_long_parts" ,
206+ "pos_middle" ,
207+ strings,
208+ delimiter,
209+ position,
210+ ) ;
211+ }
212+
213+ // ── Array delimiter and position ─────────────────
214+
215+ // Utf8, single-char delimiter, array args
198216 {
199- let ( strings, delimiters) = gen_split_part_data ( N_ROWS , 50 , 16 , "." , false ) ;
217+ let strings = gen_string_array ( N_ROWS , 10 , 8 , "." , false ) ;
218+ let delimiters: StringArray = vec ! [ Some ( "." ) ; N_ROWS ] . into_iter ( ) . collect ( ) ;
219+ let delimiter = ColumnarValue :: Array ( Arc :: new ( delimiters) as ArrayRef ) ;
220+ let positions = ColumnarValue :: Array ( Arc :: new ( Int64Array :: from ( vec ! [ 5 ; N_ROWS ] ) ) ) ;
200221 bench_split_part (
201222 & mut group,
202223 & split_part_func,
203224 & config_options,
204- "utf8_long_strings " ,
225+ "array_utf8_single_char " ,
205226 "pos_middle" ,
206227 strings,
207- delimiters ,
208- 25 ,
228+ delimiter ,
229+ positions ,
209230 ) ;
210231 }
211232
212- // Utf8View, single -char delimiter, middle position, long parts
233+ // Utf8, multi -char delimiter, array args
213234 {
214- let ( strings, delimiters) = gen_split_part_data ( N_ROWS , 10 , 32 , "." , true ) ;
235+ let strings = gen_string_array ( N_ROWS , 10 , 8 , "~@~" , false ) ;
236+ let delimiters: StringArray = vec ! [ Some ( "~@~" ) ; N_ROWS ] . into_iter ( ) . collect ( ) ;
237+ let delimiter = ColumnarValue :: Array ( Arc :: new ( delimiters) as ArrayRef ) ;
238+ let positions = ColumnarValue :: Array ( Arc :: new ( Int64Array :: from ( vec ! [ 5 ; N_ROWS ] ) ) ) ;
215239 bench_split_part (
216240 & mut group,
217241 & split_part_func,
218242 & config_options,
219- "utf8view_long_parts " ,
243+ "array_utf8_multi_char " ,
220244 "pos_middle" ,
221245 strings,
222- delimiters ,
223- 5 ,
246+ delimiter ,
247+ positions ,
224248 ) ;
225249 }
226250
0 commit comments