1515// specific language governing permissions and limitations
1616// under the License.
1717
18- use arrow:: array:: OffsetSizeTrait ;
18+ use arrow:: array:: { ArrayRef , OffsetSizeTrait , StringArray , StringViewBuilder } ;
1919use arrow:: datatypes:: { DataType , Field } ;
2020use arrow:: util:: bench_util:: {
2121 create_string_array_with_len, create_string_view_array_with_len,
@@ -47,52 +47,124 @@ fn create_args<O: OffsetSizeTrait>(
4747 }
4848}
4949
50+ /// Create a Utf8 array where every value contains non-ASCII Unicode text.
51+ fn create_unicode_utf8_args ( size : usize ) -> Vec < ColumnarValue > {
52+ let array = Arc :: new ( StringArray :: from_iter_values ( std:: iter:: repeat_n (
53+ "ñAnDÚ ÁrBOL ОлЕГ ÍslENsku" ,
54+ size,
55+ ) ) ) as ArrayRef ;
56+ vec ! [ ColumnarValue :: Array ( array) ]
57+ }
58+
59+ /// Create a Utf8View array where every value contains non-ASCII Unicode text.
60+ fn create_unicode_utf8view_args ( size : usize ) -> Vec < ColumnarValue > {
61+ let mut builder = StringViewBuilder :: with_capacity ( size) ;
62+ for _ in 0 ..size {
63+ builder. append_value ( "ñAnDÚ ÁrBOL ОлЕГ ÍslENsku" ) ;
64+ }
65+ let array = Arc :: new ( builder. finish ( ) ) as ArrayRef ;
66+ vec ! [ ColumnarValue :: Array ( array) ]
67+ }
68+
5069fn criterion_benchmark ( c : & mut Criterion ) {
5170 let initcap = unicode:: initcap ( ) ;
5271 let config_options = Arc :: new ( ConfigOptions :: default ( ) ) ;
5372
54- // Grouped benchmarks for array sizes - to compare with scalar performance
73+ // Array benchmarks: vary both row count and string length
74+ for size in [ 1024 , 4096 , 8192 ] {
75+ for str_len in [ 16 , 128 ] {
76+ let mut group =
77+ c. benchmark_group ( format ! ( "initcap size={size} str_len={str_len}" ) ) ;
78+ group. sampling_mode ( SamplingMode :: Flat ) ;
79+ group. sample_size ( 10 ) ;
80+ group. measurement_time ( Duration :: from_secs ( 10 ) ) ;
81+
82+ // Utf8
83+ let array_args = create_args :: < i32 > ( size, str_len, false ) ;
84+ let array_arg_fields = vec ! [ Field :: new( "arg_0" , DataType :: Utf8 , true ) . into( ) ] ;
85+
86+ group. bench_function ( "array_utf8" , |b| {
87+ b. iter ( || {
88+ black_box ( initcap. invoke_with_args ( ScalarFunctionArgs {
89+ args : array_args. clone ( ) ,
90+ arg_fields : array_arg_fields. clone ( ) ,
91+ number_rows : size,
92+ return_field : Field :: new ( "f" , DataType :: Utf8 , true ) . into ( ) ,
93+ config_options : Arc :: clone ( & config_options) ,
94+ } ) )
95+ } )
96+ } ) ;
97+
98+ // Utf8View
99+ let array_view_args = create_args :: < i32 > ( size, str_len, true ) ;
100+ let array_view_arg_fields =
101+ vec ! [ Field :: new( "arg_0" , DataType :: Utf8View , true ) . into( ) ] ;
102+
103+ group. bench_function ( "array_utf8view" , |b| {
104+ b. iter ( || {
105+ black_box ( initcap. invoke_with_args ( ScalarFunctionArgs {
106+ args : array_view_args. clone ( ) ,
107+ arg_fields : array_view_arg_fields. clone ( ) ,
108+ number_rows : size,
109+ return_field : Field :: new ( "f" , DataType :: Utf8View , true ) . into ( ) ,
110+ config_options : Arc :: clone ( & config_options) ,
111+ } ) )
112+ } )
113+ } ) ;
114+
115+ group. finish ( ) ;
116+ }
117+ }
118+
119+ // Unicode array benchmarks
55120 for size in [ 1024 , 4096 , 8192 ] {
56- let mut group = c. benchmark_group ( format ! ( "initcap size={size}" ) ) ;
121+ let mut group = c. benchmark_group ( format ! ( "initcap unicode size={size}" ) ) ;
57122 group. sampling_mode ( SamplingMode :: Flat ) ;
58123 group. sample_size ( 10 ) ;
59124 group. measurement_time ( Duration :: from_secs ( 10 ) ) ;
60125
61- // Array benchmark - Utf8
62- let array_args = create_args :: < i32 > ( size, 16 , false ) ;
63- let array_arg_fields = vec ! [ Field :: new( "arg_0" , DataType :: Utf8 , true ) . into( ) ] ;
64- let batch_len = size;
126+ let unicode_args = create_unicode_utf8_args ( size) ;
127+ let unicode_arg_fields = vec ! [ Field :: new( "arg_0" , DataType :: Utf8 , true ) . into( ) ] ;
65128
66129 group. bench_function ( "array_utf8" , |b| {
67130 b. iter ( || {
68131 black_box ( initcap. invoke_with_args ( ScalarFunctionArgs {
69- args : array_args . clone ( ) ,
70- arg_fields : array_arg_fields . clone ( ) ,
71- number_rows : batch_len ,
132+ args : unicode_args . clone ( ) ,
133+ arg_fields : unicode_arg_fields . clone ( ) ,
134+ number_rows : size ,
72135 return_field : Field :: new ( "f" , DataType :: Utf8 , true ) . into ( ) ,
73136 config_options : Arc :: clone ( & config_options) ,
74137 } ) )
75138 } )
76139 } ) ;
77140
78- // Array benchmark - Utf8View
79- let array_view_args = create_args :: < i32 > ( size, 16 , true ) ;
80- let array_view_arg_fields =
141+ let unicode_view_args = create_unicode_utf8view_args ( size) ;
142+ let unicode_view_arg_fields =
81143 vec ! [ Field :: new( "arg_0" , DataType :: Utf8View , true ) . into( ) ] ;
82144
83145 group. bench_function ( "array_utf8view" , |b| {
84146 b. iter ( || {
85147 black_box ( initcap. invoke_with_args ( ScalarFunctionArgs {
86- args : array_view_args . clone ( ) ,
87- arg_fields : array_view_arg_fields . clone ( ) ,
88- number_rows : batch_len ,
148+ args : unicode_view_args . clone ( ) ,
149+ arg_fields : unicode_view_arg_fields . clone ( ) ,
150+ number_rows : size ,
89151 return_field : Field :: new ( "f" , DataType :: Utf8View , true ) . into ( ) ,
90152 config_options : Arc :: clone ( & config_options) ,
91153 } ) )
92154 } )
93155 } ) ;
94156
95- // Scalar benchmark - Utf8 (the optimization we added)
157+ group. finish ( ) ;
158+ }
159+
160+ // Scalar benchmarks: independent of array size, run once
161+ {
162+ let mut group = c. benchmark_group ( "initcap scalar" ) ;
163+ group. sampling_mode ( SamplingMode :: Flat ) ;
164+ group. sample_size ( 10 ) ;
165+ group. measurement_time ( Duration :: from_secs ( 10 ) ) ;
166+
167+ // Utf8
96168 let scalar_args = vec ! [ ColumnarValue :: Scalar ( ScalarValue :: Utf8 ( Some (
97169 "hello world test string" . to_string( ) ,
98170 ) ) ) ] ;
@@ -110,7 +182,7 @@ fn criterion_benchmark(c: &mut Criterion) {
110182 } )
111183 } ) ;
112184
113- // Scalar benchmark - Utf8View
185+ // Utf8View
114186 let scalar_view_args = vec ! [ ColumnarValue :: Scalar ( ScalarValue :: Utf8View ( Some (
115187 "hello world test string" . to_string( ) ,
116188 ) ) ) ] ;
0 commit comments