1616// under the License.
1717
1818use std:: any:: Any ;
19+ use std:: str:: from_utf8_unchecked;
1920use std:: sync:: Arc ;
2021
21- use arrow:: array:: { Array , StringArray } ;
22+ use arrow:: array:: { Array , StringBuilder } ;
2223use arrow:: datatypes:: DataType ;
2324use arrow:: {
2425 array:: { as_dictionary_array, as_largestring_array, as_string_array} ,
@@ -110,37 +111,104 @@ impl ScalarUDFImpl for SparkHex {
110111 }
111112}
112113
/// Appends the uppercase hexadecimal form of `num` to `buffer`.
///
/// The value is rendered from its 64-bit two's-complement bit pattern, so
/// negative inputs always produce 16 digits (`-1` becomes
/// `FFFFFFFFFFFFFFFF`). No leading zeros are emitted and `0` is rendered as a
/// single `0`. Existing contents of `buffer` are left untouched; the digits
/// are appended after them.
#[inline]
fn hex_int64(num: i64, buffer: &mut Vec<u8>) {
    const HEX_CHARS: &[u8; 16] = b"0123456789ABCDEF";

    // Work on the unsigned bit pattern: a right shift on `u64` is logical, so
    // the value is guaranteed to reach zero after at most 16 nibbles. Shifting
    // the signed value would sign-extend and never reach zero for negatives.
    let mut remaining = num as u64;
    let mut digits = [0u8; 16];
    let mut start = digits.len();

    // Digits are produced least-significant first, so fill the scratch array
    // from the back. Running the body before the zero check makes `0` emit
    // exactly one digit without a separate branch.
    loop {
        start -= 1;
        digits[start] = HEX_CHARS[(remaining & 0xF) as usize];
        remaining >>= 4;
        if remaining == 0 {
            break;
        }
    }

    buffer.extend_from_slice(&digits[start..]);
}
116135
/// Hex encoding lookup tables for fast byte-to-hex conversion.
///
/// Index `n` (0..=15) holds the ASCII digit for nibble value `n`.
const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";

/// Appends the hexadecimal encoding of `data` to `buffer`.
///
/// Each input byte becomes two ASCII digits, high nibble first. Digits `a`-`f`
/// are lowercase when `lower_case` is true and uppercase otherwise. Existing
/// contents of `buffer` are preserved; the caller is responsible for clearing
/// it when reusing it between values.
#[inline]
fn hex_encode<T: AsRef<[u8]>>(data: T, lower_case: bool, buffer: &mut Vec<u8>) {
    let bytes = data.as_ref();
    let hex_chars = if lower_case {
        HEX_CHARS_LOWER
    } else {
        HEX_CHARS_UPPER
    };

    // The output size is known up front (two digits per byte), so grow the
    // buffer once instead of repeatedly while pushing.
    buffer.reserve(bytes.len() * 2);
    for &b in bytes {
        buffer.push(hex_chars[usize::from(b >> 4)]);
        buffer.push(hex_chars[usize::from(b & 0x0f)]);
    }
}
136153
137- # [ inline ( always ) ]
138- fn hex_bytes < T : AsRef < [ u8 ] > > (
139- bytes : T ,
154+ /// Generic hex encoding for byte array types
155+ fn hex_encode_bytes < ' a , I , T > (
156+ iter : I ,
140157 lowercase : bool ,
141- ) -> Result < String , std:: fmt:: Error > {
142- let hex_string = hex_encode ( bytes, lowercase) ;
143- Ok ( hex_string)
158+ len : usize ,
159+ ) -> Result < ColumnarValue , DataFusionError >
160+ where
161+ I : Iterator < Item = Option < T > > ,
162+ T : AsRef < [ u8 ] > + ' a ,
163+ {
164+ let mut builder = StringBuilder :: with_capacity ( len, len * 64 ) ;
165+ let mut buffer = Vec :: with_capacity ( 16 ) ;
166+ let hex_chars = if lowercase {
167+ HEX_CHARS_LOWER
168+ } else {
169+ HEX_CHARS_UPPER
170+ } ;
171+
172+ for v in iter {
173+ if let Some ( b) = v {
174+ buffer. clear ( ) ;
175+ let bytes = b. as_ref ( ) ;
176+ for & byte in bytes {
177+ buffer. push ( hex_chars[ ( byte >> 4 ) as usize ] ) ;
178+ buffer. push ( hex_chars[ ( byte & 0x0f ) as usize ] ) ;
179+ }
180+ unsafe {
181+ builder. append_value ( from_utf8_unchecked ( & buffer) ) ;
182+ }
183+ } else {
184+ builder. append_null ( ) ;
185+ }
186+ }
187+
188+ Ok ( ColumnarValue :: Array ( Arc :: new ( builder. finish ( ) ) ) )
189+ }
190+
191+ /// Generic hex encoding for int64 type
192+ fn hex_encode_int64 < ' a , I > ( iter : I , len : usize ) -> Result < ColumnarValue , DataFusionError >
193+ where
194+ I : Iterator < Item = Option < i64 > > ,
195+ {
196+ let mut builder = StringBuilder :: with_capacity ( len, len * 64 ) ;
197+ let mut buffer = Vec :: with_capacity ( 16 ) ;
198+
199+ for v in iter {
200+ if let Some ( num) = v {
201+ buffer. clear ( ) ;
202+ hex_int64 ( num, & mut buffer) ;
203+ unsafe {
204+ builder. append_value ( from_utf8_unchecked ( & buffer) ) ;
205+ }
206+ } else {
207+ builder. append_null ( ) ;
208+ }
209+ }
210+
211+ Ok ( ColumnarValue :: Array ( Arc :: new ( builder. finish ( ) ) ) )
144212}
145213
146214/// Spark-compatible `hex` function
@@ -166,103 +234,72 @@ pub fn compute_hex(
166234 ColumnarValue :: Array ( array) => match array. data_type ( ) {
167235 DataType :: Int64 => {
168236 let array = as_int64_array ( array) ?;
169-
170- let hexed_array: StringArray =
171- array. iter ( ) . map ( |v| v. map ( hex_int64) ) . collect ( ) ;
172-
173- Ok ( ColumnarValue :: Array ( Arc :: new ( hexed_array) ) )
237+ hex_encode_int64 ( array. iter ( ) , array. len ( ) )
174238 }
175239 DataType :: Utf8 => {
176240 let array = as_string_array ( array) ;
177-
178- let hexed: StringArray = array
179- . iter ( )
180- . map ( |v| v. map ( |b| hex_bytes ( b, lowercase) ) . transpose ( ) )
181- . collect :: < Result < _ , _ > > ( ) ?;
182-
183- Ok ( ColumnarValue :: Array ( Arc :: new ( hexed) ) )
241+ hex_encode_bytes ( array. iter ( ) , lowercase, array. len ( ) )
184242 }
185243 DataType :: Utf8View => {
186244 let array = as_string_view_array ( array) ?;
187-
188- let hexed: StringArray = array
189- . iter ( )
190- . map ( |v| v. map ( |b| hex_bytes ( b, lowercase) ) . transpose ( ) )
191- . collect :: < Result < _ , _ > > ( ) ?;
192-
193- Ok ( ColumnarValue :: Array ( Arc :: new ( hexed) ) )
245+ hex_encode_bytes ( array. iter ( ) , lowercase, array. len ( ) )
194246 }
195247 DataType :: LargeUtf8 => {
196248 let array = as_largestring_array ( array) ;
197-
198- let hexed: StringArray = array
199- . iter ( )
200- . map ( |v| v. map ( |b| hex_bytes ( b, lowercase) ) . transpose ( ) )
201- . collect :: < Result < _ , _ > > ( ) ?;
202-
203- Ok ( ColumnarValue :: Array ( Arc :: new ( hexed) ) )
249+ hex_encode_bytes ( array. iter ( ) , lowercase, array. len ( ) )
204250 }
205251 DataType :: Binary => {
206252 let array = as_binary_array ( array) ?;
207-
208- let hexed: StringArray = array
209- . iter ( )
210- . map ( |v| v. map ( |b| hex_bytes ( b, lowercase) ) . transpose ( ) )
211- . collect :: < Result < _ , _ > > ( ) ?;
212-
213- Ok ( ColumnarValue :: Array ( Arc :: new ( hexed) ) )
253+ hex_encode_bytes ( array. iter ( ) , lowercase, array. len ( ) )
214254 }
215255 DataType :: LargeBinary => {
216256 let array = as_large_binary_array ( array) ?;
217-
218- let hexed: StringArray = array
219- . iter ( )
220- . map ( |v| v. map ( |b| hex_bytes ( b, lowercase) ) . transpose ( ) )
221- . collect :: < Result < _ , _ > > ( ) ?;
222-
223- Ok ( ColumnarValue :: Array ( Arc :: new ( hexed) ) )
257+ hex_encode_bytes ( array. iter ( ) , lowercase, array. len ( ) )
224258 }
225259 DataType :: FixedSizeBinary ( _) => {
226260 let array = as_fixed_size_binary_array ( array) ?;
227-
228- let hexed: StringArray = array
229- . iter ( )
230- . map ( |v| v. map ( |b| hex_bytes ( b, lowercase) ) . transpose ( ) )
231- . collect :: < Result < _ , _ > > ( ) ?;
232-
233- Ok ( ColumnarValue :: Array ( Arc :: new ( hexed) ) )
261+ hex_encode_bytes ( array. iter ( ) , lowercase, array. len ( ) )
234262 }
235263 DataType :: Dictionary ( _, value_type) => {
236264 let dict = as_dictionary_array :: < Int32Type > ( & array) ;
237-
238- let values = match * * value_type {
239- DataType :: Int64 => as_int64_array ( dict. values ( ) ) ?
240- . iter ( )
241- . map ( |v| v. map ( hex_int64) )
242- . collect :: < Vec < _ > > ( ) ,
243- DataType :: Utf8 => as_string_array ( dict. values ( ) )
244- . iter ( )
245- . map ( |v| v. map ( |b| hex_bytes ( b, lowercase) ) . transpose ( ) )
246- . collect :: < Result < _ , _ > > ( ) ?,
247- DataType :: Binary => as_binary_array ( dict. values ( ) ) ?
248- . iter ( )
249- . map ( |v| v. map ( |b| hex_bytes ( b, lowercase) ) . transpose ( ) )
250- . collect :: < Result < _ , _ > > ( ) ?,
251- _ => exec_err ! (
252- "hex got an unexpected argument type: {}" ,
253- array. data_type( )
254- ) ?,
255- } ;
256-
257- let new_values: Vec < Option < String > > = dict
258- . keys ( )
259- . iter ( )
260- . map ( |key| key. map ( |k| values[ k as usize ] . clone ( ) ) . unwrap_or ( None ) )
261- . collect ( ) ;
262-
263- let string_array_values = StringArray :: from ( new_values) ;
264-
265- Ok ( ColumnarValue :: Array ( Arc :: new ( string_array_values) ) )
265+ let keys = dict. keys ( ) ;
266+ let values = dict. values ( ) ;
267+ // let mut buffer = Vec::with_capacity(16);
268+
269+ match * * value_type {
270+ DataType :: Int64 => {
271+ let int_values = as_int64_array ( values) ?;
272+ hex_encode_int64 (
273+ keys. iter ( ) . map ( |k| k. map ( |idx| int_values. value ( idx as usize ) ) ) ,
274+ dict. len ( ) ,
275+ )
276+ }
277+ DataType :: Utf8 => {
278+ let str_values = as_string_array ( values) ;
279+ hex_encode_bytes (
280+ keys. iter ( ) . map ( |k| {
281+ k. map ( |idx| str_values. value ( idx as usize ) . as_bytes ( ) )
282+ } ) ,
283+ lowercase,
284+ dict. len ( ) ,
285+ )
286+ }
287+ DataType :: Binary => {
288+ let bin_values = as_binary_array ( values) ?;
289+ hex_encode_bytes (
290+ keys. iter ( )
291+ . map ( |k| k. map ( |idx| bin_values. value ( idx as usize ) ) ) ,
292+ lowercase,
293+ dict. len ( ) ,
294+ )
295+ }
296+ _ => {
297+ exec_err ! (
298+ "hex got an unexpected argument type: {}" ,
299+ array. data_type( )
300+ )
301+ }
302+ }
266303 }
267304 _ => exec_err ! ( "hex got an unexpected argument type: {}" , array. data_type( ) ) ,
268305 } ,
@@ -272,6 +309,7 @@ pub fn compute_hex(
272309
273310#[ cfg( test) ]
274311mod test {
312+ use std:: str:: from_utf8_unchecked;
275313 use std:: sync:: Arc ;
276314
277315 use arrow:: array:: { Int64Array , StringArray } ;
@@ -374,12 +412,18 @@ mod test {
#[test]
fn test_hex_int64() {
    // One output buffer is reused across cases, mirroring how the array
    // encoders call `hex_int64`.
    let mut cache = Vec::with_capacity(16);

    // Covers a positive value, a negative value (full 16-digit
    // two's-complement form) and the zero case.
    for (num, expected) in [(1234_i64, "4D2"), (-1, "FFFFFFFFFFFFFFFF"), (0, "0")] {
        cache.clear();
        super::hex_int64(num, &mut cache);
        // SAFETY: `cache` was just cleared and then filled only by
        // `hex_int64`, which appends ASCII hex digits, so it is valid UTF-8.
        let hexed = unsafe { from_utf8_unchecked(&cache) };
        assert_eq!(hexed, expected);
    }
}
384428
385429 #[ test]
0 commit comments