@@ -49,12 +49,11 @@ macro_rules! decode_list {
4949 if len == -1 {
5050 builder. values( ) . append_null( ) ;
5151 } else {
52- let s = std:: str :: from_utf8(
53- & raw_vector[ offset..offset + len as usize ] ,
54- )
55- . map_err( |e| {
56- Internal ( format!( "Invalid utf8 in list element: {e}" ) )
57- } ) ?;
52+ let s = unsafe {
53+ std:: str :: from_utf8_unchecked(
54+ & raw_vector[ offset..offset + len as usize ] ,
55+ )
56+ } ;
5857 builder. values( ) . append_value( s) ;
5958 offset += len as usize ;
6059 }
@@ -72,9 +71,7 @@ macro_rules! decode_scalar_string {
7271 for raw_bytes in $raw {
7372 match raw_bytes {
7473 Some ( raw_vector) => {
75- let s = std:: str :: from_utf8( raw_vector) . map_err( |e| {
76- Internal ( format!( "Invalid utf8 in GroupValuesDictionary: {e}" ) )
77- } ) ?;
74+ let s = unsafe { std:: str :: from_utf8_unchecked( raw_vector) } ;
7875 builder. append_value( s) ;
7976 }
8077 None => builder. append_null( ) ,
@@ -116,7 +113,6 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> {
116113 create_hashes ( [ Arc :: clone ( values) ] , & self . random_state , & mut hashes) ?;
117114 Ok ( hashes)
118115 }
119-
120116 fn get_raw_bytes ( values : & ArrayRef , index : usize ) -> Cow < ' _ , [ u8 ] > {
121117 match values. data_type ( ) {
122118 DataType :: Utf8 => Cow :: Borrowed (
@@ -256,33 +252,35 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> {
256252 key_array : & PrimitiveArray < K > ,
257253 ) -> ( ArrayRef , Vec < Option < usize > > ) {
258254 let mut key_to_canonical: Vec < Option < usize > > = vec ! [ None ; values. len( ) ] ;
255+ let mut canonical_indices: Vec < usize > = Vec :: with_capacity ( key_array. len ( ) ) ;
259256 let mut bytes_to_canonical: HashMap < Vec < u8 > , usize > =
260257 HashMap :: with_capacity ( values. len ( ) ) ;
261- let mut canonical_indices: Vec < usize > = Vec :: with_capacity ( key_array. len ( ) ) ;
262258
263- let new_keys: Vec < Option < usize > > = ( 0 ..key_array. len ( ) )
264- . map ( |i| {
265- if key_array. is_null ( i) {
266- return None ;
267- }
268- let old_key = key_array. value ( i) . to_usize ( ) . unwrap ( ) ;
269- if let Some ( canonical) = key_to_canonical[ old_key] {
270- return Some ( canonical) ;
259+ let mut new_keys: Vec < Option < usize > > = Vec :: with_capacity ( key_array. len ( ) ) ;
260+ for i in 0 ..key_array. len ( ) {
261+ if key_array. is_null ( i) {
262+ new_keys. push ( None ) ;
263+ continue ;
264+ }
265+ let old_key = key_array. value ( i) . to_usize ( ) . unwrap ( ) ;
266+ if let Some ( canonical) = key_to_canonical[ old_key] {
267+ new_keys. push ( Some ( canonical) ) ;
268+ continue ;
269+ }
270+ let raw = Self :: get_raw_bytes ( values, old_key) . to_vec ( ) ;
271+
272+ let canonical = match bytes_to_canonical. get ( & raw ) {
273+ Some ( & idx) => idx,
274+ None => {
275+ let idx = canonical_indices. len ( ) ;
276+ canonical_indices. push ( old_key) ;
277+ bytes_to_canonical. insert ( raw, idx) ;
278+ idx
271279 }
272- let raw = Self :: get_raw_bytes ( values, old_key) . to_vec ( ) ;
273- let canonical = match bytes_to_canonical. get ( & raw ) {
274- Some ( & idx) => idx,
275- None => {
276- let idx = canonical_indices. len ( ) ;
277- canonical_indices. push ( old_key) ;
278- bytes_to_canonical. insert ( raw, idx) ;
279- idx
280- }
281- } ;
282- key_to_canonical[ old_key] = Some ( canonical) ;
283- Some ( canonical)
284- } )
285- . collect ( ) ;
280+ } ;
281+ key_to_canonical[ old_key] = Some ( canonical) ;
282+ new_keys. push ( Some ( canonical) ) ;
283+ }
286284
287285 let indices = UInt64Array :: from (
288286 canonical_indices
@@ -298,7 +296,7 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> {
298296 values : & ArrayRef ,
299297 key_array : & PrimitiveArray < K > ,
300298 ) -> ( ArrayRef , Vec < Option < usize > > ) {
301- if values. len ( ) < 20 {
299+ if values. len ( ) < 25 {
302300 // Benchmarks show that for small arrays nested loops work better due to cache locality and lower overhead.
303301 Self :: normalize_dict_linear ( values, key_array)
304302 } else {
@@ -334,7 +332,6 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K>
334332 "GroupValuesDictionary only supports a single column"
335333 ) ;
336334 let array = Arc :: clone ( & cols[ 0 ] ) ;
337- groups. clear ( ) ; // zero out buffer
338335 let dict_array = array
339336 . as_any ( )
340337 . downcast_ref :: < DictionaryArray < K > > ( )
@@ -353,6 +350,8 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K>
353350 if key_array. is_empty ( ) {
354351 return Ok ( ( ) ) ;
355352 }
353+ groups. clear ( ) ; // zero out buffer
354+ groups. reserve ( key_array. len ( ) ) ;
356355 let ( values, keys_as_usize) = Self :: normalize_dict_array ( values, key_array) ;
357356
358357 let values = & values;
0 commit comments