@@ -117,9 +117,14 @@ where
117117 let mut buffer = cell. try_borrow_mut ( )
118118 . map_err ( |_| _internal_datafusion_err ! ( "with_hashes cannot be called reentrantly on the same thread" ) ) ?;
119119
120- // Ensure buffer has sufficient length, clearing old values
120+ // Ensure buffer has sufficient capacity without zero-filling.
121+ // create_hashes writes all positions (including null sentinels),
122+ // so pre-zeroing is unnecessary.
121123 buffer. clear ( ) ;
122- buffer. resize ( required_size, 0 ) ;
124+ buffer. reserve ( required_size) ;
125+ // SAFETY: create_hashes will write every position in the buffer
126+ // (null positions get a consistent sentinel hash).
127+ unsafe { buffer. set_len ( required_size) } ;
123128
124129 // Create hashes in the buffer - this consumes the iterator
125130 create_hashes ( iter, random_state, & mut buffer[ ..required_size] ) ?;
@@ -244,6 +249,10 @@ fn hash_array_primitive<T>(
244249 hashes_buffer[ i] = hasher. finish ( ) ;
245250 }
246251 } else {
252+ // Fill with null sentinel, then overwrite valid positions.
253+ // This allows callers to skip pre-zeroing the buffer.
254+ let null_hash = random_state. hash_one ( 1u8 ) ;
255+ hashes_buffer. fill ( null_hash) ;
247256 for i in array. nulls ( ) . unwrap ( ) . valid_indices ( ) {
248257 let value = unsafe { array. value_unchecked ( i) } ;
249258 hashes_buffer[ i] = value. hash_one ( random_state) ;
@@ -289,6 +298,10 @@ fn hash_array<T>(
289298 combine_hashes ( value. hash_one ( random_state) , hashes_buffer[ i] ) ;
290299 }
291300 } else {
301+ // Fill with null sentinel, then overwrite valid positions.
302+ // This allows callers to skip pre-zeroing the buffer.
303+ let null_hash = random_state. hash_one ( 1u8 ) ;
304+ hashes_buffer. fill ( null_hash) ;
292305 for i in array. nulls ( ) . unwrap ( ) . valid_indices ( ) {
293306 let value = unsafe { array. value_unchecked ( i) } ;
294307 hashes_buffer[ i] = value. hash_one ( random_state) ;
@@ -331,9 +344,13 @@ fn hash_string_view_array_inner<
331344 }
332345 } ;
333346
347+ let null_hash = random_state. hash_one ( 1u8 ) ;
334348 let hashes_and_views = hashes_buffer. iter_mut ( ) . zip ( array. views ( ) . iter ( ) ) ;
335349 for ( i, ( hash, & v) ) in hashes_and_views. enumerate ( ) {
336350 if HAS_NULLS && array. is_null ( i) {
351+ if !REHASH {
352+ * hash = null_hash;
353+ }
337354 continue ;
338355 }
339356 let view_len = v as u32 ;
@@ -447,6 +464,7 @@ fn hash_dictionary_inner<
447464 let mut dict_hashes = vec ! [ 0 ; dict_values. len( ) ] ;
448465 create_hashes ( [ dict_values] , random_state, & mut dict_hashes) ?;
449466
467+ let null_hash = random_state. hash_one ( 1u8 ) ;
450468 if HAS_NULL_KEYS {
451469 for ( hash, key) in hashes_buffer. iter_mut ( ) . zip ( array. keys ( ) . iter ( ) ) {
452470 if let Some ( key) = key {
@@ -457,7 +475,11 @@ fn hash_dictionary_inner<
457475 } else {
458476 * hash = dict_hashes[ idx] ;
459477 }
478+ } else if !MULTI_COL {
479+ * hash = null_hash;
460480 }
481+ } else if !MULTI_COL {
482+ * hash = null_hash;
461483 }
462484 }
463485 } else {
@@ -469,6 +491,8 @@ fn hash_dictionary_inner<
469491 } else {
470492 * hash = dict_hashes[ idx] ;
471493 }
494+ } else if !MULTI_COL {
495+ * hash = null_hash;
472496 }
473497 }
474498 }
@@ -916,6 +940,10 @@ fn hash_run_array_inner<
916940 let end_in_slice = ( absolute_run_end - array_offset) . min ( array_len) ;
917941
918942 if HAS_NULL_VALUES && sliced_values. is_null ( adjusted_physical_index) {
943+ if !REHASH {
944+ let null_hash = random_state. hash_one ( 1u8 ) ;
945+ hashes_buffer[ start_in_slice..end_in_slice] . fill ( null_hash) ;
946+ }
919947 start_in_slice = end_in_slice;
920948 continue ;
921949 }
@@ -1103,11 +1131,34 @@ where
11031131 for ( i, array) in arrays. into_iter ( ) . enumerate ( ) {
11041132 // combine hashes with `combine_hashes` for all columns besides the first
11051133 let rehash = i >= 1 ;
1106- hash_single_array ( array. as_dyn_array ( ) , random_state, hashes_buffer, rehash) ?;
1134+ let arr = array. as_dyn_array ( ) ;
1135+ // Complex types (struct, list, map, union) always combine with
1136+ // existing hash values rather than initializing them, so the buffer
1137+ // must be zeroed when they appear as the first column.
1138+ if !rehash && needs_zero_init ( arr. data_type ( ) ) {
1139+ hashes_buffer. fill ( 0 ) ;
1140+ }
1141+ hash_single_array ( arr, random_state, hashes_buffer, rehash) ?;
11071142 }
11081143 Ok ( hashes_buffer)
11091144}
11101145
1146+ /// Returns `true` when `dt` is one of the nested types listed below (struct, list / list-view variants, fixed-size list, map, union), whose hash functions always combine with existing
1147+ /// buffer values (no `rehash=false` path), requiring zero-initialized buffers; every other data type yields `false`.
1148+ fn needs_zero_init ( dt : & DataType ) -> bool {
1149+ matches ! ( // NOTE(review): wrapper types (e.g. run-end encoded) are not listed here - confirm their hash paths write every buffer slot
1150+ dt,
1151+ DataType :: Struct ( _)
1152+ | DataType :: List ( _)
1153+ | DataType :: LargeList ( _)
1154+ | DataType :: ListView ( _)
1155+ | DataType :: LargeListView ( _)
1156+ | DataType :: Map ( _, _)
1157+ | DataType :: FixedSizeList ( _, _)
1158+ | DataType :: Union ( _, _)
1159+ )
1160+ }
1161+
11111162#[ cfg( test) ]
11121163mod tests {
11131164 use std:: sync:: Arc ;
@@ -1190,11 +1241,12 @@ mod tests {
11901241 create_hashes( & [ binary_array] , & random_state, & mut binary_hashes)
11911242 . unwrap( ) ;
11921243
1193- // Null values result in a zero hash,
1244+ // Null values result in a consistent null sentinel hash
1245+ let null_hash = random_state. hash_one( 1u8 ) ;
11941246 for ( val, hash) in binary. iter( ) . zip( binary_hashes. iter( ) ) {
11951247 match val {
11961248 Some ( _) => assert_ne!( * hash, 0 ) ,
1197- None => assert_eq!( * hash, 0 ) ,
1249+ None => assert_eq!( * hash, null_hash ) ,
11981250 }
11991251 }
12001252
@@ -1260,11 +1312,12 @@ mod tests {
12601312 let mut dict_hashes = vec![ 0 ; strings. len( ) ] ;
12611313 create_hashes( & [ dict_array] , & random_state, & mut dict_hashes) . unwrap( ) ;
12621314
1263- // Null values result in a zero hash,
1315+ // Null values result in a consistent null sentinel hash
1316+ let null_hash = random_state. hash_one( 1u8 ) ;
12641317 for ( val, hash) in strings. iter( ) . zip( string_hashes. iter( ) ) {
12651318 match val {
12661319 Some ( _) => assert_ne!( * hash, 0 ) ,
1267- None => assert_eq!( * hash, 0 ) ,
1320+ None => assert_eq!( * hash, null_hash ) ,
12681321 }
12691322 }
12701323
@@ -1377,11 +1430,12 @@ mod tests {
13771430 let mut dict_hashes = vec ! [ 0 ; strings. len( ) ] ;
13781431 create_hashes ( & [ dict_array] , & random_state, & mut dict_hashes) . unwrap ( ) ;
13791432
1380- // Null values result in a zero hash,
1433+ // Null values result in a consistent null sentinel hash
1434+ let null_hash = random_state. hash_one ( 1u8 ) ;
13811435 for ( val, hash) in strings. iter ( ) . zip ( string_hashes. iter ( ) ) {
13821436 match val {
13831437 Some ( _) => assert_ne ! ( * hash, 0 ) ,
1384- None => assert_eq ! ( * hash, 0 ) ,
1438+ None => assert_eq ! ( * hash, null_hash ) ,
13851439 }
13861440 }
13871441
@@ -2047,10 +2101,11 @@ mod tests {
20472101 & mut hashes,
20482102 ) ?;
20492103
2104+ let null_hash = random_state. hash_one ( 1u8 ) ;
20502105 assert_eq ! ( hashes[ 0 ] , hashes[ 1 ] ) ;
20512106 assert_ne ! ( hashes[ 0 ] , 0 ) ;
20522107 assert_eq ! ( hashes[ 2 ] , hashes[ 3 ] ) ;
2053- assert_eq ! ( hashes[ 2 ] , 0 ) ;
2108+ assert_eq ! ( hashes[ 2 ] , null_hash ) ;
20542109 assert_eq ! ( hashes[ 4 ] , hashes[ 5 ] ) ;
20552110 assert_ne ! ( hashes[ 4 ] , 0 ) ;
20562111 assert_ne ! ( hashes[ 0 ] , hashes[ 4 ] ) ;
0 commit comments