@@ -206,6 +206,62 @@ macro_rules! hash_array_decimal {
206206 } ;
207207}
208208
/// Hashes every row of a list-typed column by folding the row's elements
/// into that row's running hash.
///
/// The column is downcast to `$array_type`. For each row, the value already
/// stored in `$hashes` is handed to `$recursive_hash_method` together with the
/// first element; the hash that comes back is then handed over with the next
/// element, and so on. This mirrors how Spark hashes arrays:
/// `hash(elem_n, hash(elem_n-1, ... hash(elem_0, seed)...))`.
/// A row whose list is null leaves its entry in `$hashes` untouched.
///
/// `$offset_type` is part of the macro's signature but is not referenced by
/// the expansion; offsets are converted with `as usize`.
///
/// The expansion uses `?` on the result of `$recursive_hash_method`, so the
/// macro must be invoked inside a function that returns a compatible `Result`.
///
/// # Panics
///
/// Panics if `$column` cannot be downcast to `$array_type`.
#[macro_export]
macro_rules! hash_list_array {
    ($array_type: ident, $offset_type: ty, $column: ident, $hashes: ident, $recursive_hash_method: ident) => {
        let typed_list = match $column.as_any().downcast_ref::<$array_type>() {
            Some(list) => list,
            None => panic!(
                "Failed to downcast column to {}. Actual data type: {:?}.",
                stringify!($array_type),
                $column.data_type()
            ),
        };

        let child_values = typed_list.values();
        let row_offsets = typed_list.offsets();
        // Loop-invariant: when the array reports no null rows, the per-row
        // validity lookup below is short-circuited away.
        let has_null_rows = typed_list.null_count() != 0;

        for (row, running_hash) in $hashes.iter_mut().enumerate() {
            if has_null_rows && typed_list.is_null(row) {
                // Null list: the incoming hash passes through unchanged.
                continue;
            }

            let first = row_offsets[row] as usize;
            let past_last = row_offsets[row + 1] as usize;

            // Feed the elements one at a time so that each element is hashed
            // with the result of the previous one.
            for step in 0..(past_last - first) {
                let element = child_values.slice(first + step, 1);
                let mut chained = [*running_hash];
                $recursive_hash_method(&[element], &mut chained)?;
                *running_hash = chained[0];
            }
        }
    };
}

264+
209265/// Creates hash values for every row, based on the values in the
210266/// columns.
211267///
@@ -214,9 +270,10 @@ macro_rules! hash_array_decimal {
214270///
215271/// `hash_method` is the hash function to use.
216272/// `create_dictionary_hash_method` is the function to create hashes for dictionary arrays input.
273+ /// `recursive_hash_method` is the function to call for recursive hashing of complex types.
217274#[ macro_export]
218275macro_rules! create_hashes_internal {
219- ( $arrays: ident, $hashes_buffer: ident, $hash_method: ident, $create_dictionary_hash_method: ident) => {
276+ ( $arrays: ident, $hashes_buffer: ident, $hash_method: ident, $create_dictionary_hash_method: ident, $recursive_hash_method : ident ) => {
220277 use arrow:: datatypes:: { DataType , TimeUnit } ;
221278 use arrow:: array:: { types:: * , * } ;
222279
@@ -425,6 +482,105 @@ macro_rules! create_hashes_internal {
425482 ) ) )
426483 }
427484 } ,
485+ DataType :: List ( _) => {
486+ $crate:: hash_list_array!( ListArray , i32 , col, $hashes_buffer, $recursive_hash_method) ;
487+ }
488+ DataType :: LargeList ( _) => {
489+ $crate:: hash_list_array!( LargeListArray , i64 , col, $hashes_buffer, $recursive_hash_method) ;
490+ }
491+ DataType :: FixedSizeList ( _, size) => {
492+ let list_array = col. as_any( ) . downcast_ref:: <FixedSizeListArray >( ) . unwrap( ) ;
493+ let values = list_array. values( ) ;
494+ let list_size = * size as usize ;
495+
496+ if list_array. null_count( ) == 0 {
497+ // Fast path: no nulls, skip null checks
498+ for ( row_idx, hash) in $hashes_buffer. iter_mut( ) . enumerate( ) {
499+ let start = row_idx * list_size;
500+ // Hash each element in sequence, chaining the hash values
501+ for elem_idx in 0 ..list_size {
502+ let elem_array = values. slice( start + elem_idx, 1 ) ;
503+ let mut single_hash = [ * hash] ;
504+ $recursive_hash_method( & [ elem_array] , & mut single_hash) ?;
505+ * hash = single_hash[ 0 ] ;
506+ }
507+ }
508+ } else {
509+ // Slow path: array has nulls, check each row
510+ for ( row_idx, hash) in $hashes_buffer. iter_mut( ) . enumerate( ) {
511+ if !list_array. is_null( row_idx) {
512+ let start = row_idx * list_size;
513+ // Hash each element in sequence, chaining the hash values
514+ for elem_idx in 0 ..list_size {
515+ let elem_array = values. slice( start + elem_idx, 1 ) ;
516+ let mut single_hash = [ * hash] ;
517+ $recursive_hash_method( & [ elem_array] , & mut single_hash) ?;
518+ * hash = single_hash[ 0 ] ;
519+ }
520+ }
521+ }
522+ }
523+ }
524+ DataType :: Struct ( _) => {
525+ let struct_array = col. as_any( ) . downcast_ref:: <StructArray >( ) . unwrap( ) ;
526+ // Hash each field of the struct - Spark hashes all fields recursively
527+ let columns: Vec <ArrayRef > = struct_array. columns( ) . to_vec( ) ;
528+ if !columns. is_empty( ) {
529+ $recursive_hash_method( & columns, $hashes_buffer) ?;
530+ }
531+ }
532+ DataType :: Map ( _, _) => {
533+ let map_array = col. as_any( ) . downcast_ref:: <MapArray >( ) . unwrap( ) ;
534+ // For maps, Spark hashes by iterating through (key, value) pairs
535+ // For each entry, hash the key then the value
536+ let keys = map_array. keys( ) ;
537+ let values = map_array. values( ) ;
538+ let offsets = map_array. offsets( ) ;
539+
540+ if map_array. null_count( ) == 0 {
541+ // Fast path: no nulls, skip null checks
542+ for ( row_idx, hash) in $hashes_buffer. iter_mut( ) . enumerate( ) {
543+ let start = offsets[ row_idx] as usize ;
544+ let end = offsets[ row_idx + 1 ] as usize ;
545+ // Hash each key-value pair in sequence
546+ for entry_idx in start..end {
547+ // Hash the key
548+ let key_array = keys. slice( entry_idx, 1 ) ;
549+ let mut single_hash = [ * hash] ;
550+ $recursive_hash_method( & [ key_array] , & mut single_hash) ?;
551+ * hash = single_hash[ 0 ] ;
552+
553+ // Hash the value
554+ let value_array = values. slice( entry_idx, 1 ) ;
555+ single_hash = [ * hash] ;
556+ $recursive_hash_method( & [ value_array] , & mut single_hash) ?;
557+ * hash = single_hash[ 0 ] ;
558+ }
559+ }
560+ } else {
561+ // Slow path: array has nulls, check each row
562+ for ( row_idx, hash) in $hashes_buffer. iter_mut( ) . enumerate( ) {
563+ if !map_array. is_null( row_idx) {
564+ let start = offsets[ row_idx] as usize ;
565+ let end = offsets[ row_idx + 1 ] as usize ;
566+ // Hash each key-value pair in sequence
567+ for entry_idx in start..end {
568+ // Hash the key
569+ let key_array = keys. slice( entry_idx, 1 ) ;
570+ let mut single_hash = [ * hash] ;
571+ $recursive_hash_method( & [ key_array] , & mut single_hash) ?;
572+ * hash = single_hash[ 0 ] ;
573+
574+ // Hash the value
575+ let value_array = values. slice( entry_idx, 1 ) ;
576+ single_hash = [ * hash] ;
577+ $recursive_hash_method( & [ value_array] , & mut single_hash) ?;
578+ * hash = single_hash[ 0 ] ;
579+ }
580+ }
581+ }
582+ }
583+ }
428584 _ => {
429585 // This is internal because we should have caught this before.
430586 return Err ( DataFusionError :: Internal ( format!(
0 commit comments