1919
2020use ahash:: RandomState ;
2121use arrow:: array:: {
22- Array , ArrayRef , ArrowPrimitiveType , DictionaryArray , GenericStringArray ,
23- NullBufferBuilder , OffsetSizeTrait , PrimitiveArray , RunArray , StringViewArray ,
24- StructArray , make_array,
22+ Array , ArrayRef , ArrowPrimitiveType , DictionaryArray , GenericStringArray , Int32Array ,
23+ Int64Array , ListArray , MapArray , NullBufferBuilder , OffsetSizeTrait , PrimitiveArray ,
24+ RunArray , StringViewArray , StructArray , UnionArray , make_array,
2525} ;
26- use arrow:: buffer:: NullBuffer ;
26+ use arrow:: buffer:: { NullBuffer , OffsetBuffer , ScalarBuffer } ;
2727use arrow:: datatypes:: {
28- ArrowDictionaryKeyType , DataType , Field , Fields , Int32Type , Int64Type ,
28+ ArrowDictionaryKeyType , DataType , Field , Fields , Int32Type , Int64Type , UnionFields ,
2929} ;
3030use criterion:: { Bencher , Criterion , criterion_group, criterion_main} ;
3131use datafusion_common:: hash_utils:: with_hashes;
@@ -40,6 +40,7 @@ const BATCH_SIZE: usize = 8192;
4040struct BenchData {
4141 name : & ' static str ,
4242 array : ArrayRef ,
43+ /// Union arrays can't have null bitmasks added
4344 supports_nulls : bool ,
4445}
4546
@@ -78,6 +79,26 @@ fn criterion_benchmark(c: &mut Criterion) {
7879 array : pool. dictionary_array :: < Int32Type > ( BATCH_SIZE ) ,
7980 supports_nulls : true ,
8081 } ,
82+ BenchData {
83+ name : "list_array" ,
84+ array : list_array ( BATCH_SIZE ) ,
85+ supports_nulls : true ,
86+ } ,
87+ BenchData {
88+ name : "map_array" ,
89+ array : map_array ( BATCH_SIZE ) ,
90+ supports_nulls : true ,
91+ } ,
92+ BenchData {
93+ name : "sparse_union" ,
94+ array : sparse_union_array ( BATCH_SIZE ) ,
95+ supports_nulls : false ,
96+ } ,
97+ BenchData {
98+ name : "dense_union" ,
99+ array : dense_union_array ( BATCH_SIZE ) ,
100+ supports_nulls : false ,
101+ } ,
81102 BenchData {
82103 name : "struct_array" ,
83104 array : create_struct_array ( & pool, BATCH_SIZE ) ,
@@ -103,10 +124,9 @@ fn criterion_benchmark(c: &mut Criterion) {
103124 let arrays = vec ! [ array. clone( ) , array. clone( ) , array. clone( ) ] ;
104125 do_hash_test ( b, & arrays) ;
105126 } ) ;
106-
127+ // Union arrays can't have null bitmasks
107128 if supports_nulls {
108129 let nullable_array = add_nulls ( & array) ;
109-
110130 c. bench_function ( & format ! ( "{name}: single, nulls" ) , |b| {
111131 do_hash_test ( b, std:: slice:: from_ref ( & nullable_array) ) ;
112132 } ) ;
@@ -268,6 +288,222 @@ where
268288 Arc :: new ( array)
269289}
270290
291+ /// Benchmark sliced arrays to demonstrate the optimization for when an array is
292+ /// sliced, the underlying buffer may be much larger than what's referenced by
293+ /// the slice. The optimization avoids hashing unreferenced elements.
294+ fn sliced_array_benchmark ( c : & mut Criterion ) {
295+ // Test with different slice ratios: slice_size / total_size
296+ // Smaller ratio = more potential savings from the optimization
297+ let slice_ratios = [ 10 , 5 , 2 ] ; // 1/10, 1/5, 1/2 of total
298+
299+ for ratio in slice_ratios {
300+ let total_rows = BATCH_SIZE * ratio;
301+ let slice_offset = BATCH_SIZE * ( ratio / 2 ) ; // Take from middle
302+ let slice_len = BATCH_SIZE ;
303+
304+ // Sliced ListArray
305+ {
306+ let full_array = list_array ( total_rows) ;
307+ let sliced: ArrayRef = Arc :: new (
308+ full_array
309+ . as_any ( )
310+ . downcast_ref :: < ListArray > ( )
311+ . unwrap ( )
312+ . slice ( slice_offset, slice_len) ,
313+ ) ;
314+ c. bench_function (
315+ & format ! ( "list_array_sliced: 1/{ratio} of {total_rows} rows" ) ,
316+ |b| {
317+ do_hash_test_with_len ( b, std:: slice:: from_ref ( & sliced) , slice_len) ;
318+ } ,
319+ ) ;
320+ }
321+
322+ // Sliced MapArray
323+ {
324+ let full_array = map_array ( total_rows) ;
325+ let sliced: ArrayRef = Arc :: new (
326+ full_array
327+ . as_any ( )
328+ . downcast_ref :: < MapArray > ( )
329+ . unwrap ( )
330+ . slice ( slice_offset, slice_len) ,
331+ ) ;
332+ c. bench_function (
333+ & format ! ( "map_array_sliced: 1/{ratio} of {total_rows} rows" ) ,
334+ |b| {
335+ do_hash_test_with_len ( b, std:: slice:: from_ref ( & sliced) , slice_len) ;
336+ } ,
337+ ) ;
338+ }
339+
340+ // Sliced Sparse UnionArray
341+ {
342+ let full_array = sparse_union_array ( total_rows) ;
343+ let sliced: ArrayRef = Arc :: new (
344+ full_array
345+ . as_any ( )
346+ . downcast_ref :: < UnionArray > ( )
347+ . unwrap ( )
348+ . slice ( slice_offset, slice_len) ,
349+ ) ;
350+ c. bench_function (
351+ & format ! ( "sparse_union_sliced: 1/{ratio} of {total_rows} rows" ) ,
352+ |b| {
353+ do_hash_test_with_len ( b, std:: slice:: from_ref ( & sliced) , slice_len) ;
354+ } ,
355+ ) ;
356+ }
357+ }
358+ }
359+
360+ fn do_hash_test_with_len ( b : & mut Bencher , arrays : & [ ArrayRef ] , expected_len : usize ) {
361+ let state = RandomState :: new ( ) ;
362+ b. iter ( || {
363+ with_hashes ( arrays, & state, |hashes| {
364+ assert_eq ! ( hashes. len( ) , expected_len) ;
365+ Ok ( ( ) )
366+ } )
367+ . unwrap ( ) ;
368+ } ) ;
369+ }
370+
371+ fn list_array ( num_rows : usize ) -> ArrayRef {
372+ let mut rng = make_rng ( ) ;
373+ let elements_per_row = 5 ;
374+ let total_elements = num_rows * elements_per_row;
375+
376+ let values: Int64Array = ( 0 ..total_elements)
377+ . map ( |_| Some ( rng. random :: < i64 > ( ) ) )
378+ . collect ( ) ;
379+ let offsets: Vec < i32 > = ( 0 ..=num_rows)
380+ . map ( |i| ( i * elements_per_row) as i32 )
381+ . collect ( ) ;
382+
383+ Arc :: new ( ListArray :: new (
384+ Arc :: new ( Field :: new ( "item" , DataType :: Int64 , true ) ) ,
385+ OffsetBuffer :: new ( ScalarBuffer :: from ( offsets) ) ,
386+ Arc :: new ( values) ,
387+ None ,
388+ ) )
389+ }
390+
391+ fn map_array ( num_rows : usize ) -> ArrayRef {
392+ let mut rng = make_rng ( ) ;
393+ let entries_per_row = 5 ;
394+ let total_entries = num_rows * entries_per_row;
395+
396+ let keys: Int32Array = ( 0 ..total_entries)
397+ . map ( |_| Some ( rng. random :: < i32 > ( ) ) )
398+ . collect ( ) ;
399+ let values: Int64Array = ( 0 ..total_entries)
400+ . map ( |_| Some ( rng. random :: < i64 > ( ) ) )
401+ . collect ( ) ;
402+ let offsets: Vec < i32 > = ( 0 ..=num_rows)
403+ . map ( |i| ( i * entries_per_row) as i32 )
404+ . collect ( ) ;
405+
406+ let entries = StructArray :: try_new (
407+ Fields :: from ( vec ! [
408+ Field :: new( "keys" , DataType :: Int32 , false ) ,
409+ Field :: new( "values" , DataType :: Int64 , true ) ,
410+ ] ) ,
411+ vec ! [ Arc :: new( keys) , Arc :: new( values) ] ,
412+ None ,
413+ )
414+ . unwrap ( ) ;
415+
416+ Arc :: new ( MapArray :: new (
417+ Arc :: new ( Field :: new (
418+ "entries" ,
419+ DataType :: Struct ( Fields :: from ( vec ! [
420+ Field :: new( "keys" , DataType :: Int32 , false ) ,
421+ Field :: new( "values" , DataType :: Int64 , true ) ,
422+ ] ) ) ,
423+ false ,
424+ ) ) ,
425+ OffsetBuffer :: new ( ScalarBuffer :: from ( offsets) ) ,
426+ entries,
427+ None ,
428+ false ,
429+ ) )
430+ }
431+
432+ fn sparse_union_array ( num_rows : usize ) -> ArrayRef {
433+ let mut rng = make_rng ( ) ;
434+ let num_types = 5 ;
435+
436+ let type_ids: Vec < i8 > = ( 0 ..num_rows)
437+ . map ( |_| rng. random_range ( 0 ..num_types) as i8 )
438+ . collect ( ) ;
439+ let ( fields, children) : ( Vec < _ > , Vec < _ > ) = ( 0 ..num_types)
440+ . map ( |i| {
441+ (
442+ (
443+ i as i8 ,
444+ Arc :: new ( Field :: new ( format ! ( "f{i}" ) , DataType :: Int64 , true ) ) ,
445+ ) ,
446+ primitive_array :: < Int64Type > ( num_rows) ,
447+ )
448+ } )
449+ . unzip ( ) ;
450+
451+ Arc :: new (
452+ UnionArray :: try_new (
453+ UnionFields :: from_iter ( fields) ,
454+ ScalarBuffer :: from ( type_ids) ,
455+ None ,
456+ children,
457+ )
458+ . unwrap ( ) ,
459+ )
460+ }
461+
462+ fn dense_union_array ( num_rows : usize ) -> ArrayRef {
463+ let mut rng = make_rng ( ) ;
464+ let num_types = 5 ;
465+ let type_ids: Vec < i8 > = ( 0 ..num_rows)
466+ . map ( |_| rng. random_range ( 0 ..num_types) as i8 )
467+ . collect ( ) ;
468+
469+ let mut type_counts = vec ! [ 0i32 ; num_types] ;
470+ for & tid in & type_ids {
471+ type_counts[ tid as usize ] += 1 ;
472+ }
473+
474+ let mut current_offsets = vec ! [ 0i32 ; num_types] ;
475+ let offsets: Vec < i32 > = type_ids
476+ . iter ( )
477+ . map ( |& tid| {
478+ let offset = current_offsets[ tid as usize ] ;
479+ current_offsets[ tid as usize ] += 1 ;
480+ offset
481+ } )
482+ . collect ( ) ;
483+
484+ let ( fields, children) : ( Vec < _ > , Vec < _ > ) = ( 0 ..num_types)
485+ . map ( |i| {
486+ (
487+ (
488+ i as i8 ,
489+ Arc :: new ( Field :: new ( format ! ( "f{i}" ) , DataType :: Int64 , true ) ) ,
490+ ) ,
491+ primitive_array :: < Int64Type > ( type_counts[ i] as usize ) ,
492+ )
493+ } )
494+ . unzip ( ) ;
495+
496+ Arc :: new (
497+ UnionArray :: try_new (
498+ UnionFields :: from_iter ( fields) ,
499+ ScalarBuffer :: from ( type_ids) ,
500+ Some ( ScalarBuffer :: from ( offsets) ) ,
501+ children,
502+ )
503+ . unwrap ( ) ,
504+ )
505+ }
506+
271507fn boolean_array ( array_len : usize ) -> ArrayRef {
272508 let mut rng = make_rng ( ) ;
273509 Arc :: new (
@@ -329,5 +565,5 @@ where
329565 )
330566}
331567
332- criterion_group ! ( benches, criterion_benchmark) ;
568+ criterion_group ! ( benches, criterion_benchmark, sliced_array_benchmark ) ;
333569criterion_main ! ( benches) ;
0 commit comments