@@ -30,7 +30,30 @@ use crate::types::bytes::ByteArrayNativeType;
3030use crate :: types:: { BinaryViewType , ByteViewType , StringViewType } ;
3131use crate :: { ArrayRef , GenericByteViewArray } ;
3232
33- const DEFAULT_BLOCK_SIZE : u32 = 8 * 1024 ;
33+ const STARTING_BLOCK_SIZE : u32 = 8 * 1024 ; // 8KiB
34+ const MAX_BLOCK_SIZE : u32 = 2 * 1024 * 1024 ; // 2MiB
35+
36+ enum BlockSizeGrowthStrategy {
37+ Fixed { size : u32 } ,
38+ Exponential { current_size : u32 } ,
39+ }
40+
41+ impl BlockSizeGrowthStrategy {
42+ fn next_size ( & mut self ) -> u32 {
43+ match self {
44+ Self :: Fixed { size } => * size,
45+ Self :: Exponential { current_size } => {
46+ if * current_size < MAX_BLOCK_SIZE {
47+ // we have fixed start/end block sizes, so we can't overflow
48+ * current_size = current_size. saturating_mul ( 2 ) ;
49+ * current_size
50+ } else {
51+ MAX_BLOCK_SIZE
52+ }
53+ }
54+ }
55+ }
56+ }
3457
3558/// A builder for [`GenericByteViewArray`]
3659///
@@ -58,7 +81,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
5881 null_buffer_builder : NullBufferBuilder ,
5982 completed : Vec < Buffer > ,
6083 in_progress : Vec < u8 > ,
61- block_size : u32 ,
84+ block_size : BlockSizeGrowthStrategy ,
6285 /// Some if deduplicating strings
6386 /// map `<string hash> -> <index to the views>`
6487 string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
@@ -78,15 +101,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
78101 null_buffer_builder : NullBufferBuilder :: new ( capacity) ,
79102 completed : vec ! [ ] ,
80103 in_progress : vec ! [ ] ,
81- block_size : DEFAULT_BLOCK_SIZE ,
104+ block_size : BlockSizeGrowthStrategy :: Exponential {
105+ current_size : STARTING_BLOCK_SIZE ,
106+ } ,
82107 string_tracker : None ,
83108 phantom : Default :: default ( ) ,
84109 }
85110 }
86111
112+ /// Set a fixed buffer size for variable length strings
113+ ///
114+ /// The block size is the size of the buffer used to store values greater
115+ /// than 12 bytes. The builder allocates new buffers when the current
116+ /// buffer is full.
117+ ///
118+ /// By default the builder balances buffer size and buffer count by
119+ /// growing buffer size exponentially from 8KB up to 2MB. The
120+ /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB.
121+ ///
122+ /// If this method is used, any new buffers allocated are
123+ /// exactly this size. This can be useful for advanced users
124+ /// that want to control the memory usage and buffer count.
125+ ///
126+ /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications.
127+ pub fn with_fixed_block_size ( self , block_size : u32 ) -> Self {
128+ debug_assert ! ( block_size > 0 , "Block size must be greater than 0" ) ;
129+ Self {
130+ block_size : BlockSizeGrowthStrategy :: Fixed { size : block_size } ,
131+ ..self
132+ }
133+ }
134+
87135 /// Override the size of buffers to allocate for holding string data
136+ /// Use `with_fixed_block_size` instead.
137+ #[ deprecated( note = "Use `with_fixed_block_size` instead" ) ]
88138 pub fn with_block_size ( self , block_size : u32 ) -> Self {
89- Self { block_size , .. self }
139+ self . with_fixed_block_size ( block_size )
90140 }
91141
92142 /// Deduplicate strings while building the array
@@ -277,7 +327,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
277327 let required_cap = self . in_progress . len ( ) + v. len ( ) ;
278328 if self . in_progress . capacity ( ) < required_cap {
279329 self . flush_in_progress ( ) ;
280- let to_reserve = v. len ( ) . max ( self . block_size as usize ) ;
330+ let to_reserve = v. len ( ) . max ( self . block_size . next_size ( ) as usize ) ;
281331 self . in_progress . reserve ( to_reserve) ;
282332 } ;
283333 let offset = self . in_progress . len ( ) as u32 ;
@@ -478,7 +528,7 @@ mod tests {
478528
479529 let mut builder = StringViewBuilder :: new ( )
480530 . with_deduplicate_strings ( )
481- . with_block_size ( value_1. len ( ) as u32 * 2 ) ; // so that we will have multiple buffers
531+ . with_fixed_block_size ( value_1. len ( ) as u32 * 2 ) ; // so that we will have multiple buffers
482532
483533 let values = vec ! [
484534 Some ( value_1) ,
@@ -585,4 +635,46 @@ mod tests {
585635 "Invalid argument error: No block found with index 5"
586636 ) ;
587637 }
638+
639+ #[ test]
640+ fn test_string_view_with_block_size_growth ( ) {
641+ let mut exp_builder = StringViewBuilder :: new ( ) ;
642+ let mut fixed_builder = StringViewBuilder :: new ( ) . with_fixed_block_size ( STARTING_BLOCK_SIZE ) ;
643+
644+ let long_string = String :: from_utf8 ( vec ! [ b'a' ; STARTING_BLOCK_SIZE as usize ] ) . unwrap ( ) ;
645+
646+ for i in 0 ..9 {
647+ // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
648+ for _ in 0 ..( 2_u32 . pow ( i) ) {
649+ exp_builder. append_value ( & long_string) ;
650+ fixed_builder. append_value ( & long_string) ;
651+ }
652+ exp_builder. flush_in_progress ( ) ;
653+ fixed_builder. flush_in_progress ( ) ;
654+
655+ // Every step only add one buffer, but the buffer size is much larger
656+ assert_eq ! ( exp_builder. completed. len( ) , i as usize + 1 ) ;
657+ assert_eq ! (
658+ exp_builder. completed[ i as usize ] . len( ) ,
659+ STARTING_BLOCK_SIZE as usize * 2_usize . pow( i)
660+ ) ;
661+
662+ // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1
663+ assert_eq ! ( fixed_builder. completed. len( ) , 2_usize . pow( i + 1 ) - 1 ) ;
664+
665+ // Every buffer is fixed size
666+ assert ! ( fixed_builder
667+ . completed
668+ . iter( )
669+ . all( |b| b. len( ) == STARTING_BLOCK_SIZE as usize ) ) ;
670+ }
671+
672+ // Add one more value, and the buffer stop growing.
673+ exp_builder. append_value ( & long_string) ;
674+ exp_builder. flush_in_progress ( ) ;
675+ assert_eq ! (
676+ exp_builder. completed. last( ) . unwrap( ) . capacity( ) ,
677+ MAX_BLOCK_SIZE as usize
678+ ) ;
679+ }
588680}
0 commit comments