1818//! Common utilities for implementing unicode functions
1919
2020use arrow:: array:: {
21- Array , ArrayAccessor , ArrayIter , ArrayRef , ByteView , GenericStringArray , Int64Array ,
22- OffsetSizeTrait , StringViewArray , make_view,
21+ Array , ArrayRef , ByteView , GenericStringArray , Int64Array , OffsetSizeTrait ,
22+ StringViewArray , make_view,
2323} ;
2424use arrow:: datatypes:: DataType ;
2525use arrow_buffer:: { NullBuffer , ScalarBuffer } ;
26+ use datafusion_common:: Result ;
2627use datafusion_common:: ScalarValue ;
2728use datafusion_common:: cast:: {
2829 as_generic_string_array, as_int64_array, as_string_view_array,
@@ -99,17 +100,17 @@ fn left_right_byte_length(string: &str, n: i64) -> usize {
99100/// General implementation for `left` and `right` functions
100101pub ( crate ) fn general_left_right < F : LeftRightSlicer > (
101102 args : & [ ArrayRef ] ,
102- ) -> datafusion_common :: Result < ArrayRef > {
103+ ) -> Result < ArrayRef > {
103104 let n_array = as_int64_array ( & args[ 1 ] ) ?;
104105
105106 match args[ 0 ] . data_type ( ) {
106107 DataType :: Utf8 => {
107108 let string_array = as_generic_string_array :: < i32 > ( & args[ 0 ] ) ?;
108- general_left_right_array :: < i32 , _ , F > ( string_array, n_array)
109+ general_left_right_array :: < i32 , F > ( string_array, n_array)
109110 }
110111 DataType :: LargeUtf8 => {
111112 let string_array = as_generic_string_array :: < i64 > ( & args[ 0 ] ) ?;
112- general_left_right_array :: < i64 , _ , F > ( string_array, n_array)
113+ general_left_right_array :: < i64 , F > ( string_array, n_array)
113114 }
114115 DataType :: Utf8View => {
115116 let string_view_array = as_string_view_array ( & args[ 0 ] ) ?;
@@ -119,83 +120,125 @@ pub(crate) fn general_left_right<F: LeftRightSlicer>(
119120 }
120121}
121122
122- /// `general_left_right` implementation for strings
123- fn general_left_right_array <
124- ' a ,
125- T : OffsetSizeTrait ,
126- V : ArrayAccessor < Item = & ' a str > ,
127- F : LeftRightSlicer ,
128- > (
129- string_array : V ,
123+ /// Returns true if all offsets in the array fit in u32, meaning the values
124+ /// buffer can be referenced by StringView's u32 offset field.
125+ fn values_fit_in_u32 < T : OffsetSizeTrait > ( string_array : & GenericStringArray < T > ) -> bool {
126+ string_array
127+ . offsets ( )
128+ . last ( )
129+ . map ( |offset| offset. as_usize ( ) <= u32:: MAX as usize )
130+ . unwrap_or ( true )
131+ }
132+
133+ /// `left`/`right` for Utf8/LargeUtf8 input.
134+ ///
135+ /// When offsets fit in u32, produces a zero-copy `StringViewArray` with views
136+ /// pointing into the input values buffer. Otherwise falls back to building a
137+ /// `StringViewArray` by copying.
138+ fn general_left_right_array < T : OffsetSizeTrait , F : LeftRightSlicer > (
139+ string_array : & GenericStringArray < T > ,
130140 n_array : & Int64Array ,
131- ) -> datafusion_common:: Result < ArrayRef > {
132- let iter = ArrayIter :: new ( string_array) ;
133- let result = iter
134- . zip ( n_array. iter ( ) )
135- . map ( |( string, n) | match ( string, n) {
136- ( Some ( string) , Some ( n) ) => {
137- let range = F :: slice ( string, n) ;
138- // Extract a given range from a byte-indexed slice
139- Some ( & string[ range] )
140- }
141- _ => None ,
142- } )
143- . collect :: < GenericStringArray < T > > ( ) ;
141+ ) -> Result < ArrayRef > {
142+ if !values_fit_in_u32 ( string_array) {
143+ let result = string_array
144+ . iter ( )
145+ . zip ( n_array. iter ( ) )
146+ . map ( |( string, n) | match ( string, n) {
147+ ( Some ( string) , Some ( n) ) => Some ( & string[ F :: slice ( string, n) ] ) ,
148+ _ => None ,
149+ } )
150+ . collect :: < StringViewArray > ( ) ;
151+ return Ok ( Arc :: new ( result) as ArrayRef ) ;
152+ }
153+
154+ let len = string_array. len ( ) ;
155+ let offsets = string_array. value_offsets ( ) ;
156+ let nulls = NullBuffer :: union ( string_array. nulls ( ) , n_array. nulls ( ) ) ;
144157
145- Ok ( Arc :: new ( result) as ArrayRef )
158+ let mut views_buf = Vec :: with_capacity ( len) ;
159+ let mut has_out_of_line = false ;
160+
161+ for ( i, offset) in offsets. iter ( ) . enumerate ( ) . take ( len) {
162+ if nulls. as_ref ( ) . is_some_and ( |n| !n. is_valid ( i) ) {
163+ views_buf. push ( 0 ) ;
164+ continue ;
165+ }
166+
167+ // SAFETY: we just checked validity above
168+ let string = unsafe { string_array. value_unchecked ( i) } ;
169+ let n = n_array. value ( i) ;
170+ let range = F :: slice ( string, n) ;
171+ let result_bytes = & string. as_bytes ( ) [ range. clone ( ) ] ;
172+ if result_bytes. len ( ) > 12 {
173+ has_out_of_line = true ;
174+ }
175+
176+ let buf_offset = offset. as_usize ( ) as u32 + range. start as u32 ;
177+ views_buf. push ( make_view ( result_bytes, 0 , buf_offset) ) ;
178+ }
179+
180+ let views = ScalarBuffer :: from ( views_buf) ;
181+ let data_buffers = if has_out_of_line {
182+ vec ! [ string_array. values( ) . clone( ) ]
183+ } else {
184+ vec ! [ ]
185+ } ;
186+
187+ // SAFETY:
188+ // - Each view is produced by `make_view` with correct bytes and offset
189+ // - Out-of-line views reference buffer index 0, which is the original
190+ // values buffer included in data_buffers when has_out_of_line is true
191+ // - values_fit_in_u32 guarantees all offsets fit in u32
192+ unsafe {
193+ let array = StringViewArray :: new_unchecked ( views, data_buffers, nulls) ;
194+ Ok ( Arc :: new ( array) as ArrayRef )
195+ }
146196}
147197
148- /// `general_left_right` implementation for StringViewArray
198+ /// `general_left_right` for StringViewArray input.
149199fn general_left_right_view < F : LeftRightSlicer > (
150200 string_view_array : & StringViewArray ,
151201 n_array : & Int64Array ,
152- ) -> datafusion_common:: Result < ArrayRef > {
153- let len = n_array. len ( ) ;
154-
202+ ) -> Result < ArrayRef > {
155203 let views = string_view_array. views ( ) ;
156- // Every string in StringViewArray has one corresponding view in `views`
157- debug_assert ! ( views. len( ) == string_view_array. len( ) ) ;
158-
159- // Compose null buffer at once
160- let string_nulls = string_view_array. nulls ( ) ;
161- let n_nulls = n_array. nulls ( ) ;
162- let new_nulls = NullBuffer :: union ( string_nulls, n_nulls) ;
204+ let new_nulls = NullBuffer :: union ( string_view_array. nulls ( ) , n_array. nulls ( ) ) ;
205+ let len = n_array. len ( ) ;
206+ let mut has_out_of_line = false ;
163207
164208 let new_views = ( 0 ..len)
165209 . map ( |idx| {
166- let view = views[ idx] ;
167-
168- let is_valid = match & new_nulls {
169- Some ( nulls_buf) => nulls_buf. is_valid ( idx) ,
170- None => true ,
171- } ;
172-
173- if is_valid {
174- let string: & str = string_view_array. value ( idx) ;
175- let n = n_array. value ( idx) ;
176-
177- // Input string comes from StringViewArray, so it should fit in 32-bit length
178- let range = F :: slice ( string, n) ;
179- let result_bytes = & string. as_bytes ( ) [ range. clone ( ) ] ;
180-
181- let byte_view = ByteView :: from ( view) ;
182- // New offset starts at 0 for left, and at `range.start` for right,
183- // which is encoded in the given range
184- let new_offset = byte_view. offset + ( range. start as u32 ) ;
185- // Reuse buffer
186- make_view ( result_bytes, byte_view. buffer_index , new_offset)
187- } else {
188- // For nulls, keep the original view
189- view
210+ if new_nulls. as_ref ( ) . is_some_and ( |n| !n. is_valid ( idx) ) {
211+ return 0 ;
190212 }
213+
214+ // SAFETY: we just checked validity above
215+ let string: & str = unsafe { string_view_array. value_unchecked ( idx) } ;
216+ let n = n_array. value ( idx) ;
217+
218+ let range = F :: slice ( string, n) ;
219+ let result_bytes = & string. as_bytes ( ) [ range. clone ( ) ] ;
220+ if result_bytes. len ( ) > 12 {
221+ has_out_of_line = true ;
222+ }
223+
224+ let byte_view = ByteView :: from ( views[ idx] ) ;
225+ let new_offset = byte_view. offset + ( range. start as u32 ) ;
226+ make_view ( result_bytes, byte_view. buffer_index , new_offset)
191227 } )
192228 . collect :: < Vec < u128 > > ( ) ;
193229
194- // Buffers are unchanged
195- let result = StringViewArray :: try_new (
196- ScalarBuffer :: from ( new_views) ,
197- Vec :: from ( string_view_array. data_buffers ( ) ) ,
198- new_nulls,
199- ) ?;
200- Ok ( Arc :: new ( result) as ArrayRef )
230+ let views = ScalarBuffer :: from ( new_views) ;
231+ let data_buffers = if has_out_of_line {
232+ string_view_array. data_buffers ( ) . to_vec ( )
233+ } else {
234+ vec ! [ ]
235+ } ;
236+
237+ // SAFETY:
238+ // - Each view is produced by `make_view` with correct bytes and offset
239+ // - Out-of-line views reuse the original buffer index and adjusted offset
240+ unsafe {
241+ let array = StringViewArray :: new_unchecked ( views, data_buffers, new_nulls) ;
242+ Ok ( Arc :: new ( array) as ArrayRef )
243+ }
201244}
0 commit comments