1515// specific language governing permissions and limitations
1616// under the License.
1717
18- use std:: sync:: Arc ;
19-
20- use crate :: utils:: utf8_to_str_type;
21- use arrow:: array:: {
22- Array , ArrayRef , AsArray , GenericStringArray , GenericStringBuilder , Int64Array ,
23- StringArrayType , StringLikeArrayBuilder , StringViewArray , StringViewBuilder ,
18+ use crate :: strings:: {
19+ BulkNullStringArrayBuilder , GenericStringArrayBuilder , StringViewArrayBuilder ,
2420} ;
21+ use crate :: utils:: utf8_to_str_type;
22+ use arrow:: array:: { Array , ArrayRef , AsArray , Int64Array , StringArrayType } ;
23+ use arrow:: buffer:: NullBuffer ;
2524use arrow:: datatypes:: DataType ;
2625use arrow:: datatypes:: DataType :: { LargeUtf8 , Utf8 , Utf8View } ;
2726use datafusion_common:: cast:: as_int64_array;
@@ -190,43 +189,28 @@ fn repeat(string_array: &ArrayRef, count_array: &ArrayRef) -> Result<ArrayRef> {
190189 number_array,
191190 i32:: MAX as usize ,
192191 ) ?;
193- let builder = StringViewBuilder :: with_capacity ( string_array. len ( ) ) ;
194- repeat_impl :: < & StringViewArray , StringViewBuilder > (
195- & string_view_array,
196- number_array,
197- max_item_capacity,
198- builder,
199- )
192+ let builder = StringViewArrayBuilder :: with_capacity ( string_array. len ( ) ) ;
193+ repeat_impl ( & string_view_array, number_array, max_item_capacity, builder)
200194 }
201195 Utf8 => {
202196 let string_arr = string_array. as_string :: < i32 > ( ) ;
203197 let ( total_capacity, max_item_capacity) =
204198 calculate_capacities ( & string_arr, number_array, i32:: MAX as usize ) ?;
205- let builder = GenericStringBuilder :: < i32 > :: with_capacity (
199+ let builder = GenericStringArrayBuilder :: < i32 > :: with_capacity (
206200 string_array. len ( ) ,
207201 total_capacity,
208202 ) ;
209- repeat_impl :: < & GenericStringArray < i32 > , GenericStringBuilder < i32 > > (
210- & string_arr,
211- number_array,
212- max_item_capacity,
213- builder,
214- )
203+ repeat_impl ( & string_arr, number_array, max_item_capacity, builder)
215204 }
216205 LargeUtf8 => {
217206 let string_arr = string_array. as_string :: < i64 > ( ) ;
218207 let ( total_capacity, max_item_capacity) =
219208 calculate_capacities ( & string_arr, number_array, i64:: MAX as usize ) ?;
220- let builder = GenericStringBuilder :: < i64 > :: with_capacity (
209+ let builder = GenericStringArrayBuilder :: < i64 > :: with_capacity (
221210 string_array. len ( ) ,
222211 total_capacity,
223212 ) ;
224- repeat_impl :: < & GenericStringArray < i64 > , GenericStringBuilder < i64 > > (
225- & string_arr,
226- number_array,
227- max_item_capacity,
228- builder,
229- )
213+ repeat_impl ( & string_arr, number_array, max_item_capacity, builder)
230214 }
231215 other => exec_err ! (
232216 "Unsupported data type {other:?} for function repeat. \
@@ -278,7 +262,7 @@ fn repeat_impl<'a, S, B>(
278262) -> Result < ArrayRef >
279263where
280264 S : StringArrayType < ' a > + ' a ,
281- B : StringLikeArrayBuilder ,
265+ B : BulkNullStringArrayBuilder ,
282266{
283267 // Reusable buffer to avoid allocations in string.repeat()
284268 let mut buffer = Vec :: < u8 > :: with_capacity ( max_item_capacity) ;
@@ -301,12 +285,18 @@ where
301285 }
302286 }
303287
304- // Fast path: no nulls in either array
305- if string_array. null_count ( ) == 0 && number_array. null_count ( ) == 0 {
288+ // Output is null IFF either input is null
289+ let nulls = NullBuffer :: union ( string_array. nulls ( ) , number_array. nulls ( ) ) ;
290+
291+ if let Some ( ref n) = nulls {
306292 for i in 0 ..string_array. len ( ) {
307- // SAFETY: i is within bounds (0..len) and null_count() == 0 guarantees valid value
293+ if n. is_null ( i) {
294+ builder. append_placeholder ( ) ;
295+ continue ;
296+ }
297+ // SAFETY: index `i` in both arrays is valid
308298 let string = unsafe { string_array. value_unchecked ( i) } ;
309- let count = number_array. value ( i) ;
299+ let count = unsafe { number_array. value_unchecked ( i) } ;
310300 if count > 0 {
311301 repeat_to_buffer ( & mut buffer, string, count as usize ) ;
312302 // SAFETY: buffer contains valid UTF-8 since we only copy from a valid &str
@@ -316,27 +306,30 @@ where
316306 }
317307 }
318308 } else {
319- // Slow path: handle nulls
320- for ( string, number) in string_array. iter ( ) . zip ( number_array. iter ( ) ) {
321- match ( string, number) {
322- ( Some ( string) , Some ( count) ) if count > 0 => {
323- repeat_to_buffer ( & mut buffer, string, count as usize ) ;
324- // SAFETY: buffer contains valid UTF-8 since we only copy from a valid &str
325- builder
326- . append_value ( unsafe { std:: str:: from_utf8_unchecked ( & buffer) } ) ;
327- }
328- ( Some ( _) , Some ( _) ) => builder. append_value ( "" ) ,
329- _ => builder. append_null ( ) ,
309+ for i in 0 ..string_array. len ( ) {
310+ // SAFETY: no nulls, so every index in both arrays is valid
311+ let string = unsafe { string_array. value_unchecked ( i) } ;
312+ let count = unsafe { number_array. value_unchecked ( i) } ;
313+ if count > 0 {
314+ repeat_to_buffer ( & mut buffer, string, count as usize ) ;
315+ // SAFETY: buffer contains valid UTF-8 since we only copy from a valid &str
316+ builder. append_value ( unsafe { std:: str:: from_utf8_unchecked ( & buffer) } ) ;
317+ } else {
318+ builder. append_value ( "" ) ;
330319 }
331320 }
332321 }
333322
334- Ok ( Arc :: new ( builder. finish ( ) ) as ArrayRef )
323+ builder. finish ( nulls )
335324}
336325
337326#[ cfg( test) ]
338327mod tests {
339- use arrow:: array:: { Array , LargeStringArray , StringArray , StringViewArray } ;
328+ use std:: sync:: Arc ;
329+
330+ use arrow:: array:: {
331+ Array , ArrayRef , Int64Array , LargeStringArray , StringArray , StringViewArray ,
332+ } ;
340333 use arrow:: datatypes:: DataType :: { LargeUtf8 , Utf8 , Utf8View } ;
341334
342335 use datafusion_common:: ScalarValue ;
@@ -444,4 +437,69 @@ mod tests {
444437
445438 Ok ( ( ) )
446439 }
440+
441+ // Slicing the input arrays produces a NullBuffer with a non-zero offset.
442+ // The tests below use 6-row inputs sliced to (1, 4) so that:
443+ // slot 0 (orig 1): "a" × 3 → "aaa"
444+ // slot 1 (orig 2): "bb" × 2 → "bbbb"
445+ // slot 2 (orig 3): "c" × NULL → NULL (count-side null)
446+ // slot 3 (orig 4): NULL × 1 → NULL (string-side null)
447+ fn sliced_offset_inputs < F > ( make_strings : F ) -> ( ArrayRef , ArrayRef )
448+ where
449+ F : FnOnce ( Vec < Option < & ' static str > > ) -> ArrayRef ,
450+ {
451+ let strings = make_strings ( vec ! [
452+ None ,
453+ Some ( "a" ) ,
454+ Some ( "bb" ) ,
455+ Some ( "c" ) ,
456+ None ,
457+ Some ( "d" ) ,
458+ ] ) ;
459+ let counts: ArrayRef = Arc :: new ( Int64Array :: from ( vec ! [
460+ Some ( 2 ) ,
461+ Some ( 3 ) ,
462+ Some ( 2 ) ,
463+ None ,
464+ Some ( 1 ) ,
465+ Some ( 2 ) ,
466+ ] ) ) ;
467+ ( strings. slice ( 1 , 4 ) , counts. slice ( 1 , 4 ) )
468+ }
469+
470+ fn assert_sliced_offset_output < A : Array + ' static > ( result : ArrayRef )
471+ where
472+ for < ' a > & ' a A : arrow:: array:: ArrayAccessor < Item = & ' a str > ,
473+ {
474+ let result = result. as_any ( ) . downcast_ref :: < A > ( ) . unwrap ( ) ;
475+ assert_eq ! ( result. len( ) , 4 ) ;
476+ assert_eq ! ( arrow:: array:: ArrayAccessor :: value( & result, 0 ) , "aaa" ) ;
477+ assert_eq ! ( arrow:: array:: ArrayAccessor :: value( & result, 1 ) , "bbbb" ) ;
478+ assert ! ( result. is_null( 2 ) ) ;
479+ assert ! ( result. is_null( 3 ) ) ;
480+ assert_eq ! ( result. null_count( ) , 2 ) ;
481+ }
482+
// `repeat` over sliced `StringArray` input with nulls on both sides.
#[test]
fn test_repeat_sliced_string_with_null_offset() {
    let build = |rows: Vec<Option<&'static str>>| -> ArrayRef {
        Arc::new(StringArray::from(rows))
    };
    let (sliced_strings, sliced_counts) = sliced_offset_inputs(build);

    let repeated = super::repeat(&sliced_strings, &sliced_counts).unwrap();

    assert_sliced_offset_output::<StringArray>(repeated);
}
489+
// `repeat` over sliced `LargeStringArray` input with nulls on both sides.
#[test]
fn test_repeat_sliced_large_string_with_null_offset() {
    let build = |rows: Vec<Option<&'static str>>| -> ArrayRef {
        Arc::new(LargeStringArray::from(rows))
    };
    let (sliced_strings, sliced_counts) = sliced_offset_inputs(build);

    let repeated = super::repeat(&sliced_strings, &sliced_counts).unwrap();

    assert_sliced_offset_output::<LargeStringArray>(repeated);
}
497+
// `repeat` over sliced `StringViewArray` input with nulls on both sides.
#[test]
fn test_repeat_sliced_string_view_with_null_offset() {
    let build = |rows: Vec<Option<&'static str>>| -> ArrayRef {
        Arc::new(StringViewArray::from(rows))
    };
    let (sliced_strings, sliced_counts) = sliced_offset_inputs(build);

    let repeated = super::repeat(&sliced_strings, &sliced_counts).unwrap();

    assert_sliced_offset_output::<StringViewArray>(repeated);
}
447505}
0 commit comments