@@ -133,27 +133,31 @@ fn split_array(
133133 // Build the result ListArray
134134 let mut offsets: Vec < i32 > = Vec :: with_capacity ( string_array. len ( ) + 1 ) ;
135135 let mut values: Vec < String > = Vec :: new ( ) ;
136+ let mut null_buffer_builder = arrow:: array:: BooleanBufferBuilder :: new ( string_array. len ( ) ) ;
136137 offsets. push ( 0 ) ;
137138
138139 for i in 0 ..string_array. len ( ) {
139140 if string_array. is_null ( i) {
140- // NULL input produces empty array element (maintain position )
141+ // NULL input produces NULL in result (Spark behavior )
141142 offsets. push ( offsets[ i] ) ;
143+ null_buffer_builder. append ( false ) ; // false = NULL
142144 } else {
143145 let string_val = string_array. value ( i) ;
144146 let parts = split_with_regex ( string_val, & regex, limit) ;
145147 values. extend ( parts) ;
146148 offsets. push ( values. len ( ) as i32 ) ;
149+ null_buffer_builder. append ( true ) ; // true = valid
147150 }
148151 }
149152
150153 let values_array = Arc :: new ( GenericStringArray :: < i32 > :: from ( values) ) as ArrayRef ;
151- let field = Arc :: new ( Field :: new ( "item" , DataType :: Utf8 , false ) ) ;
154+ let field = Arc :: new ( Field :: new ( "item" , DataType :: Utf8 , true ) ) ;
155+ let nulls = arrow:: buffer:: NullBuffer :: new ( null_buffer_builder. finish ( ) ) ;
152156 let list_array = ListArray :: new (
153157 field,
154158 arrow:: buffer:: OffsetBuffer :: new ( offsets. into ( ) ) ,
155159 values_array,
156- None , // No nulls at list level
160+ Some ( nulls) ,
157161 ) ;
158162
159163 Ok ( ColumnarValue :: Array ( Arc :: new ( list_array) ) )
@@ -166,26 +170,31 @@ fn split_large_string_array(
166170) -> DataFusionResult < ColumnarValue > {
167171 let mut offsets: Vec < i32 > = Vec :: with_capacity ( string_array. len ( ) + 1 ) ;
168172 let mut values: Vec < String > = Vec :: new ( ) ;
173+ let mut null_buffer_builder = arrow:: array:: BooleanBufferBuilder :: new ( string_array. len ( ) ) ;
169174 offsets. push ( 0 ) ;
170175
171176 for i in 0 ..string_array. len ( ) {
172177 if string_array. is_null ( i) {
178+ // NULL input produces NULL in result (Spark behavior)
173179 offsets. push ( offsets[ i] ) ;
180+ null_buffer_builder. append ( false ) ; // false = NULL
174181 } else {
175182 let string_val = string_array. value ( i) ;
176183 let parts = split_with_regex ( string_val, regex, limit) ;
177184 values. extend ( parts) ;
178185 offsets. push ( values. len ( ) as i32 ) ;
186+ null_buffer_builder. append ( true ) ; // true = valid
179187 }
180188 }
181189
182190 let values_array = Arc :: new ( GenericStringArray :: < i32 > :: from ( values) ) as ArrayRef ;
183- let field = Arc :: new ( Field :: new ( "item" , DataType :: Utf8 , false ) ) ;
191+ let field = Arc :: new ( Field :: new ( "item" , DataType :: Utf8 , true ) ) ;
192+ let nulls = arrow:: buffer:: NullBuffer :: new ( null_buffer_builder. finish ( ) ) ;
184193 let list_array = ListArray :: new (
185194 field,
186195 arrow:: buffer:: OffsetBuffer :: new ( offsets. into ( ) ) ,
187196 values_array,
188- None ,
197+ Some ( nulls ) ,
189198 ) ;
190199
191200 Ok ( ColumnarValue :: Array ( Arc :: new ( list_array) ) )
@@ -309,4 +318,41 @@ mod tests {
309318 let parts = split_string ( "a,b,c,," , "," , -1 ) . unwrap ( ) ;
310319 assert_eq ! ( parts, vec![ "a" , "b" , "c" , "" , "" ] ) ;
311320 }
321+
/// NULL input rows must come back as NULL list entries (not empty lists),
/// and non-NULL rows must still carry their split parts.
#[test]
fn test_split_with_nulls() {
    let string_array = Arc::new(StringArray::from(vec![
        Some("a,b,c"),
        None,
        Some("x,y"),
        None,
    ])) as ArrayRef;
    let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string())));
    let args = vec![ColumnarValue::Array(string_array), pattern];

    let result = spark_split(&args).unwrap();
    let ColumnarValue::Array(arr) = result else {
        panic!("Expected Array result");
    };
    let list_array = arr.as_any().downcast_ref::<ListArray>().unwrap();
    assert_eq!(list_array.len(), 4);

    // Extracts the string elements of one list row so the contents can be
    // compared, not just the validity bit.
    let row_parts = |row: usize| -> Vec<Option<String>> {
        let row_values = list_array.value(row);
        row_values
            .as_any()
            .downcast_ref::<StringArray>()
            .expect("list items should be Utf8 strings")
            .iter()
            .map(|part| part.map(str::to_owned))
            .collect()
    };

    // First row: valid ["a", "b", "c"]
    assert!(!list_array.is_null(0));
    assert_eq!(
        row_parts(0),
        vec![
            Some("a".to_string()),
            Some("b".to_string()),
            Some("c".to_string()),
        ]
    );

    // Second row: NULL
    assert!(list_array.is_null(1));

    // Third row: valid ["x", "y"]; this also checks that the offsets stay
    // aligned after a NULL row.
    assert!(!list_array.is_null(2));
    assert_eq!(
        row_parts(2),
        vec![Some("x".to_string()), Some("y".to_string())]
    );

    // Fourth row: NULL
    assert!(list_array.is_null(3));
}
351+
/// Splitting an empty input string yields exactly one element: the empty
/// string itself.
#[test]
fn test_split_empty_string() {
    let actual = split_string("", ",", -1).unwrap();
    assert_eq!(actual, vec![""]);
}
312358}
0 commit comments