Skip to content

Commit 905b2d9

Browse files
committed
Fix split function to return NULL for NULL inputs
1 parent afd54c4 commit 905b2d9

1 file changed

Lines changed: 51 additions & 5 deletions

File tree

  • native/spark-expr/src/string_funcs

native/spark-expr/src/string_funcs/split.rs

Lines changed: 51 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -133,27 +133,31 @@ fn split_array(
133133
// Build the result ListArray
134134
let mut offsets: Vec<i32> = Vec::with_capacity(string_array.len() + 1);
135135
let mut values: Vec<String> = Vec::new();
136+
let mut null_buffer_builder = arrow::array::BooleanBufferBuilder::new(string_array.len());
136137
offsets.push(0);
137138

138139
for i in 0..string_array.len() {
139140
if string_array.is_null(i) {
140-
// NULL input produces empty array element (maintain position)
141+
// NULL input produces NULL in result (Spark behavior)
141142
offsets.push(offsets[i]);
143+
null_buffer_builder.append(false); // false = NULL
142144
} else {
143145
let string_val = string_array.value(i);
144146
let parts = split_with_regex(string_val, &regex, limit);
145147
values.extend(parts);
146148
offsets.push(values.len() as i32);
149+
null_buffer_builder.append(true); // true = valid
147150
}
148151
}
149152

150153
let values_array = Arc::new(GenericStringArray::<i32>::from(values)) as ArrayRef;
151-
let field = Arc::new(Field::new("item", DataType::Utf8, false));
154+
let field = Arc::new(Field::new("item", DataType::Utf8, true));
155+
let nulls = arrow::buffer::NullBuffer::new(null_buffer_builder.finish());
152156
let list_array = ListArray::new(
153157
field,
154158
arrow::buffer::OffsetBuffer::new(offsets.into()),
155159
values_array,
156-
None, // No nulls at list level
160+
Some(nulls),
157161
);
158162

159163
Ok(ColumnarValue::Array(Arc::new(list_array)))
@@ -166,26 +170,31 @@ fn split_large_string_array(
166170
) -> DataFusionResult<ColumnarValue> {
167171
let mut offsets: Vec<i32> = Vec::with_capacity(string_array.len() + 1);
168172
let mut values: Vec<String> = Vec::new();
173+
let mut null_buffer_builder = arrow::array::BooleanBufferBuilder::new(string_array.len());
169174
offsets.push(0);
170175

171176
for i in 0..string_array.len() {
172177
if string_array.is_null(i) {
178+
// NULL input produces NULL in result (Spark behavior)
173179
offsets.push(offsets[i]);
180+
null_buffer_builder.append(false); // false = NULL
174181
} else {
175182
let string_val = string_array.value(i);
176183
let parts = split_with_regex(string_val, regex, limit);
177184
values.extend(parts);
178185
offsets.push(values.len() as i32);
186+
null_buffer_builder.append(true); // true = valid
179187
}
180188
}
181189

182190
let values_array = Arc::new(GenericStringArray::<i32>::from(values)) as ArrayRef;
183-
let field = Arc::new(Field::new("item", DataType::Utf8, false));
191+
let field = Arc::new(Field::new("item", DataType::Utf8, true));
192+
let nulls = arrow::buffer::NullBuffer::new(null_buffer_builder.finish());
184193
let list_array = ListArray::new(
185194
field,
186195
arrow::buffer::OffsetBuffer::new(offsets.into()),
187196
values_array,
188-
None,
197+
Some(nulls),
189198
);
190199

191200
Ok(ColumnarValue::Array(Arc::new(list_array)))
@@ -309,4 +318,41 @@ mod tests {
309318
let parts = split_string("a,b,c,,", ",", -1).unwrap();
310319
assert_eq!(parts, vec!["a", "b", "c", "", ""]);
311320
}
321+
322+
#[test]
fn test_split_with_nulls() {
    // A NULL input row must come back as a NULL list entry, not as an empty list.
    let input: ArrayRef = Arc::new(StringArray::from(vec![
        Some("a,b,c"),
        None,
        Some("x,y"),
        None,
    ]));
    let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string())));
    let args = vec![ColumnarValue::Array(input), delimiter];

    // An array input is expected to yield an array result; anything else fails the test.
    let ColumnarValue::Array(arr) = spark_split(&args).unwrap() else {
        panic!("Expected Array result");
    };
    let list_array = arr.as_any().downcast_ref::<ListArray>().unwrap();

    // One output row per input row.
    assert_eq!(list_array.len(), 4);
    // Row 0 ("a,b,c") is a non-null input, so its list entry is valid.
    assert!(!list_array.is_null(0));
    // Row 1 is a NULL input, so its list entry is NULL.
    assert!(list_array.is_null(1));
    // Row 2 ("x,y") is a non-null input, so its list entry is valid.
    assert!(!list_array.is_null(2));
    // Row 3 is a NULL input, so its list entry is NULL.
    assert!(list_array.is_null(3));
}
351+
352+
#[test]
fn test_split_empty_string() {
    // Splitting an empty string must yield exactly one element: the empty string itself.
    let expected = vec![""];
    let parts = split_string("", ",", -1).unwrap();
    assert_eq!(parts, expected);
}
312358
}

0 commit comments

Comments
 (0)