Skip to content

Commit 42d9835

Browse files
authored
perf: Optimize NULL handling in array_remove (#21532)
## Which issue does this PR close? - Closes #21531. ## Rationale for this change `array_remove` computes the result NULL bitmap incrementally (row-by-row). This is inefficient; it is faster to compute the result NULL bitmap via the bitwise AND of the input NULL bitmaps. Benchmarks (Arm64): ``` - array_remove_binary/remove/10: 6.8ms → 6.6ms (-2.9%) - array_remove_binary/remove/100: 13.0ms → 12.7ms (-2.3%) - array_remove_binary/remove/500: 75.2ms → 76.1ms (+1.2%) - array_remove_boolean/remove/10: 4.5ms → 4.5ms (-0.6%) - array_remove_boolean/remove/100: 8.0ms → 7.8ms (-2.5%) - array_remove_boolean/remove/500: 21.0ms → 21.3ms (+1.4%) - array_remove_decimal64/remove/10: 5.6ms → 5.7ms (+1.8%) - array_remove_decimal64/remove/100: 10.0ms → 10.2ms (+2.0%) - array_remove_decimal64/remove/500: 58.6ms → 56.2ms (-4.1%) - array_remove_f64/remove/10: 5.4ms → 5.4ms (0.0%) - array_remove_f64/remove/100: 7.7ms → 7.8ms (+1.3%) - array_remove_f64/remove/500: 36.5ms → 36.8ms (+0.8%) - array_remove_fixed_size_binary/remove/10: 6.1ms → 5.4ms (-11.5%) - array_remove_fixed_size_binary/remove/100: 12.4ms → 11.6ms (-6.5%) - array_remove_fixed_size_binary/remove/500: 66.7ms → 64.4ms (-3.4%) - array_remove_int64/remove/10: 5.6ms → 5.7ms (+1.8%) - array_remove_int64/remove/100: 7.8ms → 7.8ms (-1.0%) - array_remove_int64/remove/500: 35.5ms → 36.3ms (+2.3%) - array_remove_strings/remove/10: 6.6ms → 6.8ms (+3.0%) - array_remove_strings/remove/100: 18.0ms → 18.3ms (+1.7%) - array_remove_strings/remove/500: 78.0ms → 79.7ms (+2.2%) ``` Not a massive win, but I think still worth making the change; the resulting code is also more idiomatic. ## What changes are included in this PR? * Implement optimization * Tweak `array_remove` benchmark for f64: the previous code resulted in never actually removing anything from the array, which was inconsistent with the other benchmarks and probably unintentional. ## Are these changes tested? Yes. ## Are there any user-facing changes? No.
1 parent fbb5240 commit 42d9835

File tree

2 files changed

+9
-10
lines changed

2 files changed

+9
-10
lines changed

datafusion/functions-nested/benches/array_remove.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ fn create_f64_list_array(
389389
if rng.random::<f64>() < null_density {
390390
None
391391
} else {
392-
Some(rng.random_range(0.0..array_size as f64))
392+
Some(rng.random_range(0..array_size as i64) as f64)
393393
}
394394
})
395395
.collect::<Float64Array>();

datafusion/functions-nested/src/remove.rs

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@
2020
use crate::utils;
2121
use crate::utils::make_scalar_function;
2222
use arrow::array::{
23-
Array, ArrayRef, Capacities, GenericListArray, MutableArrayData, NullBufferBuilder,
24-
OffsetSizeTrait, cast::AsArray, make_array,
23+
Array, ArrayRef, Capacities, GenericListArray, MutableArrayData, OffsetSizeTrait,
24+
cast::AsArray, make_array,
2525
};
26-
use arrow::buffer::OffsetBuffer;
26+
use arrow::buffer::{NullBuffer, OffsetBuffer};
2727
use arrow::datatypes::{DataType, FieldRef};
2828
use datafusion_common::cast::as_int64_array;
2929
use datafusion_common::utils::ListCoercion;
@@ -386,12 +386,13 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
386386
false,
387387
Capacities::Array(original_data.len()),
388388
);
389-
let mut valid = NullBufferBuilder::new(list_array.len());
389+
390+
// Pre-compute combined null bitmap
391+
let nulls = NullBuffer::union(list_array.nulls(), element_array.nulls());
390392

391393
for (row_index, offset_window) in list_array.offsets().windows(2).enumerate() {
392-
if list_array.is_null(row_index) || element_array.is_null(row_index) {
394+
if nulls.as_ref().is_some_and(|nulls| nulls.is_null(row_index)) {
393395
offsets.push(offsets[row_index]);
394-
valid.append_null();
395396
continue;
396397
}
397398

@@ -414,7 +415,6 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
414415
if num_to_remove == 0 {
415416
mutable.extend(0, start, end);
416417
offsets.push(offsets[row_index] + OffsetSize::usize_as(end - start));
417-
valid.append_non_null();
418418
continue;
419419
}
420420

@@ -445,15 +445,14 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
445445
}
446446

447447
offsets.push(offsets[row_index] + OffsetSize::usize_as(copied));
448-
valid.append_non_null();
449448
}
450449

451450
let new_values = make_array(mutable.freeze());
452451
Ok(Arc::new(GenericListArray::<OffsetSize>::try_new(
453452
Arc::clone(list_field),
454453
OffsetBuffer::new(offsets.into()),
455454
new_values,
456-
valid.finish(),
455+
nulls,
457456
)?))
458457
}
459458

0 commit comments

Comments
 (0)