Skip to content

Commit b6b542e

Browse files
neilconway and comphead
authored
perf: Optimize array_positions() for scalar needle (#20770)
## Which issue does this PR close?

- Closes #20769.

## Rationale for this change

`array_positions` previously compared the needle against each row's sub-array individually. When the needle is a scalar (the common case), we can do a single bulk `arrow_ord::cmp::not_distinct` comparison against the entire flat values buffer and then walk the result bitmap, which is significantly faster: the speedup on the `array_positions()` microbenchmarks ranges from 5x to 40x, depending on the size of the array.

The same pattern has already been applied to `array_position` (#20532), and previously to other array UDFs.

## What changes are included in this PR?

- Add benchmarks for `array_positions`.
- Implement the bulk-comparison optimization.
- Refactor `array_position`'s existing fast path slightly for consistency.
- Code cleanup to use "haystack" and "needle" consistently, not vague terms like "list_array" and "element".
- Add unit tests for `array_positions` with sliced ListArrays, for peace of mind.
- Add unit tests for sliced lists and sliced lists with nulls for the new `array_positions` fast path.

## Are these changes tested?

Yes.

## Are there any user-facing changes?

No.

## AI usage

Multiple AI tools were used to iterate on this PR. I have reviewed and understand the resulting code.

---------

Co-authored-by: Oleks V <comphead@users.noreply.github.com>
1 parent a6a4df9 commit b6b542e

File tree

4 files changed

+460
-130
lines changed

4 files changed

+460
-130
lines changed

datafusion/functions-nested/benches/array_position.rs

Lines changed: 108 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use criterion::{
2424
use datafusion_common::ScalarValue;
2525
use datafusion_common::config::ConfigOptions;
2626
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
27-
use datafusion_functions_nested::position::ArrayPosition;
27+
use datafusion_functions_nested::position::{ArrayPosition, ArrayPositions};
2828
use rand::Rng;
2929
use rand::SeedableRng;
3030
use rand::rngs::StdRng;
@@ -39,6 +39,7 @@ const SENTINEL_NEEDLE: i64 = -1;
3939
fn criterion_benchmark(c: &mut Criterion) {
4040
for size in [10, 100, 500] {
4141
bench_array_position(c, size);
42+
bench_array_positions(c, size);
4243
}
4344
}
4445

@@ -146,6 +147,112 @@ fn bench_array_position(c: &mut Criterion, array_size: usize) {
146147
group.finish();
147148
}
148149

150+
fn bench_array_positions(c: &mut Criterion, array_size: usize) {
151+
let mut group = c.benchmark_group("array_positions_i64");
152+
let haystack_found_once = create_haystack_with_sentinel(
153+
NUM_ROWS,
154+
array_size,
155+
NULL_DENSITY,
156+
SENTINEL_NEEDLE,
157+
0,
158+
);
159+
let haystack_found_many = create_haystack_with_sentinels(
160+
NUM_ROWS,
161+
array_size,
162+
NULL_DENSITY,
163+
SENTINEL_NEEDLE,
164+
);
165+
let haystack_not_found =
166+
create_haystack_without_sentinel(NUM_ROWS, array_size, NULL_DENSITY);
167+
let num_rows = haystack_not_found.len();
168+
let arg_fields: Vec<Arc<Field>> = vec![
169+
Field::new("haystack", haystack_not_found.data_type().clone(), false).into(),
170+
Field::new("needle", DataType::Int64, false).into(),
171+
];
172+
let return_field: Arc<Field> = Field::new(
173+
"result",
174+
DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, true))),
175+
true,
176+
)
177+
.into();
178+
let config_options = Arc::new(ConfigOptions::default());
179+
let needle = ScalarValue::Int64(Some(SENTINEL_NEEDLE));
180+
181+
let args_found_once = vec![
182+
ColumnarValue::Array(haystack_found_once.clone()),
183+
ColumnarValue::Scalar(needle.clone()),
184+
];
185+
group.bench_with_input(
186+
BenchmarkId::new("found_once", array_size),
187+
&array_size,
188+
|b, _| {
189+
let udf = ArrayPositions::new();
190+
b.iter(|| {
191+
black_box(
192+
udf.invoke_with_args(ScalarFunctionArgs {
193+
args: args_found_once.clone(),
194+
arg_fields: arg_fields.clone(),
195+
number_rows: num_rows,
196+
return_field: return_field.clone(),
197+
config_options: config_options.clone(),
198+
})
199+
.unwrap(),
200+
)
201+
})
202+
},
203+
);
204+
205+
let args_found_many = vec![
206+
ColumnarValue::Array(haystack_found_many.clone()),
207+
ColumnarValue::Scalar(needle.clone()),
208+
];
209+
group.bench_with_input(
210+
BenchmarkId::new("found_many", array_size),
211+
&array_size,
212+
|b, _| {
213+
let udf = ArrayPositions::new();
214+
b.iter(|| {
215+
black_box(
216+
udf.invoke_with_args(ScalarFunctionArgs {
217+
args: args_found_many.clone(),
218+
arg_fields: arg_fields.clone(),
219+
number_rows: num_rows,
220+
return_field: return_field.clone(),
221+
config_options: config_options.clone(),
222+
})
223+
.unwrap(),
224+
)
225+
})
226+
},
227+
);
228+
229+
let args_not_found = vec![
230+
ColumnarValue::Array(haystack_not_found.clone()),
231+
ColumnarValue::Scalar(needle.clone()),
232+
];
233+
group.bench_with_input(
234+
BenchmarkId::new("not_found", array_size),
235+
&array_size,
236+
|b, _| {
237+
let udf = ArrayPositions::new();
238+
b.iter(|| {
239+
black_box(
240+
udf.invoke_with_args(ScalarFunctionArgs {
241+
args: args_not_found.clone(),
242+
arg_fields: arg_fields.clone(),
243+
number_rows: num_rows,
244+
return_field: return_field.clone(),
245+
config_options: config_options.clone(),
246+
})
247+
.unwrap(),
248+
)
249+
})
250+
},
251+
);
252+
253+
group.finish();
254+
}
255+
149256
fn create_haystack_without_sentinel(
150257
num_rows: usize,
151258
array_size: usize,

0 commit comments

Comments
 (0)