Skip to content

Commit d692df0

Browse files
authored
feat: Optimize hash util for MapArray (#20179)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #20151 . ## Rationale for this change Reduce the irrelevant data being used to hash for `MapArray` <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent b6e4f95 commit d692df0

File tree

2 files changed

+363
-15
lines changed

2 files changed

+363
-15
lines changed

datafusion/common/benches/with_hashes.rs

Lines changed: 244 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@
1919
2020
use ahash::RandomState;
2121
use arrow::array::{
22-
Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericStringArray,
23-
NullBufferBuilder, OffsetSizeTrait, PrimitiveArray, RunArray, StringViewArray,
24-
StructArray, make_array,
22+
Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericStringArray, Int32Array,
23+
Int64Array, ListArray, MapArray, NullBufferBuilder, OffsetSizeTrait, PrimitiveArray,
24+
RunArray, StringViewArray, StructArray, UnionArray, make_array,
2525
};
26-
use arrow::buffer::NullBuffer;
26+
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
2727
use arrow::datatypes::{
28-
ArrowDictionaryKeyType, DataType, Field, Fields, Int32Type, Int64Type,
28+
ArrowDictionaryKeyType, DataType, Field, Fields, Int32Type, Int64Type, UnionFields,
2929
};
3030
use criterion::{Bencher, Criterion, criterion_group, criterion_main};
3131
use datafusion_common::hash_utils::with_hashes;
@@ -40,6 +40,7 @@ const BATCH_SIZE: usize = 8192;
4040
/// One named input array fed to the hashing benchmarks.
struct BenchData {
    /// Label used to build the Criterion benchmark id
    name: &'static str,
    /// The array whose rows are hashed by `with_hashes`
    array: ArrayRef,
    /// Union arrays can't have null bitmasks added
    supports_nulls: bool,
}
4546

@@ -78,6 +79,26 @@ fn criterion_benchmark(c: &mut Criterion) {
7879
array: pool.dictionary_array::<Int32Type>(BATCH_SIZE),
7980
supports_nulls: true,
8081
},
82+
BenchData {
83+
name: "list_array",
84+
array: list_array(BATCH_SIZE),
85+
supports_nulls: true,
86+
},
87+
BenchData {
88+
name: "map_array",
89+
array: map_array(BATCH_SIZE),
90+
supports_nulls: true,
91+
},
92+
BenchData {
93+
name: "sparse_union",
94+
array: sparse_union_array(BATCH_SIZE),
95+
supports_nulls: false,
96+
},
97+
BenchData {
98+
name: "dense_union",
99+
array: dense_union_array(BATCH_SIZE),
100+
supports_nulls: false,
101+
},
81102
BenchData {
82103
name: "struct_array",
83104
array: create_struct_array(&pool, BATCH_SIZE),
@@ -103,10 +124,9 @@ fn criterion_benchmark(c: &mut Criterion) {
103124
let arrays = vec![array.clone(), array.clone(), array.clone()];
104125
do_hash_test(b, &arrays);
105126
});
106-
127+
// Union arrays can't have null bitmasks
107128
if supports_nulls {
108129
let nullable_array = add_nulls(&array);
109-
110130
c.bench_function(&format!("{name}: single, nulls"), |b| {
111131
do_hash_test(b, std::slice::from_ref(&nullable_array));
112132
});
@@ -268,6 +288,222 @@ where
268288
Arc::new(array)
269289
}
270290

291+
/// Benchmark sliced arrays to demonstrate the optimization for when an array is
292+
/// sliced, the underlying buffer may be much larger than what's referenced by
293+
/// the slice. The optimization avoids hashing unreferenced elements.
294+
fn sliced_array_benchmark(c: &mut Criterion) {
295+
// Test with different slice ratios: slice_size / total_size
296+
// Smaller ratio = more potential savings from the optimization
297+
let slice_ratios = [10, 5, 2]; // 1/10, 1/5, 1/2 of total
298+
299+
for ratio in slice_ratios {
300+
let total_rows = BATCH_SIZE * ratio;
301+
let slice_offset = BATCH_SIZE * (ratio / 2); // Take from middle
302+
let slice_len = BATCH_SIZE;
303+
304+
// Sliced ListArray
305+
{
306+
let full_array = list_array(total_rows);
307+
let sliced: ArrayRef = Arc::new(
308+
full_array
309+
.as_any()
310+
.downcast_ref::<ListArray>()
311+
.unwrap()
312+
.slice(slice_offset, slice_len),
313+
);
314+
c.bench_function(
315+
&format!("list_array_sliced: 1/{ratio} of {total_rows} rows"),
316+
|b| {
317+
do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len);
318+
},
319+
);
320+
}
321+
322+
// Sliced MapArray
323+
{
324+
let full_array = map_array(total_rows);
325+
let sliced: ArrayRef = Arc::new(
326+
full_array
327+
.as_any()
328+
.downcast_ref::<MapArray>()
329+
.unwrap()
330+
.slice(slice_offset, slice_len),
331+
);
332+
c.bench_function(
333+
&format!("map_array_sliced: 1/{ratio} of {total_rows} rows"),
334+
|b| {
335+
do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len);
336+
},
337+
);
338+
}
339+
340+
// Sliced Sparse UnionArray
341+
{
342+
let full_array = sparse_union_array(total_rows);
343+
let sliced: ArrayRef = Arc::new(
344+
full_array
345+
.as_any()
346+
.downcast_ref::<UnionArray>()
347+
.unwrap()
348+
.slice(slice_offset, slice_len),
349+
);
350+
c.bench_function(
351+
&format!("sparse_union_sliced: 1/{ratio} of {total_rows} rows"),
352+
|b| {
353+
do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len);
354+
},
355+
);
356+
}
357+
}
358+
}
359+
360+
fn do_hash_test_with_len(b: &mut Bencher, arrays: &[ArrayRef], expected_len: usize) {
361+
let state = RandomState::new();
362+
b.iter(|| {
363+
with_hashes(arrays, &state, |hashes| {
364+
assert_eq!(hashes.len(), expected_len);
365+
Ok(())
366+
})
367+
.unwrap();
368+
});
369+
}
370+
371+
fn list_array(num_rows: usize) -> ArrayRef {
372+
let mut rng = make_rng();
373+
let elements_per_row = 5;
374+
let total_elements = num_rows * elements_per_row;
375+
376+
let values: Int64Array = (0..total_elements)
377+
.map(|_| Some(rng.random::<i64>()))
378+
.collect();
379+
let offsets: Vec<i32> = (0..=num_rows)
380+
.map(|i| (i * elements_per_row) as i32)
381+
.collect();
382+
383+
Arc::new(ListArray::new(
384+
Arc::new(Field::new("item", DataType::Int64, true)),
385+
OffsetBuffer::new(ScalarBuffer::from(offsets)),
386+
Arc::new(values),
387+
None,
388+
))
389+
}
390+
391+
fn map_array(num_rows: usize) -> ArrayRef {
392+
let mut rng = make_rng();
393+
let entries_per_row = 5;
394+
let total_entries = num_rows * entries_per_row;
395+
396+
let keys: Int32Array = (0..total_entries)
397+
.map(|_| Some(rng.random::<i32>()))
398+
.collect();
399+
let values: Int64Array = (0..total_entries)
400+
.map(|_| Some(rng.random::<i64>()))
401+
.collect();
402+
let offsets: Vec<i32> = (0..=num_rows)
403+
.map(|i| (i * entries_per_row) as i32)
404+
.collect();
405+
406+
let entries = StructArray::try_new(
407+
Fields::from(vec![
408+
Field::new("keys", DataType::Int32, false),
409+
Field::new("values", DataType::Int64, true),
410+
]),
411+
vec![Arc::new(keys), Arc::new(values)],
412+
None,
413+
)
414+
.unwrap();
415+
416+
Arc::new(MapArray::new(
417+
Arc::new(Field::new(
418+
"entries",
419+
DataType::Struct(Fields::from(vec![
420+
Field::new("keys", DataType::Int32, false),
421+
Field::new("values", DataType::Int64, true),
422+
])),
423+
false,
424+
)),
425+
OffsetBuffer::new(ScalarBuffer::from(offsets)),
426+
entries,
427+
None,
428+
false,
429+
))
430+
}
431+
432+
fn sparse_union_array(num_rows: usize) -> ArrayRef {
433+
let mut rng = make_rng();
434+
let num_types = 5;
435+
436+
let type_ids: Vec<i8> = (0..num_rows)
437+
.map(|_| rng.random_range(0..num_types) as i8)
438+
.collect();
439+
let (fields, children): (Vec<_>, Vec<_>) = (0..num_types)
440+
.map(|i| {
441+
(
442+
(
443+
i as i8,
444+
Arc::new(Field::new(format!("f{i}"), DataType::Int64, true)),
445+
),
446+
primitive_array::<Int64Type>(num_rows),
447+
)
448+
})
449+
.unzip();
450+
451+
Arc::new(
452+
UnionArray::try_new(
453+
UnionFields::from_iter(fields),
454+
ScalarBuffer::from(type_ids),
455+
None,
456+
children,
457+
)
458+
.unwrap(),
459+
)
460+
}
461+
462+
fn dense_union_array(num_rows: usize) -> ArrayRef {
463+
let mut rng = make_rng();
464+
let num_types = 5;
465+
let type_ids: Vec<i8> = (0..num_rows)
466+
.map(|_| rng.random_range(0..num_types) as i8)
467+
.collect();
468+
469+
let mut type_counts = vec![0i32; num_types];
470+
for &tid in &type_ids {
471+
type_counts[tid as usize] += 1;
472+
}
473+
474+
let mut current_offsets = vec![0i32; num_types];
475+
let offsets: Vec<i32> = type_ids
476+
.iter()
477+
.map(|&tid| {
478+
let offset = current_offsets[tid as usize];
479+
current_offsets[tid as usize] += 1;
480+
offset
481+
})
482+
.collect();
483+
484+
let (fields, children): (Vec<_>, Vec<_>) = (0..num_types)
485+
.map(|i| {
486+
(
487+
(
488+
i as i8,
489+
Arc::new(Field::new(format!("f{i}"), DataType::Int64, true)),
490+
),
491+
primitive_array::<Int64Type>(type_counts[i] as usize),
492+
)
493+
})
494+
.unzip();
495+
496+
Arc::new(
497+
UnionArray::try_new(
498+
UnionFields::from_iter(fields),
499+
ScalarBuffer::from(type_ids),
500+
Some(ScalarBuffer::from(offsets)),
501+
children,
502+
)
503+
.unwrap(),
504+
)
505+
}
506+
271507
fn boolean_array(array_len: usize) -> ArrayRef {
272508
let mut rng = make_rng();
273509
Arc::new(
@@ -329,5 +565,5 @@ where
329565
)
330566
}
331567

332-
criterion_group!(benches, criterion_benchmark);
568+
criterion_group!(benches, criterion_benchmark, sliced_array_benchmark);
333569
criterion_main!(benches);

0 commit comments

Comments
 (0)