Skip to content

Commit 3d90d4b

Browse files
authored
datafusion/common: Add support for hashing ListView arrays (#19814)
## Which issue does this PR close?

Implements the hashing part of #19782.

## Rationale for this change

To aggregate by ListView arrays, we need to be able to hash them.

## What changes are included in this PR?

The hashing code and tests.

## Are these changes tested?

Yes, unit tests.

## Are there any user-facing changes?

No, only additive functionality.

@alamb @Jefffrey
1 parent 0c959d6 commit 3d90d4b

File tree

1 file changed

+150
-2
lines changed

1 file changed

+150
-2
lines changed

datafusion/common/src/hash_utils.rs

Lines changed: 150 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ use arrow::{downcast_dictionary_array, downcast_primitive_array};
2727
#[cfg(not(feature = "force_hash_collisions"))]
2828
use crate::cast::{
2929
as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
30-
as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
31-
as_string_array, as_string_view_array, as_struct_array, as_union_array,
30+
as_generic_binary_array, as_large_list_array, as_large_list_view_array,
31+
as_list_array, as_list_view_array, as_map_array, as_string_array,
32+
as_string_view_array, as_struct_array, as_union_array,
3233
};
3334
use crate::error::Result;
3435
use crate::error::{_internal_datafusion_err, _internal_err};
@@ -538,6 +539,45 @@ where
538539
Ok(())
539540
}
540541

542+
#[cfg(not(feature = "force_hash_collisions"))]
543+
fn hash_list_view_array<OffsetSize>(
544+
array: &GenericListViewArray<OffsetSize>,
545+
random_state: &RandomState,
546+
hashes_buffer: &mut [u64],
547+
) -> Result<()>
548+
where
549+
OffsetSize: OffsetSizeTrait,
550+
{
551+
let values = array.values();
552+
let offsets = array.value_offsets();
553+
let sizes = array.value_sizes();
554+
let nulls = array.nulls();
555+
let mut values_hashes = vec![0u64; values.len()];
556+
create_hashes([values], random_state, &mut values_hashes)?;
557+
if let Some(nulls) = nulls {
558+
for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
559+
if nulls.is_valid(i) {
560+
let hash = &mut hashes_buffer[i];
561+
let start = offset.as_usize();
562+
let end = start + size.as_usize();
563+
for values_hash in &values_hashes[start..end] {
564+
*hash = combine_hashes(*hash, *values_hash);
565+
}
566+
}
567+
}
568+
} else {
569+
for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
570+
let hash = &mut hashes_buffer[i];
571+
let start = offset.as_usize();
572+
let end = start + size.as_usize();
573+
for values_hash in &values_hashes[start..end] {
574+
*hash = combine_hashes(*hash, *values_hash);
575+
}
576+
}
577+
}
578+
Ok(())
579+
}
580+
541581
#[cfg(not(feature = "force_hash_collisions"))]
542582
fn hash_union_array(
543583
array: &UnionArray,
@@ -714,6 +754,14 @@ fn hash_single_array(
714754
let array = as_large_list_array(array)?;
715755
hash_list_array(array, random_state, hashes_buffer)?;
716756
}
757+
DataType::ListView(_) => {
758+
let array = as_list_view_array(array)?;
759+
hash_list_view_array(array, random_state, hashes_buffer)?;
760+
}
761+
DataType::LargeListView(_) => {
762+
let array = as_large_list_view_array(array)?;
763+
hash_list_view_array(array, random_state, hashes_buffer)?;
764+
}
717765
DataType::Map(_, _) => {
718766
let array = as_map_array(array)?;
719767
hash_map_array(array, random_state, hashes_buffer)?;
@@ -1128,6 +1176,106 @@ mod tests {
11281176
assert_eq!(hashes[1], hashes[6]); // null vs empty list
11291177
}
11301178

1179+
#[test]
// Tests actual values of hashes, which are different if forcing collisions
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_list_view_arrays() {
    use arrow::buffer::{NullBuffer, ScalarBuffer};

    // Child values shared by every row: [0, 1, 2, 3, null, 5]
    let child: ArrayRef = Arc::new(Int32Array::from(vec![
        Some(0),
        Some(1),
        Some(2),
        Some(3),
        None,
        Some(5),
    ]));
    let item_field = Arc::new(Field::new("item", DataType::Int32, true));

    // One entry per row: (offset, size, is_valid)
    let layout: [(i32, i32, bool); 7] = [
        (0, 3, true),  // row 0: [0, 1, 2]
        (0, 0, false), // row 1: null
        (3, 3, true),  // row 2: [3, null, 5]
        (3, 3, true),  // row 3: [3, null, 5] (same window as row 2)
        (0, 0, false), // row 4: null
        (0, 3, true),  // row 5: [0, 1, 2] (same window as row 0)
        (0, 0, true),  // row 6: [] (valid but empty)
    ];
    let offsets =
        ScalarBuffer::from(layout.iter().map(|row| row.0).collect::<Vec<i32>>());
    let sizes =
        ScalarBuffer::from(layout.iter().map(|row| row.1).collect::<Vec<i32>>());
    let validity = Some(NullBuffer::from(
        layout.iter().map(|row| row.2).collect::<Vec<bool>>(),
    ));

    let array: ArrayRef = Arc::new(ListViewArray::new(
        item_field, offsets, sizes, child, validity,
    ));

    let random_state = RandomState::with_seeds(0, 0, 0, 0);
    let mut hashes = vec![0; array.len()];
    create_hashes(&[array], &random_state, &mut hashes).unwrap();

    // Rows expected to hash identically:
    // same content [0, 1, 2]; both null; same content [3, null, 5];
    // null vs empty list.
    for (left, right) in [(0, 5), (1, 4), (2, 3), (1, 6)] {
        assert_eq!(hashes[left], hashes[right]);
    }

    // Negative tests: different content should produce different hashes:
    // [0, 1, 2] vs [3, null, 5]; [0, 1, 2] vs []; [3, null, 5] vs [].
    for (left, right) in [(0, 2), (0, 6), (2, 6)] {
        assert_ne!(hashes[left], hashes[right]);
    }
}
1228+
1229+
#[test]
// Tests actual values of hashes, which are different if forcing collisions
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_large_list_view_arrays() {
    use arrow::buffer::{NullBuffer, ScalarBuffer};

    // Child values shared by every row: [0, 1, 2, 3, null, 5]
    let child: ArrayRef = Arc::new(Int32Array::from(vec![
        Some(0),
        Some(1),
        Some(2),
        Some(3),
        None,
        Some(5),
    ]));
    let item_field = Arc::new(Field::new("item", DataType::Int32, true));

    // One entry per row: (offset, size, is_valid); 64-bit offsets and sizes
    let layout: [(i64, i64, bool); 7] = [
        (0, 3, true),  // row 0: [0, 1, 2]
        (0, 0, false), // row 1: null
        (3, 3, true),  // row 2: [3, null, 5]
        (3, 3, true),  // row 3: [3, null, 5] (same window as row 2)
        (0, 0, false), // row 4: null
        (0, 3, true),  // row 5: [0, 1, 2] (same window as row 0)
        (0, 0, true),  // row 6: [] (valid but empty)
    ];
    let offsets =
        ScalarBuffer::from(layout.iter().map(|row| row.0).collect::<Vec<i64>>());
    let sizes =
        ScalarBuffer::from(layout.iter().map(|row| row.1).collect::<Vec<i64>>());
    let validity = Some(NullBuffer::from(
        layout.iter().map(|row| row.2).collect::<Vec<bool>>(),
    ));

    let array: ArrayRef = Arc::new(LargeListViewArray::new(
        item_field, offsets, sizes, child, validity,
    ));

    let random_state = RandomState::with_seeds(0, 0, 0, 0);
    let mut hashes = vec![0; array.len()];
    create_hashes(&[array], &random_state, &mut hashes).unwrap();

    // Rows expected to hash identically:
    // same content [0, 1, 2]; both null; same content [3, null, 5];
    // null vs empty list.
    for (left, right) in [(0, 5), (1, 4), (2, 3), (1, 6)] {
        assert_eq!(hashes[left], hashes[right]);
    }

    // Negative tests: different content should produce different hashes:
    // [0, 1, 2] vs [3, null, 5]; [0, 1, 2] vs []; [3, null, 5] vs [].
    for (left, right) in [(0, 2), (0, 6), (2, 6)] {
        assert_ne!(hashes[left], hashes[right]);
    }
}
1278+
11311279
#[test]
11321280
// Tests actual values of hashes, which are different if forcing collisions
11331281
#[cfg(not(feature = "force_hash_collisions"))]

0 commit comments

Comments
 (0)