Skip to content

Commit e30d6d1

Browse files
committed
re-run benchmarks
1 parent 5d6a9b0 commit e30d6d1

1 file changed

Lines changed: 34 additions & 35 deletions

File tree

  • datafusion/physical-plan/src/aggregates/group_values/single_group_by

datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs

Lines changed: 34 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,11 @@ macro_rules! decode_list {
4949
if len == -1 {
5050
builder.values().append_null();
5151
} else {
52-
let s = std::str::from_utf8(
53-
&raw_vector[offset..offset + len as usize],
54-
)
55-
.map_err(|e| {
56-
Internal(format!("Invalid utf8 in list element: {e}"))
57-
})?;
52+
let s = unsafe {
53+
std::str::from_utf8_unchecked(
54+
&raw_vector[offset..offset + len as usize],
55+
)
56+
};
5857
builder.values().append_value(s);
5958
offset += len as usize;
6059
}
@@ -72,9 +71,7 @@ macro_rules! decode_scalar_string {
7271
for raw_bytes in $raw {
7372
match raw_bytes {
7473
Some(raw_vector) => {
75-
let s = std::str::from_utf8(raw_vector).map_err(|e| {
76-
Internal(format!("Invalid utf8 in GroupValuesDictionary: {e}"))
77-
})?;
74+
let s = unsafe { std::str::from_utf8_unchecked(raw_vector) };
7875
builder.append_value(s);
7976
}
8077
None => builder.append_null(),
@@ -116,7 +113,6 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> {
116113
create_hashes([Arc::clone(values)], &self.random_state, &mut hashes)?;
117114
Ok(hashes)
118115
}
119-
120116
fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> {
121117
match values.data_type() {
122118
DataType::Utf8 => Cow::Borrowed(
@@ -256,33 +252,35 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> {
256252
key_array: &PrimitiveArray<K>,
257253
) -> (ArrayRef, Vec<Option<usize>>) {
258254
let mut key_to_canonical: Vec<Option<usize>> = vec![None; values.len()];
255+
let mut canonical_indices: Vec<usize> = Vec::with_capacity(key_array.len());
259256
let mut bytes_to_canonical: HashMap<Vec<u8>, usize> =
260257
HashMap::with_capacity(values.len());
261-
let mut canonical_indices: Vec<usize> = Vec::with_capacity(key_array.len());
262258

263-
let new_keys: Vec<Option<usize>> = (0..key_array.len())
264-
.map(|i| {
265-
if key_array.is_null(i) {
266-
return None;
267-
}
268-
let old_key = key_array.value(i).to_usize().unwrap();
269-
if let Some(canonical) = key_to_canonical[old_key] {
270-
return Some(canonical);
259+
let mut new_keys: Vec<Option<usize>> = Vec::with_capacity(key_array.len());
260+
for i in 0..key_array.len() {
261+
if key_array.is_null(i) {
262+
new_keys.push(None);
263+
continue;
264+
}
265+
let old_key = key_array.value(i).to_usize().unwrap();
266+
if let Some(canonical) = key_to_canonical[old_key] {
267+
new_keys.push(Some(canonical));
268+
continue;
269+
}
270+
let raw = Self::get_raw_bytes(values, old_key).to_vec();
271+
272+
let canonical = match bytes_to_canonical.get(&raw) {
273+
Some(&idx) => idx,
274+
None => {
275+
let idx = canonical_indices.len();
276+
canonical_indices.push(old_key);
277+
bytes_to_canonical.insert(raw, idx);
278+
idx
271279
}
272-
let raw = Self::get_raw_bytes(values, old_key).to_vec();
273-
let canonical = match bytes_to_canonical.get(&raw) {
274-
Some(&idx) => idx,
275-
None => {
276-
let idx = canonical_indices.len();
277-
canonical_indices.push(old_key);
278-
bytes_to_canonical.insert(raw, idx);
279-
idx
280-
}
281-
};
282-
key_to_canonical[old_key] = Some(canonical);
283-
Some(canonical)
284-
})
285-
.collect();
280+
};
281+
key_to_canonical[old_key] = Some(canonical);
282+
new_keys.push(Some(canonical));
283+
}
286284

287285
let indices = UInt64Array::from(
288286
canonical_indices
@@ -298,7 +296,7 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> {
298296
values: &ArrayRef,
299297
key_array: &PrimitiveArray<K>,
300298
) -> (ArrayRef, Vec<Option<usize>>) {
301-
if values.len() < 20 {
299+
if values.len() < 25 {
302300
// Benchmarks show that for small arrays a nested loop works better due to cache locality and lower overhead.
303301
Self::normalize_dict_linear(values, key_array)
304302
} else {
@@ -334,7 +332,6 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K>
334332
"GroupValuesDictionary only supports a single column"
335333
);
336334
let array = Arc::clone(&cols[0]);
337-
groups.clear(); // zero out buffer
338335
let dict_array = array
339336
.as_any()
340337
.downcast_ref::<DictionaryArray<K>>()
@@ -353,6 +350,8 @@ impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K>
353350
if key_array.is_empty() {
354351
return Ok(());
355352
}
353+
groups.clear(); // reset the output buffer before refilling it
354+
groups.reserve(key_array.len());
356355
let (values, keys_as_usize) = Self::normalize_dict_array(values, key_array);
357356

358357
let values = &values;

0 commit comments

Comments
 (0)