Skip to content

Commit 268a9f1

Browse files
committed
optimize hex
1 parent 3102dd6 commit 268a9f1

1 file changed

Lines changed: 138 additions & 94 deletions

File tree

  • datafusion/spark/src/function/math

datafusion/spark/src/function/math/hex.rs

Lines changed: 138 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616
// under the License.
1717

1818
use std::any::Any;
19+
use std::str::from_utf8_unchecked;
1920
use std::sync::Arc;
2021

21-
use arrow::array::{Array, StringArray};
22+
use arrow::array::{Array, StringBuilder};
2223
use arrow::datatypes::DataType;
2324
use arrow::{
2425
array::{as_dictionary_array, as_largestring_array, as_string_array},
@@ -110,37 +111,104 @@ impl ScalarUDFImpl for SparkHex {
110111
}
111112
}
112113

/// Appends the uppercase hexadecimal representation of `num` to `buffer`.
///
/// The value is formatted from its 64-bit two's-complement bit pattern, so a
/// negative input yields all 16 digits (`-1` becomes `FFFFFFFFFFFFFFFF`).
/// No leading zeros are written; zero itself is written as a single `0`.
/// Bytes already present in `buffer` are left untouched.
#[inline]
fn hex_int64(num: i64, buffer: &mut Vec<u8>) {
    const HEX_CHARS: &[u8; 16] = b"0123456789ABCDEF";

    if num == 0 {
        buffer.push(HEX_CHARS[0]);
        return;
    }

    // Work on the unsigned bit pattern: `>>` on a `u64` is a logical shift, so
    // the value always reaches zero after at most 16 nibbles and the loop
    // needs no separate index guard to terminate for negative inputs.
    let mut bits = num as u64;
    let mut digits = [0u8; 16];
    let mut start = digits.len();
    while bits != 0 {
        start -= 1;
        digits[start] = HEX_CHARS[(bits & 0xF) as usize];
        bits >>= 4;
    }

    // Digits were produced least-significant first and stored right-aligned.
    buffer.extend_from_slice(&digits[start..]);
}
116135

117136
/// Hex encoding lookup tables for fast byte-to-hex conversion
const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";

/// Appends the hexadecimal encoding of `data` to `buffer`.
///
/// Every input byte becomes two ASCII hex digits, high nibble first.
/// `lower_case` selects `a-f` over `A-F`. Bytes already present in `buffer`
/// are left untouched.
#[inline]
fn hex_encode<T: AsRef<[u8]>>(data: T, lower_case: bool, buffer: &mut Vec<u8>) {
    let bytes = data.as_ref();
    let hex_chars = if lower_case {
        HEX_CHARS_LOWER
    } else {
        HEX_CHARS_UPPER
    };

    // The output length is known up front, so grow the buffer once instead of
    // letting the pushes below reallocate repeatedly on long inputs.
    buffer.reserve(bytes.len() * 2);
    for &b in bytes {
        buffer.push(hex_chars[(b >> 4) as usize]);
        buffer.push(hex_chars[(b & 0x0f) as usize]);
    }
}
136153

137-
#[inline(always)]
138-
fn hex_bytes<T: AsRef<[u8]>>(
139-
bytes: T,
154+
/// Generic hex encoding for byte array types
155+
fn hex_encode_bytes<'a, I, T>(
156+
iter: I,
140157
lowercase: bool,
141-
) -> Result<String, std::fmt::Error> {
142-
let hex_string = hex_encode(bytes, lowercase);
143-
Ok(hex_string)
158+
len: usize,
159+
) -> Result<ColumnarValue, DataFusionError>
160+
where
161+
I: Iterator<Item = Option<T>>,
162+
T: AsRef<[u8]> + 'a,
163+
{
164+
let mut builder = StringBuilder::with_capacity(len, len * 64);
165+
let mut buffer = Vec::with_capacity(16);
166+
let hex_chars = if lowercase {
167+
HEX_CHARS_LOWER
168+
} else {
169+
HEX_CHARS_UPPER
170+
};
171+
172+
for v in iter {
173+
if let Some(b) = v {
174+
buffer.clear();
175+
let bytes = b.as_ref();
176+
for &byte in bytes {
177+
buffer.push(hex_chars[(byte >> 4) as usize]);
178+
buffer.push(hex_chars[(byte & 0x0f) as usize]);
179+
}
180+
unsafe {
181+
builder.append_value(from_utf8_unchecked(&buffer));
182+
}
183+
} else {
184+
builder.append_null();
185+
}
186+
}
187+
188+
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
189+
}
190+
191+
/// Generic hex encoding for int64 type
192+
fn hex_encode_int64<'a, I>(iter: I, len: usize) -> Result<ColumnarValue, DataFusionError>
193+
where
194+
I: Iterator<Item = Option<i64>>,
195+
{
196+
let mut builder = StringBuilder::with_capacity(len, len * 64);
197+
let mut buffer = Vec::with_capacity(16);
198+
199+
for v in iter {
200+
if let Some(num) = v {
201+
buffer.clear();
202+
hex_int64(num, &mut buffer);
203+
unsafe {
204+
builder.append_value(from_utf8_unchecked(&buffer));
205+
}
206+
} else {
207+
builder.append_null();
208+
}
209+
}
210+
211+
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
144212
}
145213

146214
/// Spark-compatible `hex` function
@@ -166,103 +234,72 @@ pub fn compute_hex(
166234
ColumnarValue::Array(array) => match array.data_type() {
167235
DataType::Int64 => {
168236
let array = as_int64_array(array)?;
169-
170-
let hexed_array: StringArray =
171-
array.iter().map(|v| v.map(hex_int64)).collect();
172-
173-
Ok(ColumnarValue::Array(Arc::new(hexed_array)))
237+
hex_encode_int64(array.iter(), array.len())
174238
}
175239
DataType::Utf8 => {
176240
let array = as_string_array(array);
177-
178-
let hexed: StringArray = array
179-
.iter()
180-
.map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose())
181-
.collect::<Result<_, _>>()?;
182-
183-
Ok(ColumnarValue::Array(Arc::new(hexed)))
241+
hex_encode_bytes(array.iter(), lowercase, array.len())
184242
}
185243
DataType::Utf8View => {
186244
let array = as_string_view_array(array)?;
187-
188-
let hexed: StringArray = array
189-
.iter()
190-
.map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose())
191-
.collect::<Result<_, _>>()?;
192-
193-
Ok(ColumnarValue::Array(Arc::new(hexed)))
245+
hex_encode_bytes(array.iter(), lowercase, array.len())
194246
}
195247
DataType::LargeUtf8 => {
196248
let array = as_largestring_array(array);
197-
198-
let hexed: StringArray = array
199-
.iter()
200-
.map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose())
201-
.collect::<Result<_, _>>()?;
202-
203-
Ok(ColumnarValue::Array(Arc::new(hexed)))
249+
hex_encode_bytes(array.iter(), lowercase, array.len())
204250
}
205251
DataType::Binary => {
206252
let array = as_binary_array(array)?;
207-
208-
let hexed: StringArray = array
209-
.iter()
210-
.map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose())
211-
.collect::<Result<_, _>>()?;
212-
213-
Ok(ColumnarValue::Array(Arc::new(hexed)))
253+
hex_encode_bytes(array.iter(), lowercase, array.len())
214254
}
215255
DataType::LargeBinary => {
216256
let array = as_large_binary_array(array)?;
217-
218-
let hexed: StringArray = array
219-
.iter()
220-
.map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose())
221-
.collect::<Result<_, _>>()?;
222-
223-
Ok(ColumnarValue::Array(Arc::new(hexed)))
257+
hex_encode_bytes(array.iter(), lowercase, array.len())
224258
}
225259
DataType::FixedSizeBinary(_) => {
226260
let array = as_fixed_size_binary_array(array)?;
227-
228-
let hexed: StringArray = array
229-
.iter()
230-
.map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose())
231-
.collect::<Result<_, _>>()?;
232-
233-
Ok(ColumnarValue::Array(Arc::new(hexed)))
261+
hex_encode_bytes(array.iter(), lowercase, array.len())
234262
}
235263
DataType::Dictionary(_, value_type) => {
236264
let dict = as_dictionary_array::<Int32Type>(&array);
237-
238-
let values = match **value_type {
239-
DataType::Int64 => as_int64_array(dict.values())?
240-
.iter()
241-
.map(|v| v.map(hex_int64))
242-
.collect::<Vec<_>>(),
243-
DataType::Utf8 => as_string_array(dict.values())
244-
.iter()
245-
.map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose())
246-
.collect::<Result<_, _>>()?,
247-
DataType::Binary => as_binary_array(dict.values())?
248-
.iter()
249-
.map(|v| v.map(|b| hex_bytes(b, lowercase)).transpose())
250-
.collect::<Result<_, _>>()?,
251-
_ => exec_err!(
252-
"hex got an unexpected argument type: {}",
253-
array.data_type()
254-
)?,
255-
};
256-
257-
let new_values: Vec<Option<String>> = dict
258-
.keys()
259-
.iter()
260-
.map(|key| key.map(|k| values[k as usize].clone()).unwrap_or(None))
261-
.collect();
262-
263-
let string_array_values = StringArray::from(new_values);
264-
265-
Ok(ColumnarValue::Array(Arc::new(string_array_values)))
265+
let keys = dict.keys();
266+
let values = dict.values();
267+
// let mut buffer = Vec::with_capacity(16);
268+
269+
// Dispatch on the dictionary's value type. Each arm walks the keys, looks up
// the referenced dictionary value and feeds it to the matching encoder, so the
// result has one row per key (`dict.len()` is passed as the size hint).
// A null key stays null because `k.map(..)` passes `None` straight through.
// NOTE(review): `value(idx)` is read without consulting the validity of the
// values array, so a null *dictionary value* would presumably be encoded as
// whatever `value()` returns for that slot rather than yielding NULL. The
// removed code carried `Option`s for the values and kept such rows null.
// Confirm, and consider checking `is_valid(idx)` before reading.
match **value_type {
270+
DataType::Int64 => {
271+
let int_values = as_int64_array(values)?;
272+
hex_encode_int64(
273+
// NOTE(review): `idx as usize` assumes keys are non-negative - TODO confirm.
keys.iter().map(|k| k.map(|idx| int_values.value(idx as usize))),
274+
dict.len(),
275+
)
276+
}
277+
DataType::Utf8 => {
278+
let str_values = as_string_array(values);
279+
hex_encode_bytes(
280+
keys.iter().map(|k| {
281+
k.map(|idx| str_values.value(idx as usize).as_bytes())
282+
}),
283+
lowercase,
284+
dict.len(),
285+
)
286+
}
287+
DataType::Binary => {
288+
let bin_values = as_binary_array(values)?;
289+
hex_encode_bytes(
290+
keys.iter()
291+
.map(|k| k.map(|idx| bin_values.value(idx as usize))),
292+
lowercase,
293+
dict.len(),
294+
)
295+
}
296+
// Any other dictionary value type is rejected with an execution error.
_ => {
297+
exec_err!(
298+
"hex got an unexpected argument type: {}",
299+
array.data_type()
300+
)
301+
}
302+
}
266303
}
267304
_ => exec_err!("hex got an unexpected argument type: {}", array.data_type()),
268305
},
@@ -272,6 +309,7 @@ pub fn compute_hex(
272309

273310
#[cfg(test)]
274311
mod test {
312+
use std::str::from_utf8_unchecked;
275313
use std::sync::Arc;
276314

277315
use arrow::array::{Int64Array, StringArray};
@@ -374,12 +412,18 @@ mod test {
374412
/// Checks `hex_int64` on the zero special case, a small positive value and a
/// negative value (formatted from its two's-complement bit pattern).
#[test]
fn test_hex_int64() {
    let mut cache = Vec::with_capacity(16);

    let num = 0;
    super::hex_int64(num, &mut cache);
    // SAFETY: `hex_int64` only writes ASCII hex digits into the buffer.
    unsafe {
        assert_eq!(from_utf8_unchecked(&cache), "0");
    }

    let num = 1234;
    cache.clear();
    super::hex_int64(num, &mut cache);
    // SAFETY: `hex_int64` only writes ASCII hex digits into the buffer.
    unsafe {
        assert_eq!(from_utf8_unchecked(&cache), "4D2");
    }

    let num = -1;
    cache.clear();
    super::hex_int64(num, &mut cache);
    // SAFETY: `hex_int64` only writes ASCII hex digits into the buffer.
    unsafe {
        assert_eq!(from_utf8_unchecked(&cache), "FFFFFFFFFFFFFFFF");
    }
}
384428

385429
#[test]

0 commit comments

Comments
 (0)