Skip to content

Commit 3102dd6

Browse files
committed
add spark_hex bench
1 parent 646213e commit 3102dd6

2 files changed

Lines changed: 156 additions & 0 deletions

File tree

datafusion/spark/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,7 @@ name = "char"
6565
[[bench]]
6666
harness = false
6767
name = "space"
68+
69+
[[bench]]
70+
harness = false
71+
name = "hex"

datafusion/spark/benches/hex.rs

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
20+
use arrow::array::*;
21+
use arrow::datatypes::*;
22+
use criterion::{Criterion, criterion_group, criterion_main};
23+
use datafusion_common::config::ConfigOptions;
24+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
25+
use datafusion_spark::function::math::hex::SparkHex;
26+
use rand::rngs::StdRng;
27+
use rand::{Rng, SeedableRng};
28+
use std::hint::black_box;
29+
use std::sync::Arc;
30+
31+
fn seedable_rng() -> StdRng {
32+
StdRng::seed_from_u64(42)
33+
}
34+
35+
/// Builds an `Int64` array with `size` slots from the fixed-seed RNG.
///
/// For each slot an `f32` is drawn first; when it is below `null_density` the
/// slot is null, otherwise a second draw picks a value from
/// `-999_999_999_999..999_999_999_999`.
fn generate_int64_data(size: usize, null_density: f32) -> PrimitiveArray<Int64Type> {
    let mut rng = seedable_rng();
    let next_slot = || {
        let is_null = rng.random::<f32>() < null_density;
        if is_null {
            return None;
        }
        Some(rng.random_range::<i64, _>(-999_999_999_999..999_999_999_999))
    };
    std::iter::repeat_with(next_slot).take(size).collect()
}
47+
48+
/// Builds a `StringArray` with `size` slots from the fixed-seed RNG.
///
/// For each slot an `f32` is drawn first; when it is below `null_density` the
/// slot is null. Otherwise a length in `1..=100` is drawn, followed by that
/// many lowercase ASCII letters (`'a'..='z'`).
fn generate_utf8_data(size: usize, null_density: f32) -> StringArray {
    let mut rng = seedable_rng();
    let mut strings = StringBuilder::new();
    for _ in 0..size {
        let is_null = rng.random::<f32>() < null_density;
        if is_null {
            strings.append_null();
            continue;
        }
        // Draw the length before the characters to keep the RNG stream stable.
        let char_count = rng.random_range::<usize, _>(1..=100);
        let value = (0..char_count)
            .map(|_| char::from(rng.random_range::<u8, _>(b'a'..=b'z')))
            .collect::<String>();
        strings.append_value(&value);
    }
    strings.finish()
}
65+
66+
/// Builds a `BinaryArray` with `size` slots from the fixed-seed RNG.
///
/// For each slot an `f32` is drawn first; when it is below `null_density` the
/// slot is null. Otherwise a length in `1..=100` is drawn, followed by that
/// many individually drawn random bytes.
fn generate_binary_data(size: usize, null_density: f32) -> BinaryArray {
    let mut rng = seedable_rng();
    let mut blobs = BinaryBuilder::new();
    for _ in 0..size {
        let is_null = rng.random::<f32>() < null_density;
        if is_null {
            blobs.append_null();
            continue;
        }
        let byte_count = rng.random_range::<usize, _>(1..=100);
        // One draw per byte, exactly as many draws as there are bytes.
        let payload = std::iter::repeat_with(|| rng.random::<u8>())
            .take(byte_count)
            .collect::<Vec<u8>>();
        blobs.append_value(&payload);
    }
    blobs.finish()
}
80+
81+
/// Builds an `Int32`-keyed dictionary array of `Int64` values with `size`
/// slots from the fixed-seed RNG.
///
/// For each slot an `f32` is drawn first; when it is below `null_density` a
/// null is appended, otherwise a value drawn from
/// `-999_999_999_999..999_999_999_999` is appended.
fn generate_int64_dict_data(
    size: usize,
    null_density: f32,
) -> DictionaryArray<Int32Type> {
    let mut rng = seedable_rng();
    let mut dict = PrimitiveDictionaryBuilder::<Int32Type, Int64Type>::new();
    (0..size).for_each(|_| {
        let slot = if rng.random::<f32>() < null_density {
            None
        } else {
            Some(rng.random_range::<i64, _>(-999_999_999_999..999_999_999_999))
        };
        match slot {
            Some(value) => {
                dict.append_value(value);
            }
            None => {
                dict.append_null();
            }
        }
    });
    dict.finish()
}
98+
99+
fn run_benchmark(c: &mut Criterion, name: &str, size: usize, array: Arc<dyn Array>) {
100+
let hex_func = SparkHex::new();
101+
let args = vec![ColumnarValue::Array(array)];
102+
let arg_fields: Vec<_> = args
103+
.iter()
104+
.enumerate()
105+
.map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
106+
.collect();
107+
let config_options = Arc::new(ConfigOptions::default());
108+
109+
c.bench_function(&format!("{name}/size={size}"), |b| {
110+
b.iter(|| {
111+
black_box(
112+
hex_func
113+
.invoke_with_args(ScalarFunctionArgs {
114+
args: args.clone(),
115+
arg_fields: arg_fields.clone(),
116+
number_rows: size,
117+
return_field: Arc::new(Field::new("f", DataType::Utf8, true)),
118+
config_options: Arc::clone(&config_options),
119+
})
120+
.unwrap(),
121+
)
122+
})
123+
});
124+
}
125+
126+
fn criterion_benchmark(c: &mut Criterion) {
127+
let sizes = vec![1024, 4096, 8192];
128+
let null_density = 0.1;
129+
130+
for &size in &sizes {
131+
let data = generate_int64_data(size, null_density);
132+
run_benchmark(c, "hex_int64", size, Arc::new(data));
133+
}
134+
135+
for &size in &sizes {
136+
let data = generate_utf8_data(size, null_density);
137+
run_benchmark(c, "hex_utf8", size, Arc::new(data));
138+
}
139+
140+
for &size in &sizes {
141+
let data = generate_binary_data(size, null_density);
142+
run_benchmark(c, "hex_binary", size, Arc::new(data));
143+
}
144+
145+
for &size in &sizes {
146+
let data = generate_int64_dict_data(size, null_density);
147+
run_benchmark(c, "hex_int64_dict", size, Arc::new(data));
148+
}
149+
}
150+
151+
// Criterion entry points: the `benches` group runs `criterion_benchmark`, and
// `criterion_main!` is given that group. The bench target is declared with
// `harness = false` in Cargo.toml, so presumably `criterion_main!` supplies
// the binary's `main` (confirm against the Criterion documentation).
criterion_group!(benches, criterion_benchmark);
152+
criterion_main!(benches);

0 commit comments

Comments
 (0)