Skip to content

Commit afc0537

Browse files
friendlymatthew, Omega359, alamb
authored
Format Date32 to string given timestamp specifiers (apache#15361)
* Format Date32 to string given timestamp specifiers
* Eagerly cast Date32 to Date64 before formatting
* Remove superfluous clone
* POC for parsing format with StrftimeItems to determine if there are any time specifiers present.
* Update benchmark code
* Stage initial selective retry
* Update comments
* Keep track of consecutive Date32 failures
* Revert "Keep track of consecutive Date32 failures" (this reverts commit b379fe1)
* Updated example filename.
* Updates to reflect changes in main.
* Documentation update.

Co-authored-by: Bruce Ritchie <bruce.ritchie@veeva.com>
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent f27b103 commit afc0537

3 files changed

Lines changed: 293 additions & 39 deletions

File tree

datafusion/functions/benches/to_char.rs

Lines changed: 150 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ use rand::prelude::IndexedRandom;
3333
use rand::rngs::ThreadRng;
3434
use rand::Rng;
3535

36-
fn random_date_in_range(
36+
fn pick_date_in_range(
3737
rng: &mut ThreadRng,
3838
start_date: NaiveDate,
3939
end_date: NaiveDate,
@@ -43,7 +43,7 @@ fn random_date_in_range(
4343
start_date + TimeDelta::try_days(random_days).unwrap()
4444
}
4545

46-
fn data(rng: &mut ThreadRng) -> Date32Array {
46+
fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array {
4747
let mut data: Vec<i32> = vec![];
4848
let unix_days_from_ce = NaiveDate::from_ymd_opt(1970, 1, 1)
4949
.unwrap()
@@ -56,39 +56,139 @@ fn data(rng: &mut ThreadRng) -> Date32Array {
5656
.expect("Date should parse");
5757
for _ in 0..1000 {
5858
data.push(
59-
random_date_in_range(rng, start_date, end_date).num_days_from_ce()
59+
pick_date_in_range(rng, start_date, end_date).num_days_from_ce()
6060
- unix_days_from_ce,
6161
);
6262
}
6363

6464
Date32Array::from(data)
6565
}
6666

67-
fn patterns(rng: &mut ThreadRng) -> StringArray {
68-
let samples = [
69-
"%Y:%m:%d".to_string(),
70-
"%d-%m-%Y".to_string(),
71-
"%d%m%Y".to_string(),
72-
"%Y%m%d".to_string(),
73-
"%Y...%m...%d".to_string(),
74-
];
75-
let mut data: Vec<String> = vec![];
67+
const DATE_PATTERNS: [&str; 5] =
68+
["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"];
69+
70+
const DATETIME_PATTERNS: [&str; 8] = [
71+
"%Y:%m:%d %H:%M%S",
72+
"%Y:%m:%d %_H:%M%S",
73+
"%Y:%m:%d %k:%M%S",
74+
"%d-%m-%Y %I%P-%M-%S %f",
75+
"%d%m%Y %H",
76+
"%Y%m%d %M-%S %.3f",
77+
"%Y...%m...%d %T%3f",
78+
"%c",
79+
];
80+
81+
fn pick_date_pattern(rng: &mut ThreadRng) -> String {
82+
DATE_PATTERNS
83+
.choose(rng)
84+
.expect("Empty list of date patterns")
85+
.to_string()
86+
}
87+
88+
fn pick_date_time_pattern(rng: &mut ThreadRng) -> String {
89+
DATETIME_PATTERNS
90+
.choose(rng)
91+
.expect("Empty list of date time patterns")
92+
.to_string()
93+
}
94+
95+
fn pick_date_and_date_time_mixed_pattern(rng: &mut ThreadRng) -> String {
96+
match rng.random_bool(0.5) {
97+
true => pick_date_pattern(rng),
98+
false => pick_date_time_pattern(rng),
99+
}
100+
}
101+
102+
fn generate_pattern_array(
103+
rng: &mut ThreadRng,
104+
pick_fn: impl Fn(&mut ThreadRng) -> String,
105+
) -> StringArray {
106+
let mut data = Vec::with_capacity(1000);
107+
76108
for _ in 0..1000 {
77-
data.push(samples.choose(rng).unwrap().to_string());
109+
data.push(pick_fn(rng));
78110
}
79111

80112
StringArray::from(data)
81113
}
82114

115+
fn generate_date_pattern_array(rng: &mut ThreadRng) -> StringArray {
116+
generate_pattern_array(rng, pick_date_pattern)
117+
}
118+
119+
fn generate_datetime_pattern_array(rng: &mut ThreadRng) -> StringArray {
120+
generate_pattern_array(rng, pick_date_time_pattern)
121+
}
122+
123+
fn generate_mixed_pattern_array(rng: &mut ThreadRng) -> StringArray {
124+
generate_pattern_array(rng, pick_date_and_date_time_mixed_pattern)
125+
}
126+
83127
fn criterion_benchmark(c: &mut Criterion) {
84128
let config_options = Arc::new(ConfigOptions::default());
85129

86-
c.bench_function("to_char_array_array_1000", |b| {
130+
c.bench_function("to_char_array_date_only_patterns_1000", |b| {
131+
let mut rng = rand::rng();
132+
let data_arr = generate_date32_array(&mut rng);
133+
let batch_len = data_arr.len();
134+
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
135+
let patterns = ColumnarValue::Array(Arc::new(generate_date_pattern_array(
136+
&mut rng,
137+
)) as ArrayRef);
138+
139+
b.iter(|| {
140+
black_box(
141+
to_char()
142+
.invoke_with_args(ScalarFunctionArgs {
143+
args: vec![data.clone(), patterns.clone()],
144+
arg_fields: vec![
145+
Field::new("a", data.data_type(), true).into(),
146+
Field::new("b", patterns.data_type(), true).into(),
147+
],
148+
number_rows: batch_len,
149+
return_field: Field::new("f", DataType::Utf8, true).into(),
150+
config_options: Arc::clone(&config_options),
151+
})
152+
.expect("to_char should work on valid values"),
153+
)
154+
})
155+
});
156+
157+
c.bench_function("to_char_array_datetime_patterns_1000", |b| {
158+
let mut rng = rand::rng();
159+
let data_arr = generate_date32_array(&mut rng);
160+
let batch_len = data_arr.len();
161+
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
162+
let patterns = ColumnarValue::Array(Arc::new(generate_datetime_pattern_array(
163+
&mut rng,
164+
)) as ArrayRef);
165+
166+
b.iter(|| {
167+
black_box(
168+
to_char()
169+
.invoke_with_args(ScalarFunctionArgs {
170+
args: vec![data.clone(), patterns.clone()],
171+
arg_fields: vec![
172+
Field::new("a", data.data_type(), true).into(),
173+
Field::new("b", patterns.data_type(), true).into(),
174+
],
175+
number_rows: batch_len,
176+
return_field: Field::new("f", DataType::Utf8, true).into(),
177+
config_options: Arc::clone(&config_options),
178+
})
179+
.expect("to_char should work on valid values"),
180+
)
181+
})
182+
});
183+
184+
c.bench_function("to_char_array_mixed_patterns_1000", |b| {
87185
let mut rng = rand::rng();
88-
let data_arr = data(&mut rng);
186+
let data_arr = generate_date32_array(&mut rng);
89187
let batch_len = data_arr.len();
90188
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
91-
let patterns = ColumnarValue::Array(Arc::new(patterns(&mut rng)) as ArrayRef);
189+
let patterns = ColumnarValue::Array(Arc::new(generate_mixed_pattern_array(
190+
&mut rng,
191+
)) as ArrayRef);
92192

93193
b.iter(|| {
94194
black_box(
@@ -108,13 +208,13 @@ fn criterion_benchmark(c: &mut Criterion) {
108208
})
109209
});
110210

111-
c.bench_function("to_char_array_scalar_1000", |b| {
211+
c.bench_function("to_char_scalar_date_only_pattern_1000", |b| {
112212
let mut rng = rand::rng();
113-
let data_arr = data(&mut rng);
213+
let data_arr = generate_date32_array(&mut rng);
114214
let batch_len = data_arr.len();
115215
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
116216
let patterns =
117-
ColumnarValue::Scalar(ScalarValue::Utf8(Some("%Y-%m-%d".to_string())));
217+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
118218

119219
b.iter(|| {
120220
black_box(
@@ -134,7 +234,35 @@ fn criterion_benchmark(c: &mut Criterion) {
134234
})
135235
});
136236

137-
c.bench_function("to_char_scalar_scalar_1000", |b| {
237+
c.bench_function("to_char_scalar_datetime_pattern_1000", |b| {
238+
let mut rng = rand::rng();
239+
let data_arr = generate_date32_array(&mut rng);
240+
let batch_len = data_arr.len();
241+
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
242+
let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
243+
pick_date_time_pattern(&mut rng),
244+
)));
245+
246+
b.iter(|| {
247+
black_box(
248+
to_char()
249+
.invoke_with_args(ScalarFunctionArgs {
250+
args: vec![data.clone(), patterns.clone()],
251+
arg_fields: vec![
252+
Field::new("a", data.data_type(), true).into(),
253+
Field::new("b", patterns.data_type(), true).into(),
254+
],
255+
number_rows: batch_len,
256+
return_field: Field::new("f", DataType::Utf8, true).into(),
257+
config_options: Arc::clone(&config_options),
258+
})
259+
.expect("to_char should work on valid values"),
260+
)
261+
})
262+
});
263+
264+
c.bench_function("to_char_scalar_1000", |b| {
265+
let mut rng = rand::rng();
138266
let timestamp = "2026-07-08T09:10:11"
139267
.parse::<NaiveDateTime>()
140268
.unwrap()
@@ -144,9 +272,8 @@ fn criterion_benchmark(c: &mut Criterion) {
144272
.timestamp_nanos_opt()
145273
.unwrap();
146274
let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp), None));
147-
let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
148-
"%d-%m-%Y %H:%M:%S".to_string(),
149-
)));
275+
let pattern =
276+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
150277

151278
b.iter(|| {
152279
black_box(

0 commit comments

Comments
 (0)