Skip to content

Commit 7f37faa

Browse files
authored
perf(cubesql): Fast path for timestamp parsing for cube results (cube-js#10826)
`parse_date_str` previously walked a 7-format `chrono::NaiveDateTime::parse_from_str` cascade. Cube's canonical wire shape is `%Y-%m-%dT%H:%M:%S%.3f` — `YYYY-MM-DDTHH:MM:SS.fff`, exactly 23 bytes — and was the 5th attempt in that cascade, so most rows reparsed the prefix 4 times before matching.

Benchmark on Apple M3 Max (`cargo bench --bench transform_response`, 10k rows, master vs. this commit):

| Shape (10k rows)          | Before    | After     | Δ      |
| ------------------------- | --------- | --------- | ------ |
| row / cols=8 / td=1       | 22.48 ms  | 15.84 ms  | -29.5% |
| columnar / cols=8 / td=1  | 7.76 ms   | 3.08 ms   | -60.3% |
| row / cols=16 / td=2      | ~43 ms    | 36.19 ms  | -15.8% |
| columnar / cols=16 / td=2 | ~16 ms    | 6.11 ms   | -61.8% |
| columnar / cols=16 / td=0 | unchanged | unchanged | noise  |
1 parent 3226a63 commit 7f37faa

2 files changed

Lines changed: 201 additions & 51 deletions

File tree

rust/cubesql/cubesql/src/compile/date_parser.rs

Lines changed: 163 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,173 @@ use crate::compile::engine::df::scan::DataFusionError;
22
use chrono::{NaiveDate, NaiveDateTime};
33

44
pub fn parse_date_str(s: &str) -> Result<NaiveDateTime, DataFusionError> {
5-
let parsed = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S")
6-
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f"))
7-
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S"))
8-
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f UTC"))
9-
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f"))
10-
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.fZ"))
11-
.or_else(|_| {
12-
NaiveDate::parse_from_str(s, "%Y-%m-%d").map(|date| date.and_hms_opt(0, 0, 0).unwrap())
13-
});
5+
if let Some(parsed) = parse_fast(s) {
6+
return Ok(parsed);
7+
}
148

15-
parsed.map_err(|e| {
9+
parse_with_chrono(s).map_err(|e| {
1610
DataFusionError::Internal(format!(
1711
"Can't parse date/time string literal {:?}: {}",
1812
s, e
1913
))
2014
})
2115
}
16+
17+
/// Converts one ASCII byte into its decimal digit value.
///
/// Returns `Some(0..=9)` for the bytes `b'0'..=b'9'` and `None` for every
/// other byte value.
#[inline]
fn digit(b: u8) -> Option<u32> {
    match b {
        b'0'..=b'9' => Some(u32::from(b - b'0')),
        _ => None,
    }
}
26+
27+
#[inline]
28+
fn ascii_u32_2(b: &[u8], at: usize) -> Option<u32> {
29+
Some(digit(b[at])? * 10 + digit(b[at + 1])?)
30+
}
31+
32+
#[inline]
33+
fn ascii_u32_4(b: &[u8], at: usize) -> Option<u32> {
34+
Some(
35+
digit(b[at])? * 1000 + digit(b[at + 1])? * 100 + digit(b[at + 2])? * 10 + digit(b[at + 3])?,
36+
)
37+
}
38+
39+
/// Recognizes only `YYYY-MM-DDTHH:MM:SS.fff` (23 bytes, `T` separator, 3-digit fraction).
40+
fn parse_fast(s: &str) -> Option<NaiveDateTime> {
41+
let b = s.as_bytes();
42+
if b.len() != 23
43+
|| b[4] != b'-'
44+
|| b[7] != b'-'
45+
|| b[10] != b'T'
46+
|| b[13] != b':'
47+
|| b[16] != b':'
48+
|| b[19] != b'.'
49+
{
50+
return None;
51+
}
52+
53+
let year = ascii_u32_4(b, 0)? as i32;
54+
let month = ascii_u32_2(b, 5)?;
55+
let day = ascii_u32_2(b, 8)?;
56+
let hour = ascii_u32_2(b, 11)?;
57+
let minute = ascii_u32_2(b, 14)?;
58+
let second = ascii_u32_2(b, 17)?;
59+
let frac_millis = digit(b[20])? * 100 + digit(b[21])? * 10 + digit(b[22])?;
60+
61+
NaiveDate::from_ymd_opt(year, month, day)?.and_hms_nano_opt(
62+
hour,
63+
minute,
64+
second,
65+
frac_millis * 1_000_000,
66+
)
67+
}
68+
69+
fn parse_with_chrono(s: &str) -> chrono::ParseResult<NaiveDateTime> {
70+
NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f")
71+
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f"))
72+
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S"))
73+
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S"))
74+
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.fZ"))
75+
.or_else(|_| NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f UTC"))
76+
.or_else(|_| {
77+
NaiveDate::parse_from_str(s, "%Y-%m-%d").map(|date| date.and_hms_opt(0, 0, 0).unwrap())
78+
})
79+
}
80+
81+
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the expected `NaiveDateTime` for a test case, panicking if any
    /// component is out of range.
    fn ymd_hmsn(y: i32, m: u32, d: u32, h: u32, mi: u32, s: u32, n: u32) -> NaiveDateTime {
        let date = NaiveDate::from_ymd_opt(y, m, d).unwrap();
        date.and_hms_nano_opt(h, mi, s, n).unwrap()
    }

    /// Asserts that the public wrapper parses `input` to exactly `expected`.
    fn assert_wrapper_parses(input: &str, expected: NaiveDateTime) {
        assert_eq!(
            parse_date_str(input).unwrap(),
            expected,
            "wrapper: {}",
            input
        );
    }

    #[test]
    fn fast_path_accepts_canonical_shape() {
        let cases = [
            ("2022-01-01T00:00:00.000", ymd_hmsn(2022, 1, 1, 0, 0, 0, 0)),
            (
                "2024-06-15T13:45:07.123",
                ymd_hmsn(2024, 6, 15, 13, 45, 7, 123_000_000),
            ),
            (
                "9999-12-31T23:59:59.999",
                ymd_hmsn(9999, 12, 31, 23, 59, 59, 999_000_000),
            ),
        ];

        for (input, expected) in cases {
            // The fast path itself must recognize the input...
            assert_eq!(parse_fast(input), Some(expected), "fast path: {}", input);
            // ...and the public wrapper must agree with it.
            assert_wrapper_parses(input, expected);
        }
    }

    #[test]
    fn fast_path_rejects_non_canonical() {
        // Inputs with the wrong length, wrong punctuation, or out-of-range
        // fields; none of them may be accepted by the fast path.
        let rejects: &[&str] = &[
            "2022",
            "2022-01-01",
            "2022-01-01 00:00:00",
            "2022-01-01T00:00:00",
            "2022-01-01T00:00:00.000Z",
            "2022-01-01 00:00:00.000",
            "2022-01-01T00:00:00.123456",
            "2022-13-01T00:00:00.000",
            "2022-01-32T00:00:00.000",
            "2022/01/01T00:00:00.000",
            "2022-01-01x00:00:00.000",
            "2022-01-01T25:00:00.000",
            "2022-01-01T00:60:00.000",
            "2022-01-01T00:00:60.000",
        ];

        for &s in rejects {
            let fast = parse_fast(s);
            assert!(fast.is_none(), "unexpectedly fast-parsed: {:?}", s);
        }
    }

    #[test]
    fn wrapper_handles_other_shapes_via_chrono_fallback() {
        // Several shapes below all denote midnight on 2022-01-01.
        let midnight = ymd_hmsn(2022, 1, 1, 0, 0, 0, 0);
        let cases = [
            ("2022-01-01", midnight),
            ("2022-01-01 00:00:00", midnight),
            ("2022-01-01T00:00:00", midnight),
            ("2022-01-01T00:00:00.000Z", midnight),
            ("2022-01-01 00:00:00.000 UTC", midnight),
            (
                "2024-06-15T13:45:07.123456789",
                ymd_hmsn(2024, 6, 15, 13, 45, 7, 123_456_789),
            ),
        ];

        for (input, expected) in cases {
            assert_wrapper_parses(input, expected);
        }
    }

    #[test]
    fn rejects_propagate_through_wrapper() {
        let invalid: &[&str] = &["", "2022/01/01", "2022-01-01T00:00:00+02:00"];

        for &s in invalid {
            let outcome = parse_date_str(s);
            assert!(outcome.is_err(), "should error: {:?}", s);
        }
    }
}

rust/cubesql/cubesql/src/compile/engine/df/scan.rs

Lines changed: 38 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,14 +1086,12 @@ macro_rules! transform_response_body {
10861086
(FieldValue::String(s), builder) => {
10871087
let timestamp = parse_date_str(s.as_ref())?;
10881088
// TODO switch parsing to microseconds
1089-
if timestamp.and_utc().timestamp_millis() > (((1i64) << 62) / 1_000_000) {
1090-
builder.append_null()?;
1091-
} else if let Some(nanos) = timestamp.and_utc().timestamp_nanos_opt() {
1089+
if let Some(nanos) = timestamp.and_utc().timestamp_nanos_opt() {
10921090
builder.append_value(nanos)?;
10931091
} else {
10941092
log::error!(
10951093
"Unable to cast timestamp value to nanoseconds: {}",
1096-
timestamp.to_string()
1094+
timestamp
10971095
);
10981096
builder.append_null()?;
10991097
}
@@ -1114,12 +1112,7 @@ macro_rules! transform_response_body {
11141112
{
11151113
(FieldValue::String(s), builder) => {
11161114
let timestamp = parse_date_str(s.as_ref())?;
1117-
// TODO switch parsing to microseconds
1118-
if timestamp.and_utc().timestamp_millis() > (((1 as i64) << 62) / 1_000_000) {
1119-
builder.append_null()?;
1120-
} else {
1121-
builder.append_value(timestamp.and_utc().timestamp_millis())?;
1122-
}
1115+
builder.append_value(timestamp.and_utc().timestamp_millis())?;
11231116
},
11241117
},
11251118
{
@@ -1136,28 +1129,18 @@ macro_rules! transform_response_body {
11361129
field_name,
11371130
{
11381131
(FieldValue::String(s), builder) => {
1139-
let date = NaiveDate::parse_from_str(s.as_ref(), "%Y-%m-%d")
1140-
// FIXME: temporary solution for cases when expected type is Date32
1141-
// but underlying data is a Timestamp
1142-
.or_else(|_| NaiveDate::parse_from_str(s.as_ref(), "%Y-%m-%dT00:00:00.000"))
1143-
.map_err(|e| {
1144-
DataFusionError::Execution(format!(
1145-
"Can't parse date: '{}': {}",
1146-
s, e
1147-
))
1148-
});
1149-
match date {
1150-
Ok(date) => {
1132+
match parse_date_str(s.as_ref()) {
1133+
Ok(timestamp) => {
11511134
let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
1152-
let days_since_epoch = date.num_days_from_ce() - epoch.num_days_from_ce();
1135+
let days_since_epoch = timestamp.date().num_days_from_ce()
1136+
- epoch.num_days_from_ce();
11531137
builder.append_value(days_since_epoch)?;
11541138
}
11551139
Err(error) => {
11561140
log::error!(
11571141
"Unable to parse value as Date32: {}",
1158-
error.to_string()
1142+
error
11591143
);
1160-
11611144
builder.append_null()?
11621145
}
11631146
}
@@ -1375,7 +1358,9 @@ mod tests {
13751358
use cubeclient::models::V1LoadResponse;
13761359
use datafusion::{
13771360
arrow::{
1378-
array::{BooleanArray, Float64Array, StringArray, TimestampNanosecondArray},
1361+
array::{
1362+
BooleanArray, Date32Array, Float64Array, StringArray, TimestampNanosecondArray,
1363+
},
13791364
datatypes::{Field, Schema},
13801365
},
13811366
execution::{
@@ -1441,11 +1426,11 @@ mod tests {
14411426
"timeDimensions": []
14421427
},
14431428
"data": [
1444-
{"KibanaSampleDataEcommerce.count": null, "KibanaSampleDataEcommerce.maxPrice": null, "KibanaSampleDataEcommerce.isBool": null, "KibanaSampleDataEcommerce.orderDate": null, "KibanaSampleDataEcommerce.city": "City 1"},
1445-
{"KibanaSampleDataEcommerce.count": 5, "KibanaSampleDataEcommerce.maxPrice": 5.05, "KibanaSampleDataEcommerce.isBool": true, "KibanaSampleDataEcommerce.orderDate": "2022-01-01 00:00:00.000", "KibanaSampleDataEcommerce.city": "City 2"},
1446-
{"KibanaSampleDataEcommerce.count": "5", "KibanaSampleDataEcommerce.maxPrice": "5.05", "KibanaSampleDataEcommerce.isBool": false, "KibanaSampleDataEcommerce.orderDate": "2023-01-01 00:00:00.000", "KibanaSampleDataEcommerce.city": "City 3"},
1447-
{"KibanaSampleDataEcommerce.count": null, "KibanaSampleDataEcommerce.maxPrice": null, "KibanaSampleDataEcommerce.isBool": "true", "KibanaSampleDataEcommerce.orderDate": "9999-12-31 00:00:00.000", "KibanaSampleDataEcommerce.city": "City 4"},
1448-
{"KibanaSampleDataEcommerce.count": null, "KibanaSampleDataEcommerce.maxPrice": null, "KibanaSampleDataEcommerce.isBool": "false", "KibanaSampleDataEcommerce.orderDate": null, "KibanaSampleDataEcommerce.city": null}
1429+
{"KibanaSampleDataEcommerce.count": null, "KibanaSampleDataEcommerce.maxPrice": null, "KibanaSampleDataEcommerce.isBool": null, "KibanaSampleDataEcommerce.orderTimestamp": null, "KibanaSampleDataEcommerce.orderDate": null, "KibanaSampleDataEcommerce.city": "City 1"},
1430+
{"KibanaSampleDataEcommerce.count": 5, "KibanaSampleDataEcommerce.maxPrice": 5.05, "KibanaSampleDataEcommerce.isBool": true, "KibanaSampleDataEcommerce.orderTimestamp": "2022-01-01 00:00:00.000", "KibanaSampleDataEcommerce.orderDate": "2022-01-01", "KibanaSampleDataEcommerce.city": "City 2"},
1431+
{"KibanaSampleDataEcommerce.count": "5", "KibanaSampleDataEcommerce.maxPrice": "5.05", "KibanaSampleDataEcommerce.isBool": false, "KibanaSampleDataEcommerce.orderTimestamp": "2023-01-01 00:00:00.000", "KibanaSampleDataEcommerce.orderDate": "2023-01-01", "KibanaSampleDataEcommerce.city": "City 3"},
1432+
{"KibanaSampleDataEcommerce.count": null, "KibanaSampleDataEcommerce.maxPrice": null, "KibanaSampleDataEcommerce.isBool": "true", "KibanaSampleDataEcommerce.orderTimestamp": "9999-12-31 00:00:00.000", "KibanaSampleDataEcommerce.orderDate": "9999-12-31", "KibanaSampleDataEcommerce.city": "City 4"},
1433+
{"KibanaSampleDataEcommerce.count": null, "KibanaSampleDataEcommerce.maxPrice": null, "KibanaSampleDataEcommerce.isBool": "false", "KibanaSampleDataEcommerce.orderTimestamp": null, "KibanaSampleDataEcommerce.orderDate": null, "KibanaSampleDataEcommerce.city": null}
14491434
]
14501435
}]
14511436
}
@@ -1498,7 +1483,7 @@ mod tests {
14981483
}
14991484

15001485
#[tokio::test]
1501-
async fn test_df_cube_scan_execute() {
1486+
async fn test_df_cube_scan_execute() -> Result<(), CubeError> {
15021487
assert_eq!(std::mem::size_of::<FieldValue>(), 24);
15031488

15041489
let schema = Arc::new(Schema::new(vec![
@@ -1510,7 +1495,7 @@ mod tests {
15101495
false,
15111496
),
15121497
Field::new(
1513-
"KibanaSampleDataEcommerce.orderDate",
1498+
"KibanaSampleDataEcommerce.orderTimestamp",
15141499
DataType::Timestamp(TimeUnit::Nanosecond, None),
15151500
false,
15161501
),
@@ -1521,6 +1506,11 @@ mod tests {
15211506
false,
15221507
),
15231508
Field::new("KibanaSampleDataEcommerce.city", DataType::Utf8, false),
1509+
Field::new(
1510+
"KibanaSampleDataEcommerce.orderDate",
1511+
DataType::Date32,
1512+
false,
1513+
),
15241514
]));
15251515

15261516
let scan_node = CubeScanExecutionPlan {
@@ -1543,6 +1533,7 @@ mod tests {
15431533
]),
15441534
dimensions: Some(vec![
15451535
"KibanaSampleDataEcommerce.isBool".to_string(),
1536+
"KibanaSampleDataEcommerce.orderTimestamp".to_string(),
15461537
"KibanaSampleDataEcommerce.orderDate".to_string(),
15471538
"KibanaSampleDataEcommerce.city".to_string(),
15481539
]),
@@ -1565,9 +1556,7 @@ mod tests {
15651556
config_obj: crate::config::Config::test().config_obj(),
15661557
};
15671558

1568-
let runtime = Arc::new(
1569-
RuntimeEnv::new(RuntimeConfig::new()).expect("Unable to create RuntimeEnv for testing"),
1570-
);
1559+
let runtime = Arc::new(RuntimeEnv::new(RuntimeConfig::new())?);
15711560
let task = Arc::new(TaskContext::new(
15721561
"test".to_string(),
15731562
"session".to_string(),
@@ -1576,8 +1565,8 @@ mod tests {
15761565
HashMap::new(),
15771566
runtime,
15781567
));
1579-
let stream = scan_node.execute(0, task).await.unwrap();
1580-
let batches = common::collect(stream).await.unwrap();
1568+
let stream = scan_node.execute(0, task).await?;
1569+
let batches = common::collect(stream).await?;
15811570

15821571
assert_eq!(
15831572
batches[0],
@@ -1627,9 +1616,17 @@ mod tests {
16271616
Some("City 4"),
16281617
None
16291618
])) as ArrayRef,
1619+
Arc::new(Date32Array::from(vec![
1620+
None,
1621+
Some(18993),
1622+
Some(19358),
1623+
Some(2_932_896),
1624+
None,
1625+
])) as ArrayRef,
16301626
],
1631-
)
1632-
.unwrap()
1633-
)
1627+
)?
1628+
);
1629+
1630+
Ok(())
16341631
}
16351632
}

0 commit comments

Comments
 (0)