Skip to content

Commit 7482768

Browse files
rampage644claude
andauthored
fix(datediff): use boundary-count semantics to match Snowflake (#132)
DATEDIFF previously returned CEIL((b-a) / unit) for second, minute, day, and sub-second parts, which rounded any positive sub-unit difference up to 1. Snowflake's documented semantics is the count of `part`-boundaries crossed between the two endpoints: floor(b / unit) - floor(a / unit). Truncate each endpoint to `part` precision via div_euclid and subtract the quotients. The hour branch also gained a correct implementation (the old heuristic over-counted whenever the hour component changed within a single-hour span, e.g. 01:30 -> 02:30 returned 2). Fixes #130 Co-authored-by: Claude <noreply@anthropic.com>
1 parent 2df570f commit 7482768

6 files changed

Lines changed: 153 additions & 64 deletions

File tree

crates/functions/src/datetime/date_diff.rs

Lines changed: 49 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,6 @@ impl DateDiffFunc {
100100

101101
let arr1 = cast(lhs, &DataType::Timestamp(TimeUnit::Nanosecond, None))?;
102102
let arr2 = cast(rhs, &DataType::Timestamp(TimeUnit::Nanosecond, None))?;
103-
let diff = sub(&arr2, &arr1)?;
104-
let diff_arr = diff
105-
.as_any()
106-
.downcast_ref::<DurationNanosecondArray>()
107-
.context(dtime_errors::CantCastToSnafu {
108-
v: "duration_nsec".to_string(),
109-
})?;
110103
match unit_type {
111104
DatePart::Quarter | DatePart::Year | DatePart::YearISO => {
112105
let arr1 = &date_part(&arr1, unit_type)?;
@@ -136,37 +129,25 @@ impl DateDiffFunc {
136129
let result = cast(&result, &DataType::Int64)?;
137130
Ok(ColumnarValue::Array(Arc::new(result)))
138131
}
139-
DatePart::Week | DatePart::WeekISO => Ok(self.weeks_diff(diff_arr)),
140-
DatePart::Day | DatePart::DayOfYear => Ok(Self::diff(diff_arr, 86_400 * SECOND)),
141-
DatePart::Hour => {
142-
let nanos_in_hour: i64 = 3_600 * SECOND;
143-
let arr1 = &date_part(&arr1, unit_type)?;
144-
let arr2 = &date_part(&arr2, unit_type)?;
145-
let hours_diff = cast(&sub(&arr2, &arr1)?, &DataType::Int64)?;
146-
let hours_arr = as_int64_array(&hours_diff)?;
147-
148-
let result = diff_arr
149-
.iter()
150-
.zip(hours_arr.iter())
151-
.map(|(nanos, diff)| match (nanos, diff) {
152-
(Some(n), Some(hours_diff)) => {
153-
let res = n.div_euclid(nanos_in_hour);
154-
if hours_diff != 0 {
155-
Some(res + 1)
156-
} else {
157-
Some(res)
158-
}
159-
}
160-
_ => None,
161-
})
162-
.collect::<Int64Array>();
163-
Ok(ColumnarValue::Array(Arc::new(result)))
132+
DatePart::Week | DatePart::WeekISO => {
133+
let diff = sub(&arr2, &arr1)?;
134+
let diff_arr = diff
135+
.as_any()
136+
.downcast_ref::<DurationNanosecondArray>()
137+
.context(dtime_errors::CantCastToSnafu {
138+
v: "duration_nsec".to_string(),
139+
})?;
140+
Ok(self.weeks_diff(diff_arr))
141+
}
142+
DatePart::Day | DatePart::DayOfYear => {
143+
Self::boundary_diff(&arr1, &arr2, 86_400 * SECOND)
164144
}
165-
DatePart::Minute => Ok(Self::diff(diff_arr, 60 * SECOND)),
166-
DatePart::Second => Ok(Self::diff(diff_arr, SECOND)),
167-
DatePart::Millisecond => Ok(Self::diff(diff_arr, 1_000_000)),
168-
DatePart::Microsecond => Ok(Self::diff(diff_arr, 1_000)),
169-
_ => Ok(Self::diff(diff_arr, 1)),
145+
DatePart::Hour => Self::boundary_diff(&arr1, &arr2, 3_600 * SECOND),
146+
DatePart::Minute => Self::boundary_diff(&arr1, &arr2, 60 * SECOND),
147+
DatePart::Second => Self::boundary_diff(&arr1, &arr2, SECOND),
148+
DatePart::Millisecond => Self::boundary_diff(&arr1, &arr2, 1_000_000),
149+
DatePart::Microsecond => Self::boundary_diff(&arr1, &arr2, 1_000),
150+
_ => Self::boundary_diff(&arr1, &arr2, 1),
170151
}
171152
}
172153

@@ -182,26 +163,15 @@ impl DateDiffFunc {
182163
| DatePart::Millisecond
183164
| DatePart::Microsecond
184165
| DatePart::Nanosecond => {
185-
// Cast TIME to Int64 nanoseconds from midnight, compute diff
186-
let lhs_i64 = cast(lhs, &DataType::Int64)?;
187-
let rhs_i64 = cast(rhs, &DataType::Int64)?;
188-
let diff_i64 = sub(&rhs_i64, &lhs_i64)?;
189-
// Convert to Duration(Ns) to reuse the generic diff logic
190-
let diff_ns = cast(&diff_i64, &DataType::Duration(TimeUnit::Nanosecond))?;
191-
let diff_arr = diff_ns
192-
.as_any()
193-
.downcast_ref::<DurationNanosecondArray>()
194-
.context(dtime_errors::CantCastToSnafu {
195-
v: "duration_nsec".to_string(),
196-
})?;
197-
Ok(match unit_type {
198-
DatePart::Hour => Self::diff(diff_arr, 3_600 * SECOND),
199-
DatePart::Minute => Self::diff(diff_arr, 60 * SECOND),
200-
DatePart::Second => Self::diff(diff_arr, SECOND),
201-
DatePart::Millisecond => Self::diff(diff_arr, 1_000_000),
202-
DatePart::Microsecond => Self::diff(diff_arr, 1_000),
203-
_ => Self::diff(diff_arr, 1),
204-
})
166+
let coef = match unit_type {
167+
DatePart::Hour => 3_600 * SECOND,
168+
DatePart::Minute => 60 * SECOND,
169+
DatePart::Second => SECOND,
170+
DatePart::Millisecond => 1_000_000,
171+
DatePart::Microsecond => 1_000,
172+
_ => 1,
173+
};
174+
Self::boundary_diff(lhs, rhs, coef)
205175
}
206176
_ => dtime_errors::DateDiffInvalidComponentForTimeSnafu {
207177
component: format!("{unit_type:?}"),
@@ -234,12 +204,28 @@ impl DateDiffFunc {
234204
ColumnarValue::Array(Arc::new(diff))
235205
}
236206

237-
fn diff(diff_arr: &DurationNanosecondArray, coef: i64) -> ColumnarValue {
238-
let diff_arr: Int64Array = diff_arr.unary(|x| {
239-
let div = x / coef;
240-
if x % coef == 0 { div } else { div + 1 }
241-
});
242-
ColumnarValue::Array(Arc::new(diff_arr))
207+
// Snowflake's DATEDIFF returns the number of `part`-boundaries crossed
208+
// between the two endpoints, not the fractional elapsed duration. We
209+
// implement that by truncating each endpoint to `coef` precision
210+
// independently (floor division) and subtracting the integer quotients.
211+
fn boundary_diff(
212+
lhs: &Arc<dyn Array>,
213+
rhs: &Arc<dyn Array>,
214+
coef: i64,
215+
) -> Result<ColumnarValue> {
216+
let lhs_i64 = cast(lhs, &DataType::Int64)?;
217+
let rhs_i64 = cast(rhs, &DataType::Int64)?;
218+
let a = as_int64_array(&lhs_i64)?;
219+
let b = as_int64_array(&rhs_i64)?;
220+
let result: Int64Array = a
221+
.iter()
222+
.zip(b.iter())
223+
.map(|(a, b)| match (a, b) {
224+
(Some(a), Some(b)) => Some(b.div_euclid(coef) - a.div_euclid(coef)),
225+
_ => None,
226+
})
227+
.collect();
228+
Ok(ColumnarValue::Array(Arc::new(result)))
243229
}
244230
}
245231

crates/functions/src/tests/datetime/datediff.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,64 @@ test_query!(
5858
CAST('1970-02-01 00:15:00' AS TIMESTAMP)) AS date_time",
5959
snapshot_path = "datediff"
6060
);
61+
62+
// DATEDIFF uses boundary-count semantics (matches Snowflake), not
63+
// ceiling-of-duration. These cases all produce 0 because the endpoints
64+
// sit in the same `part` bucket even though the true duration is positive.
65+
test_query!(
66+
boundary_count_same_bucket,
67+
"SELECT
68+
DATEDIFF('second',
69+
TIMESTAMP '2020-01-01 00:00:00.100',
70+
TIMESTAMP '2020-01-01 00:00:00.900') AS sec_sub,
71+
DATEDIFF('minute',
72+
TIMESTAMP '2020-01-01 00:00:05',
73+
TIMESTAMP '2020-01-01 00:00:55') AS min_sub,
74+
DATEDIFF('hour',
75+
TIMESTAMP '2020-01-01 01:30:00',
76+
TIMESTAMP '2020-01-01 01:50:00') AS hour_sub,
77+
DATEDIFF('day',
78+
TIMESTAMP '2020-01-01 08:00:00',
79+
TIMESTAMP '2020-01-01 20:00:00') AS day_sub;",
80+
snapshot_path = "datediff"
81+
);
82+
83+
// Endpoints straddle a single boundary: DATEDIFF returns 1 even when the
84+
// true elapsed duration is less than one full unit.
85+
test_query!(
86+
boundary_count_straddle,
87+
"SELECT
88+
DATEDIFF('second',
89+
TIMESTAMP '2020-01-01 00:00:00.900',
90+
TIMESTAMP '2020-01-01 00:00:01.100') AS sec_straddle,
91+
DATEDIFF('minute',
92+
TIMESTAMP '2020-01-01 01:00:55',
93+
TIMESTAMP '2020-01-01 01:01:05') AS min_straddle,
94+
DATEDIFF('hour',
95+
TIMESTAMP '2020-01-01 01:55:00',
96+
TIMESTAMP '2020-01-01 02:05:00') AS hour_straddle,
97+
DATEDIFF('day',
98+
TIMESTAMP '2020-01-01 23:00:00',
99+
TIMESTAMP '2020-01-02 01:00:00') AS day_straddle;",
100+
snapshot_path = "datediff"
101+
);
102+
103+
// Counts boundaries, not rounded duration: a 1.5-second span that crosses
104+
// exactly one second-boundary returns 1, not 2 (CEIL(1.5) = 2 would be wrong).
105+
test_query!(
106+
boundary_count_not_ceiling,
107+
"SELECT
108+
DATEDIFF('second',
109+
TIMESTAMP '2020-01-01 00:00:00.250',
110+
TIMESTAMP '2020-01-01 00:00:01.750') AS sec_1_5,
111+
DATEDIFF('second',
112+
TIMESTAMP '2020-01-01 00:00:00.500',
113+
TIMESTAMP '2020-01-01 00:00:02.900') AS sec_2_4,
114+
DATEDIFF('second',
115+
TIMESTAMP '2020-01-01 00:00:00.000',
116+
TIMESTAMP '2020-01-01 00:00:02.500') AS sec_2_5,
117+
DATEDIFF('hour',
118+
TIMESTAMP '2020-01-01 01:30:00',
119+
TIMESTAMP '2020-01-01 02:30:00') AS hour_1h_two_buckets;",
120+
snapshot_path = "datediff"
121+
);
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
source: crates/functions/src/tests/datetime/datediff.rs
3+
assertion_line: 105
4+
description: "\"SELECT\n DATEDIFF('second',\n TIMESTAMP '2020-01-01 00:00:00.250',\n TIMESTAMP '2020-01-01 00:00:01.750') AS sec_1_5,\n DATEDIFF('second',\n TIMESTAMP '2020-01-01 00:00:00.500',\n TIMESTAMP '2020-01-01 00:00:02.900') AS sec_2_4,\n DATEDIFF('second',\n TIMESTAMP '2020-01-01 00:00:00.000',\n TIMESTAMP '2020-01-01 00:00:02.500') AS sec_2_5,\n DATEDIFF('hour',\n TIMESTAMP '2020-01-01 01:30:00',\n TIMESTAMP '2020-01-01 02:30:00') AS hour_1h_two_buckets;\""
5+
---
6+
Ok(
7+
[
8+
"+---------+---------+---------+---------------------+",
9+
"| sec_1_5 | sec_2_4 | sec_2_5 | hour_1h_two_buckets |",
10+
"+---------+---------+---------+---------------------+",
11+
"| 1 | 2 | 2 | 1 |",
12+
"+---------+---------+---------+---------------------+",
13+
],
14+
)
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
source: crates/functions/src/tests/datetime/datediff.rs
3+
assertion_line: 65
4+
description: "\"SELECT\n DATEDIFF('second',\n TIMESTAMP '2020-01-01 00:00:00.100',\n TIMESTAMP '2020-01-01 00:00:00.900') AS sec_sub,\n DATEDIFF('minute',\n TIMESTAMP '2020-01-01 00:00:05',\n TIMESTAMP '2020-01-01 00:00:55') AS min_sub,\n DATEDIFF('hour',\n TIMESTAMP '2020-01-01 01:30:00',\n TIMESTAMP '2020-01-01 01:50:00') AS hour_sub,\n DATEDIFF('day',\n TIMESTAMP '2020-01-01 08:00:00',\n TIMESTAMP '2020-01-01 20:00:00') AS day_sub;\""
5+
---
6+
Ok(
7+
[
8+
"+---------+---------+----------+---------+",
9+
"| sec_sub | min_sub | hour_sub | day_sub |",
10+
"+---------+---------+----------+---------+",
11+
"| 0 | 0 | 0 | 0 |",
12+
"+---------+---------+----------+---------+",
13+
],
14+
)
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
source: crates/functions/src/tests/datetime/datediff.rs
3+
assertion_line: 85
4+
description: "\"SELECT\n DATEDIFF('second',\n TIMESTAMP '2020-01-01 00:00:00.900',\n TIMESTAMP '2020-01-01 00:00:01.100') AS sec_straddle,\n DATEDIFF('minute',\n TIMESTAMP '2020-01-01 01:00:55',\n TIMESTAMP '2020-01-01 01:01:05') AS min_straddle,\n DATEDIFF('hour',\n TIMESTAMP '2020-01-01 01:55:00',\n TIMESTAMP '2020-01-01 02:05:00') AS hour_straddle,\n DATEDIFF('day',\n TIMESTAMP '2020-01-01 23:00:00',\n TIMESTAMP '2020-01-02 01:00:00') AS day_straddle;\""
5+
---
6+
Ok(
7+
[
8+
"+--------------+--------------+---------------+--------------+",
9+
"| sec_straddle | min_straddle | hour_straddle | day_straddle |",
10+
"+--------------+--------------+---------------+--------------+",
11+
"| 1 | 1 | 1 | 1 |",
12+
"+--------------+--------------+---------------+--------------+",
13+
],
14+
)

crates/functions/src/tests/datetime/snapshots/datediff/query_different_types.snap

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ Ok(
77
"+---------+---------+---------+---------+-----------+",
88
"| ts_date | date_ts | ts_time | time_ts | date_time |",
99
"+---------+---------+---------+---------+-----------+",
10-
"| 6 | 7 | 5 | 5 | 44655 |",
10+
"| 6 | 6 | 5 | 5 | 44655 |",
1111
"+---------+---------+---------+---------+-----------+",
1212
],
1313
)

0 commit comments

Comments
 (0)