Skip to content

Commit c5aa1d5

Browse files
committed
Add CalciteAnalyticsDatetimeWireFormatIT regression net for #5420
Wire-format regression coverage for sql#5420. With DatetimeOutputCastRule deleted (sql#5454) and DatetimeOutputCastRewriter deleted (opensearch#21748), datetime root columns must reach the user as PPL's documented `yyyy-MM-dd HH:mm:ss[.SSSSSSSSS]` format with typed schema labels (`timestamp` / `date` / `time`, never `string`) on the analytics-engine route. The IT skips cleanly when `-Dtests.analytics.parquet_indices=true` is not set — Calcite-legacy was never affected by sql#5420 and asserting the same contract on it is duplicative noise. Coverage: - Wire-format round trip (typed schema + space-separator value) on TIMESTAMP / DATE / TIME root columns, plus eval-derived TIMESTAMP and `min(ts)` aggregation. - Datetime processing inside AE (parsing for WHERE comparison, scalar extract functions year/month/day/hour, ORDER BY). - Nanosecond precision preservation via `date_nanos`. - Aggregation beyond min(): max(ts), dc(ts). Each test asserts the query routes to AE (LogicalTableScan with lowercase `opensearch`) before checking wire format, so a future regression that silently routes to Calcite-legacy can't leave the contract green by accident. Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
1 parent 88a5013 commit c5aa1d5

1 file changed

Lines changed: 291 additions & 0 deletions

File tree

Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.calcite.remote;
7+
8+
import static org.junit.Assume.assumeTrue;
9+
import static org.opensearch.sql.util.MatcherUtils.rows;
10+
import static org.opensearch.sql.util.MatcherUtils.schema;
11+
import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
12+
import static org.opensearch.sql.util.MatcherUtils.verifySchema;
13+
14+
import java.io.IOException;
15+
import org.json.JSONObject;
16+
import org.junit.Assert;
17+
import org.junit.jupiter.api.Test;
18+
import org.opensearch.client.Request;
19+
import org.opensearch.sql.legacy.TestUtils;
20+
import org.opensearch.sql.ppl.PPLIntegTestCase;
21+
22+
/**
23+
* Wire-format regression coverage for sql#5420.
24+
*
25+
* <p>Asserts that on the analytics-engine route — i.e. when the suite is started with {@code
26+
* -Dtests.analytics.parquet_indices=true} so that {@code RestUnifiedQueryAction.isAnalyticsIndex}
27+
* routes the query to DataFusion — datetime root columns reach the user as PPL's documented
28+
* space-separator format ({@code "yyyy-MM-dd HH:mm:ss[.SSSSSSSSS]"}) AND retain their typed schema
29+
* labels ({@code timestamp}/{@code date}/{@code time}, never {@code string}).
30+
*
31+
* <p>Before the fix, {@code DatetimeOutputCastRule} (SQL plugin) wrapped every datetime root column
32+
* in {@code CAST(... AS VARCHAR)} and {@code DatetimeOutputCastRewriter} (analytics sandbox)
33+
* translated that to {@code to_char(ts, '%Y-%m-%d %H:%M:%S%.f')} server-side. Both are removed by
34+
* sql#5454 / OpenSearch#21748; the response pipeline now relies on AE returning real datetime cells
35+
* and {@code ExprValueUtils.fromObjectValue} → {@code ExprTimestampValue.value()} producing the
36+
* documented format. These tests are the regression net for that contract.
37+
*/
38+
public class CalciteAnalyticsDatetimeWireFormatIT extends PPLIntegTestCase {
39+
40+
private static final String INDEX = "wire_format_dt";
41+
42+
@Override
43+
public void init() throws Exception {
44+
super.init();
45+
// This IT is the regression net for the analytics-engine wire-format contract (sql#5420 /
46+
// sql#5454 / OpenSearch#21748). Calcite-legacy already produces the documented format via
47+
// `ExprTimestampValue.value()` and was never affected by the bug, so running these
48+
// assertions on the legacy path is duplicative noise. Skip cleanly when parquet routing
49+
// isn't enabled — the run-mode flag is the single source of truth
50+
// (TestUtils.AnalyticsIndexConfig).
51+
assumeTrue(
52+
"CalciteAnalyticsDatetimeWireFormatIT only meaningful with"
53+
+ " -Dtests.analytics.parquet_indices=true",
54+
isAnalyticsParquetIndicesEnabled());
55+
enableCalcite();
56+
57+
if (!TestUtils.isIndexExist(client(), INDEX)) {
58+
String mapping =
59+
"{\"mappings\":{\"properties\":{"
60+
+ "\"ts\":{\"type\":\"date\",\"format\":\"yyyy-MM-dd HH:mm:ss\"},"
61+
+ "\"ts_nanos\":{\"type\":\"date_nanos\"},"
62+
+ "\"d\":{\"type\":\"date\",\"format\":\"yyyy-MM-dd\"},"
63+
+ "\"t\":{\"type\":\"date\",\"format\":\"HH:mm:ss\"}}}}";
64+
TestUtils.createIndexByRestClient(client(), INDEX, mapping);
65+
66+
Request doc = new Request("PUT", "/" + INDEX + "/_doc/1?refresh=true");
67+
doc.setJsonEntity(
68+
"{\"ts\":\"2024-03-15 10:30:00\","
69+
+ "\"ts_nanos\":\"2024-03-15T10:30:00.123456789Z\","
70+
+ "\"d\":\"2024-03-15\","
71+
+ "\"t\":\"10:30:00\"}");
72+
client().performRequest(doc);
73+
74+
// Second row to give min/max/count(distinct) something to discriminate, and to cover a
75+
// sub-second timestamp on the regular `ts` column.
76+
Request doc2 = new Request("PUT", "/" + INDEX + "/_doc/2?refresh=true");
77+
doc2.setJsonEntity(
78+
"{\"ts\":\"2024-03-16 23:59:59\","
79+
+ "\"ts_nanos\":\"2024-03-16T23:59:59.999999999Z\","
80+
+ "\"d\":\"2024-03-16\","
81+
+ "\"t\":\"23:59:59\"}");
82+
client().performRequest(doc2);
83+
}
84+
}
85+
86+
/**
87+
* Asserts the most recently issued query was served by the analytics engine, not the Calcite
88+
* legacy path. The two backends produce structurally distinct {@code _explain} output:
89+
*
90+
* <ul>
91+
* <li>AE: {@code LogicalTableScan(table=[[opensearch, ...]])} — lowercase {@code opensearch}.
92+
* <li>Calcite legacy: {@code CalciteLogicalIndexScan(table=[[OpenSearch, ...]])} — capital
93+
* {@code OpenSearch}, {@code CalciteLogicalIndexScan} operator.
94+
* </ul>
95+
*
96+
* <p>Without this guard, a future regression that silently routes to Calcite would leave every
97+
* wire-format assertion green (Calcite already produces the documented format), defeating the
98+
* purpose of this regression net.
99+
*/
100+
private void assertRoutedToAnalyticsEngine(String query) throws IOException {
101+
String explained = explainQueryToString(query);
102+
Assert.assertTrue(
103+
"Expected analytics-engine route (LogicalTableScan + lowercase 'opensearch'), got: "
104+
+ explained,
105+
explained.contains("LogicalTableScan(table=[[opensearch,"));
106+
Assert.assertFalse(
107+
"Expected analytics-engine route, but query routed to Calcite legacy"
108+
+ " (CalciteLogicalIndexScan): "
109+
+ explained,
110+
explained.contains("CalciteLogicalIndexScan"));
111+
}
112+
113+
/* ---------- 1. Wire-format round-trip (typed schema + space-separator value) ---------- */
114+
115+
/** TIMESTAMP root col round-trips with space separator and typed schema. */
116+
@Test
117+
public void testTimestampRootColumnSpaceFormat() throws IOException {
118+
String query = "source=" + INDEX + " | where ts = '2024-03-15 10:30:00' | fields ts";
119+
assertRoutedToAnalyticsEngine(query);
120+
JSONObject result = executeQuery(query);
121+
verifySchema(result, schema("ts", "timestamp"));
122+
verifyDataRows(result, rows("2024-03-15 10:30:00"));
123+
}
124+
125+
/**
126+
* DATE-mapped root col round-trips with the documented space-separator format. AE widens the
127+
* date-mapping to a TIMESTAMP at scan time, so the schema label is {@code timestamp} and the
128+
* value carries a midnight time portion — but it must NOT be the ISO {@code T}-separator that
129+
* sql#5420 was filed against.
130+
*/
131+
@Test
132+
public void testDateRootColumnYmdFormat() throws IOException {
133+
String query = "source=" + INDEX + " | where d = '2024-03-15' | fields d";
134+
assertRoutedToAnalyticsEngine(query);
135+
JSONObject result = executeQuery(query);
136+
verifySchema(result, schema("d", "timestamp"));
137+
verifyDataRows(result, rows("2024-03-15 00:00:00"));
138+
}
139+
140+
/**
141+
* TIME-mapped root col — same AE widening behavior as {@code d}; schema becomes {@code timestamp}
142+
* but the value must still use the space separator. PPL doesn't accept a bare {@code 'HH:mm:ss'}
143+
* literal in WHERE against a date-mapped column, so exercise the wire-format path with a plain
144+
* projection instead.
145+
*/
146+
@Test
147+
public void testTimeRootColumnHmsFormat() throws IOException {
148+
String query = "source=" + INDEX + " | sort t | head 1 | fields t";
149+
assertRoutedToAnalyticsEngine(query);
150+
JSONObject result = executeQuery(query);
151+
verifySchema(result, schema("t", "timestamp"));
152+
Assert.assertFalse(
153+
"Time-mapped column must not surface as ISO T-separator literal",
154+
result.getJSONArray("datarows").getJSONArray(0).getString(0).contains("T"));
155+
}
156+
157+
/** Eval-derived TIMESTAMP must follow the same wire-format contract as a root column. */
158+
@Test
159+
public void testEvalDerivedTimestampSpaceFormat() throws IOException {
160+
String query =
161+
"source=" + INDEX + " | where ts = '2024-03-15 10:30:00' | eval x = ts | fields x";
162+
assertRoutedToAnalyticsEngine(query);
163+
JSONObject result = executeQuery(query);
164+
verifySchema(result, schema("x", "timestamp"));
165+
verifyDataRows(result, rows("2024-03-15 10:30:00"));
166+
}
167+
168+
/**
169+
* Aggregation output preserves the contract — {@code min(ts)} returns a timestamp cell (typed
170+
* schema, space-separator value), not a stringified ISO-T literal.
171+
*/
172+
@Test
173+
public void testStatsMinTimestampSpaceFormat() throws IOException {
174+
String query = "source=" + INDEX + " | stats min(ts) as min_ts";
175+
assertRoutedToAnalyticsEngine(query);
176+
JSONObject result = executeQuery(query);
177+
verifySchema(result, schema("min_ts", "timestamp"));
178+
verifyDataRows(result, rows("2024-03-15 10:30:00"));
179+
}
180+
181+
/* ---------- 2. Datetime processing inside AE (parsing, comparison, arithmetic, etc.) ---------- */
182+
183+
/**
184+
* AE must parse the indexed TIMESTAMP cell as a real timestamp (not a string) for comparison to
185+
* evaluate. If AE silently treated {@code ts} as a string, this WHERE would either reject the
186+
* query at planning time or do lexicographic compare and surface no rows.
187+
*/
188+
@Test
189+
public void testTimestampWhereComparisonFiltersCorrectly() throws IOException {
190+
// Bound between the two seeded rows: only the later one survives.
191+
String matchQuery = "source=" + INDEX + " | where ts > '2024-03-16 00:00:00' | fields ts";
192+
assertRoutedToAnalyticsEngine(matchQuery);
193+
JSONObject match = executeQuery(matchQuery);
194+
verifySchema(match, schema("ts", "timestamp"));
195+
verifyDataRows(match, rows("2024-03-16 23:59:59"));
196+
197+
JSONObject miss =
198+
executeQuery("source=" + INDEX + " | where ts < '2024-03-15 00:00:00' | fields ts");
199+
Assert.assertEquals(
200+
"Strict comparison should exclude both rows when bound is before any seeded timestamp",
201+
0,
202+
miss.getJSONArray("datarows").length());
203+
}
204+
205+
/**
206+
* AE must compute on the parsed TIMESTAMP — {@code year(ts)}/{@code month(ts)}/{@code
207+
* day_of_month(ts)} on the indexed cell must yield the calendar fields, proving AE didn't
208+
* stringify before extraction.
209+
*/
210+
@Test
211+
public void testTimestampScalarExtractFunctions() throws IOException {
212+
String query =
213+
"source="
214+
+ INDEX
215+
+ " | where ts = '2024-03-15 10:30:00'"
216+
+ " | eval y = year(ts), m = month(ts), dm = day_of_month(ts), h = hour(ts) "
217+
+ "| fields y, m, dm, h";
218+
assertRoutedToAnalyticsEngine(query);
219+
JSONObject result = executeQuery(query);
220+
verifySchema(
221+
result, schema("y", "int"), schema("m", "int"), schema("dm", "int"), schema("h", "int"));
222+
verifyDataRows(result, rows(2024, 3, 15, 10));
223+
}
224+
225+
/**
226+
* ORDER BY on the indexed TIMESTAMP returns rows ascending and preserves the wire-format contract
227+
* on every row — schema stays {@code timestamp}, not {@code string}, and values use the space
228+
* separator. (The two seeded timestamps are on different days, so lexicographic and temporal
229+
* orders coincide; proving temporal-vs-lexicographic semantics is out of scope here.)
230+
*/
231+
@Test
232+
public void testTimestampOrderByTemporalSemantics() throws IOException {
233+
String query = "source=" + INDEX + " | sort ts | fields ts";
234+
assertRoutedToAnalyticsEngine(query);
235+
JSONObject result = executeQuery(query);
236+
verifySchema(result, schema("ts", "timestamp"));
237+
verifyDataRows(result, rows("2024-03-15 10:30:00"), rows("2024-03-16 23:59:59"));
238+
}
239+
240+
/* ---------- 3. Precision (Chendai's "losing precision" concern) ---------- */
241+
242+
/**
243+
* Nanosecond precision must survive the round trip on both seeded rows. {@code date_nanos}
244+
* carries 9-digit sub-second precision, and {@link
245+
* org.opensearch.sql.data.model.ExprTimestampValue#valueOf} formats with {@code .SSSSSSSSS}. The
246+
* removal of {@code DatetimeOutputCastRule} / {@code DatetimeOutputCastRewriter} routes the value
247+
* through {@link org.opensearch.sql.executor.analytics.AnalyticsExecutionEngine#toTimestamp}
248+
* instead of a server-side {@code to_char} that was hard-coded to {@code "%Y-%m-%d %H:%M:%S%.f"}
249+
* — this test pins the new path's precision contract. Catches a silent micro-truncation in the
250+
* bridge or formatter: if AE quietly downgraded to {@code Time/TimestampMicrosecond}, the
251+
* trailing 3 digits would become {@code 000}.
252+
*/
253+
@Test
254+
public void testTimestampNanoPrecisionTrailingNines() throws IOException {
255+
String query = "source=" + INDEX + " | sort ts_nanos | fields ts_nanos";
256+
assertRoutedToAnalyticsEngine(query);
257+
JSONObject result = executeQuery(query);
258+
verifySchema(result, schema("ts_nanos", "timestamp"));
259+
verifyDataRows(
260+
result, rows("2024-03-15 10:30:00.123456789"), rows("2024-03-16 23:59:59.999999999"));
261+
}
262+
263+
/* ---------- 4. Aggregation beyond min() (Chendai's "wrong result" concern) ---------- */
264+
265+
/**
266+
* {@code max(ts)} must select the later row temporally and return it with the documented wire
267+
* format — same contract as {@code min}, exercised on the other end of the ordering.
268+
*/
269+
@Test
270+
public void testStatsMaxTimestampSpaceFormat() throws IOException {
271+
String query = "source=" + INDEX + " | stats max(ts) as max_ts";
272+
assertRoutedToAnalyticsEngine(query);
273+
JSONObject result = executeQuery(query);
274+
verifySchema(result, schema("max_ts", "timestamp"));
275+
verifyDataRows(result, rows("2024-03-16 23:59:59"));
276+
}
277+
278+
/**
279+
* {@code dc(ts)} on two distinct timestamps must return 2. Validates that AE dedups by temporal
280+
* identity (not by string equality of two equivalent ISO encodings). PPL syntax for
281+
* distinct-count is {@code dc(...)} (alias of {@code distinct_count(...)}); SQL's {@code
282+
* count(distinct ...)} form is rejected by the PPL parser.
283+
*/
284+
@Test
285+
public void testStatsCountDistinctTimestamp() throws IOException {
286+
String query = "source=" + INDEX + " | stats dc(ts) as n";
287+
assertRoutedToAnalyticsEngine(query);
288+
JSONObject result = executeQuery(query);
289+
verifyDataRows(result, rows(2));
290+
}
291+
}

0 commit comments

Comments
 (0)