Skip to content

Commit b5e8976

Browse files
committed
Add CalciteAnalyticsDatetimeWireFormatIT regression net for #5420
Wire-format regression coverage for sql#5420. With DatetimeOutputCastRule deleted (sql#5454) and DatetimeOutputCastRewriter deleted (opensearch#21748), datetime root columns must reach the user as PPL's documented `yyyy-MM-dd HH:mm:ss[.SSSSSSSSS]` format with typed schema labels (`timestamp` / `date` / `time`, never `string`) on the analytics-engine route. The IT skips cleanly when `-Dtests.analytics.parquet_indices=true` is not set — Calcite-legacy was never affected by sql#5420 and asserting the same contract on it is duplicative noise. Coverage: - Wire-format round trip (typed schema + space-separator value) on TIMESTAMP / DATE / TIME root columns, plus eval-derived TIMESTAMP and `min(ts)` aggregation. - Datetime processing inside AE (parsing for WHERE comparison, scalar extract functions year/month/day/hour, ORDER BY). - Nanosecond precision preservation via `date_nanos`. - Aggregation beyond min(): max(ts), dc(ts). Each test asserts the query routes to AE (LogicalTableScan with lowercase `opensearch`) before checking wire format, so a future regression that silently routes to Calcite-legacy can't leave the contract green by accident. Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
1 parent cf16199 commit b5e8976

1 file changed

Lines changed: 296 additions & 0 deletions

File tree

Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.calcite.remote;
7+
8+
import static org.junit.Assume.assumeTrue;
9+
import static org.opensearch.sql.util.MatcherUtils.rows;
10+
import static org.opensearch.sql.util.MatcherUtils.schema;
11+
import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
12+
import static org.opensearch.sql.util.MatcherUtils.verifySchema;
13+
14+
import java.io.IOException;
15+
import org.json.JSONObject;
16+
import org.junit.Assert;
17+
import org.junit.jupiter.api.Test;
18+
import org.opensearch.client.Request;
19+
import org.opensearch.sql.legacy.TestUtils;
20+
import org.opensearch.sql.ppl.PPLIntegTestCase;
21+
22+
/**
23+
* Wire-format regression coverage for sql#5420.
24+
*
25+
* <p>Asserts that on the analytics-engine route — i.e. when the suite is started with {@code
26+
* -Dtests.analytics.parquet_indices=true} so that {@code RestUnifiedQueryAction.isAnalyticsIndex}
27+
* routes the query to DataFusion — datetime root columns reach the user as PPL's documented
28+
* space-separator format ({@code "yyyy-MM-dd HH:mm:ss[.SSSSSSSSS]"}) AND retain their typed
29+
* schema labels ({@code timestamp}/{@code date}/{@code time}, never {@code string}).
30+
*
31+
* <p>Before the fix, {@code DatetimeOutputCastRule} (SQL plugin) wrapped every datetime root
32+
* column in {@code CAST(... AS VARCHAR)} and {@code DatetimeOutputCastRewriter} (analytics
33+
* sandbox) translated that to {@code to_char(ts, '%Y-%m-%d %H:%M:%S%.f')} server-side. Both are
34+
* removed by sql#5454 / OpenSearch#21748; the response pipeline now relies on AE returning real
35+
* datetime cells and {@code ExprValueUtils.fromObjectValue} → {@code ExprTimestampValue.value()}
36+
* producing the documented format. These tests are the regression net for that contract.
37+
*/
38+
public class CalciteAnalyticsDatetimeWireFormatIT extends PPLIntegTestCase {
39+
40+
private static final String INDEX = "wire_format_dt";
41+
42+
@Override
43+
public void init() throws Exception {
44+
super.init();
45+
// This IT is the regression net for the analytics-engine wire-format contract (sql#5420 /
46+
// sql#5454 / OpenSearch#21748). Calcite-legacy already produces the documented format via
47+
// `ExprTimestampValue.value()` and was never affected by the bug, so running these
48+
// assertions on the legacy path is duplicative noise. Skip cleanly when parquet routing
49+
// isn't enabled — the run-mode flag is the single source of truth (TestUtils.AnalyticsIndexConfig).
50+
assumeTrue(
51+
"CalciteAnalyticsDatetimeWireFormatIT only meaningful with -Dtests.analytics.parquet_indices=true",
52+
isAnalyticsParquetIndicesEnabled());
53+
enableCalcite();
54+
55+
if (!TestUtils.isIndexExist(client(), INDEX)) {
56+
String mapping =
57+
"{\"mappings\":{\"properties\":{"
58+
+ "\"ts\":{\"type\":\"date\",\"format\":\"yyyy-MM-dd HH:mm:ss\"},"
59+
+ "\"ts_nanos\":{\"type\":\"date_nanos\"},"
60+
+ "\"d\":{\"type\":\"date\",\"format\":\"yyyy-MM-dd\"},"
61+
+ "\"t\":{\"type\":\"date\",\"format\":\"HH:mm:ss\"}}}}";
62+
TestUtils.createIndexByRestClient(client(), INDEX, mapping);
63+
64+
Request doc = new Request("PUT", "/" + INDEX + "/_doc/1?refresh=true");
65+
doc.setJsonEntity(
66+
"{\"ts\":\"2024-03-15 10:30:00\","
67+
+ "\"ts_nanos\":\"2024-03-15T10:30:00.123456789Z\","
68+
+ "\"d\":\"2024-03-15\","
69+
+ "\"t\":\"10:30:00\"}");
70+
client().performRequest(doc);
71+
72+
// Second row to give min/max/count(distinct) something to discriminate, and to cover a
73+
// sub-second timestamp on the regular `ts` column.
74+
Request doc2 = new Request("PUT", "/" + INDEX + "/_doc/2?refresh=true");
75+
doc2.setJsonEntity(
76+
"{\"ts\":\"2024-03-16 23:59:59\","
77+
+ "\"ts_nanos\":\"2024-03-16T23:59:59.999999999Z\","
78+
+ "\"d\":\"2024-03-16\","
79+
+ "\"t\":\"23:59:59\"}");
80+
client().performRequest(doc2);
81+
}
82+
}
83+
84+
/**
85+
* Asserts the most recently issued query was served by the analytics engine, not the Calcite
86+
* legacy path. The two backends produce structurally distinct {@code _explain} output:
87+
*
88+
* <ul>
89+
* <li>AE: {@code LogicalTableScan(table=[[opensearch, ...]])} — lowercase {@code opensearch}.
90+
* <li>Calcite legacy: {@code CalciteLogicalIndexScan(table=[[OpenSearch, ...]])} — capital
91+
* {@code OpenSearch}, {@code CalciteLogicalIndexScan} operator.
92+
* </ul>
93+
*
94+
* <p>Without this guard, a future regression that silently routes to Calcite would leave every
95+
* wire-format assertion green (Calcite already produces the documented format), defeating the
96+
* purpose of this regression net.
97+
*/
98+
private void assertRoutedToAnalyticsEngine(String query) throws IOException {
99+
String explained = explainQueryToString(query);
100+
Assert.assertTrue(
101+
"Expected analytics-engine route (LogicalTableScan + lowercase 'opensearch'), got: "
102+
+ explained,
103+
explained.contains("LogicalTableScan(table=[[opensearch,"));
104+
Assert.assertFalse(
105+
"Expected analytics-engine route, but query routed to Calcite legacy (CalciteLogicalIndexScan): "
106+
+ explained,
107+
explained.contains("CalciteLogicalIndexScan"));
108+
}
109+
110+
/* ---------- 1. Wire-format round-trip (typed schema + space-separator value) ---------- */
111+
112+
/** TIMESTAMP root col round-trips with space separator and typed schema. */
113+
@Test
114+
public void testTimestampRootColumnSpaceFormat() throws IOException {
115+
String query = "source=" + INDEX + " | where ts = '2024-03-15 10:30:00' | fields ts";
116+
assertRoutedToAnalyticsEngine(query);
117+
JSONObject result = executeQuery(query);
118+
verifySchema(result, schema("ts", "timestamp"));
119+
verifyDataRows(result, rows("2024-03-15 10:30:00"));
120+
}
121+
122+
/**
123+
* DATE-mapped root col round-trips with the documented space-separator format. AE widens the
124+
* date-mapping to a TIMESTAMP at scan time, so the schema label is {@code timestamp} and the
125+
* value carries a midnight time portion — but it must NOT be the ISO {@code T}-separator that
126+
* sql#5420 was filed against.
127+
*/
128+
@Test
129+
public void testDateRootColumnYmdFormat() throws IOException {
130+
String query = "source=" + INDEX + " | where d = '2024-03-15' | fields d";
131+
assertRoutedToAnalyticsEngine(query);
132+
JSONObject result = executeQuery(query);
133+
verifySchema(result, schema("d", "timestamp"));
134+
verifyDataRows(result, rows("2024-03-15 00:00:00"));
135+
}
136+
137+
/**
138+
* TIME-mapped root col — same AE widening behavior as {@code d}; schema becomes {@code
139+
* timestamp} but the value must still use the space separator. PPL doesn't accept a bare
140+
* {@code 'HH:mm:ss'} literal in WHERE against a date-mapped column, so exercise the wire-format
141+
* path with a plain projection instead.
142+
*/
143+
@Test
144+
public void testTimeRootColumnHmsFormat() throws IOException {
145+
String query = "source=" + INDEX + " | sort t | head 1 | fields t";
146+
assertRoutedToAnalyticsEngine(query);
147+
JSONObject result = executeQuery(query);
148+
verifySchema(result, schema("t", "timestamp"));
149+
Assert.assertFalse(
150+
"Time-mapped column must not surface as ISO T-separator literal",
151+
result.getJSONArray("datarows").getJSONArray(0).getString(0).contains("T"));
152+
}
153+
154+
/** Eval-derived TIMESTAMP must follow the same wire-format contract as a root column. */
155+
@Test
156+
public void testEvalDerivedTimestampSpaceFormat() throws IOException {
157+
String query =
158+
"source=" + INDEX + " | where ts = '2024-03-15 10:30:00' | eval x = ts | fields x";
159+
assertRoutedToAnalyticsEngine(query);
160+
JSONObject result = executeQuery(query);
161+
verifySchema(result, schema("x", "timestamp"));
162+
verifyDataRows(result, rows("2024-03-15 10:30:00"));
163+
}
164+
165+
/**
166+
* Aggregation output preserves the contract — {@code min(ts)} returns a timestamp cell (typed
167+
* schema, space-separator value), not a stringified ISO-T literal.
168+
*/
169+
@Test
170+
public void testStatsMinTimestampSpaceFormat() throws IOException {
171+
String query = "source=" + INDEX + " | stats min(ts) as min_ts";
172+
assertRoutedToAnalyticsEngine(query);
173+
JSONObject result = executeQuery(query);
174+
verifySchema(result, schema("min_ts", "timestamp"));
175+
verifyDataRows(result, rows("2024-03-15 10:30:00"));
176+
}
177+
178+
/* ---------- 2. Datetime processing inside AE (parsing, comparison, arithmetic, etc.) ---------- */
179+
180+
/**
181+
* AE must parse the indexed TIMESTAMP cell as a real timestamp (not a string) for comparison
182+
* to evaluate. If AE silently treated {@code ts} as a string, this WHERE would either reject
183+
* the query at planning time or do lexicographic compare and surface no rows.
184+
*/
185+
@Test
186+
public void testTimestampWhereComparisonFiltersCorrectly() throws IOException {
187+
// Bound between the two seeded rows: only the later one survives.
188+
String matchQuery =
189+
"source=" + INDEX + " | where ts > '2024-03-16 00:00:00' | fields ts";
190+
assertRoutedToAnalyticsEngine(matchQuery);
191+
JSONObject match = executeQuery(matchQuery);
192+
verifySchema(match, schema("ts", "timestamp"));
193+
verifyDataRows(match, rows("2024-03-16 23:59:59"));
194+
195+
JSONObject miss =
196+
executeQuery("source=" + INDEX + " | where ts < '2024-03-15 00:00:00' | fields ts");
197+
Assert.assertEquals(
198+
"Strict comparison should exclude both rows when bound is before any seeded timestamp",
199+
0,
200+
miss.getJSONArray("datarows").length());
201+
}
202+
203+
/**
204+
* AE must compute on the parsed TIMESTAMP — {@code year(ts)}/{@code month(ts)}/{@code
205+
* day_of_month(ts)} on the indexed cell must yield the calendar fields, proving AE didn't
206+
* stringify before extraction.
207+
*/
208+
@Test
209+
public void testTimestampScalarExtractFunctions() throws IOException {
210+
String query =
211+
"source="
212+
+ INDEX
213+
+ " | where ts = '2024-03-15 10:30:00'"
214+
+ " | eval y = year(ts), m = month(ts), dm = day_of_month(ts), h = hour(ts) "
215+
+ "| fields y, m, dm, h";
216+
assertRoutedToAnalyticsEngine(query);
217+
JSONObject result = executeQuery(query);
218+
verifySchema(
219+
result,
220+
schema("y", "int"),
221+
schema("m", "int"),
222+
schema("dm", "int"),
223+
schema("h", "int"));
224+
verifyDataRows(result, rows(2024, 3, 15, 10));
225+
}
226+
227+
/**
228+
* ORDER BY on the indexed TIMESTAMP returns rows ascending and preserves the wire-format
229+
* contract on every row — schema stays {@code timestamp}, not {@code string}, and values use
230+
* the space separator. (The two seeded timestamps are on different days, so lexicographic and
231+
* temporal orders coincide; proving temporal-vs-lexicographic semantics is out of scope here.)
232+
*/
233+
@Test
234+
public void testTimestampOrderByTemporalSemantics() throws IOException {
235+
String query = "source=" + INDEX + " | sort ts | fields ts";
236+
assertRoutedToAnalyticsEngine(query);
237+
JSONObject result = executeQuery(query);
238+
verifySchema(result, schema("ts", "timestamp"));
239+
verifyDataRows(result, rows("2024-03-15 10:30:00"), rows("2024-03-16 23:59:59"));
240+
}
241+
242+
/* ---------- 3. Precision (Chendai's "losing precision" concern) ---------- */
243+
244+
/**
245+
* Nanosecond precision must survive the round trip on both seeded rows. {@code date_nanos}
246+
* carries 9-digit sub-second precision, and {@link
247+
* org.opensearch.sql.data.model.ExprTimestampValue#valueOf} formats with {@code .SSSSSSSSS}.
248+
* The removal of {@code DatetimeOutputCastRule} / {@code DatetimeOutputCastRewriter} routes
249+
* the value through {@link
250+
* org.opensearch.sql.executor.analytics.AnalyticsExecutionEngine#toTimestamp} instead of a
251+
* server-side {@code to_char} that was hard-coded to {@code "%Y-%m-%d %H:%M:%S%.f"} — this
252+
* test pins the new path's precision contract. Catches a silent micro-truncation in the bridge
253+
* or formatter: if AE quietly downgraded to {@code Time/TimestampMicrosecond}, the trailing 3
254+
* digits would become {@code 000}.
255+
*/
256+
@Test
257+
public void testTimestampNanoPrecisionTrailingNines() throws IOException {
258+
String query = "source=" + INDEX + " | sort ts_nanos | fields ts_nanos";
259+
assertRoutedToAnalyticsEngine(query);
260+
JSONObject result = executeQuery(query);
261+
verifySchema(result, schema("ts_nanos", "timestamp"));
262+
verifyDataRows(
263+
result,
264+
rows("2024-03-15 10:30:00.123456789"),
265+
rows("2024-03-16 23:59:59.999999999"));
266+
}
267+
268+
/* ---------- 4. Aggregation beyond min() (Chendai's "wrong result" concern) ---------- */
269+
270+
/**
271+
* {@code max(ts)} must select the later row temporally and return it with the documented
272+
* wire format — same contract as {@code min}, exercised on the other end of the ordering.
273+
*/
274+
@Test
275+
public void testStatsMaxTimestampSpaceFormat() throws IOException {
276+
String query = "source=" + INDEX + " | stats max(ts) as max_ts";
277+
assertRoutedToAnalyticsEngine(query);
278+
JSONObject result = executeQuery(query);
279+
verifySchema(result, schema("max_ts", "timestamp"));
280+
verifyDataRows(result, rows("2024-03-16 23:59:59"));
281+
}
282+
283+
/**
284+
* {@code dc(ts)} on two distinct timestamps must return 2. Validates that AE dedups by
285+
* temporal identity (not by string equality of two equivalent ISO encodings). PPL syntax for
286+
* distinct-count is {@code dc(...)} (alias of {@code distinct_count(...)}); SQL's {@code
287+
* count(distinct ...)} form is rejected by the PPL parser.
288+
*/
289+
@Test
290+
public void testStatsCountDistinctTimestamp() throws IOException {
291+
String query = "source=" + INDEX + " | stats dc(ts) as n";
292+
assertRoutedToAnalyticsEngine(query);
293+
JSONObject result = executeQuery(query);
294+
verifyDataRows(result, rows(2));
295+
}
296+
}

0 commit comments

Comments
 (0)