Add CalciteAnalyticsDatetimeWireFormatIT regression net for #5420

mengweieric · mengweieric · commit c5aa1d581d13 · 2026-05-21T16:02:48.000-07:00
Wire-format regression coverage for sql#5420. With DatetimeOutputCastRule
deleted (sql#5454) and DatetimeOutputCastRewriter deleted
(opensearch#21748), datetime root columns must reach the user as PPL's
documented `yyyy-MM-dd HH:mm:ss[.SSSSSSSSS]` format with typed schema
labels (`timestamp` / `date` / `time`, never `string`) on the
analytics-engine route.

The IT skips cleanly when `-Dtests.analytics.parquet_indices=true` is
not set — Calcite-legacy was never affected by sql#5420 and asserting
the same contract on it is duplicative noise.

Coverage:
- Wire-format round trip (typed schema + space-separator value) on
  TIMESTAMP / DATE / TIME root columns, plus eval-derived TIMESTAMP and
  `min(ts)` aggregation.
- Datetime processing inside AE (parsing for WHERE comparison, scalar
  extract functions year/month/day_of_month/hour, ORDER BY).
- Nanosecond precision preservation via `date_nanos`.
- Aggregation beyond min(): max(ts), dc(ts).

Each test asserts the query routes to AE (LogicalTableScan with
lowercase `opensearch`) before checking wire format, so a future
regression that silently routes to Calcite-legacy can't leave the
contract green by accident.

Signed-off-by: Eric Wei <mengwei.eric@gmail.com>
diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteAnalyticsDatetimeWireFormatIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteAnalyticsDatetimeWireFormatIT.java
@@ -0,0 +1,291 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.calcite.remote;
+
+import static org.junit.Assume.assumeTrue;
+import static org.opensearch.sql.util.MatcherUtils.rows;
+import static org.opensearch.sql.util.MatcherUtils.schema;
+import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
+import static org.opensearch.sql.util.MatcherUtils.verifyDataRowsInOrder;
+import static org.opensearch.sql.util.MatcherUtils.verifySchema;
+
+import java.io.IOException;
+import org.json.JSONObject;
+import org.junit.Assert;
+import org.junit.jupiter.api.Test;
+import org.opensearch.client.Request;
+import org.opensearch.sql.legacy.TestUtils;
+import org.opensearch.sql.ppl.PPLIntegTestCase;
+
+/**
+ * Wire-format regression coverage for sql#5420.
+ *
+ * <p>Asserts that on the analytics-engine route — i.e. when the suite is started with {@code
+ * -Dtests.analytics.parquet_indices=true} so that {@code RestUnifiedQueryAction.isAnalyticsIndex}
+ * routes the query to DataFusion — datetime root columns reach the user as PPL's documented
+ * space-separator format ({@code "yyyy-MM-dd HH:mm:ss[.SSSSSSSSS]"}) AND retain their typed schema
+ * labels ({@code timestamp}/{@code date}/{@code time}, never {@code string}).
+ *
+ * <p>Before the fix, {@code DatetimeOutputCastRule} (SQL plugin) wrapped every datetime root column
+ * in {@code CAST(... AS VARCHAR)} and {@code DatetimeOutputCastRewriter} (analytics sandbox)
+ * translated that to {@code to_char(ts, '%Y-%m-%d %H:%M:%S%.f')} server-side. Both are removed by
+ * sql#5454 / OpenSearch#21748; the response pipeline now relies on AE returning real datetime cells
+ * and {@code ExprValueUtils.fromObjectValue} → {@code ExprTimestampValue.value()} producing the
+ * documented format. These tests are the regression net for that contract.
+ */
+public class CalciteAnalyticsDatetimeWireFormatIT extends PPLIntegTestCase {
+
+  private static final String INDEX = "wire_format_dt";
+
+  @Override
+  public void init() throws Exception {
+    super.init();
+    // Regression net for the analytics-engine wire-format contract (sql#5420 / sql#5454 /
+    // OpenSearch#21748). Calcite-legacy already emits the documented format through
+    // `ExprTimestampValue.value()` and was never affected, so re-asserting there is duplicative
+    // noise. Skip cleanly unless parquet routing is enabled — the run-mode flag is the single
+    // source of truth (TestUtils.AnalyticsIndexConfig).
+    assumeTrue(
+        "CalciteAnalyticsDatetimeWireFormatIT only meaningful with"
+            + " -Dtests.analytics.parquet_indices=true",
+        isAnalyticsParquetIndicesEnabled());
+    enableCalcite();
+
+    if (TestUtils.isIndexExist(client(), INDEX)) {
+      return; // Index already present: nothing to create or seed.
+    }
+    String indexMapping =
+        "{\"mappings\":{\"properties\":{"
+            + "\"ts\":{\"type\":\"date\",\"format\":\"yyyy-MM-dd HH:mm:ss\"},"
+            + "\"ts_nanos\":{\"type\":\"date_nanos\"},"
+            + "\"d\":{\"type\":\"date\",\"format\":\"yyyy-MM-dd\"},"
+            + "\"t\":{\"type\":\"date\",\"format\":\"HH:mm:ss\"}}}}";
+    TestUtils.createIndexByRestClient(client(), INDEX, indexMapping);
+
+    Request firstDoc = new Request("PUT", "/" + INDEX + "/_doc/1?refresh=true");
+    firstDoc.setJsonEntity(
+        "{\"ts\":\"2024-03-15 10:30:00\","
+            + "\"ts_nanos\":\"2024-03-15T10:30:00.123456789Z\","
+            + "\"d\":\"2024-03-15\","
+            + "\"t\":\"10:30:00\"}");
+    client().performRequest(firstDoc);
+
+    // Second row gives min/max/dc something to discriminate, and seeds an all-nines
+    // sub-second value on `ts_nanos` (`ts` itself is mapped at second granularity).
+    Request secondDoc = new Request("PUT", "/" + INDEX + "/_doc/2?refresh=true");
+    secondDoc.setJsonEntity(
+        "{\"ts\":\"2024-03-16 23:59:59\","
+            + "\"ts_nanos\":\"2024-03-16T23:59:59.999999999Z\","
+            + "\"d\":\"2024-03-16\","
+            + "\"t\":\"23:59:59\"}");
+    client().performRequest(secondDoc);
+  }
+
+  /**
+   * Asserts, from its {@code _explain} plan, that {@code query} routes to the analytics engine:
+   *
+   * <ul>
+   *   <li>AE: {@code LogicalTableScan(table=[[opensearch, ...]])} — lowercase {@code opensearch}.
+   *   <li>Calcite legacy: {@code CalciteLogicalIndexScan(table=[[OpenSearch, ...]])} — capital
+   *       {@code OpenSearch}, {@code CalciteLogicalIndexScan} operator.
+   * </ul>
+   *
+   * <p>Guards against silent rerouting: Calcite legacy already emits the documented format, so
+   * without this check a misrouted query would keep every wire-format assertion green.
+   */
+  private void assertRoutedToAnalyticsEngine(String query) throws IOException {
+    String plan = explainQueryToString(query);
+    boolean scansAnalyticsTable = plan.contains("LogicalTableScan(table=[[opensearch,");
+    boolean scansLegacyIndex = plan.contains("CalciteLogicalIndexScan");
+    Assert.assertTrue(
+        "Expected analytics-engine route (LogicalTableScan + lowercase 'opensearch'), got: "
+            + plan,
+        scansAnalyticsTable);
+    Assert.assertFalse(
+        "Expected analytics-engine route, but query routed to Calcite legacy"
+            + " (CalciteLogicalIndexScan): "
+            + plan,
+        scansLegacyIndex);
+  }
+
+  /* ---------- 1. Wire-format round-trip (typed schema + space-separator value) ---------- */
+
+  /** A TIMESTAMP root column keeps its typed schema label and space-separated value. */
+  @Test
+  public void testTimestampRootColumnSpaceFormat() throws IOException {
+    String ppl = "source=" + INDEX + " | where ts = '2024-03-15 10:30:00' | fields ts";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifySchema(response, schema("ts", "timestamp"));
+    verifyDataRows(response, rows("2024-03-15 10:30:00"));
+  }
+
+  /**
+   * A DATE-mapped root column must come back in the documented space-separator format. AE widens
+   * the date mapping to TIMESTAMP at scan time, which is why the schema label is {@code timestamp}
+   * and the value gains a midnight time portion; what it must never be is the ISO
+   * {@code T}-separator form that sql#5420 was filed against.
+   */
+  @Test
+  public void testDateRootColumnYmdFormat() throws IOException {
+    String ppl = "source=" + INDEX + " | where d = '2024-03-15' | fields d";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifySchema(response, schema("d", "timestamp"));
+    verifyDataRows(response, rows("2024-03-15 00:00:00"));
+  }
+
+  /**
+   * TIME-mapped root column: AE widens it exactly like {@code d}, so the schema label becomes
+   * {@code timestamp} while the value must still use the space separator. PPL rejects a bare
+   * {@code 'HH:mm:ss'} literal in WHERE against a date-mapped column, hence the plain projection
+   * (first row by {@code sort t}) instead of a filter.
+   */
+  @Test
+  public void testTimeRootColumnHmsFormat() throws IOException {
+    String ppl = "source=" + INDEX + " | sort t | head 1 | fields t";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifySchema(response, schema("t", "timestamp"));
+    String firstCell = response.getJSONArray("datarows").getJSONArray(0).getString(0);
+    Assert.assertFalse(
+        "Time-mapped column must not surface as ISO T-separator literal", firstCell.contains("T"));
+  }
+
+  /** An eval-derived TIMESTAMP obeys the same wire-format contract as a root column. */
+  @Test
+  public void testEvalDerivedTimestampSpaceFormat() throws IOException {
+    String ppl =
+        "source=" + INDEX + " | where ts = '2024-03-15 10:30:00' | eval x = ts | fields x";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifySchema(response, schema("x", "timestamp"));
+    verifyDataRows(response, rows("2024-03-15 10:30:00"));
+  }
+
+  /**
+   * Aggregation output honors the contract too: {@code min(ts)} yields a timestamp cell (typed
+   * schema, space-separator value) rather than a stringified ISO-T literal.
+   */
+  @Test
+  public void testStatsMinTimestampSpaceFormat() throws IOException {
+    String ppl = "source=" + INDEX + " | stats min(ts) as min_ts";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifySchema(response, schema("min_ts", "timestamp"));
+    verifyDataRows(response, rows("2024-03-15 10:30:00"));
+  }
+
+  /* ---------- 2. Datetime processing inside AE (parsing, comparison, extraction, sort) ---------- */
+
+  /**
+   * AE must compare the indexed TIMESTAMP cell as a real timestamp, not a string: a bound between
+   * the two seeded rows keeps only the later one, and a bound before both keeps none. Both queries
+   * are route-checked, since Calcite legacy would return the same rows and mask a silent reroute.
+   */
+  @Test
+  public void testTimestampWhereComparisonFiltersCorrectly() throws IOException {
+    String matchQuery = "source=" + INDEX + " | where ts > '2024-03-16 00:00:00' | fields ts";
+    assertRoutedToAnalyticsEngine(matchQuery);
+    JSONObject match = executeQuery(matchQuery);
+    verifySchema(match, schema("ts", "timestamp"));
+    verifyDataRows(match, rows("2024-03-16 23:59:59"));
+
+    String missQuery = "source=" + INDEX + " | where ts < '2024-03-15 00:00:00' | fields ts";
+    assertRoutedToAnalyticsEngine(missQuery);
+    JSONObject miss = executeQuery(missQuery);
+    Assert.assertEquals(
+        "Strict comparison should exclude both rows when bound is before any seeded timestamp",
+        0,
+        miss.getJSONArray("datarows").length());
+  }
+
+  /**
+   * AE must compute on the parsed TIMESTAMP: {@code year(ts)}, {@code month(ts)}, {@code
+   * day_of_month(ts)} and {@code hour(ts)} on the indexed cell must yield the calendar fields,
+   * proving AE did not stringify before extraction.
+   */
+  @Test
+  public void testTimestampScalarExtractFunctions() throws IOException {
+    String ppl =
+        "source="
+            + INDEX
+            + " | where ts = '2024-03-15 10:30:00'"
+            + " | eval y = year(ts), m = month(ts), dm = day_of_month(ts), h = hour(ts) "
+            + "| fields y, m, dm, h";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifySchema(
+        response, schema("y", "int"), schema("m", "int"), schema("dm", "int"), schema("h", "int"));
+    verifyDataRows(response, rows(2024, 3, 15, 10));
+  }
+
+  /**
+   * ORDER BY on the indexed TIMESTAMP must return rows ascending and keep the wire-format contract
+   * on every row: schema stays {@code timestamp}, values use the space separator. Rows are matched
+   * in order because {@code verifyDataRows} ignores ordering. The seeded timestamps fall on
+   * different days, so lexicographic and temporal order coincide; separating them is out of scope.
+   */
+  @Test
+  public void testTimestampOrderByTemporalSemantics() throws IOException {
+    String query = "source=" + INDEX + " | sort ts | fields ts";
+    assertRoutedToAnalyticsEngine(query);
+    JSONObject result = executeQuery(query);
+    verifySchema(result, schema("ts", "timestamp"));
+    verifyDataRowsInOrder(result, rows("2024-03-15 10:30:00"), rows("2024-03-16 23:59:59"));
+  }
+
+  /* ---------- 3. Precision (Chendai's "losing precision" concern) ---------- */
+
+  /**
+   * Nanosecond precision must survive the round trip for both seeded rows. {@code date_nanos}
+   * stores 9-digit sub-second precision, and {@code ExprTimestampValue.value()} renders it with
+   * {@code .SSSSSSSSS}. With {@code DatetimeOutputCastRule} / {@code DatetimeOutputCastRewriter}
+   * gone, the value flows through {@link
+   * org.opensearch.sql.executor.analytics.AnalyticsExecutionEngine#toTimestamp} rather than a
+   * server-side {@code to_char} hard-coded to {@code "%Y-%m-%d %H:%M:%S%.f"}; this test pins the
+   * precision contract of that new path. It catches silent micro-truncation in the bridge or
+   * formatter: had AE quietly downgraded to {@code Time/TimestampMicrosecond}, the trailing 3
+   * digits would read {@code 000}.
+   */
+  @Test
+  public void testTimestampNanoPrecisionTrailingNines() throws IOException {
+    String ppl = "source=" + INDEX + " | sort ts_nanos | fields ts_nanos";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifySchema(response, schema("ts_nanos", "timestamp"));
+    verifyDataRows(
+        response, rows("2024-03-15 10:30:00.123456789"), rows("2024-03-16 23:59:59.999999999"));
+  }
+
+  /* ---------- 4. Aggregation beyond min() (Chendai's "wrong result" concern) ---------- */
+
+  /**
+   * {@code max(ts)} must pick the temporally later row and return it in the documented wire
+   * format: the {@code min} contract, exercised from the other end of the ordering.
+   */
+  @Test
+  public void testStatsMaxTimestampSpaceFormat() throws IOException {
+    String ppl = "source=" + INDEX + " | stats max(ts) as max_ts";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifySchema(response, schema("max_ts", "timestamp"));
+    verifyDataRows(response, rows("2024-03-16 23:59:59"));
+  }
+
+  /**
+   * {@code dc(ts)} over the two distinct seeded timestamps must return 2, i.e. AE's distinct-count
+   * sees each seeded timestamp as its own value. PPL spells distinct-count as {@code dc(...)}
+   * (alias of {@code distinct_count(...)}); SQL's {@code count(distinct ...)} form is rejected by
+   * the PPL parser.
+   */
+  @Test
+  public void testStatsCountDistinctTimestamp() throws IOException {
+    String ppl = "source=" + INDEX + " | stats dc(ts) as n";
+    assertRoutedToAnalyticsEngine(ppl);
+    JSONObject response = executeQuery(ppl);
+    verifyDataRows(response, rows(2));
+  }
+}