Bring CalcitePPLJoinIT to parity on the analytics-engine route

songkant-aws · songkant-aws · commit 41c50522082e · 2026-06-16T07:19:24.000Z
CalcitePPLJoinIT failed on the analytics-engine route (parquet/composite store + DataFusion backend, -Dtests.analytics.parquet_indices=true) for three distinct reasons, none of which are real query defects. This brings the class to parity without weakening the assertions. 1. Shared-index pollution from a non-idempotent seed (the big one). init() runs before every test method (@before) and, with preserveClusterUponCompletion()=true, the state_country index is created once and reused across all methods. init() unconditionally PUT _doc/5..8 after loadIndex(). On the standard route those PUTs overwrite by _id and are harmless to repeat; on the analytics-engine route the parquet/composite store is append-only and does not overwrite by _id, so every method's init() appended 4 more duplicate rows. The shared index grew unboundedly and joins over it inflated (e.g. expected 6, got 60; self-joins far worse). Guard the seed with a static flag so it runs once per class load — the standard route ends at the same stable 8-row state_country it always did. 2. Column ordering. The analytics-engine route builds its scan schema from the serialized index mapping (getSourceAsMap), which OpenSearch returns in alphabetical field order, whereas the v2/Calcite path preserves declared order. Field-list/implicit-projection joins therefore returned the right rows with columns in a different order. Add explicit `| fields ...` to pin the projection order for the affected tests (testComplexSemiJoin, testComplexAntiJoin, testComplexSortPushDownForSMJWithMaxOptionAndFieldList). 3. Row ordering. The analytics-engine coordinator-reduce (RowProducingSink) appends Arrow batches in arrival order from the SEARCH-threadpool response handlers, so a query without ORDER BY has no guaranteed row order — unlike Calcite's deterministic enumerable execution. Cases: - testComplexRightJoin sorts by a column that is null for the right-only rows, leaving their relative order unspecified; switch verifyDataRowsInOrder -> verifyDataRows. - testInnerJoinWithRelationSubquery ends in `stats ... by` with no ORDER BY, so the two output groups come back in a route-dependent order (flaky); switch verifyDataRowsInOrder -> verifyDataRows. - The testCheckAccessTheReference* tests compare two alias-syntax variants to each other via assertJsonEquals on serialized datarows, which is order-sensitive. They only mean to assert the two variants return the same set of rows. Add MatcherUtils.assertJsonRowsEqualIgnoreOrder (multiset compare) and use it for those comparisons. Verified: standard route 43/43 pass (no regression); analytics-engine route drops from 33 failures to only the two remaining exact-equality-on-bare-text cases (testJoinComparing, testJoinSubsearchMaxOut), a separate route limitation (DYNAMIC_STRING_NO_KEYWORD) tracked elsewhere. Signed-off-by: Songkan Tang <songkant@amazon.com>
diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLJoinIT.java
@@ -9,6 +9,7 @@
 import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_OCCUPATION;
 import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_STATE_COUNTRY;
 import static org.opensearch.sql.util.MatcherUtils.assertJsonEquals;
+import static org.opensearch.sql.util.MatcherUtils.assertJsonRowsEqualIgnoreOrder;
 import static org.opensearch.sql.util.MatcherUtils.rows;
 import static org.opensearch.sql.util.MatcherUtils.schema;
 import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
@@ -26,6 +27,19 @@
 
 public class CalcitePPLJoinIT extends PPLIntegTestCase {
 
+  /**
+   * Guards the one-time seeding of extra docs (_id 5..8) into the shared {@code state_country}
+   * index below. {@link #init()} runs before every test method (@Before), and {@code
+   * preserveClusterUponCompletion()} keeps indices alive across methods, so the index is created
+   * once and reused. On the standard route these PUTs overwrite by {@code _id} and are harmless to
+   * repeat, but on the analytics-engine route the parquet/composite store is append-only and does
+   * not overwrite by {@code _id} — re-running them every method accumulates duplicate rows, which
+   * inflates downstream join row counts. Seeding once keeps both routes at a stable 8-row {@code
+   * state_country}. Static so it is set once per class load and reset implicitly when the class
+   * (and its preserved indices, wiped @AfterClass) goes away.
+   */
+  private static boolean stateCountrySeeded = false;
+
   @Override
   public void init() throws Exception {
     super.init();
@@ -35,26 +49,33 @@ public void init() throws Exception {
     loadIndex(Index.STATE_COUNTRY);
     loadIndex(Index.OCCUPATION);
     loadIndex(Index.HOBBIES);
-    Request request1 =
-        new Request("PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/5?refresh=true");
-    request1.setJsonEntity(
-        "{\"name\":\"Jim\",\"age\":27,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
-    client().performRequest(request1);
-    Request request2 =
-        new Request("PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/6?refresh=true");
-    request2.setJsonEntity(
-        "{\"name\":\"Peter\",\"age\":57,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
-    client().performRequest(request2);
-    Request request3 =
-        new Request("PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/7?refresh=true");
-    request3.setJsonEntity(
-        "{\"name\":\"Rick\",\"age\":70,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
-    client().performRequest(request3);
-    Request request4 =
-        new Request("PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/8?refresh=true");
-    request4.setJsonEntity(
-        "{\"name\":\"David\",\"age\":40,\"state\":\"Washington\",\"country\":\"USA\",\"year\":2023,\"month\":4}");
-    client().performRequest(request4);
+    if (!stateCountrySeeded) {
+      Request request1 =
+          new Request(
+              "PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/5?refresh=true");
+      request1.setJsonEntity(
+          "{\"name\":\"Jim\",\"age\":27,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
+      client().performRequest(request1);
+      Request request2 =
+          new Request(
+              "PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/6?refresh=true");
+      request2.setJsonEntity(
+          "{\"name\":\"Peter\",\"age\":57,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
+      client().performRequest(request2);
+      Request request3 =
+          new Request(
+              "PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/7?refresh=true");
+      request3.setJsonEntity(
+          "{\"name\":\"Rick\",\"age\":70,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
+      client().performRequest(request3);
+      Request request4 =
+          new Request(
+              "PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/8?refresh=true");
+      request4.setJsonEntity(
+          "{\"name\":\"David\",\"age\":40,\"state\":\"Washington\",\"country\":\"USA\",\"year\":2023,\"month\":4}");
+      client().performRequest(request4);
+      stateCountrySeeded = true;
+    }
   }
 
   @Test
@@ -239,7 +260,11 @@ public void testComplexRightJoin() throws IOException {
         schema("occupation", "string"),
         schema("b.country", "string"),
         schema("salary", "int"));
-    verifyDataRowsInOrder(
+    // The four right-only rows all have a null a.age, so `sort a.age` leaves their relative order
+    // unspecified — DataFusion (analytics-engine route) breaks the tie differently than the
+    // v2/Calcite path. Assert membership rather than position; the four null-age rows and the two
+    // matched rows are all present.
+    verifyDataRows(
         actual,
         rows(null, null, null, null, "Engineer", "England", 100000),
         rows(null, null, null, null, "Artist", "USA", 70000),
@@ -255,7 +280,8 @@ public void testComplexSemiJoin() throws IOException {
         executeQuery(
             String.format(
                 "source = %s | where country = 'Canada' OR country = 'England' | left semi join"
-                    + " left=a, right=b ON a.name = b.name %s | sort a.age",
+                    + " left=a, right=b ON a.name = b.name %s | sort a.age | fields name, country,"
+                    + " state, month, year, age",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
     verifySchema(
         actual,
@@ -277,7 +303,8 @@ public void testComplexAntiJoin() throws IOException {
         executeQuery(
             String.format(
                 "source = %s | where country = 'Canada' OR country = 'England' | left anti join"
-                    + " left=a, right=b ON a.name = b.name %s | sort a.age",
+                    + " left=a, right=b ON a.name = b.name %s | sort a.age | fields name, country,"
+                    + " state, month, year, age",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
     verifySchema(
         actual,
@@ -529,7 +556,7 @@ public void testCheckAccessTheReferenceByAliases() throws IOException {
             String.format(
                 "source = %s as t1 | JOIN ON t1.name = t2.name %s as t2 | fields t1.name, t2.name",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res1.getJSONArray("datarows").toString(), res2.getJSONArray("datarows").toString());
 
     JSONObject res3 =
@@ -550,9 +577,9 @@ public void testCheckAccessTheReferenceByAliases() throws IOException {
                 "source = %s as tt | JOIN left = t1 ON t1.name = t2.name %s as t2 | fields"
                     + " t1.name",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res3.getJSONArray("datarows").toString(), res4.getJSONArray("datarows").toString());
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res4.getJSONArray("datarows").toString(), res5.getJSONArray("datarows").toString());
   }
 
@@ -570,7 +597,7 @@ public void testCheckAccessTheReferenceBySubqueryAliases() throws IOException {
                 "source = %s | JOIN left = t1 ON t1.name = t2.name [ source = %s as t2 ] | fields"
                     + " t1.name, t2.name",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res1.getJSONArray("datarows").toString(), res2.getJSONArray("datarows").toString());
 
     JSONObject res3 =
@@ -591,9 +618,9 @@ public void testCheckAccessTheReferenceBySubqueryAliases() throws IOException {
                 "source = %s | JOIN left = t1 right = t2 ON t1.name = t2.name [ source = %s ]"
                     + " as tt | fields tt.name",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res3.getJSONArray("datarows").toString(), res4.getJSONArray("datarows").toString());
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res4.getJSONArray("datarows").toString(), res5.getJSONArray("datarows").toString());
   }
 
@@ -617,9 +644,9 @@ public void testCheckAccessTheReferenceByOverrideAliases() throws IOException {
                 "source = %s as tt | JOIN left = t1 ON t1.name = t2.name %s as t2 | fields"
                     + " t1.name",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res1.getJSONArray("datarows").toString(), res2.getJSONArray("datarows").toString());
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res1.getJSONArray("datarows").toString(), res3.getJSONArray("datarows").toString());
   }
 
@@ -643,9 +670,9 @@ public void testCheckAccessTheReferenceByOverrideSubqueryAliases() throws IOExce
                 "source = %s | JOIN left = t1 right = t2 ON t1.name = t2.name [ source = %s ] as tt"
                     + " | fields tt.name",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res1.getJSONArray("datarows").toString(), res2.getJSONArray("datarows").toString());
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res1.getJSONArray("datarows").toString(), res3.getJSONArray("datarows").toString());
   }
 
@@ -669,9 +696,9 @@ public void testCheckAccessTheReferenceByOverrideSubqueryAliases2() throws IOExc
                 "source = %s | JOIN left = t1 right = t2 ON t1.name = t2.name [ source = %s ] as tt"
                     + " | fields t2.name",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_OCCUPATION));
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res1.getJSONArray("datarows").toString(), res2.getJSONArray("datarows").toString());
-    assertJsonEquals(
+    assertJsonRowsEqualIgnoreOrder(
         res1.getJSONArray("datarows").toString(), res3.getJSONArray("datarows").toString());
   }
 
@@ -698,7 +725,10 @@ public void testInnerJoinWithRelationSubquery() throws IOException {
         schema("avg(salary)", "double"),
         schema("age_span", "int"),
         schema("b.country", "string"));
-    verifyDataRowsInOrder(actual, rows(70000.0, 30, "USA"), rows(100000, 70, "England"));
+    // The final `stats ... by` output has no ORDER BY, so row order is unspecified — the
+    // analytics-engine route (RowProducingSink appends batches in arrival order) emits the two
+    // groups in a different order than the v2/Calcite path. Assert membership, not position.
+    verifyDataRows(actual, rows(70000.0, 30, "USA"), rows(100000, 70, "England"));
   }
 
   @Test
@@ -1161,7 +1191,8 @@ public void testComplexSortPushDownForSMJWithMaxOptionAndFieldList() throws IOEx
         executeQuery(
             String.format(
                 "source=%s | eval name2=substring(name, 2, 1) | join max=1 name2,age [ source=%s |"
-                    + " eval name2=substring(state, 2, 1) ]",
+                    + " eval name2=substring(state, 2, 1) ] | fields name, country, state, month,"
+                    + " year, age, name2",
                 TEST_INDEX_STATE_COUNTRY, TEST_INDEX_STATE_COUNTRY));
     verifySchema(
         actual,
diff --git a/integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java b/integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java
@@ -24,6 +24,7 @@
 import java.math.BigDecimal;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
@@ -432,6 +433,29 @@ public static void assertJsonEquals(String expected, String actual) {
         JsonParser.parseString(eliminatePid(actual)));
   }
 
+  /**
+   * Compare two {@code datarows} JSON arrays as multisets — same rows, order ignored. Use when the
+   * test only asserts that two queries return the <em>same set of rows</em> (e.g. checking that two
+   * equivalent alias syntaxes produce the same result), not that they emit them in the same order.
+   * The analytics-engine (DataFusion) route does not guarantee the same row order as the v2/Calcite
+   * route, so a plain {@link #assertJsonEquals} on the serialized datarows is order-sensitive and
+   * flaky on that route; comparing as multisets asserts the intended equivalence without depending
+   * on output order.
+   */
+  public static void assertJsonRowsEqualIgnoreOrder(String expectedRows, String actualRows) {
+    List<String> expected = new ArrayList<>();
+    new JSONArray(eliminatePid(expectedRows))
+        .iterator()
+        .forEachRemaining(o -> expected.add(o.toString()));
+    List<String> actual = new ArrayList<>();
+    new JSONArray(eliminatePid(actualRows))
+        .iterator()
+        .forEachRemaining(o -> actual.add(o.toString()));
+    expected.sort(Comparator.naturalOrder());
+    actual.sort(Comparator.naturalOrder());
+    assertEquals(expected, actual);
+  }
+
   /**
    * Compare two JSON string are equals with ignoring the RelNode id in the Calcite plan.
    * Deprecated, use {@link #assertYamlEqualsIgnoreId(String, String)}