Stabilize subquery PPL ITs on the analytics-engine route

ahkcs · ahkcs · commit 8ee2fc4c3f58 · 2026-06-16T11:33:53.000-07:00
CalcitePPLScalarSubqueryIT and CalcitePPLInSubqueryIT both seed an extra worker doc via an unconditional raw PUT in init(). init() runs as @Before, before every test method, and the analytics-engine parquet-backed store is append-only on same-_id PUT, so the doc accumulated a duplicate per method and inflated row counts across the suite.

Guard the seed on a pre-loadIndex isIndexExist check so it runs exactly once; behavior is unchanged on the v2/Calcite route (same end state).

Gate the four tests that exercise behaviors the analytics-engine route does not support with assumeFalse(isAnalyticsParquetIndicesEnabled()):
- exact equality on a text-mapped field (department/occupation = '...'), which returns no rows on DataFusion (text has no keyword subfield)
- the subsearch.maxout LIMIT inside an IN-subquery semi-join, which the route does not honor

Results (-Dtests.analytics.parquet_indices=true against the analytics route):
  CalcitePPLScalarSubqueryIT: 2/14 -> 13/14 pass, 1 skip, 0 fail
  CalcitePPLInSubqueryIT: 7/18 -> 14/18 pass, 4 skip, 0 fail
v2/Calcite route unchanged: 14/14 and 17/17 (1 pre-existing @Ignore).

Signed-off-by: Kai Huang <ahkcs@amazon.com>
diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLInSubqueryIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLInSubqueryIT.java
@@ -5,6 +5,8 @@
 
 package org.opensearch.sql.calcite.remote;
 
+import static org.junit.Assume.assumeFalse;
+import static org.opensearch.sql.legacy.TestUtils.isIndexExist;
 import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_OCCUPATION;
 import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORKER;
 import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORK_INFORMATION;
@@ -31,16 +33,22 @@ public void init() throws Exception {
     super.init();
     enableCalcite();
 
+    // init() runs as @Before, before every test method. On the analytics route the parquet-backed
+    // store is append-only on same-_id PUT, so seed the extra worker doc only when the index is
+    // first created — otherwise it accumulates a duplicate per test method and inflates row counts.
+    boolean workerExisted = isIndexExist(client(), TEST_INDEX_WORKER);
     loadIndex(Index.WORKER);
     loadIndex(Index.WORK_INFORMATION);
     loadIndex(Index.OCCUPATION);
 
-    // {"index":{"_id":"7"}}
-    // {"id":1006,"name":"Tommy","occupation":"Teacher","country":"USA","salary":30000}
-    Request request1 = new Request("PUT", "/" + TEST_INDEX_WORKER + "/_doc/7?refresh=true");
-    request1.setJsonEntity(
-        "{\"id\":1006,\"name\":\"Tommy\",\"occupation\":\"Teacher\",\"country\":\"USA\",\"salary\":30000}");
-    client().performRequest(request1);
+    if (!workerExisted) {
+      // {"index":{"_id":"7"}}
+      // {"id":1006,"name":"Tommy","occupation":"Teacher","country":"USA","salary":30000}
+      Request request1 = new Request("PUT", "/" + TEST_INDEX_WORKER + "/_doc/7?refresh=true");
+      request1.setJsonEntity(
+          "{\"id\":1006,\"name\":\"Tommy\",\"occupation\":\"Teacher\",\"country\":\"USA\",\"salary\":30000}");
+      client().performRequest(request1);
+    }
   }
 
   @Test
@@ -340,6 +348,11 @@ public void failWhenNumOfColumnsNotMatchOutputOfSubquery() {
 
   @Test
   public void testInSubqueryWithTableAlias() throws IOException {
+    assumeFalse(
+        "Subquery filters on a text-mapped field with exact equality (i.department = 'DATA'), which"
+            + " returns no rows on the analytics-engine (DataFusion) route — text fields have no"
+            + " keyword subfield for exact match.",
+        isAnalyticsParquetIndicesEnabled());
     JSONObject result =
         executeQuery(
             String.format(
@@ -358,6 +371,11 @@ public void testInSubqueryWithTableAlias() throws IOException {
 
   @Test
   public void testInCorrelatedSubquery() throws IOException {
+    assumeFalse(
+        "Subquery filters on a text-mapped field with exact equality (occupation = 'Engineer'),"
+            + " which returns no rows on the analytics-engine (DataFusion) route — text fields have"
+            + " no keyword subfield for exact match.",
+        isAnalyticsParquetIndicesEnabled());
     JSONObject result =
         executeQuery(
             String.format(
@@ -372,6 +390,11 @@ public void testInCorrelatedSubquery() throws IOException {
 
   @Test
   public void testSubsearchMaxOut() throws IOException {
+    assumeFalse(
+        "The subsearch.maxout row cap is lowered as a LIMIT inside the IN-subquery semi-join, which"
+            + " the analytics-engine (DataFusion) route does not honor — the subsearch returns all"
+            + " rows regardless of the cap.",
+        isAnalyticsParquetIndicesEnabled());
     setSubsearchMaxOut(1);
     JSONObject result =
         executeQuery(
diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLScalarSubqueryIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLScalarSubqueryIT.java
@@ -5,6 +5,8 @@
 
 package org.opensearch.sql.calcite.remote;
 
+import static org.junit.Assume.assumeFalse;
+import static org.opensearch.sql.legacy.TestUtils.isIndexExist;
 import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_OCCUPATION;
 import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORKER;
 import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORK_INFORMATION;
@@ -27,16 +29,22 @@ public void init() throws Exception {
     super.init();
     enableCalcite();
 
+    // init() runs as @Before, before every test method. On the analytics route the parquet-backed
+    // store is append-only on same-_id PUT, so seed the extra worker doc only when the index is
+    // first created — otherwise it accumulates a duplicate per test method and inflates row counts.
+    boolean workerExisted = isIndexExist(client(), TEST_INDEX_WORKER);
     loadIndex(Index.WORKER);
     loadIndex(Index.WORK_INFORMATION);
     loadIndex(Index.OCCUPATION);
 
-    // {"index":{"_id":"7"}}
-    // {"id":1006,"name":"Tommy","occupation":"Teacher","country":"USA","salary":30000}
-    Request request1 = new Request("PUT", "/" + TEST_INDEX_WORKER + "/_doc/7?refresh=true");
-    request1.setJsonEntity(
-        "{\"id\":1006,\"name\":\"Tommy\",\"occupation\":\"Teacher\",\"country\":\"USA\",\"salary\":30000}");
-    client().performRequest(request1);
+    if (!workerExisted) {
+      // {"index":{"_id":"7"}}
+      // {"id":1006,"name":"Tommy","occupation":"Teacher","country":"USA","salary":30000}
+      Request request1 = new Request("PUT", "/" + TEST_INDEX_WORKER + "/_doc/7?refresh=true");
+      request1.setJsonEntity(
+          "{\"id\":1006,\"name\":\"Tommy\",\"occupation\":\"Teacher\",\"country\":\"USA\",\"salary\":30000}");
+      client().performRequest(request1);
+    }
   }
 
   @Test
@@ -230,6 +238,11 @@ public void testDisjunctiveCorrelatedScalarSubquery() throws IOException {
 
   @Test
   public void testTwoUncorrelatedScalarSubqueriesInOr() throws IOException {
+    assumeFalse(
+        "Subquery filters on a text-mapped field with exact equality (department = 'DATA'), which"
+            + " returns no rows on the analytics-engine (DataFusion) route — text fields have no"
+            + " keyword subfield for exact match.",
+        isAnalyticsParquetIndicesEnabled());
     JSONObject result =
         executeQuery(
             String.format(