Skip to content

Commit 3829dfa

Browse files
authored
Stabilize subquery PPL ITs on the analytics-engine route (#5555)
CalcitePPLScalarSubqueryIT and CalcitePPLInSubqueryIT both seed an extra worker doc via an unconditional raw PUT in init(). init() runs as @Before, before every test method, and the analytics-engine parquet-backed store is append-only on same-_id PUT, so the doc accumulated a duplicate per method and inflated row counts across the suite. Guard the seed on a pre-loadIndex isIndexExist check so it runs exactly once; behavior is unchanged on the v2/Calcite route (same end state).

Skip the four tests that exercise behaviors the analytics-engine route does not support, using the assumeNotAnalytics(...) registry (AnalyticsRouteLimitation) plus matching excludeTestsMatching entries in integTestRemote so the skip set stays countable in one place.

Two new AnalyticsRouteLimitation constants:
- TEXT_FIELD_EXACT_MATCH: exact = / == on an explicitly text-mapped field (no .keyword sub-field) returns no rows on the DataFusion scan. Sibling of DYNAMIC_STRING_NO_KEYWORD for explicitly-mapped (not dynamic) text fields. Covers testTwoUncorrelatedScalarSubqueriesInOr, testInSubqueryWithTableAlias, testInCorrelatedSubquery (each filters department/occupation = '...').
- SUBSEARCH_MAXOUT_IN_SUBQUERY: the subsearch.maxout cap is lowered as a LIMIT on the in-subquery semi-join's right side, which the route does not honor, so the subsearch returns all rows. Covers testSubsearchMaxOut.

Results (-Dtests.analytics.parquet_indices=true against the analytics route):
CalcitePPLScalarSubqueryIT: 2/14 -> 13/13 run, 0 fail (1 excluded)
CalcitePPLInSubqueryIT: 7/18 -> 14/15 run, 0 fail (3 excluded; 1 @Ignore)
v2/Calcite route unchanged: 14/14 and 17/17 (1 pre-existing @Ignore).

Signed-off-by: Kai Huang <ahkcs@amazon.com>
1 parent 9663d5f commit 3829dfa

4 files changed

Lines changed: 74 additions & 13 deletions

File tree

integ-test/build.gradle

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,6 +1135,18 @@ task integTestRemote(type: RestIntegTestTask) {
11351135
// nullable tiebreak these would need orders nulls differently than v2/Calcite.
11361136
excludeTestsMatching '*CalcitePPLEnhancedCoalesceIT.testCoalesceWithMixedTypes'
11371137
excludeTestsMatching '*CalcitePPLEnhancedCoalesceIT.testCoalesceBasic'
1138+
1139+
// === Excludes: CalcitePPLScalarSubqueryIT / CalcitePPLInSubqueryIT route divergences ===
1140+
// Each test also carries an in-test assumeNotAnalytics(...) recording the reason (see
1141+
// AnalyticsRouteLimitation); listed here so the AE-route skip set stays countable.
1142+
// - Exact equality on an explicitly text-mapped field (department/occupation = '...')
1143+
// in the subsearch returns no rows on the AE route (analyzed text, no .keyword).
1144+
excludeTestsMatching '*CalcitePPLScalarSubqueryIT.testTwoUncorrelatedScalarSubqueriesInOr'
1145+
excludeTestsMatching '*CalcitePPLInSubqueryIT.testInSubqueryWithTableAlias'
1146+
excludeTestsMatching '*CalcitePPLInSubqueryIT.testInCorrelatedSubquery'
1147+
// - subsearch.maxout is lowered as a LIMIT on the in-subquery semi-join's right side,
1148+
// which the AE route does not honor, so the subsearch returns all rows.
1149+
excludeTestsMatching '*CalcitePPLInSubqueryIT.testSubsearchMaxOut'
11381150
}
11391151
}
11401152

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLInSubqueryIT.java

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55

66
package org.opensearch.sql.calcite.remote;
77

8+
import static org.opensearch.sql.legacy.TestUtils.isIndexExist;
89
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_OCCUPATION;
910
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORKER;
1011
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORK_INFORMATION;
12+
import static org.opensearch.sql.util.AnalyticsRouteLimitation.SUBSEARCH_MAXOUT_IN_SUBQUERY;
13+
import static org.opensearch.sql.util.AnalyticsRouteLimitation.TEXT_FIELD_EXACT_MATCH;
1114
import static org.opensearch.sql.util.MatcherUtils.rows;
1215
import static org.opensearch.sql.util.MatcherUtils.schema;
1316
import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
@@ -31,16 +34,22 @@ public void init() throws Exception {
3134
super.init();
3235
enableCalcite();
3336

37+
// init() runs as @Before, before every test method. On the analytics route the parquet-backed
38+
// store is append-only on same-_id PUT, so seed the extra worker doc only when the index is
39+
// first created — otherwise it accumulates a duplicate per test method and inflates row counts.
40+
boolean workerExisted = isIndexExist(client(), TEST_INDEX_WORKER);
3441
loadIndex(Index.WORKER);
3542
loadIndex(Index.WORK_INFORMATION);
3643
loadIndex(Index.OCCUPATION);
3744

38-
// {"index":{"_id":"7"}}
39-
// {"id":1006,"name":"Tommy","occupation":"Teacher","country":"USA","salary":30000}
40-
Request request1 = new Request("PUT", "/" + TEST_INDEX_WORKER + "/_doc/7?refresh=true");
41-
request1.setJsonEntity(
42-
"{\"id\":1006,\"name\":\"Tommy\",\"occupation\":\"Teacher\",\"country\":\"USA\",\"salary\":30000}");
43-
client().performRequest(request1);
45+
if (!workerExisted) {
46+
// {"index":{"_id":"7"}}
47+
// {"id":1006,"name":"Tommy","occupation":"Teacher","country":"USA","salary":30000}
48+
Request request1 = new Request("PUT", "/" + TEST_INDEX_WORKER + "/_doc/7?refresh=true");
49+
request1.setJsonEntity(
50+
"{\"id\":1006,\"name\":\"Tommy\",\"occupation\":\"Teacher\",\"country\":\"USA\",\"salary\":30000}");
51+
client().performRequest(request1);
52+
}
4453
}
4554

4655
@Test
@@ -340,6 +349,8 @@ public void failWhenNumOfColumnsNotMatchOutputOfSubquery() {
340349

341350
@Test
342351
public void testInSubqueryWithTableAlias() throws IOException {
352+
// Subsearch filters a text-mapped field with exact equality (i.department = 'DATA').
353+
assumeNotAnalytics(TEXT_FIELD_EXACT_MATCH);
343354
JSONObject result =
344355
executeQuery(
345356
String.format(
@@ -358,6 +369,8 @@ public void testInSubqueryWithTableAlias() throws IOException {
358369

359370
@Test
360371
public void testInCorrelatedSubquery() throws IOException {
372+
// Subsearch filters a text-mapped field with exact equality (occupation = 'Engineer').
373+
assumeNotAnalytics(TEXT_FIELD_EXACT_MATCH);
361374
JSONObject result =
362375
executeQuery(
363376
String.format(
@@ -372,6 +385,7 @@ public void testInCorrelatedSubquery() throws IOException {
372385

373386
@Test
374387
public void testSubsearchMaxOut() throws IOException {
388+
assumeNotAnalytics(SUBSEARCH_MAXOUT_IN_SUBQUERY);
375389
setSubsearchMaxOut(1);
376390
JSONObject result =
377391
executeQuery(

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLScalarSubqueryIT.java

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55

66
package org.opensearch.sql.calcite.remote;
77

8+
import static org.opensearch.sql.legacy.TestUtils.isIndexExist;
89
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_OCCUPATION;
910
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORKER;
1011
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_WORK_INFORMATION;
12+
import static org.opensearch.sql.util.AnalyticsRouteLimitation.TEXT_FIELD_EXACT_MATCH;
1113
import static org.opensearch.sql.util.MatcherUtils.rows;
1214
import static org.opensearch.sql.util.MatcherUtils.schema;
1315
import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
@@ -27,16 +29,22 @@ public void init() throws Exception {
2729
super.init();
2830
enableCalcite();
2931

32+
// init() runs as @Before, before every test method. On the analytics route the parquet-backed
33+
// store is append-only on same-_id PUT, so seed the extra worker doc only when the index is
34+
// first created — otherwise it accumulates a duplicate per test method and inflates row counts.
35+
boolean workerExisted = isIndexExist(client(), TEST_INDEX_WORKER);
3036
loadIndex(Index.WORKER);
3137
loadIndex(Index.WORK_INFORMATION);
3238
loadIndex(Index.OCCUPATION);
3339

34-
// {"index":{"_id":"7"}}
35-
// {"id":1006,"name":"Tommy","occupation":"Teacher","country":"USA","salary":30000}
36-
Request request1 = new Request("PUT", "/" + TEST_INDEX_WORKER + "/_doc/7?refresh=true");
37-
request1.setJsonEntity(
38-
"{\"id\":1006,\"name\":\"Tommy\",\"occupation\":\"Teacher\",\"country\":\"USA\",\"salary\":30000}");
39-
client().performRequest(request1);
40+
if (!workerExisted) {
41+
// {"index":{"_id":"7"}}
42+
// {"id":1006,"name":"Tommy","occupation":"Teacher","country":"USA","salary":30000}
43+
Request request1 = new Request("PUT", "/" + TEST_INDEX_WORKER + "/_doc/7?refresh=true");
44+
request1.setJsonEntity(
45+
"{\"id\":1006,\"name\":\"Tommy\",\"occupation\":\"Teacher\",\"country\":\"USA\",\"salary\":30000}");
46+
client().performRequest(request1);
47+
}
4048
}
4149

4250
@Test
@@ -230,6 +238,8 @@ public void testDisjunctiveCorrelatedScalarSubquery() throws IOException {
230238

231239
@Test
232240
public void testTwoUncorrelatedScalarSubqueriesInOr() throws IOException {
241+
// Subsearch filters a text-mapped field with exact equality (department = 'DATA').
242+
assumeNotAnalytics(TEXT_FIELD_EXACT_MATCH);
233243
JSONObject result =
234244
executeQuery(
235245
String.format(

integ-test/src/test/java/org/opensearch/sql/util/AnalyticsRouteLimitation.java

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,19 @@ public enum AnalyticsRouteLimitation {
4747
+ " sub-field that standard OpenSearch adds, and the DataFusion scan can't match on an"
4848
+ " analyzed text field."),
4949

50+
/**
51+
* Exact equality ({@code =} / {@code ==}) on an explicitly {@code text}-mapped field (no {@code
52+
* .keyword} sub-field) returns no rows on the analytics-engine route — the DataFusion scan can't
53+
* match on an analyzed text field. The sibling of {@link #DYNAMIC_STRING_NO_KEYWORD} for fields
54+
* that are mapped {@code text} on purpose rather than by dynamic mapping. Verified directly:
55+
* {@code where department = 'DATA'} returns no rows while {@code like(department, 'DATA')} and
56+
* keyword-field equality both work.
57+
*/
58+
TEXT_FIELD_EXACT_MATCH(
59+
"Exact equality (= / ==) on an explicitly text-mapped field (no .keyword sub-field) returns"
60+
+ " no rows on the analytics-engine route: the DataFusion scan can't match on an analyzed"
61+
+ " text field. Use like() or a keyword field instead."),
62+
5063
/**
5164
* The analytics-engine storage path ({@code DataFormatAwareEngine}) does not support in-place
5265
* document mutation, so tests that seed state via raw {@code PUT}+{@code DELETE} can't run on
@@ -108,7 +121,19 @@ public enum AnalyticsRouteLimitation {
108121
HEAD_WITHOUT_STABLE_SORT(
109122
"head N without a sort on a key that is unique over the head window is non-deterministic on"
110123
+ " the analytics-engine route, and a nullable tiebreak orders nulls differently than the"
111-
+ " v2/Calcite path.");
124+
+ " v2/Calcite path."),
125+
126+
/**
127+
* The {@code subsearch.maxout} cap on an {@code in}-subquery is lowered as a {@code LIMIT} on the
128+
* right-hand side of the semi-join ({@code LogicalSystemLimit(fetch=N, type=SUBSEARCH_MAXOUT)}).
129+
* The analytics-engine route does not honor that LIMIT, so the subsearch returns all rows
130+
* regardless of the cap. Verified: with {@code subsearch.maxout=1} an {@code id in [...]}
131+
* subquery still returns every matching row.
132+
*/
133+
SUBSEARCH_MAXOUT_IN_SUBQUERY(
134+
"subsearch.maxout is not honored on the analytics-engine route: the LIMIT lowered onto the"
135+
+ " in-subquery semi-join's right side is dropped, so the subsearch returns all rows"
136+
+ " regardless of the cap.");
112137

113138
private final String reason;
114139

0 commit comments

Comments
 (0)