Skip to content

Commit 4c1165a

Browse files
authored
Stabilize CalcitePPLConditionBuiltinFunctionIT on the analytics-engine route (#5556)
init() seeds two extra docs into state_country_with_null via unconditional raw PUTs. init() runs as @before before every test method, and the analytics-engine parquet-backed store is append-only on same-_id PUT, so the docs accumulated a duplicate per method and inflated row counts across the suite. Guard the seed on a pre-loadIndex isIndexExist check so it runs exactly once; behavior is unchanged on the v2/Calcite route (same end state). Skip the six tests that exercise behaviors the analytics-engine route does not support, using the assumeNotAnalytics(...) registry (AnalyticsRouteLimitation) plus matching excludeTestsMatching entries in integTestRemote so the skip set stays countable in one place. NESTED_FIELDS is reused; three new constants: - STRUCT_PARENT_FIELD: querying an object/struct parent field directly (isnull/isnotnull(aws)) resolves to FIELD_NOT_FOUND — the route flattens objects to dotted leaf columns and the parent is not a queryable column. - CONCAT_NULL_AS_EMPTY: concat('H', null) = 'H' on the route (DataFusion NULL-as-empty) vs null on v2/Calcite (NULL-propagating). - EARLIEST_LATEST_NOW_CLOCK: earliest('now', utc_timestamp()) is true on the route (same instant) but false on v2 (clock-source divergence). The two nested tests reuse NESTED_FIELDS (nested fields are stripped at index creation on this route, #5541). Results (-Dtests.analytics.parquet_indices=true against the analytics route): CalcitePPLConditionBuiltinFunctionIT: 6/24 -> 18/18 run, 0 fail (6 excluded) v2/Calcite route unchanged: 24/24 pass. Signed-off-by: Kai Huang <ahkcs@amazon.com>
1 parent 3829dfa commit 4c1165a

3 files changed

Lines changed: 88 additions & 12 deletions

File tree

integ-test/build.gradle

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,6 +1147,23 @@ task integTestRemote(type: RestIntegTestTask) {
11471147
// - subsearch.maxout is lowered as a LIMIT on the in-subquery semi-join's right side,
11481148
// which the AE route does not honor, so the subsearch returns all rows.
11491149
excludeTestsMatching '*CalcitePPLInSubqueryIT.testSubsearchMaxOut'
1150+
1151+
// === Excludes: CalcitePPLConditionBuiltinFunctionIT route divergences ===
1152+
// Each test also carries an in-test assumeNotAnalytics(...) recording the reason (see
1153+
// AnalyticsRouteLimitation); listed here so the AE-route skip set stays countable.
1154+
// - isnull/isnotnull on the object/struct parent field big5.aws: objects are flattened
1155+
// to dotted leaf columns and the struct parent is not a queryable column.
1156+
excludeTestsMatching '*CalcitePPLConditionBuiltinFunctionIT.testIsNullWithStruct'
1157+
excludeTestsMatching '*CalcitePPLConditionBuiltinFunctionIT.testIsNotNullWithStruct'
1158+
// - isnull/isnotnull on the nested field nested_simple.address: nested fields are
1159+
// stripped at index creation (the route can't store them).
1160+
excludeTestsMatching '*CalcitePPLConditionBuiltinFunctionIT.testIsNullWithNested'
1161+
excludeTestsMatching '*CalcitePPLConditionBuiltinFunctionIT.testIsNotNullWithNested'
1162+
// - concat('H', null): DataFusion treats NULL as empty string; v2/Calcite propagates NULL.
1163+
excludeTestsMatching '*CalcitePPLConditionBuiltinFunctionIT.testNullIfWithExpression'
1164+
// - earliest('now', utc_timestamp()): 'now' and utc_timestamp() resolve to the same
1165+
// instant on the route (true) but differ on v2 (false).
1166+
excludeTestsMatching '*CalcitePPLConditionBuiltinFunctionIT.testEarliestWithEval'
11501167
}
11511168
}
11521169

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLConditionBuiltinFunctionIT.java

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@
55

66
package org.opensearch.sql.calcite.remote;
77

8+
import static org.opensearch.sql.legacy.TestUtils.isIndexExist;
89
import static org.opensearch.sql.legacy.TestsConstants.*;
10+
import static org.opensearch.sql.util.AnalyticsRouteLimitation.CONCAT_NULL_AS_EMPTY;
11+
import static org.opensearch.sql.util.AnalyticsRouteLimitation.EARLIEST_LATEST_NOW_CLOCK;
12+
import static org.opensearch.sql.util.AnalyticsRouteLimitation.NESTED_FIELDS;
13+
import static org.opensearch.sql.util.AnalyticsRouteLimitation.STRUCT_PARENT_FIELD;
914
import static org.opensearch.sql.util.MatcherUtils.*;
1015
import static org.opensearch.sql.util.MatcherUtils.rows;
1116

@@ -22,22 +27,29 @@ public void init() throws Exception {
2227
super.init();
2328
enableCalcite();
2429

30+
// init() runs as @Before, before every test method. On the analytics route the parquet-backed
31+
// store is append-only on same-_id PUT, so seed the extra docs only when the index is first
32+
// created — otherwise they accumulate a duplicate per test method and inflate row counts.
33+
boolean stateCountryWithNullExisted =
34+
isIndexExist(client(), TEST_INDEX_STATE_COUNTRY_WITH_NULL);
2535
loadIndex(Index.STATE_COUNTRY);
2636
loadIndex(Index.STATE_COUNTRY_WITH_NULL);
2737
loadIndex(Index.CALCS);
2838
loadIndex(Index.NESTED_SIMPLE);
2939
loadIndex(Index.BIG5);
30-
Request request1 =
31-
new Request("PUT", "/" + TEST_INDEX_STATE_COUNTRY_WITH_NULL + "/_doc/7?refresh=true");
32-
request1.setJsonEntity(
33-
"{\"name\":\" "
34-
+ " \",\"age\":27,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
35-
client().performRequest(request1);
36-
Request request2 =
37-
new Request("PUT", "/" + TEST_INDEX_STATE_COUNTRY_WITH_NULL + "/_doc/8?refresh=true");
38-
request2.setJsonEntity(
39-
"{\"name\":\"\",\"age\":57,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
40-
client().performRequest(request2);
40+
if (!stateCountryWithNullExisted) {
41+
Request request1 =
42+
new Request("PUT", "/" + TEST_INDEX_STATE_COUNTRY_WITH_NULL + "/_doc/7?refresh=true");
43+
request1.setJsonEntity(
44+
"{\"name\":\" "
45+
+ " \",\"age\":27,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
46+
client().performRequest(request1);
47+
Request request2 =
48+
new Request("PUT", "/" + TEST_INDEX_STATE_COUNTRY_WITH_NULL + "/_doc/8?refresh=true");
49+
request2.setJsonEntity(
50+
"{\"name\":\"\",\"age\":57,\"state\":\"B.C\",\"country\":\"Canada\",\"year\":2023,\"month\":4}");
51+
client().performRequest(request2);
52+
}
4153
}
4254

4355
@Test
@@ -54,13 +66,17 @@ public void testIsNull() throws IOException {
5466

5567
@Test
5668
public void testIsNullWithStruct() throws IOException {
69+
// Queries the object/struct parent field 'aws' directly.
70+
assumeNotAnalytics(STRUCT_PARENT_FIELD);
5771
JSONObject actual = executeQuery("source=big5 | where isnull(aws) | fields aws");
5872
verifySchema(actual, schema("aws", "struct"));
5973
verifyNumOfRows(actual, 0);
6074
}
6175

6276
@Test
6377
public void testIsNullWithNested() throws IOException {
78+
// Queries a nested field; the route strips nested fields at index creation.
79+
assumeNotAnalytics(NESTED_FIELDS);
6480
JSONObject actual =
6581
executeQuery(
6682
String.format(
@@ -124,13 +140,17 @@ public void testIsNotNullWithSingleNotEquals() throws IOException {
124140

125141
@Test
126142
public void testIsNotNullWithStruct() throws IOException {
143+
// Queries the object/struct parent field 'aws' directly.
144+
assumeNotAnalytics(STRUCT_PARENT_FIELD);
127145
JSONObject actual = executeQuery("source=big5 | where isnotnull(aws) | fields aws");
128146
verifySchema(actual, schema("aws", "struct"));
129147
verifyNumOfRows(actual, 3);
130148
}
131149

132150
@Test
133151
public void testIsNotNullWithNested() throws IOException {
152+
// Queries a nested field; the route strips nested fields at index creation.
153+
assumeNotAnalytics(NESTED_FIELDS);
134154
JSONObject actual =
135155
executeQuery(
136156
String.format(
@@ -165,6 +185,8 @@ public void testNullIf() throws IOException {
165185

166186
@Test
167187
public void testNullIfWithExpression() throws IOException {
188+
// concat('H', name) over the null-name row diverges (NULL-as-empty vs NULL-propagating).
189+
assumeNotAnalytics(CONCAT_NULL_AS_EMPTY);
168190
JSONObject actual =
169191
executeQuery(
170192
String.format(
@@ -354,6 +376,8 @@ public void testLatest() throws IOException {
354376

355377
@Test
356378
public void testEarliestWithEval() throws IOException {
379+
// earliest('now', utc_timestamp()) resolves true on the route but false on v2 (clock source).
380+
assumeNotAnalytics(EARLIEST_LATEST_NOW_CLOCK);
357381
JSONObject actual =
358382
executeQuery(
359383
String.format(

integ-test/src/test/java/org/opensearch/sql/util/AnalyticsRouteLimitation.java

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,42 @@ public enum AnalyticsRouteLimitation {
133133
SUBSEARCH_MAXOUT_IN_SUBQUERY(
134134
"subsearch.maxout is not honored on the analytics-engine route: the LIMIT lowered onto the"
135135
+ " in-subquery semi-join's right side is dropped, so the subsearch returns all rows"
136-
+ " regardless of the cap.");
136+
+ " regardless of the cap."),
137+
138+
/**
139+
* Querying an {@code object}/struct parent field directly (e.g. {@code isnull(aws)} where {@code
140+
* aws} is an {@code object}) fails on the analytics-engine route with {@code FIELD_NOT_FOUND}.
141+
* The route flattens objects into dotted leaf columns — {@code aws.cloudwatch.log_group} scans
142+
* fine — but the struct parent is not exposed as a queryable column. Distinct from {@link
143+
* #NESTED_FIELDS}: {@code object} parents survive in the OpenSearch mapping (they aren't stripped
144+
* at load) yet still can't be referenced as a whole.
145+
*/
146+
STRUCT_PARENT_FIELD(
147+
"Querying an object/struct parent field directly is unsupported on the analytics-engine"
148+
+ " route: objects are flattened to dotted leaf columns and the parent resolves to"
149+
+ " FIELD_NOT_FOUND."),
150+
151+
/**
152+
* {@code concat()} over a NULL argument diverges: the analytics-engine route (DataFusion) treats
153+
* NULL as an empty string (e.g. {@code concat('H', null)} = {@code 'H'}), whereas the v2/Calcite
154+
* engine propagates NULL ({@code concat('H', null)} = {@code null}). Any expression that depends
155+
* on the NULL-propagating behavior over a possibly-null operand diverges.
156+
*/
157+
CONCAT_NULL_AS_EMPTY(
158+
"concat() treats a NULL argument as an empty string on the analytics-engine route (DataFusion"
159+
+ " semantics), whereas the v2/Calcite engine propagates NULL."),
160+
161+
/**
162+
* {@code earliest('now', <ts>)} / {@code latest('now', <ts>)} where {@code <ts>} is {@code
163+
* utc_timestamp()} diverge: on the analytics-engine route the relative-time {@code 'now'} and
164+
* {@code utc_timestamp()} resolve to the same instant (so {@code earliest('now', now)} is {@code
165+
* true}), whereas on the v2/Calcite path they differ (it is {@code false}) — a clock-source
166+
* divergence between the relative-time evaluation and {@code utc_timestamp()}.
167+
*/
168+
EARLIEST_LATEST_NOW_CLOCK(
169+
"earliest/latest with relative-time 'now' against utc_timestamp() diverges on the"
170+
+ " analytics-engine route: 'now' and utc_timestamp() resolve to the same instant"
171+
+ " (earliest('now', now) is true), but differ on the v2/Calcite path (false).");
137172

138173
private final String reason;
139174

0 commit comments

Comments
 (0)