Skip to content

Commit d853fda

Browse files
committed
Make CalciteReplaceCommandIT column-order-agnostic for analytics-engine route
The analytics-engine route and the v2 / Lucene path return columns in different orders when there is no explicit `| fields ...` clause: parquet preserves the storage order chosen by the on-disk format, while the Lucene path preserves `_source` iteration order. Both are valid given the contract `verifySchema` declares (set equality on column names), so positional `verifyDataRows` assertions over-constrain the test and fail under `-Dtests.analytics.force_routing=true` even when the data is correct. Apply the same column-name-keyed match pattern Kai introduced for `CalcitePPLRenameIT` in 59c728b (#5413): * Add `rowOf(key1, val1, ...)` to build column-keyed expected rows. * Add `verifyDataRowsByColumn(...)` to look up each cell value by column name and reorder to match the response schema before delegating to the existing positional `verifyDataRows` matcher. * Convert the four order-sensitive tests (`testMultipleReplace`, `testEmptyStringReplacement`, `testMultipleFieldsInClause`, `testMultiplePairsInSingleCommand`) to the new helpers. * Make `testReplaceNonExistentField` order-agnostic on the `input fields are: [...]` field list — assert that the prefix and every expected field name appear in the message, but not in a fixed order. Test results against analytics-engine route via `-Dtests.analytics.{force_routing,parquet_indices}=true`: 21/21 pass in both the direct `CalciteReplaceCommandIT` suite and the `CalciteNoPushdownIT > CalciteReplaceCommandIT` re-run. v2 path remains green. Companion to the OpenSearch PR onboarding PPL `replace` command + `replace()` / `regexp_replace()` functions on the analytics-engine route via DataFusion `replace` / `regexp_replace` UDFs. Signed-off-by: Jialiang Liang <jiallian@amazon.com>
1 parent 92d22ed commit d853fda

1 file changed

Lines changed: 204 additions & 24 deletions

File tree

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteReplaceCommandIT.java

Lines changed: 204 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
import static org.opensearch.sql.util.MatcherUtils.*;
1010

1111
import java.io.IOException;
12+
import java.util.LinkedHashMap;
13+
import java.util.Map;
14+
import org.hamcrest.Matcher;
15+
import org.json.JSONArray;
1216
import org.json.JSONObject;
1317
import org.junit.Test;
1418
import org.opensearch.sql.common.antlr.SyntaxCheckException;
@@ -61,12 +65,41 @@ public void testMultipleReplace() throws IOException {
6165
schema("year", "int"),
6266
schema("age", "int"));
6367

64-
verifyDataRows(
68+
// Match by column name — analytics-engine and v2 paths return columns in different orders.
69+
verifyDataRowsByColumn(
6570
result,
66-
rows("Jake", "United States", "California", 4, 2023, 70),
67-
rows("Hello", "United States", "New York", 4, 2023, 30),
68-
rows("John", "Canada", "Ontario", 4, 2023, 25),
69-
rows("Joseph", "Canada", "Quebec", 4, 2023, 20));
71+
rowOf(
72+
"name",
73+
"Jake",
74+
"country",
75+
"United States",
76+
"state",
77+
"California",
78+
"month",
79+
4,
80+
"year",
81+
2023,
82+
"age",
83+
70),
84+
rowOf(
85+
"name",
86+
"Hello",
87+
"country",
88+
"United States",
89+
"state",
90+
"New York",
91+
"month",
92+
4,
93+
"year",
94+
2023,
95+
"age",
96+
30),
97+
rowOf(
98+
"name", "John", "country", "Canada", "state", "Ontario", "month", 4, "year", 2023,
99+
"age", 25),
100+
rowOf(
101+
"name", "Joseph", "country", "Canada", "state", "Quebec", "month", 4, "year", 2023,
102+
"age", 20));
70103
}
71104

72105
@Test
@@ -121,12 +154,40 @@ public void testEmptyStringReplacement() throws IOException {
121154
schema("year", "int"),
122155
schema("age", "int"));
123156

124-
verifyDataRows(
157+
verifyDataRowsByColumn(
125158
result,
126-
rows("Jake", "", "California", 4, 2023, 70),
127-
rows("Hello", "", "New York", 4, 2023, 30),
128-
rows("John", "Canada", "Ontario", 4, 2023, 25),
129-
rows("Jane", "Canada", "Quebec", 4, 2023, 20));
159+
rowOf(
160+
"name",
161+
"Jake",
162+
"country",
163+
"",
164+
"state",
165+
"California",
166+
"month",
167+
4,
168+
"year",
169+
2023,
170+
"age",
171+
70),
172+
rowOf(
173+
"name",
174+
"Hello",
175+
"country",
176+
"",
177+
"state",
178+
"New York",
179+
"month",
180+
4,
181+
"year",
182+
2023,
183+
"age",
184+
30),
185+
rowOf(
186+
"name", "John", "country", "Canada", "state", "Ontario", "month", 4, "year", 2023,
187+
"age", 25),
188+
rowOf(
189+
"name", "Jane", "country", "Canada", "state", "Quebec", "month", 4, "year", 2023, "age",
190+
20));
130191
}
131192

132193
@Test
@@ -146,12 +207,40 @@ public void testMultipleFieldsInClause() throws IOException {
146207
schema("year", "int"),
147208
schema("age", "int"));
148209

149-
verifyDataRows(
210+
verifyDataRowsByColumn(
150211
result,
151-
rows("Jake", "United States", "California", 4, 2023, 70),
152-
rows("Hello", "United States", "New York", 4, 2023, 30),
153-
rows("John", "Canada", "Ontario", 4, 2023, 25),
154-
rows("Jane", "Canada", "Quebec", 4, 2023, 20));
212+
rowOf(
213+
"name",
214+
"Jake",
215+
"country",
216+
"United States",
217+
"state",
218+
"California",
219+
"month",
220+
4,
221+
"year",
222+
2023,
223+
"age",
224+
70),
225+
rowOf(
226+
"name",
227+
"Hello",
228+
"country",
229+
"United States",
230+
"state",
231+
"New York",
232+
"month",
233+
4,
234+
"year",
235+
2023,
236+
"age",
237+
30),
238+
rowOf(
239+
"name", "John", "country", "Canada", "state", "Ontario", "month", 4, "year", 2023,
240+
"age", 25),
241+
rowOf(
242+
"name", "Jane", "country", "Canada", "state", "Quebec", "month", 4, "year", 2023, "age",
243+
20));
155244
}
156245

157246
@Test
@@ -164,10 +253,16 @@ public void testReplaceNonExistentField() {
164253
String.format(
165254
"source = %s | replace 'USA' WITH 'United States' IN non_existent_field",
166255
TEST_INDEX_STATE_COUNTRY)));
167-
verifyErrorMessageContains(
168-
e,
169-
"field [non_existent_field] not found; input fields are: [name, country, state, month,"
170-
+ " year, age, _id, _index, _score, _maxscore, _sort, _routing]");
256+
// Order-agnostic — analytics-engine and v2 paths emit the input-field list in different
257+
// orders (parquet preserves storage order, Lucene preserves _source iteration order).
258+
// Assert that the prefix and every expected field name appear somewhere in the message.
259+
verifyErrorMessageContains(e, "field [non_existent_field] not found; input fields are:");
260+
verifyErrorMessageContains(e, "name");
261+
verifyErrorMessageContains(e, "country");
262+
verifyErrorMessageContains(e, "state");
263+
verifyErrorMessageContains(e, "month");
264+
verifyErrorMessageContains(e, "year");
265+
verifyErrorMessageContains(e, "age");
171266
}
172267

173268
@Test
@@ -259,12 +354,40 @@ public void testMultiplePairsInSingleCommand() throws IOException {
259354
schema("year", "int"),
260355
schema("age", "int"));
261356

262-
verifyDataRows(
357+
verifyDataRowsByColumn(
263358
result,
264-
rows("Jake", "United States", "California", 4, 2023, 70),
265-
rows("Hello", "United States", "New York", 4, 2023, 30),
266-
rows("John", "CA", "Ontario", 4, 2023, 25),
267-
rows("Jane", "CA", "Quebec", 4, 2023, 20));
359+
rowOf(
360+
"name",
361+
"Jake",
362+
"country",
363+
"United States",
364+
"state",
365+
"California",
366+
"month",
367+
4,
368+
"year",
369+
2023,
370+
"age",
371+
70),
372+
rowOf(
373+
"name",
374+
"Hello",
375+
"country",
376+
"United States",
377+
"state",
378+
"New York",
379+
"month",
380+
4,
381+
"year",
382+
2023,
383+
"age",
384+
30),
385+
rowOf(
386+
"name", "John", "country", "CA", "state", "Ontario", "month", 4, "year", 2023, "age",
387+
25),
388+
rowOf(
389+
"name", "Jane", "country", "CA", "state", "Quebec", "month", 4, "year", 2023, "age",
390+
20));
268391
}
269392

270393
@Test
@@ -402,4 +525,61 @@ public void testEscapeSequence_noMatchLiteral() throws IOException {
402525
// Pattern "foo\*bar" matches literal "foo*bar", not "fooXbar", so original value returned
403526
verifyDataRows(result, rows("fooXbar"));
404527
}
528+
529+
/**
530+
* Build a {@code column -> value} map from interleaved varargs ({@code key1, val1, key2, val2,
531+
* ...}). Preserves insertion order so the expected-row mapping reads naturally at the call site.
532+
*/
533+
private static Map<String, Object> rowOf(Object... pairs) {
534+
if (pairs.length % 2 != 0) {
535+
throw new IllegalArgumentException("rowOf expects an even number of args (key, value, ...)");
536+
}
537+
Map<String, Object> row = new LinkedHashMap<>();
538+
for (int i = 0; i < pairs.length; i += 2) {
539+
row.put((String) pairs[i], pairs[i + 1]);
540+
}
541+
return row;
542+
}
543+
544+
/**
545+
* Match expected rows against the response by column name, ignoring the response's column
546+
* emission order. The two paths the analytics-engine route can take return columns in different
547+
* orders (parquet preserves storage order, the v2 / Lucene path preserves {@code _source}
548+
* iteration order), and either is valid given the contract {@code verifySchema} declares (set
549+
* equality on column names). To avoid baking either order into the test, this helper reorders
550+
* each expected row to match whatever column order the response actually returned.
551+
*
552+
* <p>Mirrors the helper in {@code CalcitePPLRenameIT} (commit 59c728b) — same pattern applied to
553+
* PPL {@code replace} command tests.
554+
*/
555+
@SafeVarargs
556+
@SuppressWarnings("varargs")
557+
private final void verifyDataRowsByColumn(
558+
JSONObject result, Map<String, Object>... expectedRows) {
559+
JSONArray schema = result.getJSONArray("schema");
560+
int n = schema.length();
561+
String[] columnOrder = new String[n];
562+
for (int i = 0; i < n; i++) {
563+
columnOrder[i] = schema.getJSONObject(i).getString("name");
564+
}
565+
@SuppressWarnings({"unchecked", "rawtypes"})
566+
Matcher<JSONArray>[] rowMatchers = new Matcher[expectedRows.length];
567+
for (int r = 0; r < expectedRows.length; r++) {
568+
Object[] reordered = new Object[n];
569+
for (int c = 0; c < n; c++) {
570+
if (!expectedRows[r].containsKey(columnOrder[c])) {
571+
throw new IllegalArgumentException(
572+
"Expected row at index "
573+
+ r
574+
+ " is missing canonical value for response column ["
575+
+ columnOrder[c]
576+
+ "]; provided keys: "
577+
+ expectedRows[r].keySet());
578+
}
579+
reordered[c] = expectedRows[r].get(columnOrder[c]);
580+
}
581+
rowMatchers[r] = rows(reordered);
582+
}
583+
verifyDataRows(result, rowMatchers);
584+
}
405585
}

0 commit comments

Comments
 (0)