diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index f5d7532f9c6..05a6de12395 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -973,10 +973,60 @@ public RelNode visitBin(Bin node, CalcitePlanContext context) { String alias = node.getAlias() != null ? node.getAlias() : fieldName; projectPlusOverriding(List.of(binExpression), List.of(alias), context); + dropStructParentsFor(alias, context); return context.relBuilder.peek(); } + /** + * If {@code dottedName} addresses a nested leaf inside a struct that OpenSearch has exposed + * through both its struct-parent columns and its flattened leaf columns (e.g. the telemetry + * mapping exposes {@code resource}, {@code resource.attributes}, ..., {@code + * resource.attributes.telemetry.sdk.version} side-by-side), drop the struct-parent prefixes from + * the current row. This keeps a subsequent {@link #tryToRemoveNestedFields(CalcitePlanContext)} + * pass from collapsing the flattened leaves back into the parents when the final implicit {@code + * fields *} projection runs. + * + *

This preserves the behaviour that issue #4482 originally required for {@code bin} on a + * nested field without an explicit {@code fields} projection. It is invoked from two places: + * + *

+ * + * Using this narrowly-scoped pruning instead of a global prefix-override in {@link + * #shouldOverrideField} is what keeps issue #5185 and the reviewer's {@code eval agent.name = + * ...} case safe. + * + *

No-op when no such struct-parent columns exist (e.g. flat columns or MAP roots from {@code + * spath}). + */ + private void dropStructParentsFor(String dottedName, CalcitePlanContext context) { + if (dottedName == null || dottedName.indexOf('.') < 0) { + return; + } + List fieldNames = context.relBuilder.peek().getRowType().getFieldNames(); + List parentsToDrop = new ArrayList<>(); + int dotIdx = dottedName.indexOf('.'); + while (dotIdx >= 0) { + String prefix = dottedName.substring(0, dotIdx); + if (fieldNames.contains(prefix)) { + parentsToDrop.add(context.relBuilder.field(prefix)); + } + dotIdx = dottedName.indexOf('.', dotIdx + 1); + } + if (!parentsToDrop.isEmpty()) { + context.relBuilder.projectExcept(parentsToDrop); + } + } + @Override public RelNode visitParse(Parse node, CalcitePlanContext context) { visitChildren(node, context); @@ -1292,12 +1342,12 @@ private RelNode buildConversionProjection(ConversionState state, CalcitePlanCont private void projectPlusOverriding( List newFields, List newNames, CalcitePlanContext context) { - List originalFieldNames = context.relBuilder.peek().getRowType().getFieldNames(); + Set originalFieldNameSet = + new HashSet<>(context.relBuilder.peek().getRowType().getFieldNames()); + List overriddenNames = + newNames.stream().filter(originalFieldNameSet::contains).toList(); List toOverrideList = - originalFieldNames.stream() - .filter(originalName -> shouldOverrideField(originalName, newNames)) - .map(a -> (RexNode) context.relBuilder.field(a)) - .toList(); + overriddenNames.stream().map(a -> (RexNode) context.relBuilder.field(a)).toList(); // 1. add the new fields, For example "age0, country0" context.relBuilder.projectPlus(newFields); // 2. drop the overriding field list, it's duplicated now. For example "age, country" @@ -1313,17 +1363,49 @@ private void projectPlusOverriding( expectedRenameFields.addAll(newNames); // 5. rename context.relBuilder.rename(expectedRenameFields); + // 6. For each overridden dotted-path name that matched an existing flattened nested leaf, + // prune the struct-parent columns that OpenSearch exposed side-by-side with that leaf. Without + // this, a downstream implicit `fields *` invokes `tryToRemoveNestedFields`, which would drop + // the freshly-assigned dotted leaf back out again because its struct-parent prefix is still in + // the row schema (see issue #4482 and the scratch coverage in CalciteEvalCommandIT). + // + // Gating on "the override actually fired" is what keeps the reviewer's PR #5351 case safe: + // `source=idx | fields agent | eval agent.name = "test"` has no pre-existing `agent.name` + // column, so overriddenNames is empty and the struct-parent `agent` survives untouched. + // It also keeps issue #5185 safe — spath introduces a MAP root and subsequent eval assigns + // to brand-new dotted paths that were not already in the row schema. + for (String overridden : overriddenNames) { + dropStructParentsFor(overridden, context); + } } + /** + * Determine whether the column {@code originalName} should be replaced when a batch of new + * columns named {@code newNames} is being added. Only exact-name matches count as overrides — + * {@code eval foo.bar = ...} creates a brand new field literally named {@code foo.bar} and must + * never drop sibling or parent fields. This mirrors SPL1 semantics, where assigning a dotted name + * introduces a literal column of that name without touching any other field. + * + *

Earlier revisions (see PR #4606 / #5351) attempted to broaden this to a {@code + * newName.startsWith(originalName + ".")} prefix match. That prefix branch silently dropped any + * column that happened to be a prefix of an eval target, which caused two regressions: + * + *

+ * + * Struct-parent pruning for the "override on a real flattened nested leaf" case is handled + * uniformly in {@link #projectPlusOverriding(List, List, CalcitePlanContext)}, which invokes + * {@link #dropStructParentsFor(String, CalcitePlanContext)} only for overrides that actually + * replaced an existing column. This keeps issue #4482 fixed across every command that funnels + * through {@code projectPlusOverriding} (bin, eval, rex/sed, trendline, expand, flatten, + * patterns) without reintroducing the #5185 / reviewer regressions here. + */ private boolean shouldOverrideField(String originalName, List newNames) { - return newNames.stream() - .anyMatch( - newName -> - // Match exact field names (e.g., "age" == "age") for flat fields - newName.equals(originalName) - // OR match nested paths (e.g., "resource.attributes..." starts with - // "resource.") - || newName.startsWith(originalName + ".")); + return newNames.contains(originalName); } private List> extractInputRefList(List aggCalls) { diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteEvalCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteEvalCommandIT.java index 588a4a784f9..219020b1650 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteEvalCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteEvalCommandIT.java @@ -6,11 +6,13 @@ package org.opensearch.sql.calcite.remote; import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_BANK; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_TELEMETRY; import static org.opensearch.sql.util.MatcherUtils.rows; import static org.opensearch.sql.util.MatcherUtils.schema; import static org.opensearch.sql.util.MatcherUtils.verifyDataRows; import static org.opensearch.sql.util.MatcherUtils.verifySchema; +import com.google.common.collect.ImmutableMap; import java.io.IOException; import org.json.JSONObject; import org.junit.jupiter.api.Test; @@ -25,6 +27,7 @@ public void init() throws Exception { enableCalcite(); loadIndex(Index.BANK); + loadIndex(Index.TELEMETRY); // Create test data for string concatenation Request request1 = new Request("PUT", "/test_eval/_doc/1?refresh=true"); @@ -38,6 +41,21 @@ public void init() throws Exception { Request request3 = new Request("PUT", "/test_eval/_doc/3?refresh=true"); request3.setJsonEntity("{\"name\": \"Charlie\", \"age\": null, \"title\": \"Analyst\"}"); client().performRequest(request3); + + // Index with a struct field `agent` to reproduce the reviewer's case from PR #5351: + // source= | fields agent | eval agent.name = "test" + // Rely on dynamic mapping — OpenSearch infers `agent` as an object with string children + // from the document contents. Using dynamic mapping keeps the init idempotent across + // repeated `@Before` invocations in the preserved cluster. + Request agentDoc1 = new Request("PUT", "/test_eval_agent/_doc/1?refresh=true"); + agentDoc1.setJsonEntity( + "{\"agent\": {\"name\": \"winlogbeat\", \"version\": \"7.0\"}, \"message\": \"hello\"}"); + client().performRequest(agentDoc1); + + Request agentDoc2 = new Request("PUT", "/test_eval_agent/_doc/2?refresh=true"); + agentDoc2.setJsonEntity( + "{\"agent\": {\"name\": \"filebeat\", \"version\": \"8.1\"}, \"message\": \"world\"}"); + client().performRequest(agentDoc2); } @Test @@ -86,6 +104,90 @@ public void testEvalStringConcatenationWithLiterals() throws IOException { rows("Charlie", "Analyst", "Name: Charlie, Title: Analyst")); } + @Test + public void testEvalDottedNameDoesNotDropStructParent() throws IOException { + // Reviewer's case from PR #5351: assigning a new dotted-path column must not remove the + // struct-parent column that happens to be a prefix of the eval target. + // Equivalent SPL1 query: + // source= | fields agent | eval agent.name = "test" + // Before the fix, the prefix-override in shouldOverrideField silently dropped the `agent` + // column entirely from the result schema. With the fix, `agent` is preserved. + // The newly-created literal column `agent.name` is also available (verified via an + // explicit trailing `fields` projection that bypasses tryToRemoveNestedFields). + JSONObject result = + executeQuery( + "source=test_eval_agent | fields agent | eval `agent.name` = 'test' | fields agent," + + " `agent.name`"); + verifySchema(result, schema("agent", "struct"), schema("agent.name", "string")); + verifyDataRows( + result, + rows(ImmutableMap.of("name", "winlogbeat", "version", "7.0"), "test"), + rows(ImmutableMap.of("name", "filebeat", "version", "8.1"), "test")); + } + + @Test + public void testEvalDottedNamePreservesStructParent_ImplicitProject() throws IOException { + // Complementary coverage for the reviewer's case without the explicit trailing projection. + // With the implicit `fields *` (AllFields) that the PPL parser appends, the downstream + // `tryToRemoveNestedFields` pass still collapses the flattened leaf back into its struct + // parent -- but the important regression guard is that the struct parent `agent` is no + // longer dropped by `shouldOverrideField`'s prefix branch. + JSONObject result = + executeQuery("source=test_eval_agent | fields agent | eval `agent.name` = 'test'"); + verifySchema(result, schema("agent", "struct")); + } + + @Test + public void testEvalOverrideOfFlattenedNestedLeafSurvivesImplicitProject() throws IOException { + // Regression guard for PR #5351: eval assigning a new value to a dotted name that matches an + // existing OpenSearch flattened nested leaf must not have that value silently eaten by the + // implicit `fields *` pass. + // + // The telemetry mapping exposes struct parents (resource, resource.attributes, ..., + // resource.attributes.telemetry.sdk) side-by-side with the flattened leaves. When eval + // overrides the leaf, projectPlusOverriding now prunes those struct parents so a subsequent + // tryToRemoveNestedFields pass does not delete the overridden leaf on the way out. + // + // Before the fix, this query returned rows with the original `resource` struct (still + // containing the pre-override integer version) and no `resource.attributes.telemetry.sdk.*` + // flattened leaves at all -- the "OVERRIDE" string was completely lost. + JSONObject result = + executeQuery( + String.format( + "source=%s | eval `resource.attributes.telemetry.sdk.version` = 'OVERRIDE'", + TEST_INDEX_TELEMETRY)); + + verifyDataRows( + result, + rows(true, "java", "opentelemetry", 9, "OVERRIDE"), + rows(false, "python", "opentelemetry", 12, "OVERRIDE"), + rows(true, "javascript", "opentelemetry", 9, "OVERRIDE"), + rows(false, "go", "opentelemetry", 16, "OVERRIDE"), + rows(true, "rust", "opentelemetry", 12, "OVERRIDE")); + } + + @Test + public void testEvalOverrideOfFlattenedNestedLeafWithExplicitProject() throws IOException { + // Control for the test above: with an explicit trailing `fields` projection, the implicit + // `fields *` codepath (and tryToRemoveNestedFields) does not run, so eval always returned + // the overridden value even before the fix. This test pins the user-facing contract for the + // explicit-projection variant regardless of internal pruning behaviour. + JSONObject result = + executeQuery( + String.format( + "source=%s | eval `resource.attributes.telemetry.sdk.version` = 'OVERRIDE' | fields" + + " `resource.attributes.telemetry.sdk.version`", + TEST_INDEX_TELEMETRY)); + verifySchema(result, schema("resource.attributes.telemetry.sdk.version", "string")); + verifyDataRows( + result, + rows("OVERRIDE"), + rows("OVERRIDE"), + rows("OVERRIDE"), + rows("OVERRIDE"), + rows("OVERRIDE")); + } + @Test public void testEvalStringConcatenationWithExistingData() throws IOException { JSONObject result = diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java index ec6f8583b23..0bd7ac803f9 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java @@ -216,4 +216,27 @@ public void testSpathAutoExtractWithSort() throws IOException { verifySchema(result, schema("doc.user.name", "string")); verifyDataRowsInOrder(result, rows("Alice"), rows("John")); } + + @Test + public void testSpathAutoExtractWithMultiFieldEval() throws IOException { + JSONObject result = + executeQuery( + "source=test_spath_cmd | spath input=doc" + + " | eval doc.user.name=doc.user.name, doc.user.age=doc.user.age" + + " | fields doc.user.name, doc.user.age"); + verifySchema(result, schema("doc.user.name", "string"), schema("doc.user.age", "string")); + verifyDataRows(result, rows("Alice", "25"), rows("John", "30")); + } + + @Test + public void testSpathAutoExtractWithSeparateEvalCommands() throws IOException { + JSONObject result = + executeQuery( + "source=test_spath_cmd | spath input=doc" + + " | eval doc.user.name=doc.user.name" + + " | eval doc.user.age=doc.user.age" + + " | fields doc.user.name, doc.user.age"); + verifySchema(result, schema("doc.user.name", "string"), schema("doc.user.age", "string")); + verifyDataRows(result, rows("Alice", "25"), rows("John", "30")); + } } diff --git a/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/5185.yml b/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/5185.yml new file mode 100644 index 00000000000..0f939a03585 --- /dev/null +++ b/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/5185.yml @@ -0,0 +1,69 @@ +setup: + - do: + query.settings: + body: + transient: + plugins.calcite.enabled: true + - do: + indices.create: + index: issue5185 + body: + settings: + number_of_shards: 1 + number_of_replicas: 0 + mappings: + properties: + doc: + type: text + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "issue5185", "_id": "1"}}' + - '{"doc": "{\"user\":{\"name\":\"John\",\"age\":30}}"}' + - '{"index": {"_index": "issue5185", "_id": "2"}}' + - '{"doc": "{\"user\":{\"name\":\"Alice\",\"age\":25}}"}' + +--- +teardown: + - do: + indices.delete: + index: issue5185 + ignore_unavailable: true + - do: + query.settings: + body: + transient: + plugins.calcite.enabled: false + +--- +"Issue 5185: eval with multiple dotted-path assignments from MAP column": + - skip: + features: + - headers + - allowed_warnings + - do: + headers: + Content-Type: 'application/json' + ppl: + body: + query: "source=issue5185 | spath input=doc | eval doc.user.name=doc.user.name, doc.user.age=doc.user.age | fields doc.user.name, doc.user.age" + + - match: { total: 2 } + - length: { datarows: 2 } + +--- +"Issue 5185: separate eval commands with dotted-path from MAP column": + - skip: + features: + - headers + - allowed_warnings + - do: + headers: + Content-Type: 'application/json' + ppl: + body: + query: "source=issue5185 | spath input=doc | eval doc.user.name=doc.user.name | eval doc.user.age=doc.user.age | fields doc.user.name, doc.user.age" + + - match: { total: 2 } + - length: { datarows: 2 } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java index 9967b10543e..879d48bc4de 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java @@ -123,6 +123,34 @@ public void testSpathAutoExtractModeWithFields() { + "FROM `scott`.`EMP`"); } + @Test + public void testSpathAutoExtractWithMultiFieldEval() { + // Issue #5185: eval with multiple dotted-path assignments from MAP column + // should not remove the MAP root field + withPPLQuery( + "source=EMP | spath input=ENAME" + + " | eval ENAME.user.name=ENAME.user.name, ENAME.user.age=ENAME.user.age" + + " | fields ENAME.user.name, ENAME.user.age") + .expectLogical( + "LogicalProject(ENAME.user.name=[ITEM(JSON_EXTRACT_ALL($1), 'user.name')]," + + " ENAME.user.age=[ITEM(JSON_EXTRACT_ALL($1), 'user.age')])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"); + } + + @Test + public void testSpathAutoExtractWithSeparateEvalCommands() { + // Issue #5185: separate eval commands with dotted-path assignments from MAP column + withPPLQuery( + "source=EMP | spath input=ENAME" + + " | eval ENAME.user.name=ENAME.user.name" + + " | eval ENAME.user.age=ENAME.user.age" + + " | fields ENAME.user.name, ENAME.user.age") + .expectLogical( + "LogicalProject(ENAME.user.name=[ITEM(JSON_EXTRACT_ALL($1), 'user.name')]," + + " ENAME.user.age=[ITEM(JSON_EXTRACT_ALL($1), 'user.age')])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"); + } + @Test public void testSpathAutoExtractModeWithSort() { withPPLQuery("source=EMP | spath input=ENAME output=result" + " | sort result.user.name")