diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java index e415b3eba13..78510632b96 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java @@ -116,6 +116,7 @@ CalciteNoMvCommandIT.class, CalciteMvExpandCommandIT.class, CalcitePPLGraphLookupIT.class, + CalciteMixedFieldTypeIT.class, }) public class CalciteNoPushdownIT { private static boolean wasPushdownEnabled; diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMixedFieldTypeIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMixedFieldTypeIT.java new file mode 100644 index 00000000000..84c53738ae3 --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMixedFieldTypeIT.java @@ -0,0 +1,110 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.util.MatcherUtils.rows; +import static org.opensearch.sql.util.MatcherUtils.schema; +import static org.opensearch.sql.util.MatcherUtils.verifyDataRows; +import static org.opensearch.sql.util.MatcherUtils.verifySchema; +import static org.opensearch.sql.util.TestUtils.createIndexByRestClient; +import static org.opensearch.sql.util.TestUtils.isIndexExist; +import static org.opensearch.sql.util.TestUtils.performRequest; + +import java.io.IOException; +import org.json.JSONObject; +import org.junit.jupiter.api.Test; +import org.opensearch.client.Request; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +/** + * Integration tests for querying wildcard indices where a field has conflicting types (e.g., text + * vs keyword) across different indices. See GitHub issue #4659. + */ +public class CalciteMixedFieldTypeIT extends PPLIntegTestCase { + + private static final String LOG_TEXT_INDEX = "test_log_text_4659"; + private static final String LOG_KEYWORD_INDEX = "test_log_keyword_4659"; + + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + createTestIndices(); + } + + private void createTestIndices() throws IOException { + // Create index with msg as text type + if (!isIndexExist(client(), LOG_TEXT_INDEX)) { + String textMapping = + "{\"mappings\":{\"properties\":{\"msg\":{\"type\":\"text\"}," + + "\"idx\":{\"type\":\"integer\"}}}}"; + createIndexByRestClient(client(), LOG_TEXT_INDEX, textMapping); + Request bulkReq = new Request("POST", "/" + LOG_TEXT_INDEX + "/_bulk?refresh=true"); + bulkReq.setJsonEntity( + "{\"index\":{\"_id\":\"1\"}}\n" + "{\"msg\":\"status=200\",\"idx\":1}\n"); + performRequest(client(), bulkReq); + } + + // Create index with msg as keyword type + if (!isIndexExist(client(), LOG_KEYWORD_INDEX)) { + String keywordMapping = + "{\"mappings\":{\"properties\":{\"msg\":{\"type\":\"keyword\"}," + + "\"idx\":{\"type\":\"integer\"}}}}"; + createIndexByRestClient(client(), LOG_KEYWORD_INDEX, keywordMapping); + Request bulkReq = new Request("POST", "/" + LOG_KEYWORD_INDEX + "/_bulk?refresh=true"); + bulkReq.setJsonEntity( + "{\"index\":{\"_id\":\"1\"}}\n" + "{\"msg\":\"status=200\",\"idx\":2}\n"); + performRequest(client(), bulkReq); + } + } + + @Test + public void testWildcardQueryWithMixedTextAndKeywordField() throws IOException { + // Query using wildcard index pattern that matches both indices + // Both documents should be returned regardless of field type conflict + JSONObject result = executeQuery("source=test_log_*_4659 | fields msg, idx | sort idx"); + verifySchema(result, schema("msg", "string"), schema("idx", "int")); + verifyDataRows(result, rows("status=200", 1), rows("status=200", 2)); + } + + @Test + public void testWildcardQueryWithEvalOnMixedField() throws IOException { + // Eval uses the Calcite script engine to compute expression on each shard. + // When the merged type is keyword, DOC_VALUE is used, but text shards have no doc_values + // which returns null and causes the eval to produce null for the text-typed shard. + JSONObject result = + executeQuery( + "source=test_log_*_4659 | eval upper_msg = upper(msg) | fields idx, upper_msg" + + " | sort idx"); + verifySchema(result, schema("idx", "int"), schema("upper_msg", "string")); + verifyDataRows(result, rows(1, "STATUS=200"), rows(2, "STATUS=200")); + } + + @Test + public void testWildcardQueryWithScriptFilterOnMixedField() throws IOException { + // Script-based filter pushed down to each shard uses DOC_VALUE retrieval. + // When the merged type is keyword but the field is text on some shards, + // doc_values are not available, causing shard failures and missing results. + JSONObject result = + executeQuery( + "source=test_log_*_4659 | where upper(msg) = 'STATUS=200' | fields msg, idx" + + " | sort idx"); + verifySchema(result, schema("msg", "string"), schema("idx", "int")); + verifyDataRows(result, rows("status=200", 1), rows("status=200", 2)); + } + + @Test + public void testWildcardQueryWithRexOnMixedField() throws IOException { + // Rex command on the conflicting field should work across both indices + JSONObject result = + executeQuery( + "source=test_log_*_4659 | rex field=msg 'status=(?\\\\d+)'" + + " | fields msg, idx, statusCode | sort idx"); + verifySchema( + result, schema("msg", "string"), schema("idx", "int"), schema("statusCode", "string")); + verifyDataRows(result, rows("status=200", 1, "200"), rows("status=200", 2, "200")); + } +} diff --git a/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/4659.yml b/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/4659.yml new file mode 100644 index 00000000000..f111f0175f9 --- /dev/null +++ b/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/4659.yml @@ -0,0 +1,90 @@ +setup: + - do: + query.settings: + body: + transient: + plugins.calcite.enabled : true + - do: + indices.create: + index: log_text_4659 + body: + mappings: + properties: + msg: + type: text + idx: + type: integer + - do: + indices.create: + index: log_keyword_4659 + body: + mappings: + properties: + msg: + type: keyword + idx: + type: integer + - do: + bulk: + index: log_text_4659 + refresh: true + body: + - '{"index": {"_id": "1"}}' + - '{"msg": "status=200", "idx": 1}' + - do: + bulk: + index: log_keyword_4659 + refresh: true + body: + - '{"index": {"_id": "1"}}' + - '{"msg": "status=200", "idx": 2}' + +--- +teardown: + - do: + query.settings: + body: + transient: + plugins.calcite.enabled : false + - do: + indices.delete: + index: log_text_4659 + ignore: 404 + - do: + indices.delete: + index: log_keyword_4659 + ignore: 404 + +--- +"PPL wildcard query returns all documents across indices with mixed text/keyword field types": + - skip: + features: + - headers + - allowed_warnings + - do: + allowed_warnings: + - 'Loading the fielddata on the _id field is deprecated and will be removed in future versions. If you require sorting or aggregating on this field you should also include the id in the body of your documents, and map this field as a keyword field that has [doc_values] enabled' + headers: + Content-Type: 'application/json' + ppl: + body: + query: 'source=log_*_4659 | fields msg, idx | sort idx' + - match: {"total": 2} + - match: {"datarows": [["status=200", 1], ["status=200", 2]]} + +--- +"PPL script filter works across indices with mixed text/keyword field types": + - skip: + features: + - headers + - allowed_warnings + - do: + allowed_warnings: + - 'Loading the fielddata on the _id field is deprecated and will be removed in future versions. If you require sorting or aggregating on this field you should also include the id in the body of your documents, and map this field as a keyword field that has [doc_values] enabled' + headers: + Content-Type: 'application/json' + ppl: + body: + query: "source=log_*_4659 | where upper(msg) = 'STATUS=200' | fields msg, idx | sort idx" + - match: {"total": 2} + - match: {"datarows": [["status=200", 1], ["status=200", 2]]} diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/util/MergeRules/MergeRuleHelper.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/util/MergeRules/MergeRuleHelper.java index b2b851adec7..6cc6f1803a7 100644 --- a/opensearch/src/main/java/org/opensearch/sql/opensearch/util/MergeRules/MergeRuleHelper.java +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/util/MergeRules/MergeRuleHelper.java @@ -12,7 +12,7 @@ public class MergeRuleHelper { private static final List RULES = List.of( - new DeepMergeRule(), new LatestRule() // must come last + new DeepMergeRule(), new TextKeywordConflictRule(), new LatestRule() // must come last ); public static MergeRule selectRule(OpenSearchDataType source, OpenSearchDataType target) { diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/util/MergeRules/TextKeywordConflictRule.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/util/MergeRules/TextKeywordConflictRule.java new file mode 100644 index 00000000000..04d98e1a60f --- /dev/null +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/util/MergeRules/TextKeywordConflictRule.java @@ -0,0 +1,72 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.util.MergeRules; + +import java.util.Map; +import org.opensearch.sql.opensearch.data.type.OpenSearchDataType; +import org.opensearch.sql.opensearch.data.type.OpenSearchDataType.MappingType; +import org.opensearch.sql.opensearch.data.type.OpenSearchTextType; + +/** + * Merge rule for text/keyword type conflicts across indices. When a field is text in one index and + * keyword in another, or text-with-keyword-subfield in one and text-without in another, we merge to + * text WITHOUT keyword subfields. This forces _source retrieval instead of doc_values, which works + * universally across all shards regardless of the actual field type. + * + *

See GitHub issue #4659. + */ +public class TextKeywordConflictRule implements MergeRule { + + @Override + public boolean isMatch(OpenSearchDataType source, OpenSearchDataType target) { + if (source == null || target == null) { + return false; + } + MappingType sourceMapping = source.getMappingType(); + MappingType targetMapping = target.getMappingType(); + if (sourceMapping == null || targetMapping == null) { + return false; + } + // Match when one is text and the other is keyword + if (isTextLike(sourceMapping) && isKeyword(targetMapping)) { + return true; + } + if (isKeyword(sourceMapping) && isTextLike(targetMapping)) { + return true; + } + // Match when both are text but one has keyword subfields and the other does not + if (isTextLike(sourceMapping) && isTextLike(targetMapping)) { + boolean sourceHasKeywordSub = hasKeywordSubField(source); + boolean targetHasKeywordSub = hasKeywordSubField(target); + return sourceHasKeywordSub != targetHasKeywordSub; + } + return false; + } + + @Override + public void mergeInto( + String key, OpenSearchDataType source, Map target) { + // Always merge to text WITHOUT keyword subfields. + // This forces _source retrieval, which works for both text and keyword fields. + target.put(key, OpenSearchTextType.of()); + } + + private static boolean isTextLike(MappingType mappingType) { + return mappingType == MappingType.Text || mappingType == MappingType.MatchOnlyText; + } + + private static boolean isKeyword(MappingType mappingType) { + return mappingType == MappingType.Keyword; + } + + private static boolean hasKeywordSubField(OpenSearchDataType type) { + if (type instanceof OpenSearchTextType textType) { + return textType.getFields().values().stream() + .anyMatch(f -> f.getMappingType() == MappingType.Keyword); + } + return false; + } +} diff --git a/opensearch/src/test/java/org/opensearch/sql/opensearch/util/MergeRules/TextKeywordConflictRuleTest.java b/opensearch/src/test/java/org/opensearch/sql/opensearch/util/MergeRules/TextKeywordConflictRuleTest.java new file mode 100644 index 00000000000..22b1b36ca91 --- /dev/null +++ b/opensearch/src/test/java/org/opensearch/sql/opensearch/util/MergeRules/TextKeywordConflictRuleTest.java @@ -0,0 +1,146 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.opensearch.util.MergeRules; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.HashMap; +import java.util.Map; +import org.junit.jupiter.api.Test; +import org.opensearch.sql.opensearch.data.type.OpenSearchDataType; +import org.opensearch.sql.opensearch.data.type.OpenSearchDataType.MappingType; +import org.opensearch.sql.opensearch.data.type.OpenSearchTextType; + +class TextKeywordConflictRuleTest { + + private final TextKeywordConflictRule rule = new TextKeywordConflictRule(); + + @Test + void testMatchTextAndKeyword() { + OpenSearchDataType text = OpenSearchDataType.of(MappingType.Text); + OpenSearchDataType keyword = OpenSearchDataType.of(MappingType.Keyword); + assertTrue(rule.isMatch(text, keyword)); + assertTrue(rule.isMatch(keyword, text)); + } + + @Test + void testMatchMatchOnlyTextAndKeyword() { + OpenSearchDataType matchOnlyText = OpenSearchDataType.of(MappingType.MatchOnlyText); + OpenSearchDataType keyword = OpenSearchDataType.of(MappingType.Keyword); + assertTrue(rule.isMatch(matchOnlyText, keyword)); + assertTrue(rule.isMatch(keyword, matchOnlyText)); + } + + @Test + void testMatchTextWithKeywordSubfieldAndTextWithout() { + OpenSearchTextType textWithKeyword = + OpenSearchTextType.of(Map.of("keyword", OpenSearchDataType.of(MappingType.Keyword))); + OpenSearchTextType textWithout = OpenSearchTextType.of(); + assertTrue(rule.isMatch(textWithKeyword, textWithout)); + assertTrue(rule.isMatch(textWithout, textWithKeyword)); + } + + @Test + void testNoMatchSameTextWithoutSubfields() { + OpenSearchTextType text1 = OpenSearchTextType.of(); + OpenSearchTextType text2 = OpenSearchTextType.of(); + assertFalse(rule.isMatch(text1, text2)); + } + + @Test + void testNoMatchBothTextWithKeywordSubfields() { + OpenSearchTextType textWithKeyword1 = + OpenSearchTextType.of(Map.of("keyword", OpenSearchDataType.of(MappingType.Keyword))); + OpenSearchTextType textWithKeyword2 = + OpenSearchTextType.of(Map.of("keyword", OpenSearchDataType.of(MappingType.Keyword))); + assertFalse(rule.isMatch(textWithKeyword1, textWithKeyword2)); + } + + @Test + void testNoMatchKeywordAndKeyword() { + OpenSearchDataType keyword1 = OpenSearchDataType.of(MappingType.Keyword); + OpenSearchDataType keyword2 = OpenSearchDataType.of(MappingType.Keyword); + assertFalse(rule.isMatch(keyword1, keyword2)); + } + + @Test + void testNoMatchIntegerAndKeyword() { + OpenSearchDataType integer = OpenSearchDataType.of(MappingType.Integer); + OpenSearchDataType keyword = OpenSearchDataType.of(MappingType.Keyword); + assertFalse(rule.isMatch(integer, keyword)); + } + + @Test + void testNoMatchNullSource() { + OpenSearchDataType keyword = OpenSearchDataType.of(MappingType.Keyword); + assertFalse(rule.isMatch(null, keyword)); + } + + @Test + void testNoMatchNullTarget() { + OpenSearchDataType text = OpenSearchDataType.of(MappingType.Text); + assertFalse(rule.isMatch(text, null)); + } + + @Test + void testMergeProducesTextWithoutKeywordSubfields() { + OpenSearchDataType keyword = OpenSearchDataType.of(MappingType.Keyword); + Map target = new HashMap<>(); + target.put("msg", keyword); + + OpenSearchDataType text = OpenSearchDataType.of(MappingType.Text); + rule.mergeInto("msg", text, target); + + OpenSearchDataType merged = target.get("msg"); + assertInstanceOf(OpenSearchTextType.class, merged); + OpenSearchTextType mergedText = (OpenSearchTextType) merged; + assertTrue(mergedText.getFields().isEmpty(), "Merged type should have no keyword subfields"); + } + + @Test + void testMergeHelperIntegration() { + // Simulate merging two index mappings with conflicting text/keyword types + Map target = new HashMap<>(); + target.put("msg", OpenSearchDataType.of(MappingType.Keyword)); + target.put("idx", OpenSearchDataType.of(MappingType.Integer)); + + Map source = new HashMap<>(); + source.put("msg", OpenSearchDataType.of(MappingType.Text)); + source.put("idx", OpenSearchDataType.of(MappingType.Integer)); + + MergeRuleHelper.merge(target, source); + + // msg should be merged to text without keyword subfields + assertInstanceOf(OpenSearchTextType.class, target.get("msg")); + OpenSearchTextType mergedText = (OpenSearchTextType) target.get("msg"); + assertTrue(mergedText.getFields().isEmpty()); + + // idx should remain integer (same type in both, LatestRule applies) + assertEquals(MappingType.Integer, target.get("idx").getMappingType()); + } + + @Test + void testToKeywordSubFieldReturnsNullForMergedType() { + // After merging text and keyword, toKeywordSubField should return null, + // forcing SOURCE retrieval instead of DOC_VALUE + Map target = new HashMap<>(); + target.put("msg", OpenSearchDataType.of(MappingType.Keyword)); + + Map source = new HashMap<>(); + source.put("msg", OpenSearchDataType.of(MappingType.Text)); + + MergeRuleHelper.merge(target, source); + + OpenSearchDataType mergedType = target.get("msg"); + String result = OpenSearchTextType.toKeywordSubField("msg", mergedType.getExprType()); + // Should return null because the merged text type has no keyword subfield + assertNull(result); + } +}