Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
CalciteNoMvCommandIT.class,
CalciteMvExpandCommandIT.class,
CalcitePPLGraphLookupIT.class,
CalciteMixedFieldTypeIT.class,
})
public class CalciteNoPushdownIT {
private static boolean wasPushdownEnabled;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.calcite.remote;

import static org.opensearch.sql.util.MatcherUtils.rows;
import static org.opensearch.sql.util.MatcherUtils.schema;
import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
import static org.opensearch.sql.util.MatcherUtils.verifySchema;
import static org.opensearch.sql.util.TestUtils.createIndexByRestClient;
import static org.opensearch.sql.util.TestUtils.isIndexExist;
import static org.opensearch.sql.util.TestUtils.performRequest;

import java.io.IOException;
import org.json.JSONObject;
import org.junit.jupiter.api.Test;
import org.opensearch.client.Request;
import org.opensearch.sql.ppl.PPLIntegTestCase;

/**
* Integration tests for querying wildcard indices where a field has conflicting types (e.g., text
* vs keyword) across different indices. See GitHub issue #4659.
*/
public class CalciteMixedFieldTypeIT extends PPLIntegTestCase {

private static final String LOG_TEXT_INDEX = "test_log_text_4659";
private static final String LOG_KEYWORD_INDEX = "test_log_keyword_4659";

@Override
public void init() throws Exception {
super.init();
enableCalcite();
createTestIndices();
}

private void createTestIndices() throws IOException {
// Create index with msg as text type
if (!isIndexExist(client(), LOG_TEXT_INDEX)) {
String textMapping =
"{\"mappings\":{\"properties\":{\"msg\":{\"type\":\"text\"},"
+ "\"idx\":{\"type\":\"integer\"}}}}";
createIndexByRestClient(client(), LOG_TEXT_INDEX, textMapping);
Request bulkReq = new Request("POST", "/" + LOG_TEXT_INDEX + "/_bulk?refresh=true");
bulkReq.setJsonEntity(
"{\"index\":{\"_id\":\"1\"}}\n" + "{\"msg\":\"status=200\",\"idx\":1}\n");
performRequest(client(), bulkReq);
}

// Create index with msg as keyword type
if (!isIndexExist(client(), LOG_KEYWORD_INDEX)) {
String keywordMapping =
"{\"mappings\":{\"properties\":{\"msg\":{\"type\":\"keyword\"},"
+ "\"idx\":{\"type\":\"integer\"}}}}";
createIndexByRestClient(client(), LOG_KEYWORD_INDEX, keywordMapping);
Request bulkReq = new Request("POST", "/" + LOG_KEYWORD_INDEX + "/_bulk?refresh=true");
bulkReq.setJsonEntity(
"{\"index\":{\"_id\":\"1\"}}\n" + "{\"msg\":\"status=200\",\"idx\":2}\n");
performRequest(client(), bulkReq);
}
}

@Test
public void testWildcardQueryWithMixedTextAndKeywordField() throws IOException {
// Query using wildcard index pattern that matches both indices
// Both documents should be returned regardless of field type conflict
JSONObject result = executeQuery("source=test_log_*_4659 | fields msg, idx | sort idx");
verifySchema(result, schema("msg", "string"), schema("idx", "int"));
verifyDataRows(result, rows("status=200", 1), rows("status=200", 2));
}

@Test
public void testWildcardQueryWithEvalOnMixedField() throws IOException {
// Eval uses the Calcite script engine to compute expression on each shard.
// When the merged type is keyword, DOC_VALUE is used, but text shards have no doc_values
// which returns null and causes the eval to produce null for the text-typed shard.
JSONObject result =
executeQuery(
"source=test_log_*_4659 | eval upper_msg = upper(msg) | fields idx, upper_msg"
+ " | sort idx");
verifySchema(result, schema("idx", "int"), schema("upper_msg", "string"));
verifyDataRows(result, rows(1, "STATUS=200"), rows(2, "STATUS=200"));
}

@Test
public void testWildcardQueryWithScriptFilterOnMixedField() throws IOException {
// Script-based filter pushed down to each shard uses DOC_VALUE retrieval.
// When the merged type is keyword but the field is text on some shards,
// doc_values are not available, causing shard failures and missing results.
JSONObject result =
executeQuery(
"source=test_log_*_4659 | where upper(msg) = 'STATUS=200' | fields msg, idx"
+ " | sort idx");
verifySchema(result, schema("msg", "string"), schema("idx", "int"));
verifyDataRows(result, rows("status=200", 1), rows("status=200", 2));
}

@Test
public void testWildcardQueryWithRexOnMixedField() throws IOException {
// Rex command on the conflicting field should work across both indices
JSONObject result =
executeQuery(
"source=test_log_*_4659 | rex field=msg 'status=(?<statusCode>\\\\d+)'"
+ " | fields msg, idx, statusCode | sort idx");
verifySchema(
result, schema("msg", "string"), schema("idx", "int"), schema("statusCode", "string"));
verifyDataRows(result, rows("status=200", 1, "200"), rows("status=200", 2, "200"));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
setup:
- do:
query.settings:
body:
transient:
plugins.calcite.enabled : true
- do:
indices.create:
index: log_text_4659
body:
mappings:
properties:
msg:
type: text
idx:
type: integer
- do:
indices.create:
index: log_keyword_4659
body:
mappings:
properties:
msg:
type: keyword
idx:
type: integer
- do:
bulk:
index: log_text_4659
refresh: true
body:
- '{"index": {"_id": "1"}}'
- '{"msg": "status=200", "idx": 1}'
- do:
bulk:
index: log_keyword_4659
refresh: true
body:
- '{"index": {"_id": "1"}}'
- '{"msg": "status=200", "idx": 2}'

---
teardown:
- do:
query.settings:
body:
transient:
plugins.calcite.enabled : false
- do:
indices.delete:
index: log_text_4659
ignore: 404
- do:
indices.delete:
index: log_keyword_4659
ignore: 404

---
"PPL wildcard query returns all documents across indices with mixed text/keyword field types":
- skip:
features:
- headers
- allowed_warnings
- do:
allowed_warnings:
- 'Loading the fielddata on the _id field is deprecated and will be removed in future versions. If you require sorting or aggregating on this field you should also include the id in the body of your documents, and map this field as a keyword field that has [doc_values] enabled'
headers:
Content-Type: 'application/json'
ppl:
body:
query: 'source=log_*_4659 | fields msg, idx | sort idx'
- match: {"total": 2}
- match: {"datarows": [["status=200", 1], ["status=200", 2]]}

---
"PPL script filter works across indices with mixed text/keyword field types":
- skip:
features:
- headers
- allowed_warnings
- do:
allowed_warnings:
- 'Loading the fielddata on the _id field is deprecated and will be removed in future versions. If you require sorting or aggregating on this field you should also include the id in the body of your documents, and map this field as a keyword field that has [doc_values] enabled'
headers:
Content-Type: 'application/json'
ppl:
body:
query: "source=log_*_4659 | where upper(msg) = 'STATUS=200' | fields msg, idx | sort idx"
- match: {"total": 2}
- match: {"datarows": [["status=200", 1], ["status=200", 2]]}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
public class MergeRuleHelper {
private static final List<MergeRule> RULES =
List.of(
new DeepMergeRule(), new LatestRule() // must come last
new DeepMergeRule(), new TextKeywordConflictRule(), new LatestRule() // must come last
);

public static MergeRule selectRule(OpenSearchDataType source, OpenSearchDataType target) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.opensearch.util.MergeRules;

import java.util.Map;
import org.opensearch.sql.opensearch.data.type.OpenSearchDataType;
import org.opensearch.sql.opensearch.data.type.OpenSearchDataType.MappingType;
import org.opensearch.sql.opensearch.data.type.OpenSearchTextType;

/**
* Merge rule for text/keyword type conflicts across indices. When a field is text in one index and
* keyword in another, or text-with-keyword-subfield in one and text-without in another, we merge to
* text WITHOUT keyword subfields. This forces _source retrieval instead of doc_values, which works
* universally across all shards regardless of the actual field type.
*
* <p>See GitHub issue #4659.
*/
public class TextKeywordConflictRule implements MergeRule {

@Override
public boolean isMatch(OpenSearchDataType source, OpenSearchDataType target) {
if (source == null || target == null) {
return false;
}
MappingType sourceMapping = source.getMappingType();
MappingType targetMapping = target.getMappingType();
if (sourceMapping == null || targetMapping == null) {
return false;
}
// Match when one is text and the other is keyword
if (isTextLike(sourceMapping) && isKeyword(targetMapping)) {

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a field is text but has keyword subfield, and the other is keyword, should they match and use text only?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes — this is intentional. Consider the scenario:

  • Index A: field msg is text with a keyword subfield (msg.keyword)
  • Index B: field msg is keyword

If the text-with-keyword-subfield wins the merge, toKeywordSubField() returns msg.keyword, leading to DOC_VALUE retrieval on the subfield path msg.keyword. But on Index B's shards, msg is just a keyword field — there is no msg.keyword subfield. DOC_VALUE retrieval for msg.keyword would fail on those shards, producing the same silent data loss as the original bug.

By matching this case and merging to plain text (without keyword subfields), we force _source retrieval, which works universally.

The existing test testMatchTextAndKeyword (line 25) covers the pure text-vs-keyword case, and this isTextLike check intentionally covers text-with-keyword-subfield-vs-keyword as well.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Curios that could this case be improved to a DSL with full DOC_VALUE retrieval?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curios that could this case be improved to a DSL with full DOC_VALUE retrieval?

@LantaoJin Text only field doesn't have doc value.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My question is for If a field is text but has keyword subfield, and the other is keyword

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline, the current implementation is sufficient for bugfix. Supporting above enhancement requires additional investigation.

return true;
}
if (isKeyword(sourceMapping) && isTextLike(targetMapping)) {

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same reasoning as above — the reverse direction (keyword source, text-like target) also needs to match to ensure we always merge to plain text without keyword subfields.

return true;
}
// Match when both are text but one has keyword subfields and the other does not
if (isTextLike(sourceMapping) && isTextLike(targetMapping)) {
boolean sourceHasKeywordSub = hasKeywordSubField(source);
boolean targetHasKeywordSub = hasKeywordSubField(target);
return sourceHasKeywordSub != targetHasKeywordSub;
}
return false;
}

@Override
public void mergeInto(
String key, OpenSearchDataType source, Map<String, OpenSearchDataType> target) {
// Always merge to text WITHOUT keyword subfields.
// This forces _source retrieval, which works for both text and keyword fields.
target.put(key, OpenSearchTextType.of());
}

private static boolean isTextLike(MappingType mappingType) {
return mappingType == MappingType.Text || mappingType == MappingType.MatchOnlyText;
}

private static boolean isKeyword(MappingType mappingType) {
return mappingType == MappingType.Keyword;
}

private static boolean hasKeywordSubField(OpenSearchDataType type) {
if (type instanceof OpenSearchTextType textType) {
return textType.getFields().values().stream()
.anyMatch(f -> f.getMappingType() == MappingType.Keyword);
}
return false;
}
}
Loading
Loading