From 5a5b674f2e2a8ea488e7f542c25f3d29f5860dec Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Mon, 13 Apr 2026 13:25:43 +0800 Subject: [PATCH] [BugFix] Fix NOT IN including null/missing rows due to missing exists filter (#5165) When PredicateAnalyzer handles a SEARCH expression with complemented points (NOT IN) and nullAs=UNKNOWN, the generated DSL query now includes an exists filter to exclude null/missing documents. This aligns with SQL three-valued logic where NULL NOT IN (...) evaluates to UNKNOWN, not TRUE. Signed-off-by: Songkan Tang --- .../remote/CalciteNotInNullFilterIT.java | 61 ++++++++++++++++++ .../rest-api-spec/test/issues/5165.yml | 62 +++++++++++++++++++ .../opensearch/request/PredicateAnalyzer.java | 9 ++- .../request/PredicateAnalyzerTest.java | 58 +++++++++++++++++ 4 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteNotInNullFilterIT.java create mode 100644 integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/5165.yml diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteNotInNullFilterIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteNotInNullFilterIT.java new file mode 100644 index 00000000000..267a750e0cc --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteNotInNullFilterIT.java @@ -0,0 +1,61 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_BANK_WITH_NULL_VALUES; +import static org.opensearch.sql.util.MatcherUtils.rows; +import static org.opensearch.sql.util.MatcherUtils.verifyDataRowsInOrder; + +import java.io.IOException; +import org.json.JSONObject; +import org.junit.Test; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +/** Integration test for NOT IN excluding null/missing rows (issue #5165). */ +public class CalciteNotInNullFilterIT extends PPLIntegTestCase { + + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + loadIndex(Index.BANK_WITH_NULL_VALUES); + } + + @Test + public void testNotInExcludesNullRows() throws IOException { + // age values: 32, 36, 28, 33, 36, null, 34 + // NOT IN (32, 28) should return 36, 33, 36, 34 — excluding the null row + JSONObject result = + executeQuery( + String.format( + "source=%s | where age NOT IN (32, 28) | fields age | sort age", + TEST_INDEX_BANK_WITH_NULL_VALUES)); + verifyDataRowsInOrder(result, rows(33), rows(34), rows(36), rows(36)); + } + + @Test + public void testNotInExcludesNullAndMissingRows() throws IOException { + // balance values: 39225, null, 32838, 4180, null, null, 48086 + // NOT IN (39225) should return 32838, 4180, 48086 — excluding null/missing rows + JSONObject result = + executeQuery( + String.format( + "source=%s | where balance NOT IN (39225) | fields balance | sort balance", + TEST_INDEX_BANK_WITH_NULL_VALUES)); + verifyDataRowsInOrder(result, rows(4180), rows(32838), rows(48086)); + } + + @Test + public void testInWithNullRowsIsUnaffected() throws IOException { + // IN should naturally exclude nulls (positive match never matches null) + JSONObject result = + executeQuery( + String.format( + "source=%s | where age IN (32, 28) | fields age | sort age", + TEST_INDEX_BANK_WITH_NULL_VALUES)); + verifyDataRowsInOrder(result, rows(28), rows(32)); + } +} diff --git a/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/5165.yml b/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/5165.yml new file mode 100644 index 00000000000..34cb9f24370 --- /dev/null +++ b/integ-test/src/yamlRestTest/resources/rest-api-spec/test/issues/5165.yml @@ -0,0 +1,62 @@ +setup: + - do: + query.settings: + body: + transient: + plugins.calcite.enabled: true + - do: + indices.create: + index: issue5165 + body: + settings: + number_of_shards: 1 + number_of_replicas: 0 + mappings: + properties: + int_field: + type: integer + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "issue5165", "_id": "1"}}' + - '{"int_field": 42}' + - '{"index": {"_index": "issue5165", "_id": "2"}}' + - '{"int_field": -1}' + - '{"index": {"_index": "issue5165", "_id": "3"}}' + - '{"int_field": 0}' + - '{"index": {"_index": "issue5165", "_id": "4"}}' + - '{"int_field": 2147483647}' + - '{"index": {"_index": "issue5165", "_id": "5"}}' + - '{"int_field": null}' + +--- +teardown: + - do: + indices.delete: + index: issue5165 + ignore_unavailable: true + - do: + query.settings: + body: + transient: + plugins.calcite.enabled: false + +--- +"Issue 5165: NOT IN should exclude null/missing rows": + - skip: + features: + - headers + - allowed_warnings + - do: + allowed_warnings: + - 'Loading the fielddata on the _id field is deprecated and will be removed in future versions. If you require sorting or aggregating on this field you should also include the id in the body of your documents, and map this field as a keyword field that has [doc_values] enabled' + headers: + Content-Type: 'application/json' + ppl: + body: + query: source=issue5165 | where int_field NOT IN (42, -1, 0) | fields int_field + + - match: { total: 1 } + - length: { datarows: 1 } + - match: { datarows: [ [ 2147483647 ] ] } diff --git a/opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java b/opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java index 8e6dbede58e..3f5295412b4 100644 --- a/opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java +++ b/opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java @@ -725,7 +725,14 @@ private QueryExpression binary(RexCall call) { CompoundQueryExpression.or( expression, QueryExpression.create(pair.getKey()).notExists()); // e.g. where a = 1 or a = 2 - case UNKNOWN -> expression; + // For NOT IN (complemented points), SQL three-valued logic dictates + // NULL NOT IN (...) evaluates to UNKNOWN (not TRUE), so null rows + // must be excluded via an exists filter. + case UNKNOWN -> + isSearchWithComplementedPoints(call) + ? CompoundQueryExpression.and( + false, expression, QueryExpression.create(pair.getKey()).exists()) + : expression; }; finalExpression.updateAnalyzedNodes(call); return finalExpression; diff --git a/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java b/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java index 572f748fd03..5db0272793b 100644 --- a/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java +++ b/opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java @@ -1245,4 +1245,62 @@ void search_complementedPointsWithNullAsFalse_generatesExistsAndNotInQuery() """, result.toString()); } + + @Test + void search_complementedPointsWithNullAsUnknown_generatesExistsAndNotInQuery() + throws ExpressionNotAnalyzableException { + // Simulates: a NOT IN (12, 13) + // Calcite represents this as SEARCH($0, Sarg[...; NULL AS UNKNOWN]) with complemented points + // SQL three-valued logic: NULL NOT IN (...) evaluates to UNKNOWN (not TRUE), + // so null rows must be excluded. + Sarg sarg = + Sarg.of( + RexUnknownAs.UNKNOWN, + ImmutableRangeSet.builder() + .add(Range.lessThan(BigDecimal.valueOf(12))) + .add(Range.open(BigDecimal.valueOf(12), BigDecimal.valueOf(13))) + .add(Range.greaterThan(BigDecimal.valueOf(13))) + .build()); + RexNode sargLiteral = + builder.makeSearchArgumentLiteral(sarg, typeFactory.createSqlType(SqlTypeName.DECIMAL)); + RexNode call = builder.makeCall(SqlStdOperatorTable.SEARCH, field1, sargLiteral); + QueryBuilder result = PredicateAnalyzer.analyze(call, schema, fieldTypes); + + assertInstanceOf(BoolQueryBuilder.class, result); + assertEquals( + """ + { + "bool" : { + "must" : [ + { + "bool" : { + "must_not" : [ + { + "terms" : { + "a" : [ + 12.0, + 13.0 + ], + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "exists" : { + "field" : "a", + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }\ + """, + result.toString()); + } }