fix tests

RyanL1997 · RyanL1997 · commit 2ddff58d72b5 · 2025-11-21T00:24:46.000-08:00
Signed-off-by: Jialiang Liang <jiallian@amazon.com>
diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java
@@ -183,8 +183,8 @@ public RelNode analyze(UnresolvedPlan unresolved, CalcitePlanContext context) {
       context.enableFilterAccumulation();
       try {
         unresolved.accept(this, context);
-        context.flushFilterConditions(); // Flush accumulated conditions before returning
-        return context.relBuilder.peek(); // Get the result after flushing
+        context.flushFilterConditions();
+        return context.relBuilder.peek();
       } finally {
         context.disableFilterAccumulation();
       }
@@ -193,6 +193,17 @@ public RelNode analyze(UnresolvedPlan unresolved, CalcitePlanContext context) {
     }
   }
 
+  /**
+   * Flushes accumulated filter conditions before a schema-changing operation; does nothing when
+   * filter accumulation is disabled or no conditions are pending. Flushing early prevents
+   * RexInputRef index mismatches from filters that reference field indices of the old schema.
+   */
+  private void flushFiltersBeforeSchemaChange(CalcitePlanContext context) {
+    if (context.isFilterAccumulationEnabled() && context.hasPendingFilterConditions()) {
+      context.flushFilterConditions();
+    }
+  }
+
   @Override
   public RelNode visitRelation(Relation node, CalcitePlanContext context) {
     DataSourceSchemaIdentifierNameResolver nameResolver =
@@ -404,10 +415,7 @@ private boolean containsSubqueryExpression(Node expr) {
   public RelNode visitProject(Project node, CalcitePlanContext context) {
     visitChildren(node, context);
 
-    // Flush accumulated filter conditions before schema-changing operations
-    if (context.isFilterAccumulationEnabled() && context.hasPendingFilterConditions()) {
-      context.flushFilterConditions();
-    }
+    flushFiltersBeforeSchemaChange(context);
 
     if (isSingleAllFieldsProject(node)) {
       return handleAllFieldsProject(node, context);
@@ -883,6 +891,9 @@ public RelNode visitPatterns(Patterns node, CalcitePlanContext context) {
   @Override
   public RelNode visitEval(Eval node, CalcitePlanContext context) {
     visitChildren(node, context);
+
+    flushFiltersBeforeSchemaChange(context);
+
     node.getExpressionList()
         .forEach(
             expr -> {
@@ -1152,6 +1163,9 @@ private Pair<List<RexNode>, List<AggCall>> resolveAttributesForAggregation(
   /** Visits an aggregation for stats command */
   @Override
   public RelNode visitAggregation(Aggregation node, CalcitePlanContext context) {
+    // Flush accumulated filter conditions before schema-changing aggregation operations
+    flushFiltersBeforeSchemaChange(context);
+
     Argument.ArgumentMap statsArgs = Argument.ArgumentMap.of(node.getArgExprList());
     Boolean bucketNullable =
         (Boolean) statsArgs.getOrDefault(Argument.BUCKET_NULLABLE, Literal.TRUE).getValue();
@@ -2252,10 +2266,26 @@ private RelNode mergeTableAndResolveColumnConflict(
   @Override
   public RelNode visitMultisearch(Multisearch node, CalcitePlanContext context) {
     List<RelNode> subsearchNodes = new ArrayList<>();
+    // Save the current filter accumulation state - we'll process each subsearch independently
+    boolean wasFilterAccumulationEnabled = context.isFilterAccumulationEnabled();
+
     for (UnresolvedPlan subsearch : node.getSubsearches()) {
       UnresolvedPlan prunedSubSearch = subsearch.accept(new EmptySourcePropagateVisitor(), null);
-      prunedSubSearch.accept(this, context);
+
+      // Temporarily disable filter accumulation so each subsearch gets its own independent
+      // lifecycle via analyze(). This prevents filter state from bleeding across branches.
+      if (wasFilterAccumulationEnabled) {
+        context.disableFilterAccumulation();
+      }
+
+      // Use analyze() to let each subsearch determine its own filter accumulation needs
+      analyze(prunedSubSearch, context);
       subsearchNodes.add(context.relBuilder.build());
+
+      // Restore filter accumulation state for the next iteration
+      if (wasFilterAccumulationEnabled) {
+        context.enableFilterAccumulation();
+      }
     }
 
     // Use shared schema merging logic that handles type conflicts via field renaming
@@ -3271,8 +3301,12 @@ private RexNode createOptimizedTransliteration(
    * RelNodes. This is used to detect queries with multiple regex/filter operations that could cause
    * deep Filter RelNode chains and memory exhaustion.
    *
+   * <p>Stops counting at schema-changing operations (Aggregation, Eval, Window, StreamWindow) to
+   * avoid enabling filter accumulation across schema boundaries, which would cause RexInputRef
+   * index mismatches.
+   *
    * @param plan the UnresolvedPlan to analyze
-   * @return the count of filtering operations found
+   * @return the count of filtering operations found before the first schema-changing operation
    */
   private int countFilteringOperations(UnresolvedPlan plan) {
     if (plan == null) {
@@ -3282,8 +3316,25 @@ private int countFilteringOperations(UnresolvedPlan plan) {
     int count = 0;
 
     // Count this node if it's a filtering operation
-    if (plan instanceof Regex || plan instanceof Filter) {
+    // BUT: Don't count Filter nodes that contain function calls, as they can cause
+    // type mismatches when accumulated and flushed later
+    if (plan instanceof Regex) {
       count = 1;
+    } else if (plan instanceof Filter) {
+      Filter filterNode = (Filter) plan;
+      if (!containsFunctionCall(filterNode.getCondition())) {
+        count = 1;
+      }
+    }
+
+    // Stop counting at schema-changing operations to prevent accumulation across schema boundaries
+    // The boundaries checked here are exactly Aggregation, Eval, Window and StreamWindow; Project
+    // is not part of this check.
+    if (plan instanceof Aggregation
+        || plan instanceof Eval
+        || plan instanceof Window
+        || plan instanceof StreamWindow) {
+      return count; // Don't recurse into children beyond schema changes
     }
 
     // Recursively count filtering operations in children
@@ -3297,4 +3348,29 @@ private int countFilteringOperations(UnresolvedPlan plan) {
 
     return count;
   }
+
+  /**
+   * Returns true if {@code expr} is a {@code Function} or has one among its descendant
+   * expressions; returns false for null. Filter expressions with function calls can cause type
+   * mismatches when accumulated and flushed later, so we exclude them from filter accumulation.
+   */
+  private boolean containsFunctionCall(UnresolvedExpression expr) {
+    if (expr == null) {
+      return false;
+    }
+
+    if (expr instanceof org.opensearch.sql.ast.expression.Function) {
+      return true;
+    }
+
+    // Recurse into child expressions only; children that are not expressions are skipped
+    for (Node child : expr.getChild()) {
+      if (child instanceof UnresolvedExpression
+          && containsFunctionCall((UnresolvedExpression) child)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
 }
diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_filter_push.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_filter_push.yaml
@@ -2,9 +2,7 @@ calcite:
   logical: |
     LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
       LogicalProject(age=[$8])
-        LogicalFilter(condition=[>($3, 10000)])
-          LogicalFilter(condition=[<($8, 40)])
-            LogicalFilter(condition=[>($8, 30)])
-              CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
+        LogicalFilter(condition=[AND(SEARCH($8, Sarg[(30..40)]), >($3, 10000))])
+          CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
   physical: |
     CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[balance, age], FILTER->AND(SEARCH($1, Sarg[(30..40)]), >($0, 10000)), PROJECT->[age], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"bool":{"must":[{"range":{"age":{"from":30.0,"to":40.0,"include_lower":false,"include_upper":false,"boost":1.0}}},{"range":{"balance":{"from":10000,"to":null,"include_lower":false,"include_upper":true,"boost":1.0}}}],"adjust_pure_negative":true,"boost":1.0}},"_source":{"includes":["age"],"excludes":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])
diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_filter_push_compare_date_string.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_filter_push_compare_date_string.yaml
@@ -1,9 +1,8 @@
 calcite:
   logical: |
     LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
-      LogicalFilter(condition=[<($0, DATE('2018-11-09 00:00:00.000000000':VARCHAR))])
-        LogicalFilter(condition=[>($0, DATE('2016-12-08 00:00:00.123456789':VARCHAR))])
-          LogicalProject(yyyy-MM-dd=[$83])
-            CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_date_formats]])
+      LogicalFilter(condition=[AND(>($0, DATE('2016-12-08 00:00:00.123456789':VARCHAR)), <($0, DATE('2018-11-09 00:00:00.000000000':VARCHAR)))])
+        LogicalProject(yyyy-MM-dd=[$83])
+          CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_date_formats]])
   physical: |
     CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_date_formats]], PushDownContext=[[PROJECT->[yyyy-MM-dd], FILTER->SEARCH($0, Sarg[('2016-12-08':VARCHAR..'2018-11-09':VARCHAR)]:VARCHAR), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"range":{"yyyy-MM-dd":{"from":"2016-12-08","to":"2018-11-09","include_lower":false,"include_upper":false,"boost":1.0}}},"_source":{"includes":["yyyy-MM-dd"],"excludes":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])
diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_filter_push_compare_timestamp_string.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_filter_push_compare_timestamp_string.yaml
@@ -2,8 +2,7 @@ calcite:
   logical: |
     LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
       LogicalProject(account_number=[$0], firstname=[$1], address=[$2], birthdate=[$3], gender=[$4], city=[$5], lastname=[$6], balance=[$7], employer=[$8], state=[$9], age=[$10], email=[$11], male=[$12])
-        LogicalFilter(condition=[<($3, TIMESTAMP('2018-11-09 00:00:00.000000000':VARCHAR))])
-          LogicalFilter(condition=[>($3, TIMESTAMP('2016-12-08 00:00:00.000000000':VARCHAR))])
-            CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_bank]])
+        LogicalFilter(condition=[AND(>($3, TIMESTAMP('2016-12-08 00:00:00.000000000':VARCHAR)), <($3, TIMESTAMP('2018-11-09 00:00:00.000000000':VARCHAR)))])
+          CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_bank]])
   physical: |
     CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_bank]], PushDownContext=[[PROJECT->[account_number, firstname, address, birthdate, gender, city, lastname, balance, employer, state, age, email, male], FILTER->SEARCH($3, Sarg[('2016-12-08 00:00:00':VARCHAR..'2018-11-09 00:00:00':VARCHAR)]:VARCHAR), LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","query":{"range":{"birthdate":{"from":"2016-12-08T00:00:00.000Z","to":"2018-11-09T00:00:00.000Z","include_lower":false,"include_upper":false,"format":"date_time","boost":1.0}}},"_source":{"includes":["account_number","firstname","address","birthdate","gender","city","lastname","balance","employer","state","age","email","male"],"excludes":[]}}, requestedTotalSize=10000, pageSize=null, startFrom=0)])
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMultisearchTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMultisearchTest.java
@@ -183,29 +183,26 @@ public void testMultisearchWithStats() {
             + "  LogicalAggregate(group=[{0}], count=[COUNT()])\n"
             + "    LogicalProject(type=[$8])\n"
             + "      LogicalUnion(all=[true])\n"
-            + "        LogicalFilter(condition=[=($7, 10)])\n"
-            + "          LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
+            + "        LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
             + " SAL=[$5], COMM=[$6], DEPTNO=[$7], type=['accounting':VARCHAR])\n"
+            + "          LogicalFilter(condition=[=($7, 10)])\n"
             + "            LogicalTableScan(table=[[scott, EMP]])\n"
-            + "        LogicalFilter(condition=[=($7, 20)])\n"
-            + "          LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
+            + "        LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
             + " SAL=[$5], COMM=[$6], DEPTNO=[$7], type=['research':VARCHAR])\n"
+            + "          LogicalFilter(condition=[=($7, 20)])\n"
             + "            LogicalTableScan(table=[[scott, EMP]])\n";
     verifyLogical(root, expectedLogical);
 
-    // SparkSQL reflects Filter above Project due to flush logic
     String expectedSparkSql =
         "SELECT COUNT(*) `count`, `type`\n"
-            + "FROM (SELECT *\n"
             + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`,"
             + " 'accounting' `type`\n"
-            + "FROM `scott`.`EMP`) `t`\n"
+            + "FROM `scott`.`EMP`\n"
             + "WHERE `DEPTNO` = 10\n"
             + "UNION ALL\n"
-            + "SELECT *\n"
-            + "FROM (SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`,"
+            + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`,"
             + " 'research' `type`\n"
-            + "FROM `scott`.`EMP`) `t1`\n"
+            + "FROM `scott`.`EMP`\n"
             + "WHERE `DEPTNO` = 20) `t3`\n"
             + "GROUP BY `type`";
     verifyPPLToSparkSQL(root, expectedSparkSql);