[GLUTEN-12013][VL] Fix bloom-filter bytes corruption on whole-stage AQE fallback

brijrajk · brijrajk · commit 9d096a3dd9b4 · 2026-06-26T09:24:19.000+05:30
`BloomFilterMightContainJointRewriteRule` previously rewrote every
`BloomFilterAggregate` it encountered, including standalone usages such
as `DataFrame.stat.bloomFilter()`.  That API collects the aggregate
output bytes and passes them directly to `BloomFilter.readFrom()`, which
expects Spark-native format; receiving Velox-format bytes caused
`java.io.IOException: Unexpected Bloom filter version number` (surfaced
as a CI failure in `GlutenDataFrameStatSuite - Bloom filter`).

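Fix: run the rule as an optimizer rule (`Rule[LogicalPlan]`, registered via
`injectOptimizerRule`) instead of a pre-transform `Rule[SparkPlan]`, and only
rewrite `BloomFilterAggregate` when it appears inside the `ScalarSubquery` of
a `BloomFilterMightContain`.  Standalone aggregates are left untouched so that
collected bytes remain in Spark-native format.  The stack-trace-based
`CallerInfo.isBloomFilterStatFunction` check is no longer needed and is removed.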

Add a regression suite (`GlutenBloomFilterFallbackSuite`) covering the
whole-stage fallback paths, `DataFrame.stat.bloomFilter()`, and the path where
the native bloom filter is disabled.

Local test results (Spark 4.0, Velox backend):
- GlutenDataFrameStatSuite            : 25/25 passed (was failing)
- GlutenBloomFilterFallbackSuite      :  4/4  passed
- GlutenBloomFilterAggregateQuerySuite: 14/14 passed
- GlutenInjectRuntimeFilterSuite      : 13/13 passed
diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala
@@ -53,6 +53,7 @@ object VeloxRuleApi {
   private def injectSpark(injector: SparkInjector): Unit = {
     // Inject the regular Spark rules directly.
     injector.injectOptimizerRule(CollectRewriteRule.apply)
+    injector.injectOptimizerRule(BloomFilterMightContainJointRewriteRule.apply)
     injector.injectOptimizerRule(HLLRewriteRule.apply)
     injector.injectOptimizerRule(CollapseGetJsonObjectExpressionRule.apply)
     injector.injectOptimizerRule(RewriteCastFromArray.apply)
@@ -81,11 +82,6 @@ object VeloxRuleApi {
     injector.injectPreTransform(c => FallbackMultiCodegens.apply(c.session))
     injector.injectPreTransform(c => MergeTwoPhasesHashBaseAggregate(c.session))
     injector.injectPreTransform(_ => RewriteSubqueryBroadcast())
-    injector.injectPreTransform(
-      c =>
-        BloomFilterMightContainJointRewriteRule.apply(
-          c.session,
-          c.caller.isBloomFilterStatFunction()))
     injector.injectPreTransform(_ => EliminateRedundantGetTimestamp)
 
     // Legacy: The legacy transform rule.
diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/BloomFilterMightContainJointRewriteRule.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/BloomFilterMightContainJointRewriteRule.scala
@@ -21,63 +21,47 @@ import org.apache.gluten.expression.VeloxBloomFilterMightContain
 import org.apache.gluten.expression.aggregate.VeloxBloomFilterAggregate
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, BloomFilterMightContain, Expression}
-import org.apache.spark.sql.catalyst.expressions.aggregate.{BloomFilterAggregate, TypedImperativeAggregate}
+import org.apache.spark.sql.catalyst.expressions.{BloomFilterMightContain, ScalarSubquery}
+import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, BloomFilterAggregate}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.execution.SparkPlan
 
-case class BloomFilterMightContainJointRewriteRule(
-    spark: SparkSession,
-    isBloomFilterStatFunction: Boolean)
-  extends Rule[SparkPlan] {
-  override def apply(plan: SparkPlan): SparkPlan = {
-    if (isBloomFilterStatFunction || !GlutenConfig.get.enableNativeBloomFilter) {
+/**
+ * Optimizer rule that rewrites `BloomFilterMightContain` -> `VeloxBloomFilterMightContain` and,
+ * inside its [[ScalarSubquery]] argument, `BloomFilterAggregate` -> `VeloxBloomFilterAggregate`.
+ *
+ * Running as an optimizer rule ensures the substitution is captured in the `originalPlan` snapshot
+ * that [[org.apache.gluten.extension.columnar.heuristic.ExpandFallbackPolicy]] uses when promoting
+ * an individual stage fallback to a whole-stage AQE fallback, so a subquery-fed bloom-filter pair
+ * produces and consumes the same byte format even if stages fall back to JVM execution.
+ *
+ * Standalone `BloomFilterAggregate` usages (in particular `DataFrame.stat.bloomFilter()`, which
+ * passes the collected bytes to `BloomFilter.readFrom()`) are left untouched so the bytes remain
+ * in Spark-native format. NOTE(review): a `BloomFilterMightContain` whose filter argument is not
+ * a direct [[ScalarSubquery]] is still rewritten without touching its producer -- confirm that
+ * such inputs can never carry Spark-format bytes.
+ */
+case class BloomFilterMightContainJointRewriteRule(spark: SparkSession)
+  extends Rule[LogicalPlan] {
+
+  override def apply(plan: LogicalPlan): LogicalPlan = {
+    if (!GlutenConfig.get.enableNativeBloomFilter) {
       return plan
     }
-    val out = plan.transformWithSubqueries {
-      case p =>
-        applyForNode(p)
-    }
-    out
-  }
-
-  private def replaceBloomFilterAggregate[T](
-      expr: Expression,
-      bloomFilterAggReplacer: (
-          Expression,
-          Expression,
-          Expression,
-          Int,
-          Int) => TypedImperativeAggregate[T]): Expression = expr match {
-    case BloomFilterAggregate(
-          child,
-          estimatedNumItemsExpression,
-          numBitsExpression,
-          mutableAggBufferOffset,
-          inputAggBufferOffset) =>
-      bloomFilterAggReplacer(
-        child,
-        estimatedNumItemsExpression,
-        numBitsExpression,
-        mutableAggBufferOffset,
-        inputAggBufferOffset)
-    case other => other
-  }
-
-  private def replaceMightContain[T](
-      expr: Expression,
-      mightContainReplacer: (Expression, Expression) => BinaryExpression): Expression = expr match {
-    case BloomFilterMightContain(bloomFilterExpression, valueExpression) =>
-      mightContainReplacer(bloomFilterExpression, valueExpression)
-    case other => other
-  }
-
-  private def applyForNode(p: SparkPlan) = {
-    p.transformExpressions {
-      case e =>
-        replaceMightContain(
-          replaceBloomFilterAggregate(e, VeloxBloomFilterAggregate.apply),
-          VeloxBloomFilterMightContain.apply)
+    plan.transformAllExpressions {
+      case BloomFilterMightContain(subq: ScalarSubquery, v) =>
+        val rewrittenPlan = subq.plan.transformAllExpressions {
+          case ae @ AggregateExpression(b: BloomFilterAggregate, _, _, _, _) =>
+            ae.copy(aggregateFunction = VeloxBloomFilterAggregate(
+              b.child,
+              b.estimatedNumItemsExpression,
+              b.numBitsExpression,
+              b.mutableAggBufferOffset,
+              b.inputAggBufferOffset))
+        }
+        VeloxBloomFilterMightContain(subq.withNewPlan(rewrittenPlan), v)
+      case BloomFilterMightContain(bf, v) =>
+        VeloxBloomFilterMightContain(bf, v)
     }
   }
 }
diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/caller/CallerInfo.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/caller/CallerInfo.scala
@@ -30,7 +30,6 @@ trait CallerInfo {
   def isAqe(): Boolean
   def isCache(): Boolean
   def isStreaming(): Boolean
-  def isBloomFilterStatFunction(): Boolean
 }
 
 object CallerInfo {
@@ -42,8 +41,7 @@ object CallerInfo {
   private class Impl(
       override val isAqe: Boolean,
       override val isCache: Boolean,
-      override val isStreaming: Boolean,
-      override val isBloomFilterStatFunction: Boolean
+      override val isStreaming: Boolean
   ) extends CallerInfo
 
   /*
@@ -57,8 +55,7 @@ object CallerInfo {
     new Impl(
       isAqe = inAqeCall(stack),
       isCache = inCacheCall(stack),
-      isStreaming = inStreamingCall(stack),
-      isBloomFilterStatFunction = inBloomFilterStatFunctionCall(stack))
+      isStreaming = inStreamingCall(stack))
   }
 
   private def inAqeCall(stack: Seq[StackTraceElement]): Boolean = {
@@ -78,21 +75,13 @@ object CallerInfo {
     stack.exists(_.getClassName.equals(streamName))
   }
 
-  private def inBloomFilterStatFunctionCall(stack: Seq[StackTraceElement]): Boolean = {
-    val res = stack.exists(
-      _.getClassName.equals("org.apache.spark.sql.DataFrameStatFunctions")
-        && stack.exists(_.getMethodName.equals("bloomFilter")))
-    res
-  }
-
   /** For testing only. */
   def withLocalValue[T](
       isAqe: Boolean,
       isCache: Boolean,
-      isStreaming: Boolean = false,
-      isBloomFilterStatFunction: Boolean = false)(body: => T): T = {
+      isStreaming: Boolean = false)(body: => T): T = {
     val prevValue = localStorage.get()
-    val newValue = new Impl(isAqe, isCache, isStreaming, isBloomFilterStatFunction)
+    val newValue = new Impl(isAqe, isCache, isStreaming)
     localStorage.set(Some(newValue))
     try {
       body
diff --git a/gluten-ut/test/src/test/scala/org/apache/gluten/sql/GlutenBloomFilterFallbackSuite.scala b/gluten-ut/test/src/test/scala/org/apache/gluten/sql/GlutenBloomFilterFallbackSuite.scala
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.sql
+
+import org.apache.gluten.backendsapi.BackendsApiManager
+import org.apache.gluten.config.GlutenConfig
+import org.apache.gluten.execution.WholeStageTransformerSuite
+
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.catalyst.expressions.BloomFilterMightContain
+import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
+import org.apache.spark.sql.catalyst.expressions.aggregate.BloomFilterAggregate
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * Regression tests for https://github.com/apache/gluten/issues/12013.
+ *
+ * Verifies that `BloomFilterMightContainJointRewriteRule`, registered as a `Rule[LogicalPlan]` via
+ * `injectOptimizerRule`, correctly handles whole-stage AQE fallback scenarios where one or both
+ * bloom-filter stages revert to vanilla Spark execution.
+ *
+ * Each test body is a no-op unless the backend setting
+ * `requireBloomFilterAggMightContainJointFallback()` is true.
+ */
+class GlutenBloomFilterFallbackSuite extends WholeStageTransformerSuite {
+  protected val resourcePath: String = null
+  protected val fileFormat: String = null
+
+  import testImplicits._
+
+  private val funcIdBloomFilterAgg = FunctionIdentifier("bloom_filter_agg")
+  private val funcIdMightContain = FunctionIdentifier("might_contain")
+
+  private val veloxBloomFilterMaxNumBits = 4194304L
+  private val numEstimatedItems = 5000000L
+
+  // Temp view shared by the might_contain tests.  The query builds a bloom filter over the
+  // view and probes it with the view's own rows, so every row is expected to be returned.
+  private val mightContainTable = "bloom_filter_test"
+  private val mightContainQuery =
+    s"""
+       |SELECT col positive_membership_test
+       |FROM $mightContainTable
+       |WHERE might_contain(
+       |            (SELECT bloom_filter_agg(col,
+       |              cast($numEstimatedItems as long),
+       |              cast($veloxBloomFilterMaxNumBits as long))
+       |             FROM $mightContainTable), col)
+       |""".stripMargin
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    spark.sessionState.functionRegistry.registerFunction(
+      funcIdBloomFilterAgg,
+      new ExpressionInfo(classOf[BloomFilterAggregate].getName, "bloom_filter_agg"),
+      args =>
+        args.size match {
+          case 1 => new BloomFilterAggregate(args(0))
+          case 2 => new BloomFilterAggregate(args(0), args(1))
+          case 3 => new BloomFilterAggregate(args(0), args(1), args(2))
+          case _ => throw new IllegalArgumentException("bloom_filter_agg requires 1-3 arguments")
+        }
+    )
+    spark.sessionState.functionRegistry.registerFunction(
+      funcIdMightContain,
+      new ExpressionInfo(classOf[BloomFilterMightContain].getName, "might_contain"),
+      args => BloomFilterMightContain(args(0), args(1)))
+  }
+
+  override def afterAll(): Unit = {
+    // Run the parent teardown even if un-registering a function throws.
+    try {
+      spark.sessionState.functionRegistry.dropFunction(funcIdBloomFilterAgg)
+      spark.sessionState.functionRegistry.dropFunction(funcIdMightContain)
+    } finally {
+      super.afterAll()
+    }
+  }
+
+  /**
+   * Creates the temp view, runs `mightContainQuery` under `confs`, checks that all 200003 rows
+   * are returned, then passes the DataFrame to `verifyPlan`.  Does nothing when the backend
+   * setting `requireBloomFilterAggMightContainJointFallback()` is false.
+   */
+  private def checkMightContainQuery(confs: (String, String)*)(
+      verifyPlan: DataFrame => Unit): Unit = {
+    if (BackendsApiManager.getSettings.requireBloomFilterAggMightContainJointFallback()) {
+      withTempView(mightContainTable) {
+        (Seq(Long.MinValue, 0, Long.MaxValue) ++ (1L to 200000L))
+          .toDF("col")
+          .createOrReplaceTempView(mightContainTable)
+        withSQLConf(confs: _*) {
+          val df = spark.sql(mightContainQuery)
+          // Must not throw: java.io.IOException: Unexpected Bloom filter version number.
+          assert(df.collect().length == 200003)
+          verifyPlan(df)
+        }
+      }
+    }
+  }
+
+  /** Asserts that the optimized plan of `df` contains the Velox might-contain expression. */
+  private def assertMightContainRewritten(df: DataFrame): Unit = {
+    assert(
+      df.queryExecution.optimizedPlan.toString.contains("velox_might_contain"),
+      "Expected velox_might_contain in optimized plan -- optimizer rule may not have run"
+    )
+  }
+
+  // GLUTEN-12013: only filter stage falls back (threshold=2).
+  // bloom_filter_agg subquery runs natively and produces Velox-format bytes; the filter stage
+  // falls back via ExpandFallbackPolicy.  The optimizer-level substitution ensures the fallback
+  // plan still uses VeloxBloomFilterMightContain so the JVM filter reads Velox-format bytes.
+  test("GLUTEN-12013: bloom_filter_agg whole-stage fallback does not corrupt bloom filter bytes") {
+    // Threshold=2: FilterExec fallback cost=2 triggers whole-stage fallback; agg cost=1
+    // does not, so Stage 0 runs natively.  ANSI off keeps agg cost at 1 on Spark 4.0+.
+    checkMightContainQuery(
+      GlutenConfig.COLUMNAR_FILTER_ENABLED.key -> "false",
+      GlutenConfig.COLUMNAR_WHOLESTAGE_FALLBACK_THRESHOLD.key -> "2",
+      SQLConf.ANSI_ENABLED.key -> "false"
+    )(assertMightContainRewritten)
+  }
+
+  // GLUTEN-12013: both stages fall back (threshold=1).
+  // Stage 0's inherent transition cost of 1 meets the threshold so ExpandFallbackPolicy
+  // promotes it to a whole-stage fallback too.  The optimizer rule has already rewritten both
+  // sides to Velox variants before ExpandFallbackPolicy captures its snapshot.  Even in JVM
+  // row-mode, VeloxBloomFilterAggregate produces Velox-format bytes (via JNI) and
+  // VeloxBloomFilterMightContain consumes them -- both sides are consistent.
+  test("GLUTEN-12013: bloom_filter_agg whole-stage fallback when both stages fall back") {
+    // Threshold=1: both stages fall back; both use Velox variants via JNI.
+    // Only the might_contain side of the rewrite is asserted on the optimized plan.
+    checkMightContainQuery(
+      GlutenConfig.COLUMNAR_FILTER_ENABLED.key -> "false",
+      GlutenConfig.COLUMNAR_WHOLESTAGE_FALLBACK_THRESHOLD.key -> "1",
+      SQLConf.ANSI_ENABLED.key -> "false"
+    )(assertMightContainRewritten)
+  }
+
+  // GLUTEN-12013: DataFrame.stat.bloomFilter() must not be affected by the optimizer rule.
+  // The rule must only rewrite BloomFilterAggregate inside a BloomFilterMightContain subquery.
+  // A standalone BloomFilterAggregate (as used here) must remain vanilla so that the collected
+  // bytes are in Spark-native format and BloomFilter.readFrom() succeeds.
+  test("GLUTEN-12013: DataFrame.stat.bloomFilter() produces Spark-readable bytes") {
+    if (BackendsApiManager.getSettings.requireBloomFilterAggMightContainJointFallback()) {
+      val table = "bloom_filter_stat_test"
+      withTempView(table) {
+        (1L to 1000L).toDF("col").createOrReplaceTempView(table)
+        // Must not throw: java.io.IOException: Unexpected Bloom filter version number
+        val bf = spark.table(table).stat.bloomFilter("col", 1000L, 0.01)
+        // Bloom filters have no false negatives: every inserted value must be present.
+        assert(bf.mightContainLong(500L), "Expected 500 to be in bloom filter")
+      }
+    }
+  }
+
+  // GLUTEN-12013: native bloom filter disabled -- early-exit path of the optimizer rule.
+  // When spark.gluten.sql.native.bloomFilter=false the rule returns the plan unchanged.
+  // BloomFilterAggregate / BloomFilterMightContain remain as vanilla Spark expressions and
+  // produce/consume consistent Spark-format bytes.
+  test(
+    "GLUTEN-12013: native bloom filter disabled skips rewrite and produces correct results") {
+    checkMightContainQuery(
+      GlutenConfig.COLUMNAR_NATIVE_BLOOMFILTER_ENABLED.key -> "false",
+      SQLConf.ANSI_ENABLED.key -> "false"
+    ) {
+      df =>
+        // Verify the rule early-exited: plan must NOT contain Velox variants.
+        assert(
+          !df.queryExecution.optimizedPlan.toString.contains("velox_might_contain"),
+          "Expected vanilla BloomFilterMightContain when native bloom filter is disabled"
+        )
+    }
+  }
+}