[GLUTEN-12013][VL] Fix bloom-filter bytes corruption on whole-stage AQE fallback

brijrajk · brijrajk · commit cac891f0533e · 2026-06-19T23:10:37.000+05:30
Register BloomFilterMightContainJointRewriteRule as a Rule[LogicalPlan]
via injectOptimizerRule so that both BloomFilterAggregate ->
VeloxBloomFilterAggregate and BloomFilterMightContain ->
VeloxBloomFilterMightContain substitutions are baked into the
originalPlan snapshot before ExpandFallbackPolicy takes it.

This ensures that when a stage falls back via whole-stage AQE fallback,
the fallback plan already uses the Velox variants on both sides of the
bloom-filter pair, so the byte format is always consistent regardless
of which stages fall back and in what order.

This also fixes the case (threshold=1) where Stage 0 itself falls back:
the previous FallbackPatcher approach would incorrectly rewrite
BloomFilterMightContain -> VeloxBloomFilterMightContain even when Stage 0
produced Spark-format bytes, causing a version-mismatch IOException.
With the optimizer rule, both sides are always rewritten together or
not at all (when enableNativeBloomFilter=false).

Add regression tests covering:
- threshold=2: only the filter stage falls back, agg stage runs natively
- threshold=1: both stages fall back, agg stage produces Spark-format bytes
diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala
@@ -53,6 +53,7 @@ object VeloxRuleApi {
   private def injectSpark(injector: SparkInjector): Unit = {
     // Inject the regular Spark rules directly.
     injector.injectOptimizerRule(CollectRewriteRule.apply)
+    injector.injectOptimizerRule(BloomFilterMightContainJointRewriteRule.apply)
     injector.injectOptimizerRule(HLLRewriteRule.apply)
     injector.injectOptimizerRule(CollapseGetJsonObjectExpressionRule.apply)
     injector.injectOptimizerRule(RewriteCastFromArray.apply)
@@ -81,11 +82,6 @@ object VeloxRuleApi {
     injector.injectPreTransform(c => FallbackMultiCodegens.apply(c.session))
     injector.injectPreTransform(c => MergeTwoPhasesHashBaseAggregate(c.session))
     injector.injectPreTransform(_ => RewriteSubqueryBroadcast())
-    injector.injectPreTransform(
-      c =>
-        BloomFilterMightContainJointRewriteRule.apply(
-          c.session,
-          c.caller.isBloomFilterStatFunction()))
     injector.injectPreTransform(_ => EliminateRedundantGetTimestamp)
 
     // Legacy: The legacy transform rule.
diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/BloomFilterMightContainJointRewriteRule.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/BloomFilterMightContainJointRewriteRule.scala
@@ -21,63 +21,38 @@ import org.apache.gluten.expression.VeloxBloomFilterMightContain
 import org.apache.gluten.expression.aggregate.VeloxBloomFilterAggregate
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, BloomFilterMightContain, Expression}
-import org.apache.spark.sql.catalyst.expressions.aggregate.{BloomFilterAggregate, TypedImperativeAggregate}
+import org.apache.spark.sql.catalyst.expressions.BloomFilterMightContain
+import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, BloomFilterAggregate}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.execution.SparkPlan
 
-case class BloomFilterMightContainJointRewriteRule(
-    spark: SparkSession,
-    isBloomFilterStatFunction: Boolean)
-  extends Rule[SparkPlan] {
-  override def apply(plan: SparkPlan): SparkPlan = {
-    if (isBloomFilterStatFunction || !GlutenConfig.get.enableNativeBloomFilter) {
+/**
+ * Optimizer rule that rewrites `BloomFilterAggregate` -> `VeloxBloomFilterAggregate` and
+ * `BloomFilterMightContain` -> `VeloxBloomFilterMightContain` at the logical plan level.
+ *
+ * Running as an optimizer rule ensures the substitution is captured in the `originalPlan` snapshot
+ * that [[org.apache.gluten.extension.columnar.heuristic.ExpandFallbackPolicy]] uses when promoting
+ * an individual stage fallback to a whole-stage AQE fallback. This guarantees that both sides of
+ * the bloom-filter pair always produce and consume the same byte format, regardless of whether
+ * stages fall back to JVM execution after AQE re-planning.
+ */
+case class BloomFilterMightContainJointRewriteRule(spark: SparkSession)
+  extends Rule[LogicalPlan] {
+
+  override def apply(plan: LogicalPlan): LogicalPlan = {
+    if (!GlutenConfig.get.enableNativeBloomFilter) {
       return plan
     }
-    val out = plan.transformWithSubqueries {
-      case p =>
-        applyForNode(p)
-    }
-    out
-  }
-
-  private def replaceBloomFilterAggregate[T](
-      expr: Expression,
-      bloomFilterAggReplacer: (
-          Expression,
-          Expression,
-          Expression,
-          Int,
-          Int) => TypedImperativeAggregate[T]): Expression = expr match {
-    case BloomFilterAggregate(
-          child,
-          estimatedNumItemsExpression,
-          numBitsExpression,
-          mutableAggBufferOffset,
-          inputAggBufferOffset) =>
-      bloomFilterAggReplacer(
-        child,
-        estimatedNumItemsExpression,
-        numBitsExpression,
-        mutableAggBufferOffset,
-        inputAggBufferOffset)
-    case other => other
-  }
-
-  private def replaceMightContain[T](
-      expr: Expression,
-      mightContainReplacer: (Expression, Expression) => BinaryExpression): Expression = expr match {
-    case BloomFilterMightContain(bloomFilterExpression, valueExpression) =>
-      mightContainReplacer(bloomFilterExpression, valueExpression)
-    case other => other
-  }
-
-  private def applyForNode(p: SparkPlan) = {
-    p.transformExpressions {
-      case e =>
-        replaceMightContain(
-          replaceBloomFilterAggregate(e, VeloxBloomFilterAggregate.apply),
-          VeloxBloomFilterMightContain.apply)
+    plan.transformAllExpressions {
+      case aggExpr @ AggregateExpression(b: BloomFilterAggregate, _, _, _, _) =>
+        aggExpr.copy(aggregateFunction = VeloxBloomFilterAggregate(
+          b.child,
+          b.estimatedNumItemsExpression,
+          b.numBitsExpression,
+          b.mutableAggBufferOffset,
+          b.inputAggBufferOffset))
+      case BloomFilterMightContain(bf, v) =>
+        VeloxBloomFilterMightContain(bf, v)
     }
   }
 }
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenBloomFilterAggregateQuerySuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenBloomFilterAggregateQuerySuite.scala
@@ -24,6 +24,16 @@ import org.apache.spark.SparkConf
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.internal.SQLConf
 
+import org.scalatest.Tag
+
+/**
+ * ScalaTest tag shared by the issue-12013 regression tests. Run with:
+ * {{{
+ *  --test-tags=org.apache.gluten.tags.Issue12013
+ * }}}
+ */
+object Issue12013 extends Tag("org.apache.gluten.tags.Issue12013")
+
 class GlutenBloomFilterAggregateQuerySuite
   extends BloomFilterAggregateQuerySuite
   with GlutenSQLTestsTrait
@@ -112,6 +122,109 @@ class GlutenBloomFilterAggregateQuerySuite
     }
   }
 
+  // Regression test for https://github.com/apache/gluten/issues/12013
+  // When ExpandFallbackPolicy triggers a whole-stage AQE fallback, the resulting plan comes
+  // from the originalPlan snapshot. If that snapshot still held the vanilla
+  // BloomFilterMightContain while Stage 0 (bloom_filter_agg subquery) had already run natively
+  // and produced Velox-format bytes, BloomFilterImpl.readFrom() could not deserialize them.
+  // BloomFilterMightContainJointRewriteRule now runs as an optimizer rule, so the snapshot
+  // already uses VeloxBloomFilterMightContain and Stage 1 can read Velox bytes after fallback.
+  testGluten(
+    "Test bloom_filter_agg whole-stage fallback does not corrupt bloom filter bytes",
+    Issue12013) {
+    val table = "bloom_filter_test"
+    val numEstimatedItems = 5000000L
+    val sqlString =
+      s"""
+         |SELECT col positive_membership_test
+         |FROM $table
+         |WHERE might_contain(
+         |            (SELECT bloom_filter_agg(col,
+         |              cast($numEstimatedItems as long),
+         |              cast($veloxBloomFilterMaxNumBits as long))
+         |             FROM $table), col)
+         |""".stripMargin
+
+    withTempView(table) {
+      (Seq(Long.MinValue, 0, Long.MaxValue) ++ (1L to 200000L))
+        .toDF("col")
+        .createOrReplaceTempView(table)
+      if (BackendsApiManager.getSettings.requireBloomFilterAggMightContainJointFallback()) {
+        // Disable columnar filter so FilterExec falls back, and set the whole-stage fallback
+        // threshold so ExpandFallbackPolicy promotes the individual fallback to whole-stage.
+        // This reproduces the scenario where the filter stage falls back to the original
+        // vanilla plan while the bloom_filter_agg subquery has already produced Velox-format
+        // bloom filter bytes.
+        //
+        // Threshold=2: a fallen-back FilterExec introduces two ColumnarToRow/RowToColumnar
+        // transitions (net transition cost=2), which meets the threshold and triggers the
+        // whole-stage AQE fallback.  The bloom_filter_agg subquery stages have an inherent
+        // transition cost of 1, so they do NOT trigger the fallback and run natively.
+        //
+        // ANSI mode must be off: Spark 4.0 enables ANSI by default, which causes
+        // ObjectHashAggregateExec to fail Gluten validation ("does not support ansi mode"),
+        // raising the agg-stage transition cost above 1.  With ANSI off the agg-stage cost
+        // stays at 1 (< threshold 2), so only the filter stage falls back as intended.
+        withSQLConf(
+          GlutenConfig.COLUMNAR_FILTER_ENABLED.key -> "false",
+          GlutenConfig.COLUMNAR_WHOLESTAGE_FALLBACK_THRESHOLD.key -> "2",
+          SQLConf.ANSI_ENABLED.key -> "false"
+        ) {
+          val df = spark.sql(sqlString)
+          // Must not throw java.io.IOException: Unexpected Bloom filter version number (16777217).
+          // All 200003 rows match the bloom filter built from the same data.
+          assert(df.collect().length == 200003L)
+        }
+      }
+    }
+  }
+
+  // Covers the case where Stage 0 (bloom_filter_agg subquery) itself falls back via
+  // ExpandFallbackPolicy (not just when native bloom filter is disabled via config).  With
+  // threshold=1 the subquery stage's inherent transition cost of 1 meets the threshold, so
+  // Stage 0 is promoted to a whole-stage fallback too.  Because the optimizer rule rewrites
+  // both sides of the bloom-filter pair together, the stage that builds the filter and the
+  // stage that probes it agree on the byte format and no version-mismatch IOException occurs.
+  testGluten(
+    "Test bloom_filter_agg whole-stage fallback when both stages fall back",
+    Issue12013) {
+    val table = "bloom_filter_test"
+    val numEstimatedItems = 5000000L
+    val sqlString =
+      s"""
+         |SELECT col positive_membership_test
+         |FROM $table
+         |WHERE might_contain(
+         |            (SELECT bloom_filter_agg(col,
+         |              cast($numEstimatedItems as long),
+         |              cast($veloxBloomFilterMaxNumBits as long))
+         |             FROM $table), col)
+         |""".stripMargin
+
+    withTempView(table) {
+      (Seq(Long.MinValue, 0, Long.MaxValue) ++ (1L to 200000L))
+        .toDF("col")
+        .createOrReplaceTempView(table)
+      if (BackendsApiManager.getSettings.requireBloomFilterAggMightContainJointFallback()) {
+        // threshold=1: Stage 0's inherent transition cost of 1 meets the threshold, so
+        // ExpandFallbackPolicy promotes Stage 0 to a whole-stage fallback as well.
+        // Stage 1 also falls back (COLUMNAR_FILTER_ENABLED=false, cost >= 1).  Both sides of
+        // the bloom-filter pair were rewritten together by the optimizer rule (or not at all),
+        // so the aggregate that builds the filter and the might_contain that reads it agree on
+        // the byte format even though both stages have fallen back.
+        withSQLConf(
+          GlutenConfig.COLUMNAR_FILTER_ENABLED.key -> "false",
+          GlutenConfig.COLUMNAR_WHOLESTAGE_FALLBACK_THRESHOLD.key -> "1",
+          SQLConf.ANSI_ENABLED.key -> "false"
+        ) {
+          val df = spark.sql(sqlString)
+          // Must not throw java.io.IOException: Unexpected Bloom filter version number.
+          assert(df.collect().length == 200003L)
+        }
+      }
+    }
+  }
+
   testGluten("Test bloom_filter_agg agg fallback") {
     val table = "bloom_filter_test"
     val numEstimatedItems = 5000000L