feat(phase-5b): Dynamic Partition Pruning support for Delta

schenksj · claude · schenksj · commit 6f66c4c3c630 · 2026-04-12T10:08:47.000-04:00
Delta scans with DPP now go through Comet's native path instead of
falling back to vanilla Spark.

Changes:
1. CometScanRule: moved Delta detection BEFORE the DPP fallback check
   so DPP-bearing Delta scans reach nativeDeltaScan instead of bailing
   out. Non-Delta scans still fall back for DPP as before.

2. prunePartitions: filters out every partition filter that contains a
   PlanExpression (in practice a DynamicPruningExpression wrapping
   InSubqueryExec) before building the InterpretedPredicate. These
   expressions aren't resolved at planning time; Spark applies them
   post-scan at runtime. Static partition filters are still evaluated
   for file-level pruning at planning time.

3. New test: "dynamic partition pruning through join" - star-schema
   join (fact partitioned by region + small dim table) that exercises
   DPP. Verifies CometDeltaNativeScanExec appears in the plan and
   results match vanilla row-for-row.

Results: Tests: succeeded 35, failed 0, canceled 0, ignored 0, pending 0

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -246,20 +246,22 @@ case class CometScanRule(session: SparkSession)
 
   private def transformV1Scan(plan: SparkPlan, scanExec: FileSourceScanExec): SparkPlan = {
 
-    if (COMET_DPP_FALLBACK_ENABLED.get() &&
-      scanExec.partitionFilters.exists(isDynamicPruningFilter)) {
-      return withInfo(scanExec, "Dynamic Partition Pruning is not supported")
-    }
-
     scanExec.relation match {
       case r: HadoopFsRelation =>
-        // Delta Lake (V1 path) — detect before the `isFileFormatSupported` check, which
-        // only accepts the exact `ParquetFileFormat` class and otherwise rejects Delta's
-        // `DeltaParquetFileFormat` subclass.
+        // Delta Lake (V1 path): detect BEFORE the DPP fallback check below,
+        // because Delta's native path handles DPP through partition pruning
+        // at execution time (DPP expressions are filtered out of the
+        // planning-time InterpretedPredicate and applied by Spark post-scan).
         if (DeltaReflection.isDeltaFileFormat(r.fileFormat)) {
           return nativeDeltaScan(session, scanExec, r, hadoopConfOrNull = null)
             .getOrElse(scanExec)
         }
+        // DPP fallback for non-Delta scans (DataFusion/Iceberg-compat paths
+        // don't support DPP natively).
+        if (COMET_DPP_FALLBACK_ENABLED.get() &&
+          scanExec.partitionFilters.exists(isDynamicPruningFilter)) {
+          return withInfo(scanExec, "Dynamic Partition Pruning is not supported")
+        }
         if (!CometScanExec.isFileFormatSupported(r.fileFormat)) {
           return withInfo(scanExec, s"Unsupported file format ${r.fileFormat}")
         }
diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/CometDeltaNativeScan.scala b/spark/src/main/scala/org/apache/comet/serde/operator/CometDeltaNativeScan.scala
@@ -320,12 +320,20 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometScanExec] with Loggi
       partitionSchema: StructType): Seq[OperatorOuterClass.DeltaScanTask] = {
     if (scan.partitionFilters.isEmpty || partitionSchema.isEmpty) return tasks
 
+    // Phase 5b: drop every filter that contains a PlanExpression (subquery-backed,
+    // e.g. DynamicPruningExpression wrapping InSubqueryExec) because it isn't
+    // resolved at planning time. Spark applies them post-scan at runtime. Static
+    // partition filters are still evaluated here for file-level pruning.
+    val staticFilters = scan.partitionFilters.filterNot(
+      _.exists(_.isInstanceOf[org.apache.spark.sql.catalyst.expressions.PlanExpression[_]]))
+    if (staticFilters.isEmpty) return tasks
+
     // Build an `InterpretedPredicate` that expects a row whose schema matches
     // `partitionSchema`. Rewrite attribute references to `BoundReference`s keyed by
     // partition-schema column name so it can evaluate against a row we assemble below.
     val partitionAttrsByName =
-      scan.partitionFilters.flatMap(_.references).groupBy(_.name.toLowerCase(Locale.ROOT))
-    val combined = scan.partitionFilters.reduce(And)
+      staticFilters.flatMap(_.references).groupBy(_.name.toLowerCase(Locale.ROOT))
+    val combined = staticFilters.reduce(And)
     val bound = combined.transform {
       case a: org.apache.spark.sql.catalyst.expressions.AttributeReference =>
         val idx = partitionSchema.fieldIndex(a.name)
diff --git a/spark/src/test/scala/org/apache/comet/CometDeltaNativeSuite.scala b/spark/src/test/scala/org/apache/comet/CometDeltaNativeSuite.scala
@@ -1137,6 +1137,63 @@ class CometDeltaNativeSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("dynamic partition pruning through join") {
+    assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping")
+    withDeltaTable("dpp") { tablePath =>
+      val ss = spark
+      import ss.implicits._
+
+      val factPath = new java.io.File(tablePath, "fact").getAbsolutePath
+      val dimPath = new java.io.File(tablePath, "dim").getAbsolutePath
+
+      // Fact table: 100 rows partitioned by region; ids 0, 3, ..., 99 (34 rows) fall in 'us'.
+      (0 until 100)
+        .map(i => (i.toLong, s"item_$i", Seq("us", "eu", "ap")(i % 3)))
+        .toDF("id", "item", "region")
+        .write
+        .partitionBy("region")
+        .format("delta")
+        .save(factPath)
+
+      // Dimension table: covers every region, so only the WHERE clause narrows the join.
+      Seq(("us", "United States"), ("eu", "Europe"), ("ap", "Asia Pacific"))
+        .toDF("region", "region_name")
+        .write
+        .format("delta")
+        .save(dimPath)
+
+      // Star-schema join; the selective dim-side predicate is what lets Spark plan DPP.
+      val query = s"""
+        SELECT f.id, f.item, d.region_name
+        FROM delta.`$factPath` f
+        JOIN delta.`$dimPath` d ON f.region = d.region
+        WHERE d.region_name = 'United States'
+      """
+      val df = spark.sql(query)
+      assert(
+        df.queryExecution.optimizedPlan.exists(_.expressions.exists(_.exists(
+          _.isInstanceOf[org.apache.spark.sql.catalyst.expressions.DynamicPruningSubquery]))),
+        "optimizer did not insert a dynamic pruning filter; the test would not exercise DPP")
+
+      // The fact scan should be a native Delta scan (DPP doesn't prevent it).
+      val plan = df.queryExecution.executedPlan
+      val scans = collect(plan) { case s: CometDeltaNativeScanExec => s }
+      assert(scans.nonEmpty, s"expected CometDeltaNativeScanExec in DPP plan, got:\n$plan")
+      // Correctness: exactly the 34 'us' rows must come back.
+      val rows = df.collect()
+      assert(
+        rows.forall(_.getString(2) == "United States"),
+        "expected all rows to have region_name='United States'")
+      assert(rows.length == 34, s"expected 34 rows, got ${rows.length}")
+      // Compare with vanilla Spark; sorted rows (not a Set) so duplicate rows would show up.
+      withSQLConf(CometConf.COMET_DELTA_NATIVE_ENABLED.key -> "false") {
+        val vanilla = spark.sql(query).collect()
+        def sorted(rs: Array[org.apache.spark.sql.Row]) = rs.map(_.toString).sorted.toSeq
+        assert(sorted(rows) == sorted(vanilla), "DPP result differs from vanilla")
+      }
+    }
+  }
+
   test("wider primitive type coverage") {
     assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping")
     withDeltaTable("primitives") { tablePath =>