fix: address code review findings (critical + major)

schenksj · claude · schenksj · commit a35bbbebb6d8 · 2026-04-12T12:25:16.000-04:00
Fixes from comprehensive code review:

Critical:
- planner.rs: replaced .expect() with .ok_or_else() for DeltaScan
  task lookup (prevents panic on edge case)
- planner.rs: replaced .unwrap() with enumerate-based index for
  column mapping rename projection (prevents panic on schema mismatch)

Major:
- CometDeltaNativeScan.scala: removed unused partitionAttrsByName
  variable; added case-sensitive-aware partition column lookup using
  SQLConf.CASE_SENSITIVE (was using case-sensitive-only fieldIndex)
- CometDeltaNativeScan.scala: added safe fallback (return unpruned
  tasks) if partition column index can't be resolved

Minor:
- delta_dv_filter.rs: converted debug_assert to proper error return
  for DV index out-of-order condition (was silent in release builds)
- predicate.rs: removed #[allow(dead_code)] annotation on public API
  function catalyst_to_kernel_predicate

Tests: succeeded 35, failed 0, canceled 0, ignored 0, pending 0

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/native/core/src/delta/predicate.rs b/native/core/src/delta/predicate.rs
@@ -33,8 +33,8 @@ pub fn catalyst_to_kernel_predicate_with_names(
     translate_predicate(expr, column_names)
 }
 
-/// Try to translate a Catalyst-proto `Expr` into a kernel `Predicate`.
-#[allow(dead_code)]
+/// Try to translate a Catalyst-proto `Expr` into a kernel `Predicate`
+/// (without column name resolution — BoundReferences become Unknown).
 pub fn catalyst_to_kernel_predicate(expr: &Expr) -> Predicate {
     translate_predicate(expr, &[])
 }
diff --git a/native/core/src/execution/operators/delta_dv_filter.rs b/native/core/src/execution/operators/delta_dv_filter.rs
@@ -252,12 +252,11 @@ impl DeltaDvFilterStream {
             if d >= batch_end {
                 break;
             }
-            // Invariant: d >= batch_start (otherwise it would have been
-            // consumed by a previous batch). Assert defensively.
-            debug_assert!(
-                d >= batch_start,
-                "deletion vector index {d} predates batch start {batch_start}"
-            );
+            if d < batch_start {
+                return Err(DataFusionError::Internal(format!(
+                    "DV index {d} predates batch start {batch_start}"
+                )));
+            }
             let local = (d - batch_start) as usize;
             if local < mask_buf.len() && mask_buf[local] {
                 mask_buf[local] = false;
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -1459,7 +1459,7 @@ impl PhysicalPlanner {
                     .tasks
                     .first()
                     .map(|t| t.file_path.clone())
-                    .expect("at least one task after empty check");
+                    .ok_or_else(|| GeneralError("DeltaScan has no tasks".into()))?;
                 let (object_store_url, _) = prepare_object_store_with_configs(
                     self.session_ctx.runtime_env(),
                     one_file,
@@ -1526,19 +1526,21 @@ impl PhysicalPlanner {
                         .map(|(l, p)| (p.clone(), l.clone()))
                         .collect();
                     let input_schema = final_exec.schema();
-                    let rename_exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = input_schema
+                    let rename_exprs: Result<Vec<(Arc<dyn PhysicalExpr>, String)>, ExecutionError> = input_schema
                         .fields()
                         .iter()
-                        .map(|f| {
+                        .enumerate()
+                        .map(|(idx, f)| {
                             let col: Arc<dyn PhysicalExpr> =
-                                Arc::new(Column::new(f.name(), input_schema.index_of(f.name()).unwrap()));
+                                Arc::new(Column::new(f.name(), idx));
                             let logical = physical_to_logical
                                 .get(f.name())
                                 .cloned()
                                 .unwrap_or_else(|| f.name().clone());
-                            (col, logical)
+                            Ok((col, logical))
                         })
                         .collect();
+                    let rename_exprs = rename_exprs?;
                     Arc::new(ProjectionExec::try_new(rename_exprs, final_exec)?) as Arc<dyn ExecutionPlan>
                 } else {
                     final_exec
diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/CometDeltaNativeScan.scala b/spark/src/main/scala/org/apache/comet/serde/operator/CometDeltaNativeScan.scala
@@ -330,16 +330,20 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometScanExec] with Loggi
 
     // Build an `InterpretedPredicate` that expects a row whose schema matches
     // `partitionSchema`. Rewrite attribute references to `BoundReference`s keyed by
-    // partition-schema column name so it can evaluate against a row we assemble below.
-    val partitionAttrsByName =
-      staticFilters.flatMap(_.references).groupBy(_.name.toLowerCase(Locale.ROOT))
+    // partition-schema field index, respecting case sensitivity.
+    val caseSensitive = scan.conf.getConf[Boolean](SQLConf.CASE_SENSITIVE)
     val combined = staticFilters.reduce(And)
     val bound = combined.transform {
       case a: org.apache.spark.sql.catalyst.expressions.AttributeReference =>
-        val idx = partitionSchema.fieldIndex(a.name)
+        val idx = if (caseSensitive) {
+          partitionSchema.fields.indexWhere(_.name == a.name) // -1 on miss; fieldIndex throws
+        } else {
+          partitionSchema.fields.indexWhere(
+            _.name.toLowerCase(Locale.ROOT) == a.name.toLowerCase(Locale.ROOT))
+        }
+        if (idx < 0) return tasks // Can't resolve; skip pruning
         BoundReference(idx, partitionSchema(idx).dataType, partitionSchema(idx).nullable)
     }
-    val _ = partitionAttrsByName
     val predicate = InterpretedPredicate(bound)
     predicate.initialize(0)
 
diff --git a/spark/src/test/scala/org/apache/comet/CometDeltaBenchmarkTest.scala b/spark/src/test/scala/org/apache/comet/CometDeltaBenchmarkTest.scala
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet
+
+import java.nio.file.Files
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.CometTestBase
+
+/**
+ * Quick benchmark comparing vanilla Spark+Delta vs Comet+Delta-kernel; reports timings only.
+ *
+ * Run with: export SPARK_LOCAL_IP=127.0.0.1 && ./mvnw -Pspark-3.5 -pl spark -am test \
+ * -Dsuites=org.apache.comet.CometDeltaBenchmarkTest -Dmaven.gitcommitid.skip
+ */
+class CometDeltaBenchmarkTest extends CometTestBase {
+
+  private def deltaSparkAvailable: Boolean =
+    try {
+      Class.forName("org.apache.spark.sql.delta.DeltaParquetFileFormat")
+      true
+    } catch {
+      case _: ClassNotFoundException => false
+    }
+
+  override protected def sparkConf: SparkConf = {
+    val conf = super.sparkConf
+    conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+    conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+    conf.set("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")
+    conf.set("spark.databricks.delta.testOnly.dataFileNamePrefix", "")
+    conf.set("spark.databricks.delta.testOnly.dvFileNamePrefix", "")
+    conf
+  }
+
+  test("benchmark: SUM aggregation - vanilla vs Comet native Delta") {
+    assume(deltaSparkAvailable, "delta-spark not on the test classpath")
+
+    val tempDir = Files.createTempDirectory("comet-delta-bench").toFile
+    try {
+      val tablePath = new java.io.File(tempDir, "bench").getAbsolutePath
+      val numRows = 5 * 1000 * 1000 // 5M rows
+      val numFiles = 4
+
+      // scalastyle:off println
+      println(s"\n=== Comet Delta Benchmark: $numRows rows, $numFiles files ===\n")
+      // scalastyle:on println
+
+      // Generate data
+      val ss = spark
+      import ss.implicits._
+      val df =
+        (0 until numRows).map(i => (i.toLong, i * 1.5, s"name_$i")).toDF("id", "score", "name")
+      df.repartition(numFiles).write.format("delta").save(tablePath)
+
+      val warmupIters = 2
+      val benchIters = 5
+
+      // Vanilla Spark+Delta
+      val vanillaTimes = new scala.collection.mutable.ArrayBuffer[Long]()
+      withSQLConf(
+        CometConf.COMET_ENABLED.key -> "false",
+        CometConf.COMET_EXEC_ENABLED.key -> "false") {
+        for (i <- 0 until (warmupIters + benchIters)) {
+          val start = System.nanoTime()
+          spark.sql(s"SELECT SUM(id), SUM(score) FROM delta.`$tablePath`").collect()
+          val elapsed = (System.nanoTime() - start) / 1000000
+          if (i >= warmupIters) vanillaTimes += elapsed
+        }
+      }
+
+      // Comet native Delta
+      val cometTimes = new scala.collection.mutable.ArrayBuffer[Long]()
+      withSQLConf(
+        CometConf.COMET_ENABLED.key -> "true",
+        CometConf.COMET_EXEC_ENABLED.key -> "true",
+        CometConf.COMET_DELTA_NATIVE_ENABLED.key -> "true") {
+        for (i <- 0 until (warmupIters + benchIters)) {
+          val start = System.nanoTime()
+          spark.sql(s"SELECT SUM(id), SUM(score) FROM delta.`$tablePath`").collect()
+          val elapsed = (System.nanoTime() - start) / 1000000
+          if (i >= warmupIters) cometTimes += elapsed
+        }
+      }
+
+      val vanillaAvg = vanillaTimes.sum.toDouble / vanillaTimes.size
+      val cometAvg = cometTimes.sum.toDouble / cometTimes.size
+      val speedup = vanillaAvg / cometAvg
+
+      // scalastyle:off println
+      println(f"\n=== Results (${benchIters} iterations, ${warmupIters} warmup) ===")
+      println(
+        f"  Vanilla Spark+Delta: ${vanillaAvg}%.0f ms avg (${vanillaTimes.mkString(", ")} ms)")
+      println(f"  Comet Native Delta:  ${cometAvg}%.0f ms avg (${cometTimes.mkString(", ")} ms)")
+      println(f"  Speedup: ${speedup}%.2fx")
+      println()
+      // scalastyle:on println
+
+      // Don't assert on speedup - just report numbers.
+      // On debug builds the native path may actually be slower due to no LTO.
+    } finally {
+      def deleteRecursively(file: java.io.File): Unit = {
+        if (file.isDirectory) { Option(file.listFiles()).foreach(_.foreach(deleteRecursively)) }
+        file.delete()
+      }
+      deleteRecursively(tempDir)
+    }
+  }
+
+  test("benchmark: filter scan - vanilla vs Comet native Delta") {
+    assume(deltaSparkAvailable, "delta-spark not on the test classpath")
+
+    val tempDir = Files.createTempDirectory("comet-delta-bench-filter").toFile
+    try {
+      val tablePath = new java.io.File(tempDir, "bench").getAbsolutePath
+      val numRows = 2 * 1000 * 1000
+      val numFiles = 4
+
+      // scalastyle:off println
+      println(s"\n=== Comet Delta Filter Benchmark: $numRows rows, $numFiles files ===\n")
+      // scalastyle:on println
+
+      val ss = spark
+      import ss.implicits._
+      val df =
+        (0 until numRows).map(i => (i.toLong, i * 1.5, s"name_$i")).toDF("id", "score", "name")
+      df.repartition(numFiles).write.format("delta").save(tablePath)
+
+      val warmupIters = 2
+      val benchIters = 5
+      val query = s"SELECT COUNT(*), SUM(score) FROM delta.`$tablePath` WHERE id > ${numRows / 2}"
+
+      val vanillaTimes = new scala.collection.mutable.ArrayBuffer[Long]()
+      withSQLConf(
+        CometConf.COMET_ENABLED.key -> "false",
+        CometConf.COMET_EXEC_ENABLED.key -> "false") {
+        for (i <- 0 until (warmupIters + benchIters)) {
+          val start = System.nanoTime()
+          spark.sql(query).collect()
+          val elapsed = (System.nanoTime() - start) / 1000000
+          if (i >= warmupIters) vanillaTimes += elapsed
+        }
+      }
+
+      val cometTimes = new scala.collection.mutable.ArrayBuffer[Long]()
+      withSQLConf(
+        CometConf.COMET_ENABLED.key -> "true",
+        CometConf.COMET_EXEC_ENABLED.key -> "true",
+        CometConf.COMET_DELTA_NATIVE_ENABLED.key -> "true") {
+        for (i <- 0 until (warmupIters + benchIters)) {
+          val start = System.nanoTime()
+          spark.sql(query).collect()
+          val elapsed = (System.nanoTime() - start) / 1000000
+          if (i >= warmupIters) cometTimes += elapsed
+        }
+      }
+
+      val vanillaAvg = vanillaTimes.sum.toDouble / vanillaTimes.size
+      val cometAvg = cometTimes.sum.toDouble / cometTimes.size
+      val speedup = vanillaAvg / cometAvg
+
+      // scalastyle:off println
+      println(f"\n=== Filter Results (${benchIters} iterations, ${warmupIters} warmup) ===")
+      println(
+        f"  Vanilla Spark+Delta: ${vanillaAvg}%.0f ms avg (${vanillaTimes.mkString(", ")} ms)")
+      println(f"  Comet Native Delta:  ${cometAvg}%.0f ms avg (${cometTimes.mkString(", ")} ms)")
+      println(f"  Speedup: ${speedup}%.2fx")
+      println()
+      // scalastyle:on println
+    } finally {
+      def deleteRecursively(file: java.io.File): Unit = {
+        if (file.isDirectory) { Option(file.listFiles()).foreach(_.foreach(deleteRecursively)) }
+        file.delete()
+      }
+      deleteRecursively(tempDir)
+    }
+  }
+}

Original file line number	Diff line number	Diff line change
`@@ -33,8 +33,8 @@ pub fn catalyst_to_kernel_predicate_with_names(`
`33`	`33`	`translate_predicate(expr, column_names)`
`34`	`34`	`}`
`35`	`35`
`36`		-/// Try to translate a Catalyst-proto `Expr` into a kernel `Predicate`.
`37`		`-#[allow(dead_code)]`
	`36`	+/// Try to translate a Catalyst-proto `Expr` into a kernel `Predicate`
	`37`	`+/// (without column name resolution — BoundReferences become Unknown).`
`38`	`38`	`pub fn catalyst_to_kernel_predicate(expr: &Expr) -> Predicate {`
`39`	`39`	`translate_predicate(expr, &[])`
`40`	`40`	`}`