fix: detect and fall back for native_datafusion incompatibilities

andygrove · andygrove · commit 738694a2d507 · 2026-03-12T22:15:48.000-06:00
- Add detection in CometScanRule for required-schema field names that
  collide under case-insensitive analysis, falling back to
  native_iceberg_compat in that case, where native_datafusion would
  otherwise produce different error messages than Spark.
- Fix schema evolution test to account for auto mode now preferring
  native_datafusion, which always handles type promotion.

The metrics test failures (output_rows=0, filter pushdown=0) are
pre-existing on main and not caused by this change.
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -222,6 +222,19 @@ case class CometScanRule(session: SparkSession)
       withInfo(scanExec, "Native DataFusion scan does not support Parquet field ID matching")
       return None
     }
+    // Under case-insensitive analysis, top-level field names that collide after lowercasing
+    // make DataFusion and Spark report different errors, so do not use native_datafusion.
+    if (!session.sessionState.conf.caseSensitiveAnalysis) {
+      val fieldNames =
+        scanExec.requiredSchema.fieldNames.map(_.toLowerCase(java.util.Locale.ROOT))
+      if (fieldNames.length != fieldNames.distinct.length) {
+        withInfo(
+          scanExec,
+          "Native DataFusion scan does not support " +
+            "duplicate field names in case-insensitive mode")
+        return None
+      }
+    }
     if (!isSchemaSupported(scanExec, SCAN_NATIVE_DATAFUSION, r)) {
       return None
     }
diff --git a/spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala b/spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala
@@ -984,7 +984,7 @@ abstract class ParquetReadSuite extends CometTestBase {
             withParquetDataFrame(data, schema = Some(readSchema)) { df =>
               // TODO: validate with Spark 3.x and 'usingDataFusionParquetExec=true'
               if (enableSchemaEvolution || CometConf.COMET_NATIVE_SCAN_IMPL
-                  .get(conf) == CometConf.SCAN_NATIVE_DATAFUSION) {
+                  .get(conf) != CometConf.SCAN_NATIVE_ICEBERG_COMPAT) {
                 checkAnswer(df, data.map(Row.fromTuple))
               } else {
                 assertThrows[SparkException](df.collect())