fix: fall back from native_datafusion when schema evolution is disabled

andygrove · andygrove · commit cdb64fe2c5df · 2026-03-13T07:13:32.000-06:00
DataFusion's native Parquet reader always enables type promotion
regardless of the Comet schema evolution config. When schema evolution
is disabled, fall back to native_iceberg_compat in auto mode to
enforce strict type matching.
diff --git a/docs/source/contributor-guide/parquet_scans.md b/docs/source/contributor-guide/parquet_scans.md
@@ -28,9 +28,12 @@ Comet currently has two distinct implementations of the Parquet scan operator.
 
 The configuration property
 `spark.comet.scan.impl` is used to select an implementation. The default setting is `spark.comet.scan.impl=auto`, which
-currently always uses the `native_iceberg_compat` implementation. Most users should not need to change this setting.
-However, it is possible to force Comet to use a particular implementation for all scan operations by setting
-this configuration property to one of the following implementations. For example: `--conf spark.comet.scan.impl=native_datafusion`.
+selects the best implementation based on the query and the Comet configuration. In auto mode, Comet prefers `native_datafusion` when
+possible and falls back to `native_iceberg_compat` when it detects incompatibilities (such as row indexes, metadata
+columns, `input_file_name()` usage, or `spark.comet.schemaEvolution.enabled` being set to `false`). Most users should
+not need to change this setting. However, it is possible to force Comet to use a particular implementation for all
+scan operations by setting this configuration property to one of the following implementations. For example:
+`--conf spark.comet.scan.impl=native_datafusion`.
 
 The following features are not supported by either scan implementation, and Comet will fall back to Spark in these scenarios:
 
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -240,6 +240,17 @@ case class CometScanRule(session: SparkSession)
         return None
       }
     }
+    // DataFusion's native Parquet reader always enables type promotion
+    // (e.g., int->long, float->double) regardless of the Comet schema
+    // evolution config. When schema evolution is disabled, fall back so
+    // that native_iceberg_compat can enforce strict type matching.
+    if (!COMET_SCHEMA_EVOLUTION_ENABLED.get()) {
+      withInfo(
+        scanExec,
+        "Native DataFusion scan always enables schema evolution " +
+          s"but ${COMET_SCHEMA_EVOLUTION_ENABLED.key} is disabled")
+      return None
+    }
     if (!isSchemaSupported(scanExec, SCAN_NATIVE_DATAFUSION, r)) {
       return None
     }
diff --git a/spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala b/spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala
@@ -984,7 +984,7 @@ abstract class ParquetReadSuite extends CometTestBase {
             withParquetDataFrame(data, schema = Some(readSchema)) { df =>
               // TODO: validate with Spark 3.x and 'usingDataFusionParquetExec=true'
               if (enableSchemaEvolution || CometConf.COMET_NATIVE_SCAN_IMPL
-                  .get(conf) != CometConf.SCAN_NATIVE_ICEBERG_COMPAT) {
+                  .get(conf) == CometConf.SCAN_NATIVE_DATAFUSION) {
                 checkAnswer(df, data.map(Row.fromTuple))
               } else {
                 assertThrows[SparkException](df.collect())

Original file line number	Diff line number	Diff line change
`@@ -240,6 +240,17 @@ case class CometScanRule(session: SparkSession)`
`240`	`240`	`return None`
`241`	`241`	`}`
`242`	`242`	`}`
	`243`	`+ // DataFusion's native Parquet reader always enables type promotion`
	`244`	`+ // (e.g., int->long, float->double) regardless of the Comet schema`
	`245`	`+ // evolution config. When schema evolution is disabled, fall back so`
	`246`	`+ // that native_iceberg_compat can enforce strict type matching.`
	`247`	`+ if (!COMET_SCHEMA_EVOLUTION_ENABLED.get()) {`
	`248`	`+ withInfo(`
	`249`	`+ scanExec,`
	`250`	`+ "Native DataFusion scan always enables schema evolution " +`
	`251`	`+ s"but ${COMET_SCHEMA_EVOLUTION_ENABLED.key} is disabled")`
	`252`	`+ return None`
	`253`	`+ }`
`243`	`254`	`if (!isSchemaSupported(scanExec, SCAN_NATIVE_DATAFUSION, r)) {`
`244`	`255`	`return None`
`245`	`256`	`}`