fix: enable native_datafusion Spark SQL tests for #3320, #3401, #3719

andygrove · andygrove · commit 82acd9ceda5c · 2026-03-17T09:36:51.000-06:00
- Remove IgnoreCometNativeDataFusion tags from 5 tests that now pass: - ParquetFilterSuite: SPARK-31026 and row group level filter pushdown - StreamingSelfUnionSuite: DSv1 self-union tests - FileBasedDataSourceSuite: caseSensitive test - Add SparkError::DuplicateFieldCaseInsensitive to convert DataFusion's "Unable to get field named" schema error to SparkRuntimeException with error class _LEGACY_ERROR_TEMP_2093, matching Spark's behavior - Re-link remaining #3311 tests to specific issues #3719, #3720
diff --git a/dev/diffs/3.5.8.diff b/dev/diffs/3.5.8.diff
diff --git a/docs/source/contributor-guide/parquet_scans.md b/docs/source/contributor-guide/parquet_scans.md
@@ -63,9 +63,10 @@ cause Comet to fall back to Spark.
   The `native_datafusion` scan does not use Spark's `FileScanRDD`, so these functions cannot populate their values.
 - No support for `ignoreMissingFiles` or `ignoreCorruptFiles` being set to `true`
 - No support for duplicate field names in case-insensitive mode. When the required or data schema contains
-  field names that differ only by case (e.g., `B` and `b`), Comet falls back to Spark. Note that duplicates
-  in the physical Parquet file that are not reflected in the table schema cannot be detected at plan time,
-  so DataFusion may produce a different error message than Spark in that case.
+  field names that differ only by case (e.g., `B` and `b`), Comet falls back to Spark. Duplicates
+  in the physical Parquet file that are not reflected in the table schema cannot be detected at plan time;
+  in that case Comet converts the resulting DataFusion error into a `SparkRuntimeException` with error
+  class `_LEGACY_ERROR_TEMP_2093`, matching Spark's behavior.
 
 The `native_iceberg_compat` scan has the following additional limitation that may produce incorrect results
 without falling back to Spark:
diff --git a/native/core/src/errors.rs b/native/core/src/errors.rs
@@ -436,13 +436,17 @@ fn throw_exception(env: &mut JNIEnv, error: &CometError, backtrace: Option<Strin
             // Handle direct SparkError - serialize to JSON
             CometError::Spark(spark_error) => throw_spark_error_as_json(env, spark_error),
             _ => {
-                // Check for file-not-found errors that may arrive through other wrapping paths
                 let error_msg = error.to_string();
+                // Check for file-not-found errors that may arrive through other wrapping paths
                 if error_msg.contains("not found")
                     && error_msg.contains("No such file or directory")
                 {
                     let spark_error = SparkError::FileNotFound { message: error_msg };
                     throw_spark_error_as_json(env, &spark_error)
+                } else if let Some(spark_error) =
+                    try_convert_duplicate_field_error(&error_msg)
+                {
+                    throw_spark_error_as_json(env, &spark_error)
                 } else {
                     let exception = error.to_exception();
                     match backtrace {
@@ -474,6 +478,42 @@ fn throw_spark_error_as_json(
     )
 }
 
+/// Maps DataFusion's "Unable to get field named" error text to `DuplicateFieldCaseInsensitive`
+/// (Parquet files whose field names collide case-insensitively, e.g. "b" and "B"); `None` if no match.
+/// NOTE(review): every matching message converts, duplicate or not - confirm that is intended.
+fn try_convert_duplicate_field_error(error_msg: &str) -> Option<SparkError> {
+    // Expected text: Schema error: Unable to get field named "X". Valid fields: [...]
+    lazy_static! {
+        static ref FIELD_RE: Regex =
+            Regex::new(r#"Unable to get field named "([^"]+)"\. Valid fields: \[(.+)\]"#)
+                .unwrap();
+    }
+    if let Some(caps) = FIELD_RE.captures(error_msg) {
+        let requested_field = caps.get(1)?.as_str();
+        // Split the captured list (["b"] or ["b", "B"]) on commas; NOTE(review): a name with ',' splits
+        let valid_fields_raw = caps.get(2)?.as_str();
+        let mut fields: Vec<String> = valid_fields_raw
+            .split(',')
+            .map(|s| s.trim().trim_matches('"').to_string())
+            .collect();
+        // DataFusion only reports fields it found; append the requested name when no listed field
+        // equals it exactly (case-sensitive), to match Spark's listing of all ambiguous fields
+        if !fields.iter().any(|f| f == requested_field) {
+            fields.push(requested_field.to_string());
+        }
+        // Spark uses the lowercase required field name; the list above keeps its original casing
+        let required_field_name = requested_field.to_lowercase();
+        // Render the list the way Spark expects it, e.g. [b, B]
+        let matched_fields = format!("[{}]", fields.join(", "));
+        Some(SparkError::DuplicateFieldCaseInsensitive {
+            required_field_name,
+            matched_fields,
+        })
+    } else {
+        None
+    }
+}
+
 #[derive(Debug, Error)]
 enum StacktraceError {
     #[error("Unable to initialize message: {0}")]
diff --git a/native/spark-expr/src/error.rs b/native/spark-expr/src/error.rs
@@ -169,6 +169,12 @@ pub enum SparkError {
     #[error("{message}")]
     FileNotFound { message: String },
 
+    #[error("[_LEGACY_ERROR_TEMP_2093] Found duplicate field(s) \"{required_field_name}\": [{matched_fields}] in case-insensitive mode")]
+    DuplicateFieldCaseInsensitive {
+        required_field_name: String,
+        matched_fields: String,
+    },
+
     #[error("ArrowError: {0}.")]
     Arrow(Arc<ArrowError>),
 
@@ -240,6 +246,9 @@ impl SparkError {
             SparkError::DatatypeCannotOrder { .. } => "DatatypeCannotOrder",
             SparkError::ScalarSubqueryTooManyRows => "ScalarSubqueryTooManyRows",
             SparkError::FileNotFound { .. } => "FileNotFound",
+            SparkError::DuplicateFieldCaseInsensitive { .. } => {
+                "DuplicateFieldCaseInsensitive"
+            }
             SparkError::Arrow(_) => "Arrow",
             SparkError::Internal(_) => "Internal",
         }
@@ -430,6 +439,15 @@ impl SparkError {
                     "message": message,
                 })
             }
+            SparkError::DuplicateFieldCaseInsensitive {
+                required_field_name,
+                matched_fields,
+            } => {
+                serde_json::json!({
+                    "requiredFieldName": required_field_name,
+                    "matchedOrcFields": matched_fields,
+                })
+            }
             SparkError::Arrow(e) => {
                 serde_json::json!({
                     "message": e.to_string(),
@@ -499,6 +517,11 @@ impl SparkError {
             // FileNotFound - will be converted to SparkFileNotFoundException by the shim
             SparkError::FileNotFound { .. } => "org/apache/spark/SparkException",
 
+            // DuplicateFieldCaseInsensitive - converted to SparkRuntimeException by the shim
+            SparkError::DuplicateFieldCaseInsensitive { .. } => {
+                "org/apache/spark/SparkRuntimeException"
+            }
+
             // Generic errors
             SparkError::Arrow(_) | SparkError::Internal(_) => "org/apache/spark/SparkException",
         }
@@ -574,6 +597,11 @@ impl SparkError {
             // File not found
             SparkError::FileNotFound { .. } => Some("_LEGACY_ERROR_TEMP_2055"),
 
+            // Duplicate field in case-insensitive mode
+            SparkError::DuplicateFieldCaseInsensitive { .. } => {
+                Some("_LEGACY_ERROR_TEMP_2093")
+            }
+
             // Generic errors (no error class)
             SparkError::Arrow(_) | SparkError::Internal(_) => None,
         }
diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
@@ -251,6 +251,12 @@ trait ShimSparkErrorConverter {
           QueryExecutionErrors
             .intervalArithmeticOverflowError("Interval arithmetic overflow", "", sqlCtx(context)))
 
+      case "DuplicateFieldCaseInsensitive" =>
+        Some(
+          QueryExecutionErrors.foundDuplicateFieldInCaseInsensitiveModeError(
+            params("requiredFieldName").toString,
+            params("matchedOrcFields").toString))
+
       case "FileNotFound" =>
         val msg = params("message").toString
         // Extract file path from native error message and format like Hadoop's
diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
@@ -247,6 +247,12 @@ trait ShimSparkErrorConverter {
           QueryExecutionErrors
             .intervalArithmeticOverflowError("Interval arithmetic overflow", "", sqlCtx(context)))
 
+      case "DuplicateFieldCaseInsensitive" =>
+        Some(
+          QueryExecutionErrors.foundDuplicateFieldInCaseInsensitiveModeError(
+            params("requiredFieldName").toString,
+            params("matchedOrcFields").toString))
+
       case "FileNotFound" =>
         val msg = params("message").toString
         // Extract file path from native error message and format like Hadoop's
diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
@@ -258,6 +258,12 @@ trait ShimSparkErrorConverter {
           QueryExecutionErrors.withoutSuggestionIntervalArithmeticOverflowError(
             context.headOption.orNull))
 
+      case "DuplicateFieldCaseInsensitive" =>
+        Some(
+          QueryExecutionErrors.foundDuplicateFieldInCaseInsensitiveModeError(
+            params("requiredFieldName").toString,
+            params("matchedOrcFields").toString))
+
       case "FileNotFound" =>
         val msg = params("message").toString
         // Extract file path from native error message and format like Hadoop's