fix: reject string/binary read as numeric in native_datafusion scan (#4091)

andygrove · web-flow · commit c05bd16e7188 · 2026-04-27T15:13:57.000-06:00
diff --git a/native/core/src/parquet/schema_adapter.rs b/native/core/src/parquet/schema_adapter.rs
@@ -385,6 +385,40 @@ impl SparkPhysicalExprAdapter {
             let physical_type = cast.input_field().data_type();
             let target_type = cast.target_field().data_type();
 
+            // Reject reading a string/binary Parquet column as anything other
+            // than string, binary, or a binary-encoded decimal. This mirrors
+            // Spark's TypeUtil.checkParquetType for the BINARY case (lines
+            // 208-221): a BINARY (or UTF8-annotated BINARY) physical column is
+            // only readable as StringType, BinaryType, or a binary-encoded
+            // decimal; every other target type (numeric, boolean, date,
+            // timestamp, ...) raises SchemaColumnConvertNotSupportedException.
+            //
+            // Without this guard, Spark's Cast below (in is_adapting_schema
+            // mode) falls through to DataFusion's cast, which silently parses
+            // the bytes (returning nulls for non-numeric strings, parsing
+            // date/timestamp/boolean strings, or in some paths reinterpreting
+            // raw bytes). See issue #4088.
+            if matches!(
+                physical_type,
+                DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary
+            ) && !matches!(
+                target_type,
+                DataType::Utf8
+                    | DataType::LargeUtf8
+                    | DataType::Binary
+                    | DataType::LargeBinary
+                    | DataType::Decimal128(_, _)
+                    | DataType::Decimal256(_, _)
+            ) {
+                return Err(DataFusionError::Plan(format!(
+                    "Parquet column cannot be converted. Column: [{}], \
+                     Expected: {}, Found: {}",
+                    cast.input_field().name(),
+                    target_type,
+                    physical_type,
+                )));
+            }
+
             // For complex nested types (Struct, List, Map), Timestamp timezone
             // mismatches, and Timestamp→Int64 (nanosAsLong), use CometCastColumnExpr
             // with spark_parquet_convert which handles field-name-based selection,
diff --git a/spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala b/spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala
@@ -998,6 +998,30 @@ abstract class ParquetReadSuite extends CometTestBase {
     }
   }
 
+  test("native_datafusion rejects string read as non-string/binary type") {
+    // See https://github.com/apache/datafusion-comet/issues/4088. Spark's
+    // vectorized reader (TypeUtil.checkParquetType, BINARY case) only allows a
+    // Parquet BINARY column to be read as StringType, BinaryType, or a
+    // binary-encoded decimal. The native_datafusion schema adapter has to
+    // enforce the same rule; otherwise DataFusion's cast silently parses or
+    // reinterprets the bytes instead of failing the read.
+    withSQLConf(
+      CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_DATAFUSION,
+      SQLConf.USE_V1_SOURCE_LIST.key -> "parquet") {
+      withTempPath { dir =>
+        val parquetDir = dir.getCanonicalPath
+        Seq("a", "b", "c").toDF("c").write.parquet(parquetDir)
+        // One representative per rejected target family: numeric, boolean,
+        // date, and timestamp. Without the schema-adapter guard each of these
+        // reads would silently produce wrong results rather than an error.
+        for (sqlType <- Seq("int", "bigint", "double", "boolean", "date", "timestamp")) {
+          val mistyped = spark.read.schema(s"c $sqlType").parquet(parquetDir)
+          assertThrows[SparkException](mistyped.collect())
+        }
+      }
+    }
+  }
+
   test("type widening: byte → short/int/long, short → int/long, int → long") {
     withSQLConf(CometConf.COMET_SCHEMA_EVOLUTION_ENABLED.key -> "true") {
       withTempPath { dir =>