fix_bool_to_timestamp_support

coderfender · coderfender · commit e64e194cfbe9 · 2026-02-26T15:57:55.000-08:00
diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
@@ -168,8 +168,8 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   }
 
   test("cast BooleanType to TimestampType") {
-    // Arrow error: Cast error: Casting from Boolean to Timestamp(Microsecond, Some("UTC")) not supported
-    castTest(generateBools(), DataTypes.TimestampType)
+    // Spark does not support ANSI or Try mode for Boolean to Timestamp casts
+    castTest(generateBools(), DataTypes.TimestampType, testAnsi = false, testTry = false)
   }
 
   // CAST from ByteType
@@ -437,14 +437,10 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   }
 
   test("cast LongType to TimestampType") {
-    // Cast back to long avoids java.sql.Timestamp overflow during collect() for extreme values
     compatibleTimezones.foreach { tz =>
       withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) {
-        withTable("t1") {
-          generateLongs().write.saveAsTable("t1")
-          val df = spark.sql("select a, cast(cast(a as timestamp) as long) from t1")
-          checkSparkAnswerAndOperator(df)
-        }
+        // Use useDataFrameDiff to avoid collect(), which fails on extreme timestamp values
+        castTest(generateLongs(), DataTypes.TimestampType, useDataFrameDiff = true)
       }
     }
   }
@@ -507,11 +503,8 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   test("cast FloatType to TimestampType") {
     compatibleTimezones.foreach { tz =>
       withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) {
-        withTable("t1") {
-          generateFloats().write.saveAsTable("t1")
-          val df = spark.sql("select a, cast(a as timestamp) from t1")
-          assertDataFrameEquals(df)
-        }
+        // Use useDataFrameDiff to avoid collect(), which fails on extreme timestamp values
+        castTest(generateFloats(), DataTypes.TimestampType, useDataFrameDiff = true)
       }
     }
   }
@@ -570,11 +563,8 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   test("cast DoubleType to TimestampType") {
     compatibleTimezones.foreach { tz =>
       withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) {
-        withTable("t1") {
-          generateDoubles().write.saveAsTable("t1")
-          val df = spark.sql("select a, cast(a as timestamp) from t1")
-          assertDataFrameEquals(df)
-        }
+        // Use useDataFrameDiff to avoid collect(), which fails on extreme timestamp values
+        castTest(generateDoubles(), DataTypes.TimestampType, useDataFrameDiff = true)
       }
     }
   }
@@ -1479,15 +1469,18 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
       toType: DataType,
       hasIncompatibleType: Boolean = false,
       testAnsi: Boolean = true,
-      testTry: Boolean = true): Unit = {
+      testTry: Boolean = true,
+      useDataFrameDiff: Boolean = false): Unit = {
 
     withTempPath { dir =>
       val data = roundtripParquet(input, dir).coalesce(1)
 
       withSQLConf((SQLConf.ANSI_ENABLED.key, "false")) {
         // cast() should return null for invalid inputs when ansi mode is disabled
         val df = data.select(col("a"), col("a").cast(toType)).orderBy(col("a"))
-        if (hasIncompatibleType) {
+        if (useDataFrameDiff) {
+          assertDataFrameEquals(df, assertCometNative = !hasIncompatibleType)
+        } else if (hasIncompatibleType) {
           checkSparkAnswer(df)
         } else {
           checkSparkAnswerAndOperator(df)
@@ -1499,7 +1492,9 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
 //          not using spark DSL since it `try_cast` is only available from Spark 4x
           val df2 =
             spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a")
-          if (hasIncompatibleType) {
+          if (useDataFrameDiff) {
+            assertDataFrameEquals(df2, assertCometNative = !hasIncompatibleType)
+          } else if (hasIncompatibleType) {
             checkSparkAnswer(df2)
           } else {
             checkSparkAnswerAndOperator(df2)
@@ -1515,57 +1510,63 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
 
           // cast() should throw exception on invalid inputs when ansi mode is enabled
           val df = data.withColumn("converted", col("a").cast(toType))
-          checkSparkAnswerMaybeThrows(df) match {
-            case (None, None) =>
-            // neither system threw an exception
-            case (None, Some(e)) =>
-              throw e
-            case (Some(e), None) =>
-              // Spark failed but Comet succeeded
-              fail(s"Comet should have failed with ${e.getCause.getMessage}")
-            case (Some(sparkException), Some(cometException)) =>
-              // both systems threw an exception so we make sure they are the same
-              val sparkMessage =
-                if (sparkException.getCause != null) sparkException.getCause.getMessage
-                else sparkException.getMessage
-              val cometMessage =
-                if (cometException.getCause != null) cometException.getCause.getMessage
-                else cometException.getMessage
-              // this if branch should only check decimal to decimal cast and errors when output precision, scale causes overflow.
-              if (df.schema("a").dataType.typeName.contains("decimal") && toType.typeName
-                  .contains("decimal") && sparkMessage.contains("cannot be represented as")) {
-                assert(cometMessage.contains("too large to store"))
-              } else {
-                if (CometSparkSessionExtensions.isSpark40Plus) {
-                  // for Spark 4 we expect to sparkException carries the message
-                  assert(sparkMessage.contains("SQLSTATE"))
-                  if (sparkMessage.startsWith("[NUMERIC_VALUE_OUT_OF_RANGE.WITH_SUGGESTION]")) {
-                    assert(
-                      sparkMessage.replace(".WITH_SUGGESTION] ", "]").startsWith(cometMessage))
-                  } else if (cometMessage.startsWith("[CAST_INVALID_INPUT]") || cometMessage
-                      .startsWith("[CAST_OVERFLOW]")) {
-                    assert(
-                      sparkMessage.startsWith(
-                        cometMessage
-                          .replace(
-                            "If necessary set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.",
-                            "")))
+          if (useDataFrameDiff) {
+            assertDataFrameEquals(df, assertCometNative = !hasIncompatibleType)
+          } else {
+            checkSparkAnswerMaybeThrows(df) match {
+              case (None, None) =>
+              // neither system threw an exception
+              case (None, Some(e)) =>
+                throw e
+              case (Some(e), None) =>
+                // Spark failed but Comet succeeded
+                fail(s"Comet should have failed with ${e.getCause.getMessage}")
+              case (Some(sparkException), Some(cometException)) =>
+                // both systems threw an exception so we make sure they are the same
+                val sparkMessage =
+                  if (sparkException.getCause != null) sparkException.getCause.getMessage
+                  else sparkException.getMessage
+                val cometMessage =
+                  if (cometException.getCause != null) cometException.getCause.getMessage
+                  else cometException.getMessage
+                // this if branch should only check decimal to decimal cast and errors when output precision, scale causes overflow.
+                if (df.schema("a").dataType.typeName.contains("decimal") && toType.typeName
+                    .contains("decimal") && sparkMessage.contains("cannot be represented as")) {
+                  assert(cometMessage.contains("too large to store"))
+                } else {
+                  if (CometSparkSessionExtensions.isSpark40Plus) {
+                    // for Spark 4 we expect sparkException to carry the message
+                    assert(sparkMessage.contains("SQLSTATE"))
+                    if (sparkMessage.startsWith("[NUMERIC_VALUE_OUT_OF_RANGE.WITH_SUGGESTION]")) {
+                      assert(
+                        sparkMessage.replace(".WITH_SUGGESTION] ", "]").startsWith(cometMessage))
+                    } else if (cometMessage.startsWith("[CAST_INVALID_INPUT]") || cometMessage
+                        .startsWith("[CAST_OVERFLOW]")) {
+                      assert(
+                        sparkMessage.startsWith(
+                          cometMessage
+                            .replace(
+                              "If necessary set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.",
+                              "")))
+                    } else {
+                      assert(sparkMessage.startsWith(cometMessage))
+                    }
                   } else {
-                    assert(sparkMessage.startsWith(cometMessage))
+                    // for Spark 3.4 we expect to reproduce the error message exactly
+                    assert(cometMessage == sparkMessage)
                   }
-                } else {
-                  // for Spark 3.4 we expect to reproduce the error message exactly
-                  assert(cometMessage == sparkMessage)
                 }
-              }
+            }
           }
 
           // try_cast() should always return null for invalid inputs
           if (testTry) {
             data.createOrReplaceTempView("t")
             val df2 =
               spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a")
-            if (hasIncompatibleType) {
+            if (useDataFrameDiff) {
+              assertDataFrameEquals(df2, assertCometNative = !hasIncompatibleType)
+            } else if (hasIncompatibleType) {
               checkSparkAnswer(df2)
             } else {
               checkSparkAnswerAndOperator(df2)
diff --git a/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala b/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala
@@ -1278,33 +1278,34 @@ abstract class CometTestBase
   }
 
   /**
-   * Uses except (difference) to find differences without using collect() Checks cometDF and
-   * sparkDF including schemas
+   * Compares Spark and Comet schemas and rows using exceptAll instead of collect(). This avoids
+   * java.sql.Timestamp overflow issues with extreme timestamp values.
    */
   protected def assertDataFrameEquals(
       df: => DataFrame,
       assertCometNative: Boolean = true): Unit = {
 
-    var sparkDf: DataFrame = null
+    var dfSpark: DataFrame = null
     withSQLConf(CometConf.COMET_ENABLED.key -> "false") {
-      sparkDf = datasetOfRows(spark, df.logicalPlan)
+      dfSpark = datasetOfRows(spark, df.logicalPlan)
     }
-    val cometDf = datasetOfRows(spark, df.logicalPlan)
+    val dfComet = datasetOfRows(spark, df.logicalPlan)
 
     // Compare schemas
     assert(
-      sparkDf.schema == cometDf.schema,
-      s"Schema mismatch:\nCorrect Answer: ${sparkDf.schema}\nSpark Answer: ${cometDf.schema}")
-
-    // Use except (difference) to compare DataFrames without collect() which error on extremely high Timestamp values
-    val sparkMinusComet = sparkDf.except(cometDf)
-    val cometMinusSpark = cometDf.except(sparkDf)
+      dfSpark.schema == dfComet.schema,
+      s"Schema mismatch:\nSpark: ${dfSpark.schema}\nComet: ${dfComet.schema}")
 
+    val sparkMinusComet = dfSpark.exceptAll(dfComet)
+    val cometMinusSpark = dfComet.exceptAll(dfSpark)
     val diffCount1 = sparkMinusComet.count()
     val diffCount2 = cometMinusSpark.count()
 
     if (diffCount1 > 0 || diffCount2 > 0) {
-      fail("DataFrames count doesnt match.\n")
+      fail(
+        "Results do not match. " +
+          s"Rows in Spark but not Comet: $diffCount1. " +
+          s"Rows in Comet but not Spark: $diffCount2.")
     }
 
     if (assertCometNative) {