@@ -21,11 +21,12 @@ package org.apache.comet.serde
2121
2222import java .util .Locale
2323
24- import org .apache .spark .sql .catalyst .expressions .{Attribute , ConvertTimezone , DateAdd , DateDiff , DateFormatClass , DateFromUnixDate , DateSub , DayOfMonth , DayOfWeek , DayOfYear , Days , FromUTCTimestamp , GetDateField , Hour , Hours , LastDay , Literal , MakeDate , Minute , Month , NextDay , Quarter , Second , SecondsToTimestamp , ToUTCTimestamp , TruncDate , TruncTimestamp , UnixDate , UnixTimestamp , WeekDay , WeekOfYear , Year }
24+ import org .apache .spark .sql .catalyst .expressions .{AddMonths , Attribute , ConvertTimezone , DateAdd , DateDiff , DateFormatClass , DateFromUnixDate , DateSub , DayOfMonth , DayOfWeek , DayOfYear , Days , FromUTCTimestamp , GetDateField , Hour , Hours , LastDay , Literal , MakeDate , MakeTimestamp , MicrosToTimestamp , MillisToTimestamp , Minute , Month , MonthsBetween , NextDay , Quarter , Second , SecondsToTimestamp , ToUnixTimestamp , ToUTCTimestamp , TruncDate , TruncTimestamp , UnixDate , UnixMicros , UnixMillis , UnixSeconds , UnixTimestamp , WeekDay , WeekOfYear , Year }
2525import org .apache .spark .sql .internal .SQLConf
2626import org .apache .spark .sql .types .{DateType , DoubleType , FloatType , IntegerType , LongType , StringType , TimestampNTZType , TimestampType }
2727import org .apache .spark .unsafe .types .UTF8String
2828
29+ import org .apache .comet .CometConf
2930import org .apache .comet .CometSparkSessionExtensions .withInfo
3031import org .apache .comet .expressions .{CometCast , CometEvalMode }
3132import org .apache .comet .serde .CometGetDateField .CometGetDateField
@@ -593,17 +594,23 @@ object CometTruncTimestamp extends CometExpressionSerde[TruncTimestamp] {
593594}
594595
595596/**
596- * Converts Spark DateFormatClass expression to DataFusion's to_char function.
597+ * Converts Spark `DateFormatClass` to DataFusion's `to_char` when format and timezone are
598+ * mappable, otherwise routes the expression through the Arrow-direct codegen dispatcher so that
599+ * Spark's own `DateFormatClass.doGenCode` runs inside the Comet pipeline.
597600 *
598- * Spark uses Java SimpleDateFormat patterns while DataFusion uses strftime patterns. This
599- * implementation supports a whitelist of common format strings that can be reliably mapped
600- * between the two systems.
601+ * Routing:
602+ * - format is a literal in `supportedFormats` AND timezone is UTC -> native `to_char`
603+ * - format is a literal in `supportedFormats` AND timezone is non-UTC, with the per-expression
604+ * `allowIncompatible` flag set -> native `to_char` (results may differ from Spark)
605+ * - all other cases -> JVM codegen dispatcher ([[CometScalaUDF.emitJvmCodegenDispatch]]), gated
606+ * by [[CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED]]. When that flag is disabled the operator
607+ * falls back to Spark.
601608 */
602609object CometDateFormat extends CometExpressionSerde [DateFormatClass ] {
603610
604611 /**
605612 * Mapping from Spark SimpleDateFormat patterns to strftime patterns. Only formats in this map
606- * are supported.
613+ * are supported by the native path.
607614 */
608615 val supportedFormats : Map [String , String ] = Map (
609616 // Full date formats
@@ -637,66 +644,50 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] {
637644 // ISO formats
638645 " yyyy-MM-dd'T'HH:mm:ss" -> " %Y-%m-%dT%H:%M:%S" )
639646
640- override def getIncompatibleReasons (): Seq [String ] = Seq (
641- " Non-UTC timezones may produce different results than Spark" )
642-
643- override def getUnsupportedReasons (): Seq [String ] = Seq (
644- " Only the following formats are supported:" +
645- supportedFormats.keys.toSeq.sorted
646- .map(k => s " ` $k` " )
647- .mkString(" \n - " , " \n - " , " " ))
// Every `DateFormatClass` is reported as compatible at this level. Which execution path is used
// (native `to_char` for the supported subset, the codegen dispatcher for the rest when it is
// enabled) is decided later, inside `convert`.
// NOTE(review): the original comment says plan-time tagging happens via `withInfo` on the path
// that returns None; that call is not visible in `convert` itself, so confirm it lives in the
// dispatcher.
override def getSupportLevel(expr: DateFormatClass): SupportLevel = Compatible()
648651
649- override def getSupportLevel (expr : DateFormatClass ): SupportLevel = {
650- // Check timezone - only UTC is fully compatible
651- val timezone = expr.timeZoneId.getOrElse(" UTC" )
652- val isUtc = timezone == " UTC" || timezone == " Etc/UTC"
653-
654- expr.right match {
655- case Literal (fmt : UTF8String , _) =>
656- val format = fmt.toString
657- if (supportedFormats.contains(format)) {
658- if (isUtc) {
659- Compatible ()
660- } else {
661- Incompatible (Some (s " Non-UTC timezone ' $timezone' may produce different results " ))
662- }
663- } else {
664- Unsupported (
665- Some (
666- s " Format ' $format' is not supported. Supported formats: " +
667- supportedFormats.keys.mkString(" , " )))
668- }
669- case _ =>
670- Unsupported (Some (" Only literal format strings are supported" ))
671- }
672- }
/**
 * User-facing notes describing which execution path a `DateFormatClass` takes.
 *
 * The text mirrors the routing in `convert`: allow-listed literal formats run natively for UTC
 * (and for non-UTC when the per-expression allow-incompatible flag is set); everything else goes
 * through the codegen dispatcher when it is enabled.
 *
 * @return a single-element sequence holding the note text
 */
override def getCompatibleNotes(): Seq[String] = Seq(
  "Format strings in a curated allow-list run natively via DataFusion's `to_char` for UTC " +
    "sessions, and also for non-UTC sessions when the per-expression `allowIncompatible` flag " +
    "is set (results may then differ from Spark). Other format strings (including " +
    "non-literal formats), as well as non-UTC sessions without that flag, route through " +
    "Spark's own `DateFormatClass.doGenCode` via the Arrow-direct codegen dispatcher when " +
    "`spark.comet.exec.scalaUDF.codegen.enabled=true`. When the codegen dispatcher is " +
    "disabled (default) the operator falls back to Spark in those cases.")
673659
/**
 * Serializes a `DateFormatClass` either as a native `to_char` call or through the JVM codegen
 * dispatcher.
 *
 * The native path is taken only when the format (`expr.right`) is a literal present in
 * `supportedFormats` and either the expression's timezone is `UTC` / `Etc/UTC` (a missing
 * `timeZoneId` is treated as `UTC`) or `CometConf.isExprAllowIncompat` is true for this
 * expression's config name. Every other case is handed to
 * `CometScalaUDF.emitJvmCodegenDispatch`.
 *
 * @param expr
 *   the date-format expression; `left` is the value being formatted, `right` the format
 * @param inputs
 *   attributes passed through to child serialization
 * @param binding
 *   passed through unchanged to child serialization
 * @return
 *   the result of the chosen path: `optExprWithInfo` for native, the dispatcher's result
 *   otherwise
 */
override def convert(
    expr: DateFormatClass,
    inputs: Seq[Attribute],
    binding: Boolean): Option[ExprOuterClass.Expr] = {
  val timezone = expr.timeZoneId.getOrElse("UTC")
  val isUtc = timezone == "UTC" || timezone == "Etc/UTC"

  // Only a literal format can be looked up in the Spark-pattern -> strftime table.
  val mappedFormat: Option[String] = expr.right match {
    case Literal(fmt: UTF8String, _) => supportedFormats.get(fmt.toString)
    case _ => None
  }

  // `filter` evaluates the predicate only when a mapping exists, so the allow-incompatible
  // lookup is skipped for unmapped formats (same short-circuit order as before).
  val nativeFormat: Option[String] = mappedFormat.filter { _ =>
    isUtc || CometConf.isExprAllowIncompat(getExprConfigName(expr))
  }

  nativeFormat match {
    case Some(strftimeFormat) =>
      val childExpr = exprToProtoInternal(expr.left, inputs, binding)
      val formatExpr = exprToProtoInternal(Literal(strftimeFormat), inputs, binding)
      val optExpr = scalarFunctionExprToProtoWithReturnType(
        "to_char",
        StringType,
        false,
        childExpr,
        formatExpr)
      optExprWithInfo(optExpr, expr, expr.left, expr.right)

    case None =>
      // Hand the full `DateFormatClass` (with `timeZoneId` already stamped by `ResolveTimeZone`)
      // to the codegen dispatcher. It closure-serializes the bound tree, so non-UTC timezones
      // and non-whitelisted / non-literal format strings produce Spark-identical results.
      CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding)
  }
}
702693}
@@ -780,3 +771,21 @@ object CometDays extends CometExpressionSerde[Days] {
780771 optExprWithInfo(optExpr, expr, expr.child)
781772 }
782773}
774+
// Datetime expressions that declare no conversion logic of their own: each object only binds a
// Spark expression class to `CometCodegenDispatch` and overrides nothing.
// NOTE(review): presumably this routes them through the same JVM codegen dispatcher used by
// `CometDateFormat` -- confirm against `CometCodegenDispatch`.

// Date arithmetic.
object CometAddMonths extends CometCodegenDispatch[AddMonths]

object CometMonthsBetween extends CometCodegenDispatch[MonthsBetween]

// Timestamp construction.
object CometMakeTimestamp extends CometCodegenDispatch[MakeTimestamp]

object CometMicrosToTimestamp extends CometCodegenDispatch[MicrosToTimestamp]

object CometMillisToTimestamp extends CometCodegenDispatch[MillisToTimestamp]

// Timestamp to unix-epoch conversions.
object CometUnixSeconds extends CometCodegenDispatch[UnixSeconds]

object CometUnixMillis extends CometCodegenDispatch[UnixMillis]

object CometUnixMicros extends CometCodegenDispatch[UnixMicros]

object CometToUnixTimestamp extends CometCodegenDispatch[ToUnixTimestamp]
0 commit comments