apache
diff --git a/common/src/main/java/org/apache/comet/udf/CometUdfBridge.java b/common/src/main/java/org/apache/comet/udf/CometUdfBridge.java
Lines changed: 0 additions & 3 deletions
diff --git a/common/src/main/scala/org/apache/comet/codegen/CometArrayData.scala b/common/src/main/scala/org/apache/comet/codegen/CometArrayData.scala
Lines changed: 21 additions & 38 deletions
diff --git a/common/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala b/common/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala
Lines changed: 33 additions & 12 deletions
diff --git a/common/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenInput.scala b/common/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenInput.scala
Lines changed: 6 additions & 24 deletions
diff --git a/common/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenOutput.scala b/common/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenOutput.scala
Lines changed: 15 additions & 24 deletions
@@ -199,9 +199,6 @@ private static void evaluateInternal(
       }
 
       result = udf.evaluate(inputs, numRows);
-      assert result instanceof FieldVector
-          : "CometUDF implementations must return FieldVector; got "
-              + (result == null ? "null" : result.getClass().getName());
       if (!(result instanceof FieldVector)) {
         throw new RuntimeException(
             "CometUDF.evaluate() must return a FieldVector, got: " + result.getClass().getName());
 
@@ -27,53 +27,36 @@ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
 import org.apache.comet.shims.CometInternalRowShim
 
 /**
- * Shim base for Comet-owned [[ArrayData]] views used by the Arrow-direct codegen kernel.
- *
+ * Shim base for classes that implement Spark's [[ArrayData]] in the Arrow-direct codegen kernel.
  * Provides `UnsupportedOperationException` defaults for every abstract method on `ArrayData` and
- * `SpecializedGetters`. Codegen emits a concrete subclass per complex-typed input column,
- * overriding only the small set of getters the element type requires (e.g. `numElements`,
- * `isNullAt`, and `getUTF8String` for an `ArrayType(StringType)` input).
+ * `SpecializedGetters`; codegen-emitted subclasses override only the getters their element type
+ * needs (e.g. `numElements`, `isNullAt`, and `getUTF8String` for an `ArrayType(StringType)`
+ * input).
+ *
+ * Consumer: `InputArray_${path}` nested classes the input emitter generates per `ArrayType` input
+ * column. These back the kernel's `getArray(ord)` switch and the recursive nested classes for
+ * `Array<Array<...>>` / array-typed map keys / array-typed struct fields.
  *
- * Pattern mirrors [[CometInternalRow]]: centralize the boilerplate throws so the codegen- emitted
- * subclasses stay short, and absorb forward-compat breakage if Spark adds abstract methods to
- * `ArrayData` in a future version.
+ * Why this exists separately from [[CometInternalRow]]: in Spark, `ArrayData` and `InternalRow`
+ * are sibling abstract classes. They both extend `SpecializedGetters` (so they share the typed
+ * scalar getters) but neither inherits the other, so a base aimed at one cannot serve the other.
+ * The `get(ordinal, dataType)` dispatch body that '''is''' shared between the two lives in
+ * [[CometSpecializedGettersDispatch]].
+ *
+ * [[CometMapData]] is the third sibling for `MapType` views; it backs `InputMap_*` and routes
+ * `keyArray()` / `valueArray()` through `CometArrayData` instances.
  *
  * Mixes in [[CometInternalRowShim]] for the same reason `CometInternalRow` does: Spark 4.x adds
- * new abstract getters (`getVariant`, `getGeography`, `getGeometry`) on `SpecializedGetters` that
- * both `InternalRow` and `ArrayData` inherit. The shim is per-profile and provides throwing
- * defaults only on the profiles that declare those methods abstract.
+ * abstract `SpecializedGetters` methods (`getVariant`, `getGeography`, `getGeometry`) that both
+ * `InternalRow` and `ArrayData` inherit. The shim is per-profile and provides throwing defaults
+ * only on the profiles where those methods are abstract.
  */
 abstract class CometArrayData extends ArrayData with CometInternalRowShim {
 
   override def getInterval(ordinal: Int): CalendarInterval = unsupported("getInterval")
 
-  /**
-   * Generic `get(ordinal, dataType)` dispatcher. Spark codegen sometimes calls this rather than
-   * the typed getter (`SafeProjection` uses it when deserializing struct-valued ScalaUDF args,
-   * for example); leaving it as a throw leaks NPEs once callers catch the
-   * `UnsupportedOperationException` and propagate null. Dispatches to the typed getter matching
-   * `dataType`; a null entry returns `null` outright.
-   */
-  override def get(ordinal: Int, dataType: DataType): AnyRef = {
-    if (isNullAt(ordinal)) return null
-    dataType match {
-      case BooleanType => java.lang.Boolean.valueOf(getBoolean(ordinal))
-      case ByteType => java.lang.Byte.valueOf(getByte(ordinal))
-      case ShortType => java.lang.Short.valueOf(getShort(ordinal))
-      case IntegerType | DateType => java.lang.Integer.valueOf(getInt(ordinal))
-      case LongType | TimestampType | TimestampNTZType =>
-        java.lang.Long.valueOf(getLong(ordinal))
-      case FloatType => java.lang.Float.valueOf(getFloat(ordinal))
-      case DoubleType => java.lang.Double.valueOf(getDouble(ordinal))
-      case _: StringType => getUTF8String(ordinal)
-      case BinaryType => getBinary(ordinal)
-      case dt: DecimalType => getDecimal(ordinal, dt.precision, dt.scale)
-      case st: StructType => getStruct(ordinal, st.size)
-      case _: ArrayType => getArray(ordinal)
-      case _: MapType => getMap(ordinal)
-      case other => unsupported(s"get for dataType $other")
-    }
-  }
+  override def get(ordinal: Int, dataType: DataType): AnyRef =
+    CometSpecializedGettersDispatch.get(this, ordinal, dataType)
 
   override def isNullAt(ordinal: Int): Boolean = unsupported("isNullAt")
 
 
@@ -21,11 +21,12 @@ package org.apache.comet.codegen
 
 import org.apache.arrow.vector._
 import org.apache.arrow.vector.complex.{ListVector, MapVector, StructVector}
+import org.apache.arrow.vector.types.pojo.Field
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, Literal, Unevaluable}
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types._
 
 import org.apache.comet.shims.CometExprTraitShim
 
@@ -83,18 +84,35 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
     case other => throw new IllegalArgumentException(s"unknown Arrow vector class: $other")
   }
 
+  /**
+   * Type surface the kernel covers, on both the input getter side and the output writer side.
+   * Recursive: `ArrayType` / `StructType` / `MapType` are supported when their children are.
+   * Input and output use a single predicate today; if they ever need to diverge, split this back
+   * into per-direction methods.
+   */
+  def isSupportedDataType(dt: DataType): Boolean = dt match {
+    case BooleanType | ByteType | ShortType | IntegerType | LongType => true
+    case FloatType | DoubleType => true
+    case _: DecimalType => true
+    case _: StringType | _: BinaryType => true
+    case DateType | TimestampType | TimestampNTZType => true
+    case ArrayType(inner, _) => isSupportedDataType(inner)
+    case st: StructType => st.fields.forall(f => isSupportedDataType(f.dataType))
+    case mt: MapType => isSupportedDataType(mt.keyType) && isSupportedDataType(mt.valueType)
+    case _ => false
+  }
+
   /**
    * Plan-time predicate: can the codegen dispatcher handle this bound expression end to end? If
    * it returns `None`, the serde is free to emit the codegen proto. If it returns `Some(reason)`,
    * the serde must fall back (usually via `withInfo(...) + None`) so Spark runs the expression
    * rather than crashing in the Janino compile at execute time.
    *
    * Checks:
-   *   - every `BoundReference`'s data type is in
-   *     [[CometBatchKernelCodegenInput.isSupportedInputType]] (i.e. the kernel has a typed getter
-   *     for it)
-   *   - the overall `expr.dataType` is in [[CometBatchKernelCodegenOutput.isSupportedOutputType]]
-   *     (i.e. `allocateOutput` and `emitWrite` know how to materialize it)
+   *   - every `BoundReference`'s data type satisfies [[isSupportedDataType]] (i.e. the kernel
+   *     has a typed getter for it)
+   *   - the overall `expr.dataType` satisfies [[isSupportedDataType]] (i.e. `allocateOutput`
+   *     and `emitWrite` know how to materialize it)
    *   - the expression is scalar (no `AggregateFunction`, no generators). These never reach a
    *     scalar serde, but we belt-and-suspenders anyway.
    *
@@ -103,7 +121,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
    * the output vector) touch Arrow.
    */
   def canHandle(boundExpr: Expression): Option[String] = {
-    if (!CometBatchKernelCodegenOutput.isSupportedOutputType(boundExpr.dataType)) {
+    if (!isSupportedDataType(boundExpr.dataType)) {
       return Some(s"codegen dispatch: unsupported output type ${boundExpr.dataType}")
     }
     // Reject expressions that can't be safely compiled or cached:
@@ -155,7 +173,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
       case None =>
     }
     val badRef = boundExpr.collectFirst {
-      case b: BoundReference if !CometBatchKernelCodegenInput.isSupportedInputType(b.dataType) =>
+      case b: BoundReference if !isSupportedDataType(b.dataType) =>
         b
     }
     badRef.map(b =>
@@ -175,6 +193,10 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
       estimatedBytes: Int = -1): FieldVector =
     CometBatchKernelCodegenOutput.allocateOutput(dataType, name, numRows, estimatedBytes)
 
+  /** Variant that takes a pre-computed Arrow `Field`, letting hot-path callers cache it. */
+  def allocateOutput(field: Field, numRows: Int, estimatedBytes: Int): FieldVector =
+    CometBatchKernelCodegenOutput.allocateOutput(field, numRows, estimatedBytes)
+
   def compile(boundExpr: Expression, inputSchema: Seq[ArrowColumnSpec]): CompiledKernel = {
     val src = generateSource(boundExpr, inputSchema)
     val (clazz, _) =
@@ -188,8 +210,6 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
             t)
           throw t
       }
-    // One log per unique (expr, schema) compile; the caller caches the result so subsequent
-    // batches with the same shape reuse this compile.
     logInfo(
       s"CometBatchKernelCodegen: compiled ${boundExpr.getClass.getSimpleName} " +
         s"-> ${boundExpr.dataType}  inputs=" +
@@ -529,8 +549,9 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
       ScalarColumnSpec(vectorClass, nullable)
 
     /**
-     * Backward-compatible extractor for the common scalar case. Callers that want array / struct
-     * / future map specs should pattern match on the subclass directly.
+     * Trait-level extractor that destructures only the scalar case. Pattern-match callers use
+     * `case ArrowColumnSpec(cls, nullable)` to filter on scalar specs and pull out their vector
+     * class and nullability in one step; it returns `None` for complex specs, skipping the case.
      */
     def unapply(spec: ArrowColumnSpec): Option[(Class[_ <: ValueVector], Boolean)] = spec match {
       case ScalarColumnSpec(c, n) => Some((c, n))
 
@@ -94,23 +94,6 @@ private[codegen] object CometBatchKernelCodegenInput {
     classOf[TimeStampMicroTZVector])
   private val cometPlainVectorName: String = classOf[CometPlainVector].getName
 
-  /**
-   * Input types the kernel has a typed getter for. Recursive: `ArrayType(inner)` supported when
-   * `inner` is supported; `StructType` when every field is; `MapType` when key and value types
-   * are both supported.
-   */
-  def isSupportedInputType(dt: DataType): Boolean = dt match {
-    case BooleanType | ByteType | ShortType | IntegerType | LongType => true
-    case FloatType | DoubleType => true
-    case _: DecimalType => true
-    case _: StringType | _: BinaryType => true
-    case DateType | TimestampType | TimestampNTZType => true
-    case ArrayType(inner, _) => isSupportedInputType(inner)
-    case st: StructType => st.fields.forall(f => isSupportedInputType(f.dataType))
-    case mt: MapType => isSupportedInputType(mt.keyType) && isSupportedInputType(mt.valueType)
-    case _ => false
-  }
-
   /**
    * Emit the kernel's typed vector-field declarations for every level of every input column's
    * spec tree. Top-level complex columns additionally get an instance-field declaration for the
@@ -215,10 +198,10 @@ private[codegen] object CometBatchKernelCodegenInput {
         val fastPath = emitDecimalFastBodyUnsafe(valueAddr, "this.rowIdx", "        ")
         val slowPath = emitDecimalSlowBody(slowField, "this.rowIdx", "        ")
         val body = known match {
-          case Some(dt) if dt.precision <= 18 => fastPath
+          case Some(dt) if dt.precision <= Decimal.MAX_LONG_DIGITS => fastPath
           case Some(_) => slowPath
           case None =>
-            s"""        if (precision <= 18) {
+            s"""        if (precision <= ${Decimal.MAX_LONG_DIGITS}) {
                |$fastPath
                |        } else {
                |$slowPath
@@ -608,7 +591,7 @@ private[codegen] object CometBatchKernelCodegenInput {
         collectNestedClasses(s"${path}_f$fi", f.child, out)
       }
     case mp: MapColumnSpec =>
-      out += emitMapClass(path, mp)
+      out += emitMapClass(path)
       // Emit InputArray_${path}_k and InputArray_${path}_v - the ArrayData views returned by
       // `MapData.keyArray()` / `valueArray()`. They follow the standard array-element
       // convention: each reads from `${classPath}_e` which maps to the key / value vector
@@ -754,7 +737,7 @@ private[codegen] object CometBatchKernelCodegenInput {
            |      }""".stripMargin
       case dt: DecimalType =>
         val body =
-          if (dt.precision <= 18) {
+          if (dt.precision <= Decimal.MAX_LONG_DIGITS) {
             emitDecimalFastBodyUnsafe(s"${childField}_valueAddr", "startIndex + i", "        ")
           } else {
             emitDecimalSlowBody(childField, "startIndex + i", "        ")
@@ -947,7 +930,7 @@ private[codegen] object CometBatchKernelCodegenInput {
         val dt = f.sparkType.asInstanceOf[DecimalType]
         val field = s"${path}_f$fi"
         val body =
-          if (dt.precision <= 18) {
+          if (dt.precision <= Decimal.MAX_LONG_DIGITS) {
             emitDecimalFastBodyUnsafe(s"${field}_valueAddr", "this.rowIdx", "          ")
           } else {
             emitDecimalSlowBody(field, "this.rowIdx", "          ")
@@ -1024,8 +1007,7 @@ private[codegen] object CometBatchKernelCodegenInput {
    * `keyArray()` / `valueArray()` through pre-allocated `InputArray_${path}_k` /
    * `InputArray_${path}_v` instances (emitted by [[collectNestedClasses]]).
    */
-  private def emitMapClass(path: String, spec: MapColumnSpec): String = {
-    val _ = spec // key/value arrays declared via path convention below
+  private def emitMapClass(path: String): String = {
     val baseClassName = classOf[CometMapData].getName
     val keyPath = s"${path}_k"
     val valPath = s"${path}_v"
 
@@ -21,6 +21,7 @@ package org.apache.comet.codegen
 
 import org.apache.arrow.vector._
 import org.apache.arrow.vector.complex.{ListVector, MapVector, StructVector}
+import org.apache.arrow.vector.types.pojo.Field
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
 import org.apache.spark.sql.comet.util.Utils
 import org.apache.spark.sql.types._
@@ -37,23 +38,6 @@ import org.apache.comet.CometArrowAllocator
  */
 private[codegen] object CometBatchKernelCodegenOutput {
 
-  /**
-   * Output types [[allocateOutput]] and [[emitOutputWriter]] can materialize. Recursive: complex
-   * types are supported when their children are.
-   */
-  def isSupportedOutputType(dt: DataType): Boolean = dt match {
-    case BooleanType | ByteType | ShortType | IntegerType | LongType => true
-    case FloatType | DoubleType => true
-    case _: DecimalType => true
-    case _: StringType | _: BinaryType => true
-    case DateType | TimestampType | TimestampNTZType => true
-    case ArrayType(inner, _) => isSupportedOutputType(inner)
-    case st: StructType => st.fields.forall(f => isSupportedOutputType(f.dataType))
-    case mt: MapType =>
-      isSupportedOutputType(mt.keyType) && isSupportedOutputType(mt.valueType)
-    case _ => false
-  }
-
   /**
    * Allocate an Arrow output vector matching `dataType`. Delegates field and vector construction
    * to [[Utils.toArrowField]] + `Field.createVector`, which is the pattern the rest of Comet uses
@@ -73,15 +57,19 @@ private[codegen] object CometBatchKernelCodegenOutput {
       dataType: DataType,
       name: String,
       numRows: Int,
-      estimatedBytes: Int = -1): FieldVector = {
-    val field = Utils.toArrowField(name, dataType, nullable = true, "UTC")
+      estimatedBytes: Int = -1): FieldVector =
+    allocateOutput(
+      Utils.toArrowField(name, dataType, nullable = true, "UTC"),
+      numRows,
+      estimatedBytes)
+
+  /** Variant that takes a pre-computed Arrow `Field`, letting hot-path callers cache it. */
+  def allocateOutput(field: Field, numRows: Int, estimatedBytes: Int): FieldVector = {
     val vec = field.createVector(CometArrowAllocator).asInstanceOf[FieldVector]
     try {
       vec.setInitialCapacity(numRows)
       vec match {
-        case v: VarCharVector if estimatedBytes > 0 =>
-          v.allocateNew(estimatedBytes.toLong, numRows)
-        case v: VarBinaryVector if estimatedBytes > 0 =>
+        case v: BaseVariableWidthVector if estimatedBytes > 0 =>
           v.allocateNew(estimatedBytes.toLong, numRows)
         case _ =>
           vec.allocateNew()
@@ -172,8 +160,11 @@ private[codegen] object CometBatchKernelCodegenOutput {
       // `DecimalVector.setSafe(int, long)` and skip the `java.math.BigDecimal` allocation
       // `setSafe(int, BigDecimal)` requires. For p > 18 the BigDecimal path is unavoidable.
       val write =
-        if (dt.precision <= 18) s"$targetVec.setSafe($idx, $source.toUnscaledLong());"
-        else s"$targetVec.setSafe($idx, $source.toJavaBigDecimal());"
+        if (dt.precision <= Decimal.MAX_LONG_DIGITS) {
+          s"$targetVec.setSafe($idx, $source.toUnscaledLong());"
+        } else {
+          s"$targetVec.setSafe($idx, $source.toJavaBigDecimal());"
+        }
       OutputEmit("", write)
     case _: StringType =>
       // Optimization: Utf8OutputOnHeapShortcut.
Original file line number	Diff line number	Diff line change
`@@ -199,9 +199,6 @@ private static void evaluateInternal(`
`199`	`199`	`}`
`200`	`200`
`201`	`201`	`result = udf.evaluate(inputs, numRows);`
`202`		`- assert result instanceof FieldVector`
`203`		`- : "CometUDF implementations must return FieldVector; got "`
`204`		`- + (result == null ? "null" : result.getClass().getName());`
`205`	`202`	`if (!(result instanceof FieldVector)) {`
`206`	`203`	`throw new RuntimeException(`
`207`	`204`	`"CometUDF.evaluate() must return a FieldVector, got: " + result.getClass().getName());`