Add Spark fallback when the nested field count exceeds spark.sql.codegen.maxFields, plus a test; format the generated source in the compile-failure log

mbutrovich · mbutrovich · commit 650f619ba134 · 2026-05-15T16:41:37.000-04:00
diff --git a/common/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala b/common/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala
@@ -98,6 +98,18 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
     case _ => false
   }
 
+  /**
+   * Count leaf fields (including nested) in a [[DataType]]; a UDT counts as its `sqlType`.
+   * Mirrors WSCG's `numOfNestedFields` so [[canHandle]] uses the `spark.sql.codegen.maxFields` unit.
+   */
+  private def numOfNestedFields(dataType: DataType): Int = dataType match {
+    case st: StructType => st.fields.map(f => numOfNestedFields(f.dataType)).sum
+    case m: MapType => numOfNestedFields(m.keyType) + numOfNestedFields(m.valueType)
+    case a: ArrayType => numOfNestedFields(a.elementType)
+    case u: org.apache.spark.sql.types.UserDefinedType[_] => numOfNestedFields(u.sqlType)
+    case _ => 1
+  }
+
   /**
    * Plan-time predicate: can the codegen dispatcher handle this bound expression end to end?
    * `None` greenlights the serde to emit the codegen proto; `Some(reason)` forces a Spark
@@ -112,6 +124,19 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
     if (!isSupportedDataType(boundExpr.dataType)) {
       return Some(s"codegen dispatch: unsupported output type ${boundExpr.dataType}")
     }
+    // Mirror WSCG's `spark.sql.codegen.maxFields` gate. Count nested fields in the output type
+    // and in every `BoundReference`'s input type. Wide schemas blow the generated class's typed
+    // input field count, the typed-getter switch, and the constant pool. Refuse here so the
+    // operator falls back to Spark cleanly rather than tripping a Janino compile failure
+    // mid-execution (which Comet has no way to recover from).
+    val maxFields = SQLConf.get.wholeStageMaxNumFields
+    val totalFields = numOfNestedFields(boundExpr.dataType) +
+      boundExpr.collect { case b: BoundReference => numOfNestedFields(b.dataType) }.sum
+    if (totalFields > maxFields) {
+      return Some(
+        s"codegen dispatch: too many nested fields ($totalFields > " +
+          s"spark.sql.codegen.maxFields=$maxFields)")
+    }
     // Reject expressions that can't be safely compiled or cached:
     //   - AggregateFunction / Generator: non-scalar bridge shape.
     //   - CodegenFallback: opts out of `doGenCode`, which our compile path assumes works.
@@ -192,7 +217,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
         case t: Throwable =>
           logError(
             s"CometBatchKernelCodegen: compile failed for ${boundExpr.getClass.getSimpleName}. " +
-              s"Generated source follows:\n${src.body}",
+              s"Generated source follows:\n${CodeFormatter.format(src.code)}",
             t)
           throw t
       }
diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenDispatchSmokeSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenDispatchSmokeSuite.scala
@@ -163,6 +163,33 @@ class CometCodegenDispatchSmokeSuite extends CometTestBase with AdaptiveSparkPla
       s"expected no dispatcher activity under disabled config, got $after")
   }
 
+  test("schema exceeding spark.sql.codegen.maxFields falls back to Spark") {
+    // Plan-time gate under test: `CometBatchKernelCodegen.canHandle` applies WSCG's
+    // `spark.sql.codegen.maxFields` rule, summing nested input fields plus the output field and
+    // declining once the sum passes the cap. Because Comet cannot fall back mid-execution, the
+    // refusal has to come from the serde instead of from a Janino failure on an oversized
+    // kernel. Five input BoundReferences plus a 1-field output give 6 fields, so `maxFields=3`
+    // trips the gate no matter how tests are ordered or how the schema grows later.
+    val sumFiveInts = (a: Int, b: Int, c: Int, d: Int, e: Int) => a + b + c + d + e
+    spark.udf.register("sumFiveInts", sumFiveInts)
+    withTable("t") {
+      sql("CREATE TABLE t (a INT, b INT, c INT, d INT, e INT) USING parquet")
+      sql("INSERT INTO t VALUES (1, 2, 3, 4, 5), (10, 20, 30, 40, 50)")
+      CometScalaUDFCodegen.resetStats()
+      withSQLConf("spark.sql.codegen.maxFields" -> "3") {
+        // Only the dispatcher is refused; the answer must still agree with Spark. ScalaUDF has
+        // no Comet-native path, so under fallback it runs on the JVM Spark path. That is why
+        // this uses `checkSparkAnswer` and not `checkSparkAnswerAndOperator`.
+        checkSparkAnswer(sql("SELECT sumFiveInts(a, b, c, d, e) FROM t"))
+      }
+      // Any compile or cache hit recorded here would mean the dispatcher ran despite the cap.
+      val after = CometScalaUDFCodegen.stats()
+      assert(
+        after.compileCount == 0 && after.cacheHitCount == 0,
+        s"expected dispatcher fallback under maxFields=3, got $after")
+    }
+  }
+
   test("per-batch nullability produces distinct compiles for null-present vs null-absent") {
     // Same ScalaUDF + same Arrow vector class + different observed nullability should hit
     // different cache keys, because `ArrowColumnSpec.nullable` flips when the batch has no