coderfender
diff --git a/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/pr_build_linux.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/pr_build_macos.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/contributor-guide/expression-audits/json_funcs.md‎
Lines changed: 6 additions & 0 deletions b/‎docs/source/contributor-guide/expression-audits/json_funcs.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/source/user-guide/latest/compatibility/index.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/user-guide/latest/compatibility/index.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/user-guide/latest/compatibility/json.md‎
Lines changed: 56 additions & 0 deletions b/‎docs/source/user-guide/latest/compatibility/json.md‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎docs/source/user-guide/latest/expressions.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/user-guide/latest/expressions.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/user-guide/latest/index.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/user-guide/latest/index.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala‎
Lines changed: 13 additions & 15 deletions b/‎spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala‎
Lines changed: 13 additions & 15 deletions
diff --git a/‎spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala‎
Lines changed: 13 additions & 3 deletions b/‎spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎spark/src/main/scala/org/apache/comet/serde/json.scala‎
Lines changed: 23 additions & 11 deletions b/‎spark/src/main/scala/org/apache/comet/serde/json.scala‎
Lines changed: 23 additions & 11 deletions
@@ -377,6 +377,7 @@ jobs:
               org.apache.comet.CometMapExpressionSuite
               org.apache.comet.CometCsvExpressionSuite
               org.apache.comet.CometJsonExpressionSuite
+              org.apache.comet.CometJsonJvmSuite
               org.apache.comet.SparkErrorConverterSuite
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
 
@@ -193,6 +193,7 @@ jobs:
               org.apache.comet.CometMapExpressionSuite
               org.apache.comet.CometCsvExpressionSuite
               org.apache.comet.CometJsonExpressionSuite
+              org.apache.comet.CometJsonJvmSuite
               org.apache.comet.SparkErrorConverterSuite
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
 
@@ -33,6 +33,12 @@
 - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1.
 - Known incompatibility: Spark accepts single-quoted JSON and unescaped control characters; Comet's native parser (built on `serde_json`) rejects both, so those inputs require `spark.comet.expression.GetJsonObject.allowIncompatible=true` and may still produce different results. Non-default Spark 4.0 string collations are not propagated (https://github.com/apache/datafusion-comet/issues/2190).
 
+## json_array_length
+
+- `LengthOfJsonArray`: `UnaryExpression with ExpectsInputTypes with CodegenFallback`; `inputTypes = Seq(StringType) -> IntegerType`. Returns NULL for NULL input, invalid JSON, or non-array JSON; otherwise the number of top-level array elements.
+- Runs through the codegen dispatcher by default for byte-exact Spark compatibility.
+- Known incompatibility: the native path (built on `serde_json`) is opt-in via `spark.comet.expression.LengthOfJsonArray.allowIncompatible=true` and requires strict JSON, so single-quoted JSON, unescaped control characters, and trailing content may produce results that differ from Spark when it is enabled.
+
 ## to_json
 
 - Partial native support; options and map/array inputs fall back.
 
@@ -28,4 +28,5 @@ This guide documents areas where Comet's behavior is known to differ from Spark.
 - **Regular expressions**: differences between the Rust regexp crate and Java's regex engine.
 - **Operators**: operator-level compatibility notes, including window functions and round-robin partitioning.
 - **Expressions**: per-expression compatibility notes, including cast.
+- **JSON**: choosing between the native and Spark-compatible engines for JSON expressions.
 - **Spark versions**: version-specific known issues and limitations.
@@ -0,0 +1,56 @@
+<!---
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# JSON Compatibility
+
+Comet can evaluate JSON expressions (`get_json_object`, `from_json`, `to_json`,
+`json_array_length`) two ways:
+
+- **Codegen dispatcher (default):** Spark's own `doGenCode` for the expression
+  runs inside the Comet pipeline (via Comet's Arrow-direct codegen dispatcher),
+  giving byte-exact compatibility with Spark at the cost of a JNI roundtrip per
+  batch. This path is controlled by
+  `spark.comet.exec.scalaUDF.codegen.enabled` (enabled by default); if the
+  dispatcher is disabled, the operator falls back to Spark.
+- **Native (Rust) path:** the native DataFusion implementation. Faster, but has
+  known compatibility gaps with Spark on certain inputs, so it is **opt-in per
+  expression** via the expression's `allowIncompatible` config. Any expression or
+  input case with no native implementation falls back to the codegen dispatcher.
+
+## Expression coverage
+
+| SQL                 | Native (Rust) path                                                                             | Opt-in config                                                |
+| ------------------- | ---------------------------------------------------------------------------------------------- | ------------------------------------------------------------ |
+| `get_json_object`   | Supported, with gaps on single-quoted JSON and unescaped control characters                    | `spark.comet.expression.GetJsonObject.allowIncompatible`     |
+| `from_json`         | Supported with restrictions (PERMISSIVE mode only, simple schema types only)                   | `spark.comet.expression.JsonToStructs.allowIncompatible`     |
+| `to_json`           | Supported for struct inputs only, no options                                                   | `spark.comet.expression.StructsToJson.allowIncompatible`     |
+| `json_array_length` | Supported, with gaps on single-quoted JSON, unescaped control characters, and trailing content | `spark.comet.expression.LengthOfJsonArray.allowIncompatible` |
+
+When the native path is enabled but an expression or input case has no native
+implementation (for example `to_json` with map or array inputs, or `from_json`
+with an unsupported schema), Comet falls back to the codegen dispatcher for that
+case.
+
+## When to use the native path
+
+- You want the faster native path and your inputs avoid the known compatibility
+  gaps above.
+- Enable it per expression, for example
+  `spark.comet.expression.GetJsonObject.allowIncompatible=true`. Cases the native path
+  does not cover still fall back to the codegen dispatcher.
@@ -338,7 +338,7 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the
 | --- | --- | --- |
 | `from_json` | ✅ | Falls back by default; opt-in via allowIncompatible ([audit](../../contributor-guide/expression-audits/json_funcs.md#from_json)) |
 | `get_json_object` | ✅ | Some inputs need allowIncompatible ([audit](../../contributor-guide/expression-audits/json_funcs.md#get_json_object)) |
-| `json_array_length` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) |
+| `json_array_length` | ✅ | Spark-compatible by default; the opt-in native path (allowIncompatible) differs on single-quoted/trailing JSON ([audit](../../contributor-guide/expression-audits/json_funcs.md#json_array_length)) |
 | `json_object_keys` | 🔜 | [#3161](https://github.com/apache/datafusion-comet/issues/3161) |
 | `json_tuple` | 🔜 | [#3160](https://github.com/apache/datafusion-comet/issues/3160) |
 | `schema_of_json` | 🔜 | [#3163](https://github.com/apache/datafusion-comet/issues/3163) |
 
@@ -62,6 +62,7 @@ to read more.
    compatibility/regex
    compatibility/operators
    compatibility/expressions/index
+   compatibility/json
    compatibility/spark-versions
 
 .. toctree::
 
@@ -23,7 +23,7 @@ import org.apache.arrow.vector._
 import org.apache.arrow.vector.complex.{ListVector, MapVector, StructVector}
 import org.apache.arrow.vector.types.pojo.Field
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, HigherOrderFunction, LambdaFunction, Literal, NamedLambdaVariable, Unevaluable}
+import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, Literal, Unevaluable}
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
@@ -107,9 +107,8 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
    * back cleanly rather than crashing the Janino compile at execute time.
    *
    * Checks every `BoundReference`'s data type and the root `expr.dataType` against
-   * [[isSupportedDataType]], rejects aggregates / generators / `CodegenFallback` (other than
-   * HOFs, which are admitted), and gates total nested-field count on
-   * `spark.sql.codegen.maxFields`.
+   * [[isSupportedDataType]], rejects aggregates / generators / `Unevaluable`, and gates total
+   * nested-field count on `spark.sql.codegen.maxFields`.
    */
   def canHandle(boundExpr: Expression): Option[String] = {
     if (!isSupportedDataType(boundExpr.dataType)) {
@@ -127,12 +126,15 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
         s"codegen dispatch: too many nested fields ($totalFields > " +
           s"spark.sql.codegen.maxFields=$maxFields)")
     }
-    // HOFs are `CodegenFallback` but admitted: `CodegenFallback.doGenCode` emits one
-    // `((Expression) references[N]).eval(row)` call site per HOF. The kernel dispatches to the
-    // HOF's interpreted `eval`, which mutates `NamedLambdaVariable.value` per element and reads
-    // the input array through the kernel's typed Arrow getters. Per-task `boundExpr` isolation
-    // in `CometScalaUDFCodegen.kernelCache` prevents concurrent partitions from racing on the
-    // lambda variable's `AtomicReference`. See `CometCodegenHOFSuite`.
+    // `CodegenFallback` expressions are admitted. `CodegenFallback.doGenCode` emits one
+    // `((Expression) references[N]).eval(row)` call site per expression. The kernel dispatches
+    // to the expression's interpreted `eval` against `row` aliased to `this`, so the eval reads
+    // through the kernel's typed Arrow getters. This covers `HigherOrderFunction` (which mutates
+    // `NamedLambdaVariable.value` per element; see `CometCodegenHOFSuite`) as well as other
+    // CodegenFallback expressions like `JsonToStructs` / `StructsToJson` whose `eval(row)`
+    // simply calls `row.get(0, dataType)`. Per-task `boundExpr` isolation in
+    // `CometScalaUDFCodegen.kernelCache` prevents concurrent partitions from racing on shared
+    // state inside the expression.
     //
     // Nondeterministic / stateful expressions are accepted: each cache entry holds one kernel
     // instance with a single `init(partitionIndex)` call, so `Rand` / `MonotonicallyIncreasingID`
@@ -150,18 +152,14 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
     boundExpr.find {
       case _: org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction => true
       case _: org.apache.spark.sql.catalyst.expressions.Generator => true
-      case _: HigherOrderFunction => false
-      case _: LambdaFunction => false
-      case _: NamedLambdaVariable => false
-      case _: CodegenFallback => true
       case u: Unevaluable if isCodegenInertUnevaluable(u) => false
       case _: Unevaluable => true
       case _ => false
     } match {
       case Some(bad) =>
         return Some(
           s"codegen dispatch: expression ${bad.getClass.getSimpleName} not supported " +
-            "(aggregate, generator, codegen-fallback, or unevaluable)")
+            "(aggregate, generator, or unevaluable)")
       case None =>
     }
     val badRef = boundExpr.collectFirst {
 
@@ -20,7 +20,7 @@
 package org.apache.comet.serde
 
 import org.apache.spark.SparkEnv
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSeq, BindReferences, Expression, Literal, ScalaUDF}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSeq, BindReferences, Expression, Literal, RuntimeReplaceable, ScalaUDF}
 import org.apache.spark.sql.types.BinaryType
 
 import org.apache.comet.CometConf
@@ -78,10 +78,20 @@ object CometScalaUDF extends CometExpressionSerde[ScalaUDF] {
       return None
     }
 
+    // `RuntimeReplaceable` expressions (e.g. Spark 4's `StructsToJson`) have a `doGenCode` that
+    // always throws "Cannot generate code for expression". Catalyst's `ReplaceExpressions` rule
+    // normally rewrites them to their `replacement` form before codegen runs. Comet's serde
+    // sometimes works with the pre-rewrite form (via shim reconstruction) for matching purposes,
+    // so unwrap to the replacement here before binding so the kernel compiles.
+    val target = expr match {
+      case rr: RuntimeReplaceable => rr.replacement
+      case other => other
+    }
+
     // Bind against only the AttributeReferences the tree actually reads, so ordinals align with
     // the data args we ship.
-    val attrs = expr.collect { case a: AttributeReference => a }.distinct
-    val boundExpr = BindReferences.bindReference(expr, AttributeSeq(attrs))
+    val attrs = target.collect { case a: AttributeReference => a }.distinct
+    val boundExpr = BindReferences.bindReference(target, AttributeSeq(attrs))
 
     // Gate at plan time. Surface the reason via withFallbackReason rather than crashing Janino
     // at execute.
 
@@ -19,18 +19,30 @@
 
 package org.apache.comet.serde
 
-import org.apache.spark.sql.catalyst.expressions.LengthOfJsonArray
+import org.apache.spark.sql.catalyst.expressions.{Attribute, LengthOfJsonArray}
 
-object CometLengthOfJsonArray
-    extends CometScalarFunction[LengthOfJsonArray]("json_array_length") {
+import org.apache.comet.CometConf
+import org.apache.comet.serde.ExprOuterClass.Expr
+import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithFallbackReason, scalarFunctionExprToProto}
 
-  private val IncompatibleReason: String =
-    "Spark's lenient JSON parser allows single quotes, unescaped controls, " +
-      "and trailing content, " +
-      "while Comet's serde_json requires strict JSON."
-
-  override def getIncompatibleReasons(): Seq[String] = Seq(IncompatibleReason)
+/**
+ * `json_array_length` runs Spark's own implementation through the codegen dispatcher by default,
+ * for byte-exact results. The native (Rust) path is faster but incompatible with Spark for
+ * single-quoted JSON, unescaped control characters, and trailing content, so it is opt-in via
+ * `spark.comet.expression.LengthOfJsonArray.allowIncompatible`; when that config is not enabled,
+ * the expression is dispatched through [[CometCodegenDispatch]].
+ */
+object CometLengthOfJsonArray extends CometCodegenDispatch[LengthOfJsonArray] {
 
-  override def getSupportLevel(expr: LengthOfJsonArray): SupportLevel = Incompatible(
-    Some(IncompatibleReason))
+  override def convert(
+      expr: LengthOfJsonArray,
+      inputs: Seq[Attribute],
+      binding: Boolean): Option[Expr] =
+    if (CometConf.isExprAllowIncompat(getExprConfigName(expr))) {
+      val childExpr = expr.children.map(exprToProtoInternal(_, inputs, binding))
+      val optExpr = scalarFunctionExprToProto("json_array_length", childExpr: _*)
+      optExprWithFallbackReason(optExpr, expr, expr.children: _*)
+    } else {
+      super.convert(expr, inputs, binding)
+    }
 }