[SPARK-57736][SQL] Fix NPE in CreateNamedStruct.dataType when a struct field name is null

MaxGekk · MaxGekk · commit cf05617c67a5 · 2026-06-29T23:38:05.000+02:00
### What changes were proposed in this pull request?

`CreateNamedStruct.dataType` builds each field with `StructField(name.toString, ...)`:

```scala
override lazy val dataType: StructType = {
  val fields = names.zip(valExprs).map { case (name, expr) =>
    ...
    StructField(name.toString, expr.dataType, expr.nullable, metadata) // NPE if name == null
  }
  StructType(fields)
}
```

When a field name is `null`, `name.toString` throws a `NullPointerException`. This is reached eagerly while building a `RowEncoder` serializer (`SerializerBuildHelper.createSerializerForObject` -> `CreateNamedStruct(...).dataType`), so it crashes before any analysis runs.

This PR makes the field name null-safe and preserves the null name:

```scala
StructField(if (name == null) null else name.toString, expr.dataType, expr.nullable, metadata)
```

### Why are the changes needed?

A null field name is invalid input -- `CreateNamedStruct.checkInputDataTypes` already rejects it (`names.contains(null)` -> `UNEXPECTED_NULL`) -- but `dataType` dereferences `name.toString` before type checking, and the encoder calls `dataType` directly. Keeping it null-safe converts the hard `NullPointerException` into correct behavior, consistent with SPARK-57725, which made `AttributeSeq` tolerate null-named attributes.

Minimal reproduction:

```scala
import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, Literal}
import org.apache.spark.sql.types.{IntegerType, StringType}

CreateNamedStruct(Seq(Literal.create(null, StringType), Literal(1))).dataType // NPE before this fix
```

Note: this fixes the specific `CreateNamedStruct.dataType` NPE. The full `createDataFrame(schemaWithNullFieldName)` scenario hits additional, independent null-name sites further along (e.g. a `StructField.name.equalsIgnoreCase` schema comparison during resolution), which are separate pre-existing issues and out of scope here.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added a regression test in `ComplexTypeSuite` asserting `dataType` no longer throws and preserves the null field name.

```
build/sbt 'catalyst/testOnly *ComplexTypeSuite'
```

### Was this patch authored or co-authored using generative AI tooling?

Generated-by: Cursor

Closes #56845 from MaxGekk/SPARK-57736-createnamedstruct-npe.

Authored-by: Maxim Gekk <max.gekk@gmail.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
(cherry picked from commit 0525313)
Signed-off-by: Max Gekk <max.gekk@gmail.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -470,7 +470,11 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression with
           case gsf: GetStructField => gsf.metadata
           case _ => Metadata.empty
         }
-        StructField(name.toString, expr.dataType, expr.nullable, metadata)
+        // A null field name is invalid input (checkInputDataTypes flags it as UNEXPECTED_NULL),
+        // but dataType is evaluated eagerly by the encoder before type checking; keep it null-safe
+        // and preserve the null name rather than throwing a NullPointerException (SPARK-57736).
+        StructField(if (name == null) null else name.toString, expr.dataType, expr.nullable,
+          metadata)
     }
     StructType(fields)
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -492,6 +492,29 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
     )
   }
 
+  test("SPARK-57736: CreateNamedStruct.dataType is null-safe when a field name is null") {
+    // `dataType` can be read before type checking, so a null field name must not cause an NPE.
+    val struct = CreateNamedStruct(Seq(Literal.create(null, StringType), Literal(1)))
+    val dt = struct.dataType
+    assert(dt.length == 1)
+    assert(dt.head.name == null)
+    // checkInputDataTypes must still reject the null name with the UNEXPECTED_NULL sub-class.
+    assert(struct.checkInputDataTypes().isFailure)
+    val result = struct.checkInputDataTypes().asInstanceOf[DataTypeMismatch]
+    assert(result.errorSubClass == "UNEXPECTED_NULL")
+
+    // Mixed case: valid names are kept, the null name is preserved, and the check still fails.
+    val mixed = CreateNamedStruct(Seq(
+      Literal("a"), Literal(1),
+      Literal.create(null, StringType), Literal(2)))
+    val mixedDt = mixed.dataType
+    assert(mixedDt.length == 2)
+    assert(mixedDt.head.name == "a")
+    assert(mixedDt(1).name == null)
+    assert(mixed.checkInputDataTypes().asInstanceOf[DataTypeMismatch].errorSubClass ==
+      "UNEXPECTED_NULL")
+  }
+
   test("test dsl for complex type") {
     def quickResolve(u: UnresolvedExtractValue): Expression = {
       ExtractValue(u.child, u.extraction, _ == _)

Original file line number	Diff line number	Diff line change
`@@ -470,7 +470,11 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression with`
`470`	`470`	`case gsf: GetStructField => gsf.metadata`
`471`	`471`	`case _ => Metadata.empty`
`472`	`472`	`}`
`473`		`- StructField(name.toString, expr.dataType, expr.nullable, metadata)`
	`473`	`+ // A null field name is invalid input (checkInputDataTypes flags it as UNEXPECTED_NULL),`
	`474`	`+ // but dataType is evaluated eagerly by the encoder before type checking; keep it null-safe`
	`475`	`+ // and preserve the null name rather than throwing a NullPointerException (SPARK-57736).`
	`476`	`+ StructField(if (name == null) null else name.toString, expr.dataType, expr.nullable,`
	`477`	`+ metadata)`
`474`	`478`	`}`
`475`	`479`	`StructType(fields)`
`476`	`480`	`}`