[SPARK-48091][SQL] Preserve aliases inside lambda when ExtractGenerator restructures plan

shrirangmhalgi · cloud-fan · commit c259da2ed506 · 2026-05-22T18:38:13.000+08:00
### What changes were proposed in this pull request? Fix `ExtractGenerator` to preserve aliases inside lambda functions when restructuring the plan. Previously, `ExtractGenerator` called `trimNonTopLevelAliases` on all expressions in the project list before extracting the generator. This stripped aliases inside lambda functions (e.g., `struct(x.as("data"))`) before `CreateStruct` could resolve them into struct field names. The fix uses `trimNonTopLevelAliases` only for pattern matching (to detect generators via `AliasedGenerator`), but preserves the original untrimmed expression for non-generator project items. ### Why are the changes needed? When using `explode` together with `transform` in the same `select` statement, aliases used inside the transformed column's `struct()` are ignored. Field names become auto-generated (x_1, x_2) instead of the user-specified alias. This only happens with the DataFrame/Dataset API, not with SQL. ### Does this PR introduce _any_ user-facing change? Yes. Struct field aliases inside `transform` lambdas are now correctly preserved when `explode` (or any generator) is in the same `select`. ### How was this patch tested? Added a test in `GeneratorFunctionSuite` verifying that struct field aliases are preserved when `explode` and `transform` are used together, including single and multiple aliases. ### Was this patch authored or co-authored using generative AI tooling? Yes. Closes #55892 from shrirangmhalgi/SPARK-48091-explode-transform-alias. Authored-by: Shrirang Mhalgi <shrirangmhalgi@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com> (cherry picked from commit ccdb31a) Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import scala.annotation.tailrec
 
-import org.apache.spark.sql.catalyst.analysis.MultiAlias
+import org.apache.spark.sql.catalyst.analysis.{MultiAlias, UnresolvedFunction}
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Project}
 import org.apache.spark.sql.catalyst.trees.CurrentOrigin
@@ -112,6 +112,10 @@ trait AliasHelper {
   }
 
   protected def trimAliases(e: Expression): Expression = e match {
+    // SPARK-48091: Do not descend into unresolved function calls. Aliases inside them
+    // (e.g., UnresolvedFunction("struct", Seq(Alias(x, "data")))) carry semantic information
+    // that ResolveFunctions -> CreateStruct.apply consumes to produce field names.
+    case u: UnresolvedFunction => u
     // The children of `CreateNamedStruct` may use `Alias` to carry metadata and we should not
     // trim them.
     case c: CreateNamedStruct => c.mapChildren {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.trees.LeafLike
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
-import org.apache.spark.sql.types.{IntegerType, StructType}
+import org.apache.spark.sql.types.{ArrayType, IntegerType, StructType}
 
 class GeneratorFunctionSuite extends SharedSparkSession {
   import testImplicits._
@@ -765,6 +765,37 @@ class GeneratorFunctionSuite extends SharedSparkSession {
       Seq(Row(0, 10, 0, 10), Row(1, 20, 1, 20))
     )
   }
+
+  test("SPARK-48091: explode with transform should preserve struct field aliases") {
+    val df = spark.createDataFrame(Seq((1, Array(1, 2, 3), Array(4, 5, 6))))
+      .toDF("id", "my_array", "my_array2")
+
+    // Baseline: no generator in the select; the alias inside struct() must name the field.
+    val good = df.select(
+      transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct")
+    )
+    assert(good.schema("my_struct").dataType.asInstanceOf[ArrayType]
+      .elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data"))
+
+    // Same transform, now next to explode() in one select: the field must still be "data".
+    val result = df.select(
+      explode(col("my_array")).as("exploded"),
+      transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct")
+    )
+    assert(result.schema("my_struct").dataType.asInstanceOf[ArrayType]
+      .elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data"))
+
+    // Two aliased fields: one on the lambda variable, one on the outer column "id".
+    val result2 = df.select(
+      explode(col("my_array")).as("exploded"),
+      transform(col("my_array2"),
+        x => struct(x.as("value"), col("id").as("key"))
+      ).as("my_struct")
+    )
+    val fields2 = result2.schema("my_struct").dataType.asInstanceOf[ArrayType]
+      .elementType.asInstanceOf[StructType].fieldNames.toSeq
+    assert(fields2 === Seq("value", "key"))
+  }
 }
 
 case class EmptyGenerator() extends Generator with LeafLike[Expression] {