Skip to content

Commit c259da2

Browse files
shrirangmhalgicloud-fan
authored andcommitted
[SPARK-48091][SQL] Preserve aliases inside lambda when ExtractGenerator restructures plan
### What changes were proposed in this pull request? Fix `ExtractGenerator` to preserve aliases inside lambda functions when restructuring the plan. Previously, `ExtractGenerator` called `trimNonTopLevelAliases` on all expressions in the project list before extracting the generator. This stripped aliases inside lambda functions (e.g., struct(x.as("data"))) before `CreateStruct` could resolve them into struct field names. The fix uses `trimNonTopLevelAliases` only for pattern matching (to detect generators via `AliasedGenerator`), but preserves the original untrimmed expression for non-generator project items. ### Why are the changes needed? When using explode together with transform in the same `select statement`, aliases used inside the transformed column's `struct()` are ignored. Field names become auto-generated (x_1, x_2) instead of the user-specified alias. This only happens with the DataFrame/Dataset API, not with SQL. ### Does this PR introduce _any_ user-facing change? Yes. Struct field aliases inside transform lambdas are now correctly preserved when explode (or any generator) is in the same `select`. ### How was this patch tested? Added a test in `GeneratorFunctionSuite` verifying that struct field aliases are preserved when explode and transform are used together, including single and multiple aliases. ### Was this patch authored or co-authored using generative AI tooling? Yes. Closes #55892 from shrirangmhalgi/SPARK-48091-explode-transform-alias. Authored-by: Shrirang Mhalgi <shrirangmhalgi@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com> (cherry picked from commit ccdb31a) Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 5dacfe3 commit c259da2

2 files changed

Lines changed: 37 additions & 2 deletions

File tree

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
1919

2020
import scala.annotation.tailrec
2121

22-
import org.apache.spark.sql.catalyst.analysis.MultiAlias
22+
import org.apache.spark.sql.catalyst.analysis.{MultiAlias, UnresolvedFunction}
2323
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
2424
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Project}
2525
import org.apache.spark.sql.catalyst.trees.CurrentOrigin
@@ -112,6 +112,10 @@ trait AliasHelper {
112112
}
113113

114114
protected def trimAliases(e: Expression): Expression = e match {
115+
// SPARK-48091: Do not descend into unresolved function calls. Aliases inside them
116+
// (e.g., UnresolvedFunction("struct", Seq(Alias(x, "data")))) carry semantic information
117+
// that ResolveFunctions -> CreateStruct.apply consumes to produce field names.
118+
case u: UnresolvedFunction => u
115119
// The children of `CreateNamedStruct` may use `Alias` to carry metadata and we should not
116120
// trim them.
117121
case c: CreateNamedStruct => c.mapChildren {

sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.trees.LeafLike
2525
import org.apache.spark.sql.functions._
2626
import org.apache.spark.sql.internal.SQLConf
2727
import org.apache.spark.sql.test.SharedSparkSession
28-
import org.apache.spark.sql.types.{IntegerType, StructType}
28+
import org.apache.spark.sql.types.{ArrayType, IntegerType, StructType}
2929

3030
class GeneratorFunctionSuite extends SharedSparkSession {
3131
import testImplicits._
@@ -765,6 +765,37 @@ class GeneratorFunctionSuite extends SharedSparkSession {
765765
Seq(Row(0, 10, 0, 10), Row(1, 20, 1, 20))
766766
)
767767
}
768+
769+
test("SPARK-48091: explode with transform should preserve struct field aliases") {
770+
val df = spark.createDataFrame(Seq((1, Array(1, 2, 3), Array(4, 5, 6))))
771+
.toDF("id", "my_array", "my_array2")
772+
773+
// Without explode - aliases should work (baseline)
774+
val good = df.select(
775+
transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct")
776+
)
777+
assert(good.schema("my_struct").dataType.asInstanceOf[ArrayType]
778+
.elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data"))
779+
780+
// With explode in same select - aliases should still be preserved
781+
val result = df.select(
782+
explode(col("my_array")).as("exploded"),
783+
transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct")
784+
)
785+
assert(result.schema("my_struct").dataType.asInstanceOf[ArrayType]
786+
.elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data"))
787+
788+
// Multiple aliases inside struct
789+
val result2 = df.select(
790+
explode(col("my_array")).as("exploded"),
791+
transform(col("my_array2"),
792+
x => struct(x.as("value"), col("id").as("key"))
793+
).as("my_struct")
794+
)
795+
val fields2 = result2.schema("my_struct").dataType.asInstanceOf[ArrayType]
796+
.elementType.asInstanceOf[StructType].fieldNames.toSeq
797+
assert(fields2 === Seq("value", "key"))
798+
}
768799
}
769800

770801
case class EmptyGenerator() extends Generator with LeafLike[Expression] {

0 commit comments

Comments
 (0)