[SPARK-57858][SQL] Emit BIN BY scaled DISTRIBUTE columns as produced attributes

vranes · vranes · commit e417f4b5e9f6 · 2026-07-01T17:23:11.000Z
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/BinByResolution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/BinByResolution.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.analysis
 import scala.collection.mutable
 import scala.util.control.NonFatal
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, EmptyRow, Expression, ExprId}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, EmptyRow, Expression, ExprId}
+import org.apache.spark.sql.catalyst.plans.logical.BinByOutputAliases
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.internal.SQLConf
@@ -154,4 +155,23 @@ object BinByResolution {
       timeZoneId = if (isLTZ) Some(sessionZone) else None
     )
   }
+
+  /**
+   * Builds the three appended output attributes (`bin_start`, `bin_end`, `bin_distribute_ratio`),
+   * applying any user renames from `aliases`. `rangeType` is the RANGE column type carried by
+   * `bin_start` / `bin_end`.
+   */
+  def appendedAttributesWithAliases(
+      rangeType: DataType,
+      aliases: BinByOutputAliases): Seq[Attribute] = Seq(
+    AttributeReference(aliases.effectiveBinStart, rangeType, nullable = true)(),
+    AttributeReference(aliases.effectiveBinEnd, rangeType, nullable = true)(),
+    AttributeReference(aliases.effectiveBinRatio, DoubleType, nullable = true)())
+
+  /**
+   * Mints a produced output attribute for each DISTRIBUTE input column: same name, type, and
+   * nullability, but a fresh `ExprId` so the rescaled value is a distinct attribute from the input.
+   */
+  def scaledDistributeAttributes(distributeColumns: Seq[Attribute]): Seq[Attribute] =
+    distributeColumns.map(a => AttributeReference(a.name, a.dataType, a.nullable)())
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala
@@ -202,8 +202,9 @@ object DeduplicateRelations extends Rule[LogicalPlan] {
         existingRelations,
         b,
         _.producedAttributes.map(_.exprId.id).toSeq,
-        newBinBy => newBinBy.copy(appendedAttributes =
-          newBinBy.appendedAttributes.map(_.newInstance())))
+        newBinBy => newBinBy.copy(
+          scaledDistributeColumns = newBinBy.scaledDistributeColumns.map(_.newInstance()),
+          appendedAttributes = newBinBy.appendedAttributes.map(_.newInstance())))
 
     case e: Expand =>
       deduplicateAndRenew[Expand](
@@ -470,6 +471,7 @@ object DeduplicateRelations extends Rule[LogicalPlan] {
       case oldVersion: BinBy
           if oldVersion.producedAttributes.intersect(conflictingAttributes).nonEmpty =>
         val newVersion = oldVersion.copy(
+          scaledDistributeColumns = oldVersion.scaledDistributeColumns.map(_.newInstance()),
           appendedAttributes = oldVersion.appendedAttributes.map(_.newInstance()))
         newVersion.copyTagsFrom(oldVersion)
         Seq((oldVersion, newVersion))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveBinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveBinBy.scala
@@ -65,14 +65,16 @@ object ResolveBinBy extends Rule[LogicalPlan] {
       originExpr = b.originExpr)
 
     val appendedAttributes =
-      BinBy.appendedAttributesWithAliases(parameters.rangeType, b.outputAliases)
+      BinByResolution.appendedAttributesWithAliases(parameters.rangeType, b.outputAliases)
+    val scaledDistributeColumns = BinByResolution.scaledDistributeAttributes(distributeAttributes)
 
     BinBy(
       binWidthMicros = parameters.binWidthMicros,
       rangeStart = rangeStart,
       rangeEnd = rangeEnd,
       originMicros = parameters.originMicros,
       distributeColumns = distributeAttributes,
+      scaledDistributeColumns = scaledDistributeColumns,
       appendedAttributes = appendedAttributes,
       child = child,
       timeZoneId = parameters.timeZoneId)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -1798,8 +1798,14 @@ case class UnresolvedBinBy(
  * @param rangeEnd            Resolved attribute holding each row's window-end timestamp.
  * @param originMicros        Alignment anchor in microseconds since the epoch: the folded value of
  *                            `ALIGN TO`, or the type-specific default when the clause is omitted.
- * @param distributeColumns   Resolved columns to proportionally redistribute.
- * @param appendedAttributes  The three output attributes appended after `child.output`.
+ * @param distributeColumns   Resolved input columns to proportionally redistribute. Read by the
+ *                            operator to compute the rescaled values; not part of `output`.
+ * @param scaledDistributeColumns
+ *                            Produced output attributes (fresh `ExprId`s, same names and types as
+ *                            `distributeColumns`) holding the rescaled values. They take the place
+ *                            of `distributeColumns` in `output` so the rescaled value carries a
+ *                            distinct identity from the input and cannot be mistaken for it.
+ * @param appendedAttributes  The three output attributes appended after the child columns.
  * @param child               Input relation.
  * @param timeZoneId          Captured session local time zone for LTZ inputs; `None` for NTZ.
  *                            Required when `rangeStart.dataType` is `TimestampType`; must be
@@ -1811,6 +1817,7 @@ case class BinBy(
     rangeEnd: Attribute,
     originMicros: Long,
     distributeColumns: Seq[Attribute],
+    scaledDistributeColumns: Seq[Attribute],
     appendedAttributes: Seq[Attribute],
     child: LogicalPlan,
     timeZoneId: Option[String])
@@ -1822,25 +1829,28 @@ case class BinBy(
         s"${rangeStart.dataType}, timeZoneId=$timeZoneId")
   }
 
-  override def output: Seq[Attribute] = child.output ++ appendedAttributes
+  assert(distributeColumns.length == scaledDistributeColumns.length,
+    "BinBy requires one scaled attribute per DISTRIBUTE column, got " +
+      s"${distributeColumns.length} distribute columns and " +
+      s"${scaledDistributeColumns.length} scaled attributes")
 
-  override def producedAttributes: AttributeSet = AttributeSet(appendedAttributes)
+  // Each DISTRIBUTE input column is swapped in `output` for its scaled produced counterpart
+  // (fresh identity); the input stays on the node for the operator to read but is not forwarded.
+  private lazy val distributeReplacements: AttributeMap[Attribute] =
+    AttributeMap(distributeColumns.zip(scaledDistributeColumns))
+
+  override def output: Seq[Attribute] =
+    child.output.map(a => distributeReplacements.getOrElse(a, a)) ++ appendedAttributes
+
+  override def producedAttributes: AttributeSet =
+    AttributeSet(scaledDistributeColumns ++ appendedAttributes)
 
   final override val nodePatterns: Seq[TreePattern] = Seq(BIN_BY)
 
   override protected def withNewChildInternal(newChild: LogicalPlan): BinBy =
     copy(child = newChild)
 }
 
-object BinBy {
-  def appendedAttributesWithAliases(
-      rangeType: DataType,
-      aliases: BinByOutputAliases): Seq[Attribute] = Seq(
-    AttributeReference(aliases.effectiveBinStart, rangeType, nullable = true)(),
-    AttributeReference(aliases.effectiveBinEnd, rangeType, nullable = true)(),
-    AttributeReference(aliases.effectiveBinRatio, DoubleType, nullable = true)())
-}
-
 /**
  * A logical plan node for creating a logical limit, which is split into two separate logical nodes:
  * a [[LocalLimit]], which is a partition local limit, followed by a [[GlobalLimit]].
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveBinBySuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveBinBySuite.scala
@@ -202,6 +202,73 @@ class ResolveBinBySuite extends AnalysisTest {
     assert(bi.distributeColumns.map(_.exprId) == Seq(value.exprId))
   }
 
+  test("rescaled DISTRIBUTE columns are produced attributes that shadow the input") {
+    val bi = ResolveBinBy.apply(unresolved()).asInstanceOf[BinBy]
+
+    // The input column is still read (held in distributeColumns) but is not forwarded by identity.
+    assert(bi.distributeColumns.map(_.exprId) == Seq(value.exprId))
+    assert(!bi.output.exists(_.exprId == value.exprId))
+
+    // The output `value` keeps its name, type, and position, but has a fresh exprId, and it is a
+    // produced attribute. This is what prevents the rescaled value from being confused with the
+    // input by any rule that reasons on exprId (predicate pushdown, constraints, CSE).
+    val outValue = bi.output(ltzChild.output.indexWhere(_.exprId == value.exprId))
+    assert(outValue.name == "value")
+    assert(outValue.dataType == DoubleType)
+    assert(outValue.exprId != value.exprId)
+    assert(bi.scaledDistributeColumns.map(_.exprId) == Seq(outValue.exprId))
+    assert(bi.producedAttributes.contains(outValue))
+
+    // Forwarded (non-distribute) columns keep their identity.
+    assert(bi.output.exists(_.exprId == label.exprId))
+    assert(bi.output.exists(_.exprId == tsStart.exprId))
+  }
+
+  test("each of multiple DISTRIBUTE columns is replaced in place with a distinct fresh id") {
+    val value2 = $"value2".double
+    val child = LocalRelation(tsStart, tsEnd, value, value2, label)
+    val bi = ResolveBinBy.apply(
+      unresolved(child = child, distribute = Seq(value, value2))).asInstanceOf[BinBy]
+
+    assert(bi.distributeColumns.map(_.exprId) == Seq(value.exprId, value2.exprId))
+    assert(bi.scaledDistributeColumns.length == 2)
+
+    // Each input column is replaced at its own position by a fresh-id, same-name attribute.
+    Seq(value, value2).foreach { in =>
+      val pos = child.output.indexWhere(_.exprId == in.exprId)
+      val out = bi.output(pos)
+      assert(out.name == in.name)
+      assert(out.exprId != in.exprId)
+      assert(!bi.output.exists(_.exprId == in.exprId))
+    }
+
+    // The two scaled columns are distinct; non-distribute columns keep their identity.
+    assert(bi.scaledDistributeColumns.map(_.exprId).distinct.length == 2)
+    assert(bi.output.exists(_.exprId == label.exprId))
+    assert(bi.output.exists(_.exprId == tsStart.exprId))
+  }
+
+  test("rescaled DISTRIBUTE column drops the input qualifier and metadata (computed value)") {
+    // The rescaled column is a computed value, not a rename, so it must not inherit the input's
+    // qualifier or metadata (else stale value-derived metadata such as ML min/max could ride along).
+    val md = new MetadataBuilder().putString("comment", "a measure").build()
+    val qualifiedValue = AttributeReference("value", DoubleType, nullable = true, md)()
+    val child = SubqueryAlias("m", LocalRelation(tsStart, tsEnd, qualifiedValue))
+    val bi = ResolveBinBy.apply(
+      unresolved(child = child, distribute = Seq(UnresolvedAttribute(Seq("m", "value")))))
+      .asInstanceOf[BinBy]
+
+    // The resolved input carries the qualifier and metadata...
+    assert(bi.distributeColumns.head.qualifier == Seq("m"))
+    assert(bi.distributeColumns.head.metadata == md)
+
+    // ...but the produced output column drops both and has a fresh id.
+    val outValue = bi.output.find(_.name == "value").get
+    assert(outValue.exprId != qualifiedValue.exprId)
+    assert(outValue.qualifier.isEmpty)
+    assert(outValue.metadata == Metadata.empty)
+  }
+
   test("multipart identifiers disambiguate same-name columns across a JOIN") {
     val t1Start = AttributeReference("ts_start", TimestampType, nullable = true)()
     val t1End = AttributeReference("ts_end", TimestampType, nullable = true)()
@@ -330,9 +397,12 @@ class ResolveBinBySuite extends AnalysisTest {
 
     val binBys = analyzed.collect { case b: BinBy => b }
     assert(binBys.size == 2, s"expected two BinBy nodes, got ${binBys.size}")
-    val appendedExprIds = binBys.flatMap(_.appendedAttributes.map(_.exprId))
-    assert(appendedExprIds.distinct.size == appendedExprIds.size,
-      "appended BinBy attributes must have distinct exprIds across the two join sides")
+    // All produced attributes (the scaled DISTRIBUTE columns plus the three appended ones) must be
+    // renewed on one join side, so both DeduplicateRelations renewal paths have to cover them.
+    val producedExprIds = binBys.flatMap(b =>
+      (b.scaledDistributeColumns ++ b.appendedAttributes).map(_.exprId))
+    assert(producedExprIds.distinct.size == producedExprIds.size,
+      "produced BinBy attributes must have distinct exprIds across the two join sides")
   }
 
   // `super.test` escapes the suite-wide flag-on wrapper; pin the flag off explicitly.