
Commit f4b320e

aokolnychyi and gengliangwang authored and committed
[SPARK-56669][SQL] Implement group filtering for WriteDelta row level operations
### What changes were proposed in this pull request?

This PR implements group filtering for WriteDelta row-level operations. It re-applies #55612 (commit `5ef2e1ba174`, reverted in `8e8fee2692f`) and resolves the test failures reported in #55612 (comment) by updating the scan-count assertions in the transactional check tests in `MergeIntoTableSuiteBase` and `UpdateTableSuiteBase`.

With group filtering, `matchingRowsPlan` re-scans the target, and for MERGE `RewritePredicateSubquery` also re-scans the source. For MERGE, the delta scan counts now match the non-delta values, so the `deltaMerge` conditionals collapse. For UPDATE, the delta counts double but remain under the non-delta values because `ReplaceData` still adds further scans.

### Why are the changes needed?

These changes close the runtime group filtering gap between group-based (`ReplaceData`) and delta-based (`WriteDelta`) plans.

### Does this PR introduce _any_ user-facing change?

Changes are backward compatible.

### How was this patch tested?

This PR comes with tests. Locally verified all 9 affected suites are green (517 tests):

```
build/sbt 'sql/testOnly \
  org.apache.spark.sql.connector.DeltaBasedMergeIntoTableSuite \
  org.apache.spark.sql.connector.DeltaBasedMergeIntoTableWithDeletionVectorsSuite \
  org.apache.spark.sql.connector.DeltaBasedMergeIntoTableUpdateAsDeleteAndInsertSuite \
  org.apache.spark.sql.connector.DeltaBasedUpdateTableSuite \
  org.apache.spark.sql.connector.DeltaBasedUpdateTableWithDeletionVectorsSuite \
  org.apache.spark.sql.connector.DeltaBasedUpdateAsDeleteAndInsertTableSuite \
  org.apache.spark.sql.connector.DeltaBasedNoMetadataDeleteFromTableSuite \
  org.apache.spark.sql.connector.GroupBasedMergeIntoTableSuite \
  org.apache.spark.sql.connector.GroupBasedUpdateTableSuite'
```

### Was this patch authored or co-authored using generative AI tooling?

Claude Code v2.1.123.

Closes #55635 from gengliangwang/spark-56669-redo.

Lead-authored-by: Anton Okolnychyi <aokolnychyi@apache.org>
Co-authored-by: Anton Okolnychyi <aokolnychyi@apache.org>
Signed-off-by: Gengliang Wang <gengliang@apache.org>
1 parent 294f6c1 commit f4b320e

16 files changed

Lines changed: 357 additions & 97 deletions
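Before the per-file diffs, a short hedged illustration of the end-to-end behavior this commit enables (hypothetical `target`/`source` tables; assumes a running `SparkSession` named `spark` and a v2 source whose row-level operation scan implements `SupportsRuntimeV2Filtering`):

```scala
// With group filtering on (the default), this delta-based MERGE first runs a
// lightweight subquery that projects only the join/condition columns, finds
// which groups (e.g. files) contain matching rows, and feeds that back to the
// main row-level operation scan so unaffected groups are never read.
spark.sql("""
  MERGE INTO target t
  USING source s
  ON t.id = s.id
  WHEN MATCHED THEN UPDATE SET t.value = s.value
  WHEN NOT MATCHED THEN INSERT *
""")
```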


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteMergeIntoTable.scala

Lines changed: 6 additions & 1 deletion
```diff
@@ -295,7 +295,12 @@ object RewriteMergeIntoTable extends RewriteRowLevelCommand with PredicateHelper
     // build a plan to write the row delta to the table
     val writeRelation = relation.copy(table = operationTable)
     val projections = buildWriteDeltaProjections(mergeRowsPlan, rowAttrs, rowIdAttrs, metadataAttrs)
-    WriteDelta(writeRelation, cond, mergeRowsPlan, relation, projections)
+    val groupFilterCond = if (notMatchedBySourceActions.isEmpty && groupFilterEnabled) {
+      Some(toGroupFilterCondition(relation, source, cond))
+    } else {
+      None
+    }
+    WriteDelta(writeRelation, cond, mergeRowsPlan, relation, projections, groupFilterCond)
   }
 
   private def chooseWriteDeltaJoinType(
```
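The `notMatchedBySourceActions.isEmpty` guard above matters; a hedged illustration of the case it excludes (hypothetical `target`/`source` tables, assuming a running `SparkSession` named `spark`):

```scala
// No group filter is attached to this MERGE: the NOT MATCHED BY SOURCE
// clause modifies target rows that do NOT satisfy the join condition, so a
// filter derived from the source could wrongly discard affected groups.
spark.sql("""
  MERGE INTO target t
  USING source s
  ON t.id = s.id
  WHEN MATCHED THEN UPDATE SET t.value = s.value
  WHEN NOT MATCHED BY SOURCE THEN DELETE
""")
```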

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteUpdateTable.scala

Lines changed: 2 additions & 1 deletion
```diff
@@ -174,7 +174,8 @@ object RewriteUpdateTable extends RewriteRowLevelCommand {
     // build a plan to write the row delta to the table
     val writeRelation = relation.copy(table = operationTable)
     val projections = buildWriteDeltaProjections(rowDeltaPlan, rowAttrs, rowIdAttrs, metadataAttrs)
-    WriteDelta(writeRelation, cond, rowDeltaPlan, relation, projections)
+    val groupFilterCond = if (groupFilterEnabled) Some(cond) else None
+    WriteDelta(writeRelation, cond, rowDeltaPlan, relation, projections, groupFilterCond)
   }
 
   // this method assumes the assignments have been already aligned before
```
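For UPDATE the command's own condition is reused verbatim as the group filter; a small hedged example (hypothetical table, assuming a running `SparkSession` named `spark`):

```scala
// groupFilterCond is the WHERE predicate itself, so groups (e.g. files)
// containing no rows with dep = 'hr' can be skipped by the source at runtime.
spark.sql("UPDATE target SET salary = salary * 2 WHERE dep = 'hr'")
```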

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala

Lines changed: 4 additions & 1 deletion
```diff
@@ -60,7 +60,10 @@ object ReplaceNullWithFalseInPredicate extends Rule[LogicalPlan] {
       val newCond = replaceNullWithFalse(cond)
       val newGroupFilterCond = groupFilterCond.map(replaceNullWithFalse)
       rd.copy(condition = newCond, groupFilterCondition = newGroupFilterCond)
-    case wd @ WriteDelta(_, cond, _, _, _, _) => wd.copy(condition = replaceNullWithFalse(cond))
+    case wd @ WriteDelta(_, cond, _, _, _, groupFilterCond, _) =>
+      val newCond = replaceNullWithFalse(cond)
+      val newGroupFilterCond = groupFilterCond.map(replaceNullWithFalse)
+      wd.copy(condition = newCond, groupFilterCondition = newGroupFilterCond)
     case d @ DeleteFromTable(_, cond) => d.copy(condition = replaceNullWithFalse(cond))
     case u @ UpdateTable(_, _, Some(cond)) => u.copy(condition = Some(replaceNullWithFalse(cond)))
     case m: MergeIntoTable =>
```
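The null-to-false rewrite extends to the group filter condition because it, too, is evaluated as a filter predicate, where SQL's three-valued logic treats UNKNOWN like FALSE. A minimal self-contained sketch of that argument (plain Scala, illustrative only, not Spark code):

```scala
object NullAsFalseDemo extends App {
  // A SQL filter keeps a row only when the predicate is TRUE, so an UNKNOWN
  // (null) predicate discards the row exactly like FALSE does. Modeling the
  // predicate result as Option[Boolean]:
  def keepsRow(pred: Option[Boolean]): Boolean = pred.contains(true)

  assert(keepsRow(Some(true)))
  assert(!keepsRow(Some(false)))
  assert(!keepsRow(None)) // UNKNOWN behaves like FALSE in a filter context
}
```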

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala

Lines changed: 28 additions & 2 deletions
```diff
@@ -432,7 +432,7 @@ object ExtractSingleColumnNullAwareAntiJoin extends JoinSelectionHelper with Pre
  * - the read relation that can be either [[DataSourceV2Relation]] or [[DataSourceV2ScanRelation]]
  *   depending on whether the planning has already happened;
  */
-object GroupBasedRowLevelOperation {
+object GroupBasedRowLevelOperation extends RowLevelOperationExtractor {
   type ReturnType = (ReplaceData, Expression, Option[Expression], LogicalPlan)
 
   def unapply(plan: LogicalPlan): Option[ReturnType] = plan match {
@@ -445,8 +445,34 @@ object GroupBasedRowLevelOperation {
     case _ =>
       None
   }
+}
+
+/**
+ * An extractor for row-level commands such as DELETE, UPDATE, MERGE that were rewritten using
+ * plans that operate on individual rows (row deltas).
+ *
+ * This class extracts the following entities:
+ * - the delta-based rewrite plan;
+ * - the condition that defines matching rows;
+ * - the group filter condition;
+ * - the read relation that can be either [[DataSourceV2Relation]] or [[DataSourceV2ScanRelation]]
+ *   depending on whether the planning has already happened;
+ */
+object DeltaBasedRowLevelOperation extends RowLevelOperationExtractor {
+  type ReturnType = (WriteDelta, Expression, Option[Expression], LogicalPlan)
+
+  def unapply(plan: LogicalPlan): Option[ReturnType] = plan match {
+    case wd @ WriteDelta(ExtractV2Table(table), cond, query, _, _, groupFilterCond, _) =>
+      val readRelation = findReadRelation(table, query, allowMultipleReads = false)
+      readRelation.map((wd, cond, groupFilterCond, _))
+
+    case _ =>
+      None
+  }
+}
 
-  private def findReadRelation(
+trait RowLevelOperationExtractor {
+  protected def findReadRelation(
       table: Table,
       plan: LogicalPlan,
       allowMultipleReads: Boolean): Option[LogicalPlan] = {
```
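For context, a hedged sketch of how such an extractor is consumed (`IllustrativeRule` is a hypothetical name; the real consumer wired up in this commit is `RowLevelOperationRuntimeGroupFiltering`):

```scala
import org.apache.spark.sql.catalyst.planning.DeltaBasedRowLevelOperation
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

object IllustrativeRule extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    // ReturnType = (WriteDelta, condition, group filter condition, read relation)
    case DeltaBasedRowLevelOperation(writeDelta, cond, Some(groupFilterCond), readRelation) =>
      // only delta-based rewrites that carry a group filter condition match here
      writeDelta
  }
}
```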

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala

Lines changed: 2 additions & 0 deletions
```diff
@@ -425,6 +425,7 @@ case class ReplaceData(
  * @param query a query with a delta of records that should be written
  * @param originalTable a plan for the original table for which the row-level command was triggered
  * @param projections projections for row ID, row, metadata attributes
+ * @param groupFilterCondition a condition that can be used to filter groups at runtime
  * @param write a logical write, if already constructed
  */
 case class WriteDelta(
@@ -433,6 +434,7 @@ case class WriteDelta(
     query: LogicalPlan,
     originalTable: NamedRelation,
     projections: WriteDeltaProjections,
+    groupFilterCondition: Option[Expression] = None,
     write: Option[DeltaWrite] = None) extends RowLevelWrite {
 
   override val isByName: Boolean = false
```
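Because the new field is inserted before `write` with a default value, constructor call sites keep compiling while every positional pattern match needs an extra `_`, which is why so many files in this commit change by one character. A minimal self-contained sketch of that mechanic (plain Scala, not Spark code):

```scala
object DefaultParamDemo extends App {
  // Stand-in for WriteDelta: a new defaulted field before the last one.
  case class Delta(cond: String, groupFilterCond: Option[String] = None, write: Option[String] = None)

  val d = Delta("id = 1")            // existing positional constructor calls still compile
  val c = d match {
    case Delta(cond, _, _) => cond   // but every positional pattern gains an extra `_`
  }
  assert(c == "id = 1" && d.groupFilterCond.isEmpty)
}
```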

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 9 additions & 9 deletions
```diff
@@ -746,15 +746,15 @@ object SQLConf {
 
   val RUNTIME_ROW_LEVEL_OPERATION_GROUP_FILTER_ENABLED =
     buildConf("spark.sql.optimizer.runtime.rowLevelOperationGroupFilter.enabled")
-      .doc("Enables runtime group filtering for group-based row-level operations. " +
-        "Data sources that replace groups of data (e.g. files, partitions) may prune entire " +
-        "groups using provided data source filters when planning a row-level operation scan. " +
-        "However, such filtering is limited as not all expressions can be converted into data " +
-        "source filters and some expressions can only be evaluated by Spark (e.g. subqueries). " +
-        "Since rewriting groups is expensive, Spark can execute a query at runtime to find what " +
-        "records match the condition of the row-level operation. The information about matching " +
-        "records will be passed back to the row-level operation scan, allowing data sources to " +
-        "discard groups that don't have to be rewritten.")
+      .doc("Enables runtime filtering for group-based and delta-based row-level operations. " +
+        "Data sources may prune entire file groups at runtime when planning a row-level " +
+        "operation scan. Planning-time filter pushdown is limited as not all expressions can " +
+        "be converted into data source filters and some expressions can only be evaluated by " +
+        "Spark (e.g. subqueries). Since rewriting groups or scanning unnecessary files is " +
+        "expensive, Spark can execute a lightweight query at runtime to find what records match " +
+        "the condition of the row-level operation. The information about matching records will " +
+        "be passed back to the row-level operation scan, allowing data sources to skip files " +
+        "that don't have to be processed.")
       .version("3.4.0")
       .booleanConf
       .createWithDefault(true)
```
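As a usage note, a minimal sketch of toggling the flag per session (assumes a running `SparkSession` named `spark`; the key is the one documented above):

```scala
// Runtime group filtering is on by default; disable it to compare plans.
spark.conf.set("spark.sql.optimizer.runtime.rowLevelOperationGroupFilter.enabled", "false")
// ... run the row-level command and compare plans / scan counts ...
spark.conf.set("spark.sql.optimizer.runtime.rowLevelOperationGroupFilter.enabled", "true")
```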

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -525,7 +525,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat
         r.name) :: Nil
 
     case wd @ WriteDelta(_: DataSourceV2Relation, _, query, r: DataSourceV2Relation, projections,
-        Some(write)) =>
+        _, Some(write)) =>
       WriteDeltaExec(
         planLater(query),
         refreshCache(r), // use the original relation to refresh the cache
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/OptimizeMetadataOnlyDeleteFromTable.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -114,7 +114,7 @@ object OptimizeMetadataOnlyDeleteFromTable extends Rule[LogicalPlan] with Predic
       val command = rd.operation.command
       Some(rd, command, cond, originalTable)
 
-    case wd @ WriteDelta(_, cond, _, originalTable, _, _) =>
+    case wd @ WriteDelta(_, cond, _, originalTable, _, _, _) =>
       val command = wd.operation.command
       Some(wd, command, cond, originalTable)
 
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2Writes.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -113,7 +113,7 @@ object V2Writes extends Rule[LogicalPlan] with PredicateHelper {
       val newQuery = DistributionAndOrderingUtils.prepareQuery(write, query, r.funCatalog)
       rd.copy(write = Some(write), query = newQuery)
 
-    case wd @ WriteDelta(r: DataSourceV2Relation, _, query, _, projections, None) =>
+    case wd @ WriteDelta(r: DataSourceV2Relation, _, query, _, projections, _, None) =>
       val writeOptions = mergeOptions(Map.empty, r.options.asCaseSensitiveMap.asScala.toMap)
       val deltaWriteBuilder = newDeltaWriteBuilder(r.table, writeOptions, projections)
       val deltaWrite = deltaWriteBuilder.build()
```

sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala

Lines changed: 52 additions & 41 deletions
```diff
@@ -21,11 +21,10 @@ import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, DynamicPruningExpression, Expression, InSubquery, ListQuery, PredicateHelper, V2ExpressionUtils}
 import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral
 import org.apache.spark.sql.catalyst.optimizer.RewritePredicateSubquery
-import org.apache.spark.sql.catalyst.planning.GroupBasedRowLevelOperation
-import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, LogicalPlan}
+import org.apache.spark.sql.catalyst.planning.{DeltaBasedRowLevelOperation, GroupBasedRowLevelOperation}
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, LogicalPlan, RowLevelWrite}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.connector.read.SupportsRuntimeV2Filtering
-import org.apache.spark.sql.connector.write.RowLevelOperation.Command
 import org.apache.spark.sql.connector.write.RowLevelOperation.Command.{DELETE, MERGE, UPDATE}
 import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Implicits, DataSourceV2Relation, DataSourceV2ScanRelation, ExtractV2Scan}
 import org.apache.spark.util.ArrayImplicits._
@@ -34,66 +33,78 @@ import org.apache.spark.util.ArrayImplicits._
  * A rule that assigns a subquery to filter groups in row-level operations at runtime.
  *
  * Data skipping during job planning for row-level operations is limited to expressions that can be
- * converted to data source filters. Since not all expressions can be pushed down that way and
- * rewriting groups is expensive, Spark allows data sources to filter group at runtime.
- * If the primary scan in a group-based row-level operation supports runtime filtering, this rule
- * will inject a subquery to find all rows that match the condition so that data sources know
- * exactly which groups must be rewritten.
+ * converted to data source filters. Since not all expressions can be pushed down that way, Spark
+ * allows data sources to filter groups at runtime. If the primary scan in a row-level operation
+ * supports runtime filtering, this rule will inject a subquery to find all rows that match the
+ * condition so that data sources know exactly which groups have changes.
  *
- * Note this rule only applies to group-based row-level operations.
+ * Note that this rule is also beneficial for operations that deal with deltas of rows. Even if
+ * the data source is capable of handling specific changes, it is useful to first discard entire
+ * groups that are not modified. The cost of the runtime query is small as it only projects columns
+ * required to evaluate the row level operation condition. The main scan, on the other hand, must
+ * project all columns, meaning the cost of reading unaffected groups can dominate the runtime.
  */
 class RowLevelOperationRuntimeGroupFiltering(optimizeSubqueries: Rule[LogicalPlan])
   extends Rule[LogicalPlan] with PredicateHelper {
 
   import DataSourceV2Implicits._
 
   override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
-    // apply special dynamic filtering only for group-based row-level operations
     case GroupBasedRowLevelOperation(replaceData, _, Some(cond),
-        ExtractV2Scan(scan: SupportsRuntimeV2Filtering))
-        if conf.runtimeRowLevelOperationGroupFilterEnabled && cond != TrueLiteral
-          && scan.filterAttributes().nonEmpty =>
-
-      // use reference equality on scan to find required scan relations
-      val newQuery = replaceData.query transformUp {
-        case r: DataSourceV2ScanRelation if r.scan eq scan =>
-          // use the original table instance that was loaded for this row-level operation
-          // in order to leverage a regular batch scan in the group filter query
-          val originalTable = r.relation.table.asRowLevelOperationTable.table
-          val relation = r.relation.copy(table = originalTable)
-          val tableAttrs = replaceData.table.output
-          val command = replaceData.operation.command
-          val matchingRowsPlan = buildMatchingRowsPlan(relation, cond, tableAttrs, command)
-
-          val filterAttrs = scan.filterAttributes.toImmutableArraySeq
-          val buildKeys = V2ExpressionUtils.resolveRefs[Attribute](filterAttrs, matchingRowsPlan)
-          val pruningKeys = V2ExpressionUtils.resolveRefs[Attribute](filterAttrs, r)
-          val dynamicPruningCond = buildDynamicPruningCond(matchingRowsPlan, buildKeys, pruningKeys)
-
-          Filter(dynamicPruningCond, r)
-      }
-
-      // optimize subqueries to rewrite them as joins and trigger job planning
-      replaceData.copy(query = optimizeSubqueries(newQuery))
+        ExtractV2Scan(scan: SupportsRuntimeV2Filtering)) if canInjectGroupFilters(cond, scan) =>
+      injectGroupFilters(replaceData, cond, scan)
+
+    case DeltaBasedRowLevelOperation(writeDelta, _, Some(cond),
+        ExtractV2Scan(scan: SupportsRuntimeV2Filtering)) if canInjectGroupFilters(cond, scan) =>
+      injectGroupFilters(writeDelta, cond, scan)
+  }
+
+  private def canInjectGroupFilters(
+      cond: Expression,
+      scan: SupportsRuntimeV2Filtering): Boolean = {
+    conf.runtimeRowLevelOperationGroupFilterEnabled &&
+      cond != TrueLiteral &&
+      scan.filterAttributes.nonEmpty
+  }
+
+  private def injectGroupFilters(
+      write: RowLevelWrite,
+      cond: Expression,
+      scan: SupportsRuntimeV2Filtering): LogicalPlan = {
+    // use reference equality on scan to find required scan relations
+    val newQuery = write.query transformUp {
+      case r: DataSourceV2ScanRelation if r.scan eq scan =>
+        // use the original table instance that was loaded for this row-level operation
+        // in order to leverage a regular batch scan in the group filter query
+        val originalTable = r.relation.table.asRowLevelOperationTable.table
+        val relation = r.relation.copy(table = originalTable)
+        val matchingRowsPlan = buildMatchingRowsPlan(write, relation, cond)
+        val filterAttrs = scan.filterAttributes.toImmutableArraySeq
+        val buildKeys = V2ExpressionUtils.resolveRefs[Attribute](filterAttrs, matchingRowsPlan)
+        val pruningKeys = V2ExpressionUtils.resolveRefs[Attribute](filterAttrs, r)
+        Filter(buildDynamicPruningCond(matchingRowsPlan, buildKeys, pruningKeys), r)
+    }
+    // optimize subqueries to rewrite them as joins and trigger job planning
+    write.withNewQuery(optimizeSubqueries(newQuery))
   }
 
   private def buildMatchingRowsPlan(
+      write: RowLevelWrite,
       relation: DataSourceV2Relation,
-      cond: Expression,
-      tableAttrs: Seq[Attribute],
-      command: Command): LogicalPlan = {
+      cond: Expression): LogicalPlan = {
 
-    val matchingRowsPlan = command match {
+    val matchingRowsPlan = write.operation.command match {
       case DELETE =>
         Filter(cond, relation)
 
       case UPDATE =>
-        // UPDATEs with subqueries are rewritten using UNION with two identical scan relations
+        // UPDATEs with subqueries can be rewritten using UNION with two identical scan relations
         // the analyzer assigns fresh expr IDs for one of them so that attributes don't collide
         // this rule assigns runtime filters to both scan relations (will be shared at runtime)
         // and must transform the runtime filter condition to use correct expr IDs for each relation
+        // note this only applies to group-based row-level operations (i.e. ReplaceData)
         // see RewriteUpdateTable for more details
-        val attrMap = buildTableToScanAttrMap(tableAttrs, relation.output)
+        val attrMap = buildTableToScanAttrMap(write.table.output, relation.output)
         val transformedCond = cond transform {
           case attr: AttributeReference if attrMap.contains(attr) => attrMap(attr)
         }
```
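To make the effect concrete, a hedged sketch of the plan shape after this rule fires for a delta-based DELETE (node names abbreviated and illustrative; the exact plan depends on the source):

```
WriteDelta
+- Project [row ID, row, metadata projections]
   +- Filter dynamicpruning#1                      <- injected by this rule
      :  +- Aggregate [scan.filterAttributes]      <- distinct matching groups
      :     +- Filter <delete condition>           <- matchingRowsPlan
      :        +- DataSourceV2Relation target      <- regular batch scan
      +- DataSourceV2ScanRelation target           <- row-level operation scan
```

The build side projects only the columns needed to evaluate the condition, which is why the scaladoc above argues the runtime query stays cheap relative to the main scan.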
