Introduce AutoCdcFlow and AutoCdcMergeFlow

AnishMahto · AnishMahto · commit d018f891d409 · 2026-05-21T01:26:59.000Z
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
@@ -209,9 +209,15 @@
     ],
     "sqlState" : "42703"
   },
-  "AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT" : {
+  "AUTOCDC_KEY_NOT_IN_SELECTED_SCHEMA" : {
     "message" : [
-      "Using <caseSensitivity> column name comparison, the column `<columnName>` in the <schemaName> schema conflicts with the reserved AutoCDC column name `<reservedColumnName>`. Rename or remove the column."
+      "Using <caseSensitivity> column name comparison, the AutoCDC key column `<keyColumnName>` is not present in the flow's selected source schema. AutoCDC requires every key column to be present in the source change-data feed and retained by any configured column selection."
+    ],
+    "sqlState" : "22023"
+  },
+  "AUTOCDC_RESERVED_COLUMN_NAME_PREFIX_CONFLICT" : {
+    "message" : [
+      "The column `<columnName>` in the <schemaName> schema collides with the reserved AutoCDC column name prefix `<reservedColumnNamePrefix>` (using <caseSensitivity> column name comparison). Rename or remove the column."
     ],
     "sqlState" : "42710"
   },
diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala
@@ -120,7 +120,7 @@ object ColumnSelection {
 }
 
 /** User-facing case-sensitivity labels surfaced in AutoCDC error messages. */
-private[autocdc] object CaseSensitivityLabels {
+private[pipelines] object CaseSensitivityLabels {
   val CaseSensitive: String = "case-sensitive"
   val CaseInsensitive: String = "case-insensitive"
 
diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.pipelines.autocdc
 
 import org.apache.spark.SparkException
-import org.apache.spark.sql.{functions => F, AnalysisException}
+import org.apache.spark.sql.{functions => F}
 import org.apache.spark.sql.Column
 import org.apache.spark.sql.catalyst.util.QuotingUtils
 import org.apache.spark.sql.classic.DataFrame
@@ -85,9 +85,6 @@ case class Scd1BatchProcessor(
    * column.
    */
   def extendMicrobatchRowsWithCdcMetadata(microbatchDf: DataFrame): DataFrame = {
-    // Proactively validate the reserved CDC metadata column does not exist in the microbatch.
-    validateCdcMetadataColumnNotPresent(microbatchDf)
-
     val rowDeleteSequence: Column = changeArgs.deleteCondition match {
       case Some(deleteCondition) =>
         F.when(deleteCondition, changeArgs.sequencing).otherwise(F.lit(null))
@@ -109,39 +106,26 @@ case class Scd1BatchProcessor(
       )
     )
   }
-
-  private def validateCdcMetadataColumnNotPresent(microbatchDf: DataFrame): Unit = {
-    val microbatchSqlConf = microbatchDf.sparkSession.sessionState.conf
-    val resolver = microbatchSqlConf.resolver
-
-    microbatchDf.schema.fieldNames
-      .find(resolver(_, Scd1BatchProcessor.cdcMetadataColName))
-      .foreach { conflictingColumnName =>
-        throw new AnalysisException(
-          errorClass = "AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT",
-          messageParameters = Map(
-            "caseSensitivity" -> CaseSensitivityLabels.of(microbatchSqlConf.caseSensitiveAnalysis),
-            "columnName" -> conflictingColumnName,
-            "schemaName" -> "microbatch",
-            "reservedColumnName" -> Scd1BatchProcessor.cdcMetadataColName
-          )
-        )
-      }
-  }
 }
 
 object Scd1BatchProcessor {
-  // Columns prefixed with `__spark_autocdc_` are reserved for internal SDP AutoCDC processing.
-  private[autocdc] val winningRowColName: String = "__spark_autocdc_winning_row"
-  private[autocdc] val cdcMetadataColName: String = "__spark_autocdc_metadata"
+  /**
+   * Reserved column-name prefix for internal SDP AutoCDC processing. Source change-data-feed
+   * dataframes must not contain any columns starting with this prefix; the invariant is
+   * enforced at [[org.apache.spark.sql.pipelines.graph.AutoCdcMergeFlow]] construction.
+   */
+  private[pipelines] val reservedColumnNamePrefix: String = "__spark_autocdc_"
+
+  private[autocdc] val winningRowColName: String = s"${reservedColumnNamePrefix}winning_row"
+  private[pipelines] val cdcMetadataColName: String = s"${reservedColumnNamePrefix}metadata"
 
   private[autocdc] val cdcDeleteSequenceFieldName: String = "deleteSequence"
   private[autocdc] val cdcUpsertSequenceFieldName: String = "upsertSequence"
 
   /**
    * Schema of the CDC metadata struct column for SCD1.
    */
-  private def cdcMetadataColSchema(sequencingType: DataType): StructType =
+  private[pipelines] def cdcMetadataColSchema(sequencingType: DataType): StructType =
     StructType(
       Seq(
         // The sequencing of the event if it represents a delete, null otherwise.
diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala
@@ -20,11 +20,19 @@ package org.apache.spark.sql.pipelines.graph
 import scala.util.Try
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier}
 import org.apache.spark.sql.classic.DataFrame
 import org.apache.spark.sql.pipelines.AnalysisWarning
+import org.apache.spark.sql.pipelines.autocdc.{
+  CaseSensitivityLabels,
+  ChangeArgs,
+  ColumnSelection,
+  Scd1BatchProcessor,
+  ScdType
+}
 import org.apache.spark.sql.pipelines.util.InputReadOptions
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{StructField, StructType}
 
 /**
  * Contains the catalog and database context information for query execution.
@@ -121,15 +129,56 @@ case class FlowFunctionResult(
 }
 
 /** A [[Flow]] whose output schema and dependencies aren't known. */
-case class UnresolvedFlow(
+sealed trait UnresolvedFlow extends Flow {
+  /** Returns a copy of this flow with the given SQL confs overriding the existing ones. */
+  def withSqlConf(newSqlConf: Map[String, String]): UnresolvedFlow
+}
+
+/**
+ * An [[UnresolvedFlow]] whose execution-type has not yet been determined.
+ * 
+ * In some cases, we know the execution-type for an [[UnresolvedFlow]] even before flow analysis
+ * and resolution. For example an AutoCDCFlow is a special unresolved-but-typed flow; we know a
+ * flow will be an AutoCDC flow immediately on construction, because it has its own special
+ * registration API. Such flows are considered "typed flows", but there isn't any semantic reason
+ * yet to explicitly introduce a `TypedFlow` trait/class. 
+ */
+case class UntypedFlow(
     identifier: TableIdentifier,
     destinationIdentifier: TableIdentifier,
     func: FlowFunction,
     queryContext: QueryContext,
     sqlConf: Map[String, String],
     override val once: Boolean,
     override val origin: QueryOrigin
-) extends Flow
+) extends UnresolvedFlow {
+  override def withSqlConf(newSqlConf: Map[String, String]): UntypedFlow =
+    copy(sqlConf = newSqlConf)
+}
+
+/**
+ * An unresolved but typed that applies a CDC event stream to a target table via MERGE.
+ *
+ * [[AutoCdcFlow]] is a typed flow because it is only supported for streaming, and not as a once
+ * flow. Therefore by definition it is a streaming-type flow.
+ *
+ * In the future once-support for [[AutoCdcFlow]] may be added.
+ */
+case class AutoCdcFlow(
+    identifier: TableIdentifier,
+    destinationIdentifier: TableIdentifier,
+    func: FlowFunction,
+    queryContext: QueryContext,
+    sqlConf: Map[String, String] = Map.empty,
+    comment: Option[String] = None,
+    override val origin: QueryOrigin,
+    changeArgs: ChangeArgs
+) extends UnresolvedFlow {
+  override val once: Boolean = false
+
+  override def withSqlConf(newSqlConf: Map[String, String]): AutoCdcFlow =
+    copy(sqlConf = newSqlConf)
+}
 
 /**
  * A [[Flow]] whose flow function has been invoked, meaning either:
@@ -194,3 +243,108 @@ class AppendOnceFlow(
 
   override val once = true
 }
+
+/**
+ * A resolved flow that applies a CDC event stream to a target table via MERGE, in accordance to
+ * the configured [[flow.changeArgs]].
+ */
+class AutoCdcMergeFlow(
+    val flow: AutoCdcFlow,
+    val funcResult: FlowFunctionResult
+) extends ResolvedFlow {
+  requireReservedPrefixAbsentInSourceColumns()
+
+  def changeArgs: ChangeArgs = flow.changeArgs
+
+  /**
+   * Returns the augmented output schema of this flow, which can differ from the schema of the
+   * source change-data-feed dataframe.
+   *
+   * The source dataframe's schema describes the incoming CDC events; the augmented schema here
+   * applies the user-specified [[ColumnSelection]] and appends the SCD-specific metadata
+   * columns that the AutoCDC MERGE engine projects onto the target table. Downstream
+   * dependencies in the pipeline see this augmented schema.
+   */
+  override val schema: StructType = {
+    val userSelectedSchema = ColumnSelection.applyToSchema(
+      schemaName = "changeDataFeed",
+      schema = df.schema,
+      columnSelection = changeArgs.columnSelection,
+      caseSensitive = spark.sessionState.conf.caseSensitiveAnalysis
+    )
+
+    // AutoCDC flows require all key columns to be present in the target table, to adhere to SCD
+    // semantics.
+    requireKeysPresentInSelectedSchema(userSelectedSchema)
+
+    changeArgs.storedAsScdType match {
+      case ScdType.Type1 =>
+        // SCD1 produces a target table with all the user-selected output columns and a projected
+        // CDC operational metadata column at the end.
+        StructType(
+          userSelectedSchema.fields :+ StructField(
+            Scd1BatchProcessor.cdcMetadataColName,
+            Scd1BatchProcessor.cdcMetadataColSchema(
+              sequencingType = df.select(changeArgs.sequencing).schema.head.dataType
+            ),
+            nullable = false
+          )
+        )
+      case ScdType.Type2 =>
+        throw new UnsupportedOperationException(
+          "AutoCDC flows do not currently support SCD Type 2 transformations."
+        )
+    }
+  }
+
+  /**
+   * Validate that the resolved source dataframe for the AutoCDC flow does not contain any column
+   * names that use the reserved Spark AutoCDC prefix.
+   */
+  private def requireReservedPrefixAbsentInSourceColumns(): Unit = {
+    val resolver = spark.sessionState.conf.resolver
+    val reservedPrefix = Scd1BatchProcessor.reservedColumnNamePrefix
+
+    def nameContainsReservedPrefix(name: String): Boolean = {
+      name.length >= reservedPrefix.length && resolver(
+        name.substring(0, reservedPrefix.length),
+        reservedPrefix
+      )
+    }
+
+    df.schema.fieldNames.find(nameContainsReservedPrefix).foreach { conflictingColumnName =>
+      throw new AnalysisException(
+        errorClass = "AUTOCDC_RESERVED_COLUMN_NAME_PREFIX_CONFLICT",
+        messageParameters = Map(
+          "caseSensitivity" -> CaseSensitivityLabels.of(
+            spark.sessionState.conf.caseSensitiveAnalysis
+          ),
+          "columnName" -> conflictingColumnName,
+          "schemaName" -> "changeDataFeed",
+          "reservedColumnNamePrefix" -> reservedPrefix
+        )
+      )
+    }
+  }
+
+  /**
+   * Validate all keys specified in changeArgs are actually present in the user-selected schema.
+   */
+  private def requireKeysPresentInSelectedSchema(selectedSchema: StructType): Unit = {
+    val resolver = spark.sessionState.conf.resolver
+
+    changeArgs.keys
+      .find(key => !selectedSchema.fieldNames.exists(name => resolver(name, key.name)))
+      .foreach { missingKey =>
+        throw new AnalysisException(
+          errorClass = "AUTOCDC_KEY_NOT_IN_SELECTED_SCHEMA",
+          messageParameters = Map(
+            "caseSensitivity" -> CaseSensitivityLabels.of(
+              spark.sessionState.conf.caseSensitiveAnalysis
+            ),
+            "keyColumnName" -> missingKey.name
+          )
+        )
+      }
+  }
+}
diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphRegistrationContext.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphRegistrationContext.scala
@@ -59,7 +59,7 @@ class GraphRegistrationContext(
   }
 
   def registerFlow(flowDef: UnresolvedFlow): Unit = {
-    flows += flowDef.copy(sqlConf = defaultSqlConf ++ flowDef.sqlConf)
+    flows += flowDef.withSqlConf(defaultSqlConf ++ flowDef.sqlConf)
   }
 
   private def isEmpty: Boolean = {
diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/SqlGraphRegistrationContext.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/SqlGraphRegistrationContext.scala
@@ -237,7 +237,7 @@ class SqlGraphRegistrationContext(
 
       // Register flow that backs this streaming table.
       graphRegistrationContext.registerFlow(
-        UnresolvedFlow(
+        UntypedFlow(
           identifier = stIdentifier,
           destinationIdentifier = stIdentifier,
           func = FlowAnalysis.createFlowFunctionFromLogicalPlan(cst.query),
@@ -288,7 +288,7 @@ class SqlGraphRegistrationContext(
 
       // Register flow that backs this materialized view.
       graphRegistrationContext.registerFlow(
-        UnresolvedFlow(
+        UntypedFlow(
           identifier = mvIdentifier,
           destinationIdentifier = mvIdentifier,
           func = FlowAnalysis.createFlowFunctionFromLogicalPlan(cmv.query),
@@ -331,7 +331,7 @@ class SqlGraphRegistrationContext(
 
       // Register flow that backs this persisted view.
       graphRegistrationContext.registerFlow(
-        UnresolvedFlow(
+        UntypedFlow(
           identifier = viewIdentifier,
           destinationIdentifier = viewIdentifier,
           func = FlowAnalysis.createFlowFunctionFromLogicalPlan(cv.query),
@@ -375,7 +375,7 @@ class SqlGraphRegistrationContext(
 
       // Register flow definition that backs this temporary view.
       graphRegistrationContext.registerFlow(
-        UnresolvedFlow(
+        UntypedFlow(
           identifier = viewIdentifier,
           destinationIdentifier = viewIdentifier,
           func = FlowAnalysis.createFlowFunctionFromLogicalPlan(cvc.plan),
@@ -451,7 +451,7 @@ class SqlGraphRegistrationContext(
         .identifier
 
       graphRegistrationContext.registerFlow(
-        UnresolvedFlow(
+        UntypedFlow(
           identifier = flowIdentifier,
           destinationIdentifier = qualifiedDestinationIdentifier,
           func = FlowAnalysis.createFlowFunctionFromLogicalPlan(flowQueryLogicalPlan),
diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala
diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorSuite.scala
diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/utils/TestGraphRegistrationContext.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/utils/TestGraphRegistrationContext.scala

Original file line number	Diff line number	Diff line change
`@@ -120,7 +120,7 @@ object ColumnSelection {`
`120`	`120`	`}`
`121`	`121`
`122`	`122`	`/** User-facing case-sensitivity labels surfaced in AutoCDC error messages. */`
`123`		`-private[autocdc] object CaseSensitivityLabels {`
	`123`	`+private[pipelines] object CaseSensitivityLabels {`
`124`	`124`	`val CaseSensitive: String = "case-sensitive"`
`125`	`125`	`val CaseInsensitive: String = "case-insensitive"`
`126`	`126`
Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ class GraphRegistrationContext(`
`59`	`59`	`}`
`60`	`60`
`61`	`61`	`def registerFlow(flowDef: UnresolvedFlow): Unit = {`
`62`		`- flows += flowDef.copy(sqlConf = defaultSqlConf ++ flowDef.sqlConf)`
	`62`	`+ flows += flowDef.withSqlConf(defaultSqlConf ++ flowDef.sqlConf)`
`63`	`63`	`}`
`64`	`64`
`65`	`65`	`private def isEmpty: Boolean = {`