Skip to content

Commit d018f89

Browse files
committed
Introduce AutoCdcFlow and AutoCdcMergeFlow
1 parent 02473ba commit d018f89

9 files changed

Lines changed: 737 additions & 119 deletions

File tree

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,15 @@
209209
],
210210
"sqlState" : "42703"
211211
},
212-
"AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT" : {
212+
"AUTOCDC_KEY_NOT_IN_SELECTED_SCHEMA" : {
213213
"message" : [
214-
"Using <caseSensitivity> column name comparison, the column `<columnName>` in the <schemaName> schema conflicts with the reserved AutoCDC column name `<reservedColumnName>`. Rename or remove the column."
214+
"Using <caseSensitivity> column name comparison, the AutoCDC key column `<keyColumnName>` is not present in the flow's selected source schema. AutoCDC requires every key column to be present in the source change-data feed and retained by any configured column selection."
215+
],
216+
"sqlState" : "22023"
217+
},
218+
"AUTOCDC_RESERVED_COLUMN_NAME_PREFIX_CONFLICT" : {
219+
"message" : [
220+
"The column `<columnName>` in the <schemaName> schema collides with the reserved AutoCDC column name prefix `<reservedColumnNamePrefix>` (using <caseSensitivity> column name comparison). Rename or remove the column."
215221
],
216222
"sqlState" : "42710"
217223
},

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ object ColumnSelection {
120120
}
121121

122122
/** User-facing case-sensitivity labels surfaced in AutoCDC error messages. */
123-
private[autocdc] object CaseSensitivityLabels {
123+
private[pipelines] object CaseSensitivityLabels {
124124
val CaseSensitive: String = "case-sensitive"
125125
val CaseInsensitive: String = "case-insensitive"
126126

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala

Lines changed: 11 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
package org.apache.spark.sql.pipelines.autocdc
1919

2020
import org.apache.spark.SparkException
21-
import org.apache.spark.sql.{functions => F, AnalysisException}
21+
import org.apache.spark.sql.{functions => F}
2222
import org.apache.spark.sql.Column
2323
import org.apache.spark.sql.catalyst.util.QuotingUtils
2424
import org.apache.spark.sql.classic.DataFrame
@@ -85,9 +85,6 @@ case class Scd1BatchProcessor(
8585
* column.
8686
*/
8787
def extendMicrobatchRowsWithCdcMetadata(microbatchDf: DataFrame): DataFrame = {
88-
// Proactively validate the reserved CDC metadata column does not exist in the microbatch.
89-
validateCdcMetadataColumnNotPresent(microbatchDf)
90-
9188
val rowDeleteSequence: Column = changeArgs.deleteCondition match {
9289
case Some(deleteCondition) =>
9390
F.when(deleteCondition, changeArgs.sequencing).otherwise(F.lit(null))
@@ -109,39 +106,26 @@ case class Scd1BatchProcessor(
109106
)
110107
)
111108
}
112-
113-
private def validateCdcMetadataColumnNotPresent(microbatchDf: DataFrame): Unit = {
114-
val microbatchSqlConf = microbatchDf.sparkSession.sessionState.conf
115-
val resolver = microbatchSqlConf.resolver
116-
117-
microbatchDf.schema.fieldNames
118-
.find(resolver(_, Scd1BatchProcessor.cdcMetadataColName))
119-
.foreach { conflictingColumnName =>
120-
throw new AnalysisException(
121-
errorClass = "AUTOCDC_RESERVED_COLUMN_NAME_CONFLICT",
122-
messageParameters = Map(
123-
"caseSensitivity" -> CaseSensitivityLabels.of(microbatchSqlConf.caseSensitiveAnalysis),
124-
"columnName" -> conflictingColumnName,
125-
"schemaName" -> "microbatch",
126-
"reservedColumnName" -> Scd1BatchProcessor.cdcMetadataColName
127-
)
128-
)
129-
}
130-
}
131109
}
132110

133111
object Scd1BatchProcessor {
134-
// Columns prefixed with `__spark_autocdc_` are reserved for internal SDP AutoCDC processing.
135-
private[autocdc] val winningRowColName: String = "__spark_autocdc_winning_row"
136-
private[autocdc] val cdcMetadataColName: String = "__spark_autocdc_metadata"
112+
/**
113+
* Reserved column-name prefix for internal SDP AutoCDC processing. Source change-data-feed
114+
* dataframes must not contain any columns starting with this prefix; the invariant is
115+
* enforced at [[org.apache.spark.sql.pipelines.graph.AutoCdcMergeFlow]] construction.
116+
*/
117+
private[pipelines] val reservedColumnNamePrefix: String = "__spark_autocdc_"
118+
119+
private[autocdc] val winningRowColName: String = s"${reservedColumnNamePrefix}winning_row"
120+
private[pipelines] val cdcMetadataColName: String = s"${reservedColumnNamePrefix}metadata"
137121

138122
private[autocdc] val cdcDeleteSequenceFieldName: String = "deleteSequence"
139123
private[autocdc] val cdcUpsertSequenceFieldName: String = "upsertSequence"
140124

141125
/**
142126
* Schema of the CDC metadata struct column for SCD1.
143127
*/
144-
private def cdcMetadataColSchema(sequencingType: DataType): StructType =
128+
private[pipelines] def cdcMetadataColSchema(sequencingType: DataType): StructType =
145129
StructType(
146130
Seq(
147131
// The sequencing of the event if it represents a delete, null otherwise.

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala

Lines changed: 157 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,19 @@ package org.apache.spark.sql.pipelines.graph
2020
import scala.util.Try
2121

2222
import org.apache.spark.internal.Logging
23+
import org.apache.spark.sql.AnalysisException
2324
import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier}
2425
import org.apache.spark.sql.classic.DataFrame
2526
import org.apache.spark.sql.pipelines.AnalysisWarning
27+
import org.apache.spark.sql.pipelines.autocdc.{
28+
CaseSensitivityLabels,
29+
ChangeArgs,
30+
ColumnSelection,
31+
Scd1BatchProcessor,
32+
ScdType
33+
}
2634
import org.apache.spark.sql.pipelines.util.InputReadOptions
27-
import org.apache.spark.sql.types.StructType
35+
import org.apache.spark.sql.types.{StructField, StructType}
2836

2937
/**
3038
* Contains the catalog and database context information for query execution.
@@ -121,15 +129,56 @@ case class FlowFunctionResult(
121129
}
122130

123131
/** A [[Flow]] whose output schema and dependencies aren't known. */
124-
case class UnresolvedFlow(
132+
sealed trait UnresolvedFlow extends Flow {
133+
/** Returns a copy of this flow with the given SQL confs overriding the existing ones. */
134+
def withSqlConf(newSqlConf: Map[String, String]): UnresolvedFlow
135+
}
136+
137+
/**
138+
* An [[UnresolvedFlow]] whose execution-type has not yet been determined.
139+
*
140+
* In some cases, we know the execution-type for an [[UnresolvedFlow]] even before flow analysis
141+
* and resolution. For example an AutoCDCFlow is a special unresolved-but-typed flow; we know a
142+
* flow will be an AutoCDC flow immediately on construction, because it has its own special
143+
* registration API. Such flows are considered "typed flows", but there isn't any semantic reason
144+
* yet to explicitly introduce a `TypedFlow` trait/class.
145+
*/
146+
case class UntypedFlow(
125147
identifier: TableIdentifier,
126148
destinationIdentifier: TableIdentifier,
127149
func: FlowFunction,
128150
queryContext: QueryContext,
129151
sqlConf: Map[String, String],
130152
override val once: Boolean,
131153
override val origin: QueryOrigin
132-
) extends Flow
154+
) extends UnresolvedFlow {
155+
override def withSqlConf(newSqlConf: Map[String, String]): UntypedFlow =
156+
copy(sqlConf = newSqlConf)
157+
}
158+
159+
/**
160+
* An unresolved but typed that applies a CDC event stream to a target table via MERGE.
161+
*
162+
* [[AutoCdcFlow]] is a typed flow because it is only supported for streaming, and not as a once
163+
* flow. Therefore by definition it is a streaming-type flow.
164+
*
165+
* In the future once-support for [[AutoCdcFlow]] may be added.
166+
*/
167+
case class AutoCdcFlow(
168+
identifier: TableIdentifier,
169+
destinationIdentifier: TableIdentifier,
170+
func: FlowFunction,
171+
queryContext: QueryContext,
172+
sqlConf: Map[String, String] = Map.empty,
173+
comment: Option[String] = None,
174+
override val origin: QueryOrigin,
175+
changeArgs: ChangeArgs
176+
) extends UnresolvedFlow {
177+
override val once: Boolean = false
178+
179+
override def withSqlConf(newSqlConf: Map[String, String]): AutoCdcFlow =
180+
copy(sqlConf = newSqlConf)
181+
}
133182

134183
/**
135184
* A [[Flow]] whose flow function has been invoked, meaning either:
@@ -194,3 +243,108 @@ class AppendOnceFlow(
194243

195244
override val once = true
196245
}
246+
247+
/**
248+
* A resolved flow that applies a CDC event stream to a target table via MERGE, in accordance to
249+
* the configured [[flow.changeArgs]].
250+
*/
251+
class AutoCdcMergeFlow(
252+
val flow: AutoCdcFlow,
253+
val funcResult: FlowFunctionResult
254+
) extends ResolvedFlow {
255+
requireReservedPrefixAbsentInSourceColumns()
256+
257+
def changeArgs: ChangeArgs = flow.changeArgs
258+
259+
/**
260+
* Returns the augmented output schema of this flow, which can differ from the schema of the
261+
* source change-data-feed dataframe.
262+
*
263+
* The source dataframe's schema describes the incoming CDC events; the augmented schema here
264+
* applies the user-specified [[ColumnSelection]] and appends the SCD-specific metadata
265+
* columns that the AutoCDC MERGE engine projects onto the target table. Downstream
266+
* dependencies in the pipeline see this augmented schema.
267+
*/
268+
override val schema: StructType = {
269+
val userSelectedSchema = ColumnSelection.applyToSchema(
270+
schemaName = "changeDataFeed",
271+
schema = df.schema,
272+
columnSelection = changeArgs.columnSelection,
273+
caseSensitive = spark.sessionState.conf.caseSensitiveAnalysis
274+
)
275+
276+
// AutoCDC flows require all key columns to be present in the target table, to adhere to SCD
277+
// semantics.
278+
requireKeysPresentInSelectedSchema(userSelectedSchema)
279+
280+
changeArgs.storedAsScdType match {
281+
case ScdType.Type1 =>
282+
// SCD1 produces a target table with all the user-selected output columns and a projected
283+
// CDC operational metadata column at the end.
284+
StructType(
285+
userSelectedSchema.fields :+ StructField(
286+
Scd1BatchProcessor.cdcMetadataColName,
287+
Scd1BatchProcessor.cdcMetadataColSchema(
288+
sequencingType = df.select(changeArgs.sequencing).schema.head.dataType
289+
),
290+
nullable = false
291+
)
292+
)
293+
case ScdType.Type2 =>
294+
throw new UnsupportedOperationException(
295+
"AutoCDC flows do not currently support SCD Type 2 transformations."
296+
)
297+
}
298+
}
299+
300+
/**
301+
* Validate that the resolved source dataframe for the AutoCDC flow does not contain any column
302+
* names that use the reserved Spark AutoCDC prefix.
303+
*/
304+
private def requireReservedPrefixAbsentInSourceColumns(): Unit = {
305+
val resolver = spark.sessionState.conf.resolver
306+
val reservedPrefix = Scd1BatchProcessor.reservedColumnNamePrefix
307+
308+
def nameContainsReservedPrefix(name: String): Boolean = {
309+
name.length >= reservedPrefix.length && resolver(
310+
name.substring(0, reservedPrefix.length),
311+
reservedPrefix
312+
)
313+
}
314+
315+
df.schema.fieldNames.find(nameContainsReservedPrefix).foreach { conflictingColumnName =>
316+
throw new AnalysisException(
317+
errorClass = "AUTOCDC_RESERVED_COLUMN_NAME_PREFIX_CONFLICT",
318+
messageParameters = Map(
319+
"caseSensitivity" -> CaseSensitivityLabels.of(
320+
spark.sessionState.conf.caseSensitiveAnalysis
321+
),
322+
"columnName" -> conflictingColumnName,
323+
"schemaName" -> "changeDataFeed",
324+
"reservedColumnNamePrefix" -> reservedPrefix
325+
)
326+
)
327+
}
328+
}
329+
330+
/**
331+
* Validate all keys specified in changeArgs are actually present in the user-selected schema.
332+
*/
333+
private def requireKeysPresentInSelectedSchema(selectedSchema: StructType): Unit = {
334+
val resolver = spark.sessionState.conf.resolver
335+
336+
changeArgs.keys
337+
.find(key => !selectedSchema.fieldNames.exists(name => resolver(name, key.name)))
338+
.foreach { missingKey =>
339+
throw new AnalysisException(
340+
errorClass = "AUTOCDC_KEY_NOT_IN_SELECTED_SCHEMA",
341+
messageParameters = Map(
342+
"caseSensitivity" -> CaseSensitivityLabels.of(
343+
spark.sessionState.conf.caseSensitiveAnalysis
344+
),
345+
"keyColumnName" -> missingKey.name
346+
)
347+
)
348+
}
349+
}
350+
}

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphRegistrationContext.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class GraphRegistrationContext(
5959
}
6060

6161
def registerFlow(flowDef: UnresolvedFlow): Unit = {
62-
flows += flowDef.copy(sqlConf = defaultSqlConf ++ flowDef.sqlConf)
62+
flows += flowDef.withSqlConf(defaultSqlConf ++ flowDef.sqlConf)
6363
}
6464

6565
private def isEmpty: Boolean = {

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/SqlGraphRegistrationContext.scala

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ class SqlGraphRegistrationContext(
237237

238238
// Register flow that backs this streaming table.
239239
graphRegistrationContext.registerFlow(
240-
UnresolvedFlow(
240+
UntypedFlow(
241241
identifier = stIdentifier,
242242
destinationIdentifier = stIdentifier,
243243
func = FlowAnalysis.createFlowFunctionFromLogicalPlan(cst.query),
@@ -288,7 +288,7 @@ class SqlGraphRegistrationContext(
288288

289289
// Register flow that backs this materialized view.
290290
graphRegistrationContext.registerFlow(
291-
UnresolvedFlow(
291+
UntypedFlow(
292292
identifier = mvIdentifier,
293293
destinationIdentifier = mvIdentifier,
294294
func = FlowAnalysis.createFlowFunctionFromLogicalPlan(cmv.query),
@@ -331,7 +331,7 @@ class SqlGraphRegistrationContext(
331331

332332
// Register flow that backs this persisted view.
333333
graphRegistrationContext.registerFlow(
334-
UnresolvedFlow(
334+
UntypedFlow(
335335
identifier = viewIdentifier,
336336
destinationIdentifier = viewIdentifier,
337337
func = FlowAnalysis.createFlowFunctionFromLogicalPlan(cv.query),
@@ -375,7 +375,7 @@ class SqlGraphRegistrationContext(
375375

376376
// Register flow definition that backs this temporary view.
377377
graphRegistrationContext.registerFlow(
378-
UnresolvedFlow(
378+
UntypedFlow(
379379
identifier = viewIdentifier,
380380
destinationIdentifier = viewIdentifier,
381381
func = FlowAnalysis.createFlowFunctionFromLogicalPlan(cvc.plan),
@@ -451,7 +451,7 @@ class SqlGraphRegistrationContext(
451451
.identifier
452452

453453
graphRegistrationContext.registerFlow(
454-
UnresolvedFlow(
454+
UntypedFlow(
455455
identifier = flowIdentifier,
456456
destinationIdentifier = qualifiedDestinationIdentifier,
457457
func = FlowAnalysis.createFlowFunctionFromLogicalPlan(flowQueryLogicalPlan),

0 commit comments

Comments
 (0)