From 0d781bfbccc29978f6670609d2e835c8654b6d48 Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Wed, 10 Jun 2026 00:19:05 +0000 Subject: [PATCH 01/17] [CONNECT][SDP][SPARK-56249] Create AUTO CDC syntax for SCD Type 1 in SQL Add SQL syntax for AUTO CDC (Change Data Capture) with SCD Type 1 semantics. Two forms are supported: 1. CREATE FLOW <flow_name> AS AUTO CDC INTO <target> FROM <source> KEYS (...) SEQUENCE BY <expr> 2. CREATE [OR REFRESH] STREAMING TABLE <table_name> FLOW AUTO CDC FROM <source> KEYS (...) SEQUENCE BY <expr> Optional clauses: APPLY AS DELETE WHEN, COLUMNS, COLUMNS * EXCEPT. Co-authored-by: Isaac --- .../resources/error/error-conditions.json | 6 + .../spark/sql/catalyst/parser/SqlBaseLexer.g4 | 4 + .../sql/catalyst/parser/SqlBaseParser.g4 | 42 +- .../sql/catalyst/parser/AstBuilder.scala | 44 ++ .../plans/logical/AutoCdcIntoCommand.scala | 56 +++ .../catalyst/plans/logical/v2Commands.scala | 43 ++ .../spark/sql/execution/SparkSqlParser.scala | 94 ++++ .../command/v2/AutoCdcParserSuite.scala | 404 ++++++++++++++++++ 8 files changed, 692 insertions(+), 1 deletion(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index b6206653a1612..d500572cf4767 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -5542,6 +5542,12 @@ }, "sqlState" : "0A000" }, + "MISSING_CLAUSES_FOR_OPERATION" : { + "message" : [ + "Missing required clause(s) <clauses> for operation <operation>." + ], + "sqlState" : "42601" + }, "MISSING_DATABASE_FOR_V1_SESSION_CATALOG" : { "message" : [ "Database name is not specified in the v1 session catalog. Please ensure to provide a valid database name when interacting with the v1 catalog." 
diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 index 0e940ee5b4b01..86e50881958ae 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 @@ -140,8 +140,10 @@ AND: 'AND'; ANTI: 'ANTI'; ANY: 'ANY'; ANY_VALUE: 'ANY_VALUE'; +APPLY: 'APPLY'; APPROX: 'APPROX'; ARCHIVE: 'ARCHIVE'; +AUTO: 'AUTO'; ARRAY: 'ARRAY' {incComplexTypeLevelCounter();}; AS: 'AS'; ASC: 'ASC'; @@ -173,6 +175,7 @@ CASE: 'CASE'; CAST: 'CAST'; CATALOG: 'CATALOG'; CATALOGS: 'CATALOGS'; +CDC: 'CDC'; CHANGE: 'CHANGE'; CHANGES: 'CHANGES'; CHAR: 'CHAR'; @@ -454,6 +457,7 @@ ROWS: 'ROWS'; SECOND: 'SECOND'; SECONDS: 'SECONDS'; SCHEMA: 'SCHEMA'; +SEQUENCE: 'SEQUENCE'; SCHEMAS: 'SCHEMAS'; SECURITY: 'SECURITY'; SELECT: 'SELECT'; diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 2466cf62272af..bf33d18c6a424 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -417,7 +417,11 @@ statement | createPipelineDatasetHeader (LEFT_PAREN tableElementList? RIGHT_PAREN)? tableProvider? createTableClauses (AS query)? #createPipelineDataset + | createPipelineDatasetHeader (LEFT_PAREN tableElementList? RIGHT_PAREN)? tableProvider? + createTableClauses + FLOW autoCdcBody #createStreamingTableAutoCdc | createPipelineFlowHeader insertInto query #createPipelineInsertIntoFlow + | createPipelineFlowHeader autoCdcCommand #createFlowAutoCdc ; materializedView @@ -429,7 +433,7 @@ streamingTable ; createPipelineDatasetHeader - : CREATE + : CREATE (OR REFRESH)? (materializedView | streamingTable) (IF errorCapturingNot EXISTS)? 
identifierReference @@ -750,6 +754,38 @@ dmlStatementNoWith notMatchedBySourceClause* #mergeIntoTable ; +autoCdcCommand + : AUTO CDC INTO target=multipartIdentifier + autoCdcParameters + ; + +autoCdcBody + : AUTO CDC autoCdcParameters + ; + +autoCdcParameters + : FROM source=relation + KEYS LEFT_PAREN keys=multipartIdentifierList RIGHT_PAREN + (autoCdcDeleteClause + | autoCdcSequenceByClause + | autoCdcColumnsClause + )* + ; + +autoCdcDeleteClause + : APPLY AS DELETE WHEN deleteCondition=booleanExpression + ; + +autoCdcSequenceByClause + : SEQUENCE BY sequence=expression + ; + +autoCdcColumnsClause + : COLUMNS ( + columns=multipartIdentifierList | + ASTERISK EXCEPT LEFT_PAREN exceptCols=multipartIdentifierList RIGHT_PAREN) + ; + identifierReference : IDENTIFIER_KW LEFT_PAREN expression RIGHT_PAREN | multipartIdentifier @@ -2346,8 +2382,10 @@ nonReserved | AND | ANY | ANY_VALUE + | APPLY | APPROX | ARCHIVE + | AUTO | ARRAY | AS | ASC @@ -2380,6 +2418,7 @@ nonReserved | CAST | CATALOG | CATALOGS + | CDC | CHANGE | CHANGES | CHAR @@ -2653,6 +2692,7 @@ nonReserved | SECOND | SECONDS | SECURITY + | SEQUENCE | SELECT | SEPARATED | SERDE diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index fe4841042a392..bc5bbe9866927 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1362,6 +1362,50 @@ class AstBuilder extends DataTypeAstBuilder withSchemaEvolution) } + protected def buildAutoCdcIntoCommand(ctx: AutoCdcCommandContext): AutoCdcIntoCommand = + withOrigin(ctx) { + val target = visitMultipartIdentifier(ctx.target).asTableIdentifier + val (src, keys, delete, seq, specCols, exceptCols) = + parseAutoCdcParams(ctx.autoCdcParameters()) + AutoCdcIntoCommand(target, src, keys, delete, seq, specCols, exceptCols) + } + + 
protected def parseAutoCdcParams(params: AutoCdcParametersContext): ( + LogicalPlan, + Seq[UnresolvedAttribute], + Option[Expression], + Expression, + Seq[UnresolvedAttribute], + Seq[UnresolvedAttribute]) = + withOrigin(params) { + checkDuplicateClauses(params.autoCdcDeleteClause(), "APPLY AS DELETE", params) + checkDuplicateClauses(params.autoCdcSequenceByClause(), "SEQUENCE BY", params) + checkDuplicateClauses(params.autoCdcColumnsClause(), "COLUMNS", params) + + if (params.autoCdcSequenceByClause().isEmpty) { + throw QueryParsingErrors.missingClausesForOperation( + params, "SEQUENCE BY", "AUTO CDC INTO") + } + + val sourceTable = plan(params.source.relationPrimary) + val keys = visitMultipartIdentifierList(params.keys) + val deleteCondition = params.autoCdcDeleteClause().asScala.headOption + .map(c => expression(c.deleteCondition)) + val sequencing = expression(params.autoCdcSequenceByClause(0).sequence) + + val columnsClause = params.autoCdcColumnsClause().asScala.headOption + val specifiedCols = columnsClause match { + case Some(c) if c.columns != null => visitMultipartIdentifierList(c.columns) + case _ => Seq.empty + } + val exceptCols = columnsClause match { + case Some(c) if c.exceptCols != null => visitMultipartIdentifierList(c.exceptCols) + case _ => Seq.empty + } + + (sourceTable, keys, deleteCondition, sequencing, specifiedCols, exceptCols) + } + /** * Returns the parameters for [[UnresolvedExecuteImmediate]] logical plan. * Expected format: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala new file mode 100644 index 0000000000000..69aa28e153e97 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} + +/** + * Logical plan node for an AUTO CDC INTO command, used by Spark Declarative Pipelines. + * + * This represents a CDC (Change Data Capture) operation that applies an ordered change event + * stream from [[sourceTable]] into [[targetTable]] using SCD Type 1 (upsert) semantics. + * + * This node is a parse-time placeholder. It is only executable within a Declarative Pipelines + * pipeline context; attempting to execute it directly will fail at analysis time. + * + * @param targetTable The target table to apply changes into. + * @param sourceTable The source relation providing the change events. + * @param keys Column(s) that uniquely identify a row in the target table. + * @param deleteCondition An optional expression that marks a source row as a DELETE operation. + * When absent, all source rows are treated as upserts. + * @param sequenceByExpr Expression that orders CDC events to correctly resolve out-of-order + * arrivals. Must evaluate to a sortable type. Required. 
+ * @param specifiedCols An explicit list of source columns to include in the target table. + * Mutually exclusive with [[exceptCols]]. + * @param exceptCols Source columns to exclude from the target table (i.e., all columns + * except these). Mutually exclusive with [[specifiedCols]]. + */ +case class AutoCdcIntoCommand( + targetTable: TableIdentifier, + sourceTable: LogicalPlan, + keys: Seq[UnresolvedAttribute], + deleteCondition: Option[Expression], + sequenceByExpr: Expression, + specifiedCols: Seq[UnresolvedAttribute], + exceptCols: Seq[UnresolvedAttribute] +) extends LeafCommand { + // Output is not meaningful; this node is a pipeline-context placeholder. + override def output: Seq[Attribute] = Nil +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 40cf5009b97dc..cb6bf0434a1cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -824,6 +824,49 @@ case class CreateStreamingTable( copy(name = newChild) } +/** + * Command parsed from `CREATE [OR REFRESH] STREAMING TABLE FLOW AUTO CDC ...` SQL syntax. + * This command serves as a parse-time placeholder for a pipeline CDC definition and cannot be + * executed directly. It is interpreted by the pipeline submodule during a pipeline execution. + * + * The target of the CDC operation is the streaming table itself (given by [[name]]). + * + * @param name The streaming table name, which also serves as the CDC target. + * @param columns User-specified columns for the streaming table. + * @param partitioning Column-based partitioning for the streaming table. + * @param tableSpec Additional table specs. + * @param ifNotExists Whether the table should only be created if it doesn't already exist. 
+ * @param orRefresh Whether the statement is `CREATE OR REFRESH` (vs plain `CREATE`). + * @param sourceTable The source relation providing the change events. + * @param keys Column(s) that uniquely identify a row in the target table. + * @param deleteCondition An optional expression that marks a source row as a DELETE operation. + * @param sequenceByExpr Expression that orders CDC events to resolve out-of-order arrivals. + * @param specifiedCols An explicit list of source columns to include. Mutually exclusive with + * [[exceptCols]]. + * @param exceptCols Source columns to exclude. Mutually exclusive with [[specifiedCols]]. + */ +case class CreateStreamingTableAutoCdc( + name: LogicalPlan, + columns: Seq[ColumnDefinition], + partitioning: Seq[Transform], + tableSpec: TableSpecBase, + ifNotExists: Boolean, + orRefresh: Boolean, + sourceTable: LogicalPlan, + keys: Seq[UnresolvedAttribute], + deleteCondition: Option[Expression], + sequenceByExpr: Expression, + specifiedCols: Seq[UnresolvedAttribute], + exceptCols: Seq[UnresolvedAttribute] +) extends BinaryCommand with CreatePipelineDataset { + override def left: LogicalPlan = name + override def right: LogicalPlan = sourceTable + + override protected def withNewChildrenInternal( + newLeft: LogicalPlan, newRight: LogicalPlan): LogicalPlan = + copy(name = newLeft, sourceTable = newRight) +} + /** * Replace a table with a v2 catalog. 
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 8ff2161a2965d..71864a02f66ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -1571,6 +1571,100 @@ class SparkSqlAstBuilder extends AstBuilder { ) } + override def visitCreateFlowAutoCdc( + ctx: CreateFlowAutoCdcContext): LogicalPlan = withOrigin(ctx) { + val flowHeaderCtx = ctx.createPipelineFlowHeader() + val ident = withIdentClause(flowHeaderCtx.flowName, UnresolvedIdentifier(_)) + val commentOpt = Option(flowHeaderCtx.commentSpec()).map(visitCommentSpec) + val applyChanges = buildAutoCdcIntoCommand(ctx.autoCdcCommand()) + CreateFlowCommand( + name = ident, + flowOperation = applyChanges, + comment = commentOpt + ) + } + + override def visitCreateStreamingTableAutoCdc( + ctx: CreateStreamingTableAutoCdcContext): LogicalPlan = withOrigin(ctx) { + val headerCtx = ctx.createPipelineDatasetHeader() + + if (headerCtx.materializedView() != null) { + throw operationNotAllowed( + "AUTO CDC is only supported for STREAMING TABLE, not MATERIALIZED VIEW.", ctx) + } + + val orRefresh = headerCtx.REFRESH() != null + val ifNotExists = headerCtx.EXISTS() != null + val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) + val (colDefs, colConstraints) = Option(ctx.tableElementList()).map(visitTableElementList) + .getOrElse((Nil, Nil)) + + if (colConstraints.nonEmpty) { + throw operationNotAllowed( + "Pipeline datasets do not currently support column constraints.", ctx) + } + + val (partTransforms, partCols, bucketSpec, + properties, options, location, comment, collation, serdeInfoOpt, + clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) + + val partitioning = + partitionExpressions(partTransforms, partCols, ctx) ++ + clusterBySpec.map(_.asTransform) 
+ + if (bucketSpec.isDefined) { + throw operationNotAllowed( + "Bucketing is not supported for CREATE STREAMING TABLE statements.", ctx) + } + if (options.options.nonEmpty) { + throw operationNotAllowed( + "Options are not supported for CREATE STREAMING TABLE statements.", ctx) + } + serdeInfoOpt.foreach { _ => + throw operationNotAllowed( + "Hive SerDe format options are not supported for CREATE STREAMING TABLE statements.", ctx) + } + if (location.nonEmpty) { + throw operationNotAllowed( + "Specifying location is not supported for CREATE STREAMING TABLE statements.", ctx) + } + + val spec = TableSpec( + properties = properties, + provider = provider, + options = Map.empty, + location = location, + comment = comment, + collation = collation, + serde = None, + external = false, + constraints = Seq.empty + ) + + val tableIdent = withIdentClause( + headerCtx.identifierReference, + UnresolvedIdentifier(_) + ) + + val (src, keys, delete, seq, specCols, exceptCols) = + parseAutoCdcParams(ctx.autoCdcBody().autoCdcParameters()) + + CreateStreamingTableAutoCdc( + name = tableIdent, + columns = colDefs, + partitioning = partitioning, + tableSpec = spec, + ifNotExists = ifNotExists, + orRefresh = orRefresh, + sourceTable = src, + keys = keys, + deleteCondition = delete, + sequenceByExpr = seq, + specifiedCols = specCols, + exceptCols = exceptCols + ) + } + override def visitCreatePipelineDataset( ctx: CreatePipelineDatasetContext): LogicalPlan = withOrigin(ctx) { val createPipelineDatasetHeaderCtx = ctx.createPipelineDatasetHeader() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala new file mode 100644 index 0000000000000..3b2a91744ad22 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -0,0 +1,404 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command.v2 + +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAttribute, UnresolvedIdentifier, UnresolvedRelation} +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.catalyst.plans.logical.{ + AutoCdcIntoCommand, + CreateFlowCommand, + CreateStreamingTableAutoCdc +} +import org.apache.spark.sql.execution.SparkSqlParser + +/** + * Parser tests for AUTO CDC syntax. + * + * Covers two supported forms: + * 1. CREATE FLOW [COMMENT ...] AS AUTO CDC INTO ... + * 2. CREATE [OR REFRESH] STREAMING TABLE FLOW AUTO CDC ... + * + * Snapshot CDC, SCD Type 2, IGNORE NULL UPDATES, and APPLY AS TRUNCATE WHEN are not + * supported and should fail to parse. The standalone AUTO CDC INTO form (without CREATE FLOW + * or CREATE STREAMING TABLE) is also not supported. + */ +class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { + protected lazy val parser = new SparkSqlParser() + + // --------------------------------------------------------------------------- + // CREATE FLOW ... 
AS AUTO CDC INTO + // --------------------------------------------------------------------------- + + test("CREATE FLOW AS AUTO CDC INTO - minimal form") { + val plan = parser.parsePlan( + """CREATE FLOW myflow AS AUTO CDC INTO target + |FROM source + |KEYS (key1, key2) + |SEQUENCE BY timestamp""".stripMargin) + + val cmd = plan.asInstanceOf[CreateFlowCommand] + assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("myflow")) + assert(cmd.comment.isEmpty) + + val cdc = cmd.flowOperation.asInstanceOf[AutoCdcIntoCommand] + assert(cdc.targetTable.table == "target") + assert(cdc.sourceTable.isInstanceOf[UnresolvedRelation]) + assert(cdc.keys.map(_.name) == Seq("key1", "key2")) + assert(cdc.deleteCondition.isEmpty) + assert(cdc.sequenceByExpr == UnresolvedAttribute("timestamp")) + assert(cdc.specifiedCols.isEmpty) + assert(cdc.exceptCols.isEmpty) + } + + test("CREATE FLOW AS AUTO CDC INTO - with COMMENT") { + val plan = parser.parsePlan( + """CREATE FLOW myflow COMMENT 'my comment' AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + + val cmd = plan.asInstanceOf[CreateFlowCommand] + assert(cmd.comment == Some("my comment")) + } + + test("CREATE FLOW AS AUTO CDC INTO - multipart flow name") { + val plan = parser.parsePlan( + """CREATE FLOW mycat.myschema.myflow AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + + val cmd = plan.asInstanceOf[CreateFlowCommand] + assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("mycat", "myschema", "myflow")) + } + + test("CREATE FLOW AS AUTO CDC INTO - two-part target table name") { + val plan = parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO myschema.mytable + |FROM source + |KEYS (k) + |SEQUENCE BY ts""".stripMargin) + + val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] + assert(cdc.targetTable.database == Some("myschema")) + assert(cdc.targetTable.table == "mytable") + } + + 
test("CREATE FLOW AS AUTO CDC INTO - APPLY AS DELETE WHEN") { + val plan = parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |APPLY AS DELETE WHEN op = 'DELETE' + |SEQUENCE BY ts""".stripMargin) + + val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] + assert(cdc.deleteCondition.isDefined) + assert(cdc.deleteCondition.get.sql.contains("op")) + } + + test("CREATE FLOW AS AUTO CDC INTO - COLUMNS include list") { + val plan = parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |COLUMNS id, name, value""".stripMargin) + + val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] + assert(cdc.specifiedCols.map(_.name) == Seq("id", "name", "value")) + assert(cdc.exceptCols.isEmpty) + } + + test("CREATE FLOW AS AUTO CDC INTO - COLUMNS * EXCEPT list") { + val plan = parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |COLUMNS * EXCEPT (op, ts)""".stripMargin) + + val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] + assert(cdc.specifiedCols.isEmpty) + assert(cdc.exceptCols.map(_.name) == Seq("op", "ts")) + } + + test("CREATE FLOW AS AUTO CDC INTO - all clauses combined") { + val plan = parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (key1, key2) + |APPLY AS DELETE WHEN key3 = 3 + |SEQUENCE BY timestamp + |COLUMNS key1, key2, key3, timestamp""".stripMargin) + + val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] + assert(cdc.keys.map(_.name) == Seq("key1", "key2")) + assert(cdc.deleteCondition.isDefined) + assert(cdc.sequenceByExpr == UnresolvedAttribute("timestamp")) + assert(cdc.specifiedCols.map(_.name) == Seq("key1", "key2", "key3", "timestamp")) + } + + // 
--------------------------------------------------------------------------- + // CREATE [OR REFRESH] STREAMING TABLE ... FLOW AUTO CDC + // --------------------------------------------------------------------------- + + test("CREATE STREAMING TABLE FLOW AUTO CDC - minimal form") { + val plan = parser.parsePlan( + """CREATE STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (key1, key2) + |SEQUENCE BY timestamp""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("target")) + assert(!cmd.orRefresh) + assert(!cmd.ifNotExists) + assert(cmd.sourceTable.isInstanceOf[UnresolvedRelation]) + assert(cmd.keys.map(_.name) == Seq("key1", "key2")) + assert(cmd.deleteCondition.isEmpty) + assert(cmd.sequenceByExpr == UnresolvedAttribute("timestamp")) + assert(cmd.specifiedCols.isEmpty) + assert(cmd.exceptCols.isEmpty) + } + + test("CREATE OR REFRESH STREAMING TABLE FLOW AUTO CDC") { + val plan = parser.parsePlan( + """CREATE OR REFRESH STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + assert(cmd.orRefresh) + } + + test("CREATE STREAMING TABLE IF NOT EXISTS FLOW AUTO CDC") { + val plan = parser.parsePlan( + """CREATE STREAMING TABLE IF NOT EXISTS target + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + assert(cmd.ifNotExists) + assert(!cmd.orRefresh) + } + + test("CREATE STREAMING TABLE FLOW AUTO CDC - multipart table name") { + val plan = parser.parsePlan( + """CREATE STREAMING TABLE myschema.mytable + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("myschema", "mytable")) + } + + test("CREATE STREAMING TABLE FLOW 
AUTO CDC - APPLY AS DELETE WHEN") { + val plan = parser.parsePlan( + """CREATE STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |APPLY AS DELETE WHEN op = 'DELETE' + |SEQUENCE BY ts""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + assert(cmd.deleteCondition.isDefined) + assert(cmd.deleteCondition.get.sql.contains("op")) + } + + test("CREATE STREAMING TABLE FLOW AUTO CDC - COLUMNS include list") { + val plan = parser.parsePlan( + """CREATE STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |COLUMNS id, name, value""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + assert(cmd.specifiedCols.map(_.name) == Seq("id", "name", "value")) + assert(cmd.exceptCols.isEmpty) + } + + test("CREATE STREAMING TABLE FLOW AUTO CDC - COLUMNS * EXCEPT list") { + val plan = parser.parsePlan( + """CREATE STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |COLUMNS * EXCEPT (op, ts)""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + assert(cmd.specifiedCols.isEmpty) + assert(cmd.exceptCols.map(_.name) == Seq("op", "ts")) + } + + test("CREATE STREAMING TABLE FLOW AUTO CDC - all clauses combined") { + val plan = parser.parsePlan( + """CREATE OR REFRESH STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (key1, key2) + |APPLY AS DELETE WHEN key3 = 3 + |SEQUENCE BY timestamp + |COLUMNS * EXCEPT (key4)""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + assert(cmd.orRefresh) + assert(cmd.keys.map(_.name) == Seq("key1", "key2")) + assert(cmd.deleteCondition.isDefined) + assert(cmd.sequenceByExpr == UnresolvedAttribute("timestamp")) + assert(cmd.exceptCols.map(_.name) == Seq("key4")) + } + + // --------------------------------------------------------------------------- + // Error cases: missing required clause + // 
--------------------------------------------------------------------------- + + test("CREATE FLOW AS AUTO CDC INTO - SEQUENCE BY is required") { + val e = intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id)""".stripMargin) + } + assert(e.getCondition == "MISSING_CLAUSES_FOR_OPERATION") + assert(e.getMessageParameters.get("clauses") == "SEQUENCE BY") + assert(e.getMessageParameters.get("operation") == "AUTO CDC INTO") + } + + test("CREATE STREAMING TABLE FLOW AUTO CDC - SEQUENCE BY is required") { + intercept[ParseException] { + parser.parsePlan( + """CREATE STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (id)""".stripMargin) + } + } + + // --------------------------------------------------------------------------- + // Error cases: duplicate clauses + // --------------------------------------------------------------------------- + + test("duplicate SEQUENCE BY clause") { + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts1 + |SEQUENCE BY ts2""".stripMargin) + } + } + + test("duplicate APPLY AS DELETE clause") { + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |APPLY AS DELETE WHEN a = 1 + |APPLY AS DELETE WHEN b = 2 + |SEQUENCE BY ts""".stripMargin) + } + } + + test("duplicate COLUMNS clause") { + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |COLUMNS a, b + |COLUMNS c, d""".stripMargin) + } + } + + // --------------------------------------------------------------------------- + // Error cases: standalone form not supported + // --------------------------------------------------------------------------- + + test("standalone AUTO CDC INTO is not supported") { + intercept[ParseException] { + parser.parsePlan( + """AUTO CDC INTO target + 
|FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + } + } + + // --------------------------------------------------------------------------- + // Error cases: deprecated / unsupported syntax + // --------------------------------------------------------------------------- + + test("APPLY AS TRUNCATE WHEN is not supported") { + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |APPLY AS TRUNCATE WHEN op = 'TRUNCATE' + |SEQUENCE BY ts""".stripMargin) + } + } + + test("IGNORE NULL UPDATES is not supported") { + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |IGNORE NULL UPDATES + |SEQUENCE BY ts""".stripMargin) + } + } + + test("STORED AS SCD TYPE 2 is not supported") { + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |STORED AS SCD TYPE 2""".stripMargin) + } + } + + test("TRACK HISTORY ON is not supported") { + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |TRACK HISTORY ON value1, value2""".stripMargin) + } + } +} From 5939533cc6146d8c61205b909ba5eedd0581f9d3 Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Wed, 10 Jun 2026 03:48:56 +0000 Subject: [PATCH 02/17] [CONNECT][SDP][SPARK-56249] Fix lines exceeding 100 char limit in AutoCdcParserSuite Co-authored-by: Isaac --- .../sql/execution/command/v2/AutoCdcParserSuite.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala index 3b2a91744ad22..98906b996d04d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.execution.command.v2 -import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAttribute, UnresolvedIdentifier, UnresolvedRelation} +import org.apache.spark.sql.catalyst.analysis.{ + AnalysisTest, UnresolvedAttribute, + UnresolvedIdentifier, UnresolvedRelation} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{ AutoCdcIntoCommand, @@ -84,7 +86,8 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |SEQUENCE BY ts""".stripMargin) val cmd = plan.asInstanceOf[CreateFlowCommand] - assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("mycat", "myschema", "myflow")) + assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == + Seq("mycat", "myschema", "myflow")) } test("CREATE FLOW AS AUTO CDC INTO - two-part target table name") { From 22ec1cebd2fca44a0f17450c08374d8def90369c Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Wed, 10 Jun 2026 04:07:05 +0000 Subject: [PATCH 03/17] [CONNECT][SDP][SPARK-56249] Add AUTO CDC keywords to ansiNonReserved and ANSI compliance doc Add APPLY, AUTO, CDC, and SEQUENCE as non-reserved keywords in both ANSI and default modes, and document them in sql-ref-ansi-compliance.md. Co-authored-by: Isaac --- docs/sql-ref-ansi-compliance.md | 4 ++++ .../org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index d28f0af5dd0af..4704bef4c6afb 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -423,6 +423,7 @@ Below is a list of all the keywords in Spark SQL. 
|ANTI|non-reserved|strict-non-reserved|non-reserved| |ANY|reserved|non-reserved|reserved| |ANY_VALUE|non-reserved|non-reserved|non-reserved| +|APPLY|non-reserved|non-reserved|non-reserved| |APPROX|non-reserved|non-reserved|non-reserved| |ARCHIVE|non-reserved|non-reserved|non-reserved| |ARRAY|non-reserved|non-reserved|reserved| @@ -430,6 +431,7 @@ Below is a list of all the keywords in Spark SQL. |ASC|non-reserved|non-reserved|non-reserved| |ASENSITIVE|non-reserved|non-reserved|non-reserved| |AT|non-reserved|non-reserved|reserved| +|AUTO|non-reserved|non-reserved|non-reserved| |ATOMIC|non-reserved|non-reserved|non-reserved| |AUTHORIZATION|reserved|non-reserved|reserved| |BEGIN|non-reserved|non-reserved|non-reserved| @@ -456,6 +458,7 @@ Below is a list of all the keywords in Spark SQL. |CAST|reserved|non-reserved|reserved| |CATALOG|non-reserved|non-reserved|non-reserved| |CATALOGS|non-reserved|non-reserved|non-reserved| +|CDC|non-reserved|non-reserved|non-reserved| |CHANGE|non-reserved|non-reserved|non-reserved| |CHANGES|non-reserved|non-reserved|non-reserved| |CHAR|non-reserved|non-reserved|reserved| @@ -744,6 +747,7 @@ Below is a list of all the keywords in Spark SQL. 
|SELECT|reserved|non-reserved|reserved| |SEMI|non-reserved|strict-non-reserved|non-reserved| |SEPARATED|non-reserved|non-reserved|non-reserved| +|SEQUENCE|non-reserved|non-reserved|non-reserved| |SERDE|non-reserved|non-reserved|non-reserved| |SERDEPROPERTIES|non-reserved|non-reserved|non-reserved| |SESSION_USER|reserved|non-reserved|reserved| diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index bf33d18c6a424..61ebc630329eb 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -1995,12 +1995,14 @@ ansiNonReserved | ANALYZE | ANTI | ANY_VALUE + | APPLY | APPROX | ARCHIVE | ARRAY | ASC | ASENSITIVE | AT + | AUTO | ATOMIC | BEGIN | BERNOULLI @@ -2023,6 +2025,7 @@ ansiNonReserved | CASCADE | CATALOG | CATALOGS + | CDC | CHANGE | CHANGES | CHAR @@ -2254,6 +2257,7 @@ ansiNonReserved | SECURITY | SEMI | SEPARATED + | SEQUENCE | SERDE | SERDEPROPERTIES | SET From f98202fe33f9e8475abd5ffe63c1f8122eae02ec Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Wed, 10 Jun 2026 06:20:51 +0000 Subject: [PATCH 04/17] [CONNECT][SDP][SPARK-56249] Update keyword golden files and hardcoded keyword lists Regenerate SQL golden files and add APPLY, AUTO, CDC, SEQUENCE to the hardcoded keyword lists in ThriftServer and SparkConnect tests. 
Co-authored-by: Isaac --- .../client/jdbc/SparkConnectDatabaseMetaDataSuite.scala | 2 +- .../resources/sql-tests/results/keywords-enforced.sql.out | 4 ++++ .../src/test/resources/sql-tests/results/keywords.sql.out | 4 ++++ .../test/resources/sql-tests/results/nonansi/keywords.sql.out | 4 ++++ .../hive/thriftserver/ThriftServerWithSparkContextSuite.scala | 2 +- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/sql/connect/client/jdbc/src/test/scala/org/apache/spark/sql/connect/client/jdbc/SparkConnectDatabaseMetaDataSuite.scala b/sql/connect/client/jdbc/src/test/scala/org/apache/spark/sql/connect/client/jdbc/SparkConnectDatabaseMetaDataSuite.scala index 49c0dccc780c5..61424bd9a1955 100644 --- a/sql/connect/client/jdbc/src/test/scala/org/apache/spark/sql/connect/client/jdbc/SparkConnectDatabaseMetaDataSuite.scala +++ b/sql/connect/client/jdbc/src/test/scala/org/apache/spark/sql/connect/client/jdbc/SparkConnectDatabaseMetaDataSuite.scala @@ -210,7 +210,7 @@ class SparkConnectDatabaseMetaDataSuite extends ConnectFunSuite with RemoteSpark val metadata = conn.getMetaData // scalastyle:off line.size.limit // CURRENT_PATH and SYSTEM are excluded: getSQLKeywords drops SQL:2003 reserved words (see companion). 
- assert(metadata.getSQLKeywords === "ADD,AFTER,AGGREGATE,ALIGN,ALWAYS,ANALYZE,ANTI,ANY_VALUE,APPROX,ARCHIVE,ASC,BERNOULLI,BIN,BINDING,BIN_DISTRIBUTE_RATIO,BIN_END,BIN_START,BUCKET,BUCKETS,BYTE,CACHE,CASCADE,CATALOG,CATALOGS,CHANGE,CHANGES,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATION,COLLATIONS,COLLECTION,COLUMNS,COMMENT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONTAINS,CONTINUE,COST,CURRENT_DATABASE,CURRENT_SCHEMA,DATA,DATABASE,DATABASES,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAYOFYEAR,DAYS,DBPROPERTIES,DEFAULT_PATH,DEFINED,DEFINER,DELAY,DELIMITED,DESC,DFS,DIRECTORIES,DIRECTORY,DISTANCE,DISTRIBUTE,DIV,DO,ELSEIF,ENFORCED,ESCAPED,EVOLUTION,EXACT,EXCHANGE,EXCLUDE,EXCLUSIVE,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,FIELDS,FILEFORMAT,FIRST,FLOW,FOLLOWING,FORMAT,FORMATTED,FOUND,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,HANDLER,HOURS,IDENTIFIED,IDENTIFIER,IF,IGNORE,ILIKE,IMMEDIATE,INCLUDE,INCLUSIVE,INCREMENT,INDEX,INDEXES,INPATH,INPUT,INPUTFORMAT,INVOKER,ITEMS,ITERATE,JSON,KEY,KEYS,LAST,LAZY,LEAVE,LEVEL,LIMIT,LINES,LIST,LOAD,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MEASURE,METRICS,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTES,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NEAREST,NORELY,NULLS,OFFSET,OPTION,OPTIONS,OUTPUTFORMAT,OVERWRITE,PARTITIONED,PARTITIONS,PATH,PERCENT,PIVOT,PLACING,PRECEDING,PRINCIPALS,PROCEDURES,PROPERTIES,PURGE,QUALIFY,QUARTER,QUERY,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,REDUCE,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,ROLE,ROLES,SCHEMA,SCHEMAS,SECONDS,SECURITY,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SETS,SHORT,SHOW,SIMILARITY,SINGLE,SKEWED,SORT,SORTED,SOURCE,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SYNC,SYSTEM_PATH,SYSTEM_TIME,SYSTEM_VERSION,TABLES,TARGET,TBLPROPERTIES,TERMINATED,TIMEDIFF,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TOUCH,TRANSACTION,TRANSACTIONS,TRANSFORM,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,
UNCACHE,UNIFORM,UNLOCK,UNPIVOT,UNSET,UNTIL,USE,VAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHILE,WIDTH,X,YEARS,ZONE") + assert(metadata.getSQLKeywords === "ADD,AFTER,AGGREGATE,ALIGN,ALWAYS,ANALYZE,ANTI,ANY_VALUE,APPLY,APPROX,ARCHIVE,ASC,AUTO,BERNOULLI,BIN,BINDING,BIN_DISTRIBUTE_RATIO,BIN_END,BIN_START,BUCKET,BUCKETS,BYTE,CACHE,CASCADE,CATALOG,CATALOGS,CDC,CHANGE,CHANGES,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATION,COLLATIONS,COLLECTION,COLUMNS,COMMENT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONTAINS,CONTINUE,COST,CURRENT_DATABASE,CURRENT_SCHEMA,DATA,DATABASE,DATABASES,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAYOFYEAR,DAYS,DBPROPERTIES,DEFAULT_PATH,DEFINED,DEFINER,DELAY,DELIMITED,DESC,DFS,DIRECTORIES,DIRECTORY,DISTANCE,DISTRIBUTE,DIV,DO,ELSEIF,ENFORCED,ESCAPED,EVOLUTION,EXACT,EXCHANGE,EXCLUDE,EXCLUSIVE,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,FIELDS,FILEFORMAT,FIRST,FLOW,FOLLOWING,FORMAT,FORMATTED,FOUND,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,HANDLER,HOURS,IDENTIFIED,IDENTIFIER,IF,IGNORE,ILIKE,IMMEDIATE,INCLUDE,INCLUSIVE,INCREMENT,INDEX,INDEXES,INPATH,INPUT,INPUTFORMAT,INVOKER,ITEMS,ITERATE,JSON,KEY,KEYS,LAST,LAZY,LEAVE,LEVEL,LIMIT,LINES,LIST,LOAD,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MEASURE,METRICS,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTES,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NEAREST,NORELY,NULLS,OFFSET,OPTION,OPTIONS,OUTPUTFORMAT,OVERWRITE,PARTITIONED,PARTITIONS,PATH,PERCENT,PIVOT,PLACING,PRECEDING,PRINCIPALS,PROCEDURES,PROPERTIES,PURGE,QUALIFY,QUARTER,QUERY,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,REDUCE,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,ROLE,ROLES,SCHEMA,SCHEMAS,SECONDS,SECURITY,SEMI,SEPARATED,SEQUENCE,SERDE,SERDEPROPERTIES,SETS,SHORT,SHOW,SIMILARITY,SINGLE,SKEWED,SORT,SORTED,SOURCE,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SYNC,SYSTEM_PATH,SYSTEM_TIME,SYSTEM_VERSION,TABLES,TARGET,TBLPROPERTIES,TER
MINATED,TIMEDIFF,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TOUCH,TRANSACTION,TRANSACTIONS,TRANSFORM,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNIFORM,UNLOCK,UNPIVOT,UNSET,UNTIL,USE,VAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHILE,WIDTH,X,YEARS,ZONE") // scalastyle:on line.size.limit } } diff --git a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out index cbee1375aba83..0c57178048ebe 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords-enforced.sql.out @@ -16,6 +16,7 @@ AND true ANTI false ANY true ANY_VALUE false +APPLY false APPROX false ARCHIVE false ARRAY false @@ -25,6 +26,7 @@ ASENSITIVE false AT false ATOMIC false AUTHORIZATION true +AUTO false BEGIN false BERNOULLI false BETWEEN false @@ -49,6 +51,7 @@ CASE true CAST true CATALOG false CATALOGS false +CDC false CHANGE false CHANGES false CHAR false @@ -335,6 +338,7 @@ SECURITY false SELECT true SEMI false SEPARATED false +SEQUENCE false SERDE false SERDEPROPERTIES false SESSION_USER true diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out index dfdbc5fccb56b..e0528a75c6ef5 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out @@ -16,6 +16,7 @@ AND false ANTI false ANY false ANY_VALUE false +APPLY false APPROX false ARCHIVE false ARRAY false @@ -25,6 +26,7 @@ ASENSITIVE false AT false ATOMIC false AUTHORIZATION false +AUTO false BEGIN false BERNOULLI false BETWEEN false @@ -49,6 +51,7 @@ CASE false CAST false CATALOG false CATALOGS false +CDC false CHANGE false CHANGES false CHAR false @@ -335,6 +338,7 @@ SECURITY false SELECT false SEMI false SEPARATED false +SEQUENCE false SERDE false 
SERDEPROPERTIES false SESSION_USER false diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out index dfdbc5fccb56b..e0528a75c6ef5 100644 --- a/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/nonansi/keywords.sql.out @@ -16,6 +16,7 @@ AND false ANTI false ANY false ANY_VALUE false +APPLY false APPROX false ARCHIVE false ARRAY false @@ -25,6 +26,7 @@ ASENSITIVE false AT false ATOMIC false AUTHORIZATION false +AUTO false BEGIN false BERNOULLI false BETWEEN false @@ -49,6 +51,7 @@ CASE false CAST false CATALOG false CATALOGS false +CDC false CHANGE false CHANGES false CHAR false @@ -335,6 +338,7 @@ SECURITY false SELECT false SEMI false SEPARATED false +SEQUENCE false SERDE false SERDEPROPERTIES false SESSION_USER false diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala index 276a266f5d16c..4e8f117dc8a5c 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala @@ -214,7 +214,7 @@ trait ThriftServerWithSparkContextSuite extends SharedThriftServer { val sessionHandle = client.openSession(user, "") val infoValue = client.getInfo(sessionHandle, GetInfoType.CLI_ODBC_KEYWORDS) // scalastyle:off line.size.limit - assert(infoValue.getStringValue == 
"ADD,AFTER,AGGREGATE,ALIGN,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,APPROX,ARCHIVE,ARRAY,AS,ASC,ASENSITIVE,AT,ATOMIC,AUTHORIZATION,BEGIN,BERNOULLI,BETWEEN,BIGINT,BIN,BINARY,BINDING,BIN_DISTRIBUTE_RATIO,BIN_END,BIN_START,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALL,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHANGES,CHAR,CHARACTER,CHECK,CLEAR,CLOSE,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLATIONS,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONDITION,CONSTRAINT,CONTAINS,CONTINUE,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATABASE,CURRENT_DATE,CURRENT_PATH,CURRENT_SCHEMA,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,CURSOR,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFAULT_PATH,DEFINED,DEFINER,DELAY,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTANCE,DISTINCT,DISTRIBUTE,DIV,DO,DOUBLE,DROP,ELSE,ELSEIF,END,ENFORCED,ESCAPE,ESCAPED,EVOLUTION,EXACT,EXCEPT,EXCHANGE,EXCLUDE,EXCLUSIVE,EXECUTE,EXISTS,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FLOW,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FOUND,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,GLOBAL,GRANT,GROUP,GROUPING,HANDLER,HAVING,HOUR,HOURS,IDENTIFIED,IDENTIFIER,IDENTITY,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INCLUSIVE,INCREMENT,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSENSITIVE,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,ITERATE,JOIN,JSON,KEY,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEAVE,LEFT,LEVEL,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCALTIME,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MAX,MEASURE,MERGE,METRICS,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NEAREST,NEXT,NO,NONE,NORELY,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,ON,ONLY,OPEN,OP
TION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PATH,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROCEDURE,PROCEDURES,PROPERTIES,PURGE,QUALIFY,QUARTER,QUERY,RANGE,READ,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,RECURSIVE,REDUCE,REFERENCES,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SIMILARITY,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,SQLEXCEPTION,SQLSTATE,START,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM,SYSTEM_PATH,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNIFORM,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UNTIL,UPDATE,USE,USER,USING,VALUE,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHEN,WHERE,WHILE,WIDTH,WINDOW,WITH,WITHIN,WITHOUT,X,YEAR,YEARS,ZONE") + assert(infoValue.getStringValue == 
"ADD,AFTER,AGGREGATE,ALIGN,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,APPLY,APPROX,ARCHIVE,ARRAY,AS,ASC,ASENSITIVE,AT,ATOMIC,AUTHORIZATION,AUTO,BEGIN,BERNOULLI,BETWEEN,BIGINT,BIN,BINARY,BINDING,BIN_DISTRIBUTE_RATIO,BIN_END,BIN_START,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CALL,CALLED,CASCADE,CASE,CAST,CATALOG,CATALOGS,CDC,CHANGE,CHANGES,CHAR,CHARACTER,CHECK,CLEAR,CLOSE,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLATION,COLLATIONS,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPENSATION,COMPUTE,CONCATENATE,CONDITION,CONSTRAINT,CONTAINS,CONTINUE,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATABASE,CURRENT_DATE,CURRENT_PATH,CURRENT_SCHEMA,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,CURSOR,DATA,DATABASE,DATABASES,DATE,DATEADD,DATEDIFF,DATE_ADD,DATE_DIFF,DAY,DAYOFYEAR,DAYS,DBPROPERTIES,DEC,DECIMAL,DECLARE,DEFAULT,DEFAULT_PATH,DEFINED,DEFINER,DELAY,DELETE,DELIMITED,DESC,DESCRIBE,DETERMINISTIC,DFS,DIRECTORIES,DIRECTORY,DISTANCE,DISTINCT,DISTRIBUTE,DIV,DO,DOUBLE,DROP,ELSE,ELSEIF,END,ENFORCED,ESCAPE,ESCAPED,EVOLUTION,EXACT,EXCEPT,EXCHANGE,EXCLUDE,EXCLUSIVE,EXECUTE,EXISTS,EXIT,EXPLAIN,EXPORT,EXTEND,EXTENDED,EXTERNAL,EXTRACT,FALSE,FETCH,FIELDS,FILEFORMAT,FILTER,FIRST,FLOAT,FLOW,FOLLOWING,FOR,FOREIGN,FORMAT,FORMATTED,FOUND,FROM,FULL,FUNCTION,FUNCTIONS,GENERATED,GEOGRAPHY,GEOMETRY,GLOBAL,GRANT,GROUP,GROUPING,HANDLER,HAVING,HOUR,HOURS,IDENTIFIED,IDENTIFIER,IDENTITY,IF,IGNORE,ILIKE,IMMEDIATE,IMPORT,IN,INCLUDE,INCLUSIVE,INCREMENT,INDEX,INDEXES,INNER,INPATH,INPUT,INPUTFORMAT,INSENSITIVE,INSERT,INT,INTEGER,INTERSECT,INTERVAL,INTO,INVOKER,IS,ITEMS,ITERATE,JOIN,JSON,KEY,KEYS,LANGUAGE,LAST,LATERAL,LAZY,LEADING,LEAVE,LEFT,LEVEL,LIKE,LIMIT,LINES,LIST,LOAD,LOCAL,LOCALTIME,LOCATION,LOCK,LOCKS,LOGICAL,LONG,LOOP,MACRO,MAP,MATCHED,MATERIALIZED,MAX,MEASURE,MERGE,METRICS,MICROSECOND,MICROSECONDS,MILLISECOND,MILLISECONDS,MINUS,MINUTE,MINUTES,MODIFIES,MONTH,MONTHS,MSCK,NAME,NAMESPACE,NAMESPACES,NANOSECOND,NANOSECONDS,NATURAL,NEAREST,NEXT,NO,NONE,NORELY,NOT,NULL,NULLS,NUMERIC,OF,OFFSET,
ON,ONLY,OPEN,OPTION,OPTIONS,OR,ORDER,OUT,OUTER,OUTPUTFORMAT,OVER,OVERLAPS,OVERLAY,OVERWRITE,PARTITION,PARTITIONED,PARTITIONS,PATH,PERCENT,PIVOT,PLACING,POSITION,PRECEDING,PRIMARY,PRINCIPALS,PROCEDURE,PROCEDURES,PROPERTIES,PURGE,QUALIFY,QUARTER,QUERY,RANGE,READ,READS,REAL,RECORDREADER,RECORDWRITER,RECOVER,RECURSION,RECURSIVE,REDUCE,REFERENCES,REFRESH,RELY,RENAME,REPAIR,REPEAT,REPEATABLE,REPLACE,RESET,RESPECT,RESTRICT,RETURN,RETURNS,REVOKE,RIGHT,ROLE,ROLES,ROLLBACK,ROLLUP,ROW,ROWS,SCHEMA,SCHEMAS,SECOND,SECONDS,SECURITY,SELECT,SEMI,SEPARATED,SEQUENCE,SERDE,SERDEPROPERTIES,SESSION_USER,SET,SETS,SHORT,SHOW,SIMILARITY,SINGLE,SKEWED,SMALLINT,SOME,SORT,SORTED,SOURCE,SPECIFIC,SQL,SQLEXCEPTION,SQLSTATE,START,STATISTICS,STORED,STRATIFY,STREAM,STREAMING,STRING,STRUCT,SUBSTR,SUBSTRING,SYNC,SYSTEM,SYSTEM_PATH,SYSTEM_TIME,SYSTEM_VERSION,TABLE,TABLES,TABLESAMPLE,TARGET,TBLPROPERTIES,TERMINATED,THEN,TIME,TIMEDIFF,TIMESTAMP,TIMESTAMPADD,TIMESTAMPDIFF,TIMESTAMP_LTZ,TIMESTAMP_NTZ,TINYINT,TO,TOUCH,TRAILING,TRANSACTION,TRANSACTIONS,TRANSFORM,TRIM,TRUE,TRUNCATE,TRY_CAST,TYPE,UNARCHIVE,UNBOUNDED,UNCACHE,UNIFORM,UNION,UNIQUE,UNKNOWN,UNLOCK,UNPIVOT,UNSET,UNTIL,UPDATE,USE,USER,USING,VALUE,VALUES,VAR,VARCHAR,VARIABLE,VARIANT,VERSION,VIEW,VIEWS,VOID,WATERMARK,WEEK,WEEKS,WHEN,WHERE,WHILE,WIDTH,WINDOW,WITH,WITHIN,WITHOUT,X,YEAR,YEARS,ZONE") // scalastyle:on line.size.limit } } From efeaa9b7d9c0f6d5fae7b79ae05748ee0e0d955d Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Wed, 10 Jun 2026 14:54:39 +0000 Subject: [PATCH 05/17] trigger CI From 9b639926763413466fd10c1630399e6b3ac6bb0c Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Wed, 10 Jun 2026 20:53:06 +0000 Subject: [PATCH 06/17] [CONNECT][SDP][SPARK-56249] Remove unintended CREATE OR REFRESH syntax from AUTO CDC The createPipelineDatasetHeader grammar rule inadvertently allowed CREATE OR REFRESH for streaming tables and materialized views. This was not intended for the AUTO CDC syntax. Remove the optional (OR REFRESH)? 
from the grammar and the corresponding orRefresh field from CreateStreamingTableAutoCdc. Co-authored-by: Isaac --- .../sql/catalyst/parser/SqlBaseParser.g4 | 2 +- .../catalyst/plans/logical/v2Commands.scala | 4 +--- .../spark/sql/execution/SparkSqlParser.scala | 2 -- .../command/v2/AutoCdcParserSuite.scala | 21 +++---------------- 4 files changed, 5 insertions(+), 24 deletions(-) diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 61ebc630329eb..e709b071f500e 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -433,7 +433,7 @@ streamingTable ; createPipelineDatasetHeader - : CREATE (OR REFRESH)? + : CREATE (materializedView | streamingTable) (IF errorCapturingNot EXISTS)? identifierReference diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index cb6bf0434a1cb..f01eb6e7a0d58 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -825,7 +825,7 @@ case class CreateStreamingTable( } /** - * Command parsed from `CREATE [OR REFRESH] STREAMING TABLE FLOW AUTO CDC ...` SQL syntax. + * Command parsed from `CREATE STREAMING TABLE FLOW AUTO CDC ...` SQL syntax. * This command serves as a parse-time placeholder for a pipeline CDC definition and cannot be * executed directly. It is interpreted by the pipeline submodule during a pipeline execution. * @@ -836,7 +836,6 @@ case class CreateStreamingTable( * @param partitioning Column-based partitioning for the streaming table. * @param tableSpec Additional table specs. 
* @param ifNotExists Whether the table should only be created if it doesn't already exist. - * @param orRefresh Whether the statement is `CREATE OR REFRESH` (vs plain `CREATE`). * @param sourceTable The source relation providing the change events. * @param keys Column(s) that uniquely identify a row in the target table. * @param deleteCondition An optional expression that marks a source row as a DELETE operation. @@ -851,7 +850,6 @@ case class CreateStreamingTableAutoCdc( partitioning: Seq[Transform], tableSpec: TableSpecBase, ifNotExists: Boolean, - orRefresh: Boolean, sourceTable: LogicalPlan, keys: Seq[UnresolvedAttribute], deleteCondition: Option[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 71864a02f66ba..a2527fa0138ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -1593,7 +1593,6 @@ class SparkSqlAstBuilder extends AstBuilder { "AUTO CDC is only supported for STREAMING TABLE, not MATERIALIZED VIEW.", ctx) } - val orRefresh = headerCtx.REFRESH() != null val ifNotExists = headerCtx.EXISTS() != null val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) val (colDefs, colConstraints) = Option(ctx.tableElementList()).map(visitTableElementList) @@ -1655,7 +1654,6 @@ class SparkSqlAstBuilder extends AstBuilder { partitioning = partitioning, tableSpec = spec, ifNotExists = ifNotExists, - orRefresh = orRefresh, sourceTable = src, keys = keys, deleteCondition = delete, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala index 98906b996d04d..a6f0199ebc71e 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.execution.SparkSqlParser * * Covers two supported forms: * 1. CREATE FLOW [COMMENT ...] AS AUTO CDC INTO ... - * 2. CREATE [OR REFRESH] STREAMING TABLE FLOW AUTO CDC ... + * 2. CREATE STREAMING TABLE FLOW AUTO CDC ... * * Snapshot CDC, SCD Type 2, IGNORE NULL UPDATES, and APPLY AS TRUNCATE WHEN are not * supported and should fail to parse. The standalone AUTO CDC INTO form (without CREATE FLOW @@ -158,7 +158,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { } // --------------------------------------------------------------------------- - // CREATE [OR REFRESH] STREAMING TABLE ... FLOW AUTO CDC + // CREATE STREAMING TABLE ... FLOW AUTO CDC // --------------------------------------------------------------------------- test("CREATE STREAMING TABLE FLOW AUTO CDC - minimal form") { @@ -171,7 +171,6 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("target")) - assert(!cmd.orRefresh) assert(!cmd.ifNotExists) assert(cmd.sourceTable.isInstanceOf[UnresolvedRelation]) assert(cmd.keys.map(_.name) == Seq("key1", "key2")) @@ -181,18 +180,6 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cmd.exceptCols.isEmpty) } - test("CREATE OR REFRESH STREAMING TABLE FLOW AUTO CDC") { - val plan = parser.parsePlan( - """CREATE OR REFRESH STREAMING TABLE target - |FLOW AUTO CDC - |FROM source - |KEYS (id) - |SEQUENCE BY ts""".stripMargin) - - val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] - assert(cmd.orRefresh) - } - test("CREATE STREAMING TABLE IF NOT EXISTS FLOW AUTO CDC") { val plan = parser.parsePlan( """CREATE STREAMING TABLE IF NOT EXISTS 
target @@ -203,7 +190,6 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] assert(cmd.ifNotExists) - assert(!cmd.orRefresh) } test("CREATE STREAMING TABLE FLOW AUTO CDC - multipart table name") { @@ -262,7 +248,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { test("CREATE STREAMING TABLE FLOW AUTO CDC - all clauses combined") { val plan = parser.parsePlan( - """CREATE OR REFRESH STREAMING TABLE target + """CREATE STREAMING TABLE target |FLOW AUTO CDC |FROM source |KEYS (key1, key2) @@ -271,7 +257,6 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |COLUMNS * EXCEPT (key4)""".stripMargin) val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] - assert(cmd.orRefresh) assert(cmd.keys.map(_.name) == Seq("key1", "key2")) assert(cmd.deleteCondition.isDefined) assert(cmd.sequenceByExpr == UnresolvedAttribute("timestamp")) From 61fefeaf794ed76db679e7308d01f258977bd133 Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Fri, 12 Jun 2026 01:50:32 +0000 Subject: [PATCH 07/17] [CONNECT][SDP][SPARK-56249] Address review feedback for AUTO CDC SQL parsing - Restrict grammar source to relationPrimary to reject JOINs/PIVOTs - Mark source relation as streaming (isStreaming = true) for consistency with Connect/Python path - Use AUTOCDC_MISSING_SEQUENCE_BY and AUTOCDC_BOTH_COLUMN_LIST_AND_EXCEPT_COLUMN_LIST error conditions for consistency across front-ends - Fix keyword ordering: AUTO after AUTHORIZATION, SEQUENCE after SEPARATED in lexer, parser rules, and ANSI compliance doc - Extract parsePipelineDatasetPrelude helper to deduplicate visitCreateStreamingTableAutoCdc and visitCreatePipelineDataset - Soften execution doc wording for CreateStreamingTableAutoCdc and AutoCdcIntoCommand - Add tests for MV rejection, column constraints, bucketing, options, serde, location, and assert error conditions on all error cases Co-authored-by: Isaac --- 
docs/sql-ref-ansi-compliance.md | 2 +- .../spark/sql/catalyst/parser/SqlBaseLexer.g4 | 4 +- .../sql/catalyst/parser/SqlBaseParser.g4 | 8 +- .../sql/catalyst/parser/AstBuilder.scala | 17 +- .../plans/logical/AutoCdcIntoCommand.scala | 5 +- .../catalyst/plans/logical/v2Commands.scala | 3 +- .../spark/sql/execution/SparkSqlParser.scala | 159 +++++++++--------- .../command/v2/AutoCdcParserSuite.scala | 126 ++++++++++++-- 8 files changed, 215 insertions(+), 109 deletions(-) diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 4704bef4c6afb..c3eec6cc5ce3e 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -431,9 +431,9 @@ Below is a list of all the keywords in Spark SQL. |ASC|non-reserved|non-reserved|non-reserved| |ASENSITIVE|non-reserved|non-reserved|non-reserved| |AT|non-reserved|non-reserved|reserved| -|AUTO|non-reserved|non-reserved|non-reserved| |ATOMIC|non-reserved|non-reserved|non-reserved| |AUTHORIZATION|reserved|non-reserved|reserved| +|AUTO|non-reserved|non-reserved|non-reserved| |BEGIN|non-reserved|non-reserved|non-reserved| |BERNOULLI|non-reserved|non-reserved|non-reserved| |BETWEEN|non-reserved|non-reserved|reserved| diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 index 86e50881958ae..5b97f32dbfcfc 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 @@ -143,7 +143,6 @@ ANY_VALUE: 'ANY_VALUE'; APPLY: 'APPLY'; APPROX: 'APPROX'; ARCHIVE: 'ARCHIVE'; -AUTO: 'AUTO'; ARRAY: 'ARRAY' {incComplexTypeLevelCounter();}; AS: 'AS'; ASC: 'ASC'; @@ -151,6 +150,7 @@ ASENSITIVE: 'ASENSITIVE'; AT: 'AT'; ATOMIC: 'ATOMIC'; AUTHORIZATION: 'AUTHORIZATION'; +AUTO: 'AUTO'; BEGIN: 'BEGIN'; BERNOULLI: 'BERNOULLI'; BETWEEN: 'BETWEEN'; @@ -457,12 +457,12 @@ ROWS: 'ROWS'; SECOND: 
'SECOND'; SECONDS: 'SECONDS'; SCHEMA: 'SCHEMA'; -SEQUENCE: 'SEQUENCE'; SCHEMAS: 'SCHEMAS'; SECURITY: 'SECURITY'; SELECT: 'SELECT'; SEMI: 'SEMI'; SEPARATED: 'SEPARATED'; +SEQUENCE: 'SEQUENCE'; SERDE: 'SERDE'; SERDEPROPERTIES: 'SERDEPROPERTIES'; SESSION_USER: 'SESSION_USER'; diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index e709b071f500e..9dd4e2e5ddf59 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -764,7 +764,7 @@ autoCdcBody ; autoCdcParameters - : FROM source=relation + : FROM source=relationPrimary KEYS LEFT_PAREN keys=multipartIdentifierList RIGHT_PAREN (autoCdcDeleteClause | autoCdcSequenceByClause @@ -2002,8 +2002,8 @@ ansiNonReserved | ASC | ASENSITIVE | AT - | AUTO | ATOMIC + | AUTO | BEGIN | BERNOULLI | BETWEEN @@ -2389,7 +2389,6 @@ nonReserved | APPLY | APPROX | ARCHIVE - | AUTO | ARRAY | AS | ASC @@ -2397,6 +2396,7 @@ nonReserved | AT | ATOMIC | AUTHORIZATION + | AUTO | BEGIN | BERNOULLI | BETWEEN @@ -2696,9 +2696,9 @@ nonReserved | SECOND | SECONDS | SECURITY - | SEQUENCE | SELECT | SEPARATED + | SEQUENCE | SERDE | SERDEPROPERTIES | SESSION_USER diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index bc5bbe9866927..7faebed615703 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1380,14 +1380,25 @@ class AstBuilder extends DataTypeAstBuilder withOrigin(params) { checkDuplicateClauses(params.autoCdcDeleteClause(), "APPLY AS DELETE", params) checkDuplicateClauses(params.autoCdcSequenceByClause(), "SEQUENCE BY", params) + val allColumnsClauses 
= params.autoCdcColumnsClause().asScala + if (allColumnsClauses.exists(_.columns != null) && + allColumnsClauses.exists(_.exceptCols != null)) { + throw new ParseException( + errorClass = "AUTOCDC_BOTH_COLUMN_LIST_AND_EXCEPT_COLUMN_LIST", + ctx = params) + } checkDuplicateClauses(params.autoCdcColumnsClause(), "COLUMNS", params) if (params.autoCdcSequenceByClause().isEmpty) { - throw QueryParsingErrors.missingClausesForOperation( - params, "SEQUENCE BY", "AUTO CDC INTO") + throw new ParseException( + errorClass = "AUTOCDC_MISSING_SEQUENCE_BY", + ctx = params) } - val sourceTable = plan(params.source.relationPrimary) + val sourceTable = plan(params.source) match { + case r: UnresolvedRelation => r.copy(isStreaming = true) + case other => other + } val keys = visitMultipartIdentifierList(params.keys) val deleteCondition = params.autoCdcDeleteClause().asScala.headOption .map(c => expression(c.deleteCondition)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala index 69aa28e153e97..f13635e6d2931 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala @@ -27,8 +27,9 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} * This represents a CDC (Change Data Capture) operation that applies an ordered change event * stream from [[sourceTable]] into [[targetTable]] using SCD Type 1 (upsert) semantics. * - * This node is a parse-time placeholder. It is only executable within a Declarative Pipelines - * pipeline context; attempting to execute it directly will fail at analysis time. + * This node serves as a parse-time placeholder for a pipeline CDC definition and cannot be + * executed directly. 
It will be interpreted by the pipeline submodule once execution support + * is added (SPARK-57402). * * @param targetTable The target table to apply changes into. * @param sourceTable The source relation providing the change events. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index f01eb6e7a0d58..3497586bcf7c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -827,7 +827,8 @@ case class CreateStreamingTable( /** * Command parsed from `CREATE STREAMING TABLE FLOW AUTO CDC ...` SQL syntax. * This command serves as a parse-time placeholder for a pipeline CDC definition and cannot be - * executed directly. It is interpreted by the pipeline submodule during a pipeline execution. + * executed directly. It will be interpreted by the pipeline submodule once execution support + * is added (SPARK-57402). * * The target of the CDC operation is the streaming table itself (given by [[name]]). 
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a2527fa0138ce..20ee24a6d11df 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -41,6 +41,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.catalyst.util.DateTimeConstants import org.apache.spark.sql.connector.catalog.CatalogManager +import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryParsingErrors} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources._ @@ -1584,48 +1585,69 @@ class SparkSqlAstBuilder extends AstBuilder { ) } - override def visitCreateStreamingTableAutoCdc( - ctx: CreateStreamingTableAutoCdcContext): LogicalPlan = withOrigin(ctx) { - val headerCtx = ctx.createPipelineDatasetHeader() - - if (headerCtx.materializedView() != null) { - throw operationNotAllowed( - "AUTO CDC is only supported for STREAMING TABLE, not MATERIALIZED VIEW.", ctx) - } - + /** + * Shared helper for pipeline dataset creation statements (CREATE STREAMING TABLE, + * CREATE MATERIALIZED VIEW, CREATE STREAMING TABLE ... FLOW AUTO CDC). + * + * Validates and extracts column definitions, partitioning, and table spec from the common + * grammar elements shared by these statements. 
+ * + * @return (colDefs, partitioning, spec, ifNotExists, tableIdent) + */ + private def parsePipelineDatasetPrelude( + syntaxTypeErrorStr: String, + headerCtx: CreatePipelineDatasetHeaderContext, + tableProviderCtx: TableProviderContext, + tableElementListCtx: TableElementListContext, + createTableClausesCtx: CreateTableClausesContext, + ctx: ParserRuleContext): ( + Seq[ColumnDefinition], + Seq[Transform], + TableSpec, + Boolean, + LogicalPlan) = { val ifNotExists = headerCtx.EXISTS() != null - val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) - val (colDefs, colConstraints) = Option(ctx.tableElementList()).map(visitTableElementList) + val provider = Option(tableProviderCtx).map(_.multipartIdentifier.getText) + val (colDefs, colConstraints) = Option(tableElementListCtx).map(visitTableElementList) .getOrElse((Nil, Nil)) if (colConstraints.nonEmpty) { - throw operationNotAllowed( - "Pipeline datasets do not currently support column constraints.", ctx) + throw operationNotAllowed("Pipeline datasets do not currently support column constraints. " + + "Please remove any CHECK, UNIQUE, PK, and FK constraints specified on the pipeline " + + "dataset.", ctx) } val (partTransforms, partCols, bucketSpec, properties, options, location, comment, collation, serdeInfoOpt, - clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) + clusterBySpec) = visitCreateTableClauses(createTableClausesCtx) val partitioning = partitionExpressions(partTransforms, partCols, ctx) ++ clusterBySpec.map(_.asTransform) + // Because the createTableClauses grammar is reused for pipeline datasets but pipeline + // datasets don't support bucketing, options, storage location, or Hive SerDe, validate they + // are not set. if (bucketSpec.isDefined) { - throw operationNotAllowed( - "Bucketing is not supported for CREATE STREAMING TABLE statements.", ctx) + throw operationNotAllowed(s"Bucketing is not supported for CREATE $syntaxTypeErrorStr " + + "statements. 
Please remove any bucket spec specified in the statement.", ctx) } if (options.options.nonEmpty) { - throw operationNotAllowed( - "Options are not supported for CREATE STREAMING TABLE statements.", ctx) - } - serdeInfoOpt.foreach { _ => - throw operationNotAllowed( - "Hive SerDe format options are not supported for CREATE STREAMING TABLE statements.", ctx) + throw operationNotAllowed(s"Options are not supported for CREATE $syntaxTypeErrorStr " + + "statements. Please remove any OPTIONS lists specified in the statement.", ctx) } + serdeInfoOpt.map(serdeInfo => if (serdeInfo.storedAs.nonEmpty) { + throw operationNotAllowed(s"The STORED AS syntax is not supported for CREATE " + + s"$syntaxTypeErrorStr statements. Consider using the Data Source based USING clause " + + "instead.", ctx) + } else { + throw operationNotAllowed(s"Hive SerDe format options are not supported for CREATE " + + s"$syntaxTypeErrorStr statements.", ctx) + }) if (location.nonEmpty) { - throw operationNotAllowed( - "Specifying location is not supported for CREATE STREAMING TABLE statements.", ctx) + throw operationNotAllowed(s"Specifying location is not supported for CREATE " + + s"$syntaxTypeErrorStr statements. 
The storage location for a pipeline dataset is " + + "managed by the pipeline itself.", ctx) } val spec = TableSpec( @@ -1645,6 +1667,27 @@ class SparkSqlAstBuilder extends AstBuilder { UnresolvedIdentifier(_) ) + (colDefs, partitioning, spec, ifNotExists, tableIdent) + } + + override def visitCreateStreamingTableAutoCdc( + ctx: CreateStreamingTableAutoCdcContext): LogicalPlan = withOrigin(ctx) { + val headerCtx = ctx.createPipelineDatasetHeader() + + if (headerCtx.materializedView() != null) { + throw operationNotAllowed( + "AUTO CDC is only supported for STREAMING TABLE, not MATERIALIZED VIEW.", ctx) + } + + val (colDefs, partitioning, spec, ifNotExists, tableIdent) = + parsePipelineDatasetPrelude( + "STREAMING TABLE", + headerCtx, + ctx.tableProvider, + ctx.tableElementList(), + ctx.createTableClauses(), + ctx) + val (src, keys, delete, seq, specCols, exceptCols) = parseAutoCdcParams(ctx.autoCdcBody().autoCdcParameters()) @@ -1676,66 +1719,14 @@ class SparkSqlAstBuilder extends AstBuilder { throw invalidStatement(ctx.getText, ctx) } - val ifNotExists = createPipelineDatasetHeaderCtx.EXISTS() != null - val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) - val (colDefs, colConstraints) = Option(ctx.tableElementList()).map(visitTableElementList) - .getOrElse((Nil, Nil)) - - if (colConstraints.nonEmpty) { - throw operationNotAllowed("Pipeline datasets do not currently support column constraints. 
" + - "Please remove any CHECK, UNIQUE, PK, and FK constraints specified on the pipeline " + - "dataset.", ctx) - } - - val (partTransforms, partCols, bucketSpec, - properties, options, location, comment, collation, serdeInfoOpt, - clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) - - val partitioning = - partitionExpressions(partTransforms, partCols, ctx) ++ - clusterBySpec.map(_.asTransform) - - // Because the createTableClauses grammar is reused for createPipelineDataset but pipeline - // datasets don't support bucketing, options, storage location, or Hive SerDe, validate they - // are not set. - if (bucketSpec.isDefined) { - throw operationNotAllowed(s"Bucketing is not supported for CREATE $syntaxTypeErrorStr " + - "statements. Please remove any bucket spec specified in the statement.", ctx) - } - if (options.options.nonEmpty) { - throw operationNotAllowed(s"Options are not supported for CREATE $syntaxTypeErrorStr " + - "statements. Please remove any OPTIONS lists specified in the statement.", ctx) - } - serdeInfoOpt.map(serdeInfo => if (serdeInfo.storedAs.nonEmpty) { - throw operationNotAllowed(s"The STORED AS syntax is not supported for CREATE " + - s"$syntaxTypeErrorStr statements. Consider using the Data Source based USING clause " - + "instead.", ctx) - } else { - throw operationNotAllowed(s"Hive SerDe format options are not supported for CREATE " + - s"$syntaxTypeErrorStr statements.", ctx) - }) - if (location.nonEmpty) { - throw operationNotAllowed(s"Specifying location is not supported for CREATE " + - s"$syntaxTypeErrorStr statements. 
The storage location for a pipeline dataset is " + - "managed by the pipeline itself.", ctx) - } - - val spec = TableSpec( - properties = properties, - provider = provider, - options = Map.empty, - location = location, - comment = comment, - collation = collation, - serde = None, - external = false, - constraints = Seq.empty - ) - - val datasetIdentifier = withIdentClause( - createPipelineDatasetHeaderCtx.identifierReference, - UnresolvedIdentifier(_) - ) + val (colDefs, partitioning, spec, ifNotExists, datasetIdentifier) = + parsePipelineDatasetPrelude( + syntaxTypeErrorStr, + createPipelineDatasetHeaderCtx, + ctx.tableProvider, + ctx.tableElementList(), + ctx.createTableClauses(), + ctx) if (createPipelineDatasetHeaderCtx.materializedView() != null) { val query: ParserRuleContext = Option(ctx.query).getOrElse( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala index a6f0199ebc71e..f29215d33feb4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -60,6 +60,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val cdc = cmd.flowOperation.asInstanceOf[AutoCdcIntoCommand] assert(cdc.targetTable.table == "target") assert(cdc.sourceTable.isInstanceOf[UnresolvedRelation]) + assert(cdc.sourceTable.asInstanceOf[UnresolvedRelation].isStreaming) assert(cdc.keys.map(_.name) == Seq("key1", "key2")) assert(cdc.deleteCondition.isEmpty) assert(cdc.sequenceByExpr == UnresolvedAttribute("timestamp")) @@ -173,6 +174,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("target")) assert(!cmd.ifNotExists) assert(cmd.sourceTable.isInstanceOf[UnresolvedRelation]) + 
assert(cmd.sourceTable.asInstanceOf[UnresolvedRelation].isStreaming) assert(cmd.keys.map(_.name) == Seq("key1", "key2")) assert(cmd.deleteCondition.isEmpty) assert(cmd.sequenceByExpr == UnresolvedAttribute("timestamp")) @@ -274,19 +276,18 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |FROM source |KEYS (id)""".stripMargin) } - assert(e.getCondition == "MISSING_CLAUSES_FOR_OPERATION") - assert(e.getMessageParameters.get("clauses") == "SEQUENCE BY") - assert(e.getMessageParameters.get("operation") == "AUTO CDC INTO") + assert(e.getCondition == "AUTOCDC_MISSING_SEQUENCE_BY") } test("CREATE STREAMING TABLE FLOW AUTO CDC - SEQUENCE BY is required") { - intercept[ParseException] { + val e = intercept[ParseException] { parser.parsePlan( """CREATE STREAMING TABLE target |FLOW AUTO CDC |FROM source |KEYS (id)""".stripMargin) } + assert(e.getCondition == "AUTOCDC_MISSING_SEQUENCE_BY") } // --------------------------------------------------------------------------- @@ -294,7 +295,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { // --------------------------------------------------------------------------- test("duplicate SEQUENCE BY clause") { - intercept[ParseException] { + val e = intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source @@ -302,10 +303,11 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |SEQUENCE BY ts1 |SEQUENCE BY ts2""".stripMargin) } + assert(e.getCondition == "DUPLICATE_CLAUSES") } test("duplicate APPLY AS DELETE clause") { - intercept[ParseException] { + val e = intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source @@ -314,10 +316,11 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |APPLY AS DELETE WHEN b = 2 |SEQUENCE BY ts""".stripMargin) } + assert(e.getCondition == "DUPLICATE_CLAUSES") } test("duplicate COLUMNS clause") { - intercept[ParseException] { + val e = 
intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source @@ -326,6 +329,20 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |COLUMNS a, b |COLUMNS c, d""".stripMargin) } + assert(e.getCondition == "DUPLICATE_CLAUSES") + } + + test("both COLUMNS include list and COLUMNS * EXCEPT is an error") { + val e = intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |COLUMNS a, b + |COLUMNS * EXCEPT (c)""".stripMargin) + } + assert(e.getCondition == "AUTOCDC_BOTH_COLUMN_LIST_AND_EXCEPT_COLUMN_LIST") } // --------------------------------------------------------------------------- @@ -333,13 +350,94 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { // --------------------------------------------------------------------------- test("standalone AUTO CDC INTO is not supported") { - intercept[ParseException] { + val e = intercept[ParseException] { parser.parsePlan( """AUTO CDC INTO target |FROM source |KEYS (id) |SEQUENCE BY ts""".stripMargin) } + assert(e.getCondition == "PARSE_SYNTAX_ERROR") + } + + // --------------------------------------------------------------------------- + // Error cases: unsupported dataset types and table features + // --------------------------------------------------------------------------- + + test("AUTO CDC is not supported for MATERIALIZED VIEW") { + val e = intercept[ParseException] { + parser.parsePlan( + """CREATE MATERIALIZED VIEW target + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + } + assert(e.getMessage.contains("AUTO CDC is only supported for STREAMING TABLE")) + } + + test("column constraints are not supported for AUTO CDC streaming table") { + val e = intercept[ParseException] { + parser.parsePlan( + """CREATE STREAMING TABLE target (id INT PRIMARY KEY, name STRING) + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY 
ts""".stripMargin) + } + assert(e.getMessage.contains("column constraints")) + } + + test("bucketing is not supported for AUTO CDC streaming table") { + val e = intercept[ParseException] { + parser.parsePlan( + """CREATE STREAMING TABLE target + |CLUSTERED BY (id) INTO 4 BUCKETS + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + } + assert(e.getMessage.contains("Bucketing is not supported")) + } + + test("options are not supported for AUTO CDC streaming table") { + val e = intercept[ParseException] { + parser.parsePlan( + """CREATE STREAMING TABLE target + |OPTIONS (key = 'value') + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + } + assert(e.getMessage.contains("Options are not supported")) + } + + test("serde is not supported for AUTO CDC streaming table") { + val e = intercept[ParseException] { + parser.parsePlan( + """CREATE STREAMING TABLE target + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + } + assert(e.getMessage.contains("SerDe")) + } + + test("location is not supported for AUTO CDC streaming table") { + val e = intercept[ParseException] { + parser.parsePlan( + """CREATE STREAMING TABLE target + |LOCATION '/tmp/data' + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + } + assert(e.getMessage.contains("location is not supported")) } // --------------------------------------------------------------------------- @@ -347,7 +445,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { // --------------------------------------------------------------------------- test("APPLY AS TRUNCATE WHEN is not supported") { - intercept[ParseException] { + val e = intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source @@ -355,10 +453,11 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |APPLY AS 
TRUNCATE WHEN op = 'TRUNCATE' |SEQUENCE BY ts""".stripMargin) } + assert(e.getCondition == "PARSE_SYNTAX_ERROR") } test("IGNORE NULL UPDATES is not supported") { - intercept[ParseException] { + val e = intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source @@ -366,10 +465,11 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |IGNORE NULL UPDATES |SEQUENCE BY ts""".stripMargin) } + assert(e.getCondition == "PARSE_SYNTAX_ERROR") } test("STORED AS SCD TYPE 2 is not supported") { - intercept[ParseException] { + val e = intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source @@ -377,10 +477,11 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |SEQUENCE BY ts |STORED AS SCD TYPE 2""".stripMargin) } + assert(e.getCondition == "PARSE_SYNTAX_ERROR") } test("TRACK HISTORY ON is not supported") { - intercept[ParseException] { + val e = intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source @@ -388,5 +489,6 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |SEQUENCE BY ts |TRACK HISTORY ON value1, value2""".stripMargin) } + assert(e.getCondition == "PARSE_SYNTAX_ERROR") } } From e7ad493687e16b32f6bb7efe3567230cb1d93bd5 Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Fri, 12 Jun 2026 23:29:55 +0000 Subject: [PATCH 08/17] [CONNECT][SDP][SPARK-56249] Enforce fixed clause order for AUTO CDC syntax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change the AUTO CDC grammar from a repeating alternation to a fixed order: APPLY AS DELETE (optional) → SEQUENCE BY (required) → COLUMNS (optional). Remove the now-unnecessary duplicate clause and mutual exclusion checks from the parser. 
Co-authored-by: Isaac --- .../sql/catalyst/parser/SqlBaseParser.g4 | 7 ++- .../sql/catalyst/parser/AstBuilder.scala | 23 ++-------- .../command/v2/AutoCdcParserSuite.scala | 45 ++++--------------- 3 files changed, 15 insertions(+), 60 deletions(-) diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 9dd4e2e5ddf59..6b120395b3010 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -766,10 +766,9 @@ autoCdcBody autoCdcParameters : FROM source=relationPrimary KEYS LEFT_PAREN keys=multipartIdentifierList RIGHT_PAREN - (autoCdcDeleteClause - | autoCdcSequenceByClause - | autoCdcColumnsClause - )* + autoCdcDeleteClause? + autoCdcSequenceByClause + autoCdcColumnsClause? ; autoCdcDeleteClause diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 7faebed615703..34c1df0d4f26f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1378,33 +1378,16 @@ class AstBuilder extends DataTypeAstBuilder Seq[UnresolvedAttribute], Seq[UnresolvedAttribute]) = withOrigin(params) { - checkDuplicateClauses(params.autoCdcDeleteClause(), "APPLY AS DELETE", params) - checkDuplicateClauses(params.autoCdcSequenceByClause(), "SEQUENCE BY", params) - val allColumnsClauses = params.autoCdcColumnsClause().asScala - if (allColumnsClauses.exists(_.columns != null) && - allColumnsClauses.exists(_.exceptCols != null)) { - throw new ParseException( - errorClass = "AUTOCDC_BOTH_COLUMN_LIST_AND_EXCEPT_COLUMN_LIST", - ctx = params) - } - checkDuplicateClauses(params.autoCdcColumnsClause(), "COLUMNS", 
params) - - if (params.autoCdcSequenceByClause().isEmpty) { - throw new ParseException( - errorClass = "AUTOCDC_MISSING_SEQUENCE_BY", - ctx = params) - } - val sourceTable = plan(params.source) match { case r: UnresolvedRelation => r.copy(isStreaming = true) case other => other } val keys = visitMultipartIdentifierList(params.keys) - val deleteCondition = params.autoCdcDeleteClause().asScala.headOption + val deleteCondition = Option(params.autoCdcDeleteClause()) .map(c => expression(c.deleteCondition)) - val sequencing = expression(params.autoCdcSequenceByClause(0).sequence) + val sequencing = expression(params.autoCdcSequenceByClause().sequence) - val columnsClause = params.autoCdcColumnsClause().asScala.headOption + val columnsClause = Option(params.autoCdcColumnsClause()) val specifiedCols = columnsClause match { case Some(c) if c.columns != null => visitMultipartIdentifierList(c.columns) case _ => Seq.empty diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala index f29215d33feb4..d0e48a08bbb26 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -276,7 +276,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |FROM source |KEYS (id)""".stripMargin) } - assert(e.getCondition == "AUTOCDC_MISSING_SEQUENCE_BY") + assert(e.getCondition == "PARSE_SYNTAX_ERROR") } test("CREATE STREAMING TABLE FLOW AUTO CDC - SEQUENCE BY is required") { @@ -287,62 +287,35 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |FROM source |KEYS (id)""".stripMargin) } - assert(e.getCondition == "AUTOCDC_MISSING_SEQUENCE_BY") + assert(e.getCondition == "PARSE_SYNTAX_ERROR") } // --------------------------------------------------------------------------- - // 
Error cases: duplicate clauses + // Error cases: wrong clause order // --------------------------------------------------------------------------- - test("duplicate SEQUENCE BY clause") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id) - |SEQUENCE BY ts1 - |SEQUENCE BY ts2""".stripMargin) - } - assert(e.getCondition == "DUPLICATE_CLAUSES") - } - - test("duplicate APPLY AS DELETE clause") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id) - |APPLY AS DELETE WHEN a = 1 - |APPLY AS DELETE WHEN b = 2 - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getCondition == "DUPLICATE_CLAUSES") - } - - test("duplicate COLUMNS clause") { + test("SEQUENCE BY before APPLY AS DELETE is not allowed") { val e = intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source |KEYS (id) |SEQUENCE BY ts - |COLUMNS a, b - |COLUMNS c, d""".stripMargin) + |APPLY AS DELETE WHEN a = 1""".stripMargin) } - assert(e.getCondition == "DUPLICATE_CLAUSES") + assert(e.getCondition == "PARSE_SYNTAX_ERROR") } - test("both COLUMNS include list and COLUMNS * EXCEPT is an error") { + test("COLUMNS before SEQUENCE BY is not allowed") { val e = intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target |FROM source |KEYS (id) - |SEQUENCE BY ts |COLUMNS a, b - |COLUMNS * EXCEPT (c)""".stripMargin) + |SEQUENCE BY ts""".stripMargin) } - assert(e.getCondition == "AUTOCDC_BOTH_COLUMN_LIST_AND_EXCEPT_COLUMN_LIST") + assert(e.getCondition == "PARSE_SYNTAX_ERROR") } // --------------------------------------------------------------------------- From ebda4cb53a1c6d7632febfe7bac3a9d4bec4bf4a Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Sat, 13 Jun 2026 00:06:38 +0000 Subject: [PATCH 09/17] [CONNECT][SDP][SPARK-56249] Merge createStreamingTableAutoCdc into createPipelineDataset grammar 
rule Unify the two grammar alternatives for pipeline dataset creation into a single createPipelineDataset rule with an optional (AS query | FLOW autoCdcBody) tail. Remove the separate visitCreateStreamingTableAutoCdc visitor and handle all three streaming table forms (CTAS, bare, AutoCDC) in visitCreatePipelineDataset. Also change AutoCdcIntoCommand from LeafCommand to UnaryCommand since it has sourceTable as a child plan. Co-authored-by: Isaac --- .../sql/catalyst/parser/SqlBaseParser.g4 | 5 +- .../plans/logical/AutoCdcIntoCommand.scala | 10 +- .../spark/sql/execution/SparkSqlParser.scala | 96 ++++++++----------- 3 files changed, 48 insertions(+), 63 deletions(-) diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 6b120395b3010..aca7bf1ad3f9c 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -416,10 +416,7 @@ statement | unsupportedHiveNativeCommands .*? #failNativeCommand | createPipelineDatasetHeader (LEFT_PAREN tableElementList? RIGHT_PAREN)? tableProvider? createTableClauses - (AS query)? #createPipelineDataset - | createPipelineDatasetHeader (LEFT_PAREN tableElementList? RIGHT_PAREN)? tableProvider? - createTableClauses - FLOW autoCdcBody #createStreamingTableAutoCdc + (AS query | FLOW autoCdcBody)? 
#createPipelineDataset | createPipelineFlowHeader insertInto query #createPipelineInsertIntoFlow | createPipelineFlowHeader autoCdcCommand #createFlowAutoCdc ; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala index f13635e6d2931..dc4dbfc353a16 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.expressions.Expression /** * Logical plan node for an AUTO CDC INTO command, used by Spark Declarative Pipelines. @@ -51,7 +51,9 @@ case class AutoCdcIntoCommand( sequenceByExpr: Expression, specifiedCols: Seq[UnresolvedAttribute], exceptCols: Seq[UnresolvedAttribute] -) extends LeafCommand { - // Output is not meaningful; this node is a pipeline-context placeholder. 
- override def output: Seq[Attribute] = Nil +) extends UnaryCommand { + override def child: LogicalPlan = sourceTable + + override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = + copy(sourceTable = newChild) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 20ee24a6d11df..bf845918fdd6e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -1670,42 +1670,6 @@ class SparkSqlAstBuilder extends AstBuilder { (colDefs, partitioning, spec, ifNotExists, tableIdent) } - override def visitCreateStreamingTableAutoCdc( - ctx: CreateStreamingTableAutoCdcContext): LogicalPlan = withOrigin(ctx) { - val headerCtx = ctx.createPipelineDatasetHeader() - - if (headerCtx.materializedView() != null) { - throw operationNotAllowed( - "AUTO CDC is only supported for STREAMING TABLE, not MATERIALIZED VIEW.", ctx) - } - - val (colDefs, partitioning, spec, ifNotExists, tableIdent) = - parsePipelineDatasetPrelude( - "STREAMING TABLE", - headerCtx, - ctx.tableProvider, - ctx.tableElementList(), - ctx.createTableClauses(), - ctx) - - val (src, keys, delete, seq, specCols, exceptCols) = - parseAutoCdcParams(ctx.autoCdcBody().autoCdcParameters()) - - CreateStreamingTableAutoCdc( - name = tableIdent, - columns = colDefs, - partitioning = partitioning, - tableSpec = spec, - ifNotExists = ifNotExists, - sourceTable = src, - keys = keys, - deleteCondition = delete, - sequenceByExpr = seq, - specifiedCols = specCols, - exceptCols = exceptCols - ) - } - override def visitCreatePipelineDataset( ctx: CreatePipelineDatasetContext): LogicalPlan = withOrigin(ctx) { val createPipelineDatasetHeaderCtx = ctx.createPipelineDatasetHeader() @@ -1729,6 +1693,10 @@ class SparkSqlAstBuilder extends AstBuilder { ctx) if 
(createPipelineDatasetHeaderCtx.materializedView() != null) { + if (ctx.autoCdcBody() != null) { + throw operationNotAllowed( + "AUTO CDC is only supported for STREAMING TABLE, not MATERIALIZED VIEW.", ctx) + } val query: ParserRuleContext = Option(ctx.query).getOrElse( throw operationNotAllowed( s"Unable to find query for CREATE $syntaxTypeErrorStr statement.", ctx) @@ -1743,25 +1711,43 @@ class SparkSqlAstBuilder extends AstBuilder { ifNotExists = ifNotExists ) } else if (createPipelineDatasetHeaderCtx.streamingTable() != null) { - Option(ctx.query) match { - case Some(query) => - CreateStreamingTableAsSelect( - name = datasetIdentifier, - columns = colDefs, - partitioning = partitioning, - tableSpec = spec, - query = plan(query), - originalText = source(query), - ifNotExists = ifNotExists - ) - case None => - CreateStreamingTable( - name = datasetIdentifier, - columns = colDefs, - partitioning = partitioning, - tableSpec = spec, - ifNotExists = ifNotExists - ) + if (ctx.autoCdcBody() != null) { + val (src, keys, delete, seq, specCols, exceptCols) = + parseAutoCdcParams(ctx.autoCdcBody().autoCdcParameters()) + CreateStreamingTableAutoCdc( + name = datasetIdentifier, + columns = colDefs, + partitioning = partitioning, + tableSpec = spec, + ifNotExists = ifNotExists, + sourceTable = src, + keys = keys, + deleteCondition = delete, + sequenceByExpr = seq, + specifiedCols = specCols, + exceptCols = exceptCols + ) + } else { + Option(ctx.query) match { + case Some(query) => + CreateStreamingTableAsSelect( + name = datasetIdentifier, + columns = colDefs, + partitioning = partitioning, + tableSpec = spec, + query = plan(query), + originalText = source(query), + ifNotExists = ifNotExists + ) + case None => + CreateStreamingTable( + name = datasetIdentifier, + columns = colDefs, + partitioning = partitioning, + tableSpec = spec, + ifNotExists = ifNotExists + ) + } } } else { // Should never be possible based on grammar definition. 
From 9590ebbf370fc222379f38ef5e39544d3a790138 Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Sat, 13 Jun 2026 00:45:50 +0000 Subject: [PATCH 10/17] [CONNECT][SDP][SPARK-56249] Use checkError in AutoCdcParserSuite error tests Replace plain assert checks with checkError for all error test cases, validating condition, sqlState, parameters, and queryContext. Co-authored-by: Isaac --- .../command/v2/AutoCdcParserSuite.scala | 336 +++++++++++------- 1 file changed, 201 insertions(+), 135 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala index d0e48a08bbb26..b08b8175c791d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -270,24 +270,32 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { // --------------------------------------------------------------------------- test("CREATE FLOW AS AUTO CDC INTO - SEQUENCE BY is required") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id)""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id)""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "end of input", "hint" -> "") + ) } test("CREATE STREAMING TABLE FLOW AUTO CDC - SEQUENCE BY is required") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE STREAMING TABLE target - |FLOW AUTO CDC - |FROM source - |KEYS (id)""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + 
parser.parsePlan( + """CREATE STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (id)""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "end of input", "hint" -> "") + ) } // --------------------------------------------------------------------------- @@ -295,27 +303,35 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { // --------------------------------------------------------------------------- test("SEQUENCE BY before APPLY AS DELETE is not allowed") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id) - |SEQUENCE BY ts - |APPLY AS DELETE WHEN a = 1""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |APPLY AS DELETE WHEN a = 1""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'APPLY'", "hint" -> "") + ) } test("COLUMNS before SEQUENCE BY is not allowed") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id) - |COLUMNS a, b - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |COLUMNS a, b + |SEQUENCE BY ts""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'COLUMNS'", "hint" -> "") + ) } // --------------------------------------------------------------------------- @@ -323,14 +339,18 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { // --------------------------------------------------------------------------- test("standalone AUTO CDC INTO 
is not supported") { - val e = intercept[ParseException] { - parser.parsePlan( - """AUTO CDC INTO target - |FROM source - |KEYS (id) - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + parser.parsePlan( + """AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'AUTO'", "hint" -> "") + ) } // --------------------------------------------------------------------------- @@ -338,79 +358,109 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { // --------------------------------------------------------------------------- test("AUTO CDC is not supported for MATERIALIZED VIEW") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE MATERIALIZED VIEW target - |FLOW AUTO CDC - |FROM source - |KEYS (id) - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getMessage.contains("AUTO CDC is only supported for STREAMING TABLE")) + val sql = + """CREATE MATERIALIZED VIEW target + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin + checkError( + intercept[ParseException] { parser.parsePlan(sql) }, + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map( + "message" -> "AUTO CDC is only supported for STREAMING TABLE, not MATERIALIZED VIEW."), + queryContext = Array(ExpectedContext(sql, 0, sql.length - 1)) + ) } test("column constraints are not supported for AUTO CDC streaming table") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE STREAMING TABLE target (id INT PRIMARY KEY, name STRING) - |FLOW AUTO CDC - |FROM source - |KEYS (id) - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getMessage.contains("column constraints")) + val sql = + """CREATE STREAMING TABLE target (id INT PRIMARY KEY, name STRING) + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin + checkError( + 
intercept[ParseException] { parser.parsePlan(sql) }, + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map("message" -> + ("Pipeline datasets do not currently support column constraints. " + + "Please remove any CHECK, UNIQUE, PK, and FK constraints " + + "specified on the pipeline dataset.")), + queryContext = Array(ExpectedContext(sql, 0, sql.length - 1)) + ) } test("bucketing is not supported for AUTO CDC streaming table") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE STREAMING TABLE target - |CLUSTERED BY (id) INTO 4 BUCKETS - |FLOW AUTO CDC - |FROM source - |KEYS (id) - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getMessage.contains("Bucketing is not supported")) + val sql = + """CREATE STREAMING TABLE target + |CLUSTERED BY (id) INTO 4 BUCKETS + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin + checkError( + intercept[ParseException] { parser.parsePlan(sql) }, + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map("message" -> + ("Bucketing is not supported for CREATE STREAMING TABLE statements. " + + "Please remove any bucket spec specified in the statement.")), + queryContext = Array(ExpectedContext(sql, 0, sql.length - 1)) + ) } test("options are not supported for AUTO CDC streaming table") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE STREAMING TABLE target - |OPTIONS (key = 'value') - |FLOW AUTO CDC - |FROM source - |KEYS (id) - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getMessage.contains("Options are not supported")) + val sql = + """CREATE STREAMING TABLE target + |OPTIONS (key = 'value') + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin + checkError( + intercept[ParseException] { parser.parsePlan(sql) }, + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map("message" -> + ("Options are not supported for CREATE STREAMING TABLE statements. 
" + + "Please remove any OPTIONS lists specified in the statement.")), + queryContext = Array(ExpectedContext(sql, 0, sql.length - 1)) + ) } test("serde is not supported for AUTO CDC streaming table") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE STREAMING TABLE target - |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' - |FLOW AUTO CDC - |FROM source - |KEYS (id) - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getMessage.contains("SerDe")) + val sql = + """CREATE STREAMING TABLE target + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin + checkError( + intercept[ParseException] { parser.parsePlan(sql) }, + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map("message" -> + ("Hive SerDe format options are not supported for " + + "CREATE STREAMING TABLE statements.")), + queryContext = Array(ExpectedContext(sql, 0, sql.length - 1)) + ) } test("location is not supported for AUTO CDC streaming table") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE STREAMING TABLE target - |LOCATION '/tmp/data' - |FLOW AUTO CDC - |FROM source - |KEYS (id) - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getMessage.contains("location is not supported")) + val sql = + """CREATE STREAMING TABLE target + |LOCATION '/tmp/data' + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin + checkError( + intercept[ParseException] { parser.parsePlan(sql) }, + condition = "_LEGACY_ERROR_TEMP_0035", + parameters = Map("message" -> + ("Specifying location is not supported for CREATE STREAMING TABLE statements. 
" + + "The storage location for a pipeline dataset is managed by the pipeline itself.")), + queryContext = Array(ExpectedContext(sql, 0, sql.length - 1)) + ) } // --------------------------------------------------------------------------- @@ -418,50 +468,66 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { // --------------------------------------------------------------------------- test("APPLY AS TRUNCATE WHEN is not supported") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id) - |APPLY AS TRUNCATE WHEN op = 'TRUNCATE' - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |APPLY AS TRUNCATE WHEN op = 'TRUNCATE' + |SEQUENCE BY ts""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'TRUNCATE'", "hint" -> "") + ) } test("IGNORE NULL UPDATES is not supported") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id) - |IGNORE NULL UPDATES - |SEQUENCE BY ts""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |IGNORE NULL UPDATES + |SEQUENCE BY ts""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'IGNORE'", "hint" -> "") + ) } test("STORED AS SCD TYPE 2 is not supported") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id) - |SEQUENCE BY ts - |STORED AS SCD TYPE 2""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + 
parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |STORED AS SCD TYPE 2""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'STORED'", "hint" -> "") + ) } test("TRACK HISTORY ON is not supported") { - val e = intercept[ParseException] { - parser.parsePlan( - """CREATE FLOW f AS AUTO CDC INTO target - |FROM source - |KEYS (id) - |SEQUENCE BY ts - |TRACK HISTORY ON value1, value2""".stripMargin) - } - assert(e.getCondition == "PARSE_SYNTAX_ERROR") + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts + |TRACK HISTORY ON value1, value2""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'TRACK'", "hint" -> "") + ) } } From 1f8df74b4cbe1e576fae49a7c06e8f513d4eee9b Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Fri, 26 Jun 2026 05:48:01 +0000 Subject: [PATCH 11/17] [SPARK-57402][SQL] Restrict AUTO CDC source to STREAM(...) and KEYS/COLUMNS to simple identifiers Tighten the AUTO CDC grammar and AST building: - The AUTO CDC source now only accepts STREAM(multipartIdentifier); the bare multipart-identifier form is removed. The source is always returned as an UnresolvedRelation marked as a streaming read (isStreaming = true). - KEYS and COLUMNS now only accept simple identifiers (identifierSeq) instead of multipart identifiers, and are built as single-part UnresolvedAttributes. - Inline the pipeline-dataset prelude into visitCreatePipelineDataset and use the AutoCdcParams case class for parameter passing. Adds parser tests covering the STREAM requirement and rejection of multipart identifiers in KEYS and COLUMNS for both AUTO CDC variants. 
Co-authored-by: Isaac --- .../sql/catalyst/parser/SqlBaseParser.g4 | 12 +- .../sql/catalyst/parser/AstBuilder.scala | 61 ++++-- .../plans/logical/AutoCdcIntoCommand.scala | 12 +- .../catalyst/plans/logical/v2Commands.scala | 15 +- .../spark/sql/execution/SparkSqlParser.scala | 87 +++----- .../command/v2/AutoCdcParserSuite.scala | 186 ++++++++++++++---- 6 files changed, 236 insertions(+), 137 deletions(-) diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index aca7bf1ad3f9c..382907f718791 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -761,13 +761,17 @@ autoCdcBody ; autoCdcParameters - : FROM source=relationPrimary - KEYS LEFT_PAREN keys=multipartIdentifierList RIGHT_PAREN + : FROM source=autoCdcSource + KEYS LEFT_PAREN keys=identifierSeq RIGHT_PAREN autoCdcDeleteClause? autoCdcSequenceByClause autoCdcColumnsClause? 
; +autoCdcSource + : STREAM LEFT_PAREN multipartIdentifier RIGHT_PAREN + ; + autoCdcDeleteClause : APPLY AS DELETE WHEN deleteCondition=booleanExpression ; @@ -778,8 +782,8 @@ autoCdcSequenceByClause autoCdcColumnsClause : COLUMNS ( - columns=multipartIdentifierList | - ASTERISK EXCEPT LEFT_PAREN exceptCols=multipartIdentifierList RIGHT_PAREN) + columns=identifierSeq | + ASTERISK EXCEPT LEFT_PAREN exceptCols=identifierSeq RIGHT_PAREN) ; identifierReference diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 34c1df0d4f26f..13b7673dc2af2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1365,39 +1365,49 @@ class AstBuilder extends DataTypeAstBuilder protected def buildAutoCdcIntoCommand(ctx: AutoCdcCommandContext): AutoCdcIntoCommand = withOrigin(ctx) { val target = visitMultipartIdentifier(ctx.target).asTableIdentifier - val (src, keys, delete, seq, specCols, exceptCols) = - parseAutoCdcParams(ctx.autoCdcParameters()) - AutoCdcIntoCommand(target, src, keys, delete, seq, specCols, exceptCols) - } - - protected def parseAutoCdcParams(params: AutoCdcParametersContext): ( - LogicalPlan, - Seq[UnresolvedAttribute], - Option[Expression], - Expression, - Seq[UnresolvedAttribute], - Seq[UnresolvedAttribute]) = + val params = parseAutoCdcParams(ctx.autoCdcParameters()) + AutoCdcIntoCommand( + target, + params.source, + params.keys, + params.deleteCondition, + params.sequencing, + params.specifiedCols, + params.exceptCols) + } + + protected def parseAutoCdcParams(params: AutoCdcParametersContext): AutoCdcParams = withOrigin(params) { - val sourceTable = plan(params.source) match { - case r: UnresolvedRelation => r.copy(isStreaming = true) - case other => other - } - val keys = 
visitMultipartIdentifierList(params.keys) + val source = resolveAutoCdcSource(params.source) + val keys = visitIdentifierSeq(params.keys).map(UnresolvedAttribute.quoted) val deleteCondition = Option(params.autoCdcDeleteClause()) .map(c => expression(c.deleteCondition)) val sequencing = expression(params.autoCdcSequenceByClause().sequence) val columnsClause = Option(params.autoCdcColumnsClause()) val specifiedCols = columnsClause match { - case Some(c) if c.columns != null => visitMultipartIdentifierList(c.columns) + case Some(c) if c.columns != null => + visitIdentifierSeq(c.columns).map(UnresolvedAttribute.quoted) case _ => Seq.empty } val exceptCols = columnsClause match { - case Some(c) if c.exceptCols != null => visitMultipartIdentifierList(c.exceptCols) + case Some(c) if c.exceptCols != null => + visitIdentifierSeq(c.exceptCols).map(UnresolvedAttribute.quoted) case _ => Seq.empty } - (sourceTable, keys, deleteCondition, sequencing, specifiedCols, exceptCols) + AutoCdcParams(source, keys, deleteCondition, sequencing, specifiedCols, exceptCols) + } + + /** + * Resolve the AUTO CDC source, which is a STREAM(multipartIdentifier). It is returned as an + * [[UnresolvedRelation]] marked as a streaming read via the `isStreaming` flag. + */ + protected def resolveAutoCdcSource(ctx: AutoCdcSourceContext): UnresolvedRelation = + withOrigin(ctx) { + val ident = visitMultipartIdentifier(ctx.multipartIdentifier) + createUnresolvedRelation( + ctx, ident, optionsClause = None, writePrivileges = Set.empty, isStreaming = true) } /** @@ -7748,3 +7758,14 @@ class AstBuilder extends DataTypeAstBuilder } } } + +/** + * Parameters parsed from an AUTO CDC clause. 
+ */ +case class AutoCdcParams( + source: LogicalPlan, + keys: Seq[UnresolvedAttribute], + deleteCondition: Option[Expression], + sequencing: Expression, + specifiedCols: Seq[UnresolvedAttribute], + exceptCols: Seq[UnresolvedAttribute]) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala index dc4dbfc353a16..938a1b91201f0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala @@ -32,7 +32,8 @@ import org.apache.spark.sql.catalyst.expressions.Expression * is added (SPARK-57402). * * @param targetTable The target table to apply changes into. - * @param sourceTable The source relation providing the change events. + * @param source The source relation providing the change events. Always a STREAM(...) + * source (marked as a streaming read). * @param keys Column(s) that uniquely identify a row in the target table. * @param deleteCondition An optional expression that marks a source row as a DELETE operation. * When absent, all source rows are treated as upserts. 
@@ -45,15 +46,10 @@ import org.apache.spark.sql.catalyst.expressions.Expression */ case class AutoCdcIntoCommand( targetTable: TableIdentifier, - sourceTable: LogicalPlan, + source: LogicalPlan, keys: Seq[UnresolvedAttribute], deleteCondition: Option[Expression], sequenceByExpr: Expression, specifiedCols: Seq[UnresolvedAttribute], exceptCols: Seq[UnresolvedAttribute] -) extends UnaryCommand { - override def child: LogicalPlan = sourceTable - - override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = - copy(sourceTable = newChild) -} +) extends LeafCommand diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 3497586bcf7c5..3f0573b1171b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -837,7 +837,8 @@ case class CreateStreamingTable( * @param partitioning Column-based partitioning for the streaming table. * @param tableSpec Additional table specs. * @param ifNotExists Whether the table should only be created if it doesn't already exist. - * @param sourceTable The source relation providing the change events. + * @param source The source relation providing the change events. Always a STREAM(...) + * source (marked as a streaming read). * @param keys Column(s) that uniquely identify a row in the target table. * @param deleteCondition An optional expression that marks a source row as a DELETE operation. * @param sequenceByExpr Expression that orders CDC events to resolve out-of-order arrivals. 
@@ -851,19 +852,17 @@ case class CreateStreamingTableAutoCdc( partitioning: Seq[Transform], tableSpec: TableSpecBase, ifNotExists: Boolean, - sourceTable: LogicalPlan, + source: LogicalPlan, keys: Seq[UnresolvedAttribute], deleteCondition: Option[Expression], sequenceByExpr: Expression, specifiedCols: Seq[UnresolvedAttribute], exceptCols: Seq[UnresolvedAttribute] -) extends BinaryCommand with CreatePipelineDataset { - override def left: LogicalPlan = name - override def right: LogicalPlan = sourceTable +) extends UnaryCommand with CreatePipelineDataset { + override def child: LogicalPlan = name - override protected def withNewChildrenInternal( - newLeft: LogicalPlan, newRight: LogicalPlan): LogicalPlan = - copy(name = newLeft, sourceTable = newRight) + override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = + copy(name = newChild) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index bf845918fdd6e..3e0d59ef174c9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -41,7 +41,6 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.catalyst.util.DateTimeConstants import org.apache.spark.sql.connector.catalog.CatalogManager -import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryParsingErrors} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources._ @@ -1585,30 +1584,22 @@ class SparkSqlAstBuilder extends AstBuilder { ) } - /** - * Shared helper for pipeline dataset creation statements (CREATE STREAMING TABLE, - * CREATE MATERIALIZED VIEW, CREATE STREAMING TABLE ... FLOW AUTO CDC). 
- * - * Validates and extracts column definitions, partitioning, and table spec from the common - * grammar elements shared by these statements. - * - * @return (colDefs, partitioning, spec, ifNotExists, tableIdent) - */ - private def parsePipelineDatasetPrelude( - syntaxTypeErrorStr: String, - headerCtx: CreatePipelineDatasetHeaderContext, - tableProviderCtx: TableProviderContext, - tableElementListCtx: TableElementListContext, - createTableClausesCtx: CreateTableClausesContext, - ctx: ParserRuleContext): ( - Seq[ColumnDefinition], - Seq[Transform], - TableSpec, - Boolean, - LogicalPlan) = { - val ifNotExists = headerCtx.EXISTS() != null - val provider = Option(tableProviderCtx).map(_.multipartIdentifier.getText) - val (colDefs, colConstraints) = Option(tableElementListCtx).map(visitTableElementList) + override def visitCreatePipelineDataset( + ctx: CreatePipelineDatasetContext): LogicalPlan = withOrigin(ctx) { + val createPipelineDatasetHeaderCtx = ctx.createPipelineDatasetHeader() + + val syntaxTypeErrorStr = if (createPipelineDatasetHeaderCtx.materializedView() != null) { + "MATERIALIZED VIEW" + } else if (createPipelineDatasetHeaderCtx.streamingTable() != null) { + "STREAMING TABLE" + } else { + // Should never be possible based on grammar definition. 
+ throw invalidStatement(ctx.getText, ctx) + } + + val ifNotExists = createPipelineDatasetHeaderCtx.EXISTS() != null + val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) + val (colDefs, colConstraints) = Option(ctx.tableElementList()).map(visitTableElementList) .getOrElse((Nil, Nil)) if (colConstraints.nonEmpty) { @@ -1619,7 +1610,7 @@ class SparkSqlAstBuilder extends AstBuilder { val (partTransforms, partCols, bucketSpec, properties, options, location, comment, collation, serdeInfoOpt, - clusterBySpec) = visitCreateTableClauses(createTableClausesCtx) + clusterBySpec) = visitCreateTableClauses(ctx.createTableClauses()) val partitioning = partitionExpressions(partTransforms, partCols, ctx) ++ @@ -1662,36 +1653,11 @@ class SparkSqlAstBuilder extends AstBuilder { constraints = Seq.empty ) - val tableIdent = withIdentClause( - headerCtx.identifierReference, + val datasetIdentifier = withIdentClause( + createPipelineDatasetHeaderCtx.identifierReference, UnresolvedIdentifier(_) ) - (colDefs, partitioning, spec, ifNotExists, tableIdent) - } - - override def visitCreatePipelineDataset( - ctx: CreatePipelineDatasetContext): LogicalPlan = withOrigin(ctx) { - val createPipelineDatasetHeaderCtx = ctx.createPipelineDatasetHeader() - - val syntaxTypeErrorStr = if (createPipelineDatasetHeaderCtx.materializedView() != null) { - "MATERIALIZED VIEW" - } else if (createPipelineDatasetHeaderCtx.streamingTable() != null) { - "STREAMING TABLE" - } else { - // Should never be possible based on grammar definition. 
- throw invalidStatement(ctx.getText, ctx) - } - - val (colDefs, partitioning, spec, ifNotExists, datasetIdentifier) = - parsePipelineDatasetPrelude( - syntaxTypeErrorStr, - createPipelineDatasetHeaderCtx, - ctx.tableProvider, - ctx.tableElementList(), - ctx.createTableClauses(), - ctx) - if (createPipelineDatasetHeaderCtx.materializedView() != null) { if (ctx.autoCdcBody() != null) { throw operationNotAllowed( @@ -1712,20 +1678,19 @@ class SparkSqlAstBuilder extends AstBuilder { ) } else if (createPipelineDatasetHeaderCtx.streamingTable() != null) { if (ctx.autoCdcBody() != null) { - val (src, keys, delete, seq, specCols, exceptCols) = - parseAutoCdcParams(ctx.autoCdcBody().autoCdcParameters()) + val params = parseAutoCdcParams(ctx.autoCdcBody().autoCdcParameters()) CreateStreamingTableAutoCdc( name = datasetIdentifier, columns = colDefs, partitioning = partitioning, tableSpec = spec, ifNotExists = ifNotExists, - sourceTable = src, - keys = keys, - deleteCondition = delete, - sequenceByExpr = seq, - specifiedCols = specCols, - exceptCols = exceptCols + source = params.source, + keys = params.keys, + deleteCondition = params.deleteCondition, + sequenceByExpr = params.sequencing, + specifiedCols = params.specifiedCols, + exceptCols = params.exceptCols ) } else { Option(ctx.query) match { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala index b08b8175c791d..47c7ac97cc0ec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql.execution.command.v2 import org.apache.spark.sql.catalyst.analysis.{ - AnalysisTest, UnresolvedAttribute, - UnresolvedIdentifier, UnresolvedRelation} + AnalysisTest, UnresolvedAttribute, UnresolvedIdentifier, 
UnresolvedRelation} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{ AutoCdcIntoCommand, @@ -49,7 +48,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { test("CREATE FLOW AS AUTO CDC INTO - minimal form") { val plan = parser.parsePlan( """CREATE FLOW myflow AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (key1, key2) |SEQUENCE BY timestamp""".stripMargin) @@ -59,8 +58,9 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val cdc = cmd.flowOperation.asInstanceOf[AutoCdcIntoCommand] assert(cdc.targetTable.table == "target") - assert(cdc.sourceTable.isInstanceOf[UnresolvedRelation]) - assert(cdc.sourceTable.asInstanceOf[UnresolvedRelation].isStreaming) + val source = cdc.source.asInstanceOf[UnresolvedRelation] + assert(source.multipartIdentifier == Seq("source")) + assert(source.isStreaming) assert(cdc.keys.map(_.name) == Seq("key1", "key2")) assert(cdc.deleteCondition.isEmpty) assert(cdc.sequenceByExpr == UnresolvedAttribute("timestamp")) @@ -68,10 +68,23 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cdc.exceptCols.isEmpty) } + test("CREATE FLOW AS AUTO CDC INTO - multipart source name") { + val plan = parser.parsePlan( + """CREATE FLOW myflow AS AUTO CDC INTO target + |FROM STREAM(mycat.myschema.source) + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + + val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] + val source = cdc.source.asInstanceOf[UnresolvedRelation] + assert(source.multipartIdentifier == Seq("mycat", "myschema", "source")) + assert(source.isStreaming) + } + test("CREATE FLOW AS AUTO CDC INTO - with COMMENT") { val plan = parser.parsePlan( """CREATE FLOW myflow COMMENT 'my comment' AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin) @@ -82,7 +95,7 @@ class AutoCdcParserSuite extends CommandSuiteBase 
with AnalysisTest { test("CREATE FLOW AS AUTO CDC INTO - multipart flow name") { val plan = parser.parsePlan( """CREATE FLOW mycat.myschema.myflow AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin) @@ -94,7 +107,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { test("CREATE FLOW AS AUTO CDC INTO - two-part target table name") { val plan = parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO myschema.mytable - |FROM source + |FROM STREAM(source) |KEYS (k) |SEQUENCE BY ts""".stripMargin) @@ -106,7 +119,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { test("CREATE FLOW AS AUTO CDC INTO - APPLY AS DELETE WHEN") { val plan = parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |APPLY AS DELETE WHEN op = 'DELETE' |SEQUENCE BY ts""".stripMargin) @@ -119,7 +132,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { test("CREATE FLOW AS AUTO CDC INTO - COLUMNS include list") { val plan = parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts |COLUMNS id, name, value""".stripMargin) @@ -132,7 +145,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { test("CREATE FLOW AS AUTO CDC INTO - COLUMNS * EXCEPT list") { val plan = parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts |COLUMNS * EXCEPT (op, ts)""".stripMargin) @@ -145,7 +158,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { test("CREATE FLOW AS AUTO CDC INTO - all clauses combined") { val plan = parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (key1, key2) |APPLY AS DELETE WHEN key3 = 3 |SEQUENCE BY timestamp @@ -166,15 +179,16 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val 
plan = parser.parsePlan( """CREATE STREAMING TABLE target |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (key1, key2) |SEQUENCE BY timestamp""".stripMargin) val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] assert(cmd.name.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("target")) assert(!cmd.ifNotExists) - assert(cmd.sourceTable.isInstanceOf[UnresolvedRelation]) - assert(cmd.sourceTable.asInstanceOf[UnresolvedRelation].isStreaming) + val source = cmd.source.asInstanceOf[UnresolvedRelation] + assert(source.multipartIdentifier == Seq("source")) + assert(source.isStreaming) assert(cmd.keys.map(_.name) == Seq("key1", "key2")) assert(cmd.deleteCondition.isEmpty) assert(cmd.sequenceByExpr == UnresolvedAttribute("timestamp")) @@ -182,11 +196,25 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cmd.exceptCols.isEmpty) } + test("CREATE STREAMING TABLE FLOW AUTO CDC - multipart source name") { + val plan = parser.parsePlan( + """CREATE STREAMING TABLE target + |FLOW AUTO CDC + |FROM STREAM(mycat.myschema.source) + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + + val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] + val source = cmd.source.asInstanceOf[UnresolvedRelation] + assert(source.multipartIdentifier == Seq("mycat", "myschema", "source")) + assert(source.isStreaming) + } + test("CREATE STREAMING TABLE IF NOT EXISTS FLOW AUTO CDC") { val plan = parser.parsePlan( """CREATE STREAMING TABLE IF NOT EXISTS target |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin) @@ -198,7 +226,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val plan = parser.parsePlan( """CREATE STREAMING TABLE myschema.mytable |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin) @@ -210,7 +238,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val plan = parser.parsePlan( """CREATE STREAMING TABLE target |FLOW 
AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |APPLY AS DELETE WHEN op = 'DELETE' |SEQUENCE BY ts""".stripMargin) @@ -224,7 +252,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val plan = parser.parsePlan( """CREATE STREAMING TABLE target |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts |COLUMNS id, name, value""".stripMargin) @@ -238,7 +266,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val plan = parser.parsePlan( """CREATE STREAMING TABLE target |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts |COLUMNS * EXCEPT (op, ts)""".stripMargin) @@ -252,7 +280,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val plan = parser.parsePlan( """CREATE STREAMING TABLE target |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (key1, key2) |APPLY AS DELETE WHEN key3 = 3 |SEQUENCE BY timestamp @@ -274,7 +302,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id)""".stripMargin) }, condition = "PARSE_SYNTAX_ERROR", @@ -289,7 +317,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { parser.parsePlan( """CREATE STREAMING TABLE target |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id)""".stripMargin) }, condition = "PARSE_SYNTAX_ERROR", @@ -307,7 +335,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts |APPLY AS DELETE WHEN a = 1""".stripMargin) @@ -323,7 +351,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS 
(id) |COLUMNS a, b |SEQUENCE BY ts""".stripMargin) @@ -343,7 +371,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { intercept[ParseException] { parser.parsePlan( """AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin) }, @@ -353,6 +381,92 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { ) } + // --------------------------------------------------------------------------- + // Error cases: source must be a STREAM(...) + // --------------------------------------------------------------------------- + + test("CREATE FLOW AS AUTO CDC INTO - source without STREAM is not allowed") { + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'source'", "hint" -> "") + ) + } + + test("CREATE STREAMING TABLE FLOW AUTO CDC - source without STREAM is not allowed") { + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE STREAMING TABLE target + |FLOW AUTO CDC + |FROM source + |KEYS (id) + |SEQUENCE BY ts""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'source'", "hint" -> "") + ) + } + + // --------------------------------------------------------------------------- + // Error cases: KEYS and COLUMNS only accept simple identifiers + // --------------------------------------------------------------------------- + + test("KEYS does not accept multipart identifiers") { + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM STREAM(source) + |KEYS (a.id) + |SEQUENCE BY ts""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'.'", "hint" -> "") + ) + } + + test("COLUMNS include list does not 
accept multipart identifiers") { + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM STREAM(source) + |KEYS (id) + |SEQUENCE BY ts + |COLUMNS a.name""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'.'", "hint" -> "") + ) + } + + test("COLUMNS * EXCEPT list does not accept multipart identifiers") { + checkError( + intercept[ParseException] { + parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO target + |FROM STREAM(source) + |KEYS (id) + |SEQUENCE BY ts + |COLUMNS * EXCEPT (a.op)""".stripMargin) + }, + condition = "PARSE_SYNTAX_ERROR", + sqlState = "42601", + parameters = Map("error" -> "'.'", "hint" -> "") + ) + } + // --------------------------------------------------------------------------- // Error cases: unsupported dataset types and table features // --------------------------------------------------------------------------- @@ -361,7 +475,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val sql = """CREATE MATERIALIZED VIEW target |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin checkError( @@ -377,7 +491,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { val sql = """CREATE STREAMING TABLE target (id INT PRIMARY KEY, name STRING) |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin checkError( @@ -396,7 +510,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { """CREATE STREAMING TABLE target |CLUSTERED BY (id) INTO 4 BUCKETS |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin checkError( @@ -414,7 +528,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { """CREATE STREAMING TABLE target |OPTIONS (key = 'value') |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin 
checkError( @@ -432,7 +546,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { """CREATE STREAMING TABLE target |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin checkError( @@ -450,7 +564,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { """CREATE STREAMING TABLE target |LOCATION '/tmp/data' |FLOW AUTO CDC - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts""".stripMargin checkError( @@ -472,7 +586,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |APPLY AS TRUNCATE WHEN op = 'TRUNCATE' |SEQUENCE BY ts""".stripMargin) @@ -488,7 +602,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |IGNORE NULL UPDATES |SEQUENCE BY ts""".stripMargin) @@ -504,7 +618,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts |STORED AS SCD TYPE 2""".stripMargin) @@ -520,7 +634,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { intercept[ParseException] { parser.parsePlan( """CREATE FLOW f AS AUTO CDC INTO target - |FROM source + |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts |TRACK HISTORY ON value1, value2""".stripMargin) From 9f09360f3d843ad87fda3c866bc00a291f41b375 Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Fri, 26 Jun 2026 15:28:40 +0000 Subject: [PATCH 12/17] [SPARK-57402][SQL] Remove MISSING_CLAUSES_FOR_OPERATION error condition Move the MISSING_CLAUSES_FOR_OPERATION error-condition 
definition out of this branch; it belongs to a separate change and will be submitted in its own PR. The QueryParsingErrors.missingClausesForOperation helper and its SparkSqlParser callers are already on master (from the METRIC VIEW work) and are unaffected. Co-authored-by: Isaac --- common/utils/src/main/resources/error/error-conditions.json | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index d500572cf4767..b6206653a1612 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -5542,12 +5542,6 @@ }, "sqlState" : "0A000" }, - "MISSING_CLAUSES_FOR_OPERATION" : { - "message" : [ - "Missing required clause(s) for operation ." - ], - "sqlState" : "42601" - }, "MISSING_DATABASE_FOR_V1_SESSION_CATALOG" : { "message" : [ "Database name is not specified in the v1 session catalog. Please ensure to provide a valid database name when interacting with the v1 catalog." From faa35c7179640c4971d0389d8aebb2edad524e9a Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Fri, 26 Jun 2026 16:20:10 +0000 Subject: [PATCH 13/17] [SPARK-57402][SQL] Rename AUTO CDC column params and make them optional Rename the AUTO CDC column fields specifiedCols/exceptCols to includeColumns/excludeColumns and change them from Seq to Option[Seq[UnresolvedAttribute]], so an absent COLUMNS clause (None) is distinguishable from an empty list. The rename and Option type are propagated through AutoCdcParams, AutoCdcIntoCommand, CreateStreamingTableAutoCdc, SparkSqlParser, and the parser tests. 
Co-authored-by: Isaac --- .../sql/catalyst/parser/AstBuilder.scala | 20 ++++++------- .../plans/logical/AutoCdcIntoCommand.scala | 14 ++++++---- .../catalyst/plans/logical/v2Commands.scala | 11 ++++---- .../spark/sql/execution/SparkSqlParser.scala | 4 +-- .../command/v2/AutoCdcParserSuite.scala | 28 +++++++++---------- 5 files changed, 39 insertions(+), 38 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 13b7673dc2af2..c06a0693665d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1372,8 +1372,8 @@ class AstBuilder extends DataTypeAstBuilder params.keys, params.deleteCondition, params.sequencing, - params.specifiedCols, - params.exceptCols) + params.includeColumns, + params.excludeColumns) } protected def parseAutoCdcParams(params: AutoCdcParametersContext): AutoCdcParams = @@ -1385,18 +1385,16 @@ class AstBuilder extends DataTypeAstBuilder val sequencing = expression(params.autoCdcSequenceByClause().sequence) val columnsClause = Option(params.autoCdcColumnsClause()) - val specifiedCols = columnsClause match { - case Some(c) if c.columns != null => + val includeColumns = columnsClause.collect { + case c if c.columns != null => visitIdentifierSeq(c.columns).map(UnresolvedAttribute.quoted) - case _ => Seq.empty } - val exceptCols = columnsClause match { - case Some(c) if c.exceptCols != null => + val excludeColumns = columnsClause.collect { + case c if c.exceptCols != null => visitIdentifierSeq(c.exceptCols).map(UnresolvedAttribute.quoted) - case _ => Seq.empty } - AutoCdcParams(source, keys, deleteCondition, sequencing, specifiedCols, exceptCols) + AutoCdcParams(source, keys, deleteCondition, sequencing, includeColumns, excludeColumns) } /** @@ -7767,5 +7765,5 @@ case class 
AutoCdcParams( keys: Seq[UnresolvedAttribute], deleteCondition: Option[Expression], sequencing: Expression, - specifiedCols: Seq[UnresolvedAttribute], - exceptCols: Seq[UnresolvedAttribute]) + includeColumns: Option[Seq[UnresolvedAttribute]], + excludeColumns: Option[Seq[UnresolvedAttribute]]) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala index 938a1b91201f0..1cfcd0f1026d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala @@ -39,10 +39,12 @@ import org.apache.spark.sql.catalyst.expressions.Expression * When absent, all source rows are treated as upserts. * @param sequenceByExpr Expression that orders CDC events to correctly resolve out-of-order * arrivals. Must evaluate to a sortable type. Required. - * @param specifiedCols An explicit list of source columns to include in the target table. - * Mutually exclusive with [[exceptCols]]. - * @param exceptCols Source columns to exclude from the target table (i.e., all columns - * except these). Mutually exclusive with [[specifiedCols]]. + * @param includeColumns An explicit list of source columns to include in the target table. + * [[None]] when no COLUMNS clause was specified. Mutually exclusive with + * [[excludeColumns]]. + * @param excludeColumns Source columns to exclude from the target table (i.e., all columns + * except these). [[None]] when no COLUMNS clause was specified. Mutually + * exclusive with [[includeColumns]]. 
*/ case class AutoCdcIntoCommand( targetTable: TableIdentifier, @@ -50,6 +52,6 @@ case class AutoCdcIntoCommand( keys: Seq[UnresolvedAttribute], deleteCondition: Option[Expression], sequenceByExpr: Expression, - specifiedCols: Seq[UnresolvedAttribute], - exceptCols: Seq[UnresolvedAttribute] + includeColumns: Option[Seq[UnresolvedAttribute]], + excludeColumns: Option[Seq[UnresolvedAttribute]] ) extends LeafCommand diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 3f0573b1171b3..c24e1e6ebe7fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -842,9 +842,10 @@ case class CreateStreamingTable( * @param keys Column(s) that uniquely identify a row in the target table. * @param deleteCondition An optional expression that marks a source row as a DELETE operation. * @param sequenceByExpr Expression that orders CDC events to resolve out-of-order arrivals. - * @param specifiedCols An explicit list of source columns to include. Mutually exclusive with - * [[exceptCols]]. - * @param exceptCols Source columns to exclude. Mutually exclusive with [[specifiedCols]]. + * @param includeColumns An explicit list of source columns to include. [[None]] when no COLUMNS + * clause was specified. Mutually exclusive with [[excludeColumns]]. + * @param excludeColumns Source columns to exclude. [[None]] when no COLUMNS clause was specified. + * Mutually exclusive with [[includeColumns]]. 
*/ case class CreateStreamingTableAutoCdc( name: LogicalPlan, @@ -856,8 +857,8 @@ case class CreateStreamingTableAutoCdc( keys: Seq[UnresolvedAttribute], deleteCondition: Option[Expression], sequenceByExpr: Expression, - specifiedCols: Seq[UnresolvedAttribute], - exceptCols: Seq[UnresolvedAttribute] + includeColumns: Option[Seq[UnresolvedAttribute]], + excludeColumns: Option[Seq[UnresolvedAttribute]] ) extends UnaryCommand with CreatePipelineDataset { override def child: LogicalPlan = name diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 3e0d59ef174c9..d62503f681b34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -1689,8 +1689,8 @@ class SparkSqlAstBuilder extends AstBuilder { keys = params.keys, deleteCondition = params.deleteCondition, sequenceByExpr = params.sequencing, - specifiedCols = params.specifiedCols, - exceptCols = params.exceptCols + includeColumns = params.includeColumns, + excludeColumns = params.excludeColumns ) } else { Option(ctx.query) match { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala index 47c7ac97cc0ec..eee19dfefc2ee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -64,8 +64,8 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cdc.keys.map(_.name) == Seq("key1", "key2")) assert(cdc.deleteCondition.isEmpty) assert(cdc.sequenceByExpr == UnresolvedAttribute("timestamp")) - assert(cdc.specifiedCols.isEmpty) - assert(cdc.exceptCols.isEmpty) + 
assert(cdc.includeColumns.isEmpty) + assert(cdc.excludeColumns.isEmpty) } test("CREATE FLOW AS AUTO CDC INTO - multipart source name") { @@ -138,8 +138,8 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |COLUMNS id, name, value""".stripMargin) val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] - assert(cdc.specifiedCols.map(_.name) == Seq("id", "name", "value")) - assert(cdc.exceptCols.isEmpty) + assert(cdc.includeColumns.get.map(_.name) == Seq("id", "name", "value")) + assert(cdc.excludeColumns.isEmpty) } test("CREATE FLOW AS AUTO CDC INTO - COLUMNS * EXCEPT list") { @@ -151,8 +151,8 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |COLUMNS * EXCEPT (op, ts)""".stripMargin) val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] - assert(cdc.specifiedCols.isEmpty) - assert(cdc.exceptCols.map(_.name) == Seq("op", "ts")) + assert(cdc.includeColumns.isEmpty) + assert(cdc.excludeColumns.get.map(_.name) == Seq("op", "ts")) } test("CREATE FLOW AS AUTO CDC INTO - all clauses combined") { @@ -168,7 +168,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cdc.keys.map(_.name) == Seq("key1", "key2")) assert(cdc.deleteCondition.isDefined) assert(cdc.sequenceByExpr == UnresolvedAttribute("timestamp")) - assert(cdc.specifiedCols.map(_.name) == Seq("key1", "key2", "key3", "timestamp")) + assert(cdc.includeColumns.get.map(_.name) == Seq("key1", "key2", "key3", "timestamp")) } // --------------------------------------------------------------------------- @@ -192,8 +192,8 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cmd.keys.map(_.name) == Seq("key1", "key2")) assert(cmd.deleteCondition.isEmpty) assert(cmd.sequenceByExpr == UnresolvedAttribute("timestamp")) - assert(cmd.specifiedCols.isEmpty) - assert(cmd.exceptCols.isEmpty) + assert(cmd.includeColumns.isEmpty) + 
assert(cmd.excludeColumns.isEmpty) } test("CREATE STREAMING TABLE FLOW AUTO CDC - multipart source name") { @@ -258,8 +258,8 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |COLUMNS id, name, value""".stripMargin) val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] - assert(cmd.specifiedCols.map(_.name) == Seq("id", "name", "value")) - assert(cmd.exceptCols.isEmpty) + assert(cmd.includeColumns.get.map(_.name) == Seq("id", "name", "value")) + assert(cmd.excludeColumns.isEmpty) } test("CREATE STREAMING TABLE FLOW AUTO CDC - COLUMNS * EXCEPT list") { @@ -272,8 +272,8 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |COLUMNS * EXCEPT (op, ts)""".stripMargin) val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] - assert(cmd.specifiedCols.isEmpty) - assert(cmd.exceptCols.map(_.name) == Seq("op", "ts")) + assert(cmd.includeColumns.isEmpty) + assert(cmd.excludeColumns.get.map(_.name) == Seq("op", "ts")) } test("CREATE STREAMING TABLE FLOW AUTO CDC - all clauses combined") { @@ -290,7 +290,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cmd.keys.map(_.name) == Seq("key1", "key2")) assert(cmd.deleteCondition.isDefined) assert(cmd.sequenceByExpr == UnresolvedAttribute("timestamp")) - assert(cmd.exceptCols.map(_.name) == Seq("key4")) + assert(cmd.excludeColumns.get.map(_.name) == Seq("key4")) } // --------------------------------------------------------------------------- From 0b0dba5144660930adc0464d1697ea32d20b2bfa Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Fri, 26 Jun 2026 16:58:13 +0000 Subject: [PATCH 14/17] [SPARK-57402][SQL] Expose AUTO CDC source relation as a plan child Make the AUTO CDC source relation a child of the command nodes so the analyzer resolves it through the normal plan resolution path, mirroring MERGE INTO and the sibling CreatePipelineDatasetAsSelect: - AutoCdcIntoCommand becomes a UnaryCommand whose child is source (targetTable is a 
TableIdentifier, not a plan, so source is the only plan child). - CreateStreamingTableAutoCdc becomes a BinaryCommand with left = name and right = source, matching CreatePipelineDatasetAsSelect's left = name, right = query. Co-authored-by: Isaac --- .../plans/logical/AutoCdcIntoCommand.scala | 14 ++++++++++---- .../catalyst/plans/logical/v2Commands.scala | 19 +++++++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala index 1cfcd0f1026d1..54c33825c0fd9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala @@ -25,15 +25,16 @@ import org.apache.spark.sql.catalyst.expressions.Expression * Logical plan node for an AUTO CDC INTO command, used by Spark Declarative Pipelines. * * This represents a CDC (Change Data Capture) operation that applies an ordered change event - * stream from [[sourceTable]] into [[targetTable]] using SCD Type 1 (upsert) semantics. + * stream from [[source]] into [[targetTable]] using SCD Type 1 (upsert) semantics. * * This node serves as a parse-time placeholder for a pipeline CDC definition and cannot be * executed directly. It will be interpreted by the pipeline submodule once execution support - * is added (SPARK-57402). + * is added (SPARK-57402). The [[source]] relation is exposed as the node's child so the analyzer + * resolves it through the normal plan resolution path. * * @param targetTable The target table to apply changes into. * @param source The source relation providing the change events. Always a STREAM(...) - * source (marked as a streaming read). + * source (marked as a streaming read). Exposed as the node's child. 
* @param keys Column(s) that uniquely identify a row in the target table. * @param deleteCondition An optional expression that marks a source row as a DELETE operation. * When absent, all source rows are treated as upserts. @@ -54,4 +55,9 @@ case class AutoCdcIntoCommand( sequenceByExpr: Expression, includeColumns: Option[Seq[UnresolvedAttribute]], excludeColumns: Option[Seq[UnresolvedAttribute]] -) extends LeafCommand +) extends UnaryCommand { + override def child: LogicalPlan = source + + override protected def withNewChildInternal(newChild: LogicalPlan): AutoCdcIntoCommand = + copy(source = newChild) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index c24e1e6ebe7fa..b11890131f359 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -832,13 +832,18 @@ case class CreateStreamingTable( * * The target of the CDC operation is the streaming table itself (given by [[name]]). * - * @param name The streaming table name, which also serves as the CDC target. + * [[name]] and [[source]] are exposed as the node's children (left and right respectively) so the + * analyzer resolves them through the normal plan resolution path, mirroring how + * [[CreatePipelineDatasetAsSelect]] exposes its name and query. + * + * @param name The streaming table name, which also serves as the CDC target. Exposed as + * the node's left child. * @param columns User-specified columns for the streaming table. * @param partitioning Column-based partitioning for the streaming table. * @param tableSpec Additional table specs. * @param ifNotExists Whether the table should only be created if it doesn't already exist. * @param source The source relation providing the change events. Always a STREAM(...) 
- * source (marked as a streaming read). + * source (marked as a streaming read). Exposed as the node's right child. * @param keys Column(s) that uniquely identify a row in the target table. * @param deleteCondition An optional expression that marks a source row as a DELETE operation. * @param sequenceByExpr Expression that orders CDC events to resolve out-of-order arrivals. @@ -859,11 +864,13 @@ case class CreateStreamingTableAutoCdc( sequenceByExpr: Expression, includeColumns: Option[Seq[UnresolvedAttribute]], excludeColumns: Option[Seq[UnresolvedAttribute]] -) extends UnaryCommand with CreatePipelineDataset { - override def child: LogicalPlan = name +) extends BinaryCommand with CreatePipelineDataset { + override def left: LogicalPlan = name + override def right: LogicalPlan = source - override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = - copy(name = newChild) + override protected def withNewChildrenInternal( + newLeft: LogicalPlan, newRight: LogicalPlan): CreateStreamingTableAutoCdc = + copy(name = newLeft, source = newRight) } /** From 0ddd1f1a9a58d5c5401140d3c8907fc92dd6515b Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Fri, 26 Jun 2026 17:05:07 +0000 Subject: [PATCH 15/17] [SPARK-57402][SQL] Rename applyChanges local to autoCdcInto in visitCreateFlowAutoCdc Rename the local variable holding the AutoCdcIntoCommand from applyChanges to autoCdcInto to match the command type. No behavior change. 
Co-authored-by: Isaac --- .../scala/org/apache/spark/sql/execution/SparkSqlParser.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index d62503f681b34..2eb2c7e914413 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -1576,10 +1576,10 @@ class SparkSqlAstBuilder extends AstBuilder { val flowHeaderCtx = ctx.createPipelineFlowHeader() val ident = withIdentClause(flowHeaderCtx.flowName, UnresolvedIdentifier(_)) val commentOpt = Option(flowHeaderCtx.commentSpec()).map(visitCommentSpec) - val applyChanges = buildAutoCdcIntoCommand(ctx.autoCdcCommand()) + val autoCdcInto = buildAutoCdcIntoCommand(ctx.autoCdcCommand()) CreateFlowCommand( name = ident, - flowOperation = applyChanges, + flowOperation = autoCdcInto, comment = commentOpt ) } From eb2f9d4740fa096eb0c5adef404f903920b206d1 Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Fri, 26 Jun 2026 17:27:08 +0000 Subject: [PATCH 16/17] [SPARK-57402][SQL] Use named parameters when constructing AUTO CDC case classes Convert the positional AutoCdcIntoCommand and AutoCdcParams constructions in AstBuilder.parseAutoCdcParams/buildAutoCdcIntoCommand to named parameters. No behavior change. 
Co-authored-by: Isaac --- .../sql/catalyst/parser/AstBuilder.scala | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index c06a0693665d2..56da6ff1e49ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1367,13 +1367,13 @@ class AstBuilder extends DataTypeAstBuilder val target = visitMultipartIdentifier(ctx.target).asTableIdentifier val params = parseAutoCdcParams(ctx.autoCdcParameters()) AutoCdcIntoCommand( - target, - params.source, - params.keys, - params.deleteCondition, - params.sequencing, - params.includeColumns, - params.excludeColumns) + targetTable = target, + source = params.source, + keys = params.keys, + deleteCondition = params.deleteCondition, + sequenceByExpr = params.sequencing, + includeColumns = params.includeColumns, + excludeColumns = params.excludeColumns) } protected def parseAutoCdcParams(params: AutoCdcParametersContext): AutoCdcParams = @@ -1394,7 +1394,13 @@ class AstBuilder extends DataTypeAstBuilder visitIdentifierSeq(c.exceptCols).map(UnresolvedAttribute.quoted) } - AutoCdcParams(source, keys, deleteCondition, sequencing, includeColumns, excludeColumns) + AutoCdcParams( + source = source, + keys = keys, + deleteCondition = deleteCondition, + sequencing = sequencing, + includeColumns = includeColumns, + excludeColumns = excludeColumns) } /** From 986a8086f72922da9c79616fd9cec450bf562ee2 Mon Sep 17 00:00:00 2001 From: andreas-neumann_data Date: Tue, 30 Jun 2026 06:46:55 +0000 Subject: [PATCH 17/17] [SPARK-57402][SQL] Parenthesize AUTO CDC COLUMNS list and support multi-part targets Address review feedback on AUTO CDC SQL parsing: - Wrap the COLUMNS include list in parentheses (COLUMNS (a, b, c)) so it 
matches the COLUMNS * EXCEPT (...) branch and the established AUTO CDC / APPLY CHANGES syntax, which parenthesizes both forms. - Model the AUTO CDC INTO target as an UnresolvedIdentifier (exposed as a plan child) instead of a 1-/2-part TableIdentifier, so catalog-qualified three-part target names parse and the form is consistent with the CREATE STREAMING TABLE ... FLOW AUTO CDC form. Co-authored-by: Isaac --- .../sql/catalyst/parser/SqlBaseParser.g4 | 2 +- .../sql/catalyst/parser/AstBuilder.scala | 2 +- .../plans/logical/AutoCdcIntoCommand.scala | 23 +++++++++------- .../command/v2/AutoCdcParserSuite.scala | 26 ++++++++++++++----- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 382907f718791..193e2143b2ab7 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -782,7 +782,7 @@ autoCdcSequenceByClause autoCdcColumnsClause : COLUMNS ( - columns=identifierSeq | + LEFT_PAREN columns=identifierSeq RIGHT_PAREN | ASTERISK EXCEPT LEFT_PAREN exceptCols=identifierSeq RIGHT_PAREN) ; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 56da6ff1e49ca..1901a44fe479b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1364,7 +1364,7 @@ class AstBuilder extends DataTypeAstBuilder protected def buildAutoCdcIntoCommand(ctx: AutoCdcCommandContext): AutoCdcIntoCommand = withOrigin(ctx) { - val target = visitMultipartIdentifier(ctx.target).asTableIdentifier + val target = 
UnresolvedIdentifier(visitMultipartIdentifier(ctx.target)) val params = parseAutoCdcParams(ctx.autoCdcParameters()) AutoCdcIntoCommand( targetTable = target, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala index 54c33825c0fd9..29016f7023679 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AutoCdcIntoCommand.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.Expression @@ -29,12 +28,14 @@ import org.apache.spark.sql.catalyst.expressions.Expression * * This node serves as a parse-time placeholder for a pipeline CDC definition and cannot be * executed directly. It will be interpreted by the pipeline submodule once execution support - * is added (SPARK-57402). The [[source]] relation is exposed as the node's child so the analyzer - * resolves it through the normal plan resolution path. + * is added (SPARK-57402). The [[targetTable]] and [[source]] relations are exposed as the node's + * children (left and right respectively) so the analyzer resolves them through the normal plan + * resolution path. * - * @param targetTable The target table to apply changes into. + * @param targetTable The target table to apply changes into, as an `UnresolvedIdentifier`. + * Exposed as the node's left child. * @param source The source relation providing the change events. Always a STREAM(...) - * source (marked as a streaming read). Exposed as the node's child. + * source (marked as a streaming read). Exposed as the node's right child. 
* @param keys Column(s) that uniquely identify a row in the target table. * @param deleteCondition An optional expression that marks a source row as a DELETE operation. * When absent, all source rows are treated as upserts. @@ -48,16 +49,18 @@ import org.apache.spark.sql.catalyst.expressions.Expression * exclusive with [[includeColumns]]. */ case class AutoCdcIntoCommand( - targetTable: TableIdentifier, + targetTable: LogicalPlan, source: LogicalPlan, keys: Seq[UnresolvedAttribute], deleteCondition: Option[Expression], sequenceByExpr: Expression, includeColumns: Option[Seq[UnresolvedAttribute]], excludeColumns: Option[Seq[UnresolvedAttribute]] -) extends UnaryCommand { - override def child: LogicalPlan = source +) extends BinaryCommand { + override def left: LogicalPlan = targetTable + override def right: LogicalPlan = source - override protected def withNewChildInternal(newChild: LogicalPlan): AutoCdcIntoCommand = - copy(source = newChild) + override protected def withNewChildrenInternal( + newLeft: LogicalPlan, newRight: LogicalPlan): AutoCdcIntoCommand = + copy(targetTable = newLeft, source = newRight) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala index eee19dfefc2ee..5a20ab7a4871a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AutoCdcParserSuite.scala @@ -57,7 +57,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { assert(cmd.comment.isEmpty) val cdc = cmd.flowOperation.asInstanceOf[AutoCdcIntoCommand] - assert(cdc.targetTable.table == "target") + assert(cdc.targetTable.asInstanceOf[UnresolvedIdentifier].nameParts == Seq("target")) val source = cdc.source.asInstanceOf[UnresolvedRelation] assert(source.multipartIdentifier == Seq("source")) 
assert(source.isStreaming) @@ -112,8 +112,20 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |SEQUENCE BY ts""".stripMargin) val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] - assert(cdc.targetTable.database == Some("myschema")) - assert(cdc.targetTable.table == "mytable") + assert(cdc.targetTable.asInstanceOf[UnresolvedIdentifier].nameParts == + Seq("myschema", "mytable")) + } + + test("CREATE FLOW AS AUTO CDC INTO - three-part target table name") { + val plan = parser.parsePlan( + """CREATE FLOW f AS AUTO CDC INTO mycat.myschema.mytable + |FROM STREAM(source) + |KEYS (k) + |SEQUENCE BY ts""".stripMargin) + + val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] + assert(cdc.targetTable.asInstanceOf[UnresolvedIdentifier].nameParts == + Seq("mycat", "myschema", "mytable")) } test("CREATE FLOW AS AUTO CDC INTO - APPLY AS DELETE WHEN") { @@ -135,7 +147,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts - |COLUMNS id, name, value""".stripMargin) + |COLUMNS (id, name, value)""".stripMargin) val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] assert(cdc.includeColumns.get.map(_.name) == Seq("id", "name", "value")) @@ -162,7 +174,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |KEYS (key1, key2) |APPLY AS DELETE WHEN key3 = 3 |SEQUENCE BY timestamp - |COLUMNS key1, key2, key3, timestamp""".stripMargin) + |COLUMNS (key1, key2, key3, timestamp)""".stripMargin) val cdc = plan.asInstanceOf[CreateFlowCommand].flowOperation.asInstanceOf[AutoCdcIntoCommand] assert(cdc.keys.map(_.name) == Seq("key1", "key2")) @@ -255,7 +267,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts - |COLUMNS id, name, value""".stripMargin) + |COLUMNS (id, name, value)""".stripMargin) 
val cmd = plan.asInstanceOf[CreateStreamingTableAutoCdc] assert(cmd.includeColumns.get.map(_.name) == Seq("id", "name", "value")) @@ -443,7 +455,7 @@ class AutoCdcParserSuite extends CommandSuiteBase with AnalysisTest { |FROM STREAM(source) |KEYS (id) |SEQUENCE BY ts - |COLUMNS a.name""".stripMargin) + |COLUMNS (a.name)""".stripMargin) }, condition = "PARSE_SYNTAX_ERROR", sqlState = "42601",