From 06a2406771b0c1f025b571dd2b653304c7ad224f Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 30 Jun 2026 07:54:10 +0000 Subject: [PATCH 1/3] [SPARK-57777][SQL][CONNECT] Distinguish explicit collation when rendering string literals to SQL ### What changes were proposed in this pull request? Make `Literal.sql` render a `collate` clause for string literals with an explicit collation (including explicit `UTF8_BINARY`), while keeping the default `StringType` clause-less. Also remove the `PlanGenerationTestSuite` normalization that stamped `UTF8_BINARY` onto empty-collation proto string types, and regenerate the affected Spark Connect golden files so they reflect the real wire format. ### Why are the changes needed? A default `StringType` (undetermined, eligible to inherit a default collation) and an explicitly-`UTF8_BINARY` `StringType` are semantically distinct; rendering the latter without a `collate` clause is lossy on re-parse. The test shim also made the Connect golden files misrepresent the protocol. ### Does this PR introduce any user-facing change? Yes. `Literal.sql` now appends ` collate UTF8_BINARY` for explicit-`UTF8_BINARY` string literals; default literals and other collations are unchanged. ### How was this patch tested? LiteralExpressionSuite, PlanGenerationTestSuite, ProtoToParsedPlanTestSuite, collation SQLQueryTestSuite inputs, and v1/v2 ShowCreateTableSuite all pass; goldens regenerated. ### Was this patch authored or co-authored using generative AI tooling? 
Generated-by: Claude Code (Opus 4.8) --- .../sql/catalyst/expressions/literals.scala | 13 ++++++++----- .../spark/sql/PlanGenerationTestSuite.scala | 12 ------------ .../explain-results/function_typedLit.explain | 2 +- .../query-tests/queries/csv_from_dataset.json | 1 - .../queries/csv_from_dataset.proto.bin | Bin 169 -> 156 bytes .../queries/function_lit_array.json | 2 -- .../queries/function_lit_array.proto.bin | Bin 5346 -> 5320 bytes .../queries/function_typedLit.json | 16 ---------------- .../queries/function_typedLit.proto.bin | Bin 11681 -> 11471 bytes .../queries/json_from_dataset.json | 1 - .../queries/json_from_dataset.proto.bin | Bin 180 -> 167 bytes 11 files changed, 9 insertions(+), 38 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index b1ed1a494be81..edcaf4cf1fa2b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -592,13 +592,16 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { override def sql: String = (value, dataType) match { case (_, NullType | _: ArrayType | _: MapType | _: StructType) if value == null => "NULL" case _ if value == null => s"CAST(NULL AS ${dataType.sql})" - case (v: UTF8String, StringType) => - // Escapes all backslashes and single quotes. - "'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + "'" case (v: UTF8String, st: StringType) => + // Only render a `collate` clause for an explicit collation (including an explicit + // `UTF8_BINARY`). The default `StringType` (the case object) has no explicit collation, so + // it must render without a clause and stay distinguishable from an explicitly-collated + // string on re-parse (e.g. 
so that default-collation resolution does not treat an + // explicitly-collated literal as eligible for inheriting a default collation). + val collateClause = + if (DataTypeUtils.isDefaultStringCharOrVarcharType(st)) "" else s" collate ${st.collationName}" // Escapes all backslashes and single quotes. - "'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + - "'" + st.typeName.substring(6) + "'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + "'" + collateClause case (v: Byte, ByteType) => s"${v}Y" case (v: Short, ShortType) => s"${v}S" case (v: Long, LongType) => s"${v}L" diff --git a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index a74b25459bad2..36b2cab4d586a 100644 --- a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -198,7 +198,6 @@ class PlanGenerationTestSuite extends ConnectFunSuite with Logging { /** * Normalize proto messages for stable comparison: * - Trim JVM origin fields (lines, stack traces, anonymous function names) - * - Populate default StringType collation when missing (UTF8_BINARY) */ private def normalizeProtoForComparison[T <: protobuf.Message](message: T): T = { def trim(builder: proto.JvmOrigin.Builder): Unit = { @@ -221,17 +220,6 @@ class PlanGenerationTestSuite extends ConnectFunSuite with Logging { val builder = message.toBuilder builder match { - // For comparison only, we add UTF8_BINARY when StringType collation is missing - // to ensure deterministic plan equality across environments. 
- case dt: proto.DataType.Builder if dt.getKindCase == proto.DataType.KindCase.STRING => - val sb = dt.getStringBuilder - if (sb.getCollation.isEmpty) { - val defaultCollationName = - CollationFactory - .fetchCollation(CollationFactory.UTF8_BINARY_COLLATION_ID) - .collationName - sb.setCollation(defaultCollationName) - } case exp: proto.Relation.Builder if exp.hasCommon && exp.getCommon.hasOrigin && exp.getCommon.getOrigin.hasJvmOrigin => trim(exp.getCommonBuilder.getOriginBuilder.getJvmOriginBuilder) diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain index 5a827ca88ee7e..3c878be34143a 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain @@ -1,2 +1,2 @@ -Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! 
AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, INTERVAL '0-0' YEAR TO MONTH AS INTERVAL '0-0' YEAR TO MONTH#0, 23:59:59.999999999 AS TIME '23:59:59.999999999'#0, 2 months 20 days 0.0001 seconds AS INTERVAL '2 months 20 days 0.0001 seconds'#0, [18545,1677155519808000,12345000,1677184560000000,19411,200000000,0,86399999999999,2 months 20 days 0.0001 seconds] AS NAMED_STRUCT('_1', DATE '2020-10-10', '_2', TIMESTAMP '2023-02-23 04:31:59.808', '_3', TIMESTAMP '1969-12-31 16:00:12.345', '_4', TIMESTAMP_NTZ '2023-02-23 20:36:00', '_5', DATE '2023-02-23', '_6', INTERVAL '0 00:03:20' DAY TO SECOND, '_7', INTERVAL '0-0' YEAR TO MONTH, '_8', TIME '23:59:59.999999999', '_9', INTERVAL '2 months 20 days 0.0001 seconds')#0, 1 AS 1#0, [1,2,3] AS ARRAY(1, 2, 3)#0, [null,null] AS ARRAY(CAST(NULL AS INT), CAST(NULL AS INT))#0, [null,null,[1,a],[2,null]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'), NAMED_STRUCT('_1', 2, '_2', CAST(NULL AS STRING COLLATE UTF8_BINARY)))#0, [null,null,[1,a]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'))#0, [1,2,3] AS ARRAY(1, 2, 3)#0, map(keys: [a,b], values: [1,2]) AS MAP('a', 1, 'b', 2)#0, map(keys: [a,b], values: [null,null]) AS MAP('a', CAST(NULL AS INT), 'b', CAST(NULL AS INT))#0, [a,2,1.0] AS NAMED_STRUCT('_1', 'a', '_2', 2, '_3', 1.0D)#0, null AS NULL#0, [1] AS ARRAY(1)#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, 
map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, [[1,2,3],[4,5,6],[7,8,9]] AS ARRAY(ARRAY(1, 2, 3), ARRAY(4, 5, 6), ARRAY(7, 8, 9))#0, [[1,2,[3,4]],[5,6,[]]] AS ARRAY(NAMED_STRUCT('_1', 1, '_2', '2', '_3', ARRAY('3', '4')), NAMED_STRUCT('_1', 5, '_2', '6', '_3', ARRAY()))#0, [[1,2],[3,4],[5,6]] AS ARRAY(NAMED_STRUCT('a', 1, 'b', '2'), NAMED_STRUCT('a', 3, 'b', '4'), NAMED_STRUCT('a', 5, 'b', '6'))#0, [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4],keys: [a,b], values: [5,6]] AS ARRAY(MAP('a', 1, 'b', 2), MAP('a', 3, 'b', 4), MAP('a', 5, 'b', 6))#0, [keys: [a,b], values: [[1,2],[3,4]],keys: [a,b], values: [[5,6],[7,8]],keys: [a,b], values: [[],[]]] AS ARRAY(MAP('a', ARRAY('1', '2'), 'b', ARRAY('3', '4')), MAP('a', ARRAY('5', '6'), 'b', ARRAY('7', '8')), MAP('a', ARRAY(), 'b', ARRAY()))#0, map(keys: [1,2], values: [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4]]) AS MAP(1, MAP('a', 1, 'b', 2), 2, MAP('a', 3, 'b', 4))#0, [[1,2,3],keys: [a,b], values: [1,2],[a,keys: [1,2], values: [a,b]]] AS NAMED_STRUCT('_1', ARRAY(1, 2, 3), '_2', MAP('a', 1, 'b', 2), '_3', NAMED_STRUCT('_1', 'a', '_2', MAP(1, 'a', 2, 'b')))#0] +Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! 
AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, INTERVAL '0-0' YEAR TO MONTH AS INTERVAL '0-0' YEAR TO MONTH#0, 23:59:59.999999999 AS TIME '23:59:59.999999999'#0, 2 months 20 days 0.0001 seconds AS INTERVAL '2 months 20 days 0.0001 seconds'#0, [18545,1677155519808000,12345000,1677184560000000,19411,200000000,0,86399999999999,2 months 20 days 0.0001 seconds] AS NAMED_STRUCT('_1', DATE '2020-10-10', '_2', TIMESTAMP '2023-02-23 04:31:59.808', '_3', TIMESTAMP '1969-12-31 16:00:12.345', '_4', TIMESTAMP_NTZ '2023-02-23 20:36:00', '_5', DATE '2023-02-23', '_6', INTERVAL '0 00:03:20' DAY TO SECOND, '_7', INTERVAL '0-0' YEAR TO MONTH, '_8', TIME '23:59:59.999999999', '_9', INTERVAL '2 months 20 days 0.0001 seconds')#0, 1 AS 1#0, [1,2,3] AS ARRAY(1, 2, 3)#0, [null,null] AS ARRAY(CAST(NULL AS INT), CAST(NULL AS INT))#0, [null,null,[1,a],[2,null]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'), NAMED_STRUCT('_1', 2, '_2', CAST(NULL AS STRING)))#0, [null,null,[1,a]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'))#0, [1,2,3] AS ARRAY(1, 2, 3)#0, map(keys: [a,b], values: [1,2]) AS MAP('a', 1, 'b', 2)#0, map(keys: [a,b], values: [null,null]) AS MAP('a', CAST(NULL AS INT), 'b', CAST(NULL AS INT))#0, [a,2,1.0] AS NAMED_STRUCT('_1', 'a', '_2', 2, '_3', 1.0D)#0, null AS NULL#0, [1] AS ARRAY(1)#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: 
[null]) AS MAP(1, CAST(NULL AS INT))#0, [[1,2,3],[4,5,6],[7,8,9]] AS ARRAY(ARRAY(1, 2, 3), ARRAY(4, 5, 6), ARRAY(7, 8, 9))#0, [[1,2,[3,4]],[5,6,[]]] AS ARRAY(NAMED_STRUCT('_1', 1, '_2', '2', '_3', ARRAY('3', '4')), NAMED_STRUCT('_1', 5, '_2', '6', '_3', ARRAY()))#0, [[1,2],[3,4],[5,6]] AS ARRAY(NAMED_STRUCT('a', 1, 'b', '2'), NAMED_STRUCT('a', 3, 'b', '4'), NAMED_STRUCT('a', 5, 'b', '6'))#0, [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4],keys: [a,b], values: [5,6]] AS ARRAY(MAP('a', 1, 'b', 2), MAP('a', 3, 'b', 4), MAP('a', 5, 'b', 6))#0, [keys: [a,b], values: [[1,2],[3,4]],keys: [a,b], values: [[5,6],[7,8]],keys: [a,b], values: [[],[]]] AS ARRAY(MAP('a', ARRAY('1', '2'), 'b', ARRAY('3', '4')), MAP('a', ARRAY('5', '6'), 'b', ARRAY('7', '8')), MAP('a', ARRAY(), 'b', ARRAY()))#0, map(keys: [1,2], values: [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4]]) AS MAP(1, MAP('a', 1, 'b', 2), 2, MAP('a', 3, 'b', 4))#0, [[1,2,3],keys: [a,b], values: [1,2],[a,keys: [1,2], values: [a,b]]] AS NAMED_STRUCT('_1', ARRAY(1, 2, 3), '_2', MAP('a', 1, 'b', 2), '_3', NAMED_STRUCT('_1', 'a', '_2', MAP(1, 'a', 2, 'b')))#0] +- LocalRelation <empty>, [id#0L, a#0, b#0] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json b/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json index e4b31258f984a..d34fcb6f758e6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json @@ -18,7 +18,6 @@ "name": "c1", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true diff --git a/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin index c39243a10a8e4b06c04e2e3505669e780d4e9cd9..5f8bd50685ca82d7a3599e3772fcc110b647f763 100644 GIT binary patch delta 33 
pcmZ3flDUgh?yqgZ9Az(smZM8{GVBpm=oskzSc$;V;Y z7Pw9E^jxl_U=~daGJnBF0LK};q>20Nn9R*LSEXmAmR~m?6MguGDM_@yowmBg?3a55#4pgtQt6d;FB1sx_V zd1(ITEE%nc$ANr%RHB1wrzOhfg?`8AScC@GtPzPE4ONixfINYBz^E*ilVe@63c5)_ z@tR*b`Rcnq`YKja?>07JW1WKT!UQDLcygKnQhK#FHT8~N)iL9SHECn~AEQt`z3waS QlpUPv6)|7{!t$N{246X+;{X5v diff --git a/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json b/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json index f29245374e6e2..d6f992d09a5c2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json @@ -18,7 +18,6 @@ "name": "c1", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true diff --git a/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin index 1ce2e676ce30a8df106a72961379a5ed40ef3705..0fce9d9ff8c7e54e16ff63423b6cf2ef7f232ea6 100644 GIT binary patch delta 33 pcmdnOxSWxli%Ed-6XX1e?3sMhs~9D?xVV^-4TYGZ7$$aW0sw-}2WS8Q delta 46 zcmZ3^xP_6Oi%Ed-6XTkR?3vOUs~A Date: Tue, 30 Jun 2026 09:24:40 +0000 Subject: [PATCH 2/3] Add LiteralExpressionSuite assertions for explicit-collation string literal SQL Address review nit (uros-b): pin the Literal.sql behavior directly instead of only via regenerated Connect goldens. 
--- .../expressions/LiteralExpressionSuite.scala | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index b69b29f789147..58fff9bad849c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -842,4 +842,17 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { assert(lit.dataType === GeometryType(0)) assert(lit.value.isInstanceOf[BinaryView]) } + + test("SPARK-57777: render explicit collation in string literal SQL") { + // The default `StringType` (case object) has no explicit collation, so it renders + // without a `collate` clause. + assert(Literal(UTF8String.fromString("x"), StringType).sql === "'x'") + // A non-default (non-singleton) `UTF8_BINARY` `StringType` is an explicit collation, so it + // renders the clause and stays distinguishable from the default on re-parse. + assert(Literal(UTF8String.fromString("x"), StringType("UTF8_BINARY")).sql === + "'x' collate UTF8_BINARY") + // Other explicit collations are rendered as before. 
+ assert(Literal(UTF8String.fromString("x"), StringType("UTF8_LCASE")).sql === + "'x' collate UTF8_LCASE") + } } From 05ac834498dd641d96966ec6137bb836474beae8 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 30 Jun 2026 12:36:26 +0000 Subject: [PATCH 3/3] [SPARK-57777][SQL][CONNECT][FOLLOWUP] Fix Scalastyle line length in literals.scala Co-authored-by: Isaac --- .../apache/spark/sql/catalyst/expressions/literals.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index edcaf4cf1fa2b..cea16999c4379 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -599,7 +599,11 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { // string on re-parse (e.g. so that default-collation resolution does not treat an // explicitly-collated literal as eligible for inheriting a default collation). val collateClause = - if (DataTypeUtils.isDefaultStringCharOrVarcharType(st)) "" else s" collate ${st.collationName}" + if (DataTypeUtils.isDefaultStringCharOrVarcharType(st)) { + "" + } else { + s" collate ${st.collationName}" + } // Escapes all backslashes and single quotes. "'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + "'" + collateClause case (v: Byte, ByteType) => s"${v}Y"