diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index b1ed1a494be81..cea16999c4379 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -592,13 +592,20 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { override def sql: String = (value, dataType) match { case (_, NullType | _: ArrayType | _: MapType | _: StructType) if value == null => "NULL" case _ if value == null => s"CAST(NULL AS ${dataType.sql})" - case (v: UTF8String, StringType) => - // Escapes all backslashes and single quotes. - "'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + "'" case (v: UTF8String, st: StringType) => + // Only render a `collate` clause for an explicit collation (including an explicit + // `UTF8_BINARY`). The default `StringType` (the case object) has no explicit collation, so + // it must render without a clause and stay distinguishable from an explicitly-collated + // string on re-parse (e.g. so that default-collation resolution does not treat an + // explicitly-collated literal as eligible for inheriting a default collation). + val collateClause = + if (DataTypeUtils.isDefaultStringCharOrVarcharType(st)) { + "" + } else { + s" collate ${st.collationName}" + } // Escapes all backslashes and single quotes. - "'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + - "'" + st.typeName.substring(6) + "'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + "'" + collateClause case (v: Byte, ByteType) => s"${v}Y" case (v: Short, ShortType) => s"${v}S" case (v: Long, LongType) => s"${v}L" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index b69b29f789147..58fff9bad849c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -842,4 +842,17 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { assert(lit.dataType === GeometryType(0)) assert(lit.value.isInstanceOf[BinaryView]) } + + test("SPARK-57777: render explicit collation in string literal SQL") { + // The default `StringType` (case object) has no explicit collation, so it renders + // without a `collate` clause. + assert(Literal(UTF8String.fromString("x"), StringType).sql === "'x'") + // A non-default (non-singleton) `UTF8_BINARY` `StringType` is an explicit collation, so it + // renders the clause and stays distinguishable from the default on re-parse. + assert(Literal(UTF8String.fromString("x"), StringType("UTF8_BINARY")).sql === + "'x' collate UTF8_BINARY") + // Other explicit collations are rendered as before. + assert(Literal(UTF8String.fromString("x"), StringType("UTF8_LCASE")).sql === + "'x' collate UTF8_LCASE") + } } diff --git a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index a74b25459bad2..36b2cab4d586a 100644 --- a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -198,7 +198,6 @@ class PlanGenerationTestSuite extends ConnectFunSuite with Logging { /** * Normalize proto messages for stable comparison: * - Trim JVM origin fields (lines, stack traces, anonymous function names) - * - Populate default StringType collation when missing (UTF8_BINARY) */ private def normalizeProtoForComparison[T <: protobuf.Message](message: T): T = { def trim(builder: proto.JvmOrigin.Builder): Unit = { @@ -221,17 +220,6 @@ class PlanGenerationTestSuite extends ConnectFunSuite with Logging { val builder = message.toBuilder builder match { - // For comparison only, we add UTF8_BINARY when StringType collation is missing - // to ensure deterministic plan equality across environments. - case dt: proto.DataType.Builder if dt.getKindCase == proto.DataType.KindCase.STRING => - val sb = dt.getStringBuilder - if (sb.getCollation.isEmpty) { - val defaultCollationName = - CollationFactory - .fetchCollation(CollationFactory.UTF8_BINARY_COLLATION_ID) - .collationName - sb.setCollation(defaultCollationName) - } case exp: proto.Relation.Builder if exp.hasCommon && exp.getCommon.hasOrigin && exp.getCommon.getOrigin.hasJvmOrigin => trim(exp.getCommonBuilder.getOriginBuilder.getJvmOriginBuilder) diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain index 5a827ca88ee7e..3c878be34143a 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_typedLit.explain @@ -1,2 +1,2 @@ -Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, INTERVAL '0-0' YEAR TO MONTH AS INTERVAL '0-0' YEAR TO MONTH#0, 23:59:59.999999999 AS TIME '23:59:59.999999999'#0, 2 months 20 days 0.0001 seconds AS INTERVAL '2 months 20 days 0.0001 seconds'#0, [18545,1677155519808000,12345000,1677184560000000,19411,200000000,0,86399999999999,2 months 20 days 0.0001 seconds] AS NAMED_STRUCT('_1', DATE '2020-10-10', '_2', TIMESTAMP '2023-02-23 04:31:59.808', '_3', TIMESTAMP '1969-12-31 16:00:12.345', '_4', TIMESTAMP_NTZ '2023-02-23 20:36:00', '_5', DATE '2023-02-23', '_6', INTERVAL '0 00:03:20' DAY TO SECOND, '_7', INTERVAL '0-0' YEAR TO MONTH, '_8', TIME '23:59:59.999999999', '_9', INTERVAL '2 months 20 days 0.0001 seconds')#0, 1 AS 1#0, [1,2,3] AS ARRAY(1, 2, 3)#0, [null,null] AS ARRAY(CAST(NULL AS INT), CAST(NULL AS INT))#0, [null,null,[1,a],[2,null]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'), NAMED_STRUCT('_1', 2, '_2', CAST(NULL AS STRING COLLATE UTF8_BINARY)))#0, [null,null,[1,a]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'))#0, [1,2,3] AS ARRAY(1, 2, 3)#0, map(keys: [a,b], values: [1,2]) AS MAP('a', 1, 'b', 2)#0, map(keys: [a,b], values: [null,null]) AS MAP('a', CAST(NULL AS INT), 'b', CAST(NULL AS INT))#0, [a,2,1.0] AS NAMED_STRUCT('_1', 'a', '_2', 2, '_3', 1.0D)#0, null AS NULL#0, [1] AS ARRAY(1)#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, [[1,2,3],[4,5,6],[7,8,9]] AS ARRAY(ARRAY(1, 2, 3), ARRAY(4, 5, 6), ARRAY(7, 8, 9))#0, [[1,2,[3,4]],[5,6,[]]] AS ARRAY(NAMED_STRUCT('_1', 1, '_2', '2', '_3', ARRAY('3', '4')), NAMED_STRUCT('_1', 5, '_2', '6', '_3', ARRAY()))#0, [[1,2],[3,4],[5,6]] AS ARRAY(NAMED_STRUCT('a', 1, 'b', '2'), NAMED_STRUCT('a', 3, 'b', '4'), NAMED_STRUCT('a', 5, 'b', '6'))#0, [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4],keys: [a,b], values: [5,6]] AS ARRAY(MAP('a', 1, 'b', 2), MAP('a', 3, 'b', 4), MAP('a', 5, 'b', 6))#0, [keys: [a,b], values: [[1,2],[3,4]],keys: [a,b], values: [[5,6],[7,8]],keys: [a,b], values: [[],[]]] AS ARRAY(MAP('a', ARRAY('1', '2'), 'b', ARRAY('3', '4')), MAP('a', ARRAY('5', '6'), 'b', ARRAY('7', '8')), MAP('a', ARRAY(), 'b', ARRAY()))#0, map(keys: [1,2], values: [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4]]) AS MAP(1, MAP('a', 1, 'b', 2), 2, MAP('a', 3, 'b', 4))#0, [[1,2,3],keys: [a,b], values: [1,2],[a,keys: [1,2], values: [a,b]]] AS NAMED_STRUCT('_1', ARRAY(1, 2, 3), '_2', MAP('a', 1, 'b', 2), '_3', NAMED_STRUCT('_1', 'a', '_2', MAP(1, 'a', 2, 'b')))#0] +Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, INTERVAL '0-0' YEAR TO MONTH AS INTERVAL '0-0' YEAR TO MONTH#0, 23:59:59.999999999 AS TIME '23:59:59.999999999'#0, 2 months 20 days 0.0001 seconds AS INTERVAL '2 months 20 days 0.0001 seconds'#0, [18545,1677155519808000,12345000,1677184560000000,19411,200000000,0,86399999999999,2 months 20 days 0.0001 seconds] AS NAMED_STRUCT('_1', DATE '2020-10-10', '_2', TIMESTAMP '2023-02-23 04:31:59.808', '_3', TIMESTAMP '1969-12-31 16:00:12.345', '_4', TIMESTAMP_NTZ '2023-02-23 20:36:00', '_5', DATE '2023-02-23', '_6', INTERVAL '0 00:03:20' DAY TO SECOND, '_7', INTERVAL '0-0' YEAR TO MONTH, '_8', TIME '23:59:59.999999999', '_9', INTERVAL '2 months 20 days 0.0001 seconds')#0, 1 AS 1#0, [1,2,3] AS ARRAY(1, 2, 3)#0, [null,null] AS ARRAY(CAST(NULL AS INT), CAST(NULL AS INT))#0, [null,null,[1,a],[2,null]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'), NAMED_STRUCT('_1', 2, '_2', CAST(NULL AS STRING)))#0, [null,null,[1,a]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'))#0, [1,2,3] AS ARRAY(1, 2, 3)#0, map(keys: [a,b], values: [1,2]) AS MAP('a', 1, 'b', 2)#0, map(keys: [a,b], values: [null,null]) AS MAP('a', CAST(NULL AS INT), 'b', CAST(NULL AS INT))#0, [a,2,1.0] AS NAMED_STRUCT('_1', 'a', '_2', 2, '_3', 1.0D)#0, null AS NULL#0, [1] AS ARRAY(1)#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, [[1,2,3],[4,5,6],[7,8,9]] AS ARRAY(ARRAY(1, 2, 3), ARRAY(4, 5, 6), ARRAY(7, 8, 9))#0, [[1,2,[3,4]],[5,6,[]]] AS ARRAY(NAMED_STRUCT('_1', 1, '_2', '2', '_3', ARRAY('3', '4')), NAMED_STRUCT('_1', 5, '_2', '6', '_3', ARRAY()))#0, [[1,2],[3,4],[5,6]] AS ARRAY(NAMED_STRUCT('a', 1, 'b', '2'), NAMED_STRUCT('a', 3, 'b', '4'), NAMED_STRUCT('a', 5, 'b', '6'))#0, [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4],keys: [a,b], values: [5,6]] AS ARRAY(MAP('a', 1, 'b', 2), MAP('a', 3, 'b', 4), MAP('a', 5, 'b', 6))#0, [keys: [a,b], values: [[1,2],[3,4]],keys: [a,b], values: [[5,6],[7,8]],keys: [a,b], values: [[],[]]] AS ARRAY(MAP('a', ARRAY('1', '2'), 'b', ARRAY('3', '4')), MAP('a', ARRAY('5', '6'), 'b', ARRAY('7', '8')), MAP('a', ARRAY(), 'b', ARRAY()))#0, map(keys: [1,2], values: [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4]]) AS MAP(1, MAP('a', 1, 'b', 2), 2, MAP('a', 3, 'b', 4))#0, [[1,2,3],keys: [a,b], values: [1,2],[a,keys: [1,2], values: [a,b]]] AS NAMED_STRUCT('_1', ARRAY(1, 2, 3), '_2', MAP('a', 1, 'b', 2), '_3', NAMED_STRUCT('_1', 'a', '_2', MAP(1, 'a', 2, 'b')))#0] +- LocalRelation , [id#0L, a#0, b#0] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json b/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json index e4b31258f984a..d34fcb6f758e6 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json @@ -18,7 +18,6 @@ "name": "c1", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true diff --git a/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin index c39243a10a8e4..5f8bd50685ca8 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lit_array.json b/sql/connect/common/src/test/resources/query-tests/queries/function_lit_array.json index 9b77a8bd9b421..9e5227af86851 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_lit_array.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_lit_array.json @@ -521,7 +521,6 @@ "array": { "elementType": { "string": { - "collation": "UTF8_BINARY" } }, "containsNull": true @@ -579,7 +578,6 @@ "array": { "elementType": { "string": { - "collation": "UTF8_BINARY" } }, "containsNull": true diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin index 20b6d81c3cee2..db81bd4e7eba4 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_typedLit.json b/sql/connect/common/src/test/resources/query-tests/queries/function_typedLit.json index 41ca771596ef3..eba5b99fe4958 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/function_typedLit.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_typedLit.json @@ -82,7 +82,6 @@ }, "dataType": { "string": { - "collation": "UTF8_BINARY" } } }, @@ -958,7 +957,6 @@ "name": "_2", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true @@ -1023,7 +1021,6 @@ "name": "_2", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true @@ -1106,7 +1103,6 @@ "map": { "keyType": { "string": { - "collation": "UTF8_BINARY" } }, "valueType": { @@ -1157,7 +1153,6 @@ "map": { "keyType": { "string": { - "collation": "UTF8_BINARY" } }, "valueType": { @@ -1202,7 +1197,6 @@ "name": "_1", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true @@ -1545,7 +1539,6 @@ "name": "_2", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true @@ -1555,7 +1548,6 @@ "array": { "elementType": { "string": { - "collation": "UTF8_BINARY" } }, "containsNull": true @@ -1629,7 +1621,6 @@ "name": "b", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true @@ -1707,7 +1698,6 @@ "map": { "keyType": { "string": { - "collation": "UTF8_BINARY" } }, "valueType": { @@ -1813,14 +1803,12 @@ "map": { "keyType": { "string": { - "collation": "UTF8_BINARY" } }, "valueType": { "array": { "elementType": { "string": { - "collation": "UTF8_BINARY" } }, "containsNull": true @@ -1896,7 +1884,6 @@ "map": { "keyType": { "string": { - "collation": "UTF8_BINARY" } }, "valueType": { @@ -1992,7 +1979,6 @@ "map": { "keyType": { "string": { - "collation": "UTF8_BINARY" } }, "valueType": { @@ -2010,7 +1996,6 @@ "name": "_1", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true @@ -2024,7 +2009,6 @@ }, "valueType": { "string": { - "collation": "UTF8_BINARY" } }, "valueContainsNull": true diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin index 5068b513a9272..4388912bdb1fd 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin differ diff --git a/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json b/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json index f29245374e6e2..d6f992d09a5c2 100644 --- a/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json +++ b/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json @@ -18,7 +18,6 @@ "name": "c1", "dataType": { "string": { - "collation": "UTF8_BINARY" } }, "nullable": true diff --git a/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin index 1ce2e676ce30a..0fce9d9ff8c7e 100644 Binary files a/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin and b/sql/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin differ