Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -592,13 +592,16 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression {
override def sql: String = (value, dataType) match {
case (_, NullType | _: ArrayType | _: MapType | _: StructType) if value == null => "NULL"
case _ if value == null => s"CAST(NULL AS ${dataType.sql})"
case (v: UTF8String, StringType) =>
// Escapes all backslashes and single quotes.
"'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + "'"
case (v: UTF8String, st: StringType) =>

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: there's no direct unit assertion for the central behavior change. An explicit-UTF8_BINARY literal now rendering `collate UTF8_BINARY` (and the default singleton staying clause-free) is covered only indirectly via the regenerated Connect goldens. A ~3-line LiteralExpressionSuite test pinning the StringType singleton -> 'x', StringType("UTF8_BINARY") (a non-singleton instance, i.e. not `eq` to the default) -> 'x' collate UTF8_BINARY, and StringType("UTF8_LCASE") unchanged would lock in the regression guard cheaply.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call — added a direct LiteralExpressionSuite test (SPARK-57777: render explicit collation in string literal SQL) pinning the three cases: default singleton -> 'x', explicit StringType("UTF8_BINARY") -> 'x' collate UTF8_BINARY, and UTF8_LCASE unchanged. Thanks for the review!

// Only render a `collate` clause for an explicit collation (including an explicit
// `UTF8_BINARY`). The default `StringType` (the case object) has no explicit collation, so
// it must render without a clause and stay distinguishable from an explicitly-collated
// string on re-parse (e.g. so that default-collation resolution does not treat an
// explicitly-collated literal as eligible for inheriting a default collation).
val collateClause =
if (DataTypeUtils.isDefaultStringCharOrVarcharType(st)) "" else s" collate ${st.collationName}"
// Escapes all backslashes and single quotes.
"'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") +
"'" + st.typeName.substring(6)
"'" + v.toString.replace("\\", "\\\\").replace("'", "\\'") + "'" + collateClause
case (v: Byte, ByteType) => s"${v}Y"
case (v: Short, ShortType) => s"${v}S"
case (v: Long, LongType) => s"${v}L"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -842,4 +842,17 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
assert(lit.dataType === GeometryType(0))
assert(lit.value.isInstanceOf[BinaryView])
}

test("SPARK-57777: render explicit collation in string literal SQL") {
  // Every case renders the same string value; only the literal's data type varies.
  val value = UTF8String.fromString("x")

  val expectedSqlByType = Seq(
    // The default `StringType` (case object) carries no explicit collation, so no `collate`
    // clause is expected.
    StringType -> "'x'",
    // A non-singleton `UTF8_BINARY` `StringType` is an explicit collation: the clause is
    // rendered so it stays distinguishable from the default on re-parse.
    StringType("UTF8_BINARY") -> "'x' collate UTF8_BINARY",
    // Any other explicit collation keeps rendering its clause as before.
    StringType("UTF8_LCASE") -> "'x' collate UTF8_LCASE")

  expectedSqlByType.foreach { case (stringType, expectedSql) =>
    assert(Literal(value, stringType).sql === expectedSql)
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,6 @@ class PlanGenerationTestSuite extends ConnectFunSuite with Logging {
/**
* Normalize proto messages for stable comparison:
* - Trim JVM origin fields (lines, stack traces, anonymous function names)
* - Populate default StringType collation when missing (UTF8_BINARY)
*/
private def normalizeProtoForComparison[T <: protobuf.Message](message: T): T = {
def trim(builder: proto.JvmOrigin.Builder): Unit = {
Expand All @@ -221,17 +220,6 @@ class PlanGenerationTestSuite extends ConnectFunSuite with Logging {
val builder = message.toBuilder

builder match {
// For comparison only, we add UTF8_BINARY when StringType collation is missing
// to ensure deterministic plan equality across environments.
case dt: proto.DataType.Builder if dt.getKindCase == proto.DataType.KindCase.STRING =>
val sb = dt.getStringBuilder
if (sb.getCollation.isEmpty) {
val defaultCollationName =
CollationFactory
.fetchCollation(CollationFactory.UTF8_BINARY_COLLATION_ID)
.collationName
sb.setCollation(defaultCollationName)
}
case exp: proto.Relation.Builder
if exp.hasCommon && exp.getCommon.hasOrigin && exp.getCommon.getOrigin.hasJvmOrigin =>
trim(exp.getCommonBuilder.getOriginBuilder.getJvmOriginBuilder)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, INTERVAL '0-0' YEAR TO MONTH AS INTERVAL '0-0' YEAR TO MONTH#0, 23:59:59.999999999 AS TIME '23:59:59.999999999'#0, 2 months 20 days 0.0001 seconds AS INTERVAL '2 months 20 days 0.0001 seconds'#0, [18545,1677155519808000,12345000,1677184560000000,19411,200000000,0,86399999999999,2 months 20 days 0.0001 seconds] AS NAMED_STRUCT('_1', DATE '2020-10-10', '_2', TIMESTAMP '2023-02-23 04:31:59.808', '_3', TIMESTAMP '1969-12-31 16:00:12.345', '_4', TIMESTAMP_NTZ '2023-02-23 20:36:00', '_5', DATE '2023-02-23', '_6', INTERVAL '0 00:03:20' DAY TO SECOND, '_7', INTERVAL '0-0' YEAR TO MONTH, '_8', TIME '23:59:59.999999999', '_9', INTERVAL '2 months 20 days 0.0001 seconds')#0, 1 AS 1#0, [1,2,3] AS ARRAY(1, 2, 3)#0, [null,null] AS ARRAY(CAST(NULL AS INT), CAST(NULL AS INT))#0, [null,null,[1,a],[2,null]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'), NAMED_STRUCT('_1', 2, '_2', CAST(NULL AS STRING COLLATE UTF8_BINARY)))#0, [null,null,[1,a]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'))#0, [1,2,3] AS ARRAY(1, 2, 3)#0, map(keys: [a,b], values: [1,2]) AS MAP('a', 1, 'b', 2)#0, map(keys: [a,b], values: [null,null]) 
AS MAP('a', CAST(NULL AS INT), 'b', CAST(NULL AS INT))#0, [a,2,1.0] AS NAMED_STRUCT('_1', 'a', '_2', 2, '_3', 1.0D)#0, null AS NULL#0, [1] AS ARRAY(1)#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, [[1,2,3],[4,5,6],[7,8,9]] AS ARRAY(ARRAY(1, 2, 3), ARRAY(4, 5, 6), ARRAY(7, 8, 9))#0, [[1,2,[3,4]],[5,6,[]]] AS ARRAY(NAMED_STRUCT('_1', 1, '_2', '2', '_3', ARRAY('3', '4')), NAMED_STRUCT('_1', 5, '_2', '6', '_3', ARRAY()))#0, [[1,2],[3,4],[5,6]] AS ARRAY(NAMED_STRUCT('a', 1, 'b', '2'), NAMED_STRUCT('a', 3, 'b', '4'), NAMED_STRUCT('a', 5, 'b', '6'))#0, [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4],keys: [a,b], values: [5,6]] AS ARRAY(MAP('a', 1, 'b', 2), MAP('a', 3, 'b', 4), MAP('a', 5, 'b', 6))#0, [keys: [a,b], values: [[1,2],[3,4]],keys: [a,b], values: [[5,6],[7,8]],keys: [a,b], values: [[],[]]] AS ARRAY(MAP('a', ARRAY('1', '2'), 'b', ARRAY('3', '4')), MAP('a', ARRAY('5', '6'), 'b', ARRAY('7', '8')), MAP('a', ARRAY(), 'b', ARRAY()))#0, map(keys: [1,2], values: [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4]]) AS MAP(1, MAP('a', 1, 'b', 2), 2, MAP('a', 3, 'b', 4))#0, [[1,2,3],keys: [a,b], values: [1,2],[a,keys: [1,2], values: [a,b]]] AS NAMED_STRUCT('_1', ARRAY(1, 2, 3), '_2', MAP('a', 1, 'b', 2), '_3', NAMED_STRUCT('_1', 'a', '_2', MAP(1, 'a', 2, 'b')))#0]
Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, INTERVAL '0-0' YEAR TO MONTH AS INTERVAL '0-0' YEAR TO MONTH#0, 23:59:59.999999999 AS TIME '23:59:59.999999999'#0, 2 months 20 days 0.0001 seconds AS INTERVAL '2 months 20 days 0.0001 seconds'#0, [18545,1677155519808000,12345000,1677184560000000,19411,200000000,0,86399999999999,2 months 20 days 0.0001 seconds] AS NAMED_STRUCT('_1', DATE '2020-10-10', '_2', TIMESTAMP '2023-02-23 04:31:59.808', '_3', TIMESTAMP '1969-12-31 16:00:12.345', '_4', TIMESTAMP_NTZ '2023-02-23 20:36:00', '_5', DATE '2023-02-23', '_6', INTERVAL '0 00:03:20' DAY TO SECOND, '_7', INTERVAL '0-0' YEAR TO MONTH, '_8', TIME '23:59:59.999999999', '_9', INTERVAL '2 months 20 days 0.0001 seconds')#0, 1 AS 1#0, [1,2,3] AS ARRAY(1, 2, 3)#0, [null,null] AS ARRAY(CAST(NULL AS INT), CAST(NULL AS INT))#0, [null,null,[1,a],[2,null]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'), NAMED_STRUCT('_1', 2, '_2', CAST(NULL AS STRING)))#0, [null,null,[1,a]] AS ARRAY(NULL, NULL, NAMED_STRUCT('_1', 1, '_2', 'a'))#0, [1,2,3] AS ARRAY(1, 2, 3)#0, map(keys: [a,b], values: [1,2]) AS MAP('a', 1, 'b', 2)#0, map(keys: [a,b], values: [null,null]) AS MAP('a', CAST(NULL 
AS INT), 'b', CAST(NULL AS INT))#0, [a,2,1.0] AS NAMED_STRUCT('_1', 'a', '_2', 2, '_3', 1.0D)#0, null AS NULL#0, [1] AS ARRAY(1)#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, map(keys: [1], values: [null]) AS MAP(1, CAST(NULL AS INT))#0, [[1,2,3],[4,5,6],[7,8,9]] AS ARRAY(ARRAY(1, 2, 3), ARRAY(4, 5, 6), ARRAY(7, 8, 9))#0, [[1,2,[3,4]],[5,6,[]]] AS ARRAY(NAMED_STRUCT('_1', 1, '_2', '2', '_3', ARRAY('3', '4')), NAMED_STRUCT('_1', 5, '_2', '6', '_3', ARRAY()))#0, [[1,2],[3,4],[5,6]] AS ARRAY(NAMED_STRUCT('a', 1, 'b', '2'), NAMED_STRUCT('a', 3, 'b', '4'), NAMED_STRUCT('a', 5, 'b', '6'))#0, [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4],keys: [a,b], values: [5,6]] AS ARRAY(MAP('a', 1, 'b', 2), MAP('a', 3, 'b', 4), MAP('a', 5, 'b', 6))#0, [keys: [a,b], values: [[1,2],[3,4]],keys: [a,b], values: [[5,6],[7,8]],keys: [a,b], values: [[],[]]] AS ARRAY(MAP('a', ARRAY('1', '2'), 'b', ARRAY('3', '4')), MAP('a', ARRAY('5', '6'), 'b', ARRAY('7', '8')), MAP('a', ARRAY(), 'b', ARRAY()))#0, map(keys: [1,2], values: [keys: [a,b], values: [1,2],keys: [a,b], values: [3,4]]) AS MAP(1, MAP('a', 1, 'b', 2), 2, MAP('a', 3, 'b', 4))#0, [[1,2,3],keys: [a,b], values: [1,2],[a,keys: [1,2], values: [a,b]]] AS NAMED_STRUCT('_1', ARRAY(1, 2, 3), '_2', MAP('a', 1, 'b', 2), '_3', NAMED_STRUCT('_1', 'a', '_2', MAP(1, 'a', 2, 'b')))#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0]
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"name": "c1",
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"nullable": true
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,6 @@
"array": {
"elementType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"containsNull": true
Expand Down Expand Up @@ -579,7 +578,6 @@
"array": {
"elementType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"containsNull": true
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@
},
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
}
},
Expand Down Expand Up @@ -958,7 +957,6 @@
"name": "_2",
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"nullable": true
Expand Down Expand Up @@ -1023,7 +1021,6 @@
"name": "_2",
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"nullable": true
Expand Down Expand Up @@ -1106,7 +1103,6 @@
"map": {
"keyType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"valueType": {
Expand Down Expand Up @@ -1157,7 +1153,6 @@
"map": {
"keyType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"valueType": {
Expand Down Expand Up @@ -1202,7 +1197,6 @@
"name": "_1",
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"nullable": true
Expand Down Expand Up @@ -1545,7 +1539,6 @@
"name": "_2",
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"nullable": true
Expand All @@ -1555,7 +1548,6 @@
"array": {
"elementType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"containsNull": true
Expand Down Expand Up @@ -1629,7 +1621,6 @@
"name": "b",
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"nullable": true
Expand Down Expand Up @@ -1707,7 +1698,6 @@
"map": {
"keyType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"valueType": {
Expand Down Expand Up @@ -1813,14 +1803,12 @@
"map": {
"keyType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"valueType": {
"array": {
"elementType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"containsNull": true
Expand Down Expand Up @@ -1896,7 +1884,6 @@
"map": {
"keyType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"valueType": {
Expand Down Expand Up @@ -1992,7 +1979,6 @@
"map": {
"keyType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"valueType": {
Expand All @@ -2010,7 +1996,6 @@
"name": "_1",
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"nullable": true
Expand All @@ -2024,7 +2009,6 @@
},
"valueType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"valueContainsNull": true
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"name": "c1",
"dataType": {
"string": {
"collation": "UTF8_BINARY"
}
},
"nullable": true
Expand Down
Binary file not shown.