Skip to content

Commit 7d679a4

Browse files
authored
fix: decline CreateArray with struct-nullability-divergent children (#4533)
1 parent 6ea6b1d commit 7d679a4

2 files changed

Lines changed: 81 additions & 0 deletions

File tree

spark/src/main/scala/org/apache/comet/serde/arrays.scala

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,28 @@ object CometCreateArray extends CometExpressionSerde[CreateArray] {
509509
return exprToProtoInternal(emptyArrayLiteral, inputs, binding)
510510
}
511511

512+
// DataFusion's `make_array` asserts strict element-type equality in
513+
// `MutableArrayData::with_capacities` and panics on a mismatch. Spark's CreateArray is more
514+
// permissive: its type coercion compares element types with `sameType`, which ignores
515+
// nullability, so children that share a surface type but differ only in nested field
516+
// nullability get no unifying cast. DataFusion tolerates container nullability differences
517+
// (an `ArrayType.containsNull` / `MapType.valueContainsNull` mismatch is coerced), but NOT a
518+
// struct field's nullability -- `array(struct(a not null), struct(a nullable))` panics inside
519+
// `make_array_inner`. Decline only those cases (i.e. children that still differ after
520+
// normalizing container nullability) so Spark's evaluator handles them.
521+
//
522+
// TODO: remove this decline once apache/datafusion#22366 lands; the upstream fix widens the
523+
// element type via nullability-OR-merge and casts each child before MutableArrayData.
524+
val normalizedTypes = children.map(c => normalizeContainerNullability(c.dataType))
525+
if (normalizedTypes.distinct.size > 1) {
526+
withFallbackReason(
527+
expr,
528+
"CreateArray children have mismatched data types: " +
529+
children.map(_.dataType).distinct.mkString(", "),
530+
children: _*)
531+
return None
532+
}
533+
512534
val childExprs = children.map(exprToProtoInternal(_, inputs, binding))
513535

514536
if (childExprs.forall(_.isDefined)) {
@@ -518,6 +540,26 @@ object CometCreateArray extends CometExpressionSerde[CreateArray] {
518540
None
519541
}
520542
}
543+
544+
/**
545+
* Rewrites a type so that container nullability (`ArrayType.containsNull`,
546+
* `MapType.valueContainsNull`) is forced to `true` everywhere, while struct field nullability
547+
* is left intact. Two CreateArray children whose types differ ONLY in container nullability are
548+
* tolerated by DataFusion's `make_array` (coerced), so they normalize equal here; a difference
549+
* in a struct field's nullability survives normalization and triggers the decline above.
550+
*/
551+
private def normalizeContainerNullability(dt: DataType): DataType = dt match {
552+
case ArrayType(elementType, _) =>
553+
ArrayType(normalizeContainerNullability(elementType), containsNull = true)
554+
case MapType(keyType, valueType, _) =>
555+
MapType(
556+
normalizeContainerNullability(keyType),
557+
normalizeContainerNullability(valueType),
558+
valueContainsNull = true)
559+
case StructType(fields) =>
560+
StructType(fields.map(f => f.copy(dataType = normalizeContainerNullability(f.dataType))))
561+
case other => other
562+
}
521563
}
522564

523565
object CometGetArrayItem extends CometExpressionSerde[GetArrayItem] {

spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,6 +1107,45 @@ class CometArrayExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelp
11071107
}
11081108
}
11091109

1110+
test("array of structs with nullability-divergent children") {
1111+
// Spark's type coercion compares element types with `sameType`, which ignores nullability,
1112+
// so two struct children that differ ONLY in a nested field's nullability get no unifying
1113+
// cast -- CreateArray keeps children of different StructTypes. DataFusion's make_array asserts
1114+
// strict element-type equality (down to nested nullability) and panics on the mismatch. Comet
1115+
// must decline this CreateArray so Spark's evaluator handles it.
1116+
withParquetTable((0 until 5).map(i => (i, i.toLong)), "tbl") {
1117+
val df = spark
1118+
.table("tbl")
1119+
.select(
1120+
array(
1121+
// ct is NOT NULL (literal)
1122+
struct(col("_1").as("id"), lit("a").as("ct")),
1123+
// ct is NULLABLE (when without otherwise) -- same type, different nullability
1124+
struct(col("_1").as("id"), when(col("_1") === 0, lit("b")).as("ct"))).as("arr"))
1125+
checkSparkAnswerAndFallbackReason(df, "CreateArray children have mismatched data types")
1126+
}
1127+
}
1128+
1129+
test("array of maps with nullability-divergent struct values") {
1130+
// Same nested-nullability divergence as the struct case, but wrapped in a MapType value so we
1131+
// exercise normalizeContainerNullability's MapType branch: the two map children share a surface
1132+
// type and differ only in a nested struct field's nullability, so they survive container
1133+
// (`MapType.valueContainsNull`) normalization as distinct types and CreateArray must still
1134+
// decline -- DataFusion's make_array would otherwise panic on the struct-field mismatch.
1135+
withParquetTable((0 until 5).map(i => (i, i.toLong)), "tbl") {
1136+
val df = spark
1137+
.table("tbl")
1138+
.select(
1139+
array(
1140+
// map value struct has ct NOT NULL (literal)
1141+
map(lit("k"), struct(col("_1").as("id"), lit("a").as("ct"))),
1142+
// map value struct has ct NULLABLE -- same type, different nested nullability
1143+
map(lit("k"), struct(col("_1").as("id"), when(col("_1") === 0, lit("b")).as("ct"))))
1144+
.as("arr"))
1145+
checkSparkAnswerAndFallbackReason(df, "CreateArray children have mismatched data types")
1146+
}
1147+
}
1148+
11101149
// https://issues.apache.org/jira/browse/SPARK-55747
11111150
test("(ansi) GetArrayItem on null array from split()") {
11121151
withSQLConf(

0 commit comments

Comments
 (0)