add NullType to shuffles

mbutrovich · mbutrovich · commit 174c939540a0 · 2026-05-22T08:02:09.000-04:00
diff --git a/native/shuffle/src/spark_unsafe/row.rs b/native/shuffle/src/spark_unsafe/row.rs
@@ -28,8 +28,8 @@ use arrow::array::{
     builder::{
         ArrayBuilder, BinaryBuilder, BinaryDictionaryBuilder, BooleanBuilder, Date32Builder,
         Decimal128Builder, Float32Builder, Float64Builder, Int16Builder, Int32Builder,
-        Int64Builder, Int8Builder, ListBuilder, MapBuilder, StringBuilder, StringDictionaryBuilder,
-        StructBuilder, TimestampMicrosecondBuilder,
+        Int64Builder, Int8Builder, ListBuilder, MapBuilder, NullBuilder, StringBuilder,
+        StringDictionaryBuilder, StructBuilder, TimestampMicrosecondBuilder,
     },
     types::Int32Type,
     Array, ArrayRef, RecordBatch, RecordBatchOptions,
@@ -267,6 +267,10 @@ pub(super) fn append_field(
             append_field_to_builder!(Date32Builder, |builder: &mut Date32Builder| builder
                 .append_value(row.get_date(idx)));
         }
+        DataType::Null => {
+            let field_builder = get_field_builder!(struct_builder, NullBuilder, idx);
+            field_builder.append_null();
+        }
         DataType::Timestamp(TimeUnit::Microsecond, _) => {
             append_field_to_builder!(
                 TimestampMicrosecondBuilder,
@@ -1148,6 +1152,12 @@ fn append_columns(
                     .append_value(row.get_date(idx))
             );
         }
+        DataType::Null => {
+            let null_builder = downcast_builder_ref!(NullBuilder, builder);
+            for _ in row_start..row_end {
+                null_builder.append_null();
+            }
+        }
         DataType::Timestamp(TimeUnit::Microsecond, _) => {
             append_column_to_builder!(
                 TimestampMicrosecondBuilder,
@@ -1252,6 +1262,7 @@ fn make_builders(
             }
         }
         DataType::Date32 => Box::new(Date32Builder::with_capacity(row_num)),
+        DataType::Null => Box::new(NullBuilder::new()),
         DataType::Timestamp(TimeUnit::Microsecond, _) => {
             Box::new(TimestampMicrosecondBuilder::with_capacity(row_num).with_data_type(dt.clone()))
         }
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
@@ -40,7 +40,7 @@ import org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec
 import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleExchangeExec, ShuffleExchangeLike, ShuffleOrigin}
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, ByteType, DataType, DateType, DecimalType, DoubleType, FloatType, IntegerType, LongType, MapType, ShortType, StringType, StructType, TimestampNTZType, TimestampType}
+import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, ByteType, DataType, DateType, DecimalType, DoubleType, FloatType, IntegerType, LongType, MapType, NullType, ShortType, StringType, StructType, TimestampNTZType, TimestampType}
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.MutablePair
 import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, RecordComparator}
@@ -364,7 +364,7 @@ object CometShuffleExchangeExec
     def supportedSerializableDataType(dt: DataType): Boolean = dt match {
       case _: BooleanType | _: ByteType | _: ShortType | _: IntegerType | _: LongType |
           _: FloatType | _: DoubleType | _: StringType | _: BinaryType | _: TimestampType |
-          _: TimestampNTZType | _: DecimalType | _: DateType =>
+          _: TimestampNTZType | _: DecimalType | _: DateType | _: NullType =>
         true
       case StructType(fields) =>
         fields.nonEmpty && fields.forall(f => supportedSerializableDataType(f.dataType))
@@ -487,7 +487,7 @@ object CometShuffleExchangeExec
     def supportedSerializableDataType(dt: DataType): Boolean = dt match {
       case _: BooleanType | _: ByteType | _: ShortType | _: IntegerType | _: LongType |
           _: FloatType | _: DoubleType | _: StringType | _: BinaryType | _: TimestampType |
-          _: TimestampNTZType | _: DecimalType | _: DateType =>
+          _: TimestampNTZType | _: DecimalType | _: DateType | _: NullType =>
         true
       case StructType(fields) =>
         fields.nonEmpty && fields.forall(f => supportedSerializableDataType(f.dataType)) &&
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometColumnarShuffleSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometColumnarShuffleSuite.scala
@@ -22,14 +22,13 @@ package org.apache.comet.exec
 import java.nio.file.{Files, Paths}
 
 import scala.reflect.runtime.universe._
-import scala.util.Random
 
 import org.scalactic.source.Position
 import org.scalatest.Tag
 
 import org.apache.hadoop.fs.Path
 import org.apache.spark.{Partitioner, SparkConf}
-import org.apache.spark.sql.{CometTestBase, DataFrame, RandomDataGenerator, Row}
+import org.apache.spark.sql.{CometTestBase, DataFrame, Row}
 import org.apache.spark.sql.comet.execution.shuffle.{CometShuffleDependency, CometShuffleExchangeExec, CometShuffleManager}
 import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, AQEShuffleReadExec, ShuffleQueryStageExec}
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
@@ -94,22 +93,10 @@ abstract class CometColumnarShuffleSuite extends CometTestBase with AdaptiveSpar
         """.stripMargin))
   }
 
-  test("Fallback to Spark for unsupported input besides ordering") {
-    val dataGenerator = RandomDataGenerator
-      .forType(
-        dataType = NullType,
-        nullable = true,
-        new Random(System.nanoTime()),
-        validJulianDatetime = false)
-      .get
-
-    val schema = new StructType()
-      .add("index", IntegerType, nullable = false)
-      .add("col", NullType, nullable = true)
-    val rdd =
-      spark.sparkContext.parallelize((1 to 20).map(i => Row(i, dataGenerator())))
-    val df = spark.createDataFrame(rdd, schema).orderBy("index").coalesce(1)
-    checkSparkAnswer(df)
+  test("columnar shuffle with NullType passthrough column") {
+    val df = sql("SELECT x, y FROM VALUES ('a', null), ('b', null), ('c', null) AS t(x, y)")
+    val shuffled = df.repartition(2, $"x")
+    checkShuffleAnswer(shuffled, 1)
   }
 
   test("columnar shuffle on nested struct including nulls") {
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometNativeShuffleSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometNativeShuffleSuite.scala
@@ -218,6 +218,12 @@ class CometNativeShuffleSuite extends CometTestBase with AdaptiveSparkPlanHelper
     }
   }
 
+  test("native shuffle with NullType passthrough column") {
+    val df = spark.sql("SELECT x, y FROM VALUES ('a', null), ('b', null), ('c', null) AS t(x, y)")
+    val shuffled = df.repartition(2, $"x")
+    checkShuffleAnswer(shuffled, 1)
+  }
+
   test("fix: Comet native shuffle with binary data") {
     withParquetTable((0 until 5).map(i => (i, (i + 1).toLong)), "tbl") {
       val df = sql("SELECT cast(cast(_1 as STRING) as BINARY) as binary, _2 FROM tbl")

Original file line number	Diff line number	Diff line change
`@@ -218,6 +218,12 @@ class CometNativeShuffleSuite extends CometTestBase with AdaptiveSparkPlanHelper`
`218`	`218`	`}`
`219`	`219`	`}`
`220`	`220`
	`221`	`+ test("native shuffle with NullType passthrough column") {`
	`222`	`+ val df = spark.sql("SELECT x, y FROM VALUES ('a', null), ('b', null), ('c', null) AS t(x, y)")`
	`223`	`+ val shuffled = df.repartition(2, $"x")`
	`224`	`+ checkShuffleAnswer(shuffled, 1)`
	`225`	`+ }`
	`226`	`+`
`221`	`227`	`test("fix: Comet native shuffle with binary data") {`
`222`	`228`	`withParquetTable((0 until 5).map(i => (i, (i + 1).toLong)), "tbl") {`
`223`	`229`	`val df = sql("SELECT cast(cast(_1 as STRING) as BINARY) as binary, _2 FROM tbl")`