Add NullVector/NullArray support for native columnar-to-row conversion

andygrove · claude · andygrove · commit d46b56aaff79 · 2026-01-22T08:39:52.000-07:00
- Add NullVector to getFieldVector in Utils.scala to allow export
- Add DataType::Null handling in columnar_to_row.rs for native C2R
- Update withInfo test for new native C2R plan structure

This fixes the round test failure when scale is null, which produces
a NullArray that needs to be handled by the native C2R path.

Co-Authored-By: Claude Opus 4.5 &lt;noreply@anthropic.com&gt;
diff --git a/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala b/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
@@ -26,7 +26,7 @@ import java.nio.channels.Channels
 import scala.jdk.CollectionConverters._
 
 import org.apache.arrow.c.CDataDictionaryProvider
-import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, DecimalVector, FieldVector, FixedSizeBinaryVector, Float4Vector, Float8Vector, IntVector, SmallIntVector, TimeStampMicroTZVector, TimeStampMicroVector, TinyIntVector, ValueVector, VarBinaryVector, VarCharVector, VectorSchemaRoot}
+import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, DecimalVector, FieldVector, FixedSizeBinaryVector, Float4Vector, Float8Vector, IntVector, NullVector, SmallIntVector, TimeStampMicroTZVector, TimeStampMicroVector, TinyIntVector, ValueVector, VarBinaryVector, VarCharVector, VectorSchemaRoot}
 import org.apache.arrow.vector.complex.{ListVector, MapVector, StructVector}
 import org.apache.arrow.vector.dictionary.DictionaryProvider
 import org.apache.arrow.vector.ipc.ArrowStreamWriter
@@ -282,7 +282,7 @@ object Utils extends CometTypeShim {
           _: BigIntVector | _: Float4Vector | _: Float8Vector | _: VarCharVector |
           _: DecimalVector | _: DateDayVector | _: TimeStampMicroTZVector | _: VarBinaryVector |
           _: FixedSizeBinaryVector | _: TimeStampMicroVector | _: StructVector | _: ListVector |
-          _: MapVector) =>
+          _: MapVector | _: NullVector) =>
         v.asInstanceOf[FieldVector]
       case _ =>
         throw new SparkException(s"Unsupported Arrow Vector for $reason: ${valueVector.getClass}")
diff --git a/native/core/src/execution/columnar_to_row.rs b/native/core/src/execution/columnar_to_row.rs
@@ -51,6 +51,7 @@ const MAX_LONG_DIGITS: u8 = 18;
 /// This enum holds references to concrete array types, allowing direct access
 /// without repeated downcast_ref calls.
 enum TypedArray<'a> {
+    Null(&'a NullArray),
     Boolean(&'a BooleanArray),
     Int8(&'a Int8Array),
     Int16(&'a Int16Array),
@@ -81,6 +82,11 @@ impl<'a> TypedArray<'a> {
     fn from_array(array: &'a ArrayRef, _schema_type: &DataType) -> CometResult<Self> {
         let actual_type = array.data_type();
         match actual_type {
+            DataType::Null => Ok(TypedArray::Null(
+                array.as_any().downcast_ref::<NullArray>().ok_or_else(|| {
+                    CometError::Internal("Failed to downcast to NullArray".to_string())
+                })?,
+            )),
             DataType::Boolean => Ok(TypedArray::Boolean(
                 array
                     .as_any()
@@ -235,6 +241,7 @@ impl<'a> TypedArray<'a> {
     #[inline]
     fn is_null(&self, row_idx: usize) -> bool {
         match self {
+            TypedArray::Null(_) => true, // Null type is always null
             TypedArray::Boolean(arr) => arr.is_null(row_idx),
             TypedArray::Int8(arr) => arr.is_null(row_idx),
             TypedArray::Int16(arr) => arr.is_null(row_idx),
@@ -292,7 +299,8 @@ impl<'a> TypedArray<'a> {
     #[inline]
     fn is_variable_length(&self) -> bool {
         match self {
-            TypedArray::Boolean(_)
+            TypedArray::Null(_)
+            | TypedArray::Boolean(_)
             | TypedArray::Int8(_)
             | TypedArray::Int16(_)
             | TypedArray::Int32(_)
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
@@ -30,8 +30,8 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.{CometTestBase, DataFrame, Row}
 import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, FromUnixTime, Literal, TruncDate, TruncTimestamp}
 import org.apache.spark.sql.catalyst.optimizer.SimplifyExtractValueOps
-import org.apache.spark.sql.comet.{CometColumnarToRowExec, CometProjectExec}
-import org.apache.spark.sql.execution.{InputAdapter, ProjectExec, SparkPlan, WholeStageCodegenExec}
+import org.apache.spark.sql.comet.{CometNativeColumnarToRowExec, CometProjectExec}
+import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -958,11 +958,7 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
       val query = sql(s"select cast(id as string) from $table")
       val (_, cometPlan) = checkSparkAnswerAndOperator(query)
       val project = cometPlan
-        .asInstanceOf[WholeStageCodegenExec]
-        .child
-        .asInstanceOf[CometColumnarToRowExec]
-        .child
-        .asInstanceOf[InputAdapter]
+        .asInstanceOf[CometNativeColumnarToRowExec]
         .child
         .asInstanceOf[CometProjectExec]
       val id = project.expressions.head