apache
diff --git a/‎common/src/main/scala/org/apache/comet/CometConf.scala‎
Lines changed: 14 additions & 0 deletions b/‎common/src/main/scala/org/apache/comet/CometConf.scala‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎docs/source/user-guide/latest/tuning.md‎
Lines changed: 18 additions & 0 deletions b/‎docs/source/user-guide/latest/tuning.md‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎native/spark-expr/src/array_funcs/array_compact.rs‎
Lines changed: 7 additions & 1 deletion b/‎native/spark-expr/src/array_funcs/array_compact.rs‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala‎
Lines changed: 88 additions & 12 deletions b/‎spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala‎
Lines changed: 88 additions & 12 deletions
diff --git a/‎spark/src/main/scala/org/apache/comet/serde/arrays.scala‎
Lines changed: 1 addition & 0 deletions b/‎spark/src/main/scala/org/apache/comet/serde/arrays.scala‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala‎
Lines changed: 23 additions & 7 deletions b/‎spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala‎
Lines changed: 23 additions & 7 deletions
diff --git a/‎spark/src/test/resources/sql-tests/expressions/array/array_compact.sql‎
Lines changed: 31 additions & 11 deletions b/‎spark/src/test/resources/sql-tests/expressions/array/array_compact.sql‎
Lines changed: 31 additions & 11 deletions
@@ -427,6 +427,20 @@ object CometConf extends ShimCometConf {
         "The maximum number of columns to hash for round robin partitioning must be non-negative.")
       .createWithDefault(0)
 
+  /**
+   * Toggle for reverting a Comet columnar shuffle back to Spark's `ShuffleExchangeExec` when it
+   * sits between two non-Comet hash aggregate operators. Enabled by default.
+   */
+  val COMET_EXEC_SHUFFLE_REVERT_REDUNDANT_COLUMNAR_ENABLED: ConfigEntry[Boolean] =
+    conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.revertRedundantColumnar.enabled")
+      .category(CATEGORY_SHUFFLE)
+      .doc(
+        "When enabled, Comet reverts a `CometShuffleExchangeExec` with `CometColumnarShuffle` " +
+          "back to Spark's `ShuffleExchangeExec` when both its parent and child are non-Comet " +
+          "hash aggregate operators. This avoids a redundant " +
+          "row -> Arrow -> shuffle -> Arrow -> row conversion when no Comet operator on either " +
+          "side can consume columnar output. Disable to keep Comet columnar shuffle even in " +
+          "that case, which preserves Comet's off-heap shuffle memory accounting at the cost of " +
+          "the extra conversion.")
+      .booleanConf
+      .createWithDefault(true)
+
   val COMET_EXEC_SHUFFLE_COMPRESSION_CODEC: ConfigEntry[String] =
     conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.compression.codec")
       .category(CATEGORY_SHUFFLE)
 
@@ -154,6 +154,24 @@ partitioning keys. Columns that are not partitioning keys may contain complex ty
 Comet Columnar shuffle is JVM-based and supports `HashPartitioning`, `RoundRobinPartitioning`, `RangePartitioning`, and
 `SinglePartitioning`. This shuffle implementation supports complex data types as partitioning keys.
 
+#### Automatic Revert to Spark Shuffle
+
+When a Comet columnar shuffle ends up directly between two non-Comet hash aggregate operators (typically a
+partial/final `HashAggregateExec` or `ObjectHashAggregateExec` pair that Comet could not convert), Comet reverts it
+to Spark's built-in shuffle. Keeping a columnar shuffle between two row-based operators would add
+`row -> Arrow -> shuffle -> Arrow -> row` conversions with no Comet consumer on either side to benefit from
+columnar output.
+
+This shifts the affected shuffles from Comet's off-heap memory pool back to the JVM execution memory pool. Clusters
+tuned for a small JVM heap may see `ExternalSorter` spills on queries where this revert fires. Shuffle I/O may also
+grow marginally because Spark's row-based serializer generally compresses less well than Comet's Arrow IPC format.
+
+Each revert is logged at `INFO` level on the driver as `Reverting Comet columnar shuffle to Spark shuffle between
+<parent> and <child>`, which lets you correlate any unexpected behavior with this optimization.
+
+This optimization is enabled by default and can be disabled by setting
+`spark.comet.exec.shuffle.revertRedundantColumnar.enabled=false`, in which case Comet will keep the columnar shuffle
+even when both its parent and child are non-Comet hash aggregate operators.
+
 ### Shuffle Compression
 
 By default, Spark compresses shuffle files using LZ4 compression. Comet overrides this behavior with ZSTD compression.
 
@@ -132,6 +132,11 @@ fn compact_list<OffsetSize: OffsetSizeTrait>(
     );
     let mut valid = NullBufferBuilder::new(list_array.len());
 
+    // Use logical_nulls() instead of is_null() to correctly handle NullArray.
+    // NullArray::nulls() returns None (which makes is_null() return false),
+    // but logical_nulls() correctly reports all elements as null.
+    let value_nulls = values.logical_nulls();
+
     for (row_index, offset_window) in list_array.offsets().windows(2).enumerate() {
         if list_array.is_null(row_index) {
             offsets.push(offsets[row_index]);
@@ -144,7 +149,8 @@ fn compact_list<OffsetSize: OffsetSizeTrait>(
         let mut copied = 0usize;
 
         for i in start..end {
-            if !values.is_null(i) {
+            let is_null = value_nulls.as_ref().map(|n| n.is_null(i)).unwrap_or(false);
+            if !is_null {
                 mutable.extend(0, i, i + 1);
                 copied += 1;
             }
 
@@ -89,6 +89,13 @@ object CometExecRule {
 
   val allExecs: Map[Class[_ <: SparkPlan], CometOperatorSerde[_]] = nativeExecs ++ sinks
 
+  /**
+   * Tag set on a `ShuffleExchangeExec` that should be left as a plain Spark shuffle rather than
+   * wrapped in `CometShuffleExchangeExec`. See `revertRedundantColumnarShuffle`, which sets it.
+   */
+  val SKIP_COMET_SHUFFLE_TAG: org.apache.spark.sql.catalyst.trees.TreeNodeTag[Unit] =
+    org.apache.spark.sql.catalyst.trees.TreeNodeTag[Unit]("comet.skipCometShuffle")
+
 }
 
 /**
@@ -100,19 +107,78 @@ case class CometExecRule(session: SparkSession)
 
   private lazy val showTransformations = CometConf.COMET_EXPLAIN_TRANSFORMATIONS.get()
 
+  /**
+   * Undoes a redundant Comet columnar shuffle that sits between two JVM hash aggregates.
+   *
+   * A `CometShuffleExchangeExec` using `CometColumnarShuffle` is replaced by its original Spark
+   * `ShuffleExchangeExec` when both its parent and its child are `HashAggregateExec` or
+   * `ObjectHashAggregateExec` nodes. This is the partial-final-aggregate pattern where Comet
+   * converted neither side, so the columnar shuffle only adds a
+   * row->arrow->shuffle->arrow->row round trip with no Comet consumer next to it. See
+   * https://github.com/apache/datafusion-comet/issues/4004.
+   *
+   * The match is deliberately narrow. Because the revert runs after the main transform pass, it
+   * only fires for aggregates that pass already left on the JVM, and so it never creates the
+   * dangerous mixed mode where a Comet partial aggregate feeds a JVM final one (see issue #1389).
+   *
+   * Ordering requirement: this must run as part of `preColumnarTransitions`. Once Spark has
+   * inserted `ColumnarToRowExec` between the aggregate and the columnar shuffle, the two are no
+   * longer adjacent, the pattern stops matching, and the extra conversion cannot be removed.
+   *
+   * Each reverted shuffle is marked with `SKIP_COMET_SHUFFLE_TAG` so that both the AQE
+   * `QueryStagePrepRule` pass and the `ColumnarRule` `preColumnarTransitions` pass leave it alone
+   * on re-entry. AQE re-runs the rule on each stage in isolation, where the outer aggregate is
+   * no longer visible and the shuffle would otherwise be wrapped as a Comet columnar shuffle
+   * again.
+   *
+   * @param plan
+   *   physical plan produced by the main transform pass
+   * @return
+   *   the plan with every matching columnar shuffle reverted and tagged
+   */
+  private def revertRedundantColumnarShuffle(plan: SparkPlan): SparkPlan = {
+    // Row-based Spark hash aggregates, i.e. aggregates that were not converted to Comet.
+    def isRowAggregate(node: SparkPlan): Boolean = node match {
+      case _: HashAggregateExec | _: ObjectHashAggregateExec => true
+      case _ => false
+    }
+
+    // Yields the shuffle when `node` is a Comet columnar shuffle fed by a row-based aggregate.
+    def redundantShuffle(node: SparkPlan): Option[CometShuffleExchangeExec] = node match {
+      case shuffle: CometShuffleExchangeExec
+          if shuffle.shuffleType == CometColumnarShuffle && isRowAggregate(shuffle.child) =>
+        Some(shuffle)
+      case _ => None
+    }
+
+    // Rebuilds the original Spark shuffle over the same child, tags it, and logs the revert.
+    def toSparkShuffle(parent: SparkPlan, shuffle: CometShuffleExchangeExec): SparkPlan = {
+      val sparkShuffle = shuffle.originalPlan
+        .withNewChildren(Seq(shuffle.child))
+        .asInstanceOf[ShuffleExchangeExec]
+      sparkShuffle.setTagValue(CometExecRule.SKIP_COMET_SHUFFLE_TAG, ())
+      logInfo(
+        "Reverting Comet columnar shuffle to Spark shuffle between " +
+          s"${parent.getClass.getSimpleName} and ${shuffle.child.getClass.getSimpleName} " +
+          "(no Comet operator on either side to consume columnar output)")
+      sparkShuffle
+    }
+
+    plan.transform {
+      case parent
+          if isRowAggregate(parent) &&
+            parent.children.exists(child => redundantShuffle(child).isDefined) =>
+        val rewritten = parent.children.map { child =>
+          redundantShuffle(child).fold(child)(toSparkShuffle(parent, _))
+        }
+        parent.withNewChildren(rewritten)
+    }
+  }
+
+  /** True when `s` carries `SKIP_COMET_SHUFFLE_TAG`, i.e. it must stay a plain Spark shuffle. */
+  private def shouldSkipCometShuffle(s: ShuffleExchangeExec): Boolean =
+    s.getTagValue(CometExecRule.SKIP_COMET_SHUFFLE_TAG).nonEmpty
+
   private def applyCometShuffle(plan: SparkPlan): SparkPlan = {
-    plan.transformUp { case s: ShuffleExchangeExec =>
-      CometShuffleExchangeExec.shuffleSupported(s) match {
-        case Some(CometNativeShuffle) =>
-          // Switch to use Decimal128 regardless of precision, since Arrow native execution
-          // doesn't support Decimal32 and Decimal64 yet.
-          conf.setConfString(CometConf.COMET_USE_DECIMAL_128.key, "true")
-          CometShuffleExchangeExec(s, shuffleType = CometNativeShuffle)
-        case Some(CometColumnarShuffle) =>
-          CometShuffleExchangeExec(s, shuffleType = CometColumnarShuffle)
-        case None =>
-          s
-      }
+    plan.transformUp {
+      case s: ShuffleExchangeExec if shouldSkipCometShuffle(s) =>
+        s
+      case s: ShuffleExchangeExec =>
+        CometShuffleExchangeExec.shuffleSupported(s) match {
+          case Some(CometNativeShuffle) =>
+            // Switch to use Decimal128 regardless of precision, since Arrow native execution
+            // doesn't support Decimal32 and Decimal64 yet.
+            conf.setConfString(CometConf.COMET_USE_DECIMAL_128.key, "true")
+            CometShuffleExchangeExec(s, shuffleType = CometNativeShuffle)
+          case Some(CometColumnarShuffle) =>
+            CometShuffleExchangeExec(s, shuffleType = CometColumnarShuffle)
+          case None =>
+            s
+        }
     }
   }
 
@@ -261,6 +327,9 @@ case class CometExecRule(session: SparkSession)
       case s @ ShuffleQueryStageExec(_, ReusedExchangeExec(_, _: CometShuffleExchangeExec), _) =>
         convertToComet(s, CometExchangeSink).getOrElse(s)
 
+      case s: ShuffleExchangeExec if shouldSkipCometShuffle(s) =>
+        s
+
       case s: ShuffleExchangeExec =>
         convertToComet(s, CometShuffleExchangeExec).getOrElse(s)
 
@@ -464,6 +533,13 @@ case class CometExecRule(session: SparkSession)
         case CometScanWrapper(_, s) => s
       }
 
+      // Revert CometColumnarShuffle to Spark's ShuffleExchangeExec when both its parent and child
+      // are non-Comet HashAggregate/ObjectHashAggregate operators that remained JVM after the main
+      // transform pass. See https://github.com/apache/datafusion-comet/issues/4004.
+      if (CometConf.COMET_EXEC_SHUFFLE_REVERT_REDUNDANT_COLUMNAR_ENABLED.get()) {
+        newPlan = revertRedundantColumnarShuffle(newPlan)
+      }
+
       // Set up logical links
       newPlan = newPlan.transform {
         case op: CometExec =>
 
@@ -295,6 +295,7 @@ object CometArrayRepeat extends CometExpressionSerde[ArrayRepeat] {
 }
 
 object CometArrayCompact extends CometExpressionSerde[Expression] {
+
   override def convert(
       expr: Expression,
       inputs: Seq[Attribute],
 
@@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.json.StructsToJsonEvaluator
 import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.types.StringTypeWithCollation
-import org.apache.spark.sql.types.{BinaryType, BooleanType, DataTypes, MapType, StringType}
+import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes, MapType, StringType}
 
 import org.apache.comet.CometSparkSessionExtensions.withInfo
 import org.apache.comet.expressions.{CometCast, CometEvalMode}
@@ -56,6 +56,28 @@ trait CometExprShim extends CommonStringExprs {
       inputs: Seq[Attribute],
       binding: Boolean): Option[Expr] = {
     expr match {
+      case knc: KnownNotContainsNull =>
+        // On Spark 4.0, array_compact rewrites to KnownNotContainsNull(ArrayFilter(IsNotNull)).
+        // Strip the wrapper and serialize the inner ArrayFilter as spark_array_compact.
+        knc.child match {
+          case filter: ArrayFilter =>
+            filter.function.children.headOption match {
+              case Some(_: IsNotNull) =>
+                val arrayChild = filter.left
+                val elementType = arrayChild.dataType.asInstanceOf[ArrayType].elementType
+                val arrayExprProto = exprToProtoInternal(arrayChild, inputs, binding)
+                val returnType = ArrayType(elementType)
+                val scalarExpr = scalarFunctionExprToProtoWithReturnType(
+                  "spark_array_compact",
+                  returnType,
+                  false,
+                  arrayExprProto)
+                optExprWithInfo(scalarExpr, knc, arrayChild)
+              case _ => exprToProtoInternal(knc.child, inputs, binding)
+            }
+          case _ => exprToProtoInternal(knc.child, inputs, binding)
+        }
+
       case s: StaticInvoke
           if s.staticObject == classOf[StringDecode] &&
             s.dataType.isInstanceOf[StringType] &&
@@ -109,12 +131,6 @@ trait CometExprShim extends CommonStringExprs {
         val optExpr = scalarFunctionExprToProto("width_bucket", childExprs: _*)
         optExprWithInfo(optExpr, wb, wb.children: _*)
 
-      // KnownNotContainsNull is a TaggingExpression added in Spark 4.0 that only
-      // changes schema metadata (containsNull = false). It has no runtime effect,
-      // so we pass through to the child expression.
-      case k: KnownNotContainsNull =>
-        exprToProtoInternal(k.child, inputs, binding)
-
       // In Spark 4.0, StructsToJson is a RuntimeReplaceable whose replacement is
       // Invoke(Literal(StructsToJsonEvaluator), "evaluate", ...). Reconstruct the
       // original StructsToJson and recurse so support-level checks apply.
 
@@ -17,28 +17,48 @@
 
 
 statement
-CREATE TABLE test_array_compact(arr array<int>) USING parquet
+CREATE TABLE test_array_compact(
+  ints array<int>,
+  strs array<string>,
+  dbls array<double>,
+  nested array<array<int>>
+) USING parquet
 
 statement
-INSERT INTO test_array_compact VALUES (array(1, NULL, 2, NULL, 3)), (array()), (NULL), (array(NULL, NULL)), (array(1, 2, 3))
+INSERT INTO test_array_compact VALUES
+  (array(1, NULL, 2, NULL, 3), array('a', NULL, 'b', NULL, 'c'), array(1.0, NULL, 2.0), array(array(1, NULL, 3), NULL, array(4, NULL, 6))),
+  (array(), array(), array(), array()),
+  (NULL, NULL, NULL, NULL),
+  (array(NULL, NULL), array(NULL, NULL), array(NULL, NULL), array(NULL, NULL)),
+  (array(1, 2, 3), array('x', 'y', 'z'), array(1.5, 2.5), array(array(1, 2), array(3, 4)))
 
--- column argument
+-- integer column
 query
-SELECT array_compact(arr) FROM test_array_compact
+SELECT array_compact(ints) FROM test_array_compact
+
+-- string column
+query
+SELECT array_compact(strs) FROM test_array_compact
+
+-- double column
+query
+SELECT array_compact(dbls) FROM test_array_compact
+
+-- nested array column: outer nulls removed, inner nulls preserved
+query
+SELECT array_compact(nested) FROM test_array_compact
 
 -- literal arguments
 query
 SELECT array_compact(array(1, NULL, 2, NULL, 3))
 
--- string element type
-statement
-CREATE TABLE test_array_compact_str(arr array<string>) USING parquet
-
-statement
-INSERT INTO test_array_compact_str VALUES (array('a', NULL, 'b', NULL, 'c')), (array()), (NULL), (array(NULL, NULL)), (array('', NULL, '', NULL))
+-- literal string array
+query
+SELECT array_compact(array('a', NULL, 'b'))
 
+-- all-null literal array
 query
-SELECT array_compact(arr) FROM test_array_compact_str
+SELECT array_compact(array(NULL, NULL, NULL))
 
 -- double element type
 query
Original file line number	Diff line number	Diff line change
`@@ -295,6 +295,7 @@ object CometArrayRepeat extends CometExpressionSerde[ArrayRepeat] {`
`295`	`295`	`}`
`296`	`296`
`297`	`297`	`object CometArrayCompact extends CometExpressionSerde[Expression] {`
	`298`	`+`
`298`	`299`	`override def convert(`
`299`	`300`	`expr: Expression,`
`300`	`301`	`inputs: Seq[Attribute],`