apache
diff --git a/‎common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala‎
Lines changed: 108 additions & 3 deletions b/‎common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala‎
Lines changed: 108 additions & 3 deletions
@@ -26,13 +26,15 @@ import java.nio.channels.Channels
 import scala.jdk.CollectionConverters._
 
 import org.apache.arrow.c.CDataDictionaryProvider
-import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, DecimalVector, FieldVector, FixedSizeBinaryVector, Float4Vector, Float8Vector, IntVector, NullVector, SmallIntVector, TimeStampMicroTZVector, TimeStampMicroVector, TinyIntVector, ValueVector, VarBinaryVector, VarCharVector, VectorSchemaRoot}
+import org.apache.arrow.vector._
 import org.apache.arrow.vector.complex.{ListVector, MapVector, StructVector}
 import org.apache.arrow.vector.dictionary.DictionaryProvider
-import org.apache.arrow.vector.ipc.ArrowStreamWriter
+import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter}
 import org.apache.arrow.vector.types._
 import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}
+import org.apache.arrow.vector.util.VectorSchemaRootAppender
 import org.apache.spark.{SparkEnv, SparkException}
+import org.apache.spark.internal.Logging
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.sql.comet.execution.arrow.ArrowReaderIterator
 import org.apache.spark.sql.types._
@@ -43,7 +45,7 @@ import org.apache.comet.Constants.COMET_CONF_DIR_ENV
 import org.apache.comet.shims.CometTypeShim
 import org.apache.comet.vector.CometVector
 
-object Utils extends CometTypeShim {
+object Utils extends CometTypeShim with Logging {
   def getConfPath(confFileName: String): String = {
     sys.env
       .get(COMET_CONF_DIR_ENV)
@@ -232,6 +234,7 @@ object Utils extends CometTypeShim {
 
   /**
    * Decodes the byte arrays back to ColumnarBatchs and put them into buffer.
+   *
    * @param bytes
    *   the serialized batches
    * @param source
@@ -252,6 +255,108 @@ object Utils extends CometTypeShim {
     new ArrowReaderIterator(Channels.newChannel(ins), source)
   }
 
+  /**
+   * Coalesces many small Arrow IPC batches into a single batch for broadcasting.
+   *
+   * Why this is necessary: The broadcast exchange collects shuffle output by calling
+   * getByteArrayRdd, which serializes each ColumnarBatch independently into its own
+   * ChunkedByteBuffer. The shuffle reader (CometBlockStoreShuffleReader) produces one
+   * ColumnarBatch per shuffle block, and there is one block per writer task per output partition.
+   * So with W writer tasks and P output partitions, the broadcast collects up to W * P tiny
+   * batches. For example, with 400 writer tasks and 500 partitions, 1M rows would arrive as ~200K
+   * batches of ~5 rows each.
+   *
+   * Without coalescing, every consumer task in the broadcast join would independently deserialize
+   * all of these tiny Arrow IPC streams, paying per-stream overhead (schema parsing, buffer
+   * allocation) for each one. With coalescing, we decode and append all batches into one
+   * VectorSchemaRoot on the driver, then re-serialize once. Each consumer task then deserializes
+   * a single Arrow IPC stream.
+   */
+  def coalesceBroadcastBatches(
+      input: Iterator[ChunkedByteBuffer]): (Array[ChunkedByteBuffer], Long, Long) = {
+    val buffers = input.filterNot(_.size == 0).toArray
+    if (buffers.isEmpty) {
+      return (Array.empty, 0L, 0L)
+    }
+
+    val allocator = org.apache.comet.CometArrowAllocator
+      .newChildAllocator("broadcast-coalesce", 0, Long.MaxValue)
+    try {
+      var targetRoot: VectorSchemaRoot = null
+      var totalRows = 0L
+      var batchCount = 0
+
+      val codec = CompressionCodec.createCodec(SparkEnv.get.conf)
+      try {
+        for (bytes <- buffers) {
+          val compressedInputStream =
+            new DataInputStream(codec.compressedInputStream(bytes.toInputStream()))
+          val reader =
+            new ArrowStreamReader(Channels.newChannel(compressedInputStream), allocator)
+          try {
+            // Comet decodes dictionaries during execution, so this shouldn't happen.
+            // If it does, fall back to the original uncoalesced buffers because each
+            // partition can have a different dictionary, and appending index vectors
+            // would silently mix indices from incompatible dictionaries.
+            if (!reader.getDictionaryVectors.isEmpty) {
+              logWarning(
+                "Unexpected dictionary-encoded column during BroadcastExchange coalescing; " +
+                  "skipping coalesce")
+              reader.close()
+              if (targetRoot != null) {
+                targetRoot.close()
+                targetRoot = null
+              }
+              return (buffers, 0L, 0L)
+            }
+            while (reader.loadNextBatch()) {
+              val sourceRoot = reader.getVectorSchemaRoot
+              if (targetRoot == null) {
+                targetRoot = VectorSchemaRoot.create(sourceRoot.getSchema, allocator)
+                targetRoot.allocateNew()
+              }
+              VectorSchemaRootAppender.append(targetRoot, sourceRoot)
+              totalRows += sourceRoot.getRowCount
+              batchCount += 1
+            }
+          } finally {
+            reader.close()
+          }
+        }
+
+        if (targetRoot == null) {
+          return (Array.empty, 0L, 0L)
+        }
+
+        assert(
+          targetRoot.getRowCount.toLong == totalRows,
+          s"Row count mismatch after coalesce: ${targetRoot.getRowCount} != $totalRows")
+
+        logInfo(s"Coalesced $batchCount broadcast batches into 1 ($totalRows rows)")
+
+        val outputStream = new ChunkedByteBufferOutputStream(1024 * 1024, ByteBuffer.allocate)
+        val compressedOutputStream =
+          new DataOutputStream(codec.compressedOutputStream(outputStream))
+        val writer =
+          new ArrowStreamWriter(targetRoot, null, Channels.newChannel(compressedOutputStream))
+        try {
+          writer.start()
+          writer.writeBatch()
+        } finally {
+          writer.close()
+        }
+
+        (Array(outputStream.toChunkedByteBuffer), batchCount.toLong, totalRows)
+      } finally {
+        if (targetRoot != null) {
+          targetRoot.close()
+        }
+      }
+    } finally {
+      allocator.close()
+    }
+  }
+
   def getBatchFieldVectors(
       batch: ColumnarBatch): (Seq[FieldVector], Option[DictionaryProvider]) = {
     var provider: Option[DictionaryProvider] = None