apache
diff --git a/‎.github/workflows/iceberg_spark_test.yml‎
Lines changed: 0 additions & 132 deletions b/‎.github/workflows/iceberg_spark_test.yml‎
Lines changed: 0 additions & 132 deletions
diff --git a/‎.github/workflows/pr_rat_check.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/pr_rat_check.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala‎
Lines changed: 108 additions & 3 deletions b/‎common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala‎
Lines changed: 108 additions & 3 deletions
@@ -102,138 +102,6 @@ jobs:
           path: native/target/ci/libcomet.so
           retention-days: 1
 
-  iceberg-spark:
-    needs: build-native
-    if: contains(github.event.pull_request.title, '[iceberg]')
-    strategy:
-      matrix:
-        os: [ubuntu-24.04]
-        java-version: [11, 17]
-        iceberg-version: [{short: '1.8', full: '1.8.1'}, {short: '1.9', full: '1.9.1'}, {short: '1.10', full: '1.10.0'}]
-        spark-version: [{short: '3.5', full: '3.5.8'}]
-        scala-version: ['2.13']
-      fail-fast: false
-    name: iceberg-spark/${{ matrix.os }}/iceberg-${{ matrix.iceberg-version.full }}/spark-${{ matrix.spark-version.full }}/scala-${{ matrix.scala-version }}/java-${{ matrix.java-version }}
-    runs-on: ${{ matrix.os }}
-    container:
-      image: amd64/rust
-    env:
-      SPARK_LOCAL_IP: localhost
-    steps:
-      - uses: actions/checkout@v6
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java-version }}
-      - name: Download native library
-        uses: actions/download-artifact@v8
-        with:
-          name: native-lib-iceberg
-          path: native/target/release/
-      - name: Build Comet
-        run: |
-          ./mvnw install -Prelease -DskipTests -Pspark-${{ matrix.spark-version.short }} -Pscala-${{ matrix.scala-version }}
-      - name: Setup Iceberg
-        uses: ./.github/actions/setup-iceberg-builder
-        with:
-          iceberg-version: ${{ matrix.iceberg-version.full }}
-      - name: Run Iceberg Spark tests
-        run: |
-          cd apache-iceberg
-          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
-          ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ./gradlew -DsparkVersions=${{ matrix.spark-version.short }} -DscalaVersion=${{ matrix.scala-version }} -DflinkVersions= -DkafkaVersions= \
-            :iceberg-spark:iceberg-spark-${{ matrix.spark-version.short }}_${{ matrix.scala-version }}:test \
-            -Pquick=true -x javadoc
-
-  iceberg-spark-extensions:
-    needs: build-native
-    if: contains(github.event.pull_request.title, '[iceberg]')
-    strategy:
-      matrix:
-        os: [ubuntu-24.04]
-        java-version: [11, 17]
-        iceberg-version: [{short: '1.8', full: '1.8.1'}, {short: '1.9', full: '1.9.1'}, {short: '1.10', full: '1.10.0'}]
-        spark-version: [{short: '3.5', full: '3.5.8'}]
-        scala-version: ['2.13']
-      fail-fast: false
-    name: iceberg-spark-extensions/${{ matrix.os }}/iceberg-${{ matrix.iceberg-version.full }}/spark-${{ matrix.spark-version.full }}/scala-${{ matrix.scala-version }}/java-${{ matrix.java-version }}
-    runs-on: ${{ matrix.os }}
-    container:
-      image: amd64/rust
-    env:
-      SPARK_LOCAL_IP: localhost
-    steps:
-      - uses: actions/checkout@v6
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java-version }}
-      - name: Download native library
-        uses: actions/download-artifact@v8
-        with:
-          name: native-lib-iceberg
-          path: native/target/release/
-      - name: Build Comet
-        run: |
-          ./mvnw install -Prelease -DskipTests -Pspark-${{ matrix.spark-version.short }} -Pscala-${{ matrix.scala-version }}
-      - name: Setup Iceberg
-        uses: ./.github/actions/setup-iceberg-builder
-        with:
-          iceberg-version: ${{ matrix.iceberg-version.full }}
-      - name: Run Iceberg Spark extensions tests
-        run: |
-          cd apache-iceberg
-          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
-          ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ./gradlew -DsparkVersions=${{ matrix.spark-version.short }} -DscalaVersion=${{ matrix.scala-version }} -DflinkVersions= -DkafkaVersions= \
-            :iceberg-spark:iceberg-spark-extensions-${{ matrix.spark-version.short }}_${{ matrix.scala-version }}:test \
-            -Pquick=true -x javadoc
-
-  iceberg-spark-runtime:
-    needs: build-native
-    if: contains(github.event.pull_request.title, '[iceberg]')
-    strategy:
-      matrix:
-        os: [ubuntu-24.04]
-        java-version: [11, 17]
-        iceberg-version: [{short: '1.8', full: '1.8.1'}, {short: '1.9', full: '1.9.1'}, {short: '1.10', full: '1.10.0'}]
-        spark-version: [{short: '3.5', full: '3.5.8'}]
-        scala-version: ['2.13']
-      fail-fast: false
-    name: iceberg-spark-runtime/${{ matrix.os }}/iceberg-${{ matrix.iceberg-version.full }}/spark-${{ matrix.spark-version.full }}/scala-${{ matrix.scala-version }}/java-${{ matrix.java-version }}
-    runs-on: ${{ matrix.os }}
-    container:
-      image: amd64/rust
-    env:
-      SPARK_LOCAL_IP: localhost
-    steps:
-      - uses: actions/checkout@v6
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java-version }}
-      - name: Download native library
-        uses: actions/download-artifact@v8
-        with:
-          name: native-lib-iceberg
-          path: native/target/release/
-      - name: Build Comet
-        run: |
-          ./mvnw install -Prelease -DskipTests -Pspark-${{ matrix.spark-version.short }} -Pscala-${{ matrix.scala-version }}
-      - name: Setup Iceberg
-        uses: ./.github/actions/setup-iceberg-builder
-        with:
-          iceberg-version: ${{ matrix.iceberg-version.full }}
-      - name: Run Iceberg Spark runtime tests
-        run: |
-          cd apache-iceberg
-          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
-          ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ./gradlew -DsparkVersions=${{ matrix.spark-version.short }} -DscalaVersion=${{ matrix.scala-version }} -DflinkVersions= -DkafkaVersions= \
-            :iceberg-spark:iceberg-spark-runtime-${{ matrix.spark-version.short }}_${{ matrix.scala-version }}:integrationTest \
-            -Pquick=true -x javadoc
-
   iceberg-spark-rust:
     needs: build-native
     if: contains(github.event.pull_request.title, '[iceberg]')
 
@@ -37,9 +37,9 @@ jobs:
     name: RAT License Check
     runs-on: ubuntu-slim
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - name: Set up Java
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           distribution: temurin
           java-version: 11
 
@@ -26,13 +26,15 @@ import java.nio.channels.Channels
 import scala.jdk.CollectionConverters._
 
 import org.apache.arrow.c.CDataDictionaryProvider
-import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, DecimalVector, FieldVector, FixedSizeBinaryVector, Float4Vector, Float8Vector, IntVector, NullVector, SmallIntVector, TimeStampMicroTZVector, TimeStampMicroVector, TinyIntVector, ValueVector, VarBinaryVector, VarCharVector, VectorSchemaRoot}
+import org.apache.arrow.vector._
 import org.apache.arrow.vector.complex.{ListVector, MapVector, StructVector}
 import org.apache.arrow.vector.dictionary.DictionaryProvider
-import org.apache.arrow.vector.ipc.ArrowStreamWriter
+import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter}
 import org.apache.arrow.vector.types._
 import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}
+import org.apache.arrow.vector.util.VectorSchemaRootAppender
 import org.apache.spark.{SparkEnv, SparkException}
+import org.apache.spark.internal.Logging
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.sql.comet.execution.arrow.ArrowReaderIterator
 import org.apache.spark.sql.types._
@@ -43,7 +45,7 @@ import org.apache.comet.Constants.COMET_CONF_DIR_ENV
 import org.apache.comet.shims.CometTypeShim
 import org.apache.comet.vector.CometVector
 
-object Utils extends CometTypeShim {
+object Utils extends CometTypeShim with Logging {
   def getConfPath(confFileName: String): String = {
     sys.env
       .get(COMET_CONF_DIR_ENV)
@@ -232,6 +234,7 @@ object Utils extends CometTypeShim {
 
   /**
    * Decodes the byte arrays back to ColumnarBatches and puts them into a buffer.
+   *
    * @param bytes
    *   the serialized batches
    * @param source
@@ -252,6 +255,108 @@ object Utils extends CometTypeShim {
     new ArrowReaderIterator(Channels.newChannel(ins), source)
   }
 
+  /**
+   * Coalesces many small Arrow IPC batches into a single batch for broadcasting.
+   *
+   * Why this is necessary: The broadcast exchange collects shuffle output by calling
+   * getByteArrayRdd, which serializes each ColumnarBatch independently into its own
+   * ChunkedByteBuffer. The shuffle reader (CometBlockStoreShuffleReader) produces one
+   * ColumnarBatch per shuffle block, and there is one block per writer task per output partition.
+   * So with W writer tasks and P output partitions, the broadcast collects up to W * P tiny
+   * batches. For example, with 400 writer tasks and 500 partitions, 1M rows would arrive as ~200K
+   * batches of ~5 rows each.
+   *
+   * Without coalescing, every consumer task in the broadcast join would independently deserialize
+   * all of these tiny Arrow IPC streams, paying per-stream overhead (schema parsing, buffer
+   * allocation) for each one. With coalescing, we decode and append all batches into one
+   * VectorSchemaRoot on the driver, then re-serialize once. Each consumer task then deserializes
+   * a single Arrow IPC stream.
+   *
+   * All Arrow memory used while coalescing comes from a dedicated child allocator that is closed
+   * before this method returns; the returned buffers are plain heap ByteBuffers.
+   *
+   * @param input
+   *   the compressed, serialized Arrow IPC streams collected for the broadcast; zero-sized
+   *   buffers are ignored
+   * @return
+   *   a tuple of (buffers to broadcast, number of batches coalesced, number of rows coalesced):
+   *   - normally a single-element array holding the coalesced batch, with the real counts
+   *   - `(Array.empty, 0, 0)` when there is no non-empty buffer or no batch could be read
+   *   - `(original non-empty buffers, 0, 0)` when a dictionary-encoded column is encountered
+   *     and coalescing is skipped
+   */
+  def coalesceBroadcastBatches(
+      input: Iterator[ChunkedByteBuffer]): (Array[ChunkedByteBuffer], Long, Long) = {
+    val buffers = input.filterNot(_.size == 0).toArray
+    if (buffers.isEmpty) {
+      return (Array.empty, 0L, 0L)
+    }
+
+    val allocator = org.apache.comet.CometArrowAllocator
+      .newChildAllocator("broadcast-coalesce", 0, Long.MaxValue)
+    try {
+      var targetRoot: VectorSchemaRoot = null
+      var totalRows = 0L
+      var batchCount = 0
+      // Set when any stream carries dictionaries; stops the scan and selects the fallback result.
+      var dictionaryEncoded = false
+
+      val codec = CompressionCodec.createCodec(SparkEnv.get.conf)
+      try {
+        // A plain indexed loop (rather than `for (b <- buffers)`) keeps the early exit free of a
+        // non-local `return` from inside a closure, which Scala implements by throwing.
+        var index = 0
+        while (!dictionaryEncoded && index < buffers.length) {
+          val bytes = buffers(index)
+          val compressedInputStream =
+            new DataInputStream(codec.compressedInputStream(bytes.toInputStream()))
+          val reader =
+            new ArrowStreamReader(Channels.newChannel(compressedInputStream), allocator)
+          try {
+            // Comet decodes dictionaries during execution, so this shouldn't happen.
+            // If it does, fall back to the original uncoalesced buffers because each
+            // partition can have a different dictionary, and appending index vectors
+            // would silently mix indices from incompatible dictionaries.
+            if (!reader.getDictionaryVectors.isEmpty) {
+              dictionaryEncoded = true
+            } else {
+              while (reader.loadNextBatch()) {
+                val sourceRoot = reader.getVectorSchemaRoot
+                if (targetRoot == null) {
+                  // The first batch seen defines the schema of the coalesced root.
+                  targetRoot = VectorSchemaRoot.create(sourceRoot.getSchema, allocator)
+                  targetRoot.allocateNew()
+                }
+                VectorSchemaRootAppender.append(targetRoot, sourceRoot)
+                totalRows += sourceRoot.getRowCount
+                batchCount += 1
+              }
+            }
+          } finally {
+            // Single close point for the reader on every path (success, fallback, exception).
+            reader.close()
+          }
+          index += 1
+        }
+
+        if (dictionaryEncoded) {
+          logWarning(
+            "Unexpected dictionary-encoded column during BroadcastExchange coalescing; " +
+              "skipping coalesce")
+          // Any partially built targetRoot is released by the enclosing finally.
+          (buffers, 0L, 0L)
+        } else if (targetRoot == null) {
+          // Every stream was readable but none contained a batch.
+          (Array.empty[ChunkedByteBuffer], 0L, 0L)
+        } else {
+          assert(
+            targetRoot.getRowCount.toLong == totalRows,
+            s"Row count mismatch after coalesce: ${targetRoot.getRowCount} != $totalRows")
+
+          logInfo(s"Coalesced $batchCount broadcast batches into 1 ($totalRows rows)")
+
+          val outputStream = new ChunkedByteBufferOutputStream(1024 * 1024, ByteBuffer.allocate)
+          val compressedOutputStream =
+            new DataOutputStream(codec.compressedOutputStream(outputStream))
+          val writer =
+            new ArrowStreamWriter(targetRoot, null, Channels.newChannel(compressedOutputStream))
+          try {
+            writer.start()
+            writer.writeBatch()
+          } finally {
+            writer.close()
+          }
+
+          (Array(outputStream.toChunkedByteBuffer), batchCount.toLong, totalRows)
+        }
+      } finally {
+        if (targetRoot != null) {
+          targetRoot.close()
+        }
+      }
+    } finally {
+      allocator.close()
+    }
+  }
+
   def getBatchFieldVectors(
       batch: ColumnarBatch): (Seq[FieldVector], Option[DictionaryProvider]) = {
     var provider: Option[DictionaryProvider] = None