Commit 0560d84

Mohammad Linjawi authored and committed
Fix Delta 4.0 Spark 4.1 package build
1 parent ea4d893 commit 0560d84

5 files changed

Lines changed: 146 additions & 10 deletions

.github/workflows/velox_backend_x86.yml

Lines changed: 2 additions & 1 deletion
@@ -1227,10 +1227,11 @@ jobs:
     container: apache/gluten:centos-8-jdk17
     steps:
       - uses: actions/checkout@v4
-      - name: Build with fast-build profile (Spark 4.0, Java 17)
+      - name: Build with fast-build profile (Spark 4.0/4.1, Java 17)
         run: |
           cd $GITHUB_WORKSPACE/
           $MVN_CMD clean test-compile -Pspark-4.0 -Pscala-2.13 -Pbackends-velox -Pspark-ut -Piceberg,iceberg-test,delta,paimon -Pfast-build
+          $MVN_CMD clean test-compile -Pspark-4.1 -Pscala-2.13 -Pjava-17 -Pbackends-velox -Pspark-ut -Pdelta -Pfast-build
 
   spark-test-spark40:
     needs: build-native-lib-centos-7

backends-velox/src-delta40/main/scala/org/apache/spark/sql/delta/GlutenDeltaParquetFileFormat.scala

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@
  */
 package org.apache.spark.sql.delta
 
-import org.apache.spark.internal.{LoggingShims, MDC}
+import org.apache.spark.internal.{LoggingShims, MDC => SparkMDC}
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.delta.GlutenDeltaParquetFileFormat._
@@ -528,7 +528,7 @@ case class GlutenDeltaParquetFileFormat(
       case AlwaysTrue() => Some(AlwaysTrue())
       case AlwaysFalse() => Some(AlwaysFalse())
       case _ =>
-        logError(log"Failed to translate filter ${MDC(DeltaLogKeys.FILTER, filter)}")
+        logError(log"Failed to translate filter ${SparkMDC.of(DeltaLogKeys.FILTER, filter)}")
         None
     }
   }

backends-velox/src-delta40/main/scala/org/apache/spark/sql/delta/files/GlutenDeltaFileFormatWriter.scala

Lines changed: 7 additions & 7 deletions
@@ -24,7 +24,7 @@ import org.apache.gluten.execution.datasource.GlutenFormatFactory
 import org.apache.gluten.extension.columnar.transition.{Convention, Transitions}
 
 import org.apache.spark._
-import org.apache.spark.internal.{LoggingShims, MDC}
+import org.apache.spark.internal.{LoggingShims, MDC => SparkMDC}
 import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils}
 import org.apache.spark.shuffle.FetchFailedException
 import org.apache.spark.sql.SparkSession
@@ -343,20 +343,20 @@ object GlutenDeltaFileFormatWriter extends LoggingShims {
       val ret = f
       val commitMsgs = ret.map(_.commitMsg)
 
-      logInfo(log"Start to commit write Job ${MDC(DeltaLogKeys.JOB_ID, description.uuid)}.")
+      logInfo(log"Start to commit write Job ${SparkMDC.of(DeltaLogKeys.JOB_ID, description.uuid)}.")
       val (_, duration) = Utils.timeTakenMs { committer.commitJob(job, commitMsgs) }
-      logInfo(log"Write Job ${MDC(DeltaLogKeys.JOB_ID, description.uuid)} committed. " +
-        log"Elapsed time: ${MDC(DeltaLogKeys.DURATION, duration)} ms.")
+      logInfo(log"Write Job ${SparkMDC.of(DeltaLogKeys.JOB_ID, description.uuid)} committed. " +
+        log"Elapsed time: ${SparkMDC.of(DeltaLogKeys.DURATION, duration)} ms.")
 
       processStats(description.statsTrackers, ret.map(_.summary.stats), duration)
       logInfo(log"Finished processing stats for write job " +
-        log"${MDC(DeltaLogKeys.JOB_ID, description.uuid)}.")
+        log"${SparkMDC.of(DeltaLogKeys.JOB_ID, description.uuid)}.")
 
       // return a set of all the partition paths that were updated during this job
       ret.map(_.summary.updatedPartitions).reduceOption(_ ++ _).getOrElse(Set.empty)
     } catch {
       case cause: Throwable =>
-        logError(log"Aborting job ${MDC(DeltaLogKeys.JOB_ID, description.uuid)}", cause)
+        logError(log"Aborting job ${SparkMDC.of(DeltaLogKeys.JOB_ID, description.uuid)}", cause)
         committer.abortJob(job)
         throw cause
     }
@@ -490,7 +490,7 @@ object GlutenDeltaFileFormatWriter extends LoggingShims {
     })(catchBlock = {
       // If there is an error, abort the task
       dataWriter.abort()
-      logError(log"Job ${MDC(DeltaLogKeys.JOB_ID, jobId)} aborted.")
+      logError(log"Job ${SparkMDC.of(DeltaLogKeys.JOB_ID, jobId)} aborted.")
     }, finallyBlock = {
       dataWriter.close()
     })

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.execution.streaming

import org.apache.spark.sql.execution.streaming.checkpointing.{CheckpointFileManager => Spark41CheckpointFileManager}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, FSDataInputStream, Path, PathFilter}

import java.io.OutputStream

/**
 * Binary compatibility shim for Delta 4.0, which was compiled against Spark 4.0's
 * CheckpointFileManager package before Spark 4.1 moved it under streaming.checkpointing.
 */
trait CheckpointFileManager {
  def createAtomic(
      path: Path,
      overwriteIfPossible: Boolean): CheckpointFileManager.CancellableFSDataOutputStream

  def open(path: Path): FSDataInputStream

  def list(path: Path, filter: PathFilter): Array[FileStatus]

  def list(path: Path): Array[FileStatus] = {
    list(
      path,
      new PathFilter {
        override def accept(path: Path): Boolean = true
      })
  }

  def mkdirs(path: Path): Unit

  def exists(path: Path): Boolean

  def delete(path: Path): Unit

  def isLocal: Boolean

  def createCheckpointDirectory(): Path
}

object CheckpointFileManager {
  def create(path: Path, hadoopConf: Configuration): CheckpointFileManager = {
    new Spark41CheckpointFileManagerAdapter(
      Spark41CheckpointFileManager.create(path, hadoopConf))
  }

  abstract class CancellableFSDataOutputStream(outputStream: OutputStream)
    extends org.apache.hadoop.fs.FSDataOutputStream(
      outputStream,
      null.asInstanceOf[FileSystem.Statistics]) {
    def cancel(): Unit
  }

  private class Spark41CheckpointFileManagerAdapter(
      delegate: Spark41CheckpointFileManager)
    extends CheckpointFileManager {
    override def createAtomic(
        path: Path,
        overwriteIfPossible: Boolean): CancellableFSDataOutputStream = {
      new CancellableFSDataOutputStreamAdapter(delegate.createAtomic(path, overwriteIfPossible))
    }

    override def open(path: Path): FSDataInputStream = delegate.open(path)

    override def list(path: Path, filter: PathFilter): Array[FileStatus] =
      delegate.list(path, filter)

    override def mkdirs(path: Path): Unit = delegate.mkdirs(path)

    override def exists(path: Path): Boolean = delegate.exists(path)

    override def delete(path: Path): Unit = delegate.delete(path)

    override def isLocal: Boolean = delegate.isLocal

    override def createCheckpointDirectory(): Path = delegate.createCheckpointDirectory()
  }

  private class CancellableFSDataOutputStreamAdapter(
      delegate: Spark41CheckpointFileManager.CancellableFSDataOutputStream)
    extends CancellableFSDataOutputStream(delegate) {
    override def close(): Unit = delegate.close()

    override def cancel(): Unit = delegate.cancel()
  }
}
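
For context, a minimal usage sketch of how Delta 4.0 code compiled against the old org.apache.spark.sql.execution.streaming.CheckpointFileManager location keeps working on Spark 4.1: create hands back the adapter, which delegates every call to the relocated streaming.checkpointing implementation, and closing the cancellable stream commits the atomic write. The object name, helper method, and file names below are hypothetical illustrations, not part of the commit.

import java.nio.charset.StandardCharsets

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.execution.streaming.CheckpointFileManager

object CheckpointShimSketch {
  // Hypothetical helper: writes a small checkpoint file through the shim.
  def writeAtomically(checkpointDir: String, hadoopConf: Configuration): Unit = {
    // Resolves to the shim above, which wraps Spark 4.1's checkpointing manager.
    val manager = CheckpointFileManager.create(new Path(checkpointDir), hadoopConf)
    val out = manager.createAtomic(new Path(checkpointDir, "metadata"), overwriteIfPossible = true)
    try {
      out.write("v1".getBytes(StandardCharsets.UTF_8))
      out.close() // closing the cancellable stream commits the atomic write
    } catch {
      case t: Throwable =>
        out.cancel() // abandon the partially written file on failure
        throw t
    }
  }
}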
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.execution.streaming

import org.apache.spark.sql.{Encoder, SQLContext}
import org.apache.spark.sql.execution.streaming.runtime.{MemoryStream => RuntimeMemoryStream}

// Binary compatibility shim for Delta 4.0 code that references MemoryStream at its
// Spark 4.0 location; Spark 4.1 moved the implementation under streaming.runtime.
object MemoryStream {
  def apply[A: Encoder](implicit sqlContext: SQLContext): RuntimeMemoryStream[A] = {
    RuntimeMemoryStream[A]()(implicitly[Encoder[A]], sqlContext)
  }

  def apply[A: Encoder](
      numPartitions: Int)(
      implicit sqlContext: SQLContext): RuntimeMemoryStream[A] = {
    RuntimeMemoryStream[A](numPartitions)(implicitly[Encoder[A]], sqlContext.sparkSession)
  }
}
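
Similarly, a hedged test-style sketch showing that code importing the old MemoryStream path receives a RuntimeMemoryStream from the shim's apply. The object name and query name are hypothetical, and it assumes Spark 4.1's runtime.MemoryStream still exposes the familiar addData and toDS methods.

import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream

object MemoryStreamShimSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("memory-stream-shim").getOrCreate()
    import spark.implicits._ // provides Encoder[Int]
    implicit val sqlContext: SQLContext = spark.sqlContext

    val input = MemoryStream[Int] // resolves to the shim, returning a RuntimeMemoryStream[Int]
    input.addData(1, 2, 3)        // assumes runtime.MemoryStream keeps the addData API
    val query = input.toDS().writeStream.format("memory").queryName("shim_demo").start()
    query.processAllAvailable()
    spark.table("shim_demo").show()
    query.stop()
    spark.stop()
  }
}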
