# Spark version of the base image; override with --build-arg.
ARG BASE_IMAGE_SPARK_VERSION=4.0.1
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}

# Dependency versions - keep these compatible with each other.
# Changing any of them invalidates the cached JAR download layer below.
ARG ICEBERG_VERSION=1.10.1
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
ARG HADOOP_VERSION=3.4.1
ARG AWS_SDK_VERSION=2.24.6
# Base URL of the Maven repository the JARs are fetched from.
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2

# Root is required for package installation and for writing into ${SPARK_HOME}/jars.
USER root
WORKDIR ${SPARK_HOME}

# Install curl (used for the JAR downloads) and create the event log directory.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update -qq && \
    apt-get install -qq -y --no-install-recommends curl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p /home/iceberg/spark-events && \
    chown -R spark:spark /home/iceberg

# Download the required JARs into ${SPARK_HOME}/jars.
# This is the slowest step and only changes when a version ARG changes, so it
# runs before the configuration file is copied.
#
# The statements are separated with ';' rather than chained with '&&' on
# purpose: `set -e` ignores a failing command that is a non-final member of an
# '&&' list, which would let a failed curl go unnoticed for every JAR but the
# last one. With ';' any failed download aborts the build.
RUN set -e; \
    cd "${SPARK_HOME}/jars"; \
    for jar_path in \
        "org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
        "org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
        "org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
        "software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar" ; \
    do \
        jar_name="$(basename "${jar_path}")"; \
        curl -fsSL --retry 3 --retry-delay 5 \
            -o "${jar_name}" \
            "${MAVEN_MIRROR}/${jar_path}"; \
        chown spark:spark "${jar_name}"; \
    done

# Copy configuration last (it changes more frequently than the JARs).
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/

# Drop privileges for runtime.
USER spark
WORKDIR ${SPARK_HOME}

# Start Spark Connect server in the foreground (SPARK_NO_DAEMONIZE=true).
# A shell is needed to expand ${SPARK_HOME} at container start; `exec` then
# replaces that shell so the start script receives signals directly.
CMD ["bash", "-c", "exec env SPARK_NO_DAEMONIZE=true \"${SPARK_HOME}/sbin/start-connect-server.sh\""]