@@ -18,10 +18,10 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
# The base image tag is taken from the BASE_IMAGE_SPARK_VERSION build argument
# (the hunk header above shows its default as 4.0.1).
1818FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
1919
2020# Dependency versions - keep these compatible
21+ # Changing these will invalidate the JAR download cache layer
# NOTE(review): this excerpt is a diff rendering; "+"/"-" mark added/removed
# lines and the leading digits look like old/new line numbers - confirm
# against the real Dockerfile before applying anything verbatim.
# ICEBERG_VERSION, ICEBERG_SPARK_RUNTIME_VERSION, HADOOP_VERSION and
# AWS_SDK_VERSION are interpolated into the Maven artifact paths of the
# JAR download loop further down.
2122ARG ICEBERG_VERSION=1.10.1
2223ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
2324ARG HADOOP_VERSION=3.4.1
# SCALA_VERSION is dropped by this change; nothing visible in this excerpt
# references it (the "_2.13" suffix above presumably carries the Scala
# version instead - verify).
24- ARG SCALA_VERSION=2.13
2525ARG AWS_SDK_VERSION=2.24.6
# MAVEN_MIRROR is not referenced in the visible lines; presumably it is the
# base URL of the curl download, whose arguments are cut off here - verify.
2626ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
2727
@@ -31,26 +31,23 @@ WORKDIR ${SPARK_HOME}
3131# Install curl for JAR downloads
# update + install share one RUN so the package index is always fresh when
# curl is installed; --no-install-recommends keeps optional packages out, and
# the apt lists are deleted in the same layer so they never reach the image.
# NOTE(review): the added `apt-get clean` may be redundant if the base image
# already configures apt to clean automatically - confirm for apache/spark.
3232RUN apt-get update && \
3333 apt-get install -y --no-install-recommends curl && \
34+ apt-get clean && \
3435 rm -rf /var/lib/apt/lists/*
3536
36- # Copy configuration (early for better caching)
# Removed at this position: the same COPY is re-added after the JAR download,
# so that editing spark-defaults.conf no longer invalidates the download layer.
37- COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
38-
39- # Create event log directory
37+ # Create directories (separate layer)
# Creates /home/iceberg/spark-events and hands the whole /home/iceberg tree to
# spark:spark. The comment this change removes called it the event log
# directory; presumably it matches the event log path configured in
# spark-defaults.conf - verify.
4038RUN mkdir -p /home/iceberg/spark-events && \
4139 chown -R spark:spark /home/iceberg
4240
43- # Required JAR dependencies
# Removed by this change: ENV values persist into the image's runtime
# environment, so this build-only list was carried into every container.
# The same four paths are now listed inline in the download loop instead.
44- ENV JARS_TO_DOWNLOAD="\
45- org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
46- org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
47- org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
48- software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
49-
50- # Download JARs with retry logic
41+ # Download JARs with retry logic (most cacheable - only changes when versions change)
42+ # This is the slowest step, so we do it before copying config files
# Loops over four Maven-relative artifact paths (Iceberg Spark runtime, Iceberg
# AWS bundle, hadoop-aws, AWS SDK bundle), derives each file name with
# basename, and fetches it with curl from inside ${SPARK_HOME}/jars.
# curl flags: -f fail on HTTP errors, -sS silent but still print errors,
# -L follow redirects; up to 3 retries, 5 seconds apart.
# NOTE(review): the rest of the curl command (URL and output arguments) falls
# between diff hunks and is not visible here - do not infer it.
# NOTE(review): inside an && chain `set -e` does not abort on a failing
# non-final command, so a failed download in an early iteration may not stop
# the loop - verify against the hidden remainder of the loop body.
# After the loop, everything under ${SPARK_HOME}/jars is chowned to spark:spark.
5143RUN set -e && \
5244 cd "${SPARK_HOME}/jars" && \
53- for jar_path in ${JARS_TO_DOWNLOAD}; do \
45+ for jar_path in \
46+ "org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
47+ "org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
48+ "org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
49+ "software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar" ; \
50+ do \
5451 jar_name=$(basename "${jar_path}" ) && \
5552 echo "Downloading ${jar_name}..." && \
5653 curl -fsSL --retry 3 --retry-delay 5 \
@@ -60,6 +57,9 @@ RUN set -e && \
6057 done && \
6158 chown -R spark:spark "${SPARK_HOME}/jars"
6259
60+ # Copy configuration last (changes more frequently than JARs)
# --chown sets the file's owner to spark:spark at copy time, so no follow-up
# chown layer is needed. SPARK_HOME is not defined in the visible lines;
# presumably it comes from the base image or an earlier ENV - verify.
61+ COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
62+
# Switch to the non-root spark user for the instructions that follow (and for
# the running container, unless a later instruction overrides it), then set
# the working directory to ${SPARK_HOME}.
6363USER spark
6464WORKDIR ${SPARK_HOME}
6565
0 commit comments