Skip to content

Commit 79327f6

Browse files
committed
improve dockerfile for better caching
1 parent 061713b commit 79327f6

File tree

3 files changed

+41
-32
lines changed

3 files changed

+41
-32
lines changed

dev/docker-compose-integration.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
services:
1919
spark-iceberg:
2020
container_name: pyiceberg-spark
21+
image: pyiceberg-spark:latest
2122
build: spark/
2223
networks:
2324
iceberg_net:
@@ -91,6 +92,7 @@ services:
9192
tail -f /dev/null
9293
"
9394
hive:
95+
image: pyiceberg-hive:latest
9496
build: hive/
9597
container_name: pyiceberg-hive
9698
hostname: hive

dev/hive/Dockerfile

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,28 @@
1515

1616
FROM apache/hive:4.0.0
1717

18-
ENV HADOOP_VERSION=3.3.6
19-
ENV AWS_SDK_BUNDLE=1.12.753
18+
# Dependency versions - changing these invalidates the JAR download layer
19+
ARG HADOOP_VERSION=3.3.6
20+
ARG AWS_SDK_BUNDLE=1.12.753
21+
ARG MAVEN_MIRROR=https://repo1.maven.org/maven2
2022

2123
USER root
2224

23-
# Install curl, download JARs, and cleanup in a single layer
24-
RUN apt-get update -qq && apt-get -qq -y install curl && \
25-
curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
26-
curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
27-
apt-get clean && rm -rf /var/lib/apt/lists/*
25+
# Install curl (separate layer - rarely changes)
26+
RUN apt-get update -qq && \
27+
apt-get -qq -y install --no-install-recommends curl && \
28+
apt-get clean && \
29+
rm -rf /var/lib/apt/lists/*
2830

31+
# Download JARs with retry logic (slow layer - only changes when versions change)
32+
RUN curl -fsSL --retry 3 --retry-delay 5 \
33+
-o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
34+
"${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" && \
35+
curl -fsSL --retry 3 --retry-delay 5 \
36+
-o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
37+
"${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar"
38+
39+
# Copy configuration last (changes more frequently than JARs)
2940
COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
3041

3142
USER hive

dev/spark/Dockerfile

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,50 +18,46 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
1818
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
1919

2020
# Dependency versions - keep these compatible
21+
# Changing these will invalidate the JAR download cache layer
2122
ARG ICEBERG_VERSION=1.10.1
2223
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
2324
ARG HADOOP_VERSION=3.4.1
24-
ARG SCALA_VERSION=2.13
2525
ARG AWS_SDK_VERSION=2.24.6
2626
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
2727

2828
USER root
2929
WORKDIR ${SPARK_HOME}
3030

31-
# Install curl for JAR downloads
32-
RUN apt-get update && \
33-
apt-get install -y --no-install-recommends curl && \
34-
rm -rf /var/lib/apt/lists/*
35-
36-
# Copy configuration (early for better caching)
37-
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
38-
39-
# Create event log directory
40-
RUN mkdir -p /home/iceberg/spark-events && \
31+
# Install curl and create the Spark event log directory (/home/iceberg/spark-events)
32+
RUN apt-get update -qq && \
33+
apt-get install -qq -y --no-install-recommends curl && \
34+
apt-get clean && \
35+
rm -rf /var/lib/apt/lists/* && \
36+
mkdir -p /home/iceberg/spark-events && \
4137
chown -R spark:spark /home/iceberg
4238

43-
# Required JAR dependencies
44-
ENV JARS_TO_DOWNLOAD="\
45-
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
46-
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
47-
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
48-
software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
49-
50-
# Download JARs with retry logic
39+
# Download JARs with retry logic (most cacheable - only changes when versions change)
40+
# This is the slowest step, so we do it before copying config files
5141
RUN set -e && \
5242
cd "${SPARK_HOME}/jars" && \
53-
for jar_path in ${JARS_TO_DOWNLOAD}; do \
43+
for jar_path in \
44+
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
45+
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
46+
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
47+
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
48+
do \
5449
jar_name=$(basename "${jar_path}") && \
55-
echo "Downloading ${jar_name}..." && \
5650
curl -fsSL --retry 3 --retry-delay 5 \
5751
-o "${jar_name}" \
5852
"${MAVEN_MIRROR}/${jar_path}" && \
59-
echo "✓ Downloaded ${jar_name}"; \
60-
done && \
61-
chown -R spark:spark "${SPARK_HOME}/jars"
53+
# Abort on any failed step: `set -e` ignores failures inside an `&&` chain, so a failed download would otherwise be skipped silently
chown spark:spark "${jar_name}" || exit 1; \
54+
done
55+
56+
# Copy configuration last (changes more frequently than JARs)
57+
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
6258

6359
USER spark
6460
WORKDIR ${SPARK_HOME}
6561

6662
# Start Spark Connect server
67-
CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]
63+
CMD ["bash", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]

0 commit comments

Comments
 (0)