Skip to content

Commit ca93b31

Browse files
authored
improve dockerfile for better caching (#2930)
<!-- Thanks for opening a pull request! --> <!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. --> <!-- Closes #${GITHUB_ISSUE_ID} --> # Rationale for this change Reorder Dockerfile commands for better caching. All files keep the same functionality; the commands are just reordered. ## Are these changes tested? ## Are there any user-facing changes? <!-- In the case of user-facing changes, please add the changelog label. -->
1 parent 287f679 commit ca93b31

File tree

3 files changed

+51
-42
lines changed

3 files changed

+51
-42
lines changed

dev/docker-compose-integration.yml

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,22 @@
1717

1818
services:
1919
spark-iceberg:
20-
container_name: pyiceberg-spark
20+
image: pyiceberg-spark:latest
2121
build: spark/
22+
container_name: pyiceberg-spark
2223
networks:
2324
iceberg_net:
2425
depends_on:
2526
- rest
2627
- hive
2728
- minio
29+
ports:
30+
- 15002:15002 # Spark Connect
31+
- 4040:4040 # Spark UI
2832
environment:
2933
- AWS_ACCESS_KEY_ID=admin
3034
- AWS_SECRET_ACCESS_KEY=password
3135
- AWS_REGION=us-east-1
32-
ports:
33-
- 15002:15002 # Spark Connect
34-
- 4040:4040 # Spark UI
3536
links:
3637
- rest:rest
3738
- hive:hive
@@ -60,25 +61,25 @@ services:
6061
minio:
6162
image: minio/minio
6263
container_name: pyiceberg-minio
63-
environment:
64-
- MINIO_ROOT_USER=admin
65-
- MINIO_ROOT_PASSWORD=password
66-
- MINIO_DOMAIN=minio
6764
networks:
6865
iceberg_net:
6966
aliases:
7067
- warehouse.minio
7168
ports:
7269
- 9001:9001
7370
- 9000:9000
71+
environment:
72+
- MINIO_ROOT_USER=admin
73+
- MINIO_ROOT_PASSWORD=password
74+
- MINIO_DOMAIN=minio
7475
command: ["server", "/data", "--console-address", ":9001"]
7576
mc:
76-
depends_on:
77-
- minio
7877
image: minio/mc
7978
container_name: pyiceberg-mc
8079
networks:
8180
iceberg_net:
81+
depends_on:
82+
- minio
8283
environment:
8384
- AWS_ACCESS_KEY_ID=admin
8485
- AWS_SECRET_ACCESS_KEY=password
@@ -91,6 +92,7 @@ services:
9192
tail -f /dev/null
9293
"
9394
hive:
95+
image: pyiceberg-hive:latest
9496
build: hive/
9597
container_name: pyiceberg-hive
9698
hostname: hive

dev/hive/Dockerfile

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,28 @@
1515

1616
FROM apache/hive:4.0.0
1717

18-
ENV HADOOP_VERSION=3.3.6
19-
ENV AWS_SDK_BUNDLE=1.12.753
18+
# Dependency versions - changing these invalidates the JAR download layer
19+
ARG HADOOP_VERSION=3.3.6
20+
ARG AWS_SDK_BUNDLE=1.12.753
21+
ARG MAVEN_MIRROR=https://repo1.maven.org/maven2
2022

2123
USER root
2224

23-
# Install curl, download JARs, and cleanup in a single layer
24-
RUN apt-get update -qq && apt-get -qq -y install curl && \
25-
curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
26-
curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
27-
apt-get clean && rm -rf /var/lib/apt/lists/*
25+
# Install curl (separate layer - rarely changes)
26+
RUN apt-get update -qq && \
27+
apt-get -qq -y install --no-install-recommends curl && \
28+
apt-get clean && \
29+
rm -rf /var/lib/apt/lists/*
2830

31+
# Download JARs with retry logic (slow layer - only changes when versions change)
32+
RUN curl -fsSL --retry 3 --retry-delay 5 \
33+
-o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
34+
"${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" && \
35+
curl -fsSL --retry 3 --retry-delay 5 \
36+
-o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
37+
"${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar"
38+
39+
# Copy configuration last (changes more frequently than JARs)
2940
COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
3041

3142
USER hive

dev/spark/Dockerfile

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,50 +18,46 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
1818
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
1919

2020
# Dependency versions - keep these compatible
21+
# Changing these will invalidate the JAR download cache layer
2122
ARG ICEBERG_VERSION=1.10.1
2223
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
2324
ARG HADOOP_VERSION=3.4.1
24-
ARG SCALA_VERSION=2.13
2525
ARG AWS_SDK_VERSION=2.24.6
2626
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
2727

2828
USER root
2929
WORKDIR ${SPARK_HOME}
3030

31-
# Install curl for JAR downloads
32-
RUN apt-get update && \
33-
apt-get install -y --no-install-recommends curl && \
34-
rm -rf /var/lib/apt/lists/*
35-
36-
# Copy configuration (early for better caching)
37-
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
38-
39-
# Create event log directory
40-
RUN mkdir -p /home/iceberg/spark-events && \
31+
# Install curl and create directories
32+
RUN apt-get update -qq && \
33+
apt-get install -qq -y --no-install-recommends curl && \
34+
apt-get clean && \
35+
rm -rf /var/lib/apt/lists/* && \
36+
mkdir -p /home/iceberg/spark-events && \
4137
chown -R spark:spark /home/iceberg
4238

43-
# Required JAR dependencies
44-
ENV JARS_TO_DOWNLOAD="\
45-
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
46-
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
47-
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
48-
software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
49-
50-
# Download JARs with retry logic
39+
# Download JARs with retry logic (most cacheable - only changes when versions change)
40+
# This is the slowest step, so we do it before copying config files
5141
RUN set -e && \
5242
cd "${SPARK_HOME}/jars" && \
53-
for jar_path in ${JARS_TO_DOWNLOAD}; do \
43+
for jar_path in \
44+
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
45+
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
46+
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
47+
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
48+
do \
5449
jar_name=$(basename "${jar_path}") && \
55-
echo "Downloading ${jar_name}..." && \
5650
curl -fsSL --retry 3 --retry-delay 5 \
5751
-o "${jar_name}" \
5852
"${MAVEN_MIRROR}/${jar_path}" && \
59-
echo "✓ Downloaded ${jar_name}"; \
60-
done && \
61-
chown -R spark:spark "${SPARK_HOME}/jars"
53+
chown spark:spark "${jar_name}"; \
54+
done
55+
56+
# Copy configuration last (changes more frequently than JARs)
57+
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
6258

6359
USER spark
6460
WORKDIR ${SPARK_HOME}
6561

6662
# Start Spark Connect server
67-
CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]
63+
CMD ["bash", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]

0 commit comments

Comments
 (0)