Skip to content

Commit 6c85b2f

Browse files
committed
Cache Docker images in GitHub CI
1 parent 061713b commit 6c85b2f

File tree

4 files changed

+63
-23
lines changed

4 files changed

+63
-23
lines changed

.github/workflows/python-ci.yml

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,27 @@ jobs:
8585
- name: Install
8686
run: make install
8787

88+
- name: Set up Docker Buildx
89+
uses: docker/setup-buildx-action@v3
90+
91+
- name: Build and cache Spark image
92+
uses: docker/build-push-action@v6
93+
with:
94+
context: dev/spark
95+
load: true
96+
tags: pyiceberg-spark:latest
97+
cache-from: type=gha,scope=spark
98+
cache-to: type=gha,mode=max,scope=spark
99+
100+
- name: Build and cache Hive image
101+
uses: docker/build-push-action@v6
102+
with:
103+
context: dev/hive
104+
load: true
105+
tags: pyiceberg-hive:latest
106+
cache-from: type=gha,scope=hive
107+
cache-to: type=gha,mode=max,scope=hive
108+
88109
- name: Run integration tests with coverage
89110
run: COVERAGE=1 make test-integration
90111
- name: Show debug logs

dev/docker-compose-integration.yml

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,11 @@
1818
services:
1919
spark-iceberg:
2020
container_name: pyiceberg-spark
21-
build: spark/
21+
image: pyiceberg-spark:latest
22+
build:
23+
context: spark/
24+
cache_from:
25+
- pyiceberg-spark:latest
2226
networks:
2327
iceberg_net:
2428
depends_on:
@@ -91,7 +95,11 @@ services:
9195
tail -f /dev/null
9296
"
9397
hive:
94-
build: hive/
98+
image: pyiceberg-hive:latest
99+
build:
100+
context: hive/
101+
cache_from:
102+
- pyiceberg-hive:latest
95103
container_name: pyiceberg-hive
96104
hostname: hive
97105
networks:

dev/hive/Dockerfile

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -15,17 +15,28 @@
1515

1616
FROM apache/hive:4.0.0
1717

18-
ENV HADOOP_VERSION=3.3.6
19-
ENV AWS_SDK_BUNDLE=1.12.753
18+
# Dependency versions - changing these invalidates the JAR download layer
19+
ARG HADOOP_VERSION=3.3.6
20+
ARG AWS_SDK_BUNDLE=1.12.753
21+
ARG MAVEN_MIRROR=https://repo1.maven.org/maven2
2022

2123
USER root
2224

23-
# Install curl, download JARs, and cleanup in a single layer
24-
RUN apt-get update -qq && apt-get -qq -y install curl && \
25-
curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
26-
curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
27-
apt-get clean && rm -rf /var/lib/apt/lists/*
25+
# Install curl (separate layer - rarely changes)
26+
RUN apt-get update -qq && \
27+
apt-get -qq -y install --no-install-recommends curl && \
28+
apt-get clean && \
29+
rm -rf /var/lib/apt/lists/*
2830

31+
# Download JARs with retry logic (slow layer - only changes when versions change)
32+
RUN curl -fsSL --retry 3 --retry-delay 5 \
33+
-o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
34+
${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar && \
35+
curl -fsSL --retry 3 --retry-delay 5 \
36+
-o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
37+
${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar
38+
39+
# Copy configuration last (changes more frequently than JARs)
2940
COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
3041

3142
USER hive

dev/spark/Dockerfile

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,10 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
1818
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
1919

2020
# Dependency versions - keep these compatible
21+
# Changing these will invalidate the JAR download cache layer
2122
ARG ICEBERG_VERSION=1.10.1
2223
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
2324
ARG HADOOP_VERSION=3.4.1
24-
ARG SCALA_VERSION=2.13
2525
ARG AWS_SDK_VERSION=2.24.6
2626
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
2727

@@ -31,26 +31,23 @@ WORKDIR ${SPARK_HOME}
3131
# Install curl for JAR downloads
3232
RUN apt-get update && \
3333
apt-get install -y --no-install-recommends curl && \
34+
apt-get clean && \
3435
rm -rf /var/lib/apt/lists/*
3536

36-
# Copy configuration (early for better caching)
37-
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
38-
39-
# Create event log directory
37+
# Create directories (separate layer)
4038
RUN mkdir -p /home/iceberg/spark-events && \
4139
chown -R spark:spark /home/iceberg
4240

43-
# Required JAR dependencies
44-
ENV JARS_TO_DOWNLOAD="\
45-
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
46-
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
47-
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
48-
software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
49-
50-
# Download JARs with retry logic
41+
# Download JARs with retry logic (most cacheable - only changes when versions change)
42+
# This is the slowest step, so we do it before copying config files
5143
RUN set -e && \
5244
cd "${SPARK_HOME}/jars" && \
53-
for jar_path in ${JARS_TO_DOWNLOAD}; do \
45+
for jar_path in \
46+
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
47+
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
48+
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
49+
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
50+
do \
5451
jar_name=$(basename "${jar_path}") && \
5552
echo "Downloading ${jar_name}..." && \
5653
curl -fsSL --retry 3 --retry-delay 5 \
@@ -60,6 +57,9 @@ RUN set -e && \
6057
done && \
6158
chown -R spark:spark "${SPARK_HOME}/jars"
6259

60+
# Copy configuration last (changes more frequently than JARs)
61+
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
62+
6363
USER spark
6464
WORKDIR ${SPARK_HOME}
6565

0 commit comments

Comments (0)