Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/workflows/python-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,27 @@ jobs:
- name: Install
run: make install

# Set up Docker Buildx before the image build steps that follow, which pass
# `cache-from` / `cache-to` options to docker/build-push-action.
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

# Pre-build the Spark image from dev/spark using the GitHub Actions cache
# backend (`type=gha`). `load: true` loads the built image into the local
# Docker image store, and the tag is the same `pyiceberg-spark:latest` name
# that dev/docker-compose-integration.yml sets as `image:` for the
# spark-iceberg service (presumably so compose reuses it - confirm).
- name: Build and cache Spark image
uses: docker/build-push-action@v6
with:
context: dev/spark
load: true
tags: pyiceberg-spark:latest
# Read from and write to the "spark" cache scope, kept separate from the
# Hive image's scope; `mode=max` also exports intermediate layers.
cache-from: type=gha,scope=spark
cache-to: type=gha,mode=max,scope=spark
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


# Pre-build the Hive image from dev/hive using the GitHub Actions cache
# backend (`type=gha`). `load: true` loads the built image into the local
# Docker image store, and the tag is the same `pyiceberg-hive:latest` name
# that dev/docker-compose-integration.yml sets as `image:` for the hive
# service (presumably so compose reuses it - confirm).
- name: Build and cache Hive image
uses: docker/build-push-action@v6
with:
context: dev/hive
load: true
tags: pyiceberg-hive:latest
# Read from and write to the "hive" cache scope, kept separate from the
# Spark image's scope; `mode=max` also exports intermediate layers.
cache-from: type=gha,scope=hive
cache-to: type=gha,mode=max,scope=hive

- name: Run integration tests with coverage
run: COVERAGE=1 make test-integration
- name: Show debug logs
Expand Down
12 changes: 10 additions & 2 deletions dev/docker-compose-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
services:
spark-iceberg:
container_name: pyiceberg-spark
build: spark/
image: pyiceberg-spark:latest
build:
context: spark/
cache_from:
- pyiceberg-spark:latest
networks:
iceberg_net:
depends_on:
Expand Down Expand Up @@ -91,7 +95,11 @@ services:
tail -f /dev/null
"
hive:
build: hive/
image: pyiceberg-hive:latest
build:
context: hive/
cache_from:
- pyiceberg-hive:latest
container_name: pyiceberg-hive
hostname: hive
networks:
Expand Down
25 changes: 18 additions & 7 deletions dev/hive/Dockerfile
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same logic, just reordered

Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,28 @@

FROM apache/hive:4.0.0

ENV HADOOP_VERSION=3.3.6
ENV AWS_SDK_BUNDLE=1.12.753
# Dependency versions - changing these invalidates the JAR download layer.
# They are declared as ARG (build-time only), so they can be overridden with
# --build-arg and are not persisted as environment variables in the image.
# MAVEN_MIRROR is the repository base URL the JARs are downloaded from.
ARG HADOOP_VERSION=3.3.6
ARG AWS_SDK_BUNDLE=1.12.753
ARG MAVEN_MIRROR=https://repo1.maven.org/maven2

USER root

# Install curl, download JARs, and cleanup in a single layer
RUN apt-get update -qq && apt-get -qq -y install curl && \
curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# Install curl for the JAR downloads (separate layer - rarely changes).
# ca-certificates is listed explicitly: it is only a *recommended* dependency
# of curl's library, so with --no-install-recommends the HTTPS downloads would
# otherwise rely on the base image happening to ship a CA bundle.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update -qq && \
    apt-get -qq -y install --no-install-recommends \
        ca-certificates \
        curl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Download the hadoop-aws and aws-java-sdk-bundle JARs into /opt/hive/lib.
# This is the slow layer; it is only rebuilt when one of the version ARGs or
# MAVEN_MIRROR changes. Each entry is a path relative to MAVEN_MIRROR, and the
# JAR keeps its repository file name. curl retries up to 3 times, 5 seconds
# apart, and the first download that still fails aborts the build.
RUN for artifact in \
        org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
        com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar; \
    do \
        curl -fsSL --retry 3 --retry-delay 5 \
            -o "/opt/hive/lib/$(basename "${artifact}")" \
            "${MAVEN_MIRROR}/${artifact}" || exit; \
    done

# Copy configuration last (changes more frequently than JARs), so that editing
# core-site.xml does not invalidate the cached JAR download layer.
COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml

# Switch from root (set earlier for the install and download steps) back to
# the hive user.
USER hive
28 changes: 14 additions & 14 deletions dev/spark/Dockerfile
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same logic, just reordered

Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}

# Dependency versions - keep these compatible
# Changing these will invalidate the JAR download cache layer
ARG ICEBERG_VERSION=1.10.1
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
ARG HADOOP_VERSION=3.4.1
ARG SCALA_VERSION=2.13
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`SCALA_VERSION` is unreferenced, so removing it.

ARG AWS_SDK_VERSION=2.24.6
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2

Expand All @@ -31,26 +31,23 @@ WORKDIR ${SPARK_HOME}
# Install curl for JAR downloads.
# ca-certificates is listed explicitly: it is only a *recommended* dependency
# of curl's library, so with --no-install-recommends the HTTPS downloads would
# otherwise rely on the base image happening to ship a CA bundle.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy configuration (early for better caching)
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/

# Create event log directory
# Create the Spark event log directory and give the spark user ownership of
# everything under /home/iceberg (kept as its own layer).
RUN mkdir -p /home/iceberg/spark-events && \
chown -R spark:spark /home/iceberg

# Required JAR dependencies
ENV JARS_TO_DOWNLOAD="\
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"

# Download JARs with retry logic
# Download JARs with retry logic (most cacheable - only changes when versions change)
# This is the slowest step, so we do it before copying config files
RUN set -e && \
cd "${SPARK_HOME}/jars" && \
for jar_path in ${JARS_TO_DOWNLOAD}; do \
for jar_path in \
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
do \
jar_name=$(basename "${jar_path}") && \
echo "Downloading ${jar_name}..." && \
curl -fsSL --retry 3 --retry-delay 5 \
Expand All @@ -60,6 +57,9 @@ RUN set -e && \
done && \
chown -R spark:spark "${SPARK_HOME}/jars"

# Copy configuration last (changes more frequently than JARs), so that editing
# spark-defaults.conf leaves the earlier, slower layers cached.
# --chown makes the copied file owned by spark:spark without an extra RUN.
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/

# Subsequent instructions run as the spark user, with SPARK_HOME as the
# working directory.
USER spark
WORKDIR ${SPARK_HOME}

Expand Down