# Warm Maven Dependency Cache #48

name: Warm Maven Dependency Cache

# Pre-populates the GitHub Actions cache with every Maven dependency, pulled
# through JFrog Artifactory. Forked PRs cannot authenticate to JFrog, so they
# restore this cache and build without credentials.
#
# Runs on:
#   - pushes to main that touch a pom.xml (refreshes the cache after dependency updates)
#   - a daily schedule (keeps the cache from hitting the 7-day eviction window)
#   - manual dispatch (optionally with a PR number, to warm the cache for a fork's pom.xml)
on:
  push:
    branches:
      - main
    paths:
      - '**/pom.xml'
  schedule:
    # Daily at 06:00 UTC
    - cron: '0 6 * * *'
  workflow_dispatch:
    inputs:
      pr_number:
        description: 'PR number to warm cache for (reads pom.xml from the PR branch). Leave empty to warm from main.'
        required: false
        type: string

permissions:
  # Lets the job request the GitHub OIDC ID token that is exchanged for a JFrog token.
  id-token: write
  contents: read
  # Needed to read PR metadata for fork checkout
  pull-requests: read
jobs:
  warm-cache:
    # The cache is warmed once per operating system: GitHub Actions caches are
    # OS-scoped, so an entry saved on Linux cannot be restored on Windows (and
    # vice versa). fail-fast is off so one OS failing does not cancel the other.
    strategy:
      fail-fast: false
      matrix:
        github-runner:
          - linux-ubuntu-latest
          - windows-server-latest
    runs-on:
      group: databricks-protected-runner-group
      labels: ${{ matrix.github-runner }}
    steps:
# Installs the JDK used by every Maven invocation in this job.
# NOTE(review): setup-java documents the 'adopt' distribution as superseded by
# 'temurin' — confirm whether the sibling CI workflows can migrate together.
- name: Set up JDK
  uses: actions/setup-java@c1e323688fd81a25caa38c78aa6df2d33d3e20d9 # v4
  with:
    distribution: adopt
    # Quoted so the version is always handed to the action as a string.
    java-version: '21'
# Only Windows runners get core.longpaths switched on, in git's system-level config.
- name: Enable long paths (Windows)
  if: ${{ runner.os == 'Windows' }}
  run: git config --system core.longpaths true
# When a PR number is supplied, check out only the pom.xml files from the PR's
# head branch (security: no source code from the fork lands in the workspace).
# NOTE(review): a fork-controlled pom.xml can still declare plugins that execute
# during the later Maven steps, which run with the JFrog token in the
# environment — confirm that only trusted PRs are warmed this way.
- name: Checkout PR pom.xml files (sparse)
  if: inputs.pr_number != ''
  shell: bash
  env:
    # Values reach the script through the environment instead of being
    # interpolated into it, so their text can never be parsed as shell code.
    GH_TOKEN: ${{ github.token }}
    REPOSITORY: ${{ github.repository }}
    PR_NUMBER: ${{ inputs.pr_number }}
  run: |
    set -euo pipefail
    # Reject anything that is not a plain PR number before it is placed in a URL.
    if [[ ! "$PR_NUMBER" =~ ^[0-9]+$ ]]; then
      echo "FAIL: pr_number must be a positive integer"
      exit 1
    fi
    # Fetch PR metadata (--fail turns an HTTP error into a non-zero exit)
    PR_DATA=$(curl -sLS --fail \
      -H "Accept: application/vnd.github+json" \
      -H "Authorization: Bearer ${GH_TOKEN}" \
      "https://api.github.com/repos/${REPOSITORY}/pulls/${PR_NUMBER}")
    # '// empty' maps a missing/null field (e.g. a deleted fork) to an empty
    # string instead of the literal text "null".
    FORK_REPO=$(jq -r '.head.repo.full_name // empty' <<< "$PR_DATA")
    FORK_REF=$(jq -r '.head.ref // empty' <<< "$PR_DATA")
    if [ -z "$FORK_REPO" ] || [ -z "$FORK_REF" ]; then
      echo "FAIL: Could not determine head repository/branch for PR #${PR_NUMBER}"
      exit 1
    fi
    echo "Warming cache for PR #${PR_NUMBER} from ${FORK_REPO}@${FORK_REF}"
    # Sparse checkout: only pom.xml files (no source code from fork)
    git init .
    git remote add fork "https://github.com/${FORK_REPO}.git"
    git config core.sparseCheckout true
    echo "**/pom.xml" > .git/info/sparse-checkout
    echo "pom.xml" >> .git/info/sparse-checkout
    git fetch --depth=1 fork "${FORK_REF}"
    git checkout FETCH_HEAD
# Regular checkout, used whenever no PR number was supplied.
# NOTE(review): no 'ref' is given, so a manual dispatch checks out the ref the
# workflow was dispatched from — confirm that matches "warm from main".
- name: Checkout main branch
  if: inputs.pr_number == ''
  uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
  with:
    # No later step in this job talks to GitHub through git, so do not leave
    # the workflow token behind in .git/config.
    persist-credentials: false
# Exchanges a GitHub OIDC ID token for a JFrog access token and exports it to
# later steps as JFROG_ACCESS_TOKEN (via GITHUB_ENV).
- name: Get JFrog OIDC token
  shell: bash
  run: |
    set -euo pipefail
    # Get GitHub OIDC ID token for the 'jfrog-github' audience.
    # --fail makes an HTTP error abort the step; 'jq -r' emits the raw string and
    # '// empty' turns a missing field into an empty string rather than "null".
    ID_TOKEN=$(curl -sLS --fail \
      -H "User-Agent: actions/oidc-client" \
      -H "Authorization: Bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
      "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=jfrog-github" | jq -r '.value // empty')
    if [ -z "$ID_TOKEN" ]; then
      echo "FAIL: Could not obtain GitHub OIDC ID token"
      exit 1
    fi
    # Mask only after validation so a placeholder value is never registered as a secret.
    echo "::add-mask::${ID_TOKEN}"
    # Build the request body with jq so the token is JSON-escaped correctly.
    REQUEST_BODY=$(jq -n --arg subject_token "$ID_TOKEN" '{
      grant_type: "urn:ietf:params:oauth:grant-type:token-exchange",
      subject_token_type: "urn:ietf:params:oauth:token-type:id_token",
      subject_token: $subject_token,
      provider_name: "github-actions"
    }')
    # Exchange for JFrog access token
    ACCESS_TOKEN=$(curl -sLS --fail -XPOST -H "Content-Type: application/json" \
      "https://databricks.jfrog.io/access/api/v1/oidc/token" \
      -d "$REQUEST_BODY" | jq -r '.access_token // empty')
    if [ -z "$ACCESS_TOKEN" ]; then
      echo "FAIL: Could not extract JFrog access token"
      exit 1
    fi
    echo "::add-mask::${ACCESS_TOKEN}"
    echo "JFROG_ACCESS_TOKEN=${ACCESS_TOKEN}" >> "$GITHUB_ENV"
    echo "JFrog OIDC token obtained successfully"
# Writes ~/.m2/settings.xml so that every repository request is mirrored to the
# JFrog 'db-maven' repository and authenticated with the JFrog access token.
- name: Configure Maven with JFrog credentials
  shell: bash
  run: |
    set -euo pipefail
    # Fail early if the credential was not exported to this step's environment.
    : "${JFROG_ACCESS_TOKEN:?JFROG_ACCESS_TOKEN is not set}"
    mkdir -p ~/.m2
    # The heredoc delimiter is quoted, so the shell expands nothing here: Maven
    # resolves ${env.JFROG_ACCESS_TOKEN} itself when it reads the settings,
    # which keeps the token value out of the file on disk.
    cat > ~/.m2/settings.xml << 'EOF'
    <settings>
      <mirrors>
        <mirror>
          <id>jfrog-central</id>
          <mirrorOf>*</mirrorOf>
          <url>https://databricks.jfrog.io/artifactory/db-maven/</url>
        </mirror>
      </mirrors>
      <servers>
        <server>
          <id>jfrog-central</id>
          <username>gha-service-account</username>
          <password>${env.JFROG_ACCESS_TOKEN}</password>
        </server>
      </servers>
    </settings>
    EOF
    # Owner-only access to the settings file.
    chmod 600 ~/.m2/settings.xml
# Populates ~/.m2/repository by running the Maven commands the PR workflows run.
- name: Resolve all dependencies via JFrog
  shell: bash
  run: |
    set -euo pipefail
    # Run the EXACT same Maven commands as the PR CI workflows.
    # This is the only reliable way to ensure every plugin, provider,
    # and transitive dependency is resolved and cached. Each command
    # mirrors a real CI step from prCheck.yml, prIntegrationTests.yml,
    # or coverageReport.yml.

    # Runs a command whose failure must not abort the warm-up. The failure is
    # tolerated (the aim is to fill the local repository, not to pass the
    # build) but is surfaced as a warning annotation instead of being
    # silently discarded.
    best_effort() {
      "$@" || echo "::warning::Command failed (exit $?), continuing: $*"
    }

    echo "=== 1/8: spotless:check (formatting-check job) ==="
    best_effort mvn -B --errors spotless:check
    # The install is the only command allowed to fail the step.
    echo "=== 2/8: install all modules (packaging-tests job) ==="
    mvn -B -pl jdbc-core,assembly-uber,assembly-thin clean install -DskipTests -Dmaven.javadoc.skip=true -Dmaven.source.skip=true -Ddependency-check.skip=true
    echo "=== 3/8: Arrow Patch Tests (unit-tests job, JDK 17+) ==="
    best_effort mvn -B -Pjdk21-NioNotOpen -pl jdbc-core test -Dgroups='Jvm17PlusAndArrowToNioReflectionDisabled' -Ddependency-check.skip=true
    echo "=== 4/8: Arrow Allocator Tests (unit-tests job, JDK 17+) ==="
    best_effort mvn -B -Pjdk21-NioNotOpen -pl jdbc-core test -Dgroups='Jvm17PlusAndArrowToNioReflectionDisabled' -Dtest="ArrowBufferAllocatorNettyManagerTest,ArrowBufferAllocatorUnsafeManagerTest,ArrowBufferAllocatorUnknownManagerTest" -DforkCount=1 -DreuseForks=false -Ddependency-check.skip=true
    echo "=== 5/8: Arrow Memory Tests (unit-tests job) ==="
    best_effort mvn -B -Plow-memory -pl jdbc-core test -Dtest='DatabricksArrowPatchMemoryUsageTest' -Ddependency-check.skip=true
    echo "=== 6/8: Unit Tests with jacoco (unit-tests job) ==="
    best_effort mvn -B -pl jdbc-core clean test -Dtest="DatabricksParameterMetaDataTest#testInitialization" -Dgroups='!Jvm17PlusAndArrowToNioReflectionDisabled' jacoco:report -Ddependency-check.skip=true
    echo "=== 7/8: Integration test compile (prIntegrationTests job) ==="
    best_effort mvn -B -pl jdbc-core compile test-compile -Ddependency-check.skip=true
    echo "=== 8/8: Resolve all declared plugins ==="
    best_effort mvn -B -pl jdbc-core dependency:resolve-plugins -Ddependency-check.skip=true
    echo "Dependency resolution complete"
# Rewrites the repository ID recorded next to each cached artifact so the
# cache is usable by builds that do not go through the JFrog mirror.
- name: Normalize _remote.repositories before saving cache
  shell: bash
  run: |
    set -euo pipefail
    # Replace 'jfrog-central' with 'central' in _remote.repositories files.
    # These files track which repo ID each artifact was downloaded from. The
    # cache warmer downloads from 'jfrog-central' (the JFrog mirror), but
    # Maven's offline mode expects artifacts to be associated with 'central'
    # (the default Maven Central repo ID). Without this, offline mode refuses
    # cached artifacts with "has not been downloaded from it before".
    REPO_DIR=~/.m2/repository
    COUNT=$(find "$REPO_DIR" -name '_remote.repositories' -print | wc -l)
    # '+' hands the files to sed in batches instead of spawning one sed per file.
    find "$REPO_DIR" -name '_remote.repositories' -exec sed -i 's/jfrog-central/central/g' {} +
    echo "Normalized ${COUNT} _remote.repositories markers (jfrog-central -> central)"
# Produces the step output 'key' in the form maven-deps-<UTC timestamp>-<pom hash>.
- name: Generate cache key with timestamp
  id: cache-key
  shell: bash
  env:
    # Evaluated by the runner and passed in through the environment rather than
    # being spliced into the script text.
    POM_HASH: ${{ hashFiles('**/pom.xml') }}
  run: |
    set -euo pipefail
    # Include timestamp so each warmer run creates a new cache entry
    # (GitHub Actions caches are immutable — can't overwrite existing keys).
    # The restore step uses prefix 'maven-deps-' to match the latest entry.
    # Old entries auto-expire after 7 days of no access.
    TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
    # ${POM_HASH:-} keeps an empty hash (no pom.xml matched) from tripping 'set -u'.
    echo "key=maven-deps-${TIMESTAMP}-${POM_HASH:-}" >> "$GITHUB_OUTPUT"
# Stores the local Maven repository under the key produced by the 'cache-key' step.
- name: Save Maven dependency cache
  uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
  with:
    key: ${{ steps.cache-key.outputs.key }}
    path: ~/.m2/repository