Warm Maven Dependency Cache #48

Workflow file for this run

.github/workflows/warmMavenCache.yml at e93ca8d

	name: Warm Maven Dependency Cache

	# Pre-downloads every Maven dependency through JFrog Artifactory and stores
	# the result in the GitHub Actions cache. PRs from forks cannot authenticate
	# to JFrog, so they restore this cache and build without credentials.
	#
	# When it runs:
	#   - push to main that touches any pom.xml (refresh after dependency updates)
	#   - once a day on a schedule (prevents 7-day cache eviction)
	#   - manual dispatch, optionally with a PR number so the cache is warmed
	#     from that PR's pom.xml instead of main's

	on:
	  push:
	    branches:
	      - main
	    paths:
	      - '**/pom.xml'
	  schedule:
	    # Daily at 06:00 UTC
	    - cron: '0 6 * * *'
	  workflow_dispatch:
	    inputs:
	      pr_number:
	        description: 'PR number to warm cache for (reads pom.xml from the PR branch). Leave empty to warm from main.'
	        required: false
	        type: string

	# Token scopes granted to every job in this workflow.
	permissions:
	  # Read-only access to repository contents (used by the checkout step).
	  contents: read
	  # Needed to read PR metadata for fork checkout
	  pull-requests: read
	  # Allows the job to request a GitHub OIDC ID token, which is exchanged
	  # for a JFrog access token.
	  id-token: write

	jobs:
	  warm-cache:
	    # The GitHub Actions cache is scoped per operating system: an entry saved
	    # on Linux cannot be restored on Windows, or the other way round. The job
	    # therefore runs once per OS, and one OS failing does not cancel the other.
	    strategy:
	      fail-fast: false
	      matrix:
	        github-runner:
	          - linux-ubuntu-latest
	          - windows-server-latest
	    runs-on:
	      group: databricks-protected-runner-group
	      labels: ${{ matrix.github-runner }}

	    steps:
	# Install the JDK that Maven runs on for the rest of the job.
	# NOTE(review): 'adopt' appears to be a legacy distribution id in
	# actions/setup-java — confirm whether it should move to 'temurin', and if
	# so change it together with the PR workflows that restore this cache.
	- name: Set up JDK
	  uses: actions/setup-java@c1e323688fd81a25caa38c78aa6df2d33d3e20d9 # v4
	  with:
	    distribution: 'adopt'
	    java-version: '21'

	# Windows runners only: turn on git's long-path support system-wide before
	# any repository content is checked out.
	- name: Enable long paths (Windows)
	  if: ${{ runner.os == 'Windows' }}
	  run: git config --system core.longpaths true

	# If a PR number was provided, check out only the pom.xml files from the
	# PR's head repository (no other files from the fork land in the workspace).
	#
	# NOTE(review): a pom.xml is not inert data. The later Maven steps execute
	# whatever plugins it declares, with the JFrog token present in the
	# environment and in settings.xml. Only dispatch this workflow for PRs whose
	# pom.xml changes have been reviewed.
	- name: Checkout PR pom.xml files (sparse)
	  if: inputs.pr_number != ''
	  shell: bash
	  env:
	    # Passed through the environment instead of being expanded inline in
	    # the script, so the values can never be parsed as shell syntax.
	    GH_TOKEN: ${{ github.token }}
	    PR_NUMBER: ${{ inputs.pr_number }}
	  run: |
	    set -euo pipefail

	    # The input is a free-form string; accept only a plain PR number.
	    if ! [[ "$PR_NUMBER" =~ ^[0-9]+$ ]]; then
	      echo "FAIL: pr_number must be a positive integer, got '${PR_NUMBER}'"
	      exit 1
	    fi

	    # Fetch PR metadata (--fail turns HTTP errors such as 404 into a
	    # non-zero exit instead of an error document being piped into jq).
	    PR_DATA=$(curl -sLS --fail \
	      -H "Accept: application/vnd.github+json" \
	      -H "Authorization: Bearer ${GH_TOKEN}" \
	      "https://api.github.com/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}")

	    # '// empty' maps a missing/null field (e.g. a deleted fork) to an
	    # empty string rather than the literal text "null".
	    FORK_REPO=$(jq -r '.head.repo.full_name // empty' <<< "$PR_DATA")
	    FORK_REF=$(jq -r '.head.ref // empty' <<< "$PR_DATA")

	    if [ -z "$FORK_REPO" ] || [ -z "$FORK_REF" ]; then
	      echo "FAIL: Could not determine head repository/ref for PR #${PR_NUMBER}"
	      exit 1
	    fi

	    echo "Warming cache for PR #${PR_NUMBER} from ${FORK_REPO}@${FORK_REF}"

	    # Sparse checkout: only pom.xml files (no source code from fork)
	    git init .
	    git remote add fork "https://github.com/${FORK_REPO}.git"
	    git config core.sparseCheckout true
	    echo "**/pom.xml" > .git/info/sparse-checkout
	    echo "pom.xml" >> .git/info/sparse-checkout
	    git fetch --depth=1 fork "${FORK_REF}"
	    git checkout FETCH_HEAD

	# Default path (no PR number given): regular checkout of the ref that
	# triggered the workflow.
	- name: Checkout main branch
	  if: ${{ inputs.pr_number == '' }}
	  uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4

	# Obtain a GitHub OIDC ID token and exchange it for a JFrog access token.
	# The access token is exported as JFROG_ACCESS_TOKEN for the steps that
	# follow; both tokens are masked in the job log.
	- name: Get JFrog OIDC token
	  shell: bash
	  run: |
	    set -euo pipefail

	    # Get GitHub OIDC ID token. 'jq -r' emits the raw string, and
	    # '// empty' turns a missing/null field into an empty string.
	    ID_TOKEN=$(curl -sLS \
	      -H "User-Agent: actions/oidc-client" \
	      -H "Authorization: Bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
	      "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=jfrog-github" | jq -r '.value // empty')

	    # Validate before masking so an empty value is never registered as a mask.
	    if [ -z "$ID_TOKEN" ]; then
	      echo "FAIL: Could not obtain GitHub OIDC ID token"
	      exit 1
	    fi
	    echo "::add-mask::${ID_TOKEN}"

	    # Build the request body with jq so the token is always JSON-escaped.
	    REQUEST_BODY=$(jq -n --arg subject_token "$ID_TOKEN" '{
	      grant_type: "urn:ietf:params:oauth:grant-type:token-exchange",
	      subject_token_type: "urn:ietf:params:oauth:token-type:id_token",
	      subject_token: $subject_token,
	      provider_name: "github-actions"
	    }')

	    # Exchange for JFrog access token
	    ACCESS_TOKEN=$(curl -sLS -XPOST -H "Content-Type: application/json" \
	      "https://databricks.jfrog.io/access/api/v1/oidc/token" \
	      -d "$REQUEST_BODY" | jq -r '.access_token // empty')

	    if [ -z "$ACCESS_TOKEN" ]; then
	      echo "FAIL: Could not extract JFrog access token"
	      exit 1
	    fi
	    echo "::add-mask::${ACCESS_TOKEN}"

	    echo "JFROG_ACCESS_TOKEN=${ACCESS_TOKEN}" >> "$GITHUB_ENV"
	    echo "JFrog OIDC token obtained successfully"

	# Write ~/.m2/settings.xml so that every repository request is mirrored
	# through JFrog ('mirrorOf *') and authenticated with JFROG_ACCESS_TOKEN.
	- name: Configure Maven with JFrog credentials
	  shell: bash
	  run: |
	    set -euo pipefail

	    # settings.xml holds a credential: create it readable by the owner only
	    # (has no effect on file systems that ignore POSIX modes).
	    umask 077

	    mkdir -p ~/.m2
	    # Unquoted EOF on purpose: the shell must expand ${JFROG_ACCESS_TOKEN}.
	    # The server <id> must equal the mirror <id> for the credentials to apply.
	    cat > ~/.m2/settings.xml << EOF
	    <settings>
	      <mirrors>
	        <mirror>
	          <id>jfrog-central</id>
	          <mirrorOf>*</mirrorOf>
	          <url>https://databricks.jfrog.io/artifactory/db-maven/</url>
	        </mirror>
	      </mirrors>
	      <servers>
	        <server>
	          <id>jfrog-central</id>
	          <username>gha-service-account</username>
	          <password>${JFROG_ACCESS_TOKEN}</password>
	        </server>
	      </servers>
	    </settings>
	    EOF

	# Populate ~/.m2/repository by running the Maven invocations listed below.
	# Only the install (2/8) must succeed; every other invocation is
	# best-effort, because its purpose here is to download artifacts, not to
	# pass. Best-effort failures are surfaced as warning annotations instead of
	# being silently discarded.
	- name: Resolve all dependencies via JFrog
	  shell: bash
	  run: |
	    set -euo pipefail

	    # Run a command; on failure emit a warning annotation and carry on.
	    best_effort() {
	      "$@" || echo "::warning::Best-effort step failed (exit $?): $*"
	    }

	    # Run the EXACT same Maven commands as the PR CI workflows.
	    # This is the only reliable way to ensure every plugin, provider,
	    # and transitive dependency is resolved and cached. Each command
	    # mirrors a real CI step from prCheck.yml, prIntegrationTests.yml,
	    # or coverageReport.yml.

	    echo "=== 1/8: spotless:check (formatting-check job) ==="
	    best_effort mvn -B --errors spotless:check

	    echo "=== 2/8: install all modules (packaging-tests job) ==="
	    mvn -B -pl jdbc-core,assembly-uber,assembly-thin clean install -DskipTests -Dmaven.javadoc.skip=true -Dmaven.source.skip=true -Ddependency-check.skip=true

	    echo "=== 3/8: Arrow Patch Tests (unit-tests job, JDK 17+) ==="
	    best_effort mvn -B -Pjdk21-NioNotOpen -pl jdbc-core test -Dgroups='Jvm17PlusAndArrowToNioReflectionDisabled' -Ddependency-check.skip=true

	    echo "=== 4/8: Arrow Allocator Tests (unit-tests job, JDK 17+) ==="
	    best_effort mvn -B -Pjdk21-NioNotOpen -pl jdbc-core test -Dgroups='Jvm17PlusAndArrowToNioReflectionDisabled' -Dtest="ArrowBufferAllocatorNettyManagerTest,ArrowBufferAllocatorUnsafeManagerTest,ArrowBufferAllocatorUnknownManagerTest" -DforkCount=1 -DreuseForks=false -Ddependency-check.skip=true

	    echo "=== 5/8: Arrow Memory Tests (unit-tests job) ==="
	    best_effort mvn -B -Plow-memory -pl jdbc-core test -Dtest='DatabricksArrowPatchMemoryUsageTest' -Ddependency-check.skip=true

	    echo "=== 6/8: Unit Tests with jacoco (unit-tests job) ==="
	    best_effort mvn -B -pl jdbc-core clean test -Dtest="DatabricksParameterMetaDataTest#testInitialization" -Dgroups='!Jvm17PlusAndArrowToNioReflectionDisabled' jacoco:report -Ddependency-check.skip=true

	    echo "=== 7/8: Integration test compile (prIntegrationTests job) ==="
	    best_effort mvn -B -pl jdbc-core compile test-compile -Ddependency-check.skip=true

	    echo "=== 8/8: Resolve all declared plugins ==="
	    best_effort mvn -B -pl jdbc-core dependency:resolve-plugins -Ddependency-check.skip=true

	    echo "Dependency resolution complete"

	# Rewrite the repository id recorded next to each cached artifact so the
	# cache can be consumed by builds that do not use the JFrog mirror.
	- name: Normalize _remote.repositories before saving cache
	  shell: bash
	  run: |
	    set -euo pipefail

	    # Replace 'jfrog-central' with 'central' in _remote.repositories files.
	    # These files track which repo ID each artifact was downloaded from. The
	    # cache warmer downloads from 'jfrog-central' (the JFrog mirror), but
	    # Maven's offline mode expects artifacts to be associated with 'central'
	    # (the default Maven Central repo ID). Without this, offline mode refuses
	    # cached artifacts with "has not been downloaded from it before".
	    COUNT=$(find ~/.m2/repository -type f -name '_remote.repositories' -print | wc -l)
	    # '{} +' hands many files to each sed invocation instead of starting
	    # one sed process per marker file.
	    find ~/.m2/repository -type f -name '_remote.repositories' -exec sed -i 's/jfrog-central/central/g' {} +
	    echo "Normalized ${COUNT} _remote.repositories markers (jfrog-central -> central)"

	# Compute the cache key for this run and expose it as the step output 'key'.
	# Key format: maven-deps-<UTC timestamp>-<hash of all pom.xml files>.
	- name: Generate cache key with timestamp
	  id: cache-key
	  shell: bash
	  run: |
	    set -euo pipefail

	    # Include timestamp so each warmer run creates a new cache entry
	    # (GitHub Actions caches are immutable — can't overwrite existing keys).
	    # The restore step uses prefix 'maven-deps-' to match the latest entry.
	    # Old entries auto-expire after 7 days of no access.
	    TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
	    # Quoted so an empty hash (no pom.xml matched) is still a valid assignment.
	    POM_HASH="${{ hashFiles('**/pom.xml') }}"
	    # Quote the output file path: it is runner-provided and may contain spaces.
	    echo "key=maven-deps-${TIMESTAMP}-${POM_HASH}" >> "$GITHUB_OUTPUT"

	# Store the local Maven repository under the key produced by the
	# 'cache-key' step. This uses the save-only cache action, so nothing is
	# restored here.
	- name: Save Maven dependency cache
	uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
	with:
	path: ~/.m2/repository
	key: ${{ steps.cache-key.outputs.key }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Warm Maven Dependency Cache #48

Workflow file

Warm Maven Dependency Cache #48

Uh oh!

Workflow file for this run