Skip to content

Commit e1b0d6e

Browse files
Merge pull request #3788 from AI-Hypercomputer:docker_package_build
PiperOrigin-RevId: 910323482
2 parents f6fca0f + 52d8035 commit e1b0d6e

7 files changed

Lines changed: 123 additions & 125 deletions

File tree

Lines changed: 29 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -42,27 +42,16 @@ permissions:
4242
contents: read
4343

4444
jobs:
45-
setup:
46-
runs-on: ubuntu-latest
47-
outputs:
48-
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
49-
image_date: ${{ steps.vars.outputs.image_date }}
50-
steps:
51-
- name: Checkout MaxText
52-
uses: actions/checkout@v5
53-
54-
- name: Get metadata
55-
id: vars
56-
run: |
57-
# MaxText SHA
58-
echo "maxtext_sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT
59-
60-
# Image date
61-
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
45+
# Calls the reusable build_package.yml workflow with a TPU v4-8 target on the
# linux-x86-n2-16-buildkit runner. The build-and-push job declares this job in
# `needs` and reads its `maxtext_sha` output.
# NOTE(review): presumably this builds and uploads the MaxText wheel that the
# image build later downloads — confirm against build_package.yml.
build_and_upload_maxtext_package:
46+
uses: ./.github/workflows/build_package.yml
47+
with:
48+
device_type: tpu
49+
device_name: v4-8
50+
cloud_runner: linux-x86-n2-16-buildkit
6251

6352
build-and-push:
6453
name: ${{ matrix.image_name }}
65-
needs: setup
54+
needs: build_and_upload_maxtext_package
6655
strategy:
6756
fail-fast: false
6857
matrix:
@@ -71,72 +60,49 @@ jobs:
7160
build_mode: stable
7261
workflow: pre-training
7362
image_name: maxtext_jax_stable
74-
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
63+
dockerfile: maxtext_tpu_dependencies.Dockerfile
7564
- device: tpu
7665
build_mode: nightly
7766
workflow: pre-training
7867
image_name: maxtext_jax_nightly
79-
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
68+
dockerfile: maxtext_tpu_dependencies.Dockerfile
8069
- device: tpu
8170
build_mode: nightly
8271
workflow: post-training
8372
image_name: maxtext_post_training_nightly
84-
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
73+
dockerfile: maxtext_tpu_dependencies.Dockerfile
8574
- device: gpu
8675
build_mode: stable
8776
workflow: pre-training
8877
image_name: maxtext_gpu_jax_stable
89-
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
78+
dockerfile: maxtext_gpu_dependencies.Dockerfile
9079
- device: gpu
9180
build_mode: nightly
9281
workflow: pre-training
9382
image_name: maxtext_gpu_jax_nightly
94-
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
83+
dockerfile: maxtext_gpu_dependencies.Dockerfile
9584
uses: ./.github/workflows/build_and_push_docker_image.yml
9685
with:
97-
image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
86+
image_name: ${{ inputs.image_suffix != '' && format('{0}_{1}', matrix.image_name, inputs.image_suffix) || matrix.image_name }}
9887
device: ${{ matrix.device }}
9988
build_mode: ${{ matrix.build_mode }}
10089
workflow: ${{ matrix.workflow }}
10190
dockerfile: ${{ matrix.dockerfile }}
102-
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
103-
image_date: ${{ needs.setup.outputs.image_date }}
91+
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
92+
include_test_assets: true
10493
secrets:
10594
HF_TOKEN: ${{ secrets.HF_TOKEN }}
10695

107-
promote:
108-
name: promote-${{ matrix.image_name }}
109-
needs: build-and-push
110-
strategy:
111-
fail-fast: false
112-
matrix:
113-
include:
114-
- device: tpu
115-
build_mode: stable
116-
workflow: pre-training
117-
image_name: maxtext_jax_stable
118-
- device: tpu
119-
build_mode: nightly
120-
workflow: pre-training
121-
image_name: maxtext_jax_nightly
122-
- device: tpu
123-
build_mode: nightly
124-
workflow: post-training
125-
image_name: maxtext_post_training_nightly
126-
- device: gpu
127-
build_mode: stable
128-
workflow: pre-training
129-
image_name: maxtext_gpu_jax_stable
130-
- device: gpu
131-
build_mode: nightly
132-
workflow: pre-training
133-
image_name: maxtext_gpu_jax_nightly
134-
135-
uses: ./.github/workflows/promote_docker_image.yml
136-
with:
137-
image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
138-
image_tag: ${{ github.run_id }}
139-
device: ${{ matrix.device }}
140-
workflow: ${{ matrix.workflow }}
141-
secrets:
142-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
96+
# Reports a failed image build. The job depends on build-and-push and runs only
# when a job it depends on has failed AND the image_suffix input is empty.
notify_failure:
97+
name: Notify failed build
98+
needs: [build-and-push]
99+
if: ${{ failure() && inputs.image_suffix == '' }}
100+
runs-on: ubuntu-latest
101+
# Grants this job write access to issues, on top of the workflow-level permissions.
permissions:
102+
issues: write
103+
steps:
104+
- name: Create issue on failure
105+
# Third-party action pinned to a full commit SHA rather than a movable tag.
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b
106+
with:
107+
github-token: ${{ secrets.GITHUB_TOKEN }}
108+
title-template: "MaxText Docker Image Build Failure"
Lines changed: 79 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2025 Google LLC
1+
# Copyright 2023-2026 Google LLC
22

33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -34,9 +34,6 @@ on:
3434
maxtext_sha:
3535
required: true
3636
type: string
37-
image_date:
38-
required: false
39-
type: string
4037
workflow:
4138
required: false
4239
type: string
@@ -45,6 +42,10 @@ on:
4542
required: false
4643
type: string
4744
default: ''
45+
include_test_assets:
46+
required: false
47+
type: boolean
48+
default: false
4849
secrets:
4950
HF_TOKEN:
5051
required: true
@@ -53,36 +54,42 @@ permissions:
5354
contents: read
5455

5556
jobs:
56-
build_and_push:
57-
runs-on: linux-x86-n2-16-buildkit
58-
container: google/cloud-sdk:524.0.0
59-
if: >
60-
github.event_name == 'release' ||
61-
github.event_name == 'schedule' ||
62-
github.event_name == 'pull_request' ||
63-
github.event_name == 'workflow_dispatch' && (
64-
github.event.inputs.target_device == 'all' ||
65-
github.event.inputs.target_device == 'tpu' ||
66-
github.event.inputs.target_device == 'gpu'
67-
)
57+
pre_build_check:
58+
runs-on: ubuntu-latest
59+
outputs:
60+
should_run: ${{ steps.check.outputs.should_run }}
6861
steps:
6962
- name: Check if build should run
7063
id: check
7164
shell: bash
7265
run: |
73-
if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${GITHUB_EVENT_INPUTS_TARGET_DEVICE}" != "all" && "${GITHUB_EVENT_INPUTS_TARGET_DEVICE}" != "${INPUTS_DEVICE}" ]]; then
74-
echo "should_run=false" >> $GITHUB_OUTPUT
75-
echo "Skipping ${INPUTS_IMAGE_NAME} build for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
76-
else
66+
EVENT_NAME="${{ github.event_name }}"
67+
TARGET_DEVICE="${{ github.event.inputs.target_device }}"
68+
INPUT_DEVICE="${{ inputs.device }}"
69+
70+
SHOULD_RUN="false"
71+
if [[ "$EVENT_NAME" == "release" || "$EVENT_NAME" == "schedule" || "$EVENT_NAME" == "pull_request" ]]; then
72+
SHOULD_RUN="true"
73+
elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
74+
if [[ "$TARGET_DEVICE" == "all" || "$TARGET_DEVICE" == "$INPUT_DEVICE" ]]; then
75+
SHOULD_RUN="true"
76+
fi
77+
fi
78+
79+
if [[ "$SHOULD_RUN" == "true" ]]; then
7780
echo "should_run=true" >> $GITHUB_OUTPUT
78-
echo "Building ${INPUTS_IMAGE_NAME} for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
81+
echo "Building ${{ inputs.image_name }} for device: ${{ inputs.device }} in ${{ inputs.build_mode }} mode."
82+
else
83+
echo "should_run=false" >> $GITHUB_OUTPUT
84+
echo "Skipping ${{ inputs.image_name }} build for device: ${{ inputs.device }} in ${{ inputs.build_mode }} mode."
7985
fi
80-
env:
81-
GITHUB_EVENT_INPUTS_TARGET_DEVICE: ${{ github.event.inputs.target_device }}
82-
INPUTS_DEVICE: ${{ inputs.device }}
83-
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
84-
INPUTS_BUILD_MODE: ${{ inputs.build_mode }}
8586
87+
build_and_push:
88+
needs: pre_build_check
89+
runs-on: linux-x86-n2-16-buildkit
90+
container: google/cloud-sdk:524.0.0
91+
if: needs.pre_build_check.outputs.should_run == 'true'
92+
steps:
8693
- name: Matrix Debugger
8794
run: |
8895
echo "device: ${{ inputs.device }}"
@@ -93,50 +100,68 @@ jobs:
93100
94101
- name: Checkout MaxText
95102
uses: actions/checkout@v5
96-
if: steps.check.outputs.should_run == 'true'
97103
with:
98-
# This ensures that every job clones the exact same commit as "setup" job
99104
ref: ${{ inputs.maxtext_sha }}
100105

101106
- name: Mark git repositories as safe
102107
run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
103-
if: steps.check.outputs.should_run == 'true'
104108

105109
- name: Configure Docker
106110
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
107-
if: steps.check.outputs.should_run == 'true'
108111

109112
- name: Set up Docker BuildX
110113
uses: docker/setup-buildx-action@v3.11.1
111-
if: steps.check.outputs.should_run == 'true'
112114
with:
113115
driver: remote
114116
endpoint: tcp://localhost:1234
115117

118+
- name: Download MaxText wheel
119+
uses: actions/download-artifact@v4
120+
with:
121+
name: maxtext-wheel
122+
123+
- name: Install uv and set Python version
124+
uses: astral-sh/setup-uv@v7
125+
with:
126+
python-version: '3.12'
127+
enable-cache: true
128+
129+
- name: Install MaxText wheel
  shell: bash
  # Creates a virtualenv in .venv and installs the MaxText wheel found in the
  # working directory into it, with the `runner` extra and the lowest
  # compatible dependency versions.
  run: |
    uv venv --seed
    source .venv/bin/activate

    # Resolve the wheel with a glob array instead of parsing `ls` output:
    # nullglob makes "no match" an empty array rather than the literal pattern.
    shopt -s nullglob
    wheels=(maxtext-*-py3-none-any.whl)
    shopt -u nullglob

    # Fail loudly instead of handing uv a bare "[runner]" (no wheel) or
    # several wheels at once.
    if (( ${#wheels[@]} != 1 )); then
      echo "::error::Expected exactly one maxtext-*-py3-none-any.whl in ${PWD}, found ${#wheels[@]}: ${wheels[*]}"
      exit 1
    fi

    # Quoted so the "[runner]" suffix is not treated as a glob character class.
    uv pip install "${wheels[0]}[runner]" --resolution=lowest
136+
137+
- name: Copy tests assets to package directory
138+
if: inputs.include_test_assets == true
139+
shell: bash
140+
run: |
141+
source .venv/bin/activate
142+
cp -r ${PWD}/tests .venv/lib/python3.12/site-packages/
143+
cp ${PWD}/pytest.ini .venv/lib/python3.12/site-packages/
144+
116145
- name: Build and push Docker image
117146
uses: docker/build-push-action@v6
118-
if: steps.check.outputs.should_run == 'true'
119147
with:
120148
push: true
121149
context: .
122-
file: ${{ inputs.dockerfile }}
123-
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ github.run_id }}
150+
file: .venv/lib/python3.12/site-packages/dependencies/dockerfiles/${{ inputs.dockerfile }}
151+
tags: gcr.io/${{ vars.PROJECT_NAME }}/${{ inputs.image_name }}:${{ github.run_id }}
124152
cache-from: type=gha
125153
outputs: type=image,compression=zstd,force-compression=true
126154
build-args: |
127155
DEVICE=${{ inputs.device }}
128156
MODE=${{ inputs.build_mode }}
129157
WORKFLOW=${{ inputs.workflow }}
130-
PACKAGE_DIR=./src
131-
JAX_VERSION=NONE
132-
LIBTPU_VERSION=NONE
133-
INCLUDE_TEST_ASSETS=true
158+
PACKAGE_DIR=.venv/lib/python3.12/site-packages
159+
INCLUDE_TEST_ASSETS=${{ inputs.include_test_assets }}
134160
135161
- name: Add tags to Docker image
136-
if: steps.check.outputs.should_run == 'true'
137162
shell: bash
138163
run: |
139-
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
164+
SOURCE_IMAGE="gcr.io/${{ vars.PROJECT_NAME }}/${INPUTS_IMAGE_NAME}"
140165
TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
141166
142167
if [[ $INPUTS_VERSION_NAME ]]; then
@@ -146,16 +171,26 @@ jobs:
146171
echo "Tagging docker images corresponding to nightly release..."
147172
148173
# Add date tag
149-
gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
174+
IMAGE_DATE="$(date +%Y-%m-%d)"
175+
gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:$IMAGE_DATE" --quiet
150176
151177
# Convert date to YYYYMMDD format
152-
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
178+
clean_date=$(echo "$IMAGE_DATE" | sed 's/[-:]//g' | cut -c1-8)
153179
154180
# Add MaxText tag
181+
MAXTEXT_SHA=$(git rev-parse --short HEAD)
155182
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${MAXTEXT_SHA}_${clean_date}" --quiet
156183
fi
157184
env:
158185
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
159-
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
160186
INPUTS_VERSION_NAME: ${{ inputs.version_name }}
161-
MAXTEXT_SHA: ${{ inputs.maxtext_sha }}
187+
188+
# Calls the reusable promote_docker_image.yml workflow for the image built by
# this run. It waits for pre_build_check and build_and_push, and runs only when
# pre_build_check reported should_run == 'true' and include_test_assets is true.
# The image is identified by the run id, the same value build_and_push uses as
# the image tag when pushing.
promote_image:
189+
needs: [pre_build_check, build_and_push]
190+
if: needs.pre_build_check.outputs.should_run == 'true' && inputs.include_test_assets == true
191+
uses: ./.github/workflows/promote_docker_image.yml
192+
with:
193+
image_name: ${{ inputs.image_name }}
194+
image_tag: ${{ github.run_id }}
195+
device: ${{ inputs.device }}
196+
workflow: ${{ inputs.workflow }}

.github/workflows/promote_docker_image.yml

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,6 @@ on:
3131
workflow:
3232
required: true
3333
type: string
34-
secrets:
35-
HF_TOKEN:
36-
required: false
37-
3834
permissions:
3935
contents: read
4036

@@ -49,7 +45,7 @@ jobs:
4945
id: check
5046
shell: bash
5147
run: |
52-
if gcloud container images describe "gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_tag }}" >/dev/null 2>&1; then
48+
if gcloud container images describe "gcr.io/${{ vars.PROJECT_NAME }}/${{ inputs.image_name }}:${{ inputs.image_tag }}" >/dev/null 2>&1; then
5349
echo "exists=true" >> $GITHUB_OUTPUT
5450
else
5551
echo "exists=false" >> $GITHUB_OUTPUT
@@ -87,9 +83,6 @@ jobs:
8783
- name: Add tags to Docker image
  shell: bash
  # Adds the `latest` tag to the image identified by the `image_name` and
  # `image_tag` inputs. Expression values are passed through `env` so bash
  # receives them as data and never parses them as part of the script.
  env:
    PROJECT_NAME: ${{ vars.PROJECT_NAME }}
    INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
    INPUTS_IMAGE_TAG: ${{ inputs.image_tag }}
  run: |
    # Add Latest Tag
    SOURCE_IMAGE="gcr.io/${PROJECT_NAME}/${INPUTS_IMAGE_NAME}"
    gcloud container images add-tag "${SOURCE_IMAGE}:${INPUTS_IMAGE_TAG}" "${SOURCE_IMAGE}:latest" --quiet

0 commit comments

Comments
 (0)