Skip to content

Commit ee5f011

Browse files
committed
Build MaxText docker image from PyPI
1 parent ff916b8 commit ee5f011

3 files changed

Lines changed: 90 additions & 109 deletions

File tree

Lines changed: 24 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -42,27 +42,16 @@ permissions:
4242
contents: read
4343

4444
jobs:
45-
setup:
46-
runs-on: ubuntu-latest
47-
outputs:
48-
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
49-
image_date: ${{ steps.vars.outputs.image_date }}
50-
steps:
51-
- name: Checkout MaxText
52-
uses: actions/checkout@v5
53-
54-
- name: Get metadata
55-
id: vars
56-
run: |
57-
# MaxText SHA
58-
echo "maxtext_sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT
59-
60-
# Image date
61-
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
45+
build_and_upload_maxtext_package:
46+
uses: ./.github/workflows/build_package.yml
47+
with:
48+
device_type: tpu
49+
device_name: v4-8
50+
cloud_runner: linux-x86-n2-16-buildkit
6251

6352
build-and-push:
6453
name: ${{ matrix.image_name }}
65-
needs: setup
54+
needs: build_and_upload_maxtext_package
6655
strategy:
6756
fail-fast: false
6857
matrix:
@@ -71,72 +60,43 @@ jobs:
7160
build_mode: stable
7261
workflow: pre-training
7362
image_name: maxtext_jax_stable
74-
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
7563
- device: tpu
7664
build_mode: nightly
7765
workflow: pre-training
7866
image_name: maxtext_jax_nightly
79-
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
8067
- device: tpu
8168
build_mode: nightly
8269
workflow: post-training
8370
image_name: maxtext_post_training_nightly
84-
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
8571
- device: gpu
8672
build_mode: stable
8773
workflow: pre-training
8874
image_name: maxtext_gpu_jax_stable
89-
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
9075
- device: gpu
9176
build_mode: nightly
9277
workflow: pre-training
9378
image_name: maxtext_gpu_jax_nightly
94-
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
9579
uses: ./.github/workflows/build_and_push_docker_image.yml
9680
with:
97-
image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
81+
image_name: ${{ inputs.image_suffix != '' && format('{0}_{1}', matrix.image_name, inputs.image_suffix) || matrix.image_name }}
9882
device: ${{ matrix.device }}
9983
build_mode: ${{ matrix.build_mode }}
10084
workflow: ${{ matrix.workflow }}
101-
dockerfile: ${{ matrix.dockerfile }}
102-
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
103-
image_date: ${{ needs.setup.outputs.image_date }}
85+
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
86+
include_test_assets: true
10487
secrets:
10588
HF_TOKEN: ${{ secrets.HF_TOKEN }}
10689

107-
promote:
108-
name: promote-${{ matrix.image_name }}
109-
needs: build-and-push
110-
strategy:
111-
fail-fast: false
112-
matrix:
113-
include:
114-
- device: tpu
115-
build_mode: stable
116-
workflow: pre-training
117-
image_name: maxtext_jax_stable
118-
- device: tpu
119-
build_mode: nightly
120-
workflow: pre-training
121-
image_name: maxtext_jax_nightly
122-
- device: tpu
123-
build_mode: nightly
124-
workflow: post-training
125-
image_name: maxtext_post_training_nightly
126-
- device: gpu
127-
build_mode: stable
128-
workflow: pre-training
129-
image_name: maxtext_gpu_jax_stable
130-
- device: gpu
131-
build_mode: nightly
132-
workflow: pre-training
133-
image_name: maxtext_gpu_jax_nightly
134-
135-
uses: ./.github/workflows/promote_docker_image.yml
136-
with:
137-
image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
138-
image_tag: ${{ github.run_id }}
139-
device: ${{ matrix.device }}
140-
workflow: ${{ matrix.workflow }}
141-
secrets:
142-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
90+
notify_failure:
91+
name: Notify failed build
92+
needs: [build-and-push]
93+
if: ${{ failure() && inputs.image_suffix == '' }}
94+
runs-on: ubuntu-latest
95+
permissions:
96+
issues: write
97+
steps:
98+
- name: Create issue on failure
99+
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b
100+
with:
101+
github-token: ${{ secrets.GITHUB_TOKEN }}
102+
title-template: "MaxText Docker Image Build Failure"

.github/workflows/build_and_push_docker_image.yml

Lines changed: 65 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2025 Google LLC
1+
# Copyright 2023-2026 Google LLC
22

33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -28,15 +28,9 @@ on:
2828
build_mode:
2929
required: true
3030
type: string
31-
dockerfile:
32-
required: true
33-
type: string
3431
maxtext_sha:
3532
required: true
3633
type: string
37-
image_date:
38-
required: false
39-
type: string
4034
workflow:
4135
required: false
4236
type: string
@@ -45,6 +39,10 @@ on:
4539
required: false
4640
type: string
4741
default: ''
42+
include_test_assets:
43+
required: false
44+
type: boolean
45+
default: false
4846
secrets:
4947
HF_TOKEN:
5048
required: true
@@ -54,7 +52,7 @@ permissions:
5452

5553
jobs:
5654
build_and_push:
57-
runs-on: linux-x86-n2-16-buildkit
55+
runs-on: ubuntu-latest
5856
container: google/cloud-sdk:524.0.0
5957
if: >
6058
github.event_name == 'release' ||
@@ -89,54 +87,69 @@ jobs:
8987
echo "workflow: ${{ inputs.workflow }}"
9088
echo "build_mode: ${{ inputs.build_mode }}"
9189
echo "image_name: ${{ inputs.image_name }}"
92-
echo "dockerfile: ${{ inputs.dockerfile }}"
93-
94-
- name: Checkout MaxText
95-
uses: actions/checkout@v5
96-
if: steps.check.outputs.should_run == 'true'
97-
with:
98-
# This ensures that every job clones the exact same commit as "setup" job
99-
ref: ${{ inputs.maxtext_sha }}
100-
101-
- name: Mark git repositories as safe
102-
run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
103-
if: steps.check.outputs.should_run == 'true'
10490
10591
- name: Configure Docker
10692
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
10793
if: steps.check.outputs.should_run == 'true'
10894

95+
- name: Configure gcloud project
96+
run: gcloud config set project tpu-prod-env-multipod
97+
if: steps.check.outputs.should_run == 'true'
98+
10999
- name: Set up Docker BuildX
110100
uses: docker/setup-buildx-action@v3.11.1
111101
if: steps.check.outputs.should_run == 'true'
102+
103+
- name: Download MaxText wheel
104+
uses: actions/download-artifact@v4
105+
if: steps.check.outputs.should_run == 'true'
112106
with:
113-
driver: remote
114-
endpoint: tcp://localhost:1234
107+
name: maxtext-wheel
115108

116-
- name: Build and push Docker image
117-
uses: docker/build-push-action@v6
109+
- name: Install uv and set the Python version
110+
uses: astral-sh/setup-uv@v7
118111
if: steps.check.outputs.should_run == 'true'
119112
with:
120-
push: true
121-
context: .
122-
file: ${{ inputs.dockerfile }}
123-
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ github.run_id }}
124-
cache-from: type=gha
125-
outputs: type=image,compression=zstd,force-compression=true
126-
build-args: |
127-
DEVICE=${{ inputs.device }}
128-
MODE=${{ inputs.build_mode }}
113+
python-version: '3.12'
114+
enable-cache: true
115+
116+
- name: Install MaxText wheel
117+
if: steps.check.outputs.should_run == 'true'
118+
shell: bash
119+
run: |
120+
uv venv --seed
121+
source .venv/bin/activate
122+
maxtext_wheel=$(ls maxtext-*-py3-none-any.whl 2>/dev/null)
123+
uv pip install ${maxtext_wheel}[runner] --resolution=lowest
124+
125+
- name: Build and push Docker image
126+
if: steps.check.outputs.should_run == 'true'
127+
shell: bash
128+
env:
129+
IMAGE_TAG: "gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ github.run_id }}"
130+
INCLUDE_TEST_ASSETS: ${{ inputs.include_test_assets }}
131+
run: |
132+
source .venv/bin/activate
133+
134+
# Add .venv to .dockerignore so it is excluded from the Docker build context
135+
echo ".venv" >> .dockerignore
136+
137+
echo "Building MaxText Docker image..."
138+
build_maxtext_docker_image \
139+
DEVICE=${{ inputs.device }} \
140+
MODE=${{ inputs.build_mode }} \
129141
WORKFLOW=${{ inputs.workflow }}
130-
PACKAGE_DIR=./src
131-
JAX_VERSION=NONE
132-
LIBTPU_VERSION=NONE
133-
INCLUDE_TEST_ASSETS=true
142+
143+
echo "Pushing MaxText Docker image..."
144+
upload_maxtext_docker_image \
145+
CLOUD_IMAGE_NAME=${{ inputs.image_name }}
134146
135147
- name: Add tags to Docker image
136148
if: steps.check.outputs.should_run == 'true'
137149
shell: bash
138150
run: |
139151
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
152+
140153
TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
141154
142155
if [[ $INPUTS_VERSION_NAME ]]; then
@@ -146,16 +159,28 @@ jobs:
146159
echo "Tagging docker images corresponding to nightly release..."
147160
148161
# Add date tag
149-
gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
162+
IMAGE_DATE="$(date +%Y-%m-%d)"
163+
gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:$IMAGE_DATE" --quiet
150164
151165
# Convert date to YYYYMMDD format
152-
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
166+
clean_date=$(echo "$IMAGE_DATE" | sed 's/[-:]//g' | cut -c1-8)
153167
154168
# Add MaxText tag
155169
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${MAXTEXT_SHA}_${clean_date}" --quiet
156170
fi
157171
env:
158172
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
159-
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
160173
INPUTS_VERSION_NAME: ${{ inputs.version_name }}
161174
MAXTEXT_SHA: ${{ inputs.maxtext_sha }}
175+
176+
promote_image:
177+
needs: build_and_push
178+
runs-on: ubuntu-latest
179+
steps:
180+
- name: Promote image
181+
uses: ./.github/workflows/promote_docker_image.yml
182+
with:
183+
image_name: ${{ inputs.image_name }}
184+
image_tag: ${{ github.run_id }}
185+
device: ${{ inputs.device }}
186+
workflow: ${{ inputs.workflow }}

.github/workflows/pypi_release.yml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2025 Google LLC
1+
# Copyright 2023-2026 Google LLC
22

33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -103,24 +103,20 @@ jobs:
103103
build_mode: stable
104104
image_name: maxtext_jax_stable
105105
workflow: pre-training
106-
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
107106
- device: gpu
108107
build_mode: stable
109108
image_name: maxtext_gpu_jax_stable
110109
workflow: pre-training
111-
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
112110
- device: tpu
113111
build_mode: stable
114112
image_name: maxtext_post_training_stable
115113
workflow: post-training
116-
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
117114
uses: ./.github/workflows/build_and_push_docker_image.yml
118115
with:
119116
image_name: ${{ matrix.image_name }}
120117
device: ${{ matrix.device }}
121118
build_mode: ${{ matrix.build_mode }}
122119
workflow: ${{ matrix.workflow }}
123-
dockerfile: ${{ matrix.dockerfile }}
124120
maxtext_sha: ${{ github.sha }}
125121
version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
126122
secrets:

0 commit comments

Comments
 (0)