Skip to content

Commit 0143f05

Browse files
committed
Merge branch 'main' of github.com:AI-Hypercomputer/maxtext into shuningjin-ckpt-structure
2 parents 5243e93 + cd5cd5e commit 0143f05

429 files changed

Lines changed: 11460 additions & 5188 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Model bring-up
44
src/MaxText/assets @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @gagika @shralex @richjames0 @NicoGrande
55
src/MaxText/configs/models @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @gagika @shralex @richjames0 @NicoGrande @suexu1025 @jesselu-google
6-
src/MaxText/utils/ckpt_conversion @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @hengtaoguo @gagika @shralex @richjames0 @NicoGrande
6+
src/maxtext/checkpoint_conversion @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @hengtaoguo @gagika @shralex @richjames0 @NicoGrande
77
src/MaxText/layers @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @gagika @shralex @richjames0 @NicoGrande @suexu1025 @jesselu-google
88

99
# Features

.github/workflows/UploadDockerImages.yml

Lines changed: 5 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# This workflow builds and pushes MaxText images for both TPU and GPU devices.
1616
# It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.
1717

18-
name: Build and Test Images
18+
name: Build Images
1919

2020
on:
2121
schedule:
@@ -32,11 +32,6 @@ on:
3232
- all
3333
- tpu
3434
- gpu
35-
for_dev_test:
36-
description: 'For development test purpose. All images will be added a -test suffix'
37-
required: false
38-
type: boolean
39-
default: false
4035

4136
permissions:
4237
contents: read
@@ -47,7 +42,6 @@ jobs:
4742
outputs:
4843
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
4944
image_date: ${{ steps.vars.outputs.image_date }}
50-
image_suffix: ${{ steps.vars.outputs.image_suffix }}
5145
steps:
5246
- name: Checkout MaxText
5347
uses: actions/checkout@v5
@@ -61,13 +55,6 @@ jobs:
6155
# Image date
6256
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
6357
64-
# If for_dev_test is true, set suffix to -test, otherwise empty
65-
if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then
66-
echo "image_suffix=-test" >> $GITHUB_OUTPUT
67-
else
68-
echo "image_suffix=" >> $GITHUB_OUTPUT
69-
fi
70-
7158
tpu-pre-training:
7259
name: ${{ matrix.image_name }}
7360
needs: setup
@@ -85,7 +72,7 @@ jobs:
8572
dockerfile: ./dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
8673
uses: ./.github/workflows/build_and_push_docker_image.yml
8774
with:
88-
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
75+
image_name: ${{ matrix.image_name }}
8976
device: ${{ matrix.device }}
9077
build_mode: ${{ matrix.build_mode }}
9178
dockerfile: ${{ matrix.dockerfile }}
@@ -109,13 +96,14 @@ jobs:
10996
dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
11097
uses: ./.github/workflows/build_and_push_docker_image.yml
11198
with:
112-
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
99+
image_name: ${{ matrix.image_name }}
113100
device: ${{ matrix.device }}
114101
build_mode: ${{ matrix.build_mode }}
115102
dockerfile: ${{ matrix.dockerfile }}
116103
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
117104
image_date: ${{ needs.setup.outputs.image_date }}
118105
base_image: gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:${{ needs.setup.outputs.image_date }}
106+
is_post_training: true
119107

120108
gpu-pre-training:
121109
name: ${{ matrix.image_name }}
@@ -134,48 +122,9 @@ jobs:
134122
dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
135123
uses: ./.github/workflows/build_and_push_docker_image.yml
136124
with:
137-
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
125+
image_name: ${{ matrix.image_name }}
138126
device: ${{ matrix.device }}
139127
build_mode: ${{ matrix.build_mode }}
140128
dockerfile: ${{ matrix.dockerfile }}
141129
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
142130
image_date: ${{ needs.setup.outputs.image_date }}
143-
144-
# TEST JOBS
145-
pre-training-tpu-tests:
146-
needs: [setup, tpu-pre-training]
147-
strategy:
148-
fail-fast: false
149-
matrix:
150-
image: [maxtext_jax_stable, maxtext_jax_nightly]
151-
uses: ./.github/workflows/test_and_tag_docker_image.yml
152-
with:
153-
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
154-
image_date: ${{ needs.setup.outputs.image_date }}
155-
test_mode: tpu-pre-training
156-
157-
post-training-tpu-tests:
158-
needs: [setup, tpu-post-training]
159-
strategy:
160-
fail-fast: false
161-
matrix:
162-
image: [maxtext_post_training_stable, maxtext_post_training_nightly]
163-
uses: ./.github/workflows/test_and_tag_docker_image.yml
164-
with:
165-
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
166-
image_date: ${{ needs.setup.outputs.image_date }}
167-
test_mode: tpu-post-training
168-
169-
170-
pre-training-gpu-tests:
171-
needs: [setup, gpu-pre-training]
172-
strategy:
173-
fail-fast: false
174-
matrix:
175-
image: [maxtext_gpu_jax_stable, maxtext_gpu_jax_nightly]
176-
uses: ./.github/workflows/test_and_tag_docker_image.yml
177-
with:
178-
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
179-
image_date: ${{ needs.setup.outputs.image_date }}
180-
test_mode: gpu-pre-training
181-

.github/workflows/build_and_push_docker_image.yml

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ on:
4141
required: false
4242
type: string
4343
default: ''
44+
is_post_training:
45+
required: false
46+
type: boolean
47+
default: false
4448

4549
permissions:
4650
contents: read
@@ -62,13 +66,18 @@ jobs:
6266
id: check
6367
shell: bash
6468
run: |
65-
if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${{ github.event.inputs.target_device }}" != "all" && "${{ github.event.inputs.target_device }}" != "${{ inputs.device }}" ]]; then
69+
if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${GITHUB_EVENT_INPUTS_TARGET_DEVICE}" != "all" && "${GITHUB_EVENT_INPUTS_TARGET_DEVICE}" != "${INPUTS_DEVICE}" ]]; then
6670
echo "should_run=false" >> $GITHUB_OUTPUT
67-
echo "Skipping ${{ inputs.image_name }} build for device: ${{ inputs.device }} in ${{ inputs.build_mode }} mode."
71+
echo "Skipping ${INPUTS_IMAGE_NAME} build for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
6872
else
6973
echo "should_run=true" >> $GITHUB_OUTPUT
70-
echo "Building ${{ inputs.image_name }} for device: ${{ inputs.device }} in ${{ inputs.build_mode }} mode."
74+
echo "Building ${INPUTS_IMAGE_NAME} for device: ${INPUTS_DEVICE} in ${INPUTS_BUILD_MODE} mode."
7175
fi
76+
env:
77+
GITHUB_EVENT_INPUTS_TARGET_DEVICE: ${{ github.event.inputs.target_device }}
78+
INPUTS_DEVICE: ${{ inputs.device }}
79+
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
80+
INPUTS_BUILD_MODE: ${{ inputs.build_mode }}
7281

7382
- name: Checkout MaxText
7483
uses: actions/checkout@v5
@@ -78,9 +87,7 @@ jobs:
7887
ref: ${{ inputs.maxtext_sha }}
7988

8089
- name: Checkout post-training dependencies
81-
if: |
82-
steps.check.outputs.should_run == 'true' &&
83-
contains(inputs.image_name, 'post_training_nightly')
90+
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
8491
run: |
8592
git clone https://github.com/google/tunix.git ./tunix
8693
git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -108,7 +115,8 @@ jobs:
108115
push: true
109116
context: .
110117
file: ${{ inputs.dockerfile }}
111-
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
118+
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
119+
cache-from: type=gha
112120
outputs: type=image,compression=zstd,force-compression=true
113121
build-args: |
114122
DEVICE=${{ inputs.device }}
@@ -122,21 +130,28 @@ jobs:
122130
if: steps.check.outputs.should_run == 'true'
123131
shell: bash
124132
run: |
125-
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}"
126-
TEMP_IMG="$SOURCE_IMAGE:${{ inputs.image_date }}-build-${{ github.run_id }}"
133+
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
134+
135+
# Add date tag
136+
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
127137
128138
# Convert date to YYYYMMDD format
129-
clean_date=$(echo "${{ inputs.image_date }}" | sed 's/[-:]//g' | cut -c1-8)
139+
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
130140
131141
# Add MaxText tag
132142
maxtext_hash=$(git rev-parse --short HEAD)
133-
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
134-
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:${{ inputs.image_date }}" --quiet
143+
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
144+
135145
136146
# Add post-training dependencies tags
137-
for dir in tunix vllm tpu-inference; do
138-
if [ -d "./$dir" ]; then
139-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
140-
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
141-
fi
142-
done
147+
if [ "${{ inputs.is_post_training }}" == "true" ]; then
148+
for dir in tunix vllm tpu-inference; do
149+
if [ -d "./$dir" ]; then
150+
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
151+
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
152+
fi
153+
done
154+
fi
155+
env:
156+
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
157+
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}

0 commit comments

Comments
 (0)