Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 19 additions & 31 deletions .github/workflows/UploadDockerImages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ on:
- all
- tpu
- gpu
# Optional workflow input: a string appended to each matrix image_name when the
# build workflow is invoked. Defaults to the empty string, i.e. no suffix.
image_suffix:
description: 'An image suffix can be provided to add to the image name'
required: false
type: string
default: ""

permissions:
contents: read
Expand All @@ -55,7 +60,7 @@ jobs:
# Image date
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT

tpu-pre-training:
build-and-test:
name: ${{ matrix.image_name }}
needs: setup
strategy:
Expand All @@ -64,54 +69,37 @@ jobs:
include:
- device: tpu
build_mode: stable
workflow: pre-training
image_name: maxtext_jax_stable
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
- device: tpu
build_mode: nightly
workflow: pre-training
image_name: maxtext_jax_nightly
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}

tpu-post-training-nightly:
name: tpu-post-training-nightly
needs: [setup]
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: maxtext_post_training_nightly
device: tpu
build_mode: nightly
workflow: post-training
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}

gpu-pre-training:
name: ${{ matrix.image_name }}
needs: setup
strategy:
fail-fast: false
matrix:
include:
- device: tpu
build_mode: nightly
workflow: post-training
image_name: maxtext_post_training_nightly
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
- device: gpu
build_mode: stable
workflow: pre-training
image_name: maxtext_gpu_jax_stable
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
- device: gpu
build_mode: nightly
workflow: pre-training
image_name: maxtext_gpu_jax_nightly
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
image_name: ${{ matrix.image_name }}${{ inputs.image_suffix }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
workflow: ${{ matrix.workflow }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
85 changes: 78 additions & 7 deletions .github/workflows/build_and_push_docker_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ on:
required: false
type: string
default: ''
# Secrets every caller of this reusable workflow must supply.
# HF_TOKEN is forwarded to the notebook-test job's reusable workflow.
secrets:
HF_TOKEN:
required: true

permissions:
contents: read
Expand All @@ -62,6 +65,8 @@ jobs:
github.event.inputs.target_device == 'tpu' ||
github.event.inputs.target_device == 'gpu'
)
outputs:
should_run: ${{ steps.check.outputs.should_run }} # Map the step output to the job level
steps:
- name: Check if build should run
id: check
Expand All @@ -80,6 +85,14 @@ jobs:
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
INPUTS_BUILD_MODE: ${{ inputs.build_mode }}

# Debug aid: echo the inputs this reusable workflow was invoked with.
# Inputs are passed through env (the INPUTS_* convention used elsewhere in
# this job) instead of being expanded inside the script, so a value that
# contains quotes or $(...) is printed literally rather than run by the shell.
- name: Matrix Debugger
  run: |
    echo "device: ${INPUTS_DEVICE}"
    echo "workflow: ${INPUTS_WORKFLOW}"
    echo "build_mode: ${INPUTS_BUILD_MODE}"
    echo "image_name: ${INPUTS_IMAGE_NAME}"
    echo "dockerfile: ${INPUTS_DOCKERFILE}"
  env:
    INPUTS_DEVICE: ${{ inputs.device }}
    INPUTS_WORKFLOW: ${{ inputs.workflow }}
    INPUTS_BUILD_MODE: ${{ inputs.build_mode }}
    INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
    INPUTS_DOCKERFILE: ${{ inputs.dockerfile }}

- name: Checkout MaxText
uses: actions/checkout@v5
if: steps.check.outputs.should_run == 'true'
Expand Down Expand Up @@ -126,27 +139,85 @@ jobs:
shell: bash
run: |
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"

if [[ $INPUTS_VERSION_NAME ]]; then
echo "Tagging docker images corresponding to PyPI release..."
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${INPUTS_VERSION_NAME}" --quiet
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_VERSION_NAME}" --quiet
else
echo "Tagging docker images corresponding to nightly release..."

# Add date tag
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet

# Convert date to YYYYMMDD format
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)

# Add MaxText tag
maxtext_hash=$(git rev-parse --short HEAD)
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet

# Add latest tag (TODO: add this tag only after tests pass)
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:latest" --quiet
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${MAXTEXT_SHA}_${clean_date}" --quiet
fi
env:
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
INPUTS_VERSION_NAME: ${{ inputs.version_name }}
MAXTEXT_SHA: ${{ inputs.maxtext_sha }}

# Runs the test suites through the run_tests_coordinator reusable workflow,
# against the image tagged with this run's id.
# Gated on build_and_push succeeding and reporting should_run == 'true'.
test:
needs: build_and_push
if: |
needs.build_and_push.result == 'success' &&
needs.build_and_push.outputs.should_run == 'true'
strategy:
# fail-fast is off so one failing flavor does not cancel the others.
fail-fast: false
matrix:
# The flavor list is looked up in the JSON map below with the key
# "<device>-<workflow>" built from the workflow inputs.
# NOTE(review): a device/workflow pair missing from the map (e.g.
# gpu-post-training) presumably yields no flavor list -- confirm callers
# never pass such a pair.
flavor: >-
${{ fromJSON('{
"gpu-pre-training": ["gpu-unit", "gpu-integration"],
"tpu-post-training": ["tpu-post-training-unit", "tpu-post-training-integration", "cpu-post-training-unit"],
"tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"]
}')[format('{0}-{1}', inputs.device, inputs.workflow)] }}
uses: ./.github/workflows/run_tests_coordinator.yml
with:
flavor: ${{ matrix.flavor }}
# The image is referenced by its temporary per-run tag (github.run_id).
base_image: ${{ inputs.image_name }}:${{ github.run_id }}
is_scheduled_run: true
maxtext_installed: true

# Runs the Jupyter notebook workflow against the image tagged with this run's id.
# Only runs for device == 'tpu' with workflow == 'post-training', and only when
# build_and_push succeeded and reported should_run == 'true'.
notebook-test:
needs: build_and_push
if: |
inputs.device == 'tpu' &&
inputs.workflow == 'post-training' &&
needs.build_and_push.result == 'success' &&
needs.build_and_push.outputs.should_run == 'true'
uses: ./.github/workflows/run_jupyter_notebooks.yml
with:
device_type: tpu
device_name: v6e-4
# The image is referenced by its temporary per-run tag (github.run_id).
base_image: ${{ inputs.image_name }}:${{ github.run_id }}
cloud_runner: linux-x86-ct6e-180-4tpu
maxtext_installed: true
# Forward the caller-provided HF_TOKEN secret to the notebook workflow.
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}

# Adds the `latest` tag to the image that carries this run's id as its tag.
# Runs after the test and notebook-test jobs.
tagging:
needs: [test, notebook-test]
# always() lets this condition be evaluated even when notebook-test was
# skipped; tagging proceeds only if test succeeded and notebook-test either
# succeeded or was skipped.
if: |
always() &&
needs.test.result == 'success' &&
(needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped')
runs-on: linux-x86-n2-16-buildkit
container: google/cloud-sdk:524.0.0
steps:
# Registers gcloud as the Docker credential helper for the listed registries.
- name: Configure Docker
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q

# Points <image>:latest at the image currently tagged <image>:<run_id>.
- name: Add tags to Docker image
shell: bash
run: |
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
# Latest Tag
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet
env:
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
2 changes: 2 additions & 0 deletions .github/workflows/pypi_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,5 @@ jobs:
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ github.sha }}
version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
2 changes: 1 addition & 1 deletion .github/workflows/run_jupyter_notebooks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ jobs:
- name: Run Post-Training Notebooks
shell: bash
env:
PYTHONPATH: "${{ github.workspace }}/src"
HF_TOKEN: ${{ secrets.HF_TOKEN }}
MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}
# TODO: Fix evaluation in sft_qwen3_demo.ipynb and remove this env variable
Expand All @@ -95,6 +94,7 @@ jobs:
PAPERMILL_EXE=".venv/bin/papermill"
source .venv/bin/activate
fi
# Prepend the current directory's src/ to PYTHONPATH, keeping any existing
# entries. $(pwd) is command substitution; ${pwd} would expand an unset
# variable and produce "/src" instead of the checkout path.
export PYTHONPATH="$(pwd)/src${PYTHONPATH:+:${PYTHONPATH}}"

export MAXTEXT_REPO_ROOT=$(pwd)
export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/run_tests_against_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ jobs:
uv pip install pytest-cov
PYTEST_COV_ARGS="--cov=MaxText --cov=maxtext --cov-report=xml --cov-report=term"
fi
# Prepend the current directory's src/ to PYTHONPATH, keeping any existing
# entries. $(pwd) is command substitution; ${pwd} would expand an unset
# variable and produce "/src" instead of the checkout path.
export PYTHONPATH="$(pwd)/src${PYTHONPATH:+:${PYTHONPATH}}"

if [ "${INPUTS_IS_SCHEDULED_RUN}" == "true" ]; then
FINAL_PYTEST_MARKER="${INPUTS_PYTEST_MARKER}"
Expand Down Expand Up @@ -176,7 +177,6 @@ jobs:
${INPUTS_PYTEST_EXTRA_ARGS}

env:
PYTHONPATH: "${{ github.workspace }}/src"
INPUTS_IS_SCHEDULED_RUN: ${{ inputs.is_scheduled_run }}
INPUTS_PYTEST_MARKER: ${{ inputs.pytest_marker }}
INPUTS_DEVICE_TYPE: ${{ inputs.device_type }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Now copy the remaining code (source files that may change frequently)
COPY ${PACKAGE_DIR}/maxtext/ src/maxtext/
COPY ${PACKAGE_DIR}/MaxText/ src/MaxText/
# Now copy resource needed for pytest:
COPY tests*/ tests/
COPY pytest.ini pytest.ini
COPY benchmarks*/ benchmarks/


# Download test assets from GCS if building image with test assets
ARG INCLUDE_TEST_ASSETS=false
RUN if [ "$INCLUDE_TEST_ASSETS" = "true" ]; then \
Expand All @@ -76,4 +79,4 @@ RUN if [ "$INCLUDE_TEST_ASSETS" = "true" ]; then \
fi; \
fi

ENV PYTHONPATH="/deps/src:${PYTHONPATH}"
ENV PYTHONPATH="/deps/src${PYTHONPATH:+:${PYTHONPATH}}"
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Now copy the remaining code (source files that may change frequently)
COPY ${PACKAGE_DIR}/maxtext/ src/maxtext/
COPY ${PACKAGE_DIR}/MaxText/ src/MaxText/
# Now copy resource needed for pytest:
COPY tools*/ tools/
COPY tests*/ tests/
COPY pytest.ini pytest.ini
COPY benchmarks*/ benchmarks/

# Download test assets from GCS if building image with test assets
Expand All @@ -76,4 +79,4 @@ RUN if [ "$INCLUDE_TEST_ASSETS" = "true" ]; then \
fi; \
fi

ENV PYTHONPATH="/deps/src:${PYTHONPATH}"
ENV PYTHONPATH="/deps/src${PYTHONPATH:+:${PYTHONPATH}}"
Loading