4545 required : false
4646 type : string
4747 default : ' '
test_mode:
  # Caller-supplied selector; must always be provided. The `test` job uses it
  # as the lookup key into its flavor map, and `notebook-test` only runs when
  # it equals 'tpu-post-training'.
  type: string
  required: true
  description: ' Test mode (tpu-pre-training, tpu-post-training, gpu-pre-training)'
4852
4953permissions :
5054 contents : read
6165 github.event.inputs.target_device == 'tpu' ||
6266 github.event.inputs.target_device == 'gpu'
6367 )
68+ outputs :
69+ should_run : ${{ steps.check.outputs.should_run }} # Map the step output to the job level
6470 steps :
6571 - name : Check if build should run
6672 id : check
8793 ref : ${{ inputs.maxtext_sha }}
8894
8995 - name : Checkout post-training dependencies
90- if : steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
96+ if : |
97+ steps.check.outputs.should_run == 'true' &&
98+ contains(inputs.image_name, 'post_training_nightly')
9199 run : |
92100 git clone https://github.com/google/tunix.git ./tunix
93101 git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -132,29 +140,31 @@ jobs:
132140 shell : bash
133141 run : |
134142 SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
143+ TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
144+
135145
136146 if [[ $INPUTS_VERSION_NAME ]]; then
137147 echo "Tagging docker images corresponding to PyPI release..."
138- gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }} " "$SOURCE_IMAGE:${INPUTS_VERSION_NAME}" --quiet
148+ gcloud container images add-tag "${TEMP_IMG} " "${ SOURCE_IMAGE} :${INPUTS_VERSION_NAME}" --quiet
139149 else
140150 echo "Tagging docker images corresponding to nightly release..."
141151
142- # Add date tag
143- gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id } }" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
152+ # TODO: Add date tag - Decide if we should add date here or after testing.
153+ # gcloud container images add-tag "${TEMP_IMG }" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
144154
145155 # Convert date to YYYYMMDD format
146156 clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
147157
148158 # Add MaxText tag
149159 maxtext_hash=$(git rev-parse --short HEAD)
150- gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }} " "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
151-
152- # Add post-training dependencies tags
153- if [ "${{ inputs.workflow }}" == "post-training" ]; then
154- for dir in tunix vllm tpu-inference; do
155- if [ -d "./$dir" ]; then
156- dir_hash=$(git -C "$dir" rev-parse --short HEAD)
157- gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }} " "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
160+ gcloud container images add-tag "${TEMP_IMG} " "${ SOURCE_IMAGE} :maxtext_${maxtext_hash}_${clean_date}" --quiet
161+
162+ # Add post-training dependencies tags
163+ if [ "${{ inputs.workflow }}" == "post-training" ]; then
164+ for dir in tunix vllm tpu-inference; do
165+ if [ -d "./$dir" ]; then
166+ dir_hash=$(git -C "$dir" rev-parse --short HEAD)
167+ gcloud container images add-tag "${TEMP_IMG} " "${ SOURCE_IMAGE} :${dir}_${dir_hash}_${clean_date}" --quiet
158168 fi
159169 done
160170 fi
@@ -163,3 +173,73 @@ jobs:
163173 INPUTS_IMAGE_NAME : ${{ inputs.image_name }}
164174 INPUTS_IMAGE_DATE : ${{ inputs.image_date }}
165175 INPUTS_VERSION_NAME : ${{ inputs.version_name }}
176+
test:
  # Validates the freshly pushed image by fanning out to the reusable test
  # coordinator workflow, one matrix entry per test flavor.
  needs: [build_and_push]
  # Gate: the build job must have succeeded AND reported that a build was
  # actually performed (should_run == 'true').
  if: >-
    needs.build_and_push.result == 'success' &&
    needs.build_and_push.outputs.should_run == 'true'
  strategy:
    # Let every flavor finish even if a sibling flavor fails.
    fail-fast: false
    matrix:
      # The flavor list is chosen by indexing an inline JSON map with
      # inputs.test_mode.
      flavor: >-
        ${{ fromJSON('{
        "gpu-pre-training": ["gpu-unit", "gpu-integration"],
        "tpu-post-training": ["post-training-tpu-unit", "post-training-tpu-integration", "post-training-cpu-unit"],
        "tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"]
        }')[inputs.test_mode] }}
  uses: ./.github/workflows/run_tests_coordinator.yml
  with:
    flavor: '${{ matrix.flavor }}'
    # Image reference is "<image_name>:<run id>", i.e. the run-scoped tag.
    base_image: '${{ inputs.image_name }}:${{ github.run_id }}'
    is_scheduled_run: true
    maxtext_installed: true
197+
notebook-test:
  # Runs the Jupyter notebook workflow against the run-scoped image.
  needs: [build_and_push]
  # Only for the 'tpu-post-training' test mode, and only when the build job
  # succeeded and reported should_run == 'true'.
  if: >-
    inputs.test_mode == 'tpu-post-training' &&
    needs.build_and_push.result == 'success' &&
    needs.build_and_push.outputs.should_run == 'true'
  uses: ./.github/workflows/run_jupyter_notebooks.yml
  with:
    device_type: tpu
    device_name: v6e-4
    # Image reference is "<image_name>:<run id>", i.e. the run-scoped tag.
    base_image: '${{ inputs.image_name }}:${{ github.run_id }}'
    cloud_runner: linux-x86-ct6e-180-4tpu
    maxtext_installed: true
  secrets:
    # Forwarded to the called workflow under the same name.
    HF_TOKEN: '${{ secrets.HF_TOKEN }}'
213+
tagging:
  # Promotes the run-scoped image tag to the date tag and `latest` after the
  # validation jobs have finished.
  needs: [test, notebook-test]
  # always() forces this condition to be evaluated even when notebook-test was
  # skipped; the explicit result checks then require `test` to have succeeded
  # and `notebook-test` to have either succeeded or been skipped.
  if: |
    always() &&
    needs.test.result == 'success' &&
    (needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped')
  runs-on: linux-x86-n2-16-buildkit
  container: google/cloud-sdk:524.0.0
  steps:
    - name: Configure Docker
      run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q

    - name: Create Production Tags
      shell: bash
      env:
        # Every workflow value reaches the script through the environment
        # rather than being spliced into the shell text.
        INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
        INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
        RUN_ID: ${{ github.run_id }}
      run: |
        SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
        # Run-scoped tag: "<image>:<run id>".
        TEMP_IMG="${SOURCE_IMAGE}:${RUN_ID}"

        # Validate existence first
        gcloud container images describe "${TEMP_IMG}" > /dev/null

        # 1. Date Tag
        gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_IMAGE_DATE}" --quiet

        # 2. Latest Tag
        gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet

        # TODO: Clean up Temporary Tag. But this step needs the extra permission: 'artifactregistry.repositories.deleteArtifacts'
        # gcloud container images untag "${TEMP_IMG}" --quiet
0 commit comments