4949 required : false
5050 type : string
5151 default : ' '
# Selects the test flavors that run against the freshly built image.
# Accepted values: tpu-pre-training, tpu-post-training, gpu-pre-training.
test_mode:
  description: "Test mode (tpu-pre-training, tpu-post-training, gpu-pre-training)"
  required: true
  type: string
5256
5357permissions :
5458 contents : read
6569 github.event.inputs.target_device == 'tpu' ||
6670 github.event.inputs.target_device == 'gpu'
6771 )
72+ outputs :
73+ should_run : ${{ steps.check.outputs.should_run }} # Map the step output to the job level
6874 steps :
6975 - name : Check if build should run
7076 id : check
9197 ref : ${{ inputs.maxtext_sha }}
9298
9399 - name : Checkout post-training dependencies
94- if : steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
100+ if : |
101+ steps.check.outputs.should_run == 'true' &&
102+ contains(inputs.image_name, 'post_training_nightly')
95103 run : |
96104 git clone https://github.com/google/tunix.git ./tunix
97105 git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -136,29 +144,31 @@ jobs:
136144 shell : bash
137145 run : |
138146 SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
147+ TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
148+
139149
140150 if [[ $INPUTS_VERSION_NAME ]]; then
141151 echo "Tagging docker images corresponding to PyPI release..."
142- gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }} " "$SOURCE_IMAGE:${INPUTS_VERSION_NAME}" --quiet
152+ gcloud container images add-tag "${TEMP_IMG} " "${ SOURCE_IMAGE} :${INPUTS_VERSION_NAME}" --quiet
143153 else
144154 echo "Tagging docker images corresponding to nightly release..."
145155
146- # Add date tag
147- gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id } }" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
156+ # TODO: Add date tag - Decide if we should add date here or after testing.
157+ # gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
148158
149159 # Convert date to YYYYMMDD format
150160 clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
151161
152162 # Add MaxText tag
153163 maxtext_hash=$(git rev-parse --short HEAD)
154- gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }} " "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
155-
156- # Add post-training dependencies tags
157- if [ "${{ inputs.workflow }}" == "post-training" ]; then
158- for dir in tunix vllm tpu-inference; do
159- if [ -d "./$dir" ]; then
160- dir_hash=$(git -C "$dir" rev-parse --short HEAD)
161- gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }} " "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
164+ gcloud container images add-tag "${TEMP_IMG} " "${ SOURCE_IMAGE} :maxtext_${maxtext_hash}_${clean_date}" --quiet
165+
166+ # Add post-training dependencies tags
167+ if [ "${{ inputs.workflow }}" == "post-training" ]; then
168+ for dir in tunix vllm tpu-inference; do
169+ if [ -d "./$dir" ]; then
170+ dir_hash=$(git -C "$dir" rev-parse --short HEAD)
171+ gcloud container images add-tag "${TEMP_IMG} " "${ SOURCE_IMAGE} :${dir}_${dir_hash}_${clean_date}" --quiet
162172 fi
163173 done
164174 fi
@@ -167,3 +177,73 @@ jobs:
167177 INPUTS_IMAGE_NAME : ${{ inputs.image_name }}
168178 INPUTS_IMAGE_DATE : ${{ inputs.image_date }}
169179 INPUTS_VERSION_NAME : ${{ inputs.version_name }}
180+
# Runs the test flavors mapped to `inputs.test_mode` against the image tagged
# with this workflow run's id, one reusable-workflow call per flavor.
test:
  needs: build_and_push
  # Gate on the build job finishing successfully with should_run == 'true'.
  if: >-
    needs.build_and_push.result == 'success' &&
    needs.build_and_push.outputs.should_run == 'true'
  uses: ./.github/workflows/run_tests_coordinator.yml
  with:
    flavor: ${{ matrix.flavor }}
    base_image: ${{ inputs.image_name }}:${{ github.run_id }}
    is_scheduled_run: true
    maxtext_installed: true
  strategy:
    # Let every flavor run to completion even if a sibling flavor fails.
    fail-fast: false
    matrix:
      # Look up the list of flavors for the requested test mode.
      flavor: >-
        ${{ fromJSON('{
        "gpu-pre-training": ["gpu-unit", "gpu-integration"],
        "tpu-post-training": ["post-training-tpu-unit", "post-training-tpu-integration"],
        "tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"]
        }')[inputs.test_mode] }}
201+
# Runs the Jupyter notebook workflow on a v6e-4 TPU runner against the image
# tagged with this workflow run's id. Only runs for 'tpu-post-training'.
notebook-test:
  needs: build_and_push
  # Requires post-training mode plus a successful build with
  # should_run == 'true'.
  if: >-
    inputs.test_mode == 'tpu-post-training' &&
    needs.build_and_push.result == 'success' &&
    needs.build_and_push.outputs.should_run == 'true'
  uses: ./.github/workflows/run_jupyter_notebooks.yml
  secrets:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  with:
    base_image: ${{ inputs.image_name }}:${{ github.run_id }}
    maxtext_installed: true
    device_type: tpu
    device_name: v6e-4
    cloud_runner: linux-x86-ct6e-180-4tpu
217+
# Promotes the tested image: once the test jobs pass, the image tagged with
# this run's id additionally receives the date tag and the `latest` tag.
tagging:
  needs: [test, notebook-test]
  # always() stops this job from being skipped automatically when
  # notebook-test is skipped; the explicit result checks then decide whether
  # to run: `test` must succeed, notebook-test must succeed or be skipped.
  if: |
    always() &&
    needs.test.result == 'success' &&
    (needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped')
  runs-on: linux-x86-n2-16-buildkit
  container: google/cloud-sdk:524.0.0
  steps:
    - name: Configure Docker
      run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q

    - name: Create Production Tags
      shell: bash
      env:
        # Workflow values reach the script through the environment instead of
        # being interpolated into the script text.
        INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
        INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
        RUN_ID: ${{ github.run_id }}
      run: |
        # Fail fast with a clear message before touching the registry; an
        # empty value would otherwise produce a malformed image reference.
        : "${INPUTS_IMAGE_NAME:?image_name input must not be empty}"
        : "${INPUTS_IMAGE_DATE:?image_date input must not be empty (used as the date tag)}"

        SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
        TEMP_IMG="${SOURCE_IMAGE}:${RUN_ID}"

        # Validate existence first
        gcloud container images describe "${TEMP_IMG}" > /dev/null

        # 1. Date Tag
        gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_IMAGE_DATE}" --quiet

        # 2. Latest Tag
        gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet

        # TODO: Clean up Temporary Tag. But this step needs the extra permission: 'artifactregistry.repositories.deleteArtifacts'
        # gcloud container images untag "${TEMP_IMG}" --quiet
0 commit comments