3131 description : Upload coverage as unit test
3232 required : false
3333 default : false
# JSON-encoded array of test script names. The job-level `if` parses this
# value with fromJSON() and runs the job when the array contains
# inputs.SCRIPT or the literal "all" (the default, i.e. run everything).
TESTS_TO_RUN:
  type: string
  description: Tests to run
  required: false
  default: '["all"]'
3439 outputs :
3540 conclusion :
3641 description : Conclusion of main test step
jobs:
  main:
    runs-on: ${{ inputs.RUNNER }}
    # The job is displayed under the name of the test script it runs.
    name: ${{ inputs.SCRIPT }}
    # Run only when this script was requested explicitly, or when the
    # caller asked for "all" tests.
    if: >-
      contains(fromJSON(inputs.TESTS_TO_RUN), inputs.SCRIPT) ||
      contains(fromJSON(inputs.TESTS_TO_RUN), 'all')
    outputs:
      # Both values are taken from the step with id `check`.
      conclusion: ${{ steps.check.conclusion }}
      log: ${{ steps.check.outputs.log }}
@@ -62,26 +69,56 @@ jobs:
6269 run : |
6370 docker pull nemoci.azurecr.io/nemo_container:${{ github.run_id }}
6471
# NOTE(review): placeholder step. The script body was empty, so nothing is
# actually cleaned here. The explicit no-op below keeps the step well-formed
# until real cleanup commands are added (or the step is removed).
- name: Clean repos
  run: |
    :
# Installs jq by piping the webi.sh bootstrap script into sh.
# NOTE(review): this executes a remotely fetched, unpinned script on the
# runner; consider a distro package or a checksum-verified binary instead.
- name: Install jq
  run: |
    # Without pipefail the step's status is that of `sh` alone, so a failed
    # download would go unnoticed; -f makes curl fail on HTTP errors instead
    # of handing an error page to the shell.
    set -eo pipefail
    curl -fsS https://webi.sh/jq | sh
78+
# Generates a fresh UUID and publishes it as the step output `id`
# (referenced elsewhere as steps.uuid.outputs.id).
- name: Create UUID
  id: uuid
  run: |
    printf 'id=%s\n' "$(uuidgen)" >> "$GITHUB_OUTPUT"
83+
# Checks out NVIDIA/NeMo into <run_id>/<uuid>/NeMo below the working
# directory. That directory is what the container mounts at /workspace and
# what the shutdown step deletes afterwards.
# checkout@v4 replaces the outdated v2 release and matches the v4 generation
# of actions already used in this workflow (upload-artifact@v4).
- name: Checkout NeMo
  uses: actions/checkout@v4
  with:
    repository: NVIDIA/NeMo
    path: ${{ github.run_id }}/${{ steps.uuid.outputs.id }}/NeMo
89+
6590 - name : Start container
6691 run : |
6792 mkdir -p $DIR
6893
# Map of runner names to GPU device configurations
declare -A GPU_CONFIGS=(
  ["myVm-01"]="0,1"
  ["myVm-02"]="2,3"
  ["myVm-03"]="4,5"
  ["myVm-04"]="6,7"
)

# Extra `docker run` arguments, selected per runner:
#   - runner listed in GPU_CONFIGS: pin the container to that runner's GPU
#     devices and cap CPU count and memory
#   - any other runner whose RUNNER input does not contain "cpu": all GPUs
#   - otherwise: no extra arguments
ARG=("")
# `:-` keeps the lookup safe under `set -u` when the runner is not in the map.
if [[ -n "${GPU_CONFIGS[${{ runner.name }}]:-}" ]]; then
  # The value is spliced as text into the generated script via ${ARG[@]},
  # so the '"device=..."' quoting must survive as literal characters.
  # The numeric limits need no quotes of their own; the previous inner
  # double quotes merely closed and reopened this string.
  ARG=("--runtime=nvidia --cpus=40 --memory=400g --gpus '\"device=${GPU_CONFIGS[${{ runner.name }}]}\"'")
elif [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
  ARG=("--runtime=nvidia --gpus all")
fi
73108
74109 cmd=$(cat <<RUN_TEST_EOF
75110 #!/bin/bash
76- docker container rm -f nemo_container_${{ github.run_id }} || true
111+ docker container rm -f nemo_container_${{ github.run_id }}_${{ runner.name }} || true
77112 docker run \
78113 --rm \
79114 -d \
80- --name nemo_container_${{ github.run_id }} ${ARG[@]} \
115+ --name nemo_container_${{ github.run_id }}_${{ runner.name }} ${ARG[@]} \
81116 --shm-size=64g \
82117 --env TRANSFORMERS_OFFLINE=0 \
83118 --env HYDRA_FULL_ERROR=1 \
84119 --env HF_HOME=/home/TestData/HF_HOME \
120+ --env RUN_ID=${{ github.run_id }} \
121+ --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
85122 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
86123 bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
87124 RUN_TEST_EOF
@@ -107,7 +144,10 @@ jobs:
107144 (
108145 set -e
109146
110- docker exec -t nemo_container_${{ github.run_id }} bash -c 'RUN_ID=${{ github.run_id }} bash tests/functional_tests/$SCRIPT.sh && echo "Finished successfully." || echo "Did not finish."'
147+ docker exec -t nemo_container_${{ github.run_id }}_${{ runner.name }} bash -c '\
148+ cp -r /opt/Megatron-LM/ /workspace/ && \
149+ bash tests/functional_tests/$SCRIPT.sh && \
150+ echo "Finished successfully." || echo "Did not finish."'
111151 ) 2>&1 | tee $DIR/err.log
112152
113153 RUN_TEST_EOF
@@ -137,10 +177,10 @@ jobs:
137177 potential_infra_failure=$(cat $DIR/err.log | grep -Eqiw "device" && echo true || echo false)
138178 echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
139179
# Combine the coverage data inside the container and render the XML report.
docker exec nemo_container_${{ github.run_id }}_${{ runner.name }} coverage combine
docker exec nemo_container_${{ github.run_id }}_${{ runner.name }} coverage xml
# Copy both artifacts out to $DIR. `docker cp` expects CONTAINER:PATH as a
# single argument, so there must be no space before the colon.
docker cp nemo_container_${{ github.run_id }}_${{ runner.name }}:/workspace/.coverage $DIR/.coverage
docker cp nemo_container_${{ github.run_id }}_${{ runner.name }}:/workspace/coverage.xml $DIR/coverage.xml
144184
145185 coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
146186 echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
@@ -162,7 +202,7 @@ jobs:
# Prints the coverage report from inside the per-run, per-runner container.
# The explicit shell line enables command tracing and strict error handling
# (-x -e -u -o pipefail) for this step.
- name: Test coverage
  shell: bash -x -e -u -o pipefail {0}
  run: |
    docker exec -t nemo_container_${{ github.run_id }}_${{ runner.name }} coverage report -i
166206
167207 - name : Upload artifacts
168208 uses : actions/upload-artifact@v4
@@ -174,14 +214,9 @@ jobs:
174214 ${{ github.run_id }}/.coverage
175215 include-hidden-files : true
176216
177- - uses : " NVIDIA/NeMo/.github/actions/cancel-workflow@main"
178- if : failure() && inputs.IS_OPTIONAL == false && github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'no-fail-fast')
179- - name : after_script
180- if : always() && inputs.AFTER_SCRIPT != ':'
181- run : |
182- docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
183-
# Final cleanup; runs even when earlier steps failed or were cancelled.
# Every command is best-effort so that one failing cleanup action cannot
# prevent the remaining ones from running.
- name: Container shutdown
  if: always()
  run: |
    # Hand ownership of the mounted checkout back to the runner user so the
    # host-side rm below can delete files created inside the container.
    # $(id -u)/$(id -g) are expanded on the host, not in the container.
    # The container may already be gone (e.g. it never started or its sleep
    # timed out), so a failure here must not abort the rest of the cleanup.
    docker exec nemo_container_${{ github.run_id }}_${{ runner.name }} bash -c "chown -R $(id -u):$(id -g) /workspace" || true
    rm -rf "$(pwd)/${{ github.run_id }}/${{ steps.uuid.outputs.id }}" || true
    docker container rm -f nemo_container_${{ github.run_id }}_${{ runner.name }} || true
0 commit comments