Skip to content

Commit 14c01f7

Browse files
committed
Merge remote-tracking branch 'upstream/main' into update-lazy-trt-compile
Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>
2 parents a1d4325 + 020d289 commit 14c01f7

157 files changed

Lines changed: 18322 additions & 3986 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/scripts/notify.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import os
15+
16+
import requests
17+
from github import Github
18+
19+
20+
# Environment variables without which no notification can be built or sent.
# GH_TOKEN is deliberately not listed: the original code passed it through
# even when unset, and that behaviour is kept.
_REQUIRED_ENV_VARS = ('SLACK_WEBHOOK', 'REPOSITORY', 'RUN_ID', 'PR_NUMBER')

# Job name that is filtered out of the failed-job list.
_IGNORED_JOB = 'Nemo_CICD_Test'

# Upper bound (seconds) on the Slack request so a stalled endpoint cannot
# hang the CI job indefinitely.
_SLACK_TIMEOUT_SECONDS = 30


def _escape_mrkdwn(text):
    """Escape Slack's control characters (&, <, >) so text cannot break link markup."""
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def send_slack_notification():
    """Post a Slack message listing the failed jobs of a pull request's workflow run.

    Configuration is read from environment variables:

    * ``SLACK_WEBHOOK`` -- URL the message is posted to (required).
    * ``REPOSITORY`` -- repository name passed to ``Github.get_repo`` (required).
    * ``RUN_ID`` -- integer id of the workflow run to inspect (required).
    * ``PR_NUMBER`` -- integer number of the pull request (required).
    * ``GH_TOKEN`` -- token handed to the GitHub client (optional).
    * ``SERVER_URL`` -- base URL used in links, defaults to ``https://github.com``.

    Raises:
        RuntimeError: if a required environment variable is missing or empty.
        ValueError: if ``RUN_ID`` or ``PR_NUMBER`` is not an integer.
        requests.HTTPError: if Slack answers with an error status.
    """
    # Fail fast, before any network traffic, with a message naming every
    # missing variable instead of an opaque ``int(None)`` TypeError.
    missing = [name for name in _REQUIRED_ENV_VARS if not os.environ.get(name)]
    if missing:
        raise RuntimeError(f"Missing required environment variables: {', '.join(missing)}")

    gh_token = os.environ.get('GH_TOKEN')
    webhook_url = os.environ['SLACK_WEBHOOK']
    repository = os.environ['REPOSITORY']
    run_id = os.environ['RUN_ID']
    server_url = os.environ.get('SERVER_URL', 'https://github.com')
    pr_number = int(os.environ['PR_NUMBER'])
    run_number = int(run_id)

    # Get failure info from GitHub API
    gh = Github(gh_token)
    repo = gh.get_repo(repository)
    pr = repo.get_pull(pr_number)

    # Get failed jobs
    failed_jobs = [job.name for job in repo.get_workflow_run(run_number).jobs() if job.conclusion == 'failure']

    run_url = f"{server_url}/{repository}/actions/runs/{run_id}"

    # Only the last '/'-separated component of a job name is shown.
    job_labels = (name.split('/')[-1] for name in failed_jobs)
    job_lines = [f" • <{run_url}|{label}>" for label in job_labels if label != _IGNORED_JOB]

    # Backticks are dropped (as before) and Slack control characters escaped
    # so the title cannot terminate the surrounding <url|label> link early.
    title = _escape_mrkdwn(pr.title.replace('`', ''))

    # Build message blocks
    text = (
        f"*<{server_url}/{repository}/pull/{pr_number}|PR#{pr_number}: {title}>*\n"
        f"• Author: <{server_url}/{pr.user.login}|{pr.user.login}>\n"
        f"• Branch: <{server_url}/{pr.head.repo.full_name}/tree/{pr.head.ref}|{pr.head.ref}>\n"
        f"• Pipeline: <{run_url}|View Run>\n"
        f"• Failed Jobs:\n"
    ) + "\n".join(job_lines)

    blocks = [
        {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": text,
            },
        }
    ]
    payload = {"blocks": blocks}

    # Echo the payload into the CI log to ease debugging of rendering issues.
    print(payload)

    # Send to Slack
    response = requests.post(webhook_url, json=payload, timeout=_SLACK_TIMEOUT_SECONDS)
    response.raise_for_status()


if __name__ == "__main__":
    send_slack_notification()
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Reusable workflow with two jobs:
#   1. update-branch: brings the NeMo target branch up to date with main
#      (creating it from main when it does not exist yet) and pushes it.
#   2. mcore: calls the shared _bump_yamlfile template to update the
#      megatron-lm ref in requirements/manifest.json from NVIDIA/Megatron-LM.
name: Bump Megatron Tag Template
on:
  workflow_call:
    inputs:
      nemo-target-branch:
        required: true
        type: string
        description: "The target branch to bump"
      mcore-target-branch:
        required: true
        type: string
        description: "The Megatron-LM branch to take the new ref from"
      # Previously read below via inputs.pr-reviewers without being declared,
      # so callers had no way to set it. Optional, so existing callers keep working.
      pr-reviewers:
        required: false
        type: string
        default: ""
        description: "Reviewers forwarded to the bump template"
    secrets:
      PAT:
        required: true

jobs:
  update-branch:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          ref: main

      - name: Set Git config
        run: |
          git config --local user.email "actions@github.com"
          git config --local user.name "Github Actions"
      - name: Merge main back to ${{ inputs.nemo-target-branch }}
        env:
          # Passed through the environment (and quoted below) instead of being
          # interpolated into the script, so the value cannot be parsed as shell.
          TARGET_BRANCH: ${{ inputs.nemo-target-branch }}
        run: |
          git fetch --unshallow
          # Try to checkout and pull existing branch, create new one if it doesn't exist
          if git ls-remote --exit-code origin "$TARGET_BRANCH"; then
            git checkout "$TARGET_BRANCH"
            git pull origin "$TARGET_BRANCH"
            # Merge main, as the commit message states; merging the branch
            # into itself (the previous behaviour) was a no-op.
            git merge --no-ff main -m "chore: Auto-merge main back to $TARGET_BRANCH"
          else
            git checkout -b "$TARGET_BRANCH" main
          fi
          git push -u origin "$TARGET_BRANCH"

  mcore:
    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_bump_yamlfile.yml@v0.27.1
    needs: [update-branch]
    with:
      source-repository: NVIDIA/Megatron-LM
      source-ref: ${{ inputs.mcore-target-branch }}
      yaml-path: '."vcs-dependencies"."megatron-lm".ref'
      file: requirements/manifest.json
      # NOTE(review): update-branch pushes "<nemo-target-branch>", while this
      # base branch is "weekly-bump-<nemo-target-branch>" - confirm that branch
      # exists or that callers account for the prefix.
      base-branch: weekly-bump-${{ inputs.nemo-target-branch }}
      cicd-labels: Run CICD,no-fail-fast
      pr-reviewers: ${{ inputs.pr-reviewers }}
    secrets:
      PAT: ${{ secrets.PAT }}

.github/workflows/_test_template.yml

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ on:
3131
description: Upload coverage as unit test
3232
required: false
3333
default: false
34+
TESTS_TO_RUN:
35+
type: string
36+
description: Tests to run
37+
required: false
38+
default: '["all"]'
3439
outputs:
3540
conclusion:
3641
description: Conclusion of main test step
@@ -44,6 +49,8 @@ on:
4449
jobs:
4550
main:
4651
runs-on: ${{ inputs.RUNNER }}
52+
name: ${{ inputs.SCRIPT }}
53+
if: contains(fromJSON(inputs.TESTS_TO_RUN), inputs.SCRIPT) || contains(fromJSON(inputs.TESTS_TO_RUN), 'all')
4754
outputs:
4855
conclusion: ${{ steps.check.conclusion }}
4956
log: ${{ steps.check.outputs.log }}
@@ -62,26 +69,56 @@ jobs:
6269
run: |
6370
docker pull nemoci.azurecr.io/nemo_container:${{ github.run_id }}
6471
72+
- name: Clean repos
73+
run: |
74+
75+
- name: Install jq
76+
run: |
77+
curl -sS https://webi.sh/jq | sh
78+
79+
- name: Create UUID
80+
id: uuid
81+
run: |
82+
echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
83+
84+
- name: Checkout NeMo
85+
uses: actions/checkout@v2
86+
with:
87+
repository: NVIDIA/NeMo
88+
path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
89+
6590
- name: Start container
6691
run: |
6792
mkdir -p $DIR
6893
94+
# Map of runner names to GPU device configurations
95+
declare -A GPU_CONFIGS=(
96+
["myVm-01"]="0,1"
97+
["myVm-02"]="2,3"
98+
["myVm-03"]="4,5"
99+
["myVm-04"]="6,7"
100+
)
101+
69102
ARG=("")
70-
if [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
103+
if [[ -n "${GPU_CONFIGS[${{ runner.name }}]}" ]]; then
104+
ARG=("--runtime=nvidia --cpus="40" --memory="400g" --gpus '\"device=${GPU_CONFIGS[${{ runner.name }}]}\"'")
105+
elif [[ "${{ inputs.RUNNER }}" != *cpu* ]]; then
71106
ARG=("--runtime=nvidia --gpus all")
72107
fi
73108
74109
cmd=$(cat <<RUN_TEST_EOF
75110
#!/bin/bash
76-
docker container rm -f nemo_container_${{ github.run_id }} || true
111+
docker container rm -f nemo_container_${{ github.run_id }}_${{ runner.name }} || true
77112
docker run \
78113
--rm \
79114
-d \
80-
--name nemo_container_${{ github.run_id }} ${ARG[@]} \
115+
--name nemo_container_${{ github.run_id }}_${{ runner.name }} ${ARG[@]} \
81116
--shm-size=64g \
82117
--env TRANSFORMERS_OFFLINE=0 \
83118
--env HYDRA_FULL_ERROR=1 \
84119
--env HF_HOME=/home/TestData/HF_HOME \
120+
--env RUN_ID=${{ github.run_id }} \
121+
--volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
85122
--volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
86123
bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
87124
RUN_TEST_EOF
@@ -107,7 +144,10 @@ jobs:
107144
(
108145
set -e
109146
110-
docker exec -t nemo_container_${{ github.run_id }} bash -c 'RUN_ID=${{ github.run_id }} bash tests/functional_tests/$SCRIPT.sh && echo "Finished successfully." || echo "Did not finish."'
147+
docker exec -t nemo_container_${{ github.run_id }}_${{ runner.name }} bash -c '\
148+
cp -r /opt/Megatron-LM/ /workspace/ && \
149+
bash tests/functional_tests/$SCRIPT.sh && \
150+
echo "Finished successfully." || echo "Did not finish."'
111151
) 2>&1 | tee $DIR/err.log
112152
113153
RUN_TEST_EOF
@@ -137,10 +177,10 @@ jobs:
137177
potential_infra_failure=$(cat $DIR/err.log | grep -Eqiw "device" && echo true || echo false)
138178
echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
139179
140-
docker exec nemo_container_${{ github.run_id }} coverage combine
141-
docker exec nemo_container_${{ github.run_id }} coverage xml
142-
docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage $DIR/.coverage
143-
docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml $DIR/coverage.xml
180+
docker exec nemo_container_${{ github.run_id }}_${{ runner.name }} coverage combine
181+
docker exec nemo_container_${{ github.run_id }}_${{ runner.name }} coverage xml
182+
docker cp nemo_container_${{ github.run_id }}_${{ runner.name }}:/workspace/.coverage $DIR/.coverage
183+
docker cp nemo_container_${{ github.run_id }}_${{ runner.name }}:/workspace/coverage.xml $DIR/coverage.xml
144184
145185
coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
146186
echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
@@ -162,7 +202,7 @@ jobs:
162202
- name: Test coverage
163203
shell: bash -x -e -u -o pipefail {0}
164204
run: |
165-
docker exec -t nemo_container_${{ github.run_id }} coverage report -i
205+
docker exec -t nemo_container_${{ github.run_id }}_${{ runner.name }} coverage report -i
166206
167207
- name: Upload artifacts
168208
uses: actions/upload-artifact@v4
@@ -174,14 +214,9 @@ jobs:
174214
${{ github.run_id }}/.coverage
175215
include-hidden-files: true
176216

177-
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
178-
if: failure() && inputs.IS_OPTIONAL == false && github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'no-fail-fast')
179-
- name: after_script
180-
if: always() && inputs.AFTER_SCRIPT != ':'
181-
run: |
182-
docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
183-
184217
- name: Container shutdown
185218
if: always()
186219
run: |
187-
docker container rm -f nemo_container_${{ github.run_id }} || true
220+
docker exec nemo_container_${{ github.run_id }}_${{ runner.name }} bash -c "chown -R $(id -u):$(id -g) /workspace"
221+
rm -rf $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }} || true
222+
docker container rm -f nemo_container_${{ github.run_id }}_${{ runner.name }} || true

0 commit comments

Comments
 (0)