Skip to content

Commit 84e0a8b

Browse files
committed
updating branch
2 parents f36a1b3 + 63b30fb commit 84e0a8b

1,238 files changed

Lines changed: 25410 additions & 6208 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/CODEOWNERS

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
.github/ @pablo-garay @ko3n1g @thomasdhc @chtruong814
2-
Dockerfile.ci @pablo-garay @ko3n1g @thomasdhc @chtruong814
2+
docker/Dockerfile.ci @pablo-garay @ko3n1g @thomasdhc @chtruong814
33
.pylintrc.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
4-
.flake8.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
4+
.flake8.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
name: "Test Template"
15+
description: "Template for running NeMo tests in a containerized environment"
16+
17+
inputs:
18+
runner:
19+
description: "Runner to use for test"
20+
required: true
21+
timeout:
22+
description: "Max runtime of test in minutes"
23+
required: false
24+
default: "10"
25+
script:
26+
description: "Test script to execute"
27+
required: true
28+
after_script:
29+
description: "Script to run after main test"
30+
required: false
31+
default: ":"
32+
is_optional:
33+
description: "If set to true, a failing test is reported as a warning instead of failing the job"
34+
required: false
35+
default: "false"
36+
is_unit_test:
37+
description: "Upload coverage as unit test"
38+
required: false
39+
default: "false"
40+
tests_to_run:
41+
description: "Tests to run"
42+
required: false
43+
default: '["all"]'
44+
image:
45+
description: "Image to use for test"
46+
required: false
47+
default: "nemo_container"
48+
cpu-only:
49+
description: "Run tests on CPU only"
50+
required: false
51+
default: "false"
52+
runs:
53+
using: "composite"
54+
steps:
55+
- name: Noop
56+
shell: bash
57+
run: |
58+
chmod -R u+rwX ${{ github.run_id }}
59+
echo "noop"
60+
61+
- name: Docker system cleanup
62+
shell: bash
63+
run: |
64+
docker system prune -af --filter "until=24h" --filter "label!=nemo.pr_number=${{ github.event.pull_request.number || 0 }}" --force || true
65+
66+
- name: Docker pull image
67+
shell: bash
68+
run: |
69+
docker pull nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }}
70+
71+
- name: Clean repos
72+
shell: bash
73+
run: |
74+
75+
- name: Create UUID
76+
id: uuid
77+
shell: bash
78+
run: |
79+
echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
80+
81+
- name: Checkout NeMo
82+
uses: actions/checkout@v2
83+
env:
84+
DIR: ${{ github.run_id }}
85+
with:
86+
repository: NVIDIA/NeMo
87+
path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
88+
89+
- name: Start container
90+
shell: bash
91+
env:
92+
DIR: ${{ github.run_id }}
93+
run: |
94+
mkdir -p $DIR
95+
96+
# Map of runner names to GPU device configurations
97+
declare -A GPU_CONFIGS=(
98+
["myVm-01"]="0,1"
99+
["myVm-02"]="2,3"
100+
["myVm-03"]="4,5"
101+
["myVm-04"]="6,7"
102+
)
103+
104+
ARG=("")
105+
if [[ "${{ inputs.cpu-only }}" == "false" ]]; then
106+
ARG=("--runtime=nvidia --gpus all")
107+
fi
108+
109+
cmd=$(cat <<RUN_TEST_EOF
110+
#!/bin/bash
111+
docker container rm -f nemo_container_${{ github.run_id }}_${{ inputs.runner }} || true
112+
docker run \
113+
--rm \
114+
-d \
115+
--name nemo_container_${{ github.run_id }}_${{ inputs.runner }} ${ARG[@]} \
116+
--shm-size=64g \
117+
--env TRANSFORMERS_OFFLINE=0 \
118+
--env HYDRA_FULL_ERROR=1 \
119+
--env HF_HOME=/home/TestData/HF_HOME \
120+
--env RUN_ID=${{ github.run_id }} \
121+
--volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
122+
--volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} \
123+
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
124+
RUN_TEST_EOF
125+
)
126+
127+
echo "$cmd" | tee "$DIR/retry_job.sh"
128+
bash $DIR/retry_job.sh
129+
130+
- name: Create run-script
131+
id: create
132+
env:
133+
DIR: ${{ github.run_id }}
134+
shell: bash
135+
run: |
136+
COVERAGE_PREFIX=$([[ "${{ inputs.is_unit_test }}" == "true" ]] && echo "unit-test" || echo "e2e")
137+
echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"
138+
139+
mkdir -p $DIR
140+
rm $DIR/.coverage || true
141+
rm $DIR/err.log || true
142+
143+
cmd=$(cat <<RUN_TEST_EOF
144+
#!/bin/bash
145+
146+
(
147+
set -e
148+
149+
docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c '\
150+
cp -r /opt/Megatron-LM/ /workspace/ && \
151+
bash tests/functional_tests/${{ inputs.script }}.sh && \
152+
echo "Finished successfully." || echo "Did not finish."'
153+
) 2>&1 | tee $DIR/err.log
154+
155+
RUN_TEST_EOF
156+
)
157+
158+
echo "timeout_in_seconds=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
159+
echo "$cmd" | tee "$DIR/job.sh"
160+
161+
- name: Run main script
162+
uses: nick-fields/retry@v3
163+
with:
164+
timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
165+
max_attempts: 3
166+
shell: bash
167+
retry_on: timeout
168+
command: /bin/bash ${{ github.run_id }}/job.sh
169+
on_retry_command: /bin/bash ${{ github.run_id }}/retry_job.sh
170+
171+
- name: Check result
  id: check
  shell: bash
  env:
    DIR: ${{ github.run_id }}
  run: |
    # Show the complete test log in the job output.
    cat $DIR/err.log

    # Publish the last 2000 bytes of the log (base64, single line) as a step output.
    log=$(tail -c 2000 $DIR/err.log | base64 -w 0)
    echo "log=$log" >> "$GITHUB_OUTPUT"

    # Flag logs that mention the word "device" (case-insensitive, whole word).
    # grep reads the file directly: with `cat file | grep -q` under pipefail,
    # grep quitting at the first match can SIGPIPE cat and turn a match into "false".
    potential_infra_failure=$(grep -Eqiw "device" $DIR/err.log && echo true || echo false)
    echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"

    # Merge coverage data inside the container and copy the results next to the log.
    docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage combine
    docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage xml
    docker cp nemo_container_${{ github.run_id }}_${{ inputs.runner }}:/workspace/.coverage $DIR/.coverage
    docker cp nemo_container_${{ github.run_id }}_${{ inputs.runner }}:/workspace/coverage.xml $DIR/coverage.xml

    # Unique artifact name consumed by the upload step.
    coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
    echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"

    # The job script prints "Finished successfully." as its last line on success.
    IS_SUCCESS=$(tail -n 1 $DIR/err.log | grep -q "Finished successfully." && echo "true" || echo "false")

    # Optional tests never fail the job; surface the failure as a warning instead.
    if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
      echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
      IS_SUCCESS=true
    fi

    if [[ "$IS_SUCCESS" == "false" ]]; then
      echo Test did not finish successfully.
      exit 1
    fi

    # Explicit success; the previous `exit $EXIT_CODE` read a variable that was never set.
    exit 0
206+
207+
- name: Test coverage
208+
shell: bash -x -e -u -o pipefail {0}
209+
run: |
210+
docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage report -i
211+
212+
- name: Upload artifacts
213+
uses: actions/upload-artifact@v4
214+
if: ${{ steps.check.outputs.coverage_report != 'none' }}
215+
with:
216+
name: ${{ steps.check.outputs.coverage_report }}
217+
path: |
218+
${{ github.run_id }}/coverage.xml
219+
${{ github.run_id }}/.coverage
220+
include-hidden-files: true
221+
222+
- name: Container shutdown
  if: always()
  shell: bash
  run: |
    # Best-effort cleanup that runs even after earlier failures. Every command is
    # guarded with `|| true` so that one failing command (for example `docker exec`
    # against a container that has already exited) cannot skip the remaining cleanup.

    # Hand ownership of files created inside the container back to the runner user
    # so the checkout directory can be deleted.
    docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c "chown -R $(id -u):$(id -g) /workspace" || true
    rm -rf $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }} || true
    docker container rm -f nemo_container_${{ github.run_id }}_${{ inputs.runner }} || true

.github/scripts/__init__.py

Whitespace-only changes.
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
#!/usr/bin/env python3
16+
import json
17+
import os
18+
import sys
19+
from typing import Any, Dict, List, Set
20+
21+
import click
22+
import git
23+
24+
import nemo_dependencies
25+
26+
27+
def get_changed_files(source_sha: str, target_sha: str) -> List[str]:
    """
    Return the paths of the files that differ between two commits.

    The repository is opened two directory levels above this file, and the diff is
    taken from ``target_sha`` (baseline side) to ``source_sha`` (changed side).
    Both sides of every diff entry are reported, so a renamed file contributes its
    old path as well as its new path.

    Args:
        source_sha: SHA of the commit on the changed side of the diff.
        target_sha: SHA of the commit on the baseline side of the diff.

    Returns:
        De-duplicated list of changed file paths, in the order the diff yields them.

    On any error the message is printed to stderr and the process exits with status 1.
    """
    try:
        # Initialize the repo object - go up two levels from this file's location
        repo = git.Repo(os.path.join(os.path.dirname(__file__), "..", ".."))

        # Get the diff between target and source
        diff_index = repo.commit(target_sha).diff(repo.commit(source_sha))

        # A dict keeps insertion order while dropping duplicates; unchanged-name
        # entries carry the same value in a_path and b_path.
        changed_files: Dict[str, None] = {}
        for diff in diff_index:
            for path in (diff.a_path, diff.b_path):
                if path:
                    changed_files[path] = None

        return list(changed_files)

    except git.exc.GitCommandError as e:
        print(f"Error fetching changelog: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)
52+
53+
54+
@click.command()
@click.option('--source-sha', type=str, required=True, help='Source commit SHA')
@click.option('--target-sha', type=str, required=True, help='Target commit sha')
def main(source_sha: str, target_sha: str):
    """
    Main function to fetch and output the changelog and changed files.
    """
    # NOTE: the docstring above doubles as the click --help text, so it is kept verbatim.
    #
    # Steps:
    #   1. print the changed files between the two commits as sorted JSON;
    #   2. look each changed file up in the dependency graph built by nemo_dependencies;
    #   3. write the unique set of mapped test modules to test_modules.json in the
    #      current working directory.

    print("\nChanged files:")
    changed_files = get_changed_files(source_sha, target_sha)
    print(json.dumps(sorted(changed_files), indent=2))

    # Root passed to the graph builder: dirname applied three times to this file's absolute path.
    script_path = os.path.abspath(__file__)
    scripts_dir = os.path.dirname(script_path)
    nemo_root = os.path.dirname(os.path.dirname(scripts_dir))
    dependencies = nemo_dependencies.build_dependency_graph(nemo_root)

    # Collect straight into a set so duplicates never accumulate.
    affected: Set[str] = set()
    for path in changed_files:
        if path in dependencies:
            affected.update(dependencies[path])
    test_modules: List[str] = list(affected)

    with open("test_modules.json", "w", encoding="utf-8") as f:
        json.dump(test_modules, f)
81+
82+
83+
if __name__ == "__main__":
84+
main()

0 commit comments

Comments
 (0)