diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml index 8727b366f4..a8220769a1 100644 --- a/.github/actions/test-template/action.yml +++ b/.github/actions/test-template/action.yml @@ -41,19 +41,6 @@ inputs: description: "Run tests on CPU only" required: false default: "false" - azure-client-id: - description: "Azure Client ID" - required: true - azure-tenant-id: - description: "Azure Tenant ID" - required: true - azure-subscription-id: - description: "Azure Subscription ID" - required: true - has-azure-credentials: - description: "Has Azure credentials" - required: false - default: "false" is_fork_pr: description: "Whether this is a pull request from a fork" required: false @@ -77,19 +64,9 @@ inputs: runs: using: "composite" steps: - - name: Install Azure CLI - if: ${{ inputs.has-azure-credentials == 'true' }} - shell: bash - run: | - for i in 1 2 3; do - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash && break - echo "Attempt $i failed, retrying in 10s..." 
- sleep 10 - done - - name: Install uuidgen shell: bash -x -e -u -o pipefail {0} - if: ${{ contains(inputs.runner, 'gcp') }} + if: ${{ contains(inputs.runner, 'aws') || contains(inputs.runner, 'gcp') }} run: | for i in 1 2 3; do apt-get update && apt-get install -y uuid-runtime && break @@ -97,11 +74,6 @@ runs: sleep 10 done - - name: Docker system cleanup - shell: bash - run: | - docker system prune -af --filter "until=48h" --force || true - - name: Docker pull image shell: bash run: | @@ -138,6 +110,7 @@ runs: docker run --rm -u root --runtime=nvidia --gpus all \ --shm-size=64g \ --env TRANSFORMERS_OFFLINE=0 \ + --env GHA_RUNNER=${{ inputs.runner }} \ --env HYDRA_FULL_ERROR=1 \ --env HF_HOME=/home/TestData/nemo-rl/hf_home \ --env HF_DATASETS_CACHE=/home/TestData/nemo-rl/hf_datasets_cache \ diff --git a/.github/workflows/_build_container.yml b/.github/workflows/_build_container.yml new file mode 100644 index 0000000000..4b6e527e06 --- /dev/null +++ b/.github/workflows/_build_container.yml @@ -0,0 +1,172 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: Build container + +on: + workflow_call: + inputs: + build-ref: + required: false + default: ${{ github.sha }} + description: Ref, branch, or SHA to build. + type: string + image-name: + required: true + description: Name of the image to build and push. 
+ type: string + build-args: + required: false + default: "" + description: Additional Docker build args. + type: string + build-contexts: + required: false + default: "" + description: Additional Docker build contexts. + type: string + dockerfile: + required: true + description: Path to the Dockerfile. + type: string + platform: + required: true + description: Docker build platform. + type: string + runner: + required: true + description: Runner to use for the build. + type: string + registry: + required: true + description: Container registry to push to. + type: string + target: + required: false + default: "" + description: Dockerfile stage to build. + type: string + +permissions: + contents: read + pull-requests: read + +defaults: + run: + shell: bash -x -e -u -o pipefail {0} + +jobs: + build: + runs-on: ${{ inputs.runner }} + env: + REGISTRY: ${{ inputs.registry }} + IMAGE_NAME: ${{ inputs.image-name }} + GH_REF: ${{ github.ref }} + RUN_ID: ${{ github.run_id }} + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + ref: ${{ inputs.build-ref }} + submodules: recursive + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Get recently merged PR cache refs + id: recent_pr_cache_refs + uses: actions/github-script@v8 + env: + REGISTRY: ${{ inputs.registry }} + IMAGE_NAME: ${{ inputs.image-name }} + with: + script: | + const [owner, repo] = process.env.GITHUB_REPOSITORY.split("/"); + const result = await github.graphql(` + query($owner: String!, $repo: String!) 
{ + repository(owner: $owner, name: $repo) { + pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) { + nodes { + number + } + } + } + } + `, { owner, repo }); + + const refs = result.repository.pullRequests.nodes + .map(({ number }) => `type=registry,ref=${process.env.REGISTRY}/${process.env.IMAGE_NAME}:${number}-buildcache,mode=max`) + .join("\n"); + + core.setOutput("cache-from", refs); + core.info(`Found ${result.repository.pullRequests.nodes.length} recently merged PR cache refs.`); + + - name: Compute build metadata + id: build_meta + shell: bash + run: | + set -euo pipefail + + PR_NUMBER="" + if [[ "$GH_REF" =~ refs/heads/pull-request/([0-9]+) ]]; then + PR_NUMBER="${BASH_REMATCH[1]}" + fi + + TAGS=("$REGISTRY/$IMAGE_NAME:$RUN_ID") + if [[ "$GH_REF" == "refs/heads/main" ]]; then + CACHE_KEY="main" + TAGS+=("$REGISTRY/$IMAGE_NAME:main") + elif [[ -n "$PR_NUMBER" ]]; then + CACHE_KEY="$PR_NUMBER" + TAGS+=("$REGISTRY/$IMAGE_NAME:$PR_NUMBER") + else + CACHE_KEY=$(printf '%s' "${GITHUB_REF_NAME:-$RUN_ID}" | tr '/' '-' | tr -cd '[:alnum:]._-') + if [[ -z "$CACHE_KEY" ]]; then + CACHE_KEY="$RUN_ID" + fi + fi + + CACHE_FROM=( + "type=registry,ref=$REGISTRY/$IMAGE_NAME:main-buildcache,mode=max" + ) + if [[ "$CACHE_KEY" != "main" ]]; then + CACHE_FROM+=("type=registry,ref=$REGISTRY/$IMAGE_NAME:$CACHE_KEY-buildcache,mode=max") + fi + + { + echo "tags<<EOF" + printf '%s\n' "${TAGS[@]}" + echo "EOF" + echo "cache-from<<EOF" + printf '%s\n' "${CACHE_FROM[@]}" + echo "EOF" + echo "cache-to=type=registry,ref=$REGISTRY/$IMAGE_NAME:$CACHE_KEY-buildcache,mode=max" + } >> "$GITHUB_OUTPUT" + + - name: Build and push + uses: docker/build-push-action@v5 + with: + file: ${{ inputs.dockerfile }} + push: true + context: .
+ platforms: ${{ inputs.platform }} + build-contexts: ${{ inputs.build-contexts }} + build-args: ${{ inputs.build-args }} + cache-from: | + ${{ steps.build_meta.outputs.cache-from }} + ${{ steps.recent_pr_cache_refs.outputs.cache-from }} + cache-to: ${{ steps.build_meta.outputs.cache-to }} + no-cache: false + tags: | + ${{ steps.build_meta.outputs.tags }} + target: ${{ inputs.target }} diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml new file mode 100644 index 0000000000..ce9677163a --- /dev/null +++ b/.github/workflows/cicd-approve-test-queue.yml @@ -0,0 +1,34 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Approve Test Queue + +on: + schedule: + - cron: "*/5 * * * *" + workflow_dispatch: + +jobs: + approve-test-queue: + if: github.repository == 'NVIDIA-NeMo/RL' + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_test_approval_queue.yml@v1.3.0 + with: + workflow_name: CICD NeMo RL + max_concurrency_internal: ${{ fromJSON(vars.MAX_CONCURRENCY || '3') }} + max_concurrency_external: ${{ fromJSON(vars.MAX_CONCURRENCY_EXTERNAL || '3') }} + secrets: + PAT: ${{ secrets.PAT }} + NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }} + SLACK_CI_CHANNEL_WEBHOOK: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }} + SLACK_TEAM_GROUP_ID: ${{ secrets.SLACK_TEAM_GROUP_ID }} diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e1c802b5bf..8f58db020c 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -42,6 +42,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }} cancel-in-progress: true +env: + GB200_CONTAINER_REGISTRY: ${{ vars.GB200_CONTAINER_REGISTRY }} + jobs: pre-flight: runs-on: ubuntu-latest @@ -176,16 +179,27 @@ jobs: org-member-pre-flight: uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.80.1 with: - default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }} - non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }} - default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }} - non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }} - default_registry: ${{ vars.DEFAULT_CONTAINER_REGISTRY }} - non_nvidia_registry: ${{ vars.NON_NVIDIA_CONTAINER_REGISTRY }} + default_runner_prefix: ${{ vars.DEFAULT_H100_RUNNER }} + non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_H100_RUNNER }} + default_test_data_path: ${{ vars.DEFAULT_H100_TEST_DATA_PATH }} + non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_H100_TEST_DATA_PATH }} + default_registry: ${{ 
vars.DEFAULT_H100_CONTAINER_REGISTRY }} + non_nvidia_registry: ${{ vars.NON_NVIDIA_H100_CONTAINER_REGISTRY }} sso_users_filename: ${{ vars.SSO_USERS_FILENAME }} secrets: NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }} + gb200-config: + runs-on: ubuntu-latest + outputs: + registry: ${{ steps.config.outputs.registry }} + steps: + - name: Configure GB200 registry + id: config + env: + GB200_REGISTRY: ${{ env.GB200_CONTAINER_REGISTRY }} + run: echo "registry=$GB200_REGISTRY" | tee -a "$GITHUB_OUTPUT" + pr-branch-up-to-date-check: name: Check if PR branch is up to date needs: [pre-flight] @@ -278,26 +292,109 @@ jobs: - name: Minimize uv cache run: uv cache prune --ci + cicd-wait-in-queue: + name: Wait in test approval queue + needs: [pre-flight, lint-check] + runs-on: ubuntu-latest + environment: test + if: >- + ${{ + always() && + startsWith(github.ref, 'refs/heads/pull-request/') && + contains('Lfast L0 L1 L2', needs.pre-flight.outputs.test_level) && + needs.pre-flight.result == 'success' && + needs.lint-check.result == 'success' && + !cancelled() + }} + steps: + - name: Approved + run: echo "Approved to run CI tests." 
+ sphinx-build: - needs: [pre-flight] - if: ${{ needs.pre-flight.outputs.test_level != 'none' }} + needs: [pre-flight, cicd-wait-in-queue] + if: >- + ${{ + always() && + needs.pre-flight.result == 'success' && + needs.pre-flight.outputs.test_level != 'none' && + ( + needs.cicd-wait-in-queue.result == 'success' || + needs.pre-flight.outputs.test_level == 'docs' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && + !cancelled() + }} uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0 build-container: - if: ${{ needs.pre-flight.outputs.test_level != 'none' && needs.pre-flight.outputs.image_tag == '' }} - needs: [pre-flight, org-member-pre-flight] - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.78.0 + name: Build H100 container + if: >- + ${{ + always() && + needs.pre-flight.result == 'success' && + needs.org-member-pre-flight.result == 'success' && + needs.pre-flight.outputs.test_level != 'none' && + needs.pre-flight.outputs.image_tag == '' && + ( + needs.cicd-wait-in-queue.result == 'success' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && + !cancelled() + }} + needs: [pre-flight, org-member-pre-flight, cicd-wait-in-queue] + permissions: + contents: read + pull-requests: read + uses: ./.github/workflows/_build_container.yml with: build-ref: ${{ needs.pre-flight.outputs.test_sha }} image-name: ${{ vars.CI_CONTAINER_NAME }} dockerfile: docker/Dockerfile - runner: ${{ contains(needs.org-member-pre-flight.outputs.runner_prefix, 'azure') && format('{0}-gpu-x2', needs.org-member-pre-flight.outputs.runner_prefix) || contains(needs.org-member-pre-flight.outputs.runner_prefix, 'gcp') && format('{0}-gpu-x4', needs.org-member-pre-flight.outputs.runner_prefix) }} - image-label: ${{ vars.CI_CONTAINER_NAME }} - target: release + platform: linux/amd64 registry: ${{ needs.org-member-pre-flight.outputs.registry }} + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + target: release 
build-contexts: | - nemo-rl=${{ github.run_id }}/ - ${{ vars.UV_BUILD_CACHE == 'enabled' && format('uv-cache-seed=docker-image://{0}/{1}-uv-cache:latest', needs.org-member-pre-flight.outputs.registry, vars.CI_CONTAINER_NAME) || '' }} + nemo-rl=. + ${{ vars.UV_BUILD_CACHE == 'enabled' && format('uv-cache-seed=docker-image://{0}/{1}:uv-cache', needs.org-member-pre-flight.outputs.registry, vars.CI_CONTAINER_NAME) || '' }} + build-args: | + MAX_JOBS=4 + NEMO_RL_COMMIT=${{ needs.pre-flight.outputs.test_sha }} + + build-container-gb200: + name: Build GB200/GCP container + if: >- + ${{ + always() && + needs.pre-flight.result == 'success' && + needs.org-member-pre-flight.result == 'success' && + needs.gb200-config.result == 'success' && + needs.pre-flight.outputs.test_level != 'none' && + needs.pre-flight.outputs.image_tag == '' && + needs.org-member-pre-flight.outputs.is_member == 'true' && + contains('L1 L2', needs.pre-flight.outputs.test_level) && + ( + needs.cicd-wait-in-queue.result == 'success' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && + !cancelled() + }} + needs: [pre-flight, org-member-pre-flight, gb200-config, cicd-wait-in-queue] + permissions: + contents: read + pull-requests: read + uses: ./.github/workflows/_build_container.yml + with: + build-ref: ${{ needs.pre-flight.outputs.test_sha }} + image-name: ${{ vars.CI_CONTAINER_NAME }} + dockerfile: docker/Dockerfile + platform: linux/arm64 + registry: ${{ needs.gb200-config.outputs.registry }} + runner: ${{ vars.GB200_RUNNER }} + target: release + build-contexts: | + nemo-rl=. 
+ ${{ vars.UV_BUILD_CACHE == 'enabled' && format('uv-cache-seed=docker-image://{0}/{1}:uv-cache', needs.gb200-config.outputs.registry, vars.CI_CONTAINER_NAME) || '' }} build-args: | MAX_JOBS=4 NEMO_RL_COMMIT=${{ needs.pre-flight.outputs.test_sha }} @@ -308,10 +405,10 @@ jobs: if: >- ${{ github.ref == 'refs/heads/main' && + vars.UV_BUILD_CACHE == 'enabled' && needs.build-container.result == 'success' }} - runs-on: ${{ format('{0}-gpu-x2', needs.org-member-pre-flight.outputs.runner_prefix) }} - environment: nemo-ci + runs-on: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} env: REGISTRY: ${{ needs.org-member-pre-flight.outputs.registry }} IMAGE_NAME: ${{ vars.CI_CONTAINER_NAME }} @@ -320,7 +417,40 @@ jobs: run: | set -euo pipefail SRC="${REGISTRY}/${IMAGE_NAME}:${{ github.run_id }}" - DST="${REGISTRY}/${IMAGE_NAME}-uv-cache:latest" + DST="${REGISTRY}/${IMAGE_NAME}:uv-cache" + + docker pull "${SRC}" + CID=$(docker create "${SRC}" true) + mkdir -p /tmp/uv-cache + docker cp "${CID}:/root/.cache/uv/." 
/tmp/uv-cache/ + docker rm "${CID}" + + printf 'FROM scratch\nCOPY uv-cache/ /\n' > /tmp/Dockerfile.uv-cache + docker build -t "${DST}" -f /tmp/Dockerfile.uv-cache /tmp + docker push "${DST}" + + docker rmi "${SRC}" "${DST}" 2>/dev/null || true + rm -rf /tmp/uv-cache /tmp/Dockerfile.uv-cache + + update-uv-cache-gb200: + name: Update GB200 uv build cache + needs: [build-container-gb200, gb200-config] + if: >- + ${{ + github.ref == 'refs/heads/main' && + vars.UV_BUILD_CACHE == 'enabled' && + needs.build-container-gb200.result == 'success' + }} + runs-on: ${{ vars.GB200_RUNNER }} + env: + REGISTRY: ${{ needs.gb200-config.outputs.registry }} + IMAGE_NAME: ${{ vars.CI_CONTAINER_NAME }} + steps: + - name: Extract and push uv cache image + run: | + set -euo pipefail + SRC="${REGISTRY}/${IMAGE_NAME}:${{ github.run_id }}" + DST="${REGISTRY}/${IMAGE_NAME}:uv-cache" docker pull "${SRC}" CID=$(docker create "${SRC}" true) @@ -341,27 +471,32 @@ jobs: matrix: include: - script: Docs_Tests - runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2 - needs: [pre-flight, build-container, org-member-pre-flight] + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + needs: [pre-flight, build-container, org-member-pre-flight, cicd-wait-in-queue] if: >- ${{ ( always() && contains('docs Lfast L0 L1 L2', needs.pre-flight.outputs.test_level) && needs.pre-flight.result == 'success' && + needs.org-member-pre-flight.result == 'success' && + ( + needs.cicd-wait-in-queue.result == 'success' || + needs.pre-flight.outputs.test_level == 'docs' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') ) && !cancelled() }} runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} - environment: nemo-ci steps: - name: Checkout uses: actions/checkout@v6 - name: main uses: ./.github/actions/test-template with: - runner: ${{ 
runner.name }} + runner: ${{ matrix.runner }} registry: ${{ needs.org-member-pre-flight.outputs.registry }} image: ${{ vars.CI_CONTAINER_NAME }} image-tag: ${{ needs.pre-flight.outputs.image_tag }} @@ -373,21 +508,65 @@ jobs: cicd-unit-tests: strategy: fail-fast: false + max-parallel: 16 matrix: include: - - script: L0_Unit_Tests_Generation - runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2 - - script: L0_Unit_Tests_Policy - runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2 + - script: L0_Unit_Tests_Vllm_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Vllm_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Vllm_3 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Sglang + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Mcore + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Mcore_Policy_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Mcore_Policy_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Mcore_Policy_3 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Automodel + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Automodel_Policy_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Automodel_Policy_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Automodel_Policy_3 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Models_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Models_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - 
script: L0_Unit_Tests_Models_3 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Models_4 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Environments + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Nemo_Gym + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Algorithms + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Data + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L0_Unit_Tests_Distributed + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} - script: L0_Unit_Tests_Other - runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2 - needs: [pre-flight, build-container, cicd-doc-tests, org-member-pre-flight] + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + needs: [pre-flight, build-container, cicd-doc-tests, org-member-pre-flight, cicd-wait-in-queue] if: >- ${{ ( always() && contains('L0 L1 L2 Lfast', needs.pre-flight.outputs.test_level) && needs.pre-flight.result == 'success' && + needs.org-member-pre-flight.result == 'success' && + ( + needs.cicd-wait-in-queue.result == 'success' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') && (needs.cicd-doc-tests.result == 'success' || needs.cicd-doc-tests.result == 'skipped') ) && !cancelled() @@ -404,7 +583,7 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} with: - runner: ${{ runner.name }} + runner: ${{ matrix.runner }} script: ${{ matrix.script }} registry: ${{ needs.org-member-pre-flight.outputs.registry }} test_data_path: ${{ needs.org-member-pre-flight.outputs.test_data_path }} @@ -414,18 +593,148 @@ jobs: cpu-only: ${{ matrix.cpu-only || false }} test-commit-sha: ${{ needs.pre-flight.outputs.test_sha }} + 
unit-test-script-check: + name: Check unit test script coverage + needs: [pre-flight, cicd-wait-in-queue] + if: >- + ${{ + always() && + contains('L0 L1 L2 Lfast', needs.pre-flight.outputs.test_level) && + needs.pre-flight.result == 'success' && + ( + needs.cicd-wait-in-queue.result == 'success' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && + !cancelled() + }} + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ needs.pre-flight.outputs.test_sha }} + + - name: Verify L0 unit scripts are in the workflow + run: | + set -euo pipefail + + expected=$(mktemp) + configured=$(mktemp) + + find tests/unit -maxdepth 1 -type f -name 'L0_Unit*.sh' \ + -exec basename {} .sh \; | sort -u > "$expected" + + { + grep -E '^[[:space:]]*-[[:space:]]*script:[[:space:]]*L0_Unit' .github/workflows/cicd-main.yml || true + } | sed -E 's/^[[:space:]]*-[[:space:]]*script:[[:space:]]*//' | sort -u > "$configured" + + missing=$(comm -23 "$expected" "$configured") + if [[ -n "$missing" ]]; then + echo "The following tests/unit/L0_Unit*.sh scripts are missing from .github/workflows/cicd-main.yml:" + printf '%s\n' "$missing" + exit 1 + fi + + echo "All L0 unit scripts are included in .github/workflows/cicd-main.yml." 
+ + functional-test-script-check: + name: Check functional test script coverage + needs: [pre-flight, cicd-wait-in-queue] + if: >- + ${{ + always() && + contains('L1 L2 Lfast', needs.pre-flight.outputs.test_level) && + needs.pre-flight.result == 'success' && + ( + needs.cicd-wait-in-queue.result == 'success' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && + !cancelled() + }} + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ needs.pre-flight.outputs.test_sha }} + + - name: Verify L1 functional scripts are in the workflow + run: | + set -euo pipefail + + expected=$(mktemp) + configured=$(mktemp) + + find tests/functional -maxdepth 1 -type f -name 'L1_Functional*.sh' \ + -exec basename {} .sh \; | sort -u > "$expected" + + { + grep -E '^[[:space:]]*-[[:space:]]*script:[[:space:]]*L1_Functional' .github/workflows/cicd-main.yml || true + } | sed -E 's/^[[:space:]]*-[[:space:]]*script:[[:space:]]*//' | sort -u > "$configured" + + missing=$(comm -23 "$expected" "$configured") + if [[ -n "$missing" ]]; then + echo "The following tests/functional/L1_Functional*.sh scripts are missing from .github/workflows/cicd-main.yml:" + printf '%s\n' "$missing" + exit 1 + fi + + echo "All L1 functional scripts are included in .github/workflows/cicd-main.yml." 
+ cicd-functional-tests: strategy: fail-fast: false + max-parallel: 16 matrix: include: - - script: L1_Functional_Tests_GPU - runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2 - needs: [pre-flight, build-container, cicd-unit-tests, org-member-pre-flight] + - script: L1_Functional_Tests_Megatron_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Megatron_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Megatron_3 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_AutoModel + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_SGLang + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Gym + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_GRPO_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_GRPO_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_GRPO_3 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_SFT + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Eval + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Other_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Other_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + needs: [pre-flight, build-container, cicd-unit-tests, functional-test-script-check, org-member-pre-flight, cicd-wait-in-queue] runs-on: ${{ matrix.runner }} - if: ${{ contains('L1 L2', needs.pre-flight.outputs.test_level) }} + if: >- + ${{ + always() && + contains('L1 L2', needs.pre-flight.outputs.test_level) && + 
needs.pre-flight.result == 'success' && + needs.org-member-pre-flight.result == 'success' && + ( + needs.cicd-wait-in-queue.result == 'success' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && + (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') && + needs.cicd-unit-tests.result == 'success' && + needs.functional-test-script-check.result == 'success' && + !cancelled() + }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} - environment: nemo-ci steps: - name: Checkout uses: actions/checkout@v6 @@ -434,25 +743,125 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} with: - runner: ${{ runner.name }} + runner: ${{ matrix.runner }} registry: ${{ needs.org-member-pre-flight.outputs.registry }} image: ${{ vars.CI_CONTAINER_NAME }} test_data_path: ${{ needs.org-member-pre-flight.outputs.test_data_path }} script: ${{ matrix.script }} test-commit-sha: ${{ needs.pre-flight.outputs.test_sha }} + cicd-functional-tests-gb200: + strategy: + fail-fast: false + max-parallel: 16 + matrix: + include: + - script: L1_Functional_Tests_Megatron_1 + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_Megatron_2 + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_Megatron_3 + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_AutoModel + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_SGLang + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_Gym + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_GRPO_1 + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_GRPO_2 + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_GRPO_3 + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_SFT + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_Eval + runner: ${{ vars.GB200_RUNNER }} + - script: L1_Functional_Tests_Other_1 + runner: ${{ vars.GB200_RUNNER }} + - script: 
L1_Functional_Tests_Other_2 + runner: ${{ vars.GB200_RUNNER }} + needs: [pre-flight, build-container-gb200, cicd-unit-tests, functional-test-script-check, org-member-pre-flight, gb200-config, cicd-wait-in-queue] + runs-on: ${{ matrix.runner }} + if: >- + ${{ + always() && + contains('L1 L2', needs.pre-flight.outputs.test_level) && + needs.org-member-pre-flight.outputs.is_member == 'true' && + needs.pre-flight.result == 'success' && + needs.org-member-pre-flight.result == 'success' && + ( + needs.cicd-wait-in-queue.result == 'success' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && + (needs.build-container-gb200.result == 'success' || needs.build-container-gb200.result == 'skipped') && + needs.cicd-unit-tests.result == 'success' && + needs.functional-test-script-check.result == 'success' && + !cancelled() + }} + name: gb200_${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} + steps: + - name: Checkout + uses: actions/checkout@v6 + - name: main + uses: ./.github/actions/test-template + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + runner: ${{ matrix.runner }} + registry: ${{ needs.gb200-config.outputs.registry }} + image: ${{ vars.CI_CONTAINER_NAME }} + test_data_path: ${{ needs.org-member-pre-flight.outputs.test_data_path }} + image-tag: ${{ needs.pre-flight.outputs.image_tag }} + script: ${{ matrix.script }} + test-commit-sha: ${{ needs.pre-flight.outputs.test_sha }} + cicd-fast-functional-tests: strategy: fail-fast: false matrix: include: - - script: L1_Functional_Tests_GPU - runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2 - needs: [pre-flight, org-member-pre-flight] - if: ${{ contains('Lfast', needs.pre-flight.outputs.test_level) }} + - script: L1_Functional_Tests_Megatron_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Megatron_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Megatron_3 + runner: 
${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_AutoModel + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Gym + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_GRPO_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_GRPO_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_GRPO_3 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_SFT + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Eval + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Other_1 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + - script: L1_Functional_Tests_Other_2 + runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }} + needs: [pre-flight, functional-test-script-check, org-member-pre-flight, cicd-wait-in-queue] + if: >- + ${{ + always() && + contains('Lfast', needs.pre-flight.outputs.test_level) && + needs.pre-flight.result == 'success' && + needs.org-member-pre-flight.result == 'success' && + ( + needs.cicd-wait-in-queue.result == 'success' || + !startsWith(github.ref, 'refs/heads/pull-request/') + ) && + needs.functional-test-script-check.result == 'success' && + !cancelled() + }} runs-on: ${{ matrix.runner }} name: fast_${{ matrix.script }} - environment: nemo-ci steps: - name: Checkout uses: actions/checkout@v6 @@ -461,7 +870,7 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} with: - runner: ${{ runner.name }} + runner: ${{ matrix.runner }} script: ${{ matrix.script }} image-tag: ${{ needs.pre-flight.outputs.image_tag }} registry: ${{ needs.org-member-pre-flight.outputs.registry }} @@ -475,13 +884,19 @@ jobs: runs-on: ubuntu-latest needs: - pre-flight + - org-member-pre-flight - 
pr-branch-up-to-date-check - lint-check + - cicd-wait-in-queue - sphinx-build - build-container + - build-container-gb200 - cicd-doc-tests - cicd-unit-tests + - unit-test-script-check + - functional-test-script-check - cicd-functional-tests + - cicd-functional-tests-gb200 - cicd-fast-functional-tests steps: - name: main @@ -491,24 +906,48 @@ jobs: ALL_SUCCESS: >- ${{ needs.lint-check.result == 'success' && + (needs.cicd-wait-in-queue.result == 'success' || needs.cicd-wait-in-queue.result == 'skipped') && (needs.pr-branch-up-to-date-check.result == 'success' || needs.pr-branch-up-to-date-check.result == 'skipped') && ( needs.pre-flight.outputs.test_level != 'none' && needs.sphinx-build.result == 'success' && (needs.build-container.result == 'success' || needs.build-container.result == 'skipped') && + (needs.build-container-gb200.result == 'success' || needs.build-container-gb200.result == 'skipped') && ( ( (needs.cicd-doc-tests.result == 'success' || needs.cicd-doc-tests.result == 'skipped') && - (needs.cicd-unit-tests.result == 'skipped' || needs.cicd-unit-tests.result == 'success') && - (needs.cicd-functional-tests.result == 'skipped' || needs.cicd-functional-tests.result == 'success') && - (needs.cicd-fast-functional-tests.result == 'skipped' || needs.cicd-fast-functional-tests.result == 'success') + ( + !contains('L0 L1 L2 Lfast', needs.pre-flight.outputs.test_level) || + needs.cicd-unit-tests.result == 'success' + ) && + ( + !contains('L0 L1 L2 Lfast', needs.pre-flight.outputs.test_level) || + needs.unit-test-script-check.result == 'success' + ) && + ( + !contains('L1 L2 Lfast', needs.pre-flight.outputs.test_level) || + needs.functional-test-script-check.result == 'success' + ) && + ( + !contains('L1 L2', needs.pre-flight.outputs.test_level) || + needs.cicd-functional-tests.result == 'success' + ) && + ( + needs.org-member-pre-flight.outputs.is_member != 'true' || + !contains('L1 L2', needs.pre-flight.outputs.test_level) || + 
needs.cicd-functional-tests-gb200.result == 'success' + ) && + ( + !contains('Lfast', needs.pre-flight.outputs.test_level) || + needs.cicd-fast-functional-tests.result == 'success' + ) ) ) ) }} - CI_SKIP: ${{ needs.pre-flight.outputs.has_cicd_skip_label }} + CI_SKIP: ${{ needs.pre-flight.outputs.has_skip_cicd }} TEST_LEVEL: ${{ needs.pre-flight.outputs.test_level }} run: | SUMMARY=$(echo $JOB_RESULTS | jq 'to_entries[] | .key + ": " + .value.result' | tr -d '"') diff --git a/pyproject.toml b/pyproject.toml index 73c2b2e1b9..e75ce7e279 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -220,6 +220,7 @@ test = [ "pytest-cov", "pytest-asyncio", "pytest-testmon", + "pytest-shard", ] [tool.uv.sources] diff --git a/tests/functional/L1_Functional_Tests_AutoModel.sh b/tests/functional/L1_Functional_Tests_AutoModel.sh new file mode 100644 index 0000000000..9ea77645e3 --- /dev/null +++ b/tests/functional/L1_Functional_Tests_AutoModel.sh @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test uv run --no-sync bash ./tests/functional/dpo_automodel_lora.sh +run_test uv run --no-sync bash ./tests/functional/grpo_automodel_lora.sh +run_test uv run --no-sync bash ./tests/functional/grpo_automodel_lora_async.sh +run_test uv run --no-sync bash ./tests/functional/grpo_automodel_lora_non_colocated.sh +run_test uv run --no-sync bash ./tests/functional/sft_automodel_lora.sh +run_test uv run --no-sync bash ./tests/functional/test_automodel_extra_installed_correctly.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_Eval.sh b/tests/functional/L1_Functional_Tests_Eval.sh new file mode 100644 index 0000000000..3d6a3b63e2 --- /dev/null +++ b/tests/functional/L1_Functional_Tests_Eval.sh @@ -0,0 +1,42 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test uv run --no-sync bash ./tests/functional/eval.sh +run_test uv run --no-sync bash ./tests/functional/eval_async.sh +run_test fast uv run --no-sync bash ./tests/functional/eval_audio.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_GPU.sh b/tests/functional/L1_Functional_Tests_GPU.sh deleted file mode 100644 index 7f3dba5053..0000000000 --- a/tests/functional/L1_Functional_Tests_GPU.sh +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/bin/bash -set -xeuo pipefail # Exit immediately if a command exits with a non-zero status - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
- -cd ${PROJECT_ROOT} - -# run_test [fast] -# - "run_test fast " = always runs (both fast and full modes) -# - "run_test " = only runs in full mode; skipped when FAST=1 -run_test() { - if [[ "$1" == "fast" ]]; then - shift - time "$@" - elif [[ "${FAST:-0}" == "1" ]]; then - echo "FAST: Skipping: $*" - else - time "$@" - fi -} - -# This test is intentionally not run with uv run --no-sync to verify that the frozen environment is working correctly. -run_test bash ./tests/functional/grpo_frozen_env.sh -run_test bash ./tests/functional/test_frozen_env.sh - -run_test fast uv run --no-sync bash ./tests/functional/audio_grpo_megatron.sh -run_test fast uv run --no-sync bash ./tests/functional/distillation.sh -run_test uv run --no-sync bash ./tests/functional/distillation_megatron.sh -run_test fast uv run --no-sync bash ./tests/functional/dpo.sh -run_test uv run --no-sync bash ./tests/functional/dpo_automodel_lora.sh -run_test fast uv run --no-sync bash ./tests/functional/dpo_megatron_lora.sh -run_test uv run --no-sync bash ./tests/functional/dpo_megatron.sh -run_test uv run --no-sync bash ./tests/functional/eval.sh -run_test uv run --no-sync bash ./tests/functional/eval_async.sh -run_test fast uv run --no-sync bash ./tests/functional/eval_audio.sh -run_test fast uv run --no-sync bash ./tests/functional/gdpo.sh -run_test fast uv run --no-sync bash ./tests/functional/gdpo_async_grpo.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo_dp_simple.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo_dp_mooncake.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo_async_gym.sh -run_test uv run --no-sync bash ./tests/functional/grpo_automodel_lora.sh -run_test uv run --no-sync bash ./tests/functional/grpo_automodel_lora_async.sh -run_test uv run --no-sync bash ./tests/functional/grpo_automodel_lora_non_colocated.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo_fsdp2.sh 
-run_test uv run --no-sync bash ./tests/functional/grpo_megatron.sh -run_test uv run --no-sync bash ./tests/functional/grpo_megatron_mbridge_restore.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo_megatron_eagle3_online.sh -run_test uv run --no-sync bash ./tests/functional/grpo_megatron_generation.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo_megatron_lora.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo_megatron_lora_async.sh -run_test uv run --no-sync bash ./tests/functional/grpo_multiple_dataloaders.sh -run_test uv run --no-sync bash ./tests/functional/grpo_multiturn.sh -run_test uv run --no-sync bash ./tests/functional/grpo_non_colocated.sh -run_test uv run --no-sync bash ./tests/functional/grpo_rm_env.sh -run_test uv run --no-sync bash ./tests/functional/grpo_sglang.sh -run_test fast uv run --no-sync bash ./tests/functional/grpo_topp_topk.sh -run_test uv run --no-sync bash ./tests/functional/prorlv2.sh -run_test uv run --no-sync bash ./tests/functional/qa_distillation_megatron.sh -run_test uv run --no-sync bash ./tests/functional/rm.sh -run_test fast uv run --no-sync bash ./tests/functional/sft.sh -run_test uv run --no-sync bash ./tests/functional/sft_automodel_lora.sh -run_test uv run --no-sync bash ./tests/functional/sft_avlm.sh -run_test uv run --no-sync bash ./tests/functional/sft_megatron.sh -run_test uv run --no-sync bash ./tests/functional/sft_megatron_lora.sh -run_test uv run --no-sync bash ./tests/functional/sft_resume_diamond.sh -run_test uv run --no-sync bash ./tests/functional/test_automodel_extra_installed_correctly.sh -run_test fast uv run --no-sync bash ./tests/functional/test_converters.sh -run_test uv run --no-sync bash ./tests/functional/test_decode_vs_prefill.sh -run_test uv run --no-sync bash ./tests/functional/test_mcore_extra_installed_correctly.sh -run_test uv run --no-sync bash ./tests/functional/vlm_grpo.sh - -# Research functional tests (self-discovery) -if [[ "${FAST:-0}" != "1" ]]; 
then - for test_script in research/*/tests/functional/*.sh; do - project_dir=$(echo $test_script | cut -d/ -f1-2) - pushd $project_dir - time uv run --no-sync bash $(echo $test_script | cut -d/ -f3-) - popd - done -fi - -cd ${PROJECT_ROOT}/tests -coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_GRPO_1.sh b/tests/functional/L1_Functional_Tests_GRPO_1.sh new file mode 100644 index 0000000000..f2a63930bd --- /dev/null +++ b/tests/functional/L1_Functional_Tests_GRPO_1.sh @@ -0,0 +1,47 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) + +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +# This test is intentionally not run with uv run --no-sync to verify that the frozen environment is working correctly. 
+run_test bash ./tests/functional/grpo_frozen_env.sh + +run_test fast uv run --no-sync bash ./tests/functional/gdpo.sh +run_test fast uv run --no-sync bash ./tests/functional/grpo.sh +run_test uv run --no-sync bash ./tests/functional/grpo_multiple_dataloaders.sh +run_test fast uv run --no-sync bash ./tests/functional/grpo_dp_simple.sh +run_test fast uv run --no-sync bash ./tests/functional/grpo_dp_mooncake.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_GRPO_2.sh b/tests/functional/L1_Functional_Tests_GRPO_2.sh new file mode 100644 index 0000000000..b1d8c26d26 --- /dev/null +++ b/tests/functional/L1_Functional_Tests_GRPO_2.sh @@ -0,0 +1,43 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test fast uv run --no-sync bash ./tests/functional/gdpo_async_grpo.sh +run_test fast uv run --no-sync bash ./tests/functional/grpo_fsdp2.sh +run_test uv run --no-sync bash ./tests/functional/grpo_multiturn.sh +run_test uv run --no-sync bash ./tests/functional/grpo_non_colocated.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_GRPO_3.sh b/tests/functional/L1_Functional_Tests_GRPO_3.sh new file mode 100644 index 0000000000..e64b56cefe --- /dev/null +++ b/tests/functional/L1_Functional_Tests_GRPO_3.sh @@ -0,0 +1,42 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test uv run --no-sync bash ./tests/functional/grpo_rm_env.sh +run_test fast uv run --no-sync bash ./tests/functional/grpo_topp_topk.sh +run_test uv run --no-sync bash ./tests/functional/vlm_grpo.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_Gym.sh b/tests/functional/L1_Functional_Tests_Gym.sh new file mode 100644 index 0000000000..33dc450d7b --- /dev/null +++ b/tests/functional/L1_Functional_Tests_Gym.sh @@ -0,0 +1,40 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test fast uv run --no-sync bash ./tests/functional/grpo_async_gym.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_Megatron_1.sh b/tests/functional/L1_Functional_Tests_Megatron_1.sh new file mode 100644 index 0000000000..dd5a0640f6 --- /dev/null +++ b/tests/functional/L1_Functional_Tests_Megatron_1.sh @@ -0,0 +1,44 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test fast uv run --no-sync bash ./tests/functional/audio_grpo_megatron.sh +run_test uv run --no-sync bash ./tests/functional/grpo_megatron.sh +run_test uv run --no-sync bash ./tests/functional/grpo_megatron_mbridge_restore.sh +run_test fast uv run --no-sync bash ./tests/functional/grpo_megatron_eagle3_online.sh +run_test uv run --no-sync bash ./tests/functional/grpo_megatron_generation.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_Megatron_2.sh b/tests/functional/L1_Functional_Tests_Megatron_2.sh new file mode 100644 index 0000000000..8884617d53 --- /dev/null +++ b/tests/functional/L1_Functional_Tests_Megatron_2.sh @@ -0,0 +1,43 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test fast uv run --no-sync bash ./tests/functional/grpo_megatron_lora.sh +run_test fast uv run --no-sync bash ./tests/functional/grpo_megatron_lora_async.sh +run_test fast uv run --no-sync bash ./tests/functional/dpo_megatron_lora.sh +run_test uv run --no-sync bash ./tests/functional/sft_megatron_lora.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_Megatron_3.sh b/tests/functional/L1_Functional_Tests_Megatron_3.sh new file mode 100644 index 0000000000..341aad7234 --- /dev/null +++ b/tests/functional/L1_Functional_Tests_Megatron_3.sh @@ -0,0 +1,43 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test uv run --no-sync bash ./tests/functional/distillation_megatron.sh +run_test uv run --no-sync bash ./tests/functional/qa_distillation_megatron.sh +run_test uv run --no-sync bash ./tests/functional/dpo_megatron.sh +run_test uv run --no-sync bash ./tests/functional/sft_megatron.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_Other_1.sh b/tests/functional/L1_Functional_Tests_Other_1.sh new file mode 100644 index 0000000000..7cb7f33f61 --- /dev/null +++ b/tests/functional/L1_Functional_Tests_Other_1.sh @@ -0,0 +1,55 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +# This test is intentionally not run with uv run --no-sync to verify that the frozen environment is working correctly. +run_test bash ./tests/functional/test_frozen_env.sh + +run_test fast uv run --no-sync bash ./tests/functional/test_converters.sh +run_test uv run --no-sync bash ./tests/functional/test_decode_vs_prefill.sh +run_test uv run --no-sync bash ./tests/functional/test_mcore_extra_installed_correctly.sh + +# Research functional tests (self-discovery) +if [[ "${FAST:-0}" != "1" ]]; then + for test_script in research/*/tests/functional/*.sh; do + project_dir=$(echo $test_script | cut -d/ -f1-2) + pushd $project_dir + time uv run --no-sync bash $(echo $test_script | cut -d/ -f3-) + popd + done +fi + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_Other_2.sh b/tests/functional/L1_Functional_Tests_Other_2.sh new file mode 100644 index 0000000000..7c18df6865 --- /dev/null +++ b/tests/functional/L1_Functional_Tests_Other_2.sh @@ -0,0 +1,43 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) + +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test fast uv run --no-sync bash ./tests/functional/distillation.sh +run_test fast uv run --no-sync bash ./tests/functional/dpo.sh +run_test uv run --no-sync bash ./tests/functional/prorlv2.sh +run_test uv run --no-sync bash ./tests/functional/rm.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_SFT.sh b/tests/functional/L1_Functional_Tests_SFT.sh new file mode 100644 index 0000000000..7b1b952e4b --- /dev/null +++ b/tests/functional/L1_Functional_Tests_SFT.sh @@ -0,0 +1,42 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test fast uv run --no-sync bash ./tests/functional/sft.sh +run_test uv run --no-sync bash ./tests/functional/sft_avlm.sh +run_test uv run --no-sync bash ./tests/functional/sft_resume_diamond.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/L1_Functional_Tests_SGLang.sh b/tests/functional/L1_Functional_Tests_SGLang.sh new file mode 100644 index 0000000000..c7143e59fa --- /dev/null +++ b/tests/functional/L1_Functional_Tests_SGLang.sh @@ -0,0 +1,40 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +set -xeuo pipefail # Exit immediately if a command exits with a non-zero status + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# run_test [fast] +# - "run_test fast " = always runs (both fast and full modes) +# - "run_test " = only runs in full mode; skipped when FAST=1 +run_test() { + if [[ "$1" == "fast" ]]; then + shift + time "$@" + elif [[ "${FAST:-0}" == "1" ]]; then + echo "FAST: Skipping: $*" + else + time "$@" + fi +} + +run_test uv run --no-sync bash ./tests/functional/grpo_sglang.sh + +cd ${PROJECT_ROOT}/tests +coverage combine .coverage* diff --git a/tests/functional/test_converters.sh b/tests/functional/test_converters.sh index ef789ecf90..1306414b17 100644 --- a/tests/functional/test_converters.sh +++ b/tests/functional/test_converters.sh @@ -1 +1,9 @@ -uv run --extra mcore tests/functional/test_converter_roundtrip.py \ No newline at end of file +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..") + +cd "$PROJECT_ROOT" +uv run --extra mcore coverage run -a --data-file="$PROJECT_ROOT/tests/.coverage" --source="$PROJECT_ROOT/nemo_rl" \ + tests/functional/test_converter_roundtrip.py diff --git a/tests/functional/test_decode_vs_prefill.sh b/tests/functional/test_decode_vs_prefill.sh index 23d05307ae..ba44872159 100644 --- a/tests/functional/test_decode_vs_prefill.sh +++ b/tests/functional/test_decode_vs_prefill.sh @@ -1,4 +1,12 @@ -uv run --extra vllm python tools/model_diagnostics/2.long_generation_decode_vs_prefill.py \ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..") + +cd "$PROJECT_ROOT" +uv run --extra vllm coverage run -a --data-file="$PROJECT_ROOT/tests/.coverage" --source="$PROJECT_ROOT/nemo_rl" \ + tools/model_diagnostics/2.long_generation_decode_vs_prefill.py \ --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \ --prompts arc \ --max-tokens 8192 \ diff --git a/tests/run_unit.sh b/tests/run_unit.sh index 0366d6864b..336189e156 100755 
--- a/tests/run_unit.sh +++ b/tests/run_unit.sh @@ -40,7 +40,13 @@ else pytest_args="$@" fi -if ! pytest $pytest_args; then +set +e +pytest $pytest_args +exit_code=$? +set -e +if [[ $exit_code -eq 5 ]]; then + echo "No tests collected; skipping." +elif [[ $exit_code -ne 0 ]]; then echo "[ERROR]: Unit tests failed." exit 1 fi diff --git a/tests/unit/L0_Unit_Tests_Algorithms.sh b/tests/unit/L0_Unit_Tests_Algorithms.sh new file mode 100644 index 0000000000..137c242531 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Algorithms.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: Algorithm tests not covered by mcore/automodel shards +# mcore-marked tests (e.g., test_sequence_packing_gradients) are picked up +# by L0_Unit_Tests_Mcore shard via conftest.py filtering. + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --no-sync bash -x ./tests/run_unit.sh "unit/algorithms/" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated diff --git a/tests/unit/L0_Unit_Tests_Automodel.sh b/tests/unit/L0_Unit_Tests_Automodel.sh new file mode 100644 index 0000000000..1770127ce3 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Automodel.sh @@ -0,0 +1,21 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: All automodel-marked tests except policy worker tests +# Policy worker automodel tests run in L0_Unit_Tests_Automodel_Policy + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra automodel bash -x ./tests/run_unit.sh "unit/" "--ignore=unit/models/policy/" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated --automodel-only diff --git a/tests/unit/L0_Unit_Tests_Automodel_Policy_1.sh b/tests/unit/L0_Unit_Tests_Automodel_Policy_1.sh new file mode 100644 index 0000000000..5e4f4b29de --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Automodel_Policy_1.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: automodel-marked policy worker tests (test_dtensor_worker*.py, test_automodel_types.py) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra automodel bash -x ./tests/run_unit.sh "unit/models/policy/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=0 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated --automodel-only diff --git a/tests/unit/L0_Unit_Tests_Automodel_Policy_2.sh b/tests/unit/L0_Unit_Tests_Automodel_Policy_2.sh new file mode 100644 index 0000000000..9cb575b08c --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Automodel_Policy_2.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: automodel-marked policy worker tests (test_dtensor_worker*.py, test_automodel_types.py) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra automodel bash -x ./tests/run_unit.sh "unit/models/policy/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=1 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated --automodel-only diff --git a/tests/unit/L0_Unit_Tests_Automodel_Policy_3.sh b/tests/unit/L0_Unit_Tests_Automodel_Policy_3.sh new file mode 100644 index 0000000000..9e3f43aec3 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Automodel_Policy_3.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: automodel-marked policy worker tests (test_dtensor_worker*.py, test_automodel_types.py) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra automodel bash -x ./tests/run_unit.sh "unit/models/policy/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=2 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated --automodel-only diff --git a/tests/unit/L0_Unit_Tests_Data.sh b/tests/unit/L0_Unit_Tests_Data.sh new file mode 100644 index 0000000000..9ed0423c2e --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Data.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: Data pipeline tests (datasets, data processing, message utils) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --no-sync bash -x ./tests/run_unit.sh "unit/data/" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated diff --git a/tests/unit/L0_Unit_Tests_Distributed.sh b/tests/unit/L0_Unit_Tests_Distributed.sh new file mode 100644 index 0000000000..ad33c14648 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Distributed.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: Distributed infrastructure tests (worker groups, virtual cluster, logprob, model utils) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --no-sync bash -x ./tests/run_unit.sh "unit/distributed/" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated diff --git a/tests/unit/L0_Unit_Tests_Environments.sh b/tests/unit/L0_Unit_Tests_Environments.sh new file mode 100644 index 0000000000..88e032bf99 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Environments.sh @@ -0,0 +1,21 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: Environment tests (base only, not nemo_gym-marked) +# nemo_gym-marked tests are picked up by L0_Unit_Tests_Nemo_Gym shard. + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --no-sync bash -x ./tests/run_unit.sh "unit/environments/" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated diff --git a/tests/unit/L0_Unit_Tests_Generation.sh b/tests/unit/L0_Unit_Tests_Generation.sh deleted file mode 100644 index c9a974afb8..0000000000 --- a/tests/unit/L0_Unit_Tests_Generation.sh +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/bin/bash -set -xeuo pipefail # Exit immediately if a command exits with a non-zero status - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
- -cd ${PROJECT_ROOT} - -# Source exclusion list for FAST mode -EXCLUDED_UNIT_TESTS=() -if [[ "${FAST:-0}" == "1" ]]; then - source ${SCRIPT_DIR}/excluded_unit_tests.sh -fi - -uv run tests/unit/prepare_unit_test_assets.py - -TEST_PATHS=("unit/models/generation/") -IGNORE=() - -uv run --no-sync bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated - -# Check and run mcore tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra mcore pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --mcore-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No mcore tests to run" -else - uv run --extra mcore bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --mcore-only -fi - -# Check and run automodel tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra automodel pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --automodel-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No automodel tests to run" -else - uv run --extra automodel bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --automodel-only -fi - -# Check and run vllm tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra vllm pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --vllm-only -q >/dev/null 2>&1; echo $?) 
-if [[ $exit_code -eq 5 ]]; then - echo "No vllm tests to run" -else - uv run --extra vllm bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only -fi - -# Check and run sglang tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra sglang pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --sglang-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No sglang tests to run" -else - uv run --extra sglang bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --sglang-only -fi diff --git a/tests/unit/L0_Unit_Tests_Mcore.sh b/tests/unit/L0_Unit_Tests_Mcore.sh new file mode 100644 index 0000000000..19dcf39345 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Mcore.sh @@ -0,0 +1,21 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: All mcore-marked tests except policy worker tests +# Policy worker mcore tests run in L0_Unit_Tests_Mcore_Policy + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra mcore bash -x ./tests/run_unit.sh "unit/" "--ignore=unit/models/policy/" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated --mcore-only diff --git a/tests/unit/L0_Unit_Tests_Mcore_Policy_1.sh b/tests/unit/L0_Unit_Tests_Mcore_Policy_1.sh new file mode 100644 index 0000000000..fd4fc76bc8 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Mcore_Policy_1.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: mcore-marked policy worker tests (test_megatron_worker.py) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra mcore bash -x ./tests/run_unit.sh "unit/models/policy/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=0 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated --mcore-only diff --git a/tests/unit/L0_Unit_Tests_Mcore_Policy_2.sh b/tests/unit/L0_Unit_Tests_Mcore_Policy_2.sh new file mode 100644 index 0000000000..864cbde8fe --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Mcore_Policy_2.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: mcore-marked policy worker tests (test_megatron_worker.py) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra mcore bash -x ./tests/run_unit.sh "unit/models/policy/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=1 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated --mcore-only diff --git a/tests/unit/L0_Unit_Tests_Mcore_Policy_3.sh b/tests/unit/L0_Unit_Tests_Mcore_Policy_3.sh new file mode 100644 index 0000000000..04a629ffb6 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Mcore_Policy_3.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: mcore-marked policy worker tests (test_megatron_worker.py) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra mcore bash -x ./tests/run_unit.sh "unit/models/policy/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=2 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated --mcore-only diff --git a/tests/unit/L0_Unit_Tests_Models_1.sh b/tests/unit/L0_Unit_Tests_Models_1.sh new file mode 100644 index 0000000000..75c8109626 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Models_1.sh @@ -0,0 +1,23 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: Model tests not covered by mcore/automodel/generation shards +# Picks up base (unmarked) tests from models/policy/, models/dtensor/, models/huggingface/ +# Tests in models/megatron/ (all mcore) and models/automodel/ (all automodel) are excluded +# by conftest.py filtering since this is a base run. 
+ +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --no-sync bash -x ./tests/run_unit.sh "unit/models/" "--ignore=unit/models/generation/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=0 --num-shards=4 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated diff --git a/tests/unit/L0_Unit_Tests_Models_2.sh b/tests/unit/L0_Unit_Tests_Models_2.sh new file mode 100644 index 0000000000..b8d7253896 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Models_2.sh @@ -0,0 +1,23 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: Model tests not covered by mcore/automodel/generation shards +# Picks up base (unmarked) tests from models/policy/, models/dtensor/, models/huggingface/ +# Tests in models/megatron/ (all mcore) and models/automodel/ (all automodel) are excluded +# by conftest.py filtering since this is a base run. + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --no-sync bash -x ./tests/run_unit.sh "unit/models/" "--ignore=unit/models/generation/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=1 --num-shards=4 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated diff --git a/tests/unit/L0_Unit_Tests_Models_3.sh b/tests/unit/L0_Unit_Tests_Models_3.sh new file mode 100644 index 0000000000..984c5c5b62 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Models_3.sh @@ -0,0 +1,23 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: Model tests not covered by mcore/automodel/generation shards +# Picks up base (unmarked) tests from models/policy/, models/dtensor/, models/huggingface/ +# Tests in models/megatron/ (all mcore) and models/automodel/ (all automodel) are excluded +# by conftest.py filtering since this is a base run. + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --no-sync bash -x ./tests/run_unit.sh "unit/models/" "--ignore=unit/models/generation/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=2 --num-shards=4 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated diff --git a/tests/unit/L0_Unit_Tests_Models_4.sh b/tests/unit/L0_Unit_Tests_Models_4.sh new file mode 100644 index 0000000000..84ea65b0ea --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Models_4.sh @@ -0,0 +1,23 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash +# Shard: Model tests not covered by mcore/automodel/generation shards +# Picks up base (unmarked) tests from models/policy/, models/dtensor/, models/huggingface/ +# Tests in models/megatron/ (all mcore) and models/automodel/ (all automodel) are excluded +# by conftest.py filtering since this is a base run. + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --no-sync bash -x ./tests/run_unit.sh "unit/models/" "--ignore=unit/models/generation/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=3 --num-shards=4 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated diff --git a/tests/unit/L0_Unit_Tests_Nemo_Gym.sh b/tests/unit/L0_Unit_Tests_Nemo_Gym.sh new file mode 100644 index 0000000000..288291ffb4 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Nemo_Gym.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: All nemo_gym-marked tests anywhere in the codebase + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +uv run --extra nemo_gym bash -x ./tests/run_unit.sh "unit/" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --nemo-gym-only -vv diff --git a/tests/unit/L0_Unit_Tests_Other.sh b/tests/unit/L0_Unit_Tests_Other.sh index fa830aeb0b..424e1ce091 100644 --- a/tests/unit/L0_Unit_Tests_Other.sh +++ b/tests/unit/L0_Unit_Tests_Other.sh @@ -13,65 +13,21 @@ # limitations under the License. #!/bin/bash -set -xeuo pipefail # Exit immediately if a command exits with a non-zero status +# Shard: Catch-all for everything not in other shards +# Covers: experience (base), utils, tools, evals, rewards, root-level tests +# Extra-marked tests are picked up by their respective shards (Mcore, Automodel, etc.) -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" -cd ${PROJECT_ROOT} +IGNORE=( + "--ignore=unit/models/" + "--ignore=unit/environments/" + "--ignore=unit/algorithms/" + "--ignore=unit/data/" + "--ignore=unit/distributed/" +) -# Source exclusion list for FAST mode -EXCLUDED_UNIT_TESTS=() -if [[ "${FAST:-0}" == "1" ]]; then - source ${SCRIPT_DIR}/excluded_unit_tests.sh -fi - -uv run tests/unit/prepare_unit_test_assets.py - -TEST_PATHS=("unit/") -IGNORE=("--ignore=unit/models/generation/" "--ignore=unit/models/policy/") - -uv run --no-sync bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated - -# Check and run mcore tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra mcore pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --mcore-only -q >/dev/null 2>&1; echo $?) 
-if [[ $exit_code -eq 5 ]]; then - echo "No mcore tests to run" -else - uv run --extra mcore bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --mcore-only -fi - -# Check and run automodel tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra automodel pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --automodel-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No automodel tests to run" -else - uv run --extra automodel bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --automodel-only -fi - -# Check and run vllm tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra vllm pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --vllm-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No vllm tests to run" -else - uv run --extra vllm bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only -fi - -# Check and run sglang tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra sglang pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --sglang-only -q >/dev/null 2>&1; echo $?) 
-if [[ $exit_code -eq 5 ]]; then - echo "No sglang tests to run" -else - uv run --extra sglang bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --sglang-only -fi - -# Check and run nemo_gym tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra nemo_gym pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --nemo-gym-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No nemo_gym tests to run" -else - uv run --extra nemo_gym bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --nemo-gym-only -vv -fi +uv run --no-sync bash -x ./tests/run_unit.sh "unit/" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated # Skip research tests in fast mode if [[ "${FAST:-0}" != "1" ]]; then diff --git a/tests/unit/L0_Unit_Tests_Policy.sh b/tests/unit/L0_Unit_Tests_Policy.sh deleted file mode 100644 index f19691c421..0000000000 --- a/tests/unit/L0_Unit_Tests_Policy.sh +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#!/bin/bash -set -xeuo pipefail # Exit immediately if a command exits with a non-zero status - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) - -cd ${PROJECT_ROOT} - -# Source exclusion list for FAST mode -EXCLUDED_UNIT_TESTS=() -if [[ "${FAST:-0}" == "1" ]]; then - source ${SCRIPT_DIR}/excluded_unit_tests.sh -fi - -uv run tests/unit/prepare_unit_test_assets.py - -TEST_PATHS=("unit/models/policy/") -IGNORE=() - -uv run --no-sync bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated - -# Check and run mcore tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra mcore pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --mcore-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No mcore tests to run" -else - uv run --extra mcore bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --mcore-only -fi - -# Check and run automodel tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra automodel pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --automodel-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No automodel tests to run" -else - uv run --extra automodel bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --automodel-only -fi - -# Check and run vllm tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra vllm pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --vllm-only -q >/dev/null 2>&1; echo $?) 
-if [[ $exit_code -eq 5 ]]; then - echo "No vllm tests to run" -else - uv run --extra vllm bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only -fi - -# Check and run sglang tests -exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra sglang pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --hf-gated --sglang-only -q >/dev/null 2>&1; echo $?) -if [[ $exit_code -eq 5 ]]; then - echo "No sglang tests to run" -else - uv run --extra sglang bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --sglang-only -fi diff --git a/tests/unit/L0_Unit_Tests_Sglang.sh b/tests/unit/L0_Unit_Tests_Sglang.sh new file mode 100644 index 0000000000..5bf60a092e --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Sglang.sh @@ -0,0 +1,29 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: All SGLang tests (base sglang files + sglang-marked tests anywhere) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +SGLANG_PATHS=( + "unit/models/generation/test_sglang_generation.py" + "unit/models/generation/test_sglang_utils.py" +) + +# Base run on sglang files (picks up unmarked tests) +uv run --no-sync bash -x ./tests/run_unit.sh "${SGLANG_PATHS[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated + +# sglang-only across all unit tests (catch-all) +uv run --extra sglang bash -x ./tests/run_unit.sh "unit/" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --sglang-only diff --git a/tests/unit/L0_Unit_Tests_Vllm_1.sh b/tests/unit/L0_Unit_Tests_Vllm_1.sh new file mode 100644 index 0000000000..08e4e7acda --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Vllm_1.sh @@ -0,0 +1,24 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: vLLM generation tests (base + vllm-marked) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +# Base run (tests without extra markers) +uv run --no-sync bash -x ./tests/run_unit.sh "unit/models/generation/test_vllm*.py" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=0 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated + +# vllm-only run (catch-all across all unit tests) +uv run --extra vllm bash -x ./tests/run_unit.sh "unit/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=0 --num-shards=3 --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only diff --git a/tests/unit/L0_Unit_Tests_Vllm_2.sh b/tests/unit/L0_Unit_Tests_Vllm_2.sh new file mode 100644 index 0000000000..39f6a2a287 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Vllm_2.sh @@ -0,0 +1,24 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: vLLM generation tests (base + vllm-marked) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +# Base run (tests without extra markers) +uv run --no-sync bash -x ./tests/run_unit.sh "unit/models/generation/test_vllm*.py" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=1 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated + +# vllm-only run (catch-all across all unit tests) +uv run --extra vllm bash -x ./tests/run_unit.sh "unit/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=1 --num-shards=3 --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only diff --git a/tests/unit/L0_Unit_Tests_Vllm_3.sh b/tests/unit/L0_Unit_Tests_Vllm_3.sh new file mode 100644 index 0000000000..bdeac8a678 --- /dev/null +++ b/tests/unit/L0_Unit_Tests_Vllm_3.sh @@ -0,0 +1,24 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#!/bin/bash +# Shard: vLLM generation tests (base + vllm-marked) + +source "$(dirname "${BASH_SOURCE[0]}")/run_unit_shard_common.sh" + +# Base run (tests without extra markers) +uv run --no-sync bash -x ./tests/run_unit.sh "unit/models/generation/test_vllm*.py" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=2 --num-shards=3 --cov=nemo_rl --cov-report=term-missing --cov-report=json --hf-gated + +# vllm-only run (catch-all across all unit tests) +uv run --extra vllm bash -x ./tests/run_unit.sh "unit/" "${EXCLUDED_UNIT_TESTS[@]}" --shard-id=2 --num-shards=3 --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --vllm-only diff --git a/tests/unit/data/__init__.py b/tests/unit/data/__init__.py new file mode 100644 index 0000000000..4fc25d0d3c --- /dev/null +++ b/tests/unit/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/unit/experience/test_rollouts.py b/tests/unit/experience/test_rollouts.py index e60253c50a..3b4ef1dede 100644 --- a/tests/unit/experience/test_rollouts.py +++ b/tests/unit/experience/test_rollouts.py @@ -946,7 +946,13 @@ def _standardize(d: dict) -> dict: final_batch["total_reward"] = final_batch["total_reward"].tolist() final_batch["loss_multiplier"] = final_batch["loss_multiplier"].tolist() final_batch["length"] = final_batch["length"].tolist() - final_batch["truncated"] = final_batch["truncated"].tolist() + # truncated depends on exact generation output which is not reproducible, + # so just verify each value is a bool rather than checking exact values + if "truncated" in final_batch: + assert all( + isinstance(v, (bool, int)) for v in final_batch["truncated"].tolist() + ) + final_batch.pop("truncated") for key in d["rollout_metrics"]: # We remove these fields from comparison since we cannot guarantee exact generation reproducibility diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index c8d1a6c156..1b0b06cdb6 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -146,6 +146,17 @@ } +def skip_fp8_known_failures() -> None: + device_name = torch.cuda.get_device_name() + if any(gpu_name in device_name for gpu_name in ("H100", "GB200")): + # TODO(https://github.com/NVIDIA-NeMo/RL/issues/2081): Re-enable these + # FP8 vLLM tests once the known H100/GB200 failures are fixed. + pytest.skip( + f"Skipping FP8 vLLM test on {device_name} due to a known failure. 
" + "See https://github.com/NVIDIA-NeMo/RL/issues/2081" + ) + + @pytest.mark.parametrize( "colocated,async_engine,expected_method,expected_kwargs", [ @@ -981,37 +992,37 @@ async def run_hf_train_process( lm_policy.shutdown() -@pytest.mark.timeout(420) @pytest.mark.asyncio @pytest.mark.parametrize( ("async_engine", "cpu_offload", "vllm_precision", "enable_lora"), [ - (True, False, "bfloat16", False), - (False, True, "bfloat16", False), - (True, False, "fp8", False), - (False, True, "fp8", False), - # LoRA tests (requires dtensor v2 / automodel) - pytest.param(False, False, "bfloat16", True, marks=pytest.mark.automodel), - pytest.param(True, False, "bfloat16", True, marks=pytest.mark.automodel), + pytest.param(True, False, "bfloat16", False, marks=pytest.mark.timeout(900)), + pytest.param(False, True, "bfloat16", False, marks=pytest.mark.timeout(900)), + pytest.param(True, False, "fp8", False, marks=pytest.mark.timeout(900)), + pytest.param(False, True, "fp8", False, marks=pytest.mark.timeout(900)), + # LoRA tests require dtensor v2 / automodel and take longer in CI. + pytest.param( + False, + False, + "bfloat16", + True, + marks=[pytest.mark.automodel, pytest.mark.timeout(900)], + ), + pytest.param( + True, + False, + "bfloat16", + True, + marks=[pytest.mark.automodel, pytest.mark.timeout(900)], + ), ], ) async def test_vllm_generation_with_hf_training_colocated( cluster, tokenizer, async_engine, cpu_offload, vllm_precision, enable_lora ): """This test validates that DTensor policy can work together with colocated vLLM policy.""" - device_name = torch.cuda.get_device_name(0) - if vllm_precision == "fp8" and "GB200" in device_name: - pytest.skip( - "Skipping FP8 test on GB200 until fixed. 
See https://github.com/NVIDIA-NeMo/RL/issues/2081" - ) - - # Skip the fp8 tests if the GPU is not H100 or newer (compute capability < 9.0) if vllm_precision == "fp8": - major_capability, _ = torch.cuda.get_device_capability() - if major_capability < 9: - pytest.skip( - f"Skipping FP8 test. GPU compute capability {major_capability}.0 is < 9.0 (H100 required)." - ) + skip_fp8_known_failures() # Create VllmGeneration Policy print("Creating vLLM policy...") @@ -1052,20 +1063,31 @@ async def test_vllm_generation_with_hf_training_colocated( ) -@pytest.mark.timeout(300) @pytest.mark.asyncio @pytest.mark.parametrize( ("async_engine", "cpu_offload", "vllm_precision", "enable_lora"), [ - (True, False, "bfloat16", False), - (False, True, "bfloat16", False), + pytest.param(True, False, "bfloat16", False, marks=pytest.mark.timeout(900)), + pytest.param(False, True, "bfloat16", False, marks=pytest.mark.timeout(900)), # NOTE: non-colocated FP8 tests fail on main as of 3/9/2026 with # avg_prob_mult_error=1.13 > 1.08 threshold. Left unskipped to match main. - (True, False, "fp8", False), - (False, True, "fp8", False), - # LoRA tests (requires dtensor v2 / automodel) - pytest.param(False, False, "bfloat16", True, marks=pytest.mark.automodel), - pytest.param(True, False, "bfloat16", True, marks=pytest.mark.automodel), + pytest.param(True, False, "fp8", False, marks=pytest.mark.timeout(900)), + pytest.param(False, True, "fp8", False, marks=pytest.mark.timeout(900)), + # LoRA tests require dtensor v2 / automodel and take longer in CI. 
+ pytest.param( + False, + False, + "bfloat16", + True, + marks=[pytest.mark.automodel, pytest.mark.timeout(900)], + ), + pytest.param( + True, + False, + "bfloat16", + True, + marks=[pytest.mark.automodel, pytest.mark.timeout(900)], + ), ], ) async def test_vllm_generation_with_hf_training_non_colocated( @@ -1076,19 +1098,8 @@ async def test_vllm_generation_with_hf_training_non_colocated( vllm_precision, enable_lora, ): - device_name = torch.cuda.get_device_name(0) - if vllm_precision == "fp8" and "GB200" in device_name: - pytest.skip( - "Skipping FP8 test on GB200 until fixed. See https://github.com/NVIDIA-NeMo/RL/issues/2081" - ) - - # Skip the fp8 tests if the GPU is not H100 or newer (compute capability < 9.0) if vllm_precision == "fp8": - major_capability, _ = torch.cuda.get_device_capability() - if major_capability < 9: - pytest.skip( - f"Skipping FP8 test. GPU compute capability {major_capability}.0 is < 9.0 (H100 required)." - ) + skip_fp8_known_failures() """This test validates that DTensor policy can work together with non-colocated vLLM policy.""" generation_cluster_separate = get_generation_cluster_separate(1) @@ -1714,25 +1725,15 @@ async def test_vllm_http_server_correct_merged_tokens_matches_baseline( vllm_generation.shutdown() -@pytest.mark.timeout(600) +@pytest.mark.timeout(900) @pytest.mark.parametrize("tensor_parallel_size", [1, 2]) @pytest.mark.parametrize("vllm_precision", ["bfloat16", "fp8"]) def test_vllm_weight_update_and_prefix_cache_reset( cluster, tokenizer, tensor_parallel_size, vllm_precision ): """Test that the vLLM prefix cache is correctly reset when weights change.""" - device_name = torch.cuda.get_device_name(0) - if vllm_precision == "fp8" and "GB200" in device_name: - pytest.skip( - "Skipping FP8 test on GB200 until fixed. See https://github.com/NVIDIA-NeMo/RL/issues/2081" - ) - if vllm_precision == "fp8": - major_capability, _ = torch.cuda.get_device_capability() - if major_capability < 9: - pytest.skip( - f"Skipping FP8 test. 
GPU compute capability {major_capability}.0 is < 9.0 (H100 required)." - ) + skip_fp8_known_failures() from nemo_rl.models.policy.lm_policy import Policy @@ -2130,7 +2131,7 @@ async def test_vllm_refit_non_colocated_update_weights( @pytest.mark.mcore -@pytest.mark.timeout(360) +@pytest.mark.timeout(600) @pytest.mark.parametrize("tensor_parallel_size", [1, 2]) @pytest.mark.parametrize("vllm_precision", ["bfloat16", "fp8"]) @pytest.mark.parametrize("kv_cache_dtype", [None, "fp8"]) @@ -2141,24 +2142,13 @@ def test_vllm_generation_with_megatron_training( This test validates that vLLM and Megatron policies can work together. """ - device_name = torch.cuda.get_device_name(0) - if vllm_precision == "fp8" and "GB200" in device_name: - pytest.skip( - "Skipping FP8 test on GB200 until fixed. See https://github.com/NVIDIA-NeMo/RL/issues/2081" - ) + if vllm_precision == "fp8": + skip_fp8_known_failures() # Skip invalid configurations: kv_cache_dtype=fp8 requires precision=fp8 if kv_cache_dtype == "fp8" and vllm_precision != "fp8": pytest.skip("kv_cache_dtype='fp8' requires precision='fp8'") - # Skip the fp8 tests if the GPU is not H100 or newer (compute capability < 9.0) - if vllm_precision == "fp8": - major_capability, _ = torch.cuda.get_device_capability() - if major_capability < 9: - pytest.skip( - f"Skipping FP8 test. GPU compute capability {major_capability}.0 is < 9.0 (H100 required)." - ) - if cluster.num_gpus_per_node < tensor_parallel_size: pytest.skip(f"Need at least {tensor_parallel_size} GPUs for this test") @@ -2321,19 +2311,8 @@ def test_vllm_generation_with_megatron_training_moe_model( This test validates that vLLM and Megatron policies can work together. """ - device_name = torch.cuda.get_device_name(0) - if vllm_precision == "fp8" and "GB200" in device_name: - pytest.skip( - "Skipping FP8 test on GB200 until fixed. 
See https://github.com/NVIDIA-NeMo/RL/issues/2081" - ) - - # Skip the fp8 tests if the GPU is not H100 or newer (compute capability < 9.0) if vllm_precision == "fp8": - major_capability, _ = torch.cuda.get_device_capability() - if major_capability < 9: - pytest.skip( - f"Skipping FP8 test. GPU compute capability {major_capability}.0 is < 9.0 (H100 required)." - ) + skip_fp8_known_failures() model_name = "moonshotai/Moonlight-16B-A3B-Instruct" expert_parallel_size = 8 diff --git a/tests/unit/models/policy/__init__.py b/tests/unit/models/policy/__init__.py new file mode 100644 index 0000000000..4fc25d0d3c --- /dev/null +++ b/tests/unit/models/policy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index 4043e3c8a3..a1737de3bd 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pprint -import time import pytest import ray @@ -27,6 +26,7 @@ from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.lm_policy import Policy +from nemo_rl.utils.flops_tracker import FLOPTracker, get_default_hf_config from tests.unit.test_utils import SimpleLossFn @@ -1046,7 +1046,7 @@ def test_dtensor_v1_policy_flops_range_check( ): """Test that the returned FLOPS is within a reasonable range using dtensor backend. - Performs 2 warmup iterations and measures FLOPS for the next 3 iterations. + Performs 2 warmup iterations and checks FLOPS for the next 3 iterations. """ batch_size = 8 seq_len = 128 @@ -1101,12 +1101,9 @@ def test_dtensor_v1_policy_flops_range_check( for warmup_step in range(2): results = policy.train(data, loss_fn) - # Measure FLOPS on the third iteration - print("Measuring FLOPS on 3 iterations...") - time_begin = time.time() + print("Checking FLOPS on 3 iterations...") for train_step in range(3): results = policy.train(data, loss_fn) - runtime_sec = time.time() - time_begin # Check if FLOPS tracking is available if policy.flops_tracker is not None: @@ -1120,14 +1117,19 @@ def test_dtensor_v1_policy_flops_range_check( ) assert total_flops > 0, "total_flops should be positive" - total_tflops = total_flops / 1e12 / 3 - print(f"Total FLOPS: {total_flops:.2e} ({total_tflops:.4f} TFLOPS)") + expected_tracker = FLOPTracker.from_config( + config["model_name"], get_default_hf_config(config["model_name"]) + ) + expected_tracker.track_batch(input_lengths.tolist()) + expected_total_flops = expected_tracker.total_flops - flop_count_total = total_flops * runtime_sec - assert 1e9 < flop_count_total < 5e10, ( - "Total FLOPS should be within 1e9 and 5e10" + assert total_flops == pytest.approx(expected_total_flops, rel=0.05), ( + f"Expected {expected_total_flops:.2e} FLOPS, got {total_flops:.2e}" ) + total_tflops = total_flops / 1e12 + print(f"Total FLOPS: {total_flops:.2e} 
({total_tflops:.4f} TFLOPS)") + if "theoretical_tflops" in results: theoretical_tflops = results["theoretical_tflops"] assert isinstance(theoretical_tflops, (int, float)), ( diff --git a/tests/unit/run_unit_shard_common.sh b/tests/unit/run_unit_shard_common.sh new file mode 100644 index 0000000000..3ca50b3f65 --- /dev/null +++ b/tests/unit/run_unit_shard_common.sh @@ -0,0 +1,32 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Common boilerplate for unit test shard scripts. +# Source this file at the top of each L0_Unit_Tests_*.sh shard script. +# It sets up: SCRIPT_DIR, PROJECT_ROOT, FAST exclusions, and test assets. + +set -xeuo pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) 
+ +cd ${PROJECT_ROOT} + +# Source exclusion list for FAST mode +EXCLUDED_UNIT_TESTS=() +if [[ "${FAST:-0}" == "1" ]]; then + source ${SCRIPT_DIR}/excluded_unit_tests.sh +fi + +uv run tests/unit/prepare_unit_test_assets.py diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 760ce027b5..f8a5bdc5d1 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -326,28 +326,3 @@ def test_all_recipes_start_with_algo_hyphen(all_recipe_yaml_rel_paths): assert algo in expected_algos, ( f"Recipe {recipe_yaml} has unexpected algo {algo}" ) - - -def test_functional_tests_exist(): - functional_tests_dir = os.path.join(project_root, "tests", "functional") - - test_list = [] - with open( - os.path.join(functional_tests_dir, "L1_Functional_Tests_GPU.sh"), "r" - ) as f: - for line in f: - line = line.strip() - if line and "./tests/functional" in line: - test_list.append(line.split(" ")[-1].split("/")[-1]) - - missing_list = [] - for filename in os.listdir(functional_tests_dir): - if filename.endswith(".sh"): - if filename == "L1_Functional_Tests_GPU.sh": - continue - if filename not in test_list: - missing_list.append(f"./tests/functional/{filename}") - - assert len(missing_list) == 0, ( - f"Missing functional test scripts in ./tests/functional/L1_Functional_Tests_GPU.sh:\n{'\n'.join(missing_list)}" - ) diff --git a/uv.lock b/uv.lock index 7d182cf205..3cb7b14dff 100644 --- a/uv.lock +++ b/uv.lock @@ -4014,6 +4014,7 @@ test = [ { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, + { name = "pytest-shard" }, { name = "pytest-testmon" }, { name = "pytest-timeout" }, ] @@ -4146,6 +4147,7 @@ test = [ { name = "pytest", specifier = ">=8.4.2" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, + { name = "pytest-shard" }, { name = "pytest-testmon" }, { name = "pytest-timeout" }, ] @@ -5717,6 +5719,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" }, ] +[[package]] +name = "pytest-shard" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/ca/3efa6f3b84dab83220db45997e785be726684c2c2c4267bffb7d80101c7f/pytest-shard-0.1.2.tar.gz", hash = "sha256:b86a967fbfd1c8e50295095ccda031b7e890862ee06531d5142844f4c1d1cd67", size = 3579, upload-time = "2020-12-11T19:52:55.083Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/7a/dbeb4c54e9fc3b59622f410091365f354a69cda1af10c3b83ac0ca6e6f4f/pytest_shard-0.1.2-py3-none-any.whl", hash = "sha256:407a1df385cebe1feb9b4d2e7eeee8b044f8a24f0919421233159a17c59be2b9", size = 4608, upload-time = "2020-12-11T19:52:54.226Z" }, +] + [[package]] name = "pytest-testmon" version = "2.2.0"