diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2188c65aa0b..9655ebf552a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,67 +1,13 @@ -megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo - -megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt - -megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal - -megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba -megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba - -megatron/core/models/hybrid/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-model - -megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets - -megatron/core/tokenizers/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/tokenizers - -megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp - -megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp - -megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing - -megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer - -megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference - -megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets - -megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism - -megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/transformer - -megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech - -megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference - -megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo - -megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training - -megatron/post_training/ 
@NVIDIA/post-training - -megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs - -megatron/training/ @NVIDIA/training-adlr @NVIDIA/training-nemo -megatron/training/arguments.py +* @NVIDIA/core-nemo @NVIDIA/core-devtech .gitlab/ @NVIDIA/ci .github/ @NVIDIA/ci .github/oncall_schedule.json @NVIDIA/mcore-oncall-rotation .gitlab-ci.yml @NVIDIA/ci docker/ @NVIDIA/ci +tests/unit_tests/run_ci_test.sh @NVIDIA/ci +tests/test_utils/python_scripts/ tests/functional_tests/python_test_utils/ @NVIDIA/ci tests/functional_tests/shell_test_utils/ @NVIDIA/ci -tests/test_utils/recipes/ @NVIDIA/ci -tests/unit_tests/run_ci_test.sh @NVIDIA/ci - -# API Backwards Compatibility Check -scripts/check_api_backwards_compatibility.py @NVIDIA/ci -scripts/README_API_COMPAT.md @NVIDIA/ci -.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci -docs/api-backwards-compatibility-check.md @NVIDIA/ci -tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci - -megatron/rl/ @NVIDIA/reinforcement-learning -examples/rl/ @NVIDIA/reinforcement-learning -test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning -train_rl.py @NVIDIA/reinforcement-learning +pyproject.toml @NVIDIA/ci +uv.lock @NVIDIA/ci diff --git a/.github/scripts/sync_team_usergroups.py b/.github/scripts/sync_team_usergroups.py index c5f40f5fe33..01ef49c9e0a 100644 --- a/.github/scripts/sync_team_usergroups.py +++ b/.github/scripts/sync_team_usergroups.py @@ -19,12 +19,12 @@ Slack user groups to match. 
""" +import argparse import os import re import sys -import argparse -import requests +import requests from slack_sdk import WebClient from slack_sdk.errors import SlackApiError @@ -53,10 +53,7 @@ def get_headers(): print("Error: GH_TOKEN or GITHUB_TOKEN not set") sys.exit(1) - return { - "Authorization": f"token {token}", - "Accept": "application/vnd.github.v3+json", - } + return {"Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json"} def get_org(): @@ -215,9 +212,7 @@ def get_user_email(username): # Check Signed-off-by lines in the commit message for @nvidia.com emails message = commit_data.get('message', '') - sob_matches = re.findall( - r'Signed-off-by:.*<([^>]+@nvidia\.com)>', message - ) + sob_matches = re.findall(r'Signed-off-by:.*<([^>]+@nvidia\.com)>', message) if sob_matches: _email_cache[username] = sob_matches[0] print(f"Found @nvidia.com email for {username} from Signed-off-by") @@ -339,21 +334,14 @@ def create_slack_usergroup(slack_client, handle, team_slug): try: print(f"Creating Slack usergroup '@{handle}' with name '{name}'...") - response = slack_client.usergroups_create( - name=name, - handle=handle, - description=description, - ) + response = slack_client.usergroups_create(name=name, handle=handle, description=description) usergroup = response.get("usergroup", {}) usergroup_id = usergroup.get("id") if usergroup_id: # Update cache with new usergroup if _usergroups_cache is not None: - _usergroups_cache[handle] = { - "id": usergroup_id, - "users": [], - } + _usergroups_cache[handle] = {"id": usergroup_id, "users": []} print(f"Successfully created Slack usergroup '@{handle}'") return usergroup_id else: @@ -446,9 +434,7 @@ def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False): # 5. 
Update the usergroup try: - slack_client.usergroups_users_update( - usergroup=usergroup_id, users=slack_user_ids - ) + slack_client.usergroups_users_update(usergroup=usergroup_id, users=slack_user_ids) print(f"\nSuccessfully updated '@{usergroup_handle}' with {len(slack_user_ids)} members") return True except SlackApiError as e: @@ -530,18 +516,12 @@ def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None): def main(): - parser = argparse.ArgumentParser( - description="Sync GitHub team membership to Slack user groups" - ) + parser = argparse.ArgumentParser(description="Sync GitHub team membership to Slack user groups") parser.add_argument( - "--dry-run", - action="store_true", - help="Show what would be done without making changes", + "--dry-run", action="store_true", help="Show what would be done without making changes" ) parser.add_argument( - "--list", - action="store_true", - help="List all configured team-to-usergroup mappings", + "--list", action="store_true", help="List all configured team-to-usergroup mappings" ) parser.add_argument( "--parent-team", @@ -559,8 +539,7 @@ def main(): dest="direct_teams", metavar="SLUG", help=( - "Sync this GitHub team directly (can be repeated). " - f"Defaults to: {DIRECT_TEAM_SLUGS}" + "Sync this GitHub team directly (can be repeated). 
" f"Defaults to: {DIRECT_TEAM_SLUGS}" ), ) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 1e62f6b3016..35eb570296d 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -78,8 +78,8 @@ jobs: IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} run: | - # Skip SSO check for scheduled jobs, main branch, or merge groups - if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then + # Skip SSO check for scheduled jobs, main branch, dev branch, or merge groups + if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi diff --git a/.github/workflows/mirror-to-main.yml b/.github/workflows/mirror-to-main.yml new file mode 100644 index 00000000000..cb77851942b --- /dev/null +++ b/.github/workflows/mirror-to-main.yml @@ -0,0 +1,129 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+name: Mirror Dev to Main + +on: + push: + branches: + - "pull-request/[0-9]+" + +jobs: + cherry-pick-to-main: + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.PAT }} + + - name: Get PR info + id: get-pr-info + uses: nv-gha-runners/get-pr-info@main + + - name: Configure Git + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "GitHub Actions Bot" + + - name: Cherry-pick to main + env: + GH_TOKEN: ${{ secrets.PAT }} + run: | + set -x + + PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} + BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}" + HAS_MIRROR_MAIN_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "mirror-to-main")' || echo "false") + TARGET_BRANCH="cherry-pick-$PR_NUMBER-into-main" + + # Skip if not labeled with mirror-to-main + if [ "$HAS_MIRROR_MAIN_LABEL" != "true" ]; then + echo "PR is not labeled with mirror-to-main, will not mirror to main." + exit 0 + fi + + # Skip if not targeting dev + if [ "$BASE_REF" != "dev" ]; then + echo "PR is not targeting dev, will not mirror to main." + exit 0 + fi + + # Check if target branch already exists + if git ls-remote --heads origin "refs/heads/$TARGET_BRANCH" | grep -q .; then + echo "Target branch already exists, will not cherry-pick again." 
+ exit 0 + fi + + # Get PR details + PR_AUTHOR="${{ fromJSON(steps.get-pr-info.outputs.pr-info).user.login }}" + PR_TITLE="${{ fromJSON(steps.get-pr-info.outputs.pr-info).title }}" + SOURCE_BRANCH="${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.ref }}" + SOURCE_REPO="${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.repo.full_name }}" + + # Fetch all branches + git fetch origin dev + + # Handle forks vs same repo + if [ "$SOURCE_REPO" = "${{ github.repository }}" ]; then + git fetch origin "$SOURCE_BRANCH" + git checkout "$SOURCE_BRANCH" + else + git fetch "https://github.com/$SOURCE_REPO.git" "$SOURCE_BRANCH" + git checkout FETCH_HEAD + fi + + # Find commit range to cherry-pick + START_COMMIT=$(git merge-base origin/dev HEAD) + END_COMMIT=$(git rev-parse HEAD) + + # Create cherry-pick branch from main + git fetch origin main + git checkout main + git checkout -b "$TARGET_BRANCH" + + # Cherry-pick commits + if ! git cherry-pick "$START_COMMIT..$END_COMMIT"; then + # Comment on the original PR about the failure + COMMENT_BODY=$(cat <<'EOF' + ❌ **Cherry-pick to main failed** + + The cherry-pick encountered conflicts and could not be completed automatically. + + **Next steps:** + 1. Manually create a PR with these changes to main + 2. Resolve any conflicts + EOF + ) + + gh pr comment $PR_NUMBER --body "$COMMENT_BODY" + exit 1 + fi + + # Push branch + git push -u origin "$TARGET_BRANCH" + + # Create PR to main + gh pr create \ + --base main \ + --head "$TARGET_BRANCH" \ + --title "cp: \`$PR_TITLE ($PR_NUMBER)\` into \`main\`" \ + --body "[πŸ€–]: Hi @$PR_AUTHOR πŸ‘‹

We've cherry-picked \`$PR_TITLE (#$PR_NUMBER)\` into \`main\` for you! 🚀

Please review and approve this cherry-pick at your convenience!" \ + --label "cherry-pick" \ + --reviewer "$PR_AUTHOR" + diff --git a/.github/workflows/multi-approval-bot.yml b/.github/workflows/multi-approval-bot.yml deleted file mode 100644 index c7477679201..00000000000 --- a/.github/workflows/multi-approval-bot.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: "Codeowners Approval Workflow" - -on: - push: - branches: - - "pull-request/[0-9]+" - merge_group: - types: [checks_requested] - -jobs: - pre-flight: - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2 - if: github.repository == 'NVIDIA/Megatron-LM' - - codeowners-approval: - needs: [pre-flight] - runs-on: ubuntu-latest - if: | - !(needs.pre-flight.outputs.docs_only == 'true' - || needs.pre-flight.outputs.is_merge_group == 'true' - || needs.pre-flight.outputs.is_deployment_workflow == 'true') - steps: - - name: Get PR info - id: get-pr-info - if: startsWith(github.ref, 'refs/heads/pull-request/') - uses: nv-gha-runners/get-pr-info@main - - - name: Checkout action - uses: actions/checkout@v6 - with: - repository: noamelf/codeowner-multi-approval-action - ref: v0.1 - path: codeowner-multi-approval-action - - - name: Check Codeowners Approval - uses: ./codeowner-multi-approval-action - with: - pr-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} - repo-name: ${{ github.repository }} - github-token: ${{ secrets.PAT }} - - multi-approval-bot-summary: - needs: [pre-flight, codeowners-approval] - if: | - ( - needs.pre-flight.outputs.docs_only == 'true' - || needs.pre-flight.outputs.is_merge_group == 'true' - || needs.pre-flight.outputs.is_deployment_workflow == 'true' - || always() - ) - && github.repository == 'NVIDIA/Megatron-LM' - && !cancelled() - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v6 - - - name: Result - env: - GH_TOKEN: ${{ github.token }} - GITHUB_RUN_ID: ${{ github.run_id }} - SKIPPING_IS_ALLOWED: ${{ 
needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }} - run: | - FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 - - if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then - echo "βœ… All previous jobs completed successfully" - exit 0 - else - echo "❌ Found $FAILED_JOBS failed job(s)" - # Show which jobs failed - gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' - exit 1 - fi diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml index 1d35494dcd6..e00ce8afc36 100644 --- a/.gitlab/stages/00.pre.yml +++ b/.gitlab/stages/00.pre.yml @@ -71,6 +71,7 @@ pre:create_ci_branches_dev: - branch: ci-dev-rebuild-mcore-nemo-image - branch: ci-dev-mr - branch: ci-dev-nightly + - branch: ci-dev-weekly - branch: ci-dev-upgrade-dependencies tags: - arch/amd64 diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml index 55c4d740659..002c96e7c0f 100644 --- a/.gitlab/stages/04.functional-tests.yml +++ b/.gitlab/stages/04.functional-tests.yml @@ -255,7 +255,7 @@ functional:x_notify: - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} - export GITLAB_ENDPOINT - export CONTEXT=$FUNCTIONAL_TEST_SCOPE - - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || "0") + - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]] && echo "1" || "0") - export TEAM_SLUG=$SLACK_ADMIN - | python tests/test_utils/python_scripts/notify.py \ @@ -269,7 +269,7 @@ functional:x_notify: paths: - scripts rules: - - if: ($CI_PIPELINE_SOURCE == "schedule" || $CI_COMMIT_BRANCH == "main") && $FUNCTIONAL_TEST == "yes" + - if: ($CI_PIPELINE_SOURCE == "schedule" || 
$CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev") && $FUNCTIONAL_TEST == "yes" when: always - when: never diff --git a/README.md b/README.md index 9a62f9bb750..b22a8d0e8f6 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,31 @@
-Megatron-LM and Megatron Core -============================= +Megatron-LM & Megatron Core +===========================

GPU-optimized library for training transformer models at scale

-[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html) -[![version](https://img.shields.io/badge/release-0.15.0-green)](./CHANGELOG.md) +[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://docs.nvidia.com/Megatron-Core/developer-guide/latest/index.html) +[![version](https://img.shields.io/badge/release-0.12.0-green)](./CHANGELOG.md) [![license](https://img.shields.io/badge/license-Apache-blue)](./LICENSE)
-## About +> ## 🚨 **DEVELOPMENT BRANCH** +> ⚠️ **EXPERIMENTAL FEATURES** - This is the **dev branch** with experimental features. +> +> **→ For releases and comprehensive documentation, visit the [main branch](https://github.com/NVIDIA/Megatron-LM)** -This repository contains two components: **Megatron-LM** and **Megatron Core**. +## ⚡ Quickstart -**Megatron-LM** is a reference example that includes Megatron Core plus pre-configured training scripts. Best for research teams, learning distributed training, and quick experimentation. +```bash +# Clone the dev branch +git clone -b dev https://github.com/NVIDIA/Megatron-LM.git +cd Megatron-LM + +# Install from source with dev dependencies (includes transformer_engine) +pip install -e .[mlm,dev] +``` **Megatron Core** is a composable library with GPU-optimized building blocks for custom training frameworks. It provides transformer building blocks, advanced parallelism strategies (TP, PP, DP, EP, CP), mixed precision support (FP16, BF16, FP8, FP4), and model architectures. Best for framework developers and ML engineers building custom training pipelines. @@ -58,14 +68,21 @@ For NGC container setup and all installation options, see the **[Installation Gu - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
-Previous News +Table of Contents -- **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)). -- **[2024/06]** Megatron Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba). -- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. +**Getting Started** +- [⚡ Quick Start](#-quickstart) +- [🧠 Dev Branch Philosophy](#dev-branch-philosophy) +- [📊 Performance & Benchmarking](#performance--benchmarking) +- [👥 Community & Support](#community--support) + +**For Complete Documentation** → [Main Branch](https://github.com/NVIDIA/Megatron-LM) | [Official Docs](https://docs.nvidia.com/Megatron-Core/)
+ +## Dev Branch Philosophy + # Project Structure ``` @@ -128,17 +145,32 @@ We also strong scaled the standard GPT-3 model (our version has slightly more th # Roadmaps -- **[MoE Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements +### Fast Iteration +- **Streamlined Review**: 1 code owner + 1 dev approver (can delegate review) + CI/CD + +### Feature Lifecycle (Coming Soon) +- **6-Month Timeline**: Experimental features must graduate to stable or be deprecated +- **Migration Support**: Assistance provided for feature transitions + +### Stability Expectations +- **Experimental Nature**: Features may change or be removed as development progresses +- **Testing**: All features will pass convergence and performance validation before inclusion +- **Support**: Dev branch issues should include `[DEV]` prefix # Resources -## Getting Help +## Performance & Benchmarking -- πŸ“– **[Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html)** - Official documentation -- πŸ› **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests +- πŸš€ [2025/11] [Optimizing DeepSeek-V3 Training Performance on NVIDIA GB200 NVL72](docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-optimization.md). +- ⚑ [2025/11] [A Guide to Reproduce DeepSeek-V3 Pre-training Performance on GB200](docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md). -## Contributing +## Community & Support +### Getting Help +- πŸ“– **[Documentation](https://docs.nvidia.com/Megatron-Core/)** - Official documentation +- πŸ› **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests + +### Contributing We ❀️ contributions! Ways to contribute: - πŸ› **Report bugs** - Help us improve reliability @@ -146,12 +178,9 @@ We ❀️ contributions! 
Ways to contribute: - πŸ“ **Improve docs** - Make Megatron Core more accessible - πŸ”§ **Submit PRs** - Contribute code improvements -**β†’ [Contributing Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/developer/contribute.html)** - -## Citation - -If you use Megatron in your research or project, we appreciate that you use the following citations: +**β†’ [Contributing Guide](./CONTRIBUTING.md)** +### Citation ```bibtex @article{megatron-lm, title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism}, diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev index 5e3d7419b3a..7f3a5c0552a 100644 --- a/docker/Dockerfile.ci.dev +++ b/docker/Dockerfile.ci.dev @@ -16,7 +16,7 @@ ENV UV_LINK_MODE=copy RUN bash -ex <<"EOF" apt-get update - apt-get install -y --no-install-recommends gettext python3-venv psmisc uuid-runtime + apt-get install -y --no-install-recommends gettext python3-venv psmisc uuid-runtime apt-get clean python -m venv /opt/jet ARCH=$(uname -m) @@ -31,17 +31,21 @@ RUN bash -ex <<"EOF" curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh EOF +RUN ln -sf /usr/local/cuda/targets/x86_64-linux/include/cuda \ + /usr/local/include/cuda +RUN find /usr/local/cuda -name "utility" 2>/dev/null | head -5 && \ + ls /usr/local/cuda/targets/x86_64-linux/include/ | head -20 + COPY README.md pyproject.toml uv.lock /workspace/ COPY megatron/core/__init__.py /workspace/megatron/core/ COPY megatron/core/package_info.py /workspace/megatron/core/ ARG IMAGE_TYPE=dev -ENV NVTE_BUILD_NUM_PHILOX_ROUNDS=3 RUN --mount=type=cache,target=/root/.cache/uv \ bash -ex <<"EOF" export NVTE_CUDA_ARCHS="80;90;100" uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages uv sync --only-group build - uv sync --extra ${IMAGE_TYPE} --extra mlm --link-mode copy --locked \ + uv sync --extra ${IMAGE_TYPE} --extra mlm --group no_pypi_wheels --link-mode copy --locked \ --no-install-package torch \ --no-install-package torchvision \ 
--no-install-package triton \ @@ -71,7 +75,7 @@ RUN bash -ex <<"EOF" git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git pushd DeepEP - git checkout 34152ae28f80bcc3ee38d7a12cb2ad87cfd4ea72 + git checkout eb9cee7de5a24193bf09500668d3a619d3d3f3fb patch -p1 < /workspace/deepep.patch popd TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/. diff --git a/docs/add_copyright_header.py b/docs/add_copyright_header.py index 9694ef84819..9bc4481c506 100644 --- a/docs/add_copyright_header.py +++ b/docs/add_copyright_header.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + #!/usr/bin/env python3 """One-off script to add NVIDIA copyright header to all .md files under docs/.""" diff --git a/docs/conf.py b/docs/conf.py index 47532648b15..26b618b1eac 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,7 +20,6 @@ import os import sys - # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information @@ -84,24 +83,17 @@ # This is a workaround that uses the parser located in autodoc2_docstrings_parser.py to allow autodoc2 to # render google style docstrings. # Related Issue: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33 - autodoc2_docstring_parser_regexes = [ - (r".*", "docs.autodoc2_docstrings_parser"), - ] + autodoc2_docstring_parser_regexes = [(r".*", "docs.autodoc2_docstrings_parser")] # Regex patterns whose values contain raw regex syntax (e.g. \p{L}) that docutils # mis-parses as footnote/reference markup. Exclude them from the generated docs. 
- autodoc2_hidden_regexes = [ - r".*\._PATTERN_TIKTOKEN.*", - ] + autodoc2_hidden_regexes = [r".*\._PATTERN_TIKTOKEN.*"] # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = "nvidia_sphinx_theme" html_theme_options = { - "switcher": { - "json_url": "../versions1.json", - "version_match": release, - }, + "switcher": {"json_url": "../versions1.json", "version_match": release}, "icon_links": [ { "name": "GitHub", @@ -114,7 +106,4 @@ html_extra_path = ["project.json", "versions1.json"] # Github links are now getting rate limited from the Github Actions -linkcheck_ignore = [ - ".*github\\.com.*", - ".*githubusercontent\\.com.*", -] +linkcheck_ignore = [".*github\\.com.*", ".*githubusercontent\\.com.*"] diff --git a/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md b/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md new file mode 100644 index 00000000000..8fa3051e479 --- /dev/null +++ b/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md @@ -0,0 +1,358 @@ +--- +orphan: true +--- + +# A Guide to Reproduce DeepSeek-V3 Pre-training Performance on GB200 + +## 1. Dockerfile + +Requirements: +- Transformer Engine: We recommend using commit [d2945c6](https://github.com/NVIDIA/TransformerEngine/commit/d2945c6a571e3978677614d1fe08779966a5a4ef) with PR [2146](https://github.com/NVIDIA/TransformerEngine/pull/2146) and [2150](https://github.com/NVIDIA/TransformerEngine/pull/2150). You could prepare the branch by yourself, or use this [branch](https://github.com/hxbai/TransformerEngine/commits/dev_20251024/) based on TE v2.9 plus the above three commits/PRs. +- cuDNN: v9.14 is required. +- HybridEP: Install it from [here](https://github.com/deepseek-ai/DeepEP/commits/3f601f7ac1c062c46502646ff04c535013bfca00). + +Dockerfile for reference. 
+ +```dockerfile +FROM nvcr.io/nvidia/pytorch:25.09-py3 AS base + +ENV SHELL=/bin/bash + +# ========================= +# Install system packages +# ========================= +RUN rm -rf /opt/megatron-lm && \ + apt-get update && \ + apt-get install -y sudo gdb bash-builtins git zsh autojump tmux curl gettext libfabric-dev && \ + wget https://github.com/mikefarah/yq/releases/download/v4.27.5/yq_linux_arm64 -O /usr/bin/yq && \ + chmod +x /usr/bin/yq + +# ========================= +# Install Python packages +# ========================= +# NOTE: `unset PIP_CONSTRAINT` to install packages that do not meet the default constraint in the base image. +# Some package requirements and related versions are from +# https://github.com/NVIDIA/Megatron-LM/blob/core_v0.12.0/Dockerfile.linting. +# https://github.com/NVIDIA/Megatron-LM/blob/core_v0.12.0/requirements_mlm.txt. +# https://github.com/NVIDIA/Megatron-LM/blob/core_v0.12.0/requirements_ci.txt. +RUN unset PIP_CONSTRAINT && pip install --no-cache-dir debugpy dm-tree torch_tb_profiler einops wandb \ + sentencepiece tokenizers transformers torchvision ftfy modelcards datasets tqdm pydantic \ + nvidia-pytriton py-spy yapf darker \ + tiktoken flask-restful \ + nltk wrapt pytest pytest_asyncio pytest-cov pytest_mock pytest-random-order \ + black==24.4.2 isort==5.13.2 flake8==7.1.0 pylint==3.2.6 coverage mypy \ + setuptools==69.5.1 + +# ========================= +# Install cudnn 9.14.0.64 for correct mxfp8 quantization and layernorm fusion +# ========================= +RUN apt-get update && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + apt-get update && \ + apt-get -y install libcudnn9-cuda-13 + +# ========================= +# Install latest TE +# Use a specific commit instead of main to make it more stable. +# This is based on release_v2.9 branch and contains some CPU and quantization optimizations. 
+# ========================= +ARG COMMIT="7dd3914726abb79bc99ff5a5db1449458ed64151" +ARG TE="git+https://github.com/hxbai/TransformerEngine.git@${COMMIT}" +RUN pip install nvidia-mathdx==25.1.1 && \ + unset PIP_CONSTRAINT && \ + NVTE_CUDA_ARCHS="100" NVTE_BUILD_THREADS_PER_JOB=8 NVTE_FRAMEWORK=pytorch pip install --no-build-isolation --no-cache-dir $TE + +# ========================= +# Install HybridEP +# ========================= +WORKDIR /home/ +RUN git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git && \ + cd DeepEP && git checkout 3f601f7ac1c062c46502646ff04c535013bfca00 && \ + TORCH_CUDA_ARCH_LIST="10.0" pip install --no-build-isolation . + +# ========================= +# Clean cache +# ========================= +RUN rm -rf /root/.cache /tmp/* +``` + +> [!Tip] +> +> If you prefer to use CUDA 12.9, please change the base container to `nvcr.io/nvidia/pytorch:25.06-py3` and the cuDNN to be installed to `libcudnn9-cuda-12`. + +## 2. Megatron-Core + +We recommend using the [dev branch](https://github.com/NVIDIA/Megatron-LM/tree/dev) after PR [1917](https://github.com/NVIDIA/Megatron-LM/pull/1917). + +```bash +git clone https://github.com/NVIDIA/Megatron-LM.git && \ +cd Megatron-LM && +git checkout effebd81f410bc6566fffee6c320b6f8f762e06d +``` + +## 3. Cluster Configuration + +Since we're using EP 32 on NVL72, it's important to make sure + +> [!Important] +> **Every 32 GB200 GPUs (8 nodes) are in the same NVL domain (or rack)**. + +Usually you can make it via your cluster workload manager. Taking Slurm as an example, you could pass `--segment 8` to the sbatch command to ensure that every segment of 8 nodes will be scheduled to a rack. + +## 4. 
Training scripts + +### Environment variables + +```bash +CUDA_DEVICE_MAX_CONNECTIONS=1 +NVTE_FWD_LAYERNORM_SM_MARGIN=0 +NVTE_BWD_LAYERNORM_SM_MARGIN=0 +NVLINK_DOMAIN_SIZE=72 +NVTE_ALLOW_NONDETERMINISTIC_ALGO=1 +PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +NCCL_NVLS_ENABLE=0 +NVTE_FUSED_ATTN=1 +NVTE_NORM_FWD_USE_CUDNN=1 +NVTE_NORM_BWD_USE_CUDNN=1 +PYTHONWARNINGS=ignore +NCCL_DEBUG=VERSION +NCCL_GRAPH_REGISTER=0 +``` + +### bindpcie + +Download [bindpcie](https://github.com/NVIDIA/mlperf-common/blob/main/client/bindpcie) to your workdir, make it executable, + +```bash +wget https://raw.githubusercontent.com/NVIDIA/mlperf-common/refs/heads/main/client/bindpcie && +chmod 755 bindpcie +``` + +and then + +> [!Important] +> **Place it at the beginning of your launch command in every process.** + +Taking Slurm as an example, your script should look like + +```bash +#!/bin/bash + +#SBATCH [... sbatch args] + +srun [... srun args] /path/to/bindpcie /path/to/pretrain_gpt.py [... mcore arguments] +``` + +This is a very important step on GB200. 
+ +### Launch script + +```bash +/path/to/bindpcie \ +/path/to/megatron-lm/pretrain_gpt.py \ +--distributed-timeout-minutes 60 \ +--tensor-model-parallel-size 1 \ +--pipeline-model-parallel-size 8 \ +--expert-model-parallel-size 32 \ +--context-parallel-size 1 \ +--expert-tensor-parallel-size 1 \ +--use-distributed-optimizer \ +--overlap-grad-reduce \ +--overlap-param-gather \ +--use-mcore-models \ +--sequence-parallel \ +--use-flash-attn \ +--disable-bias-linear \ +--micro-batch-size 1 \ +--global-batch-size 2048 \ +--train-samples 585937500 \ +--exit-duration-in-mins 220 \ +--no-save-optim \ +--no-check-for-nan-in-loss-and-grad \ +--cross-entropy-loss-fusion \ +--cross-entropy-fusion-impl te \ +--manual-gc \ +--manual-gc-interval 10 \ +--enable-experimental \ +--transformer-impl transformer_engine \ +--seq-length 4096 \ +--data-cache-path /path/to/data_cache \ +--tokenizer-type HuggingFaceTokenizer \ +--tokenizer-model unsloth/DeepSeek-V3 \ +--data-path /path/to/data \ +--split 99,1,0 \ +--no-mmap-bin-files \ +--no-create-attention-mask-in-dataloader \ +--num-workers 6 \ +--num-layers 61 \ +--hidden-size 7168 \ +--ffn-hidden-size 18432 \ +--num-attention-heads 128 \ +--kv-channels 128 \ +--max-position-embeddings 4096 \ +--position-embedding-type rope \ +--rotary-base 10000 \ +--make-vocab-size-divisible-by 3232 \ +--normalization RMSNorm \ +--norm-epsilon 1e-6 \ +--swiglu \ +--untie-embeddings-and-output-weights \ +--multi-latent-attention \ +--attention-dropout 0.0 \ +--hidden-dropout 0.0 \ +--clip-grad 1.0 \ +--weight-decay 0.1 \ +--qk-layernorm \ +--lr-decay-samples 584765624 \ +--lr-warmup-samples 1536000 \ +--lr-warmup-init 3.9e-7 \ +--lr 3.9e-6 \ +--min-lr 3.9e-7 \ +--lr-decay-style cosine \ +--adam-beta1 0.9 \ +--adam-beta2 0.95 \ +--num-experts 256 \ +--moe-layer-freq ([0]*3+[1]*58) \ +--moe-ffn-hidden-size 2048 \ +--moe-shared-expert-intermediate-size 2048 \ +--moe-router-load-balancing-type seq_aux_loss \ +--moe-router-topk 8 \ +--moe-grouped-gemm \ 
+--moe-aux-loss-coeff 1e-4 \ +--moe-router-group-topk 4 \ +--moe-router-num-groups 8 \ +--moe-router-pre-softmax \ +--moe-router-padding-for-quantization \ +--moe-router-topk-scaling-factor 2.5 \ +--moe-router-score-function sigmoid \ +--moe-router-enable-expert-bias \ +--moe-router-bias-update-rate 1e-3 \ +--moe-router-dtype fp32 \ +--moe-permute-fusion \ +--moe-router-fusion \ +--q-lora-rank 1536 \ +--kv-lora-rank 512 \ +--qk-head-dim 128 \ +--qk-pos-emb-head-dim 64 \ +--v-head-dim 128 \ +--rotary-scaling-factor 40 \ +--mscale 1.0 \ +--mscale-all-dim 1.0 \ +--eval-iters 32 \ +--eval-interval 200 \ +--no-load-optim \ +--no-load-rng \ +--auto-detect-ckpt-format \ +--load None \ +--save /path/to/checkpoints \ +--save-interval 500 \ +--dist-ckpt-strictness log_all \ +--init-method-std 0.02 \ +--log-timers-to-tensorboard \ +--log-memory-to-tensorboard \ +--log-validation-ppl-to-tensorboard \ +--log-throughput \ +--log-interval 1 \ +--logging-level 40 \ +--tensorboard-dir /path/to/tensorboard \ +--wandb-project deepseek-v3-benchmarking-v0.15 \ +--wandb-exp-name DeepSeek-V3-TP1PP8EP32CP1VPP4-MBS1GBS2048-v0.15 \ +--bf16 \ +--enable-experimental \ +--recompute-granularity selective \ +--recompute-modules moe_act mlp \ +--cuda-graph-impl transformer_engine \ +--cuda-graph-scope attn moe_router moe_preprocess \ +--te-rng-tracker \ +--pipeline-model-parallel-layout "Et|(tt|)*30L" \ +--moe-router-force-load-balancing \ +--moe-token-dispatcher-type flex \ +--moe-flex-dispatcher-backend hybridep \ +--moe-hybridep-num-sms 32 \ +--fp8-recipe mxfp8 \ +--fp8-format e4m3 \ +--fp8-param-gather \ +--reuse-grad-buf-for-mxfp8-param-ag \ +--use-precision-aware-optimizer \ +--main-grads-dtype fp32 \ +--main-params-dtype fp32 \ +--exp-avg-dtype bf16 \ +--exp-avg-sq-dtype bf16 \ +``` + +### Explanation of arguments + +The following arguments indicate key optimizations. 
+ +- Pipeline parallel layout + +```bash +--pipeline-model-parallel-layout "Et|(tt|)*30L" +``` + +`E` stands for the embedding layer, `t` for a transformer layer, and `L` for the loss layer; `|` separates pipeline stages. The layout is therefore interpreted as 32 stages in total: the first stage holds the embedding plus 1 transformer layer, each of the 30 middle stages holds 2 transformer layers, and the last stage holds only the loss (1 + 30 × 2 = 61 transformer layers, matching `--num-layers 61`). + +- Fine-grained recompute + +```bash +--recompute-granularity selective \ +--recompute-modules moe_act mlp \ +``` + +- Partial CUDA Graphs + +```bash +--cuda-graph-impl transformer_engine \ +--cuda-graph-scope attn moe_router moe_preprocess \ +--te-rng-tracker \ +``` + +- Force load balancing for performance benchmark + +```bash +--moe-router-force-load-balancing \ +``` + +- HybridEP + +```bash +--moe-token-dispatcher-type flex \ +--moe-flex-dispatcher-backend hybridep \ +--moe-hybridep-num-sms 32 \ +``` + +- MXFP8 recipe + +```bash +--fp8-recipe mxfp8 \ +--fp8-format e4m3 \ +--fp8-param-gather \ +--reuse-grad-buf-for-mxfp8-param-ag \ +``` + +- BF16 optimizer states + +```bash +--use-precision-aware-optimizer \ +--main-grads-dtype fp32 \ +--main-params-dtype fp32 \ +--exp-avg-dtype bf16 \ +--exp-avg-sq-dtype bf16 \ +``` + +- Kernel fusions + +```bash +--cross-entropy-loss-fusion \ +--cross-entropy-fusion-impl te \ +--moe-permute-fusion \ +--moe-router-fusion \ +``` + +- Manual GC to make ranks better synchronized + +```bash +--manual-gc \ +--manual-gc-interval 10 \ +``` diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image1.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image1.png new file mode 100644 index 00000000000..6e4dad685c4 Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image1.png differ diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image2.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image2.png new file mode 100644 index 00000000000..920e3c57f94 Binary files /dev/null and 
b/docs/discussions/deepseek-v3-gb200-optimization/images/image2.png differ diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image3.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image3.png new file mode 100644 index 00000000000..f606dbfb744 Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image3.png differ diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image4.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image4.png new file mode 100644 index 00000000000..04239401edd Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image4.png differ diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image5.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image5.png new file mode 100644 index 00000000000..0128fc7ae45 Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image5.png differ diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image6.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image6.png new file mode 100644 index 00000000000..cb2ed2eb9ad Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image6.png differ diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image7.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image7.png new file mode 100644 index 00000000000..325d0fd4f52 Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image7.png differ diff --git a/docs/source/api-guide/router_replay.md b/docs/source/api-guide/router_replay.md new file mode 100644 index 00000000000..b2e043b3065 --- /dev/null +++ b/docs/source/api-guide/router_replay.md @@ -0,0 +1,180 @@ +--- +orphan: true +--- + +# Design Document: MoE Router Replay Feature + +## 1. 
Overview + +This document provides a detailed description of the "Router Replay" feature implemented within the Megatron-LM Core for Mixture-of-Experts (MoE) models. + +This feature is designed to enhance determinism and analyzability in MoE model training and inference. It enables the model to load routing decisions from a predefined file and enforce their use during the forward pass, thereby bypassing the real-time routing computation. + +## 2. Motivation + +* **Determinism & Reproducibility**: In distributed training, MoE routing decisions can exhibit minor variations due to factors like floating-point precision. By replaying a fixed routing table, the MoE computation path is guaranteed to be identical across runs, which facilitates debugging and reproducing experimental results. +* **Performance Profiling**: The router's own computation (e.g., logits calculation, top-k selection) incurs overhead. In replay mode, this part of the computation can be completely skipped, allowing for more precise isolation and profiling of performance bottlenecks within the Expert Layers themselves. +* **Debugging Aid**: When issues arise in the model, fixing the routing decisions helps to isolate variables, making it easier to determine whether the problem lies with the routing mechanism or the expert computations. + +## 3. Design and Architecture + +The design follows the principles of being non-intrusive and on-demand, with the core idea of activating the replay logic only when explicitly requested by the user. + +* **Core Components**: + * `RouterReplay` (located in `megatron/core/transformer/moe/router_replay.py`): A utility class for replaying MoE routing decisions. When enabled via the `enable_routing_replay` flag, a separate instance of `RouterReplay` is created for each MoE layer's router. Each instance is responsible for loading routing data and providing the deterministic routing decisions for its corresponding layer during the forward pass. 
+ * `enable_routing_replay` (located in `megatron/core/transformer/transformer_config.py`): A boolean global configuration flag that serves as the sole entry point for enabling this feature. + +* **Workflow**: + The feature supports different modes, such as recording and replaying, controlled by a `RouterReplayAction`. + + 1. **Enabling the Feature**: The user sets `enable_routing_replay` to `True` in the model configuration. + 2. **Initialization**: When `enable_routing_replay` is true, each `TopKRouter` creates its own `RouterReplay` instance. + 3. **Mode Configuration**: The user must programmatically set the desired router replay action (e.g., `record`, `forward_replay`, `backward_replay`) on the `RouterReplay` instances. + 4. **Execution Flow (within a mini-batch)**: + * **Forward Pass**: + * For each micro-batch, the `topk_routing_with_score_function` checks the `router_replay_action`. + * **In `record` mode**: The dynamically computed `top-k` expert indices are captured and stored. + * **In `forward_replay` mode**: The function retrieves pre-loaded expert indices from `target_topk_idx`. These indices are used for the forward computation and are also appended to the `replay_backward_list` to prepare for the backward pass. + * **Backward Pass**: + * For each micro-batch (processed in reverse order in pipeline parallelism), the `router_replay_action` is checked again. + * **In `backward_replay` mode**: The function retrieves the expert indices for the corresponding micro-batch by popping them from the `replay_backward_list`. This mode is intended for training recomputation (e.g., activation checkpointing and pipeline recompute) so the same routing decisions are used during recompute/backward as in forward, ensuring determinism and correctness. + +## 4. Implementation Details + +The implementation cleanly separates the replay logic from the router's core computation. 
+ +* **`megatron/core/transformer/transformer_config.py`**: + * Adds the configuration option `enable_routing_replay: bool = False`. + +* **`megatron/core/transformer/moe/router_replay.py`** and **`megatron/core/transformer/moe/moe_utils.py`**: + * `router_replay.py` introduces the `RouterReplay` class to manage the state for recording and replaying routing decisions for a single MoE layer. + * `target_topk_idx`: An attribute holding the expert indices for the current micro-batch during forward replay mode. + * `recorded_topk_idx`: An attribute for storing the computed expert indices when in record mode. + * `replay_backward_list`: A list that accumulates the `top-k` indices used during the forward passes of a mini-batch. This list is consumed in FIFO order during the backward pass to ensure correctness under pipeline parallelism. + * `set_target_indices()`: A method to load the replay indices into `target_topk_idx` for the forward pass. + * `record_indices()`: A method to save the computed indices. + * In `moe_utils.py`, `topk_routing_with_score_function` is modified to contain the core logic. It checks the `router_replay_action` on the `router_replay` instance and accordingly performs one of the following actions: computes and records indices, replays indices from `target_topk_idx` (for forward), replays indices from `replay_backward_list` (for backward), or falls through to the default dynamic routing. + +### Training recompute usage +- During forward replay, `set_target_indices()` prepares `replay_backward_list` so each micro-batch’s indices are available for recomputation. +- During recompute/backward, set action to `REPLAY_BACKWARD` so indices are consumed in FIFO order to mirror the forward sequence. + +## 5. Usage Guide + +1. **Enable & Instantiate** + - Create one `RouterReplay` instance per MoE router layer when building the model. + - Optionally use the global helpers to set/clear actions across all layers. +2. **Record Routing Decisions** + - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)`. 
+ - Run the model; retrieve per-layer indices via `RouterReplay.get_recorded_data()` and persist. +3. **Forward Replay** + - Load indices and distribute: `RouterReplay.set_replay_data(list_of_tensors)`. + - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD)`. + - Run the model; dynamic top‑k is bypassed and target indices are used. +4. **Backward Replay** + - For training recomputation (activation checkpointing or pipeline recompute), set action: `REPLAY_BACKWARD` during recomputation. + - Per micro‑batch indices are consumed from `replay_backward_list` in FIFO order. +5. **Cleanup** + - Use `RouterReplay.clear_global_indices()`, `RouterReplay.clear_global_router_replay_action()`, and `RouterReplay.clear_global_router_replay_instances()` to restore default behavior and prevent memory leaks. + +### Quick usage with `topk_routing_with_score_function` + +```python +import torch +from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction +from megatron.core.transformer.moe.moe_utils import topk_routing_with_score_function + +rr = RouterReplay() + +# Record +RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD) +logits = torch.randn(8, 16) +probs_rec, routing_map_rec = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr, +) +recorded = rr.get_recorded_indices() +torch.save(recorded, "/tmp/replay.pt") + +# Forward replay +rr.clear_router_replay_action() +rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD) +target = torch.load("/tmp/replay.pt") +rr.set_target_indices(target) +probs_rep, routing_map_rep = topk_routing_with_score_function( + logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr, +) + +RouterReplay.clear_global_router_replay_action() +RouterReplay.clear_global_indices() +RouterReplay.clear_global_router_replay_instances() +``` + +## 6. 
Minimal Demo + +Here is a minimal code example showing how to use RouterReplay for recording and replaying: + +```python +import torch +import torch.distributed as dist +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.moe.router import TopKRouter +from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction + + +# Initialize distributed training +if not dist.is_initialized(): + dist.init_process_group(backend="nccl") + +# Create a transformer config with RouterReplay enabled +config = TransformerConfig( + num_experts=8, + expert_model_parallel_size=1, + num_top_k=2, + enable_routing_replay=True +) + +# Create a TopKRouter instance +router = TopKRouter(config) + +# Generate sample input (batch_size, sequence_length, hidden_size) +logits = torch.randn(16, 32, 8).to(torch.cuda.current_device()) + +# ----------------- +# 1. Recording Mode +# ----------------- +print("=== Recording Mode ===") +# Set global router replay action to RECORD +RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD) + +# Perform routing +routing_output = router.forward(logits) +print(f"Recorded top-k indices shape: {routing_output.top_k_idx.shape}") + +# ----------------- +# 2. 
Forward Replay Mode +# ----------------- +print("\n=== Forward Replay Mode ===") +# Save recorded indices to a file +torch.save(routing_output.top_k_idx, "/tmp/replay.pt") + +# Load indices from file and set as target for replay +replay_indices = torch.load("/tmp/replay.pt") +for router_instance in RouterReplay.global_router_replay_instances: + router_instance.target_topk_idx = replay_indices + +# Set global router replay action to REPLAY_FORWARD +RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD) + +# Perform routing again - this will use the replayed indices +replay_routing_output = router.forward(logits) +print(f"Replayed top-k indices shape: {replay_routing_output.top_k_idx.shape}") +print(f"Are indices the same? {torch.equal(routing_output.top_k_idx, replay_routing_output.top_k_idx)}") + + +# Clean up +RouterReplay.clear_global_router_replay_action() +RouterReplay.clear_global_indices() +RouterReplay.clear_global_router_replay_instances() +if dist.is_initialized(): + dist.destroy_process_group() +``` diff --git a/docs/user-guide/features/index.md b/docs/user-guide/features/index.md index d080fe3f256..514568afac9 100644 --- a/docs/user-guide/features/index.md +++ b/docs/user-guide/features/index.md @@ -20,6 +20,7 @@ context_parallel custom_fsdp dist_optimizer optimizer_cpu_offload +paged_stash pipeline_parallel_layout tokenizers megatron_energon diff --git a/docs/user-guide/features/paged_stash.md b/docs/user-guide/features/paged_stash.md new file mode 100644 index 00000000000..4b7d807ace2 --- /dev/null +++ b/docs/user-guide/features/paged_stash.md @@ -0,0 +1,59 @@ + + +# MoE Paged Stash + +*This is an experimental feature and may change.* + +**Paged stash** = **sync-free** expert execution + **paged stashing** (packing routed-expert activations for backward into paged buffers). 
+ +**Sync-free:** `--moe-flex-dispatcher-backend hybridep`, `--use-transformer-engine-op-fuser`, and `--moe-expert-rank-capacity-factor` pre-size dispatch and fused grouped expert buffers from a user-controlled capacity, avoiding a per-step device query / realloc loop for buffer sizing. + +**Paged stashing:** `--moe-paged-stash` stores those activations in paged CUDA buffers (optional pinned host spill). It helps save activation memory; sync-free still works without it, at the cost of higher activation memory use. + +Whenever `moe_expert_rank_capacity_factor` is set, a **runner** wraps forward-backward: after each pass it checks for **stash overflow** (only with `--moe-paged-stash`) and **token over-budget**. If either condition occurs on any rank, the step **reruns once** without capacity padding and without paged stashing. + +## Prerequisites + +HybridEP + TE fused grouped experts are required whenever `moe_expert_rank_capacity_factor` is set. With `moe_paged_stash` enabled, the capacity factor must be set, `cpu_offloading` must be disabled, and `offload_modules` must not include `expert_fc1` or `moe_act`. The runner is active whenever the capacity factor is set (even without `--moe-paged-stash`) to handle over-budget reruns; stash overflow is checked only when paged stashing is on. 
+ +## Configuration + +```bash +# Sync-free +--moe-flex-dispatcher-backend hybridep +--use-transformer-engine-op-fuser +--moe-expert-rank-capacity-factor + +# Paged stashing (to avoid memory waste due to fragmentation) +--moe-paged-stash +``` + +## Tuning (paged stashing only) + +```bash +# Page size for stashing +--moe-paged-stash-page-size 64 +# CUDA stashing buffer scaling factor (default 1.10) +--moe-paged-stash-buffer-size-factor-cuda 1.10 +# Host spill (0 = off); same sign rule as CUDA +--moe-paged-stash-buffer-size-factor-cpu 0.0 +``` + +## What `moe_expert_rank_capacity_factor` and `moe_paged_stash_buffer_size_factor_cuda` mean + +Both are **multipliers on buffer size relative to the perfectly balanced case**—the space you would need if routed tokens were evenly distributed across expert ranks. A larger factor adds headroom for real-world **skew**. + +## Choosing `moe_expert_rank_capacity_factor` and stash buffer scales + +Profile how far real routing departs from the **balanced** reference, then pick factors so **skew spikes** rarely exceed your margin (avoid constant reruns). + +- **`moe_expert_rank_capacity_factor`:** pick from profiles so **over-budget token drop** is uncommon; set **slightly above** the profiled value so reruns stay rare. +- **`moe_paged_stash_buffer_size_factor_cuda`:** size from the **same stats** (peaks vs averages) so **stash overflow** is uncommon; undersizing triggers reruns like over-budget. +- **`moe_paged_stash_buffer_size_factor_cpu`:** set **> 0** to allow **spill to pinned host** when CUDA pages are full—often **avoids overflow / rerun** at the cost of host memory and more overhead from paged stashing. 
diff --git a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py index 183c6c695bd..a8b72bb39ae 100644 --- a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py +++ b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py @@ -6,28 +6,33 @@ import json import os import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), - os.path.pardir, os.path.pardir))) + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) +from typing import Union + import torch -from megatron.training import get_args -from megatron.training import get_tokenizer -from megatron.training import print_rank_0 -from megatron.training.checkpointing import load_checkpoint + +import megatron.legacy.model from megatron.core import mpu -from megatron.training.arguments import parse_and_validate_args -from megatron.training.initialize import initialize_megatron -from megatron.legacy.model import GPTModel -from megatron.training import get_model -from megatron.inference.text_generation import generate_and_post_process -from megatron.training.arguments import core_transformer_config_from_args from megatron.core.models.gpt import GPTModel -from typing import Union -import megatron.legacy.model +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) from megatron.core.transformer.spec_utils import import_module -from megatron.training.arguments import core_transformer_config_from_args -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, get_gpt_layer_local_spec +from megatron.inference.text_generation import generate_and_post_process +from megatron.legacy.model import GPTModel +from megatron.training import get_args, get_model, get_tokenizer, print_rank_0 +from megatron.training.arguments import 
core_transformer_config_from_args, parse_and_validate_args +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron -def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + +def model_provider( + pre_process=True, post_process=True +) -> Union[GPTModel, megatron.legacy.model.GPTModel]: """Builds the model. If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. @@ -51,26 +56,23 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat num_tokentypes=0, parallel_output=False, pre_process=pre_process, - post_process=post_process + post_process=post_process, ) else: if args.spec is None: if args.transformer_impl == 'local': transformer_layer_spec = get_gpt_layer_local_spec( - num_experts=args.num_experts, - moe_grouped_gemm=args.moe_grouped_gemm + num_experts=args.num_experts, moe_grouped_gemm=args.moe_grouped_gemm ) elif args.transformer_impl == 'transformer_engine': transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( - num_experts=args.num_experts, - moe_grouped_gemm=args.moe_grouped_gemm + num_experts=args.num_experts, moe_grouped_gemm=args.moe_grouped_gemm ) else: raise ValueError(f"Invalid transformer_impl {args.transformer_impl}") elif args.spec[0] == 'local': transformer_layer_spec = get_gpt_layer_local_spec( - num_experts=args.num_experts, - moe_grouped_gemm=args.moe_grouped_gemm + num_experts=args.num_experts, moe_grouped_gemm=args.moe_grouped_gemm ) else: transformer_layer_spec = import_module(args.spec) @@ -86,37 +88,46 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat parallel_output=False, share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, position_embedding_type=args.position_embedding_type, - rotary_percent=args.rotary_percent + rotary_percent=args.rotary_percent, ) return 
model + def add_text_generate_args(parser): """Text generation arguments.""" group = parser.add_argument_group(title='text generation') - group.add_argument("--temperature", type=float, default=1.0, - help='Sampling temperature.') - group.add_argument("--greedy", action='store_true', default=False, - help='Use greedy sampling.') - group.add_argument("--top_p", type=float, default=0.0, - help='Top p sampling.') - group.add_argument("--top_k", type=int, default=0, - help='Top k sampling.') - group.add_argument("--out-seq-length", type=int, default=1024, - help='Size of the output generated text.') - group.add_argument("--sample-input-file", type=str, default=None, - help='Get input from file instead of interactive mode, ' - 'each line is an input.') - group.add_argument("--sample-output-file", type=str, default=None, - help='Output file got from --sample-input-file') - group.add_argument("--num-samples", type=int, default=0, - help='Number of samples to generate unconditionally, ' - 'defaults to 0 and interactive conditional sampling') - group.add_argument("--genfile", type=str, - help='Output file when generating unconditionally') + group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.') + group.add_argument("--greedy", action='store_true', default=False, help='Use greedy sampling.') + group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') + group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') + group.add_argument( + "--out-seq-length", type=int, default=1024, help='Size of the output generated text.' 
+ ) + group.add_argument( + "--sample-input-file", + type=str, + default=None, + help='Get input from file instead of interactive mode, ' 'each line is an input.', + ) + group.add_argument( + "--sample-output-file", + type=str, + default=None, + help='Output file got from --sample-input-file', + ) + group.add_argument( + "--num-samples", + type=int, + default=0, + help='Number of samples to generate unconditionally, ' + 'defaults to 0 and interactive conditional sampling', + ) + group.add_argument("--genfile", type=str, help='Output file when generating unconditionally') return parser + def generate_samples_unconditional(model): args = get_args() @@ -124,6 +135,7 @@ def generate_samples_unconditional(model): cnt = 0 num_samples = args.num_samples from tqdm import tqdm + pbar = tqdm(total=num_samples) while True: @@ -131,16 +143,23 @@ def generate_samples_unconditional(model): sentences = [''] * args.global_batch_size print("global batch size", args.global_batch_size) max_len = args.out_seq_length - resp_sentences, resp_sentences_seg, output_logits, \ - tokens = generate_and_post_process(model, prompts=sentences, - tokens_to_generate=max_len, - return_output_log_probs=False, - top_k_sampling=args.top_k, - top_p_sampling=args.top_p, - add_BOS=True, - temperature=1.0) + resp_sentences, resp_sentences_seg, output_logits, tokens = generate_and_post_process( + model, + prompts=sentences, + tokens_to_generate=max_len, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=True, + temperature=1.0, + ) for prompt, generation, token in zip(sentences, resp_sentences, tokens): - datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt} + datum = { + 'text': generation[len(prompt) :], + 'all_text': generation, + 'prompt': prompt, + 'id': cnt, + } yield datum cnt += 1 pbar.update() @@ -161,6 +180,7 @@ def generate_samples_conditional(model): num_samples = args.num_samples cnt = 0 from tqdm import 
tqdm + pbar = tqdm(total=num_samples) fname = open(args.sample_input_file, "r") @@ -184,16 +204,23 @@ def generate_samples_conditional(model): sentences.append(raw_text) max_len = args.out_seq_length - resp_sentences, resp_sentences_seg, output_logits, \ - tokens = generate_and_post_process(model, prompts=sentences, - tokens_to_generate=max_len, - return_output_log_probs=False, - top_k_sampling=args.top_k, - top_p_sampling=args.top_p, - add_BOS=False, - temperature=1.0) + resp_sentences, resp_sentences_seg, output_logits, tokens = generate_and_post_process( + model, + prompts=sentences, + tokens_to_generate=max_len, + return_output_log_probs=False, + top_k_sampling=args.top_k, + top_p_sampling=args.top_p, + add_BOS=False, + temperature=1.0, + ) for prompt, generation, token in zip(sentences, resp_sentences, tokens): - datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt} + datum = { + 'text': generation[len(prompt) :], + 'all_text': generation, + 'prompt': prompt, + 'id': cnt, + } yield datum cnt += 1 pbar.update() @@ -220,8 +247,7 @@ def generate_and_write_samples_conditional(model): args = get_args() if args.sample_output_file is None: sample_output_file = args.sample_input_file + ".out" - print('`sample-output-file` not specified, setting ' - 'it to {}'.format(sample_output_file)) + print('`sample-output-file` not specified, setting ' 'it to {}'.format(sample_output_file)) else: sample_output_file = args.sample_output_file with open(sample_output_file, 'w') as f: @@ -233,11 +259,15 @@ def generate_and_write_samples_conditional(model): def main(): """Main program.""" - parse_and_validate_args(extra_args_provider=add_text_generate_args, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer', - 'no_load_rng': True, - 'no_load_optim': True, - 'seq_length': 2048}) + parse_and_validate_args( + extra_args_provider=add_text_generate_args, + args_defaults={ + 'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 
'no_load_optim': True, + 'seq_length': 2048, + }, + ) initialize_megatron() # Set up model and load checkpoint diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py index 02a257c1b46..e5df38fe856 100644 --- a/examples/inference/gpt/gpt_dynamic_inference.py +++ b/examples/inference/gpt/gpt_dynamic_inference.py @@ -11,10 +11,11 @@ from collections import defaultdict from typing import Dict, List, Optional -from megatron.training.arguments import parse_and_validate_args import torch from tqdm import tqdm +from megatron.training.arguments import parse_and_validate_args + sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) @@ -241,7 +242,10 @@ def _process_step_result(result): # Suspend. if attempted_step_count % args.suspend_resume_interval == 0: - print("**** step %d/%d ... suspend." % (engine.context.step_count, attempted_step_count)) + print( + "**** step %d/%d ... suspend." + % (engine.context.step_count, attempted_step_count) + ) engine.suspend() # Resume, 0+ attempted steps later. @@ -251,7 +255,10 @@ def _process_step_result(result): % args.suspend_resume_interval == 0 ): - print("**** step %d/%d ... resume." % (engine.context.step_count, attempted_step_count)) + print( + "**** step %d/%d ... resume." + % (engine.context.step_count, attempted_step_count) + ) engine.resume() # If engine suspended, continue to next iter. @@ -469,7 +476,9 @@ def escape_str(s): # Attach peak memory metrics; the functional test only validates these # if the fields exist in the golden values. 
json_results.update(peak_mem_stats) - json_results["lifetime_prefill_token_count"] = engine.context.lifetime_prefill_token_count + json_results["lifetime_prefill_token_count"] = ( + engine.context.lifetime_prefill_token_count + ) print(f' Saving results to {args.output_path}') with open(args.output_path, "w") as fp: diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py index 247404d537e..31c2b3529de 100644 --- a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py +++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py @@ -9,7 +9,6 @@ from collections import defaultdict from typing import List -from megatron.training.arguments import parse_and_validate_args import torch import torch.distributed as dist @@ -25,6 +24,7 @@ get_model_for_inference, ) from megatron.training import get_args, get_tokenizer, initialize_megatron +from megatron.training.arguments import parse_and_validate_args # pylint: disable=line-too-long @@ -73,11 +73,15 @@ async def main( ) # All ranks agree on the number of suspend/resume cycles from args. - num_suspend_resume_cycles = len(requests) // args.suspend_resume_interval if args.suspend_resume_interval else 0 + num_suspend_resume_cycles = ( + len(requests) // args.suspend_resume_interval if args.suspend_resume_interval else 0 + ) # Create client and run example. 
if dist.get_rank() == 0: - client = InferenceClient(dp_addr, deserialize=True) # submits requests to the inference coordinator + client = InferenceClient( + dp_addr, deserialize=True + ) # submits requests to the inference coordinator client.start() base_arrival_time = time.time_ns() / 10**9 for request in requests: @@ -103,7 +107,10 @@ async def main( futures.append(client.add_request(request.prompt_text, request.sampling_params)) num_requests_added += 1 - if num_requests_added >= next_suspend_at and cycles_done < num_suspend_resume_cycles: + if ( + num_requests_added >= next_suspend_at + and cycles_done < num_suspend_resume_cycles + ): await suspend_resume_cycle(client, engine, args, futures) cycles_done += 1 next_suspend_at += args.suspend_resume_interval @@ -120,7 +127,10 @@ async def main( futures.append(client.add_request(request.prompt_text, request.sampling_params)) num_requests_added += 1 - if num_requests_added >= next_suspend_at and cycles_done < num_suspend_resume_cycles: + if ( + num_requests_added >= next_suspend_at + and cycles_done < num_suspend_resume_cycles + ): await suspend_resume_cycle(client, engine, args, futures) cycles_done += 1 next_suspend_at += args.suspend_resume_interval @@ -159,7 +169,7 @@ async def main( throughputs.append(throughput) if req.routing_indices is not None: result_dict["routing_indices"] = req.routing_indices.tolist() - + json_results[req.request_id] = result_dict throughput_dict = {"throughput": throughputs} if args.throughput_check_only: diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py index d3dd619eaa1..9c748fdf795 100644 --- a/examples/inference/gpt/gpt_static_inference.py +++ b/examples/inference/gpt/gpt_static_inference.py @@ -5,7 +5,6 @@ import time from argparse import Namespace -from megatron.training.arguments import parse_and_validate_args import torch from megatron.core.inference.contexts import StaticInferenceContext @@ -20,6 +19,7 @@ ) from 
megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer from megatron.core.transformer.module import MegatronModule +from megatron.training.arguments import parse_and_validate_args sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) diff --git a/examples/mimo/train.py b/examples/mimo/train.py index a89c83728e0..05eb4f2ab0c 100644 --- a/examples/mimo/train.py +++ b/examples/mimo/train.py @@ -9,26 +9,24 @@ from functools import partial from typing import Any, Dict, Iterator -from megatron.training.arguments import parse_and_validate_args import torch -from megatron.training import get_args, pretrain, print_rank_0 from megatron.core.parallel_state import ( + get_context_parallel_group, + get_data_parallel_group, get_tensor_model_parallel_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_src_rank, - get_context_parallel_group, - get_data_parallel_group, ) +from megatron.training import get_args, pretrain, print_rank_0 +from megatron.training.arguments import parse_and_validate_args sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) ) from data.energon_avlm_task_encoder import llava_avlm_dataloader_provider from data.energon_vlm_task_encoder import llava_vlm_dataloader_provider -from data.mock import ( - train_valid_test_datasets_provider as mock_train_valid_test_datasets_provider, -) +from data.mock import train_valid_test_datasets_provider as mock_train_valid_test_datasets_provider from model_providers.llava_avlm import model_provider_llava_avlm from model_providers.llava_vlm import model_provider_llava_vlm from model_providers.mock import model_provider_mock_vlm_single_encoder @@ -50,13 +48,24 @@ "llava_avlm": llava_avlm_dataloader_provider, } + def add_mimo_args(parser): """Add MIMO-specific arguments to the parser.""" group = parser.add_argument_group('MIMO', 'MIMO specific arguments') # MIMO-specific parameters - 
group.add_argument('--dataset-provider', type=str, default='mock', help='Dataset provider to choose from [mock, llava_vlm, video_llava_vlm, llava_avlm]') - group.add_argument('--model-provider', type=str, default='mock', help='Model provider to choose from [mock, llava_vlm, video_llava_vlm, llava_avlm]') + group.add_argument( + '--dataset-provider', + type=str, + default='mock', + help='Dataset provider to choose from [mock, llava_vlm, video_llava_vlm, llava_avlm]', + ) + group.add_argument( + '--model-provider', + type=str, + default='mock', + help='Model provider to choose from [mock, llava_vlm, video_llava_vlm, llava_avlm]', + ) # mock dataloader related args # can control mock samples with total seq length and image seq length @@ -71,17 +80,29 @@ def add_mimo_args(parser): '--audio-encoder-model', type=str, default=None, help='Audio encoder model name' ) group.add_argument( - '--hf-assign-unused-tokens', type=str, nargs='+', default=None, - help='Assigning unused tokens to special tokens. Example: ' - '--hf-assign-unused-tokens "